diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25263 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9991675915649276, + "eval_steps": 500, + "global_step": 3603, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000832408435072142, + "grad_norm": 5.852675437927246, + "learning_rate": 2.770083102493075e-08, + "loss": 0.8812, + "step": 1 + }, + { + "epoch": 0.001664816870144284, + "grad_norm": 5.757553577423096, + "learning_rate": 5.54016620498615e-08, + "loss": 0.8425, + "step": 2 + }, + { + "epoch": 0.0024972253052164264, + "grad_norm": 5.973855972290039, + "learning_rate": 8.310249307479226e-08, + "loss": 0.8658, + "step": 3 + }, + { + "epoch": 0.003329633740288568, + "grad_norm": 6.0041680335998535, + "learning_rate": 1.10803324099723e-07, + "loss": 0.8471, + "step": 4 + }, + { + "epoch": 0.004162042175360711, + "grad_norm": 5.798811912536621, + "learning_rate": 1.3850415512465375e-07, + "loss": 0.8696, + "step": 5 + }, + { + "epoch": 0.004994450610432853, + "grad_norm": 5.87969970703125, + "learning_rate": 1.662049861495845e-07, + "loss": 0.8461, + "step": 6 + }, + { + "epoch": 0.005826859045504994, + "grad_norm": 5.775433540344238, + "learning_rate": 1.9390581717451524e-07, + "loss": 0.859, + "step": 7 + }, + { + "epoch": 0.006659267480577136, + "grad_norm": 6.0282087326049805, + "learning_rate": 2.21606648199446e-07, + "loss": 0.8566, + "step": 8 + }, + { + "epoch": 0.007491675915649278, + "grad_norm": 5.845994472503662, + "learning_rate": 2.4930747922437677e-07, + "loss": 0.8533, + "step": 9 + }, + { + "epoch": 0.008324084350721421, + "grad_norm": 5.8350605964660645, + "learning_rate": 2.770083102493075e-07, + "loss": 0.8669, + "step": 10 + }, + { + "epoch": 0.009156492785793563, + "grad_norm": 5.791550636291504, + "learning_rate": 3.0470914127423823e-07, + "loss": 0.8701, + "step": 11 + }, + { + "epoch": 0.009988901220865706, + "grad_norm": 5.940352439880371, + "learning_rate": 3.32409972299169e-07, + "loss": 0.8909, + "step": 12 + }, + { + "epoch": 0.010821309655937847, + "grad_norm": 5.590038299560547, + "learning_rate": 3.601108033240998e-07, + "loss": 0.8443, + "step": 13 + }, + { + "epoch": 0.011653718091009988, + "grad_norm": 5.473329067230225, + "learning_rate": 3.878116343490305e-07, + "loss": 0.8345, + "step": 14 + }, + { + "epoch": 0.012486126526082131, + "grad_norm": 5.688844680786133, + "learning_rate": 4.155124653739612e-07, + "loss": 0.8813, + "step": 15 + }, + { + "epoch": 0.013318534961154272, + "grad_norm": 5.5658440589904785, + "learning_rate": 4.43213296398892e-07, + "loss": 0.8678, + "step": 16 + }, + { + "epoch": 0.014150943396226415, + "grad_norm": 5.2306599617004395, + "learning_rate": 4.7091412742382274e-07, + "loss": 0.8587, + "step": 17 + }, + { + "epoch": 0.014983351831298557, + "grad_norm": 5.296006202697754, + "learning_rate": 4.986149584487535e-07, + "loss": 0.8373, + "step": 18 + }, + { + "epoch": 0.015815760266370698, + "grad_norm": 4.465157508850098, + "learning_rate": 5.263157894736843e-07, + "loss": 0.7817, + "step": 19 + }, + { + "epoch": 0.016648168701442843, + "grad_norm": 4.338566780090332, + "learning_rate": 5.54016620498615e-07, + "loss": 0.7938, + "step": 20 + }, + { + "epoch": 0.017480577136514984, + "grad_norm": 4.399113655090332, + "learning_rate": 5.817174515235457e-07, + "loss": 0.8391, + "step": 21 + }, + { + "epoch": 0.018312985571587125, + "grad_norm": 4.371567726135254, + "learning_rate": 6.094182825484765e-07, + "loss": 0.8577, + "step": 22 + }, + { + "epoch": 0.019145394006659266, + "grad_norm": 3.9851796627044678, + "learning_rate": 6.371191135734073e-07, + "loss": 0.7962, + "step": 23 + }, + { + "epoch": 0.01997780244173141, + "grad_norm": 4.0287652015686035, + "learning_rate": 6.64819944598338e-07, + "loss": 0.7759, + "step": 24 + }, + { + "epoch": 0.020810210876803552, + "grad_norm": 3.2178382873535156, + "learning_rate": 6.925207756232688e-07, + "loss": 0.7951, + "step": 25 + }, + { + "epoch": 0.021642619311875694, + "grad_norm": 2.373326301574707, + "learning_rate": 7.202216066481996e-07, + "loss": 0.7714, + "step": 26 + }, + { + "epoch": 0.022475027746947835, + "grad_norm": 2.38152813911438, + "learning_rate": 7.479224376731302e-07, + "loss": 0.7554, + "step": 27 + }, + { + "epoch": 0.023307436182019976, + "grad_norm": 2.2056140899658203, + "learning_rate": 7.75623268698061e-07, + "loss": 0.7232, + "step": 28 + }, + { + "epoch": 0.02413984461709212, + "grad_norm": 2.3134679794311523, + "learning_rate": 8.033240997229917e-07, + "loss": 0.7865, + "step": 29 + }, + { + "epoch": 0.024972253052164262, + "grad_norm": 2.2102293968200684, + "learning_rate": 8.310249307479224e-07, + "loss": 0.7796, + "step": 30 + }, + { + "epoch": 0.025804661487236404, + "grad_norm": 1.9953457117080688, + "learning_rate": 8.587257617728533e-07, + "loss": 0.7473, + "step": 31 + }, + { + "epoch": 0.026637069922308545, + "grad_norm": 1.9345048666000366, + "learning_rate": 8.86426592797784e-07, + "loss": 0.7113, + "step": 32 + }, + { + "epoch": 0.02746947835738069, + "grad_norm": 1.8699673414230347, + "learning_rate": 9.141274238227148e-07, + "loss": 0.7649, + "step": 33 + }, + { + "epoch": 0.02830188679245283, + "grad_norm": 1.523240566253662, + "learning_rate": 9.418282548476455e-07, + "loss": 0.7485, + "step": 34 + }, + { + "epoch": 0.029134295227524972, + "grad_norm": 1.7488731145858765, + "learning_rate": 9.695290858725762e-07, + "loss": 0.7514, + "step": 35 + }, + { + "epoch": 0.029966703662597113, + "grad_norm": 1.9269617795944214, + "learning_rate": 9.97229916897507e-07, + "loss": 0.7005, + "step": 36 + }, + { + "epoch": 0.030799112097669258, + "grad_norm": 2.1249892711639404, + "learning_rate": 1.024930747922438e-06, + "loss": 0.7309, + "step": 37 + }, + { + "epoch": 0.031631520532741396, + "grad_norm": 2.0195670127868652, + "learning_rate": 1.0526315789473685e-06, + "loss": 0.6937, + "step": 38 + }, + { + "epoch": 0.03246392896781354, + "grad_norm": 2.0689985752105713, + "learning_rate": 1.0803324099722992e-06, + "loss": 0.7177, + "step": 39 + }, + { + "epoch": 0.033296337402885685, + "grad_norm": 2.0408408641815186, + "learning_rate": 1.10803324099723e-06, + "loss": 0.7019, + "step": 40 + }, + { + "epoch": 0.03412874583795782, + "grad_norm": 1.8314168453216553, + "learning_rate": 1.1357340720221608e-06, + "loss": 0.6972, + "step": 41 + }, + { + "epoch": 0.03496115427302997, + "grad_norm": 1.612196922302246, + "learning_rate": 1.1634349030470915e-06, + "loss": 0.6842, + "step": 42 + }, + { + "epoch": 0.035793562708102106, + "grad_norm": 1.4777947664260864, + "learning_rate": 1.1911357340720223e-06, + "loss": 0.6992, + "step": 43 + }, + { + "epoch": 0.03662597114317425, + "grad_norm": 1.207147479057312, + "learning_rate": 1.218836565096953e-06, + "loss": 0.7215, + "step": 44 + }, + { + "epoch": 0.037458379578246395, + "grad_norm": 0.8992202877998352, + "learning_rate": 1.2465373961218838e-06, + "loss": 0.663, + "step": 45 + }, + { + "epoch": 0.03829078801331853, + "grad_norm": 0.8715615272521973, + "learning_rate": 1.2742382271468146e-06, + "loss": 0.696, + "step": 46 + }, + { + "epoch": 0.03912319644839068, + "grad_norm": 0.9824283719062805, + "learning_rate": 1.3019390581717452e-06, + "loss": 0.6709, + "step": 47 + }, + { + "epoch": 0.03995560488346282, + "grad_norm": 1.060660481452942, + "learning_rate": 1.329639889196676e-06, + "loss": 0.6799, + "step": 48 + }, + { + "epoch": 0.04078801331853496, + "grad_norm": 1.121728777885437, + "learning_rate": 1.357340720221607e-06, + "loss": 0.6862, + "step": 49 + }, + { + "epoch": 0.041620421753607105, + "grad_norm": 0.9326874017715454, + "learning_rate": 1.3850415512465375e-06, + "loss": 0.6305, + "step": 50 + }, + { + "epoch": 0.04245283018867924, + "grad_norm": 0.9469634294509888, + "learning_rate": 1.4127423822714684e-06, + "loss": 0.6881, + "step": 51 + }, + { + "epoch": 0.04328523862375139, + "grad_norm": 0.790377140045166, + "learning_rate": 1.4404432132963992e-06, + "loss": 0.6712, + "step": 52 + }, + { + "epoch": 0.04411764705882353, + "grad_norm": 0.8012718558311462, + "learning_rate": 1.4681440443213299e-06, + "loss": 0.6611, + "step": 53 + }, + { + "epoch": 0.04495005549389567, + "grad_norm": 0.7442087531089783, + "learning_rate": 1.4958448753462605e-06, + "loss": 0.6363, + "step": 54 + }, + { + "epoch": 0.045782463928967815, + "grad_norm": 0.6595564484596252, + "learning_rate": 1.5235457063711911e-06, + "loss": 0.6289, + "step": 55 + }, + { + "epoch": 0.04661487236403995, + "grad_norm": 0.7440547943115234, + "learning_rate": 1.551246537396122e-06, + "loss": 0.6602, + "step": 56 + }, + { + "epoch": 0.0474472807991121, + "grad_norm": 0.6962677240371704, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.6092, + "step": 57 + }, + { + "epoch": 0.04827968923418424, + "grad_norm": 0.5988077521324158, + "learning_rate": 1.6066481994459834e-06, + "loss": 0.6156, + "step": 58 + }, + { + "epoch": 0.04911209766925638, + "grad_norm": 0.579781711101532, + "learning_rate": 1.6343490304709143e-06, + "loss": 0.5902, + "step": 59 + }, + { + "epoch": 0.049944506104328525, + "grad_norm": 0.658952534198761, + "learning_rate": 1.6620498614958449e-06, + "loss": 0.6049, + "step": 60 + }, + { + "epoch": 0.05077691453940067, + "grad_norm": 0.7643985748291016, + "learning_rate": 1.6897506925207757e-06, + "loss": 0.6195, + "step": 61 + }, + { + "epoch": 0.05160932297447281, + "grad_norm": 0.6020765900611877, + "learning_rate": 1.7174515235457066e-06, + "loss": 0.6154, + "step": 62 + }, + { + "epoch": 0.05244173140954495, + "grad_norm": 0.5499704480171204, + "learning_rate": 1.7451523545706372e-06, + "loss": 0.6152, + "step": 63 + }, + { + "epoch": 0.05327413984461709, + "grad_norm": 0.5000544190406799, + "learning_rate": 1.772853185595568e-06, + "loss": 0.6228, + "step": 64 + }, + { + "epoch": 0.054106548279689234, + "grad_norm": 0.591923713684082, + "learning_rate": 1.8005540166204989e-06, + "loss": 0.6174, + "step": 65 + }, + { + "epoch": 0.05493895671476138, + "grad_norm": 0.6395013332366943, + "learning_rate": 1.8282548476454295e-06, + "loss": 0.6169, + "step": 66 + }, + { + "epoch": 0.05577136514983352, + "grad_norm": 0.5679120421409607, + "learning_rate": 1.8559556786703603e-06, + "loss": 0.6119, + "step": 67 + }, + { + "epoch": 0.05660377358490566, + "grad_norm": 0.5052242279052734, + "learning_rate": 1.883656509695291e-06, + "loss": 0.6275, + "step": 68 + }, + { + "epoch": 0.0574361820199778, + "grad_norm": 0.5107065439224243, + "learning_rate": 1.911357340720222e-06, + "loss": 0.6044, + "step": 69 + }, + { + "epoch": 0.058268590455049944, + "grad_norm": 0.5278398990631104, + "learning_rate": 1.9390581717451524e-06, + "loss": 0.6005, + "step": 70 + }, + { + "epoch": 0.05910099889012209, + "grad_norm": 0.5229373574256897, + "learning_rate": 1.9667590027700835e-06, + "loss": 0.605, + "step": 71 + }, + { + "epoch": 0.05993340732519423, + "grad_norm": 0.45075175166130066, + "learning_rate": 1.994459833795014e-06, + "loss": 0.5785, + "step": 72 + }, + { + "epoch": 0.06076581576026637, + "grad_norm": 0.44926148653030396, + "learning_rate": 2.0221606648199448e-06, + "loss": 0.584, + "step": 73 + }, + { + "epoch": 0.061598224195338516, + "grad_norm": 0.49701979756355286, + "learning_rate": 2.049861495844876e-06, + "loss": 0.5674, + "step": 74 + }, + { + "epoch": 0.062430632630410654, + "grad_norm": 0.4435327351093292, + "learning_rate": 2.077562326869806e-06, + "loss": 0.6039, + "step": 75 + }, + { + "epoch": 0.06326304106548279, + "grad_norm": 0.5081650614738464, + "learning_rate": 2.105263157894737e-06, + "loss": 0.6047, + "step": 76 + }, + { + "epoch": 0.06409544950055494, + "grad_norm": 0.47305235266685486, + "learning_rate": 2.1329639889196677e-06, + "loss": 0.6117, + "step": 77 + }, + { + "epoch": 0.06492785793562708, + "grad_norm": 0.501122772693634, + "learning_rate": 2.1606648199445983e-06, + "loss": 0.5932, + "step": 78 + }, + { + "epoch": 0.06576026637069922, + "grad_norm": 0.4220408797264099, + "learning_rate": 2.1883656509695294e-06, + "loss": 0.5489, + "step": 79 + }, + { + "epoch": 0.06659267480577137, + "grad_norm": 0.45031723380088806, + "learning_rate": 2.21606648199446e-06, + "loss": 0.5997, + "step": 80 + }, + { + "epoch": 0.06742508324084351, + "grad_norm": 0.4954591691493988, + "learning_rate": 2.2437673130193906e-06, + "loss": 0.6027, + "step": 81 + }, + { + "epoch": 0.06825749167591565, + "grad_norm": 0.471713662147522, + "learning_rate": 2.2714681440443217e-06, + "loss": 0.5924, + "step": 82 + }, + { + "epoch": 0.0690899001109878, + "grad_norm": 0.41615861654281616, + "learning_rate": 2.2991689750692523e-06, + "loss": 0.5723, + "step": 83 + }, + { + "epoch": 0.06992230854605994, + "grad_norm": 0.3945973515510559, + "learning_rate": 2.326869806094183e-06, + "loss": 0.5716, + "step": 84 + }, + { + "epoch": 0.07075471698113207, + "grad_norm": 0.4024456739425659, + "learning_rate": 2.3545706371191136e-06, + "loss": 0.5668, + "step": 85 + }, + { + "epoch": 0.07158712541620421, + "grad_norm": 0.4162571132183075, + "learning_rate": 2.3822714681440446e-06, + "loss": 0.5706, + "step": 86 + }, + { + "epoch": 0.07241953385127636, + "grad_norm": 0.41097456216812134, + "learning_rate": 2.4099722991689752e-06, + "loss": 0.6016, + "step": 87 + }, + { + "epoch": 0.0732519422863485, + "grad_norm": 0.4599824845790863, + "learning_rate": 2.437673130193906e-06, + "loss": 0.6005, + "step": 88 + }, + { + "epoch": 0.07408435072142064, + "grad_norm": 0.3909335136413574, + "learning_rate": 2.465373961218837e-06, + "loss": 0.5682, + "step": 89 + }, + { + "epoch": 0.07491675915649279, + "grad_norm": 0.4187878370285034, + "learning_rate": 2.4930747922437675e-06, + "loss": 0.5648, + "step": 90 + }, + { + "epoch": 0.07574916759156493, + "grad_norm": 0.41880685091018677, + "learning_rate": 2.520775623268698e-06, + "loss": 0.5944, + "step": 91 + }, + { + "epoch": 0.07658157602663707, + "grad_norm": 0.4465304911136627, + "learning_rate": 2.5484764542936292e-06, + "loss": 0.6203, + "step": 92 + }, + { + "epoch": 0.07741398446170922, + "grad_norm": 0.43194279074668884, + "learning_rate": 2.5761772853185594e-06, + "loss": 0.5817, + "step": 93 + }, + { + "epoch": 0.07824639289678136, + "grad_norm": 0.41561976075172424, + "learning_rate": 2.6038781163434905e-06, + "loss": 0.5741, + "step": 94 + }, + { + "epoch": 0.0790788013318535, + "grad_norm": 0.4317433536052704, + "learning_rate": 2.631578947368421e-06, + "loss": 0.5727, + "step": 95 + }, + { + "epoch": 0.07991120976692564, + "grad_norm": 0.43518805503845215, + "learning_rate": 2.659279778393352e-06, + "loss": 0.5826, + "step": 96 + }, + { + "epoch": 0.08074361820199778, + "grad_norm": 0.42625486850738525, + "learning_rate": 2.686980609418283e-06, + "loss": 0.5984, + "step": 97 + }, + { + "epoch": 0.08157602663706992, + "grad_norm": 0.41016989946365356, + "learning_rate": 2.714681440443214e-06, + "loss": 0.5642, + "step": 98 + }, + { + "epoch": 0.08240843507214206, + "grad_norm": 0.48712158203125, + "learning_rate": 2.742382271468144e-06, + "loss": 0.5968, + "step": 99 + }, + { + "epoch": 0.08324084350721421, + "grad_norm": 0.4600513279438019, + "learning_rate": 2.770083102493075e-06, + "loss": 0.5559, + "step": 100 + }, + { + "epoch": 0.08407325194228635, + "grad_norm": 0.4251794219017029, + "learning_rate": 2.7977839335180057e-06, + "loss": 0.5698, + "step": 101 + }, + { + "epoch": 0.08490566037735849, + "grad_norm": 0.4187605082988739, + "learning_rate": 2.8254847645429368e-06, + "loss": 0.5463, + "step": 102 + }, + { + "epoch": 0.08573806881243064, + "grad_norm": 0.45978236198425293, + "learning_rate": 2.8531855955678674e-06, + "loss": 0.5605, + "step": 103 + }, + { + "epoch": 0.08657047724750278, + "grad_norm": 0.4345948398113251, + "learning_rate": 2.8808864265927985e-06, + "loss": 0.548, + "step": 104 + }, + { + "epoch": 0.08740288568257491, + "grad_norm": 0.4089731276035309, + "learning_rate": 2.9085872576177287e-06, + "loss": 0.5333, + "step": 105 + }, + { + "epoch": 0.08823529411764706, + "grad_norm": 0.4466627240180969, + "learning_rate": 2.9362880886426597e-06, + "loss": 0.5653, + "step": 106 + }, + { + "epoch": 0.0890677025527192, + "grad_norm": 0.4312840700149536, + "learning_rate": 2.9639889196675903e-06, + "loss": 0.544, + "step": 107 + }, + { + "epoch": 0.08990011098779134, + "grad_norm": 0.3958422541618347, + "learning_rate": 2.991689750692521e-06, + "loss": 0.5314, + "step": 108 + }, + { + "epoch": 0.09073251942286349, + "grad_norm": 0.4052140712738037, + "learning_rate": 3.0193905817174516e-06, + "loss": 0.5724, + "step": 109 + }, + { + "epoch": 0.09156492785793563, + "grad_norm": 0.4097498059272766, + "learning_rate": 3.0470914127423822e-06, + "loss": 0.5559, + "step": 110 + }, + { + "epoch": 0.09239733629300777, + "grad_norm": 0.43090808391571045, + "learning_rate": 3.0747922437673133e-06, + "loss": 0.5456, + "step": 111 + }, + { + "epoch": 0.0932297447280799, + "grad_norm": 0.43639659881591797, + "learning_rate": 3.102493074792244e-06, + "loss": 0.5572, + "step": 112 + }, + { + "epoch": 0.09406215316315206, + "grad_norm": 0.39143499732017517, + "learning_rate": 3.130193905817175e-06, + "loss": 0.5627, + "step": 113 + }, + { + "epoch": 0.0948945615982242, + "grad_norm": 0.4211403727531433, + "learning_rate": 3.157894736842105e-06, + "loss": 0.5701, + "step": 114 + }, + { + "epoch": 0.09572697003329633, + "grad_norm": 0.42813047766685486, + "learning_rate": 3.1855955678670362e-06, + "loss": 0.5594, + "step": 115 + }, + { + "epoch": 0.09655937846836848, + "grad_norm": 0.4047672152519226, + "learning_rate": 3.213296398891967e-06, + "loss": 0.5467, + "step": 116 + }, + { + "epoch": 0.09739178690344062, + "grad_norm": 0.4235449731349945, + "learning_rate": 3.240997229916898e-06, + "loss": 0.5596, + "step": 117 + }, + { + "epoch": 0.09822419533851276, + "grad_norm": 0.5001515746116638, + "learning_rate": 3.2686980609418285e-06, + "loss": 0.55, + "step": 118 + }, + { + "epoch": 0.09905660377358491, + "grad_norm": 0.4500423073768616, + "learning_rate": 3.2963988919667596e-06, + "loss": 0.5578, + "step": 119 + }, + { + "epoch": 0.09988901220865705, + "grad_norm": 0.43707484006881714, + "learning_rate": 3.3240997229916898e-06, + "loss": 0.544, + "step": 120 + }, + { + "epoch": 0.10072142064372919, + "grad_norm": 0.4369399845600128, + "learning_rate": 3.351800554016621e-06, + "loss": 0.5551, + "step": 121 + }, + { + "epoch": 0.10155382907880134, + "grad_norm": 0.4763941764831543, + "learning_rate": 3.3795013850415515e-06, + "loss": 0.5628, + "step": 122 + }, + { + "epoch": 0.10238623751387348, + "grad_norm": 0.4882802963256836, + "learning_rate": 3.4072022160664825e-06, + "loss": 0.5585, + "step": 123 + }, + { + "epoch": 0.10321864594894561, + "grad_norm": 0.441522479057312, + "learning_rate": 3.434903047091413e-06, + "loss": 0.5494, + "step": 124 + }, + { + "epoch": 0.10405105438401775, + "grad_norm": 0.4602923095226288, + "learning_rate": 3.462603878116344e-06, + "loss": 0.5152, + "step": 125 + }, + { + "epoch": 0.1048834628190899, + "grad_norm": 0.5620714426040649, + "learning_rate": 3.4903047091412744e-06, + "loss": 0.5424, + "step": 126 + }, + { + "epoch": 0.10571587125416204, + "grad_norm": 0.5059617161750793, + "learning_rate": 3.5180055401662054e-06, + "loss": 0.5758, + "step": 127 + }, + { + "epoch": 0.10654827968923418, + "grad_norm": 0.44737622141838074, + "learning_rate": 3.545706371191136e-06, + "loss": 0.53, + "step": 128 + }, + { + "epoch": 0.10738068812430633, + "grad_norm": 0.5812498331069946, + "learning_rate": 3.5734072022160667e-06, + "loss": 0.5437, + "step": 129 + }, + { + "epoch": 0.10821309655937847, + "grad_norm": 0.4357758164405823, + "learning_rate": 3.6011080332409978e-06, + "loss": 0.5482, + "step": 130 + }, + { + "epoch": 0.1090455049944506, + "grad_norm": 0.46321603655815125, + "learning_rate": 3.628808864265928e-06, + "loss": 0.5671, + "step": 131 + }, + { + "epoch": 0.10987791342952276, + "grad_norm": 0.4342377185821533, + "learning_rate": 3.656509695290859e-06, + "loss": 0.5532, + "step": 132 + }, + { + "epoch": 0.1107103218645949, + "grad_norm": 0.4561794102191925, + "learning_rate": 3.6842105263157896e-06, + "loss": 0.5395, + "step": 133 + }, + { + "epoch": 0.11154273029966703, + "grad_norm": 0.40543898940086365, + "learning_rate": 3.7119113573407207e-06, + "loss": 0.5023, + "step": 134 + }, + { + "epoch": 0.11237513873473919, + "grad_norm": 0.4146581292152405, + "learning_rate": 3.739612188365651e-06, + "loss": 0.5455, + "step": 135 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.3970547914505005, + "learning_rate": 3.767313019390582e-06, + "loss": 0.5341, + "step": 136 + }, + { + "epoch": 0.11403995560488346, + "grad_norm": 0.45214366912841797, + "learning_rate": 3.7950138504155126e-06, + "loss": 0.5509, + "step": 137 + }, + { + "epoch": 0.1148723640399556, + "grad_norm": 0.4272449314594269, + "learning_rate": 3.822714681440444e-06, + "loss": 0.503, + "step": 138 + }, + { + "epoch": 0.11570477247502775, + "grad_norm": 0.4434300661087036, + "learning_rate": 3.850415512465374e-06, + "loss": 0.5212, + "step": 139 + }, + { + "epoch": 0.11653718091009989, + "grad_norm": 0.4421811103820801, + "learning_rate": 3.878116343490305e-06, + "loss": 0.5389, + "step": 140 + }, + { + "epoch": 0.11736958934517203, + "grad_norm": 0.42131364345550537, + "learning_rate": 3.9058171745152355e-06, + "loss": 0.5443, + "step": 141 + }, + { + "epoch": 0.11820199778024418, + "grad_norm": 0.47368770837783813, + "learning_rate": 3.933518005540167e-06, + "loss": 0.5534, + "step": 142 + }, + { + "epoch": 0.11903440621531632, + "grad_norm": 0.4222336709499359, + "learning_rate": 3.961218836565098e-06, + "loss": 0.5484, + "step": 143 + }, + { + "epoch": 0.11986681465038845, + "grad_norm": 0.4217516779899597, + "learning_rate": 3.988919667590028e-06, + "loss": 0.5359, + "step": 144 + }, + { + "epoch": 0.1206992230854606, + "grad_norm": 0.46646854281425476, + "learning_rate": 4.016620498614959e-06, + "loss": 0.5608, + "step": 145 + }, + { + "epoch": 0.12153163152053274, + "grad_norm": 0.4836723506450653, + "learning_rate": 4.0443213296398895e-06, + "loss": 0.5471, + "step": 146 + }, + { + "epoch": 0.12236403995560488, + "grad_norm": 0.42951828241348267, + "learning_rate": 4.07202216066482e-06, + "loss": 0.5387, + "step": 147 + }, + { + "epoch": 0.12319644839067703, + "grad_norm": 0.4030103385448456, + "learning_rate": 4.099722991689752e-06, + "loss": 0.5426, + "step": 148 + }, + { + "epoch": 0.12402885682574917, + "grad_norm": 0.4542948305606842, + "learning_rate": 4.127423822714681e-06, + "loss": 0.5331, + "step": 149 + }, + { + "epoch": 0.12486126526082131, + "grad_norm": 0.481719046831131, + "learning_rate": 4.155124653739612e-06, + "loss": 0.5384, + "step": 150 + }, + { + "epoch": 0.12569367369589346, + "grad_norm": 0.4303570091724396, + "learning_rate": 4.1828254847645435e-06, + "loss": 0.5398, + "step": 151 + }, + { + "epoch": 0.12652608213096558, + "grad_norm": 0.4159233272075653, + "learning_rate": 4.210526315789474e-06, + "loss": 0.5373, + "step": 152 + }, + { + "epoch": 0.12735849056603774, + "grad_norm": 0.37868037819862366, + "learning_rate": 4.238227146814405e-06, + "loss": 0.5276, + "step": 153 + }, + { + "epoch": 0.1281908990011099, + "grad_norm": 0.4570467472076416, + "learning_rate": 4.265927977839335e-06, + "loss": 0.5514, + "step": 154 + }, + { + "epoch": 0.129023307436182, + "grad_norm": 0.39249682426452637, + "learning_rate": 4.293628808864266e-06, + "loss": 0.5566, + "step": 155 + }, + { + "epoch": 0.12985571587125416, + "grad_norm": 0.37955617904663086, + "learning_rate": 4.321329639889197e-06, + "loss": 0.5411, + "step": 156 + }, + { + "epoch": 0.13068812430632631, + "grad_norm": 0.42655855417251587, + "learning_rate": 4.349030470914128e-06, + "loss": 0.538, + "step": 157 + }, + { + "epoch": 0.13152053274139844, + "grad_norm": 0.41612133383750916, + "learning_rate": 4.376731301939059e-06, + "loss": 0.5628, + "step": 158 + }, + { + "epoch": 0.1323529411764706, + "grad_norm": 0.4542618989944458, + "learning_rate": 4.404432132963989e-06, + "loss": 0.5541, + "step": 159 + }, + { + "epoch": 0.13318534961154274, + "grad_norm": 0.4357217252254486, + "learning_rate": 4.43213296398892e-06, + "loss": 0.536, + "step": 160 + }, + { + "epoch": 0.13401775804661487, + "grad_norm": 0.4199989140033722, + "learning_rate": 4.459833795013851e-06, + "loss": 0.5207, + "step": 161 + }, + { + "epoch": 0.13485016648168702, + "grad_norm": 0.43055838346481323, + "learning_rate": 4.487534626038781e-06, + "loss": 0.5649, + "step": 162 + }, + { + "epoch": 0.13568257491675917, + "grad_norm": 0.4102376103401184, + "learning_rate": 4.515235457063713e-06, + "loss": 0.5379, + "step": 163 + }, + { + "epoch": 0.1365149833518313, + "grad_norm": 0.40180259943008423, + "learning_rate": 4.542936288088643e-06, + "loss": 0.5461, + "step": 164 + }, + { + "epoch": 0.13734739178690344, + "grad_norm": 0.4794843792915344, + "learning_rate": 4.570637119113574e-06, + "loss": 0.5279, + "step": 165 + }, + { + "epoch": 0.1381798002219756, + "grad_norm": 0.5056953430175781, + "learning_rate": 4.598337950138505e-06, + "loss": 0.5348, + "step": 166 + }, + { + "epoch": 0.13901220865704772, + "grad_norm": 0.45556968450546265, + "learning_rate": 4.626038781163435e-06, + "loss": 0.4995, + "step": 167 + }, + { + "epoch": 0.13984461709211987, + "grad_norm": 0.40867018699645996, + "learning_rate": 4.653739612188366e-06, + "loss": 0.5301, + "step": 168 + }, + { + "epoch": 0.140677025527192, + "grad_norm": 0.44523248076438904, + "learning_rate": 4.681440443213297e-06, + "loss": 0.5097, + "step": 169 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 0.44204166531562805, + "learning_rate": 4.709141274238227e-06, + "loss": 0.5348, + "step": 170 + }, + { + "epoch": 0.1423418423973363, + "grad_norm": 0.39262545108795166, + "learning_rate": 4.736842105263158e-06, + "loss": 0.5127, + "step": 171 + }, + { + "epoch": 0.14317425083240842, + "grad_norm": 0.4378338158130646, + "learning_rate": 4.764542936288089e-06, + "loss": 0.498, + "step": 172 + }, + { + "epoch": 0.14400665926748057, + "grad_norm": 0.4008118510246277, + "learning_rate": 4.79224376731302e-06, + "loss": 0.5389, + "step": 173 + }, + { + "epoch": 0.14483906770255273, + "grad_norm": 0.41733667254447937, + "learning_rate": 4.8199445983379505e-06, + "loss": 0.5216, + "step": 174 + }, + { + "epoch": 0.14567147613762485, + "grad_norm": 0.4296148121356964, + "learning_rate": 4.847645429362881e-06, + "loss": 0.5328, + "step": 175 + }, + { + "epoch": 0.146503884572697, + "grad_norm": 0.4301013648509979, + "learning_rate": 4.875346260387812e-06, + "loss": 0.5292, + "step": 176 + }, + { + "epoch": 0.14733629300776915, + "grad_norm": 0.46143457293510437, + "learning_rate": 4.903047091412742e-06, + "loss": 0.5336, + "step": 177 + }, + { + "epoch": 0.14816870144284128, + "grad_norm": 0.4127751886844635, + "learning_rate": 4.930747922437674e-06, + "loss": 0.5322, + "step": 178 + }, + { + "epoch": 0.14900110987791343, + "grad_norm": 0.4373955726623535, + "learning_rate": 4.9584487534626045e-06, + "loss": 0.5486, + "step": 179 + }, + { + "epoch": 0.14983351831298558, + "grad_norm": 0.4892722964286804, + "learning_rate": 4.986149584487535e-06, + "loss": 0.5416, + "step": 180 + }, + { + "epoch": 0.1506659267480577, + "grad_norm": 0.40266159176826477, + "learning_rate": 5.013850415512466e-06, + "loss": 0.5149, + "step": 181 + }, + { + "epoch": 0.15149833518312986, + "grad_norm": 0.4328942894935608, + "learning_rate": 5.041551246537396e-06, + "loss": 0.5364, + "step": 182 + }, + { + "epoch": 0.152330743618202, + "grad_norm": 0.46720901131629944, + "learning_rate": 5.069252077562328e-06, + "loss": 0.5481, + "step": 183 + }, + { + "epoch": 0.15316315205327413, + "grad_norm": 0.5006843209266663, + "learning_rate": 5.0969529085872585e-06, + "loss": 0.5599, + "step": 184 + }, + { + "epoch": 0.15399556048834628, + "grad_norm": 0.4212501347064972, + "learning_rate": 5.124653739612189e-06, + "loss": 0.5316, + "step": 185 + }, + { + "epoch": 0.15482796892341844, + "grad_norm": 0.4488828778266907, + "learning_rate": 5.152354570637119e-06, + "loss": 0.5163, + "step": 186 + }, + { + "epoch": 0.15566037735849056, + "grad_norm": 0.40712884068489075, + "learning_rate": 5.180055401662051e-06, + "loss": 0.529, + "step": 187 + }, + { + "epoch": 0.1564927857935627, + "grad_norm": 0.4387264847755432, + "learning_rate": 5.207756232686981e-06, + "loss": 0.5097, + "step": 188 + }, + { + "epoch": 0.15732519422863486, + "grad_norm": 0.4043707549571991, + "learning_rate": 5.235457063711912e-06, + "loss": 0.4989, + "step": 189 + }, + { + "epoch": 0.158157602663707, + "grad_norm": 0.4202433228492737, + "learning_rate": 5.263157894736842e-06, + "loss": 0.5577, + "step": 190 + }, + { + "epoch": 0.15899001109877914, + "grad_norm": 0.4169784188270569, + "learning_rate": 5.290858725761774e-06, + "loss": 0.5331, + "step": 191 + }, + { + "epoch": 0.1598224195338513, + "grad_norm": 0.42586076259613037, + "learning_rate": 5.318559556786704e-06, + "loss": 0.5323, + "step": 192 + }, + { + "epoch": 0.1606548279689234, + "grad_norm": 0.4811337888240814, + "learning_rate": 5.346260387811635e-06, + "loss": 0.5305, + "step": 193 + }, + { + "epoch": 0.16148723640399557, + "grad_norm": 0.4163426458835602, + "learning_rate": 5.373961218836566e-06, + "loss": 0.5038, + "step": 194 + }, + { + "epoch": 0.1623196448390677, + "grad_norm": 0.4817521572113037, + "learning_rate": 5.401662049861495e-06, + "loss": 0.5423, + "step": 195 + }, + { + "epoch": 0.16315205327413984, + "grad_norm": 0.49575290083885193, + "learning_rate": 5.429362880886428e-06, + "loss": 0.5233, + "step": 196 + }, + { + "epoch": 0.163984461709212, + "grad_norm": 0.4486011266708374, + "learning_rate": 5.4570637119113575e-06, + "loss": 0.5004, + "step": 197 + }, + { + "epoch": 0.16481687014428412, + "grad_norm": 0.4150060713291168, + "learning_rate": 5.484764542936288e-06, + "loss": 0.52, + "step": 198 + }, + { + "epoch": 0.16564927857935627, + "grad_norm": 0.4181516468524933, + "learning_rate": 5.512465373961219e-06, + "loss": 0.498, + "step": 199 + }, + { + "epoch": 0.16648168701442842, + "grad_norm": 0.5380068421363831, + "learning_rate": 5.54016620498615e-06, + "loss": 0.5458, + "step": 200 + }, + { + "epoch": 0.16731409544950054, + "grad_norm": 0.4043119549751282, + "learning_rate": 5.567867036011081e-06, + "loss": 0.5297, + "step": 201 + }, + { + "epoch": 0.1681465038845727, + "grad_norm": 0.481564462184906, + "learning_rate": 5.5955678670360115e-06, + "loss": 0.5275, + "step": 202 + }, + { + "epoch": 0.16897891231964485, + "grad_norm": 0.4668130874633789, + "learning_rate": 5.623268698060942e-06, + "loss": 0.5195, + "step": 203 + }, + { + "epoch": 0.16981132075471697, + "grad_norm": 0.4354064166545868, + "learning_rate": 5.6509695290858736e-06, + "loss": 0.5147, + "step": 204 + }, + { + "epoch": 0.17064372918978912, + "grad_norm": 0.45728379487991333, + "learning_rate": 5.678670360110804e-06, + "loss": 0.5487, + "step": 205 + }, + { + "epoch": 0.17147613762486127, + "grad_norm": 0.4586714804172516, + "learning_rate": 5.706371191135735e-06, + "loss": 0.5398, + "step": 206 + }, + { + "epoch": 0.1723085460599334, + "grad_norm": 0.4355411231517792, + "learning_rate": 5.734072022160665e-06, + "loss": 0.5143, + "step": 207 + }, + { + "epoch": 0.17314095449500555, + "grad_norm": 0.4318200647830963, + "learning_rate": 5.761772853185597e-06, + "loss": 0.5153, + "step": 208 + }, + { + "epoch": 0.1739733629300777, + "grad_norm": 0.45262500643730164, + "learning_rate": 5.789473684210527e-06, + "loss": 0.5367, + "step": 209 + }, + { + "epoch": 0.17480577136514983, + "grad_norm": 0.41200655698776245, + "learning_rate": 5.817174515235457e-06, + "loss": 0.4899, + "step": 210 + }, + { + "epoch": 0.17563817980022198, + "grad_norm": 0.4550832211971283, + "learning_rate": 5.844875346260388e-06, + "loss": 0.5312, + "step": 211 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.48159360885620117, + "learning_rate": 5.8725761772853194e-06, + "loss": 0.5105, + "step": 212 + }, + { + "epoch": 0.17730299667036625, + "grad_norm": 0.4670851230621338, + "learning_rate": 5.90027700831025e-06, + "loss": 0.5193, + "step": 213 + }, + { + "epoch": 0.1781354051054384, + "grad_norm": 0.4833730161190033, + "learning_rate": 5.927977839335181e-06, + "loss": 0.5374, + "step": 214 + }, + { + "epoch": 0.17896781354051056, + "grad_norm": 0.41837170720100403, + "learning_rate": 5.955678670360111e-06, + "loss": 0.549, + "step": 215 + }, + { + "epoch": 0.17980022197558268, + "grad_norm": 0.45265597105026245, + "learning_rate": 5.983379501385042e-06, + "loss": 0.5009, + "step": 216 + }, + { + "epoch": 0.18063263041065483, + "grad_norm": 0.4560681879520416, + "learning_rate": 6.011080332409973e-06, + "loss": 0.5201, + "step": 217 + }, + { + "epoch": 0.18146503884572698, + "grad_norm": 0.41413623094558716, + "learning_rate": 6.038781163434903e-06, + "loss": 0.5098, + "step": 218 + }, + { + "epoch": 0.1822974472807991, + "grad_norm": 0.45979252457618713, + "learning_rate": 6.066481994459834e-06, + "loss": 0.5127, + "step": 219 + }, + { + "epoch": 0.18312985571587126, + "grad_norm": 0.42055100202560425, + "learning_rate": 6.0941828254847645e-06, + "loss": 0.4953, + "step": 220 + }, + { + "epoch": 0.18396226415094338, + "grad_norm": 0.4109812378883362, + "learning_rate": 6.121883656509696e-06, + "loss": 0.5282, + "step": 221 + }, + { + "epoch": 0.18479467258601553, + "grad_norm": 0.405984103679657, + "learning_rate": 6.1495844875346266e-06, + "loss": 0.5089, + "step": 222 + }, + { + "epoch": 0.1856270810210877, + "grad_norm": 0.388094037771225, + "learning_rate": 6.177285318559557e-06, + "loss": 0.5212, + "step": 223 + }, + { + "epoch": 0.1864594894561598, + "grad_norm": 0.4275857210159302, + "learning_rate": 6.204986149584488e-06, + "loss": 0.5238, + "step": 224 + }, + { + "epoch": 0.18729189789123196, + "grad_norm": 0.4158404469490051, + "learning_rate": 6.232686980609419e-06, + "loss": 0.5246, + "step": 225 + }, + { + "epoch": 0.1881243063263041, + "grad_norm": 0.44389262795448303, + "learning_rate": 6.26038781163435e-06, + "loss": 0.5089, + "step": 226 + }, + { + "epoch": 0.18895671476137624, + "grad_norm": 0.4087418019771576, + "learning_rate": 6.2880886426592805e-06, + "loss": 0.5089, + "step": 227 + }, + { + "epoch": 0.1897891231964484, + "grad_norm": 0.5286365747451782, + "learning_rate": 6.31578947368421e-06, + "loss": 0.5551, + "step": 228 + }, + { + "epoch": 0.19062153163152054, + "grad_norm": 0.4365133047103882, + "learning_rate": 6.343490304709143e-06, + "loss": 0.511, + "step": 229 + }, + { + "epoch": 0.19145394006659266, + "grad_norm": 0.44955843687057495, + "learning_rate": 6.3711911357340724e-06, + "loss": 0.4946, + "step": 230 + }, + { + "epoch": 0.19228634850166482, + "grad_norm": 0.41679856181144714, + "learning_rate": 6.398891966759003e-06, + "loss": 0.5041, + "step": 231 + }, + { + "epoch": 0.19311875693673697, + "grad_norm": 0.4691244959831238, + "learning_rate": 6.426592797783934e-06, + "loss": 0.4901, + "step": 232 + }, + { + "epoch": 0.1939511653718091, + "grad_norm": 0.49247369170188904, + "learning_rate": 6.454293628808865e-06, + "loss": 0.5176, + "step": 233 + }, + { + "epoch": 0.19478357380688124, + "grad_norm": 0.4147561192512512, + "learning_rate": 6.481994459833796e-06, + "loss": 0.4995, + "step": 234 + }, + { + "epoch": 0.1956159822419534, + "grad_norm": 0.4889633059501648, + "learning_rate": 6.509695290858726e-06, + "loss": 0.5404, + "step": 235 + }, + { + "epoch": 0.19644839067702552, + "grad_norm": 0.5010858774185181, + "learning_rate": 6.537396121883657e-06, + "loss": 0.5358, + "step": 236 + }, + { + "epoch": 0.19728079911209767, + "grad_norm": 0.38324692845344543, + "learning_rate": 6.565096952908588e-06, + "loss": 0.4914, + "step": 237 + }, + { + "epoch": 0.19811320754716982, + "grad_norm": 0.489378958940506, + "learning_rate": 6.592797783933519e-06, + "loss": 0.4954, + "step": 238 + }, + { + "epoch": 0.19894561598224195, + "grad_norm": 0.4189784526824951, + "learning_rate": 6.62049861495845e-06, + "loss": 0.5062, + "step": 239 + }, + { + "epoch": 0.1997780244173141, + "grad_norm": 0.42447060346603394, + "learning_rate": 6.6481994459833796e-06, + "loss": 0.5128, + "step": 240 + }, + { + "epoch": 0.20061043285238625, + "grad_norm": 0.4346916079521179, + "learning_rate": 6.67590027700831e-06, + "loss": 0.5257, + "step": 241 + }, + { + "epoch": 0.20144284128745837, + "grad_norm": 0.4251374304294586, + "learning_rate": 6.703601108033242e-06, + "loss": 0.5267, + "step": 242 + }, + { + "epoch": 0.20227524972253053, + "grad_norm": 0.40433380007743835, + "learning_rate": 6.731301939058172e-06, + "loss": 0.4952, + "step": 243 + }, + { + "epoch": 0.20310765815760268, + "grad_norm": 0.44367408752441406, + "learning_rate": 6.759002770083103e-06, + "loss": 0.514, + "step": 244 + }, + { + "epoch": 0.2039400665926748, + "grad_norm": 0.45639634132385254, + "learning_rate": 6.7867036011080335e-06, + "loss": 0.5206, + "step": 245 + }, + { + "epoch": 0.20477247502774695, + "grad_norm": 0.4419868290424347, + "learning_rate": 6.814404432132965e-06, + "loss": 0.5412, + "step": 246 + }, + { + "epoch": 0.20560488346281908, + "grad_norm": 0.4839082658290863, + "learning_rate": 6.842105263157896e-06, + "loss": 0.5237, + "step": 247 + }, + { + "epoch": 0.20643729189789123, + "grad_norm": 0.42248091101646423, + "learning_rate": 6.869806094182826e-06, + "loss": 0.5242, + "step": 248 + }, + { + "epoch": 0.20726970033296338, + "grad_norm": 0.46019700169563293, + "learning_rate": 6.897506925207756e-06, + "loss": 0.4951, + "step": 249 + }, + { + "epoch": 0.2081021087680355, + "grad_norm": 0.5231823325157166, + "learning_rate": 6.925207756232688e-06, + "loss": 0.5459, + "step": 250 + }, + { + "epoch": 0.20893451720310766, + "grad_norm": 0.45339393615722656, + "learning_rate": 6.952908587257618e-06, + "loss": 0.5099, + "step": 251 + }, + { + "epoch": 0.2097669256381798, + "grad_norm": 0.4783773124217987, + "learning_rate": 6.980609418282549e-06, + "loss": 0.4946, + "step": 252 + }, + { + "epoch": 0.21059933407325193, + "grad_norm": 0.46553727984428406, + "learning_rate": 7.008310249307479e-06, + "loss": 0.482, + "step": 253 + }, + { + "epoch": 0.21143174250832408, + "grad_norm": 0.4713238775730133, + "learning_rate": 7.036011080332411e-06, + "loss": 0.4964, + "step": 254 + }, + { + "epoch": 0.21226415094339623, + "grad_norm": 0.48873159289360046, + "learning_rate": 7.0637119113573415e-06, + "loss": 0.5337, + "step": 255 + }, + { + "epoch": 0.21309655937846836, + "grad_norm": 0.45275193452835083, + "learning_rate": 7.091412742382272e-06, + "loss": 0.5168, + "step": 256 + }, + { + "epoch": 0.2139289678135405, + "grad_norm": 0.45386070013046265, + "learning_rate": 7.119113573407203e-06, + "loss": 0.5093, + "step": 257 + }, + { + "epoch": 0.21476137624861266, + "grad_norm": 0.46968305110931396, + "learning_rate": 7.146814404432133e-06, + "loss": 0.5128, + "step": 258 + }, + { + "epoch": 0.21559378468368479, + "grad_norm": 0.4741690754890442, + "learning_rate": 7.174515235457065e-06, + "loss": 0.5301, + "step": 259 + }, + { + "epoch": 0.21642619311875694, + "grad_norm": 0.48019328713417053, + "learning_rate": 7.2022160664819955e-06, + "loss": 0.4837, + "step": 260 + }, + { + "epoch": 0.2172586015538291, + "grad_norm": 0.424640029668808, + "learning_rate": 7.229916897506925e-06, + "loss": 0.4917, + "step": 261 + }, + { + "epoch": 0.2180910099889012, + "grad_norm": 0.5357044339179993, + "learning_rate": 7.257617728531856e-06, + "loss": 0.5159, + "step": 262 + }, + { + "epoch": 0.21892341842397336, + "grad_norm": 0.46231383085250854, + "learning_rate": 7.285318559556787e-06, + "loss": 0.5235, + "step": 263 + }, + { + "epoch": 0.21975582685904552, + "grad_norm": 0.4579526484012604, + "learning_rate": 7.313019390581718e-06, + "loss": 0.4905, + "step": 264 + }, + { + "epoch": 0.22058823529411764, + "grad_norm": 0.459525465965271, + "learning_rate": 7.340720221606649e-06, + "loss": 0.525, + "step": 265 + }, + { + "epoch": 0.2214206437291898, + "grad_norm": 0.5286341309547424, + "learning_rate": 7.368421052631579e-06, + "loss": 0.5093, + "step": 266 + }, + { + "epoch": 0.22225305216426194, + "grad_norm": 0.4147150218486786, + "learning_rate": 7.396121883656511e-06, + "loss": 0.4996, + "step": 267 + }, + { + "epoch": 0.22308546059933407, + "grad_norm": 0.4209958612918854, + "learning_rate": 7.423822714681441e-06, + "loss": 0.5076, + "step": 268 + }, + { + "epoch": 0.22391786903440622, + "grad_norm": 0.41367340087890625, + "learning_rate": 7.451523545706372e-06, + "loss": 0.5261, + "step": 269 + }, + { + "epoch": 0.22475027746947837, + "grad_norm": 0.4930833578109741, + "learning_rate": 7.479224376731302e-06, + "loss": 0.4987, + "step": 270 + }, + { + "epoch": 0.2255826859045505, + "grad_norm": 0.45146897435188293, + "learning_rate": 7.506925207756234e-06, + "loss": 0.4857, + "step": 271 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.4591994881629944, + "learning_rate": 7.534626038781164e-06, + "loss": 0.5007, + "step": 272 + }, + { + "epoch": 0.22724750277469477, + "grad_norm": 0.5006039142608643, + "learning_rate": 7.5623268698060945e-06, + "loss": 0.5174, + "step": 273 + }, + { + "epoch": 0.22807991120976692, + "grad_norm": 0.4668283462524414, + "learning_rate": 7.590027700831025e-06, + "loss": 0.5019, + "step": 274 + }, + { + "epoch": 0.22891231964483907, + "grad_norm": 0.5253505110740662, + "learning_rate": 7.617728531855957e-06, + "loss": 0.4974, + "step": 275 + }, + { + "epoch": 0.2297447280799112, + "grad_norm": 0.5322619676589966, + "learning_rate": 7.645429362880887e-06, + "loss": 0.5147, + "step": 276 + }, + { + "epoch": 0.23057713651498335, + "grad_norm": 0.44390517473220825, + "learning_rate": 7.673130193905818e-06, + "loss": 0.5246, + "step": 277 + }, + { + "epoch": 0.2314095449500555, + "grad_norm": 0.5119628310203552, + "learning_rate": 7.700831024930749e-06, + "loss": 0.4879, + "step": 278 + }, + { + "epoch": 0.23224195338512763, + "grad_norm": 0.4466327726840973, + "learning_rate": 7.728531855955679e-06, + "loss": 0.4999, + "step": 279 + }, + { + "epoch": 0.23307436182019978, + "grad_norm": 0.4270954728126526, + "learning_rate": 7.75623268698061e-06, + "loss": 0.4955, + "step": 280 + }, + { + "epoch": 0.23390677025527193, + "grad_norm": 0.5279788970947266, + "learning_rate": 7.78393351800554e-06, + "loss": 0.5138, + "step": 281 + }, + { + "epoch": 0.23473917869034405, + "grad_norm": 0.4631377160549164, + "learning_rate": 7.811634349030471e-06, + "loss": 0.4915, + "step": 282 + }, + { + "epoch": 0.2355715871254162, + "grad_norm": 0.509636402130127, + "learning_rate": 7.839335180055402e-06, + "loss": 0.5205, + "step": 283 + }, + { + "epoch": 0.23640399556048836, + "grad_norm": 0.42661571502685547, + "learning_rate": 7.867036011080334e-06, + "loss": 0.4931, + "step": 284 + }, + { + "epoch": 0.23723640399556048, + "grad_norm": 0.4809859097003937, + "learning_rate": 7.894736842105265e-06, + "loss": 0.514, + "step": 285 + }, + { + "epoch": 0.23806881243063263, + "grad_norm": 0.4813212752342224, + "learning_rate": 7.922437673130195e-06, + "loss": 0.5209, + "step": 286 + }, + { + "epoch": 0.23890122086570478, + "grad_norm": 0.4047999083995819, + "learning_rate": 7.950138504155124e-06, + "loss": 0.4689, + "step": 287 + }, + { + "epoch": 0.2397336293007769, + "grad_norm": 0.48839592933654785, + "learning_rate": 7.977839335180056e-06, + "loss": 0.495, + "step": 288 + }, + { + "epoch": 0.24056603773584906, + "grad_norm": 0.4675354063510895, + "learning_rate": 8.005540166204987e-06, + "loss": 0.5214, + "step": 289 + }, + { + "epoch": 0.2413984461709212, + "grad_norm": 0.5134713053703308, + "learning_rate": 8.033240997229918e-06, + "loss": 0.4755, + "step": 290 + }, + { + "epoch": 0.24223085460599333, + "grad_norm": 0.46296653151512146, + "learning_rate": 8.060941828254848e-06, + "loss": 0.5137, + "step": 291 + }, + { + "epoch": 0.24306326304106549, + "grad_norm": 0.43857628107070923, + "learning_rate": 8.088642659279779e-06, + "loss": 0.5045, + "step": 292 + }, + { + "epoch": 0.24389567147613764, + "grad_norm": 0.4630931615829468, + "learning_rate": 8.11634349030471e-06, + "loss": 0.5113, + "step": 293 + }, + { + "epoch": 0.24472807991120976, + "grad_norm": 0.4841103255748749, + "learning_rate": 8.14404432132964e-06, + "loss": 0.519, + "step": 294 + }, + { + "epoch": 0.2455604883462819, + "grad_norm": 0.4541115462779999, + "learning_rate": 8.171745152354571e-06, + "loss": 0.5125, + "step": 295 + }, + { + "epoch": 0.24639289678135406, + "grad_norm": 0.4210924208164215, + "learning_rate": 8.199445983379503e-06, + "loss": 0.5035, + "step": 296 + }, + { + "epoch": 0.2472253052164262, + "grad_norm": 0.41758114099502563, + "learning_rate": 8.227146814404434e-06, + "loss": 0.4851, + "step": 297 + }, + { + "epoch": 0.24805771365149834, + "grad_norm": 0.5241228938102722, + "learning_rate": 8.254847645429363e-06, + "loss": 0.5046, + "step": 298 + }, + { + "epoch": 0.24889012208657046, + "grad_norm": 0.5144554972648621, + "learning_rate": 8.282548476454293e-06, + "loss": 0.54, + "step": 299 + }, + { + "epoch": 0.24972253052164262, + "grad_norm": 0.5133737921714783, + "learning_rate": 8.310249307479224e-06, + "loss": 0.499, + "step": 300 + }, + { + "epoch": 0.25055493895671477, + "grad_norm": 0.5024670362472534, + "learning_rate": 8.337950138504156e-06, + "loss": 0.5042, + "step": 301 + }, + { + "epoch": 0.2513873473917869, + "grad_norm": 0.5267788767814636, + "learning_rate": 8.365650969529087e-06, + "loss": 0.5023, + "step": 302 + }, + { + "epoch": 0.25221975582685907, + "grad_norm": 0.43696895241737366, + "learning_rate": 8.393351800554018e-06, + "loss": 0.4858, + "step": 303 + }, + { + "epoch": 0.25305216426193117, + "grad_norm": 0.5760444402694702, + "learning_rate": 8.421052631578948e-06, + "loss": 0.5106, + "step": 304 + }, + { + "epoch": 0.2538845726970033, + "grad_norm": 0.4616737961769104, + "learning_rate": 8.448753462603879e-06, + "loss": 0.4843, + "step": 305 + }, + { + "epoch": 0.25471698113207547, + "grad_norm": 0.5310185551643372, + "learning_rate": 8.47645429362881e-06, + "loss": 0.5263, + "step": 306 + }, + { + "epoch": 0.2555493895671476, + "grad_norm": 0.4836473762989044, + "learning_rate": 8.50415512465374e-06, + "loss": 0.4889, + "step": 307 + }, + { + "epoch": 0.2563817980022198, + "grad_norm": 0.5024266839027405, + "learning_rate": 8.53185595567867e-06, + "loss": 0.491, + "step": 308 + }, + { + "epoch": 0.25721420643729187, + "grad_norm": 0.47080838680267334, + "learning_rate": 8.559556786703603e-06, + "loss": 0.467, + "step": 309 + }, + { + "epoch": 0.258046614872364, + "grad_norm": 0.44176098704338074, + "learning_rate": 8.587257617728532e-06, + "loss": 0.4699, + "step": 310 + }, + { + "epoch": 0.2588790233074362, + "grad_norm": 0.43928641080856323, + "learning_rate": 8.614958448753463e-06, + "loss": 0.5042, + "step": 311 + }, + { + "epoch": 0.2597114317425083, + "grad_norm": 0.4728699326515198, + "learning_rate": 8.642659279778393e-06, + "loss": 0.4923, + "step": 312 + }, + { + "epoch": 0.2605438401775805, + "grad_norm": 0.45453718304634094, + "learning_rate": 8.670360110803326e-06, + "loss": 0.4983, + "step": 313 + }, + { + "epoch": 0.26137624861265263, + "grad_norm": 0.46343910694122314, + "learning_rate": 8.698060941828256e-06, + "loss": 0.4985, + "step": 314 + }, + { + "epoch": 0.2622086570477247, + "grad_norm": 0.5328369140625, + "learning_rate": 8.725761772853187e-06, + "loss": 0.5047, + "step": 315 + }, + { + "epoch": 0.2630410654827969, + "grad_norm": 0.4438888728618622, + "learning_rate": 8.753462603878117e-06, + "loss": 0.5047, + "step": 316 + }, + { + "epoch": 0.26387347391786903, + "grad_norm": 0.5838459730148315, + "learning_rate": 8.781163434903048e-06, + "loss": 0.5239, + "step": 317 + }, + { + "epoch": 0.2647058823529412, + "grad_norm": 0.48084893822669983, + "learning_rate": 8.808864265927979e-06, + "loss": 0.5101, + "step": 318 + }, + { + "epoch": 0.26553829078801333, + "grad_norm": 0.3967612385749817, + "learning_rate": 8.83656509695291e-06, + "loss": 0.474, + "step": 319 + }, + { + "epoch": 0.2663706992230855, + "grad_norm": 0.49214819073677063, + "learning_rate": 8.86426592797784e-06, + "loss": 0.5064, + "step": 320 + }, + { + "epoch": 0.2672031076581576, + "grad_norm": 0.46390125155448914, + "learning_rate": 8.89196675900277e-06, + "loss": 0.4726, + "step": 321 + }, + { + "epoch": 0.26803551609322973, + "grad_norm": 0.43828120827674866, + "learning_rate": 8.919667590027701e-06, + "loss": 0.508, + "step": 322 + }, + { + "epoch": 0.2688679245283019, + "grad_norm": 0.4651317596435547, + "learning_rate": 8.947368421052632e-06, + "loss": 0.4844, + "step": 323 + }, + { + "epoch": 0.26970033296337403, + "grad_norm": 0.45498353242874146, + "learning_rate": 8.975069252077562e-06, + "loss": 0.4959, + "step": 324 + }, + { + "epoch": 0.2705327413984462, + "grad_norm": 0.5337496995925903, + "learning_rate": 9.002770083102493e-06, + "loss": 0.4892, + "step": 325 + }, + { + "epoch": 0.27136514983351834, + "grad_norm": 0.4983648657798767, + "learning_rate": 9.030470914127425e-06, + "loss": 0.5105, + "step": 326 + }, + { + "epoch": 0.27219755826859043, + "grad_norm": 0.4203820824623108, + "learning_rate": 9.058171745152356e-06, + "loss": 0.4674, + "step": 327 + }, + { + "epoch": 0.2730299667036626, + "grad_norm": 0.4564470052719116, + "learning_rate": 9.085872576177287e-06, + "loss": 0.4779, + "step": 328 + }, + { + "epoch": 0.27386237513873474, + "grad_norm": 0.48127999901771545, + "learning_rate": 9.113573407202216e-06, + "loss": 0.504, + "step": 329 + }, + { + "epoch": 0.2746947835738069, + "grad_norm": 0.41335824131965637, + "learning_rate": 9.141274238227148e-06, + "loss": 0.452, + "step": 330 + }, + { + "epoch": 0.27552719200887904, + "grad_norm": 0.48663002252578735, + "learning_rate": 9.168975069252079e-06, + "loss": 0.497, + "step": 331 + }, + { + "epoch": 0.2763596004439512, + "grad_norm": 0.469848096370697, + "learning_rate": 9.19667590027701e-06, + "loss": 0.4722, + "step": 332 + }, + { + "epoch": 0.2771920088790233, + "grad_norm": 0.5372227430343628, + "learning_rate": 9.22437673130194e-06, + "loss": 0.5136, + "step": 333 + }, + { + "epoch": 0.27802441731409544, + "grad_norm": 0.4874361753463745, + "learning_rate": 9.25207756232687e-06, + "loss": 0.508, + "step": 334 + }, + { + "epoch": 0.2788568257491676, + "grad_norm": 0.42719605565071106, + "learning_rate": 9.279778393351801e-06, + "loss": 0.4939, + "step": 335 + }, + { + "epoch": 0.27968923418423974, + "grad_norm": 0.4991985261440277, + "learning_rate": 9.307479224376732e-06, + "loss": 0.5072, + "step": 336 + }, + { + "epoch": 0.2805216426193119, + "grad_norm": 0.5013337135314941, + "learning_rate": 9.335180055401662e-06, + "loss": 0.4705, + "step": 337 + }, + { + "epoch": 0.281354051054384, + "grad_norm": 0.40710508823394775, + "learning_rate": 9.362880886426595e-06, + "loss": 0.4713, + "step": 338 + }, + { + "epoch": 0.28218645948945614, + "grad_norm": 0.4505155682563782, + "learning_rate": 9.390581717451525e-06, + "loss": 0.4912, + "step": 339 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 0.4827728569507599, + "learning_rate": 9.418282548476454e-06, + "loss": 0.5223, + "step": 340 + }, + { + "epoch": 0.28385127635960045, + "grad_norm": 0.5375002026557922, + "learning_rate": 9.445983379501385e-06, + "loss": 0.5245, + "step": 341 + }, + { + "epoch": 0.2846836847946726, + "grad_norm": 0.41850459575653076, + "learning_rate": 9.473684210526315e-06, + "loss": 0.5029, + "step": 342 + }, + { + "epoch": 0.28551609322974475, + "grad_norm": 0.508848249912262, + "learning_rate": 9.501385041551248e-06, + "loss": 0.4864, + "step": 343 + }, + { + "epoch": 0.28634850166481685, + "grad_norm": 0.5495002865791321, + "learning_rate": 9.529085872576178e-06, + "loss": 0.5147, + "step": 344 + }, + { + "epoch": 0.287180910099889, + "grad_norm": 0.49234721064567566, + "learning_rate": 9.556786703601109e-06, + "loss": 0.5126, + "step": 345 + }, + { + "epoch": 0.28801331853496115, + "grad_norm": 0.4997720718383789, + "learning_rate": 9.58448753462604e-06, + "loss": 0.5154, + "step": 346 + }, + { + "epoch": 0.2888457269700333, + "grad_norm": 0.5731498599052429, + "learning_rate": 9.61218836565097e-06, + "loss": 0.5084, + "step": 347 + }, + { + "epoch": 0.28967813540510545, + "grad_norm": 0.5639533400535583, + "learning_rate": 9.639889196675901e-06, + "loss": 0.458, + "step": 348 + }, + { + "epoch": 0.2905105438401776, + "grad_norm": 0.5300630927085876, + "learning_rate": 9.667590027700832e-06, + "loss": 0.499, + "step": 349 + }, + { + "epoch": 0.2913429522752497, + "grad_norm": 0.5944730639457703, + "learning_rate": 9.695290858725762e-06, + "loss": 0.5156, + "step": 350 + }, + { + "epoch": 0.29217536071032185, + "grad_norm": 0.4813040494918823, + "learning_rate": 9.722991689750695e-06, + "loss": 0.492, + "step": 351 + }, + { + "epoch": 0.293007769145394, + "grad_norm": 0.4312509000301361, + "learning_rate": 9.750692520775623e-06, + "loss": 0.4801, + "step": 352 + }, + { + "epoch": 0.29384017758046616, + "grad_norm": 0.474255234003067, + "learning_rate": 9.778393351800554e-06, + "loss": 0.4755, + "step": 353 + }, + { + "epoch": 0.2946725860155383, + "grad_norm": 0.48329421877861023, + "learning_rate": 9.806094182825485e-06, + "loss": 0.4974, + "step": 354 + }, + { + "epoch": 0.29550499445061046, + "grad_norm": 0.47372984886169434, + "learning_rate": 9.833795013850417e-06, + "loss": 0.5232, + "step": 355 + }, + { + "epoch": 0.29633740288568255, + "grad_norm": 0.4526323080062866, + "learning_rate": 9.861495844875348e-06, + "loss": 0.4792, + "step": 356 + }, + { + "epoch": 0.2971698113207547, + "grad_norm": 0.5251845121383667, + "learning_rate": 9.889196675900278e-06, + "loss": 0.4866, + "step": 357 + }, + { + "epoch": 0.29800221975582686, + "grad_norm": 0.4455892741680145, + "learning_rate": 9.916897506925209e-06, + "loss": 0.4904, + "step": 358 + }, + { + "epoch": 0.298834628190899, + "grad_norm": 0.46031251549720764, + "learning_rate": 9.94459833795014e-06, + "loss": 0.4744, + "step": 359 + }, + { + "epoch": 0.29966703662597116, + "grad_norm": 0.4524519443511963, + "learning_rate": 9.97229916897507e-06, + "loss": 0.4958, + "step": 360 + }, + { + "epoch": 0.30049944506104326, + "grad_norm": 0.4535054862499237, + "learning_rate": 1e-05, + "loss": 0.4906, + "step": 361 + }, + { + "epoch": 0.3013318534961154, + "grad_norm": 0.4776564836502075, + "learning_rate": 9.999997652456228e-06, + "loss": 0.5017, + "step": 362 + }, + { + "epoch": 0.30216426193118756, + "grad_norm": 0.5730841159820557, + "learning_rate": 9.999990609827113e-06, + "loss": 0.4679, + "step": 363 + }, + { + "epoch": 0.3029966703662597, + "grad_norm": 0.4576025903224945, + "learning_rate": 9.999978872119267e-06, + "loss": 0.4968, + "step": 364 + }, + { + "epoch": 0.30382907880133186, + "grad_norm": 0.6305115818977356, + "learning_rate": 9.999962439343715e-06, + "loss": 0.4939, + "step": 365 + }, + { + "epoch": 0.304661487236404, + "grad_norm": 0.4445456564426422, + "learning_rate": 9.999941311515888e-06, + "loss": 0.4895, + "step": 366 + }, + { + "epoch": 0.3054938956714761, + "grad_norm": 0.535879909992218, + "learning_rate": 9.999915488655623e-06, + "loss": 0.4841, + "step": 367 + }, + { + "epoch": 0.30632630410654826, + "grad_norm": 0.5554977059364319, + "learning_rate": 9.999884970787168e-06, + "loss": 0.5098, + "step": 368 + }, + { + "epoch": 0.3071587125416204, + "grad_norm": 0.5069095492362976, + "learning_rate": 9.999849757939182e-06, + "loss": 0.4879, + "step": 369 + }, + { + "epoch": 0.30799112097669257, + "grad_norm": 0.48924490809440613, + "learning_rate": 9.99980985014473e-06, + "loss": 0.489, + "step": 370 + }, + { + "epoch": 0.3088235294117647, + "grad_norm": 0.5114119052886963, + "learning_rate": 9.999765247441285e-06, + "loss": 0.4927, + "step": 371 + }, + { + "epoch": 0.30965593784683687, + "grad_norm": 0.6173185706138611, + "learning_rate": 9.999715949870729e-06, + "loss": 0.5053, + "step": 372 + }, + { + "epoch": 0.31048834628190897, + "grad_norm": 0.4988461136817932, + "learning_rate": 9.999661957479354e-06, + "loss": 0.4579, + "step": 373 + }, + { + "epoch": 0.3113207547169811, + "grad_norm": 0.5641622543334961, + "learning_rate": 9.999603270317863e-06, + "loss": 0.5126, + "step": 374 + }, + { + "epoch": 0.31215316315205327, + "grad_norm": 0.5810229778289795, + "learning_rate": 9.99953988844136e-06, + "loss": 0.4952, + "step": 375 + }, + { + "epoch": 0.3129855715871254, + "grad_norm": 0.6129553914070129, + "learning_rate": 9.999471811909363e-06, + "loss": 0.5012, + "step": 376 + }, + { + "epoch": 0.3138179800221976, + "grad_norm": 0.5977503061294556, + "learning_rate": 9.999399040785797e-06, + "loss": 0.492, + "step": 377 + }, + { + "epoch": 0.3146503884572697, + "grad_norm": 0.6007142663002014, + "learning_rate": 9.999321575138997e-06, + "loss": 0.501, + "step": 378 + }, + { + "epoch": 0.3154827968923418, + "grad_norm": 0.6089385747909546, + "learning_rate": 9.999239415041701e-06, + "loss": 0.4894, + "step": 379 + }, + { + "epoch": 0.316315205327414, + "grad_norm": 0.6288129091262817, + "learning_rate": 9.999152560571064e-06, + "loss": 0.4886, + "step": 380 + }, + { + "epoch": 0.3171476137624861, + "grad_norm": 0.6314293146133423, + "learning_rate": 9.99906101180864e-06, + "loss": 0.5023, + "step": 381 + }, + { + "epoch": 0.3179800221975583, + "grad_norm": 0.6294896006584167, + "learning_rate": 9.998964768840393e-06, + "loss": 0.495, + "step": 382 + }, + { + "epoch": 0.31881243063263043, + "grad_norm": 0.5049759745597839, + "learning_rate": 9.998863831756702e-06, + "loss": 0.5074, + "step": 383 + }, + { + "epoch": 0.3196448390677026, + "grad_norm": 0.5267007350921631, + "learning_rate": 9.998758200652346e-06, + "loss": 0.4751, + "step": 384 + }, + { + "epoch": 0.3204772475027747, + "grad_norm": 0.5030413866043091, + "learning_rate": 9.998647875626514e-06, + "loss": 0.5, + "step": 385 + }, + { + "epoch": 0.3213096559378468, + "grad_norm": 0.49688029289245605, + "learning_rate": 9.998532856782805e-06, + "loss": 0.4716, + "step": 386 + }, + { + "epoch": 0.322142064372919, + "grad_norm": 0.4237273335456848, + "learning_rate": 9.998413144229224e-06, + "loss": 0.4842, + "step": 387 + }, + { + "epoch": 0.32297447280799113, + "grad_norm": 0.5279334783554077, + "learning_rate": 9.998288738078179e-06, + "loss": 0.4842, + "step": 388 + }, + { + "epoch": 0.3238068812430633, + "grad_norm": 0.43567541241645813, + "learning_rate": 9.998159638446495e-06, + "loss": 0.4772, + "step": 389 + }, + { + "epoch": 0.3246392896781354, + "grad_norm": 0.4781498610973358, + "learning_rate": 9.998025845455394e-06, + "loss": 0.4918, + "step": 390 + }, + { + "epoch": 0.32547169811320753, + "grad_norm": 0.5459445118904114, + "learning_rate": 9.99788735923051e-06, + "loss": 0.4865, + "step": 391 + }, + { + "epoch": 0.3263041065482797, + "grad_norm": 0.4947822391986847, + "learning_rate": 9.997744179901891e-06, + "loss": 0.4784, + "step": 392 + }, + { + "epoch": 0.32713651498335183, + "grad_norm": 0.5778188109397888, + "learning_rate": 9.997596307603979e-06, + "loss": 0.5046, + "step": 393 + }, + { + "epoch": 0.327968923418424, + "grad_norm": 0.46349483728408813, + "learning_rate": 9.997443742475628e-06, + "loss": 0.5002, + "step": 394 + }, + { + "epoch": 0.32880133185349614, + "grad_norm": 0.41382893919944763, + "learning_rate": 9.997286484660101e-06, + "loss": 0.5169, + "step": 395 + }, + { + "epoch": 0.32963374028856823, + "grad_norm": 0.5306146740913391, + "learning_rate": 9.997124534305065e-06, + "loss": 0.5053, + "step": 396 + }, + { + "epoch": 0.3304661487236404, + "grad_norm": 0.4569046199321747, + "learning_rate": 9.996957891562598e-06, + "loss": 0.5099, + "step": 397 + }, + { + "epoch": 0.33129855715871254, + "grad_norm": 0.440790057182312, + "learning_rate": 9.996786556589175e-06, + "loss": 0.5094, + "step": 398 + }, + { + "epoch": 0.3321309655937847, + "grad_norm": 0.5211203694343567, + "learning_rate": 9.996610529545685e-06, + "loss": 0.4936, + "step": 399 + }, + { + "epoch": 0.33296337402885684, + "grad_norm": 0.4785847067832947, + "learning_rate": 9.996429810597421e-06, + "loss": 0.4808, + "step": 400 + }, + { + "epoch": 0.333795782463929, + "grad_norm": 0.5769135355949402, + "learning_rate": 9.996244399914083e-06, + "loss": 0.5027, + "step": 401 + }, + { + "epoch": 0.3346281908990011, + "grad_norm": 0.49235013127326965, + "learning_rate": 9.99605429766977e-06, + "loss": 0.4918, + "step": 402 + }, + { + "epoch": 0.33546059933407324, + "grad_norm": 0.559281587600708, + "learning_rate": 9.995859504042994e-06, + "loss": 0.5126, + "step": 403 + }, + { + "epoch": 0.3362930077691454, + "grad_norm": 0.4233754873275757, + "learning_rate": 9.99566001921667e-06, + "loss": 0.4942, + "step": 404 + }, + { + "epoch": 0.33712541620421754, + "grad_norm": 0.5626436471939087, + "learning_rate": 9.995455843378118e-06, + "loss": 0.4992, + "step": 405 + }, + { + "epoch": 0.3379578246392897, + "grad_norm": 0.4755416810512543, + "learning_rate": 9.995246976719063e-06, + "loss": 0.497, + "step": 406 + }, + { + "epoch": 0.33879023307436185, + "grad_norm": 0.5172478556632996, + "learning_rate": 9.995033419435632e-06, + "loss": 0.4947, + "step": 407 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.5225632786750793, + "learning_rate": 9.994815171728362e-06, + "loss": 0.5041, + "step": 408 + }, + { + "epoch": 0.3404550499445061, + "grad_norm": 0.5508219003677368, + "learning_rate": 9.994592233802189e-06, + "loss": 0.4589, + "step": 409 + }, + { + "epoch": 0.34128745837957825, + "grad_norm": 0.5616341233253479, + "learning_rate": 9.994364605866455e-06, + "loss": 0.4893, + "step": 410 + }, + { + "epoch": 0.3421198668146504, + "grad_norm": 0.5048859119415283, + "learning_rate": 9.99413228813491e-06, + "loss": 0.4887, + "step": 411 + }, + { + "epoch": 0.34295227524972255, + "grad_norm": 0.5626086592674255, + "learning_rate": 9.993895280825702e-06, + "loss": 0.4853, + "step": 412 + }, + { + "epoch": 0.34378468368479465, + "grad_norm": 0.46970364451408386, + "learning_rate": 9.993653584161387e-06, + "loss": 0.4965, + "step": 413 + }, + { + "epoch": 0.3446170921198668, + "grad_norm": 0.6829771995544434, + "learning_rate": 9.993407198368918e-06, + "loss": 0.4942, + "step": 414 + }, + { + "epoch": 0.34544950055493895, + "grad_norm": 0.5054067373275757, + "learning_rate": 9.993156123679662e-06, + "loss": 0.4761, + "step": 415 + }, + { + "epoch": 0.3462819089900111, + "grad_norm": 0.6721978783607483, + "learning_rate": 9.992900360329376e-06, + "loss": 0.4938, + "step": 416 + }, + { + "epoch": 0.34711431742508325, + "grad_norm": 0.4733940362930298, + "learning_rate": 9.992639908558232e-06, + "loss": 0.4801, + "step": 417 + }, + { + "epoch": 0.3479467258601554, + "grad_norm": 0.5857818722724915, + "learning_rate": 9.992374768610795e-06, + "loss": 0.4631, + "step": 418 + }, + { + "epoch": 0.3487791342952275, + "grad_norm": 0.4833131432533264, + "learning_rate": 9.992104940736038e-06, + "loss": 0.469, + "step": 419 + }, + { + "epoch": 0.34961154273029965, + "grad_norm": 0.5365222692489624, + "learning_rate": 9.991830425187333e-06, + "loss": 0.4771, + "step": 420 + }, + { + "epoch": 0.3504439511653718, + "grad_norm": 0.4678042531013489, + "learning_rate": 9.991551222222455e-06, + "loss": 0.4632, + "step": 421 + }, + { + "epoch": 0.35127635960044395, + "grad_norm": 0.4540461599826813, + "learning_rate": 9.99126733210358e-06, + "loss": 0.4764, + "step": 422 + }, + { + "epoch": 0.3521087680355161, + "grad_norm": 0.47178414463996887, + "learning_rate": 9.990978755097287e-06, + "loss": 0.4711, + "step": 423 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.4962081015110016, + "learning_rate": 9.990685491474555e-06, + "loss": 0.5143, + "step": 424 + }, + { + "epoch": 0.35377358490566035, + "grad_norm": 0.42502203583717346, + "learning_rate": 9.990387541510761e-06, + "loss": 0.4839, + "step": 425 + }, + { + "epoch": 0.3546059933407325, + "grad_norm": 0.4943620264530182, + "learning_rate": 9.990084905485689e-06, + "loss": 0.4881, + "step": 426 + }, + { + "epoch": 0.35543840177580466, + "grad_norm": 0.4736219644546509, + "learning_rate": 9.989777583683517e-06, + "loss": 0.4774, + "step": 427 + }, + { + "epoch": 0.3562708102108768, + "grad_norm": 0.5000523924827576, + "learning_rate": 9.989465576392828e-06, + "loss": 0.5023, + "step": 428 + }, + { + "epoch": 0.35710321864594896, + "grad_norm": 0.5680121183395386, + "learning_rate": 9.989148883906599e-06, + "loss": 0.4816, + "step": 429 + }, + { + "epoch": 0.3579356270810211, + "grad_norm": 0.4986157715320587, + "learning_rate": 9.988827506522211e-06, + "loss": 0.4624, + "step": 430 + }, + { + "epoch": 0.3587680355160932, + "grad_norm": 0.48457589745521545, + "learning_rate": 9.988501444541445e-06, + "loss": 0.4946, + "step": 431 + }, + { + "epoch": 0.35960044395116536, + "grad_norm": 0.5403779149055481, + "learning_rate": 9.988170698270477e-06, + "loss": 0.4924, + "step": 432 + }, + { + "epoch": 0.3604328523862375, + "grad_norm": 0.388967901468277, + "learning_rate": 9.987835268019883e-06, + "loss": 0.4922, + "step": 433 + }, + { + "epoch": 0.36126526082130966, + "grad_norm": 0.459904283285141, + "learning_rate": 9.98749515410464e-06, + "loss": 0.4948, + "step": 434 + }, + { + "epoch": 0.3620976692563818, + "grad_norm": 0.46121054887771606, + "learning_rate": 9.987150356844118e-06, + "loss": 0.4833, + "step": 435 + }, + { + "epoch": 0.36293007769145397, + "grad_norm": 0.46013152599334717, + "learning_rate": 9.98680087656209e-06, + "loss": 0.4854, + "step": 436 + }, + { + "epoch": 0.36376248612652606, + "grad_norm": 0.5025128126144409, + "learning_rate": 9.986446713586724e-06, + "loss": 0.4917, + "step": 437 + }, + { + "epoch": 0.3645948945615982, + "grad_norm": 0.4966227412223816, + "learning_rate": 9.986087868250584e-06, + "loss": 0.4776, + "step": 438 + }, + { + "epoch": 0.36542730299667037, + "grad_norm": 0.45798709988594055, + "learning_rate": 9.985724340890633e-06, + "loss": 0.4892, + "step": 439 + }, + { + "epoch": 0.3662597114317425, + "grad_norm": 0.4821065068244934, + "learning_rate": 9.98535613184823e-06, + "loss": 0.4889, + "step": 440 + }, + { + "epoch": 0.36709211986681467, + "grad_norm": 0.4548647999763489, + "learning_rate": 9.984983241469129e-06, + "loss": 0.4701, + "step": 441 + }, + { + "epoch": 0.36792452830188677, + "grad_norm": 0.43688225746154785, + "learning_rate": 9.984605670103478e-06, + "loss": 0.5215, + "step": 442 + }, + { + "epoch": 0.3687569367369589, + "grad_norm": 0.549662709236145, + "learning_rate": 9.98422341810583e-06, + "loss": 0.4871, + "step": 443 + }, + { + "epoch": 0.36958934517203107, + "grad_norm": 0.40335264801979065, + "learning_rate": 9.98383648583512e-06, + "loss": 0.4678, + "step": 444 + }, + { + "epoch": 0.3704217536071032, + "grad_norm": 0.5139138102531433, + "learning_rate": 9.983444873654683e-06, + "loss": 0.4765, + "step": 445 + }, + { + "epoch": 0.3712541620421754, + "grad_norm": 0.5220227837562561, + "learning_rate": 9.983048581932257e-06, + "loss": 0.4893, + "step": 446 + }, + { + "epoch": 0.3720865704772475, + "grad_norm": 0.47806721925735474, + "learning_rate": 9.982647611039961e-06, + "loss": 0.4884, + "step": 447 + }, + { + "epoch": 0.3729189789123196, + "grad_norm": 0.49555501341819763, + "learning_rate": 9.982241961354317e-06, + "loss": 0.4835, + "step": 448 + }, + { + "epoch": 0.3737513873473918, + "grad_norm": 0.4484332203865051, + "learning_rate": 9.981831633256236e-06, + "loss": 0.4983, + "step": 449 + }, + { + "epoch": 0.3745837957824639, + "grad_norm": 0.41702842712402344, + "learning_rate": 9.981416627131022e-06, + "loss": 0.5161, + "step": 450 + }, + { + "epoch": 0.3754162042175361, + "grad_norm": 0.43207862973213196, + "learning_rate": 9.980996943368373e-06, + "loss": 0.5019, + "step": 451 + }, + { + "epoch": 0.3762486126526082, + "grad_norm": 0.460387647151947, + "learning_rate": 9.98057258236238e-06, + "loss": 0.4698, + "step": 452 + }, + { + "epoch": 0.3770810210876804, + "grad_norm": 0.4922404885292053, + "learning_rate": 9.980143544511527e-06, + "loss": 0.4838, + "step": 453 + }, + { + "epoch": 0.3779134295227525, + "grad_norm": 0.4728861153125763, + "learning_rate": 9.979709830218688e-06, + "loss": 0.4578, + "step": 454 + }, + { + "epoch": 0.3787458379578246, + "grad_norm": 0.5862690210342407, + "learning_rate": 9.979271439891125e-06, + "loss": 0.4802, + "step": 455 + }, + { + "epoch": 0.3795782463928968, + "grad_norm": 0.4514050781726837, + "learning_rate": 9.978828373940498e-06, + "loss": 0.4834, + "step": 456 + }, + { + "epoch": 0.38041065482796893, + "grad_norm": 0.5906931757926941, + "learning_rate": 9.97838063278285e-06, + "loss": 0.4621, + "step": 457 + }, + { + "epoch": 0.3812430632630411, + "grad_norm": 0.4813970625400543, + "learning_rate": 9.977928216838622e-06, + "loss": 0.4894, + "step": 458 + }, + { + "epoch": 0.38207547169811323, + "grad_norm": 0.5167549848556519, + "learning_rate": 9.977471126532636e-06, + "loss": 0.5006, + "step": 459 + }, + { + "epoch": 0.38290788013318533, + "grad_norm": 0.4821680188179016, + "learning_rate": 9.97700936229411e-06, + "loss": 0.4993, + "step": 460 + }, + { + "epoch": 0.3837402885682575, + "grad_norm": 0.4512309432029724, + "learning_rate": 9.976542924556652e-06, + "loss": 0.487, + "step": 461 + }, + { + "epoch": 0.38457269700332963, + "grad_norm": 0.4451935887336731, + "learning_rate": 9.976071813758249e-06, + "loss": 0.4546, + "step": 462 + }, + { + "epoch": 0.3854051054384018, + "grad_norm": 0.4715465307235718, + "learning_rate": 9.975596030341287e-06, + "loss": 0.4891, + "step": 463 + }, + { + "epoch": 0.38623751387347394, + "grad_norm": 0.39830783009529114, + "learning_rate": 9.975115574752532e-06, + "loss": 0.4996, + "step": 464 + }, + { + "epoch": 0.38706992230854603, + "grad_norm": 0.41202181577682495, + "learning_rate": 9.974630447443142e-06, + "loss": 0.4827, + "step": 465 + }, + { + "epoch": 0.3879023307436182, + "grad_norm": 0.42218315601348877, + "learning_rate": 9.974140648868659e-06, + "loss": 0.4635, + "step": 466 + }, + { + "epoch": 0.38873473917869034, + "grad_norm": 0.4501170516014099, + "learning_rate": 9.973646179489014e-06, + "loss": 0.4941, + "step": 467 + }, + { + "epoch": 0.3895671476137625, + "grad_norm": 0.44711339473724365, + "learning_rate": 9.97314703976852e-06, + "loss": 0.4581, + "step": 468 + }, + { + "epoch": 0.39039955604883464, + "grad_norm": 0.486054927110672, + "learning_rate": 9.97264323017588e-06, + "loss": 0.4863, + "step": 469 + }, + { + "epoch": 0.3912319644839068, + "grad_norm": 0.4822620153427124, + "learning_rate": 9.97213475118418e-06, + "loss": 0.5206, + "step": 470 + }, + { + "epoch": 0.3920643729189789, + "grad_norm": 0.40667545795440674, + "learning_rate": 9.971621603270887e-06, + "loss": 0.5174, + "step": 471 + }, + { + "epoch": 0.39289678135405104, + "grad_norm": 0.46628639101982117, + "learning_rate": 9.971103786917862e-06, + "loss": 0.4741, + "step": 472 + }, + { + "epoch": 0.3937291897891232, + "grad_norm": 0.40411314368247986, + "learning_rate": 9.97058130261134e-06, + "loss": 0.4768, + "step": 473 + }, + { + "epoch": 0.39456159822419534, + "grad_norm": 0.4988131523132324, + "learning_rate": 9.970054150841942e-06, + "loss": 0.4741, + "step": 474 + }, + { + "epoch": 0.3953940066592675, + "grad_norm": 0.42282071709632874, + "learning_rate": 9.969522332104675e-06, + "loss": 0.4761, + "step": 475 + }, + { + "epoch": 0.39622641509433965, + "grad_norm": 0.4693518877029419, + "learning_rate": 9.968985846898924e-06, + "loss": 0.4808, + "step": 476 + }, + { + "epoch": 0.39705882352941174, + "grad_norm": 0.5171592235565186, + "learning_rate": 9.968444695728461e-06, + "loss": 0.4797, + "step": 477 + }, + { + "epoch": 0.3978912319644839, + "grad_norm": 0.4162120819091797, + "learning_rate": 9.967898879101434e-06, + "loss": 0.5033, + "step": 478 + }, + { + "epoch": 0.39872364039955605, + "grad_norm": 0.5173420906066895, + "learning_rate": 9.967348397530373e-06, + "loss": 0.4704, + "step": 479 + }, + { + "epoch": 0.3995560488346282, + "grad_norm": 0.48679623007774353, + "learning_rate": 9.966793251532197e-06, + "loss": 0.4607, + "step": 480 + }, + { + "epoch": 0.40038845726970035, + "grad_norm": 0.4463588297367096, + "learning_rate": 9.966233441628188e-06, + "loss": 0.4836, + "step": 481 + }, + { + "epoch": 0.4012208657047725, + "grad_norm": 0.5191836357116699, + "learning_rate": 9.965668968344023e-06, + "loss": 0.4966, + "step": 482 + }, + { + "epoch": 0.4020532741398446, + "grad_norm": 0.4401721656322479, + "learning_rate": 9.965099832209753e-06, + "loss": 0.4864, + "step": 483 + }, + { + "epoch": 0.40288568257491675, + "grad_norm": 0.5379999876022339, + "learning_rate": 9.964526033759803e-06, + "loss": 0.4431, + "step": 484 + }, + { + "epoch": 0.4037180910099889, + "grad_norm": 0.46185803413391113, + "learning_rate": 9.963947573532983e-06, + "loss": 0.4894, + "step": 485 + }, + { + "epoch": 0.40455049944506105, + "grad_norm": 0.45974308252334595, + "learning_rate": 9.963364452072475e-06, + "loss": 0.4609, + "step": 486 + }, + { + "epoch": 0.4053829078801332, + "grad_norm": 0.46747586131095886, + "learning_rate": 9.962776669925842e-06, + "loss": 0.4949, + "step": 487 + }, + { + "epoch": 0.40621531631520535, + "grad_norm": 0.45227017998695374, + "learning_rate": 9.962184227645021e-06, + "loss": 0.5051, + "step": 488 + }, + { + "epoch": 0.40704772475027745, + "grad_norm": 0.46203479170799255, + "learning_rate": 9.961587125786328e-06, + "loss": 0.476, + "step": 489 + }, + { + "epoch": 0.4078801331853496, + "grad_norm": 0.4355171024799347, + "learning_rate": 9.960985364910448e-06, + "loss": 0.4809, + "step": 490 + }, + { + "epoch": 0.40871254162042175, + "grad_norm": 0.47708234190940857, + "learning_rate": 9.960378945582446e-06, + "loss": 0.4802, + "step": 491 + }, + { + "epoch": 0.4095449500554939, + "grad_norm": 0.46055299043655396, + "learning_rate": 9.959767868371761e-06, + "loss": 0.479, + "step": 492 + }, + { + "epoch": 0.41037735849056606, + "grad_norm": 0.502031147480011, + "learning_rate": 9.959152133852209e-06, + "loss": 0.4968, + "step": 493 + }, + { + "epoch": 0.41120976692563815, + "grad_norm": 0.48754680156707764, + "learning_rate": 9.958531742601968e-06, + "loss": 0.4791, + "step": 494 + }, + { + "epoch": 0.4120421753607103, + "grad_norm": 0.4217134416103363, + "learning_rate": 9.9579066952036e-06, + "loss": 0.4927, + "step": 495 + }, + { + "epoch": 0.41287458379578246, + "grad_norm": 0.474809467792511, + "learning_rate": 9.957276992244039e-06, + "loss": 0.491, + "step": 496 + }, + { + "epoch": 0.4137069922308546, + "grad_norm": 0.48426496982574463, + "learning_rate": 9.956642634314582e-06, + "loss": 0.4944, + "step": 497 + }, + { + "epoch": 0.41453940066592676, + "grad_norm": 0.4359665811061859, + "learning_rate": 9.956003622010904e-06, + "loss": 0.4695, + "step": 498 + }, + { + "epoch": 0.4153718091009989, + "grad_norm": 0.4425244927406311, + "learning_rate": 9.955359955933048e-06, + "loss": 0.49, + "step": 499 + }, + { + "epoch": 0.416204217536071, + "grad_norm": 0.5776862502098083, + "learning_rate": 9.95471163668543e-06, + "loss": 0.5233, + "step": 500 + }, + { + "epoch": 0.41703662597114316, + "grad_norm": 0.46279191970825195, + "learning_rate": 9.954058664876832e-06, + "loss": 0.4912, + "step": 501 + }, + { + "epoch": 0.4178690344062153, + "grad_norm": 0.5074960589408875, + "learning_rate": 9.953401041120403e-06, + "loss": 0.4979, + "step": 502 + }, + { + "epoch": 0.41870144284128746, + "grad_norm": 0.51701819896698, + "learning_rate": 9.952738766033668e-06, + "loss": 0.4649, + "step": 503 + }, + { + "epoch": 0.4195338512763596, + "grad_norm": 0.424464613199234, + "learning_rate": 9.952071840238511e-06, + "loss": 0.4885, + "step": 504 + }, + { + "epoch": 0.42036625971143177, + "grad_norm": 0.45139414072036743, + "learning_rate": 9.951400264361188e-06, + "loss": 0.4702, + "step": 505 + }, + { + "epoch": 0.42119866814650386, + "grad_norm": 0.46182501316070557, + "learning_rate": 9.950724039032324e-06, + "loss": 0.4643, + "step": 506 + }, + { + "epoch": 0.422031076581576, + "grad_norm": 0.43030858039855957, + "learning_rate": 9.950043164886902e-06, + "loss": 0.4903, + "step": 507 + }, + { + "epoch": 0.42286348501664817, + "grad_norm": 0.4888497292995453, + "learning_rate": 9.949357642564275e-06, + "loss": 0.5164, + "step": 508 + }, + { + "epoch": 0.4236958934517203, + "grad_norm": 0.48107990622520447, + "learning_rate": 9.948667472708163e-06, + "loss": 0.4811, + "step": 509 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 0.4666735827922821, + "learning_rate": 9.947972655966647e-06, + "loss": 0.4719, + "step": 510 + }, + { + "epoch": 0.4253607103218646, + "grad_norm": 0.4709594249725342, + "learning_rate": 9.947273192992171e-06, + "loss": 0.5038, + "step": 511 + }, + { + "epoch": 0.4261931187569367, + "grad_norm": 0.48332247138023376, + "learning_rate": 9.946569084441542e-06, + "loss": 0.4892, + "step": 512 + }, + { + "epoch": 0.42702552719200887, + "grad_norm": 0.507439911365509, + "learning_rate": 9.945860330975933e-06, + "loss": 0.482, + "step": 513 + }, + { + "epoch": 0.427857935627081, + "grad_norm": 0.4573519825935364, + "learning_rate": 9.945146933260876e-06, + "loss": 0.4808, + "step": 514 + }, + { + "epoch": 0.4286903440621532, + "grad_norm": 0.5414947867393494, + "learning_rate": 9.94442889196626e-06, + "loss": 0.4983, + "step": 515 + }, + { + "epoch": 0.4295227524972253, + "grad_norm": 0.43500539660453796, + "learning_rate": 9.94370620776634e-06, + "loss": 0.5014, + "step": 516 + }, + { + "epoch": 0.4303551609322974, + "grad_norm": 0.5725677609443665, + "learning_rate": 9.942978881339732e-06, + "loss": 0.487, + "step": 517 + }, + { + "epoch": 0.43118756936736957, + "grad_norm": 0.4238481819629669, + "learning_rate": 9.942246913369409e-06, + "loss": 0.4854, + "step": 518 + }, + { + "epoch": 0.4320199778024417, + "grad_norm": 0.44186079502105713, + "learning_rate": 9.941510304542695e-06, + "loss": 0.5183, + "step": 519 + }, + { + "epoch": 0.4328523862375139, + "grad_norm": 0.45779183506965637, + "learning_rate": 9.940769055551284e-06, + "loss": 0.4754, + "step": 520 + }, + { + "epoch": 0.433684794672586, + "grad_norm": 0.4756607115268707, + "learning_rate": 9.940023167091219e-06, + "loss": 0.4907, + "step": 521 + }, + { + "epoch": 0.4345172031076582, + "grad_norm": 0.47078803181648254, + "learning_rate": 9.939272639862905e-06, + "loss": 0.4845, + "step": 522 + }, + { + "epoch": 0.4353496115427303, + "grad_norm": 0.48096713423728943, + "learning_rate": 9.9385174745711e-06, + "loss": 0.4778, + "step": 523 + }, + { + "epoch": 0.4361820199778024, + "grad_norm": 0.48447316884994507, + "learning_rate": 9.937757671924915e-06, + "loss": 0.4883, + "step": 524 + }, + { + "epoch": 0.4370144284128746, + "grad_norm": 0.5201837420463562, + "learning_rate": 9.936993232637818e-06, + "loss": 0.5062, + "step": 525 + }, + { + "epoch": 0.43784683684794673, + "grad_norm": 0.5595635175704956, + "learning_rate": 9.936224157427635e-06, + "loss": 0.4843, + "step": 526 + }, + { + "epoch": 0.4386792452830189, + "grad_norm": 0.4055692255496979, + "learning_rate": 9.935450447016535e-06, + "loss": 0.4659, + "step": 527 + }, + { + "epoch": 0.43951165371809103, + "grad_norm": 0.5074310302734375, + "learning_rate": 9.934672102131052e-06, + "loss": 0.4681, + "step": 528 + }, + { + "epoch": 0.44034406215316313, + "grad_norm": 0.4834982454776764, + "learning_rate": 9.933889123502059e-06, + "loss": 0.5108, + "step": 529 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 0.48070859909057617, + "learning_rate": 9.933101511864793e-06, + "loss": 0.4833, + "step": 530 + }, + { + "epoch": 0.44200887902330743, + "grad_norm": 0.4138585031032562, + "learning_rate": 9.93230926795883e-06, + "loss": 0.4869, + "step": 531 + }, + { + "epoch": 0.4428412874583796, + "grad_norm": 0.5085632801055908, + "learning_rate": 9.931512392528104e-06, + "loss": 0.4832, + "step": 532 + }, + { + "epoch": 0.44367369589345174, + "grad_norm": 0.5004951357841492, + "learning_rate": 9.930710886320895e-06, + "loss": 0.4908, + "step": 533 + }, + { + "epoch": 0.4445061043285239, + "grad_norm": 0.498270720243454, + "learning_rate": 9.929904750089829e-06, + "loss": 0.4734, + "step": 534 + }, + { + "epoch": 0.445338512763596, + "grad_norm": 0.4868960976600647, + "learning_rate": 9.929093984591884e-06, + "loss": 0.4477, + "step": 535 + }, + { + "epoch": 0.44617092119866814, + "grad_norm": 0.5053864121437073, + "learning_rate": 9.928278590588382e-06, + "loss": 0.464, + "step": 536 + }, + { + "epoch": 0.4470033296337403, + "grad_norm": 0.47487616539001465, + "learning_rate": 9.927458568844994e-06, + "loss": 0.471, + "step": 537 + }, + { + "epoch": 0.44783573806881244, + "grad_norm": 0.4798586368560791, + "learning_rate": 9.926633920131732e-06, + "loss": 0.4628, + "step": 538 + }, + { + "epoch": 0.4486681465038846, + "grad_norm": 0.44665879011154175, + "learning_rate": 9.925804645222957e-06, + "loss": 0.464, + "step": 539 + }, + { + "epoch": 0.44950055493895674, + "grad_norm": 0.5611540675163269, + "learning_rate": 9.924970744897373e-06, + "loss": 0.4741, + "step": 540 + }, + { + "epoch": 0.45033296337402884, + "grad_norm": 0.4786832332611084, + "learning_rate": 9.924132219938027e-06, + "loss": 0.4732, + "step": 541 + }, + { + "epoch": 0.451165371809101, + "grad_norm": 0.4666735529899597, + "learning_rate": 9.923289071132308e-06, + "loss": 0.502, + "step": 542 + }, + { + "epoch": 0.45199778024417314, + "grad_norm": 0.5262482166290283, + "learning_rate": 9.922441299271948e-06, + "loss": 0.4744, + "step": 543 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.5326946377754211, + "learning_rate": 9.92158890515302e-06, + "loss": 0.4882, + "step": 544 + }, + { + "epoch": 0.45366259711431745, + "grad_norm": 0.4670204222202301, + "learning_rate": 9.920731889575935e-06, + "loss": 0.5037, + "step": 545 + }, + { + "epoch": 0.45449500554938954, + "grad_norm": 0.5759830474853516, + "learning_rate": 9.919870253345446e-06, + "loss": 0.4822, + "step": 546 + }, + { + "epoch": 0.4553274139844617, + "grad_norm": 0.42398568987846375, + "learning_rate": 9.919003997270648e-06, + "loss": 0.4776, + "step": 547 + }, + { + "epoch": 0.45615982241953384, + "grad_norm": 0.5048391819000244, + "learning_rate": 9.918133122164968e-06, + "loss": 0.4621, + "step": 548 + }, + { + "epoch": 0.456992230854606, + "grad_norm": 0.5019201636314392, + "learning_rate": 9.917257628846172e-06, + "loss": 0.4885, + "step": 549 + }, + { + "epoch": 0.45782463928967815, + "grad_norm": 0.43558669090270996, + "learning_rate": 9.916377518136367e-06, + "loss": 0.4843, + "step": 550 + }, + { + "epoch": 0.4586570477247503, + "grad_norm": 0.526262104511261, + "learning_rate": 9.915492790861986e-06, + "loss": 0.4854, + "step": 551 + }, + { + "epoch": 0.4594894561598224, + "grad_norm": 0.5632872581481934, + "learning_rate": 9.91460344785381e-06, + "loss": 0.4879, + "step": 552 + }, + { + "epoch": 0.46032186459489455, + "grad_norm": 0.5479772686958313, + "learning_rate": 9.913709489946946e-06, + "loss": 0.5225, + "step": 553 + }, + { + "epoch": 0.4611542730299667, + "grad_norm": 0.5169483423233032, + "learning_rate": 9.912810917980834e-06, + "loss": 0.4791, + "step": 554 + }, + { + "epoch": 0.46198668146503885, + "grad_norm": 0.5335115194320679, + "learning_rate": 9.911907732799251e-06, + "loss": 0.4974, + "step": 555 + }, + { + "epoch": 0.462819089900111, + "grad_norm": 0.49141526222229004, + "learning_rate": 9.910999935250302e-06, + "loss": 0.4835, + "step": 556 + }, + { + "epoch": 0.46365149833518315, + "grad_norm": 0.7199199795722961, + "learning_rate": 9.910087526186424e-06, + "loss": 0.502, + "step": 557 + }, + { + "epoch": 0.46448390677025525, + "grad_norm": 7.59892463684082, + "learning_rate": 9.909170506464389e-06, + "loss": 0.453, + "step": 558 + }, + { + "epoch": 0.4653163152053274, + "grad_norm": 4702.46728515625, + "learning_rate": 9.908248876945291e-06, + "loss": 1.725, + "step": 559 + }, + { + "epoch": 0.46614872364039955, + "grad_norm": 13.055675506591797, + "learning_rate": 9.907322638494558e-06, + "loss": 0.492, + "step": 560 + }, + { + "epoch": 0.4669811320754717, + "grad_norm": 2.1198976039886475, + "learning_rate": 9.906391791981944e-06, + "loss": 0.504, + "step": 561 + }, + { + "epoch": 0.46781354051054386, + "grad_norm": 0.6735315322875977, + "learning_rate": 9.90545633828153e-06, + "loss": 0.4779, + "step": 562 + }, + { + "epoch": 0.468645948945616, + "grad_norm": 0.7947428226470947, + "learning_rate": 9.904516278271721e-06, + "loss": 0.5156, + "step": 563 + }, + { + "epoch": 0.4694783573806881, + "grad_norm": 0.8350366950035095, + "learning_rate": 9.903571612835254e-06, + "loss": 0.4695, + "step": 564 + }, + { + "epoch": 0.47031076581576026, + "grad_norm": 0.6595231294631958, + "learning_rate": 9.902622342859183e-06, + "loss": 0.5045, + "step": 565 + }, + { + "epoch": 0.4711431742508324, + "grad_norm": 0.7960402369499207, + "learning_rate": 9.901668469234892e-06, + "loss": 0.4872, + "step": 566 + }, + { + "epoch": 0.47197558268590456, + "grad_norm": 0.6408143639564514, + "learning_rate": 9.900709992858083e-06, + "loss": 0.4483, + "step": 567 + }, + { + "epoch": 0.4728079911209767, + "grad_norm": 0.8016465306282043, + "learning_rate": 9.899746914628782e-06, + "loss": 0.4663, + "step": 568 + }, + { + "epoch": 0.4736403995560488, + "grad_norm": 0.5370069146156311, + "learning_rate": 9.898779235451337e-06, + "loss": 0.4861, + "step": 569 + }, + { + "epoch": 0.47447280799112096, + "grad_norm": 0.6317716836929321, + "learning_rate": 9.897806956234417e-06, + "loss": 0.4831, + "step": 570 + }, + { + "epoch": 0.4753052164261931, + "grad_norm": 0.649116575717926, + "learning_rate": 9.896830077891007e-06, + "loss": 0.484, + "step": 571 + }, + { + "epoch": 0.47613762486126526, + "grad_norm": 0.5388279557228088, + "learning_rate": 9.895848601338414e-06, + "loss": 0.5064, + "step": 572 + }, + { + "epoch": 0.4769700332963374, + "grad_norm": 0.6505693793296814, + "learning_rate": 9.894862527498259e-06, + "loss": 0.4757, + "step": 573 + }, + { + "epoch": 0.47780244173140957, + "grad_norm": 0.503017783164978, + "learning_rate": 9.893871857296487e-06, + "loss": 0.5006, + "step": 574 + }, + { + "epoch": 0.47863485016648166, + "grad_norm": 0.7261723875999451, + "learning_rate": 9.892876591663355e-06, + "loss": 0.4688, + "step": 575 + }, + { + "epoch": 0.4794672586015538, + "grad_norm": 0.49724769592285156, + "learning_rate": 9.891876731533429e-06, + "loss": 0.4621, + "step": 576 + }, + { + "epoch": 0.48029966703662597, + "grad_norm": 0.6609309315681458, + "learning_rate": 9.8908722778456e-06, + "loss": 0.4631, + "step": 577 + }, + { + "epoch": 0.4811320754716981, + "grad_norm": 0.5452806353569031, + "learning_rate": 9.889863231543065e-06, + "loss": 0.4881, + "step": 578 + }, + { + "epoch": 0.48196448390677027, + "grad_norm": 0.675403892993927, + "learning_rate": 9.888849593573339e-06, + "loss": 0.4709, + "step": 579 + }, + { + "epoch": 0.4827968923418424, + "grad_norm": 0.6482904553413391, + "learning_rate": 9.887831364888243e-06, + "loss": 0.4907, + "step": 580 + }, + { + "epoch": 0.4836293007769145, + "grad_norm": 0.6234882473945618, + "learning_rate": 9.886808546443914e-06, + "loss": 0.4801, + "step": 581 + }, + { + "epoch": 0.48446170921198667, + "grad_norm": 0.60924232006073, + "learning_rate": 9.885781139200794e-06, + "loss": 0.4668, + "step": 582 + }, + { + "epoch": 0.4852941176470588, + "grad_norm": 0.5140445828437805, + "learning_rate": 9.88474914412364e-06, + "loss": 0.471, + "step": 583 + }, + { + "epoch": 0.48612652608213097, + "grad_norm": 0.6285945773124695, + "learning_rate": 9.88371256218151e-06, + "loss": 0.469, + "step": 584 + }, + { + "epoch": 0.4869589345172031, + "grad_norm": 0.45175155997276306, + "learning_rate": 9.882671394347771e-06, + "loss": 0.4646, + "step": 585 + }, + { + "epoch": 0.4877913429522753, + "grad_norm": 0.4605112373828888, + "learning_rate": 9.881625641600104e-06, + "loss": 0.4653, + "step": 586 + }, + { + "epoch": 0.48862375138734737, + "grad_norm": 0.5589239597320557, + "learning_rate": 9.880575304920484e-06, + "loss": 0.4641, + "step": 587 + }, + { + "epoch": 0.4894561598224195, + "grad_norm": 0.38805943727493286, + "learning_rate": 9.879520385295197e-06, + "loss": 0.4584, + "step": 588 + }, + { + "epoch": 0.4902885682574917, + "grad_norm": 0.5301197171211243, + "learning_rate": 9.878460883714831e-06, + "loss": 0.5203, + "step": 589 + }, + { + "epoch": 0.4911209766925638, + "grad_norm": 0.47221046686172485, + "learning_rate": 9.877396801174277e-06, + "loss": 0.489, + "step": 590 + }, + { + "epoch": 0.491953385127636, + "grad_norm": 0.4699765145778656, + "learning_rate": 9.876328138672726e-06, + "loss": 0.4739, + "step": 591 + }, + { + "epoch": 0.49278579356270813, + "grad_norm": 0.4534553587436676, + "learning_rate": 9.875254897213674e-06, + "loss": 0.4624, + "step": 592 + }, + { + "epoch": 0.4936182019977802, + "grad_norm": 0.39965343475341797, + "learning_rate": 9.87417707780491e-06, + "loss": 0.4865, + "step": 593 + }, + { + "epoch": 0.4944506104328524, + "grad_norm": 0.43681901693344116, + "learning_rate": 9.873094681458525e-06, + "loss": 0.4793, + "step": 594 + }, + { + "epoch": 0.49528301886792453, + "grad_norm": 0.4199521839618683, + "learning_rate": 9.87200770919091e-06, + "loss": 0.4561, + "step": 595 + }, + { + "epoch": 0.4961154273029967, + "grad_norm": 0.42258721590042114, + "learning_rate": 9.870916162022752e-06, + "loss": 0.4763, + "step": 596 + }, + { + "epoch": 0.49694783573806883, + "grad_norm": 0.4213769733905792, + "learning_rate": 9.86982004097903e-06, + "loss": 0.4815, + "step": 597 + }, + { + "epoch": 0.49778024417314093, + "grad_norm": 0.41223376989364624, + "learning_rate": 9.868719347089024e-06, + "loss": 0.4688, + "step": 598 + }, + { + "epoch": 0.4986126526082131, + "grad_norm": 0.49295932054519653, + "learning_rate": 9.867614081386302e-06, + "loss": 0.5049, + "step": 599 + }, + { + "epoch": 0.49944506104328523, + "grad_norm": 0.3732836842536926, + "learning_rate": 9.866504244908728e-06, + "loss": 0.5007, + "step": 600 + }, + { + "epoch": 0.5002774694783574, + "grad_norm": 0.39977091550827026, + "learning_rate": 9.86538983869846e-06, + "loss": 0.4807, + "step": 601 + }, + { + "epoch": 0.5011098779134295, + "grad_norm": 0.4643203318119049, + "learning_rate": 9.864270863801944e-06, + "loss": 0.4807, + "step": 602 + }, + { + "epoch": 0.5019422863485017, + "grad_norm": 0.391868531703949, + "learning_rate": 9.863147321269918e-06, + "loss": 0.4536, + "step": 603 + }, + { + "epoch": 0.5027746947835738, + "grad_norm": 0.4176725149154663, + "learning_rate": 9.862019212157406e-06, + "loss": 0.4658, + "step": 604 + }, + { + "epoch": 0.503607103218646, + "grad_norm": 0.4765459895133972, + "learning_rate": 9.860886537523721e-06, + "loss": 0.4891, + "step": 605 + }, + { + "epoch": 0.5044395116537181, + "grad_norm": 0.41668006777763367, + "learning_rate": 9.859749298432468e-06, + "loss": 0.4859, + "step": 606 + }, + { + "epoch": 0.5052719200887902, + "grad_norm": 0.45202815532684326, + "learning_rate": 9.858607495951534e-06, + "loss": 0.4732, + "step": 607 + }, + { + "epoch": 0.5061043285238623, + "grad_norm": 0.41504961252212524, + "learning_rate": 9.857461131153089e-06, + "loss": 0.4529, + "step": 608 + }, + { + "epoch": 0.5069367369589345, + "grad_norm": 0.4873095452785492, + "learning_rate": 9.856310205113594e-06, + "loss": 0.4693, + "step": 609 + }, + { + "epoch": 0.5077691453940066, + "grad_norm": 0.42755189538002014, + "learning_rate": 9.855154718913782e-06, + "loss": 0.4502, + "step": 610 + }, + { + "epoch": 0.5086015538290788, + "grad_norm": 0.40592634677886963, + "learning_rate": 9.853994673638679e-06, + "loss": 0.469, + "step": 611 + }, + { + "epoch": 0.5094339622641509, + "grad_norm": 0.4807058274745941, + "learning_rate": 9.852830070377588e-06, + "loss": 0.489, + "step": 612 + }, + { + "epoch": 0.5102663706992231, + "grad_norm": 0.4219500720500946, + "learning_rate": 9.851660910224092e-06, + "loss": 0.4927, + "step": 613 + }, + { + "epoch": 0.5110987791342952, + "grad_norm": 0.45034486055374146, + "learning_rate": 9.85048719427605e-06, + "loss": 0.4877, + "step": 614 + }, + { + "epoch": 0.5119311875693674, + "grad_norm": 0.39540550112724304, + "learning_rate": 9.849308923635606e-06, + "loss": 0.4769, + "step": 615 + }, + { + "epoch": 0.5127635960044395, + "grad_norm": 0.40875479578971863, + "learning_rate": 9.848126099409175e-06, + "loss": 0.4732, + "step": 616 + }, + { + "epoch": 0.5135960044395117, + "grad_norm": 0.4183914363384247, + "learning_rate": 9.846938722707446e-06, + "loss": 0.4671, + "step": 617 + }, + { + "epoch": 0.5144284128745837, + "grad_norm": 0.3762715458869934, + "learning_rate": 9.845746794645393e-06, + "loss": 0.4749, + "step": 618 + }, + { + "epoch": 0.5152608213096559, + "grad_norm": 0.4459075927734375, + "learning_rate": 9.844550316342252e-06, + "loss": 0.5014, + "step": 619 + }, + { + "epoch": 0.516093229744728, + "grad_norm": 0.4192522466182709, + "learning_rate": 9.843349288921543e-06, + "loss": 0.4872, + "step": 620 + }, + { + "epoch": 0.5169256381798002, + "grad_norm": 0.40821659564971924, + "learning_rate": 9.842143713511044e-06, + "loss": 0.479, + "step": 621 + }, + { + "epoch": 0.5177580466148723, + "grad_norm": 0.45011478662490845, + "learning_rate": 9.840933591242817e-06, + "loss": 0.4743, + "step": 622 + }, + { + "epoch": 0.5185904550499445, + "grad_norm": 0.45597994327545166, + "learning_rate": 9.839718923253186e-06, + "loss": 0.4667, + "step": 623 + }, + { + "epoch": 0.5194228634850167, + "grad_norm": 0.4411117136478424, + "learning_rate": 9.838499710682745e-06, + "loss": 0.4849, + "step": 624 + }, + { + "epoch": 0.5202552719200888, + "grad_norm": 0.46066632866859436, + "learning_rate": 9.837275954676357e-06, + "loss": 0.4375, + "step": 625 + }, + { + "epoch": 0.521087680355161, + "grad_norm": 0.42149725556373596, + "learning_rate": 9.836047656383152e-06, + "loss": 0.4505, + "step": 626 + }, + { + "epoch": 0.5219200887902331, + "grad_norm": 0.4638105630874634, + "learning_rate": 9.834814816956521e-06, + "loss": 0.4442, + "step": 627 + }, + { + "epoch": 0.5227524972253053, + "grad_norm": 0.5468646883964539, + "learning_rate": 9.833577437554121e-06, + "loss": 0.4775, + "step": 628 + }, + { + "epoch": 0.5235849056603774, + "grad_norm": 0.44078686833381653, + "learning_rate": 9.832335519337877e-06, + "loss": 0.4493, + "step": 629 + }, + { + "epoch": 0.5244173140954494, + "grad_norm": 0.5681177973747253, + "learning_rate": 9.831089063473967e-06, + "loss": 0.4996, + "step": 630 + }, + { + "epoch": 0.5252497225305216, + "grad_norm": 0.4630327522754669, + "learning_rate": 9.82983807113284e-06, + "loss": 0.4714, + "step": 631 + }, + { + "epoch": 0.5260821309655938, + "grad_norm": 0.47248589992523193, + "learning_rate": 9.828582543489194e-06, + "loss": 0.4838, + "step": 632 + }, + { + "epoch": 0.5269145394006659, + "grad_norm": 0.5343927145004272, + "learning_rate": 9.827322481721998e-06, + "loss": 0.4678, + "step": 633 + }, + { + "epoch": 0.5277469478357381, + "grad_norm": 0.48827970027923584, + "learning_rate": 9.826057887014466e-06, + "loss": 0.4841, + "step": 634 + }, + { + "epoch": 0.5285793562708102, + "grad_norm": 0.42138999700546265, + "learning_rate": 9.824788760554078e-06, + "loss": 0.4821, + "step": 635 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.46162131428718567, + "learning_rate": 9.823515103532564e-06, + "loss": 0.463, + "step": 636 + }, + { + "epoch": 0.5302441731409545, + "grad_norm": 0.41701412200927734, + "learning_rate": 9.822236917145914e-06, + "loss": 0.5037, + "step": 637 + }, + { + "epoch": 0.5310765815760267, + "grad_norm": 0.43682774901390076, + "learning_rate": 9.820954202594362e-06, + "loss": 0.468, + "step": 638 + }, + { + "epoch": 0.5319089900110988, + "grad_norm": 0.5141327977180481, + "learning_rate": 9.819666961082402e-06, + "loss": 0.5035, + "step": 639 + }, + { + "epoch": 0.532741398446171, + "grad_norm": 0.4251098036766052, + "learning_rate": 9.81837519381878e-06, + "loss": 0.4749, + "step": 640 + }, + { + "epoch": 0.5335738068812431, + "grad_norm": 0.4675353765487671, + "learning_rate": 9.817078902016481e-06, + "loss": 0.4837, + "step": 641 + }, + { + "epoch": 0.5344062153163152, + "grad_norm": 0.44004005193710327, + "learning_rate": 9.81577808689275e-06, + "loss": 0.4729, + "step": 642 + }, + { + "epoch": 0.5352386237513873, + "grad_norm": 0.45024535059928894, + "learning_rate": 9.814472749669076e-06, + "loss": 0.4783, + "step": 643 + }, + { + "epoch": 0.5360710321864595, + "grad_norm": 0.3834257125854492, + "learning_rate": 9.813162891571189e-06, + "loss": 0.4503, + "step": 644 + }, + { + "epoch": 0.5369034406215316, + "grad_norm": 0.47850501537323, + "learning_rate": 9.811848513829074e-06, + "loss": 0.466, + "step": 645 + }, + { + "epoch": 0.5377358490566038, + "grad_norm": 0.397339403629303, + "learning_rate": 9.810529617676952e-06, + "loss": 0.4804, + "step": 646 + }, + { + "epoch": 0.5385682574916759, + "grad_norm": 0.45498281717300415, + "learning_rate": 9.809206204353289e-06, + "loss": 0.4587, + "step": 647 + }, + { + "epoch": 0.5394006659267481, + "grad_norm": 0.46825650334358215, + "learning_rate": 9.807878275100795e-06, + "loss": 0.4772, + "step": 648 + }, + { + "epoch": 0.5402330743618202, + "grad_norm": 0.40103840827941895, + "learning_rate": 9.806545831166417e-06, + "loss": 0.4725, + "step": 649 + }, + { + "epoch": 0.5410654827968924, + "grad_norm": 0.4803685247898102, + "learning_rate": 9.805208873801346e-06, + "loss": 0.4799, + "step": 650 + }, + { + "epoch": 0.5418978912319645, + "grad_norm": 0.4705343544483185, + "learning_rate": 9.803867404261005e-06, + "loss": 0.4769, + "step": 651 + }, + { + "epoch": 0.5427302996670367, + "grad_norm": 0.4226051867008209, + "learning_rate": 9.80252142380506e-06, + "loss": 0.4734, + "step": 652 + }, + { + "epoch": 0.5435627081021087, + "grad_norm": 0.42430418729782104, + "learning_rate": 9.80117093369741e-06, + "loss": 0.474, + "step": 653 + }, + { + "epoch": 0.5443951165371809, + "grad_norm": 0.4154928922653198, + "learning_rate": 9.799815935206187e-06, + "loss": 0.4819, + "step": 654 + }, + { + "epoch": 0.545227524972253, + "grad_norm": 0.359573096036911, + "learning_rate": 9.798456429603758e-06, + "loss": 0.4564, + "step": 655 + }, + { + "epoch": 0.5460599334073252, + "grad_norm": 0.5043289661407471, + "learning_rate": 9.797092418166725e-06, + "loss": 0.4784, + "step": 656 + }, + { + "epoch": 0.5468923418423973, + "grad_norm": 0.3750476539134979, + "learning_rate": 9.795723902175918e-06, + "loss": 0.4542, + "step": 657 + }, + { + "epoch": 0.5477247502774695, + "grad_norm": 0.4907470643520355, + "learning_rate": 9.794350882916397e-06, + "loss": 0.4908, + "step": 658 + }, + { + "epoch": 0.5485571587125416, + "grad_norm": 0.457126259803772, + "learning_rate": 9.79297336167745e-06, + "loss": 0.4323, + "step": 659 + }, + { + "epoch": 0.5493895671476138, + "grad_norm": 0.42247408628463745, + "learning_rate": 9.791591339752596e-06, + "loss": 0.4752, + "step": 660 + }, + { + "epoch": 0.5502219755826859, + "grad_norm": 0.49953287839889526, + "learning_rate": 9.790204818439576e-06, + "loss": 0.4852, + "step": 661 + }, + { + "epoch": 0.5510543840177581, + "grad_norm": 0.466348260641098, + "learning_rate": 9.788813799040358e-06, + "loss": 0.4654, + "step": 662 + }, + { + "epoch": 0.5518867924528302, + "grad_norm": 0.47069528698921204, + "learning_rate": 9.787418282861135e-06, + "loss": 0.4919, + "step": 663 + }, + { + "epoch": 0.5527192008879024, + "grad_norm": 0.3939701020717621, + "learning_rate": 9.786018271212318e-06, + "loss": 0.4712, + "step": 664 + }, + { + "epoch": 0.5535516093229744, + "grad_norm": 0.4161626994609833, + "learning_rate": 9.784613765408546e-06, + "loss": 0.4791, + "step": 665 + }, + { + "epoch": 0.5543840177580466, + "grad_norm": 0.39920780062675476, + "learning_rate": 9.783204766768672e-06, + "loss": 0.4669, + "step": 666 + }, + { + "epoch": 0.5552164261931187, + "grad_norm": 0.47431665658950806, + "learning_rate": 9.781791276615774e-06, + "loss": 0.4623, + "step": 667 + }, + { + "epoch": 0.5560488346281909, + "grad_norm": 0.4187391400337219, + "learning_rate": 9.780373296277137e-06, + "loss": 0.4766, + "step": 668 + }, + { + "epoch": 0.556881243063263, + "grad_norm": 0.39394712448120117, + "learning_rate": 9.778950827084277e-06, + "loss": 0.4857, + "step": 669 + }, + { + "epoch": 0.5577136514983352, + "grad_norm": 0.4316924512386322, + "learning_rate": 9.777523870372913e-06, + "loss": 0.472, + "step": 670 + }, + { + "epoch": 0.5585460599334073, + "grad_norm": 0.54462730884552, + "learning_rate": 9.776092427482984e-06, + "loss": 0.4716, + "step": 671 + }, + { + "epoch": 0.5593784683684795, + "grad_norm": 0.40116629004478455, + "learning_rate": 9.774656499758639e-06, + "loss": 0.4498, + "step": 672 + }, + { + "epoch": 0.5602108768035516, + "grad_norm": 0.5211840271949768, + "learning_rate": 9.77321608854824e-06, + "loss": 0.4702, + "step": 673 + }, + { + "epoch": 0.5610432852386238, + "grad_norm": 0.5067088603973389, + "learning_rate": 9.771771195204358e-06, + "loss": 0.4796, + "step": 674 + }, + { + "epoch": 0.5618756936736959, + "grad_norm": 0.5115644335746765, + "learning_rate": 9.770321821083774e-06, + "loss": 0.4809, + "step": 675 + }, + { + "epoch": 0.562708102108768, + "grad_norm": 0.4665434956550598, + "learning_rate": 9.768867967547472e-06, + "loss": 0.4649, + "step": 676 + }, + { + "epoch": 0.5635405105438401, + "grad_norm": 0.5498418211936951, + "learning_rate": 9.767409635960653e-06, + "loss": 0.4799, + "step": 677 + }, + { + "epoch": 0.5643729189789123, + "grad_norm": 0.41861289739608765, + "learning_rate": 9.76594682769271e-06, + "loss": 0.4655, + "step": 678 + }, + { + "epoch": 0.5652053274139844, + "grad_norm": 0.4563583731651306, + "learning_rate": 9.764479544117247e-06, + "loss": 0.4608, + "step": 679 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.45897558331489563, + "learning_rate": 9.76300778661207e-06, + "loss": 0.4476, + "step": 680 + }, + { + "epoch": 0.5668701442841287, + "grad_norm": 0.42618608474731445, + "learning_rate": 9.761531556559183e-06, + "loss": 0.4758, + "step": 681 + }, + { + "epoch": 0.5677025527192009, + "grad_norm": 0.3950604796409607, + "learning_rate": 9.760050855344795e-06, + "loss": 0.4566, + "step": 682 + }, + { + "epoch": 0.568534961154273, + "grad_norm": 0.46975719928741455, + "learning_rate": 9.758565684359307e-06, + "loss": 0.4612, + "step": 683 + }, + { + "epoch": 0.5693673695893452, + "grad_norm": 0.4233880639076233, + "learning_rate": 9.757076044997324e-06, + "loss": 0.4591, + "step": 684 + }, + { + "epoch": 0.5701997780244173, + "grad_norm": 0.42603635787963867, + "learning_rate": 9.75558193865764e-06, + "loss": 0.4841, + "step": 685 + }, + { + "epoch": 0.5710321864594895, + "grad_norm": 0.4705389738082886, + "learning_rate": 9.754083366743249e-06, + "loss": 0.4501, + "step": 686 + }, + { + "epoch": 0.5718645948945617, + "grad_norm": 0.4174215793609619, + "learning_rate": 9.752580330661336e-06, + "loss": 0.4804, + "step": 687 + }, + { + "epoch": 0.5726970033296337, + "grad_norm": 0.47212427854537964, + "learning_rate": 9.751072831823279e-06, + "loss": 0.4987, + "step": 688 + }, + { + "epoch": 0.5735294117647058, + "grad_norm": 0.453812837600708, + "learning_rate": 9.749560871644643e-06, + "loss": 0.4655, + "step": 689 + }, + { + "epoch": 0.574361820199778, + "grad_norm": 0.44600024819374084, + "learning_rate": 9.748044451545188e-06, + "loss": 0.4926, + "step": 690 + }, + { + "epoch": 0.5751942286348501, + "grad_norm": 0.5240420699119568, + "learning_rate": 9.746523572948857e-06, + "loss": 0.4919, + "step": 691 + }, + { + "epoch": 0.5760266370699223, + "grad_norm": 0.415798544883728, + "learning_rate": 9.744998237283785e-06, + "loss": 0.4447, + "step": 692 + }, + { + "epoch": 0.5768590455049944, + "grad_norm": 0.39677396416664124, + "learning_rate": 9.743468445982284e-06, + "loss": 0.4626, + "step": 693 + }, + { + "epoch": 0.5776914539400666, + "grad_norm": 0.4481954276561737, + "learning_rate": 9.741934200480857e-06, + "loss": 0.4845, + "step": 694 + }, + { + "epoch": 0.5785238623751388, + "grad_norm": 0.4083997309207916, + "learning_rate": 9.740395502220192e-06, + "loss": 0.4673, + "step": 695 + }, + { + "epoch": 0.5793562708102109, + "grad_norm": 0.4454471170902252, + "learning_rate": 9.738852352645145e-06, + "loss": 0.4606, + "step": 696 + }, + { + "epoch": 0.5801886792452831, + "grad_norm": 0.45555463433265686, + "learning_rate": 9.737304753204767e-06, + "loss": 0.482, + "step": 697 + }, + { + "epoch": 0.5810210876803552, + "grad_norm": 0.5052851438522339, + "learning_rate": 9.735752705352278e-06, + "loss": 0.4909, + "step": 698 + }, + { + "epoch": 0.5818534961154272, + "grad_norm": 0.40319934487342834, + "learning_rate": 9.734196210545079e-06, + "loss": 0.4804, + "step": 699 + }, + { + "epoch": 0.5826859045504994, + "grad_norm": 0.4926762878894806, + "learning_rate": 9.732635270244745e-06, + "loss": 0.455, + "step": 700 + }, + { + "epoch": 0.5835183129855716, + "grad_norm": 0.41983771324157715, + "learning_rate": 9.731069885917029e-06, + "loss": 0.4748, + "step": 701 + }, + { + "epoch": 0.5843507214206437, + "grad_norm": 0.4828875958919525, + "learning_rate": 9.729500059031851e-06, + "loss": 0.4664, + "step": 702 + }, + { + "epoch": 0.5851831298557159, + "grad_norm": 0.4084136188030243, + "learning_rate": 9.727925791063306e-06, + "loss": 0.4576, + "step": 703 + }, + { + "epoch": 0.586015538290788, + "grad_norm": 0.5071380138397217, + "learning_rate": 9.726347083489661e-06, + "loss": 0.4702, + "step": 704 + }, + { + "epoch": 0.5868479467258602, + "grad_norm": 0.4290992021560669, + "learning_rate": 9.724763937793352e-06, + "loss": 0.4669, + "step": 705 + }, + { + "epoch": 0.5876803551609323, + "grad_norm": 0.46184346079826355, + "learning_rate": 9.723176355460978e-06, + "loss": 0.4622, + "step": 706 + }, + { + "epoch": 0.5885127635960045, + "grad_norm": 0.5367664694786072, + "learning_rate": 9.721584337983303e-06, + "loss": 0.5039, + "step": 707 + }, + { + "epoch": 0.5893451720310766, + "grad_norm": 0.47268781065940857, + "learning_rate": 9.719987886855264e-06, + "loss": 0.4887, + "step": 708 + }, + { + "epoch": 0.5901775804661488, + "grad_norm": 0.5064681768417358, + "learning_rate": 9.718387003575957e-06, + "loss": 0.4529, + "step": 709 + }, + { + "epoch": 0.5910099889012209, + "grad_norm": 0.4127980172634125, + "learning_rate": 9.716781689648638e-06, + "loss": 0.4661, + "step": 710 + }, + { + "epoch": 0.591842397336293, + "grad_norm": 0.525039553642273, + "learning_rate": 9.715171946580724e-06, + "loss": 0.4509, + "step": 711 + }, + { + "epoch": 0.5926748057713651, + "grad_norm": 0.5468900799751282, + "learning_rate": 9.713557775883793e-06, + "loss": 0.4854, + "step": 712 + }, + { + "epoch": 0.5935072142064373, + "grad_norm": 0.4610214829444885, + "learning_rate": 9.71193917907358e-06, + "loss": 0.4783, + "step": 713 + }, + { + "epoch": 0.5943396226415094, + "grad_norm": 0.5212530493736267, + "learning_rate": 9.710316157669972e-06, + "loss": 0.4461, + "step": 714 + }, + { + "epoch": 0.5951720310765816, + "grad_norm": 0.4037787616252899, + "learning_rate": 9.708688713197021e-06, + "loss": 0.4931, + "step": 715 + }, + { + "epoch": 0.5960044395116537, + "grad_norm": 0.4619676172733307, + "learning_rate": 9.707056847182921e-06, + "loss": 0.4709, + "step": 716 + }, + { + "epoch": 0.5968368479467259, + "grad_norm": 0.4411291778087616, + "learning_rate": 9.705420561160024e-06, + "loss": 0.4923, + "step": 717 + }, + { + "epoch": 0.597669256381798, + "grad_norm": 0.45065465569496155, + "learning_rate": 9.703779856664833e-06, + "loss": 0.457, + "step": 718 + }, + { + "epoch": 0.5985016648168702, + "grad_norm": 0.4192379415035248, + "learning_rate": 9.702134735237994e-06, + "loss": 0.4815, + "step": 719 + }, + { + "epoch": 0.5993340732519423, + "grad_norm": 0.5066999793052673, + "learning_rate": 9.700485198424307e-06, + "loss": 0.4763, + "step": 720 + }, + { + "epoch": 0.6001664816870145, + "grad_norm": 0.4297652840614319, + "learning_rate": 9.69883124777272e-06, + "loss": 0.4545, + "step": 721 + }, + { + "epoch": 0.6009988901220865, + "grad_norm": 0.4559658169746399, + "learning_rate": 9.697172884836315e-06, + "loss": 0.4581, + "step": 722 + }, + { + "epoch": 0.6018312985571587, + "grad_norm": 0.41913849115371704, + "learning_rate": 9.695510111172329e-06, + "loss": 0.443, + "step": 723 + }, + { + "epoch": 0.6026637069922308, + "grad_norm": 0.40670254826545715, + "learning_rate": 9.693842928342132e-06, + "loss": 0.4689, + "step": 724 + }, + { + "epoch": 0.603496115427303, + "grad_norm": 0.43783038854599, + "learning_rate": 9.69217133791124e-06, + "loss": 0.4633, + "step": 725 + }, + { + "epoch": 0.6043285238623751, + "grad_norm": 0.4118957221508026, + "learning_rate": 9.690495341449304e-06, + "loss": 0.4812, + "step": 726 + }, + { + "epoch": 0.6051609322974473, + "grad_norm": 0.41395294666290283, + "learning_rate": 9.688814940530115e-06, + "loss": 0.4975, + "step": 727 + }, + { + "epoch": 0.6059933407325194, + "grad_norm": 0.485917866230011, + "learning_rate": 9.6871301367316e-06, + "loss": 0.5011, + "step": 728 + }, + { + "epoch": 0.6068257491675916, + "grad_norm": 0.39293450117111206, + "learning_rate": 9.68544093163582e-06, + "loss": 0.4541, + "step": 729 + }, + { + "epoch": 0.6076581576026637, + "grad_norm": 0.47208675742149353, + "learning_rate": 9.683747326828962e-06, + "loss": 0.4707, + "step": 730 + }, + { + "epoch": 0.6084905660377359, + "grad_norm": 0.41672036051750183, + "learning_rate": 9.682049323901358e-06, + "loss": 0.4689, + "step": 731 + }, + { + "epoch": 0.609322974472808, + "grad_norm": 0.45332202315330505, + "learning_rate": 9.680346924447458e-06, + "loss": 0.5, + "step": 732 + }, + { + "epoch": 0.6101553829078802, + "grad_norm": 0.4699179530143738, + "learning_rate": 9.678640130065846e-06, + "loss": 0.4914, + "step": 733 + }, + { + "epoch": 0.6109877913429522, + "grad_norm": 0.42764753103256226, + "learning_rate": 9.676928942359233e-06, + "loss": 0.4713, + "step": 734 + }, + { + "epoch": 0.6118201997780244, + "grad_norm": 0.4510103464126587, + "learning_rate": 9.675213362934454e-06, + "loss": 0.4422, + "step": 735 + }, + { + "epoch": 0.6126526082130965, + "grad_norm": 0.44094187021255493, + "learning_rate": 9.673493393402466e-06, + "loss": 0.478, + "step": 736 + }, + { + "epoch": 0.6134850166481687, + "grad_norm": 0.4860018491744995, + "learning_rate": 9.671769035378352e-06, + "loss": 0.4597, + "step": 737 + }, + { + "epoch": 0.6143174250832408, + "grad_norm": 0.4602179229259491, + "learning_rate": 9.670040290481315e-06, + "loss": 0.477, + "step": 738 + }, + { + "epoch": 0.615149833518313, + "grad_norm": 0.4050056040287018, + "learning_rate": 9.668307160334676e-06, + "loss": 0.4669, + "step": 739 + }, + { + "epoch": 0.6159822419533851, + "grad_norm": 0.49610546231269836, + "learning_rate": 9.666569646565875e-06, + "loss": 0.4783, + "step": 740 + }, + { + "epoch": 0.6168146503884573, + "grad_norm": 0.37179750204086304, + "learning_rate": 9.664827750806465e-06, + "loss": 0.4661, + "step": 741 + }, + { + "epoch": 0.6176470588235294, + "grad_norm": 0.42984944581985474, + "learning_rate": 9.663081474692123e-06, + "loss": 0.4516, + "step": 742 + }, + { + "epoch": 0.6184794672586016, + "grad_norm": 0.4149248003959656, + "learning_rate": 9.661330819862626e-06, + "loss": 0.4644, + "step": 743 + }, + { + "epoch": 0.6193118756936737, + "grad_norm": 0.4002021551132202, + "learning_rate": 9.659575787961872e-06, + "loss": 0.4571, + "step": 744 + }, + { + "epoch": 0.6201442841287459, + "grad_norm": 0.3851153552532196, + "learning_rate": 9.657816380637868e-06, + "loss": 0.4761, + "step": 745 + }, + { + "epoch": 0.6209766925638179, + "grad_norm": 0.4163179397583008, + "learning_rate": 9.656052599542728e-06, + "loss": 0.4712, + "step": 746 + }, + { + "epoch": 0.6218091009988901, + "grad_norm": 0.41883882880210876, + "learning_rate": 9.654284446332673e-06, + "loss": 0.4481, + "step": 747 + }, + { + "epoch": 0.6226415094339622, + "grad_norm": 0.35763731598854065, + "learning_rate": 9.652511922668029e-06, + "loss": 0.4512, + "step": 748 + }, + { + "epoch": 0.6234739178690344, + "grad_norm": 0.4117819368839264, + "learning_rate": 9.650735030213228e-06, + "loss": 0.483, + "step": 749 + }, + { + "epoch": 0.6243063263041065, + "grad_norm": 0.4240819811820984, + "learning_rate": 9.648953770636801e-06, + "loss": 0.476, + "step": 750 + }, + { + "epoch": 0.6251387347391787, + "grad_norm": 0.3860422372817993, + "learning_rate": 9.647168145611385e-06, + "loss": 0.4808, + "step": 751 + }, + { + "epoch": 0.6259711431742508, + "grad_norm": 0.39864394068717957, + "learning_rate": 9.645378156813709e-06, + "loss": 0.4881, + "step": 752 + }, + { + "epoch": 0.626803551609323, + "grad_norm": 0.4221175014972687, + "learning_rate": 9.643583805924608e-06, + "loss": 0.4726, + "step": 753 + }, + { + "epoch": 0.6276359600443951, + "grad_norm": 0.40564703941345215, + "learning_rate": 9.641785094629008e-06, + "loss": 0.4507, + "step": 754 + }, + { + "epoch": 0.6284683684794673, + "grad_norm": 0.44357532262802124, + "learning_rate": 9.639982024615928e-06, + "loss": 0.4721, + "step": 755 + }, + { + "epoch": 0.6293007769145395, + "grad_norm": 0.4528225064277649, + "learning_rate": 9.638174597578486e-06, + "loss": 0.4774, + "step": 756 + }, + { + "epoch": 0.6301331853496115, + "grad_norm": 0.41333818435668945, + "learning_rate": 9.636362815213884e-06, + "loss": 0.4824, + "step": 757 + }, + { + "epoch": 0.6309655937846836, + "grad_norm": 0.41459783911705017, + "learning_rate": 9.63454667922342e-06, + "loss": 0.4808, + "step": 758 + }, + { + "epoch": 0.6317980022197558, + "grad_norm": 0.4365558624267578, + "learning_rate": 9.632726191312475e-06, + "loss": 0.4583, + "step": 759 + }, + { + "epoch": 0.632630410654828, + "grad_norm": 0.4275365471839905, + "learning_rate": 9.630901353190522e-06, + "loss": 0.4966, + "step": 760 + }, + { + "epoch": 0.6334628190899001, + "grad_norm": 0.4615970253944397, + "learning_rate": 9.629072166571114e-06, + "loss": 0.4945, + "step": 761 + }, + { + "epoch": 0.6342952275249722, + "grad_norm": 0.43543118238449097, + "learning_rate": 9.627238633171889e-06, + "loss": 0.4534, + "step": 762 + }, + { + "epoch": 0.6351276359600444, + "grad_norm": 0.37240052223205566, + "learning_rate": 9.625400754714568e-06, + "loss": 0.443, + "step": 763 + }, + { + "epoch": 0.6359600443951166, + "grad_norm": 0.47188568115234375, + "learning_rate": 9.623558532924952e-06, + "loss": 0.4559, + "step": 764 + }, + { + "epoch": 0.6367924528301887, + "grad_norm": 0.38750484585762024, + "learning_rate": 9.621711969532917e-06, + "loss": 0.4805, + "step": 765 + }, + { + "epoch": 0.6376248612652609, + "grad_norm": 0.40850406885147095, + "learning_rate": 9.61986106627242e-06, + "loss": 0.4768, + "step": 766 + }, + { + "epoch": 0.638457269700333, + "grad_norm": 0.38356107473373413, + "learning_rate": 9.618005824881491e-06, + "loss": 0.4671, + "step": 767 + }, + { + "epoch": 0.6392896781354052, + "grad_norm": 0.4014689028263092, + "learning_rate": 9.616146247102233e-06, + "loss": 0.4405, + "step": 768 + }, + { + "epoch": 0.6401220865704772, + "grad_norm": 0.3854086995124817, + "learning_rate": 9.614282334680827e-06, + "loss": 0.4562, + "step": 769 + }, + { + "epoch": 0.6409544950055494, + "grad_norm": 0.4454103708267212, + "learning_rate": 9.612414089367512e-06, + "loss": 0.4494, + "step": 770 + }, + { + "epoch": 0.6417869034406215, + "grad_norm": 0.4228101968765259, + "learning_rate": 9.61054151291661e-06, + "loss": 0.4675, + "step": 771 + }, + { + "epoch": 0.6426193118756937, + "grad_norm": 0.4053763151168823, + "learning_rate": 9.608664607086497e-06, + "loss": 0.4618, + "step": 772 + }, + { + "epoch": 0.6434517203107658, + "grad_norm": 0.4087989926338196, + "learning_rate": 9.606783373639626e-06, + "loss": 0.4618, + "step": 773 + }, + { + "epoch": 0.644284128745838, + "grad_norm": 0.4213400185108185, + "learning_rate": 9.604897814342504e-06, + "loss": 0.4909, + "step": 774 + }, + { + "epoch": 0.6451165371809101, + "grad_norm": 0.4600432217121124, + "learning_rate": 9.603007930965706e-06, + "loss": 0.5026, + "step": 775 + }, + { + "epoch": 0.6459489456159823, + "grad_norm": 0.3727787435054779, + "learning_rate": 9.601113725283864e-06, + "loss": 0.4797, + "step": 776 + }, + { + "epoch": 0.6467813540510544, + "grad_norm": 0.4441661834716797, + "learning_rate": 9.599215199075674e-06, + "loss": 0.4748, + "step": 777 + }, + { + "epoch": 0.6476137624861266, + "grad_norm": 0.386455237865448, + "learning_rate": 9.597312354123882e-06, + "loss": 0.4657, + "step": 778 + }, + { + "epoch": 0.6484461709211987, + "grad_norm": 0.4307407736778259, + "learning_rate": 9.595405192215293e-06, + "loss": 0.4416, + "step": 779 + }, + { + "epoch": 0.6492785793562708, + "grad_norm": 0.3854341208934784, + "learning_rate": 9.593493715140767e-06, + "loss": 0.4849, + "step": 780 + }, + { + "epoch": 0.6501109877913429, + "grad_norm": 0.38782092928886414, + "learning_rate": 9.591577924695213e-06, + "loss": 0.4506, + "step": 781 + }, + { + "epoch": 0.6509433962264151, + "grad_norm": 0.4059392809867859, + "learning_rate": 9.589657822677592e-06, + "loss": 0.4686, + "step": 782 + }, + { + "epoch": 0.6517758046614872, + "grad_norm": 0.39608749747276306, + "learning_rate": 9.587733410890916e-06, + "loss": 0.4789, + "step": 783 + }, + { + "epoch": 0.6526082130965594, + "grad_norm": 0.40410831570625305, + "learning_rate": 9.585804691142237e-06, + "loss": 0.4462, + "step": 784 + }, + { + "epoch": 0.6534406215316315, + "grad_norm": 0.40202003717422485, + "learning_rate": 9.583871665242659e-06, + "loss": 0.4329, + "step": 785 + }, + { + "epoch": 0.6542730299667037, + "grad_norm": 0.42759740352630615, + "learning_rate": 9.581934335007326e-06, + "loss": 0.4415, + "step": 786 + }, + { + "epoch": 0.6551054384017758, + "grad_norm": 0.44305333495140076, + "learning_rate": 9.579992702255428e-06, + "loss": 0.4938, + "step": 787 + }, + { + "epoch": 0.655937846836848, + "grad_norm": 0.427234947681427, + "learning_rate": 9.57804676881019e-06, + "loss": 0.4579, + "step": 788 + }, + { + "epoch": 0.6567702552719201, + "grad_norm": 0.45137763023376465, + "learning_rate": 9.576096536498875e-06, + "loss": 0.493, + "step": 789 + }, + { + "epoch": 0.6576026637069923, + "grad_norm": 0.47418224811553955, + "learning_rate": 9.574142007152789e-06, + "loss": 0.4789, + "step": 790 + }, + { + "epoch": 0.6584350721420644, + "grad_norm": 0.4159716069698334, + "learning_rate": 9.572183182607269e-06, + "loss": 0.4555, + "step": 791 + }, + { + "epoch": 0.6592674805771365, + "grad_norm": 0.40891221165657043, + "learning_rate": 9.570220064701686e-06, + "loss": 0.4561, + "step": 792 + }, + { + "epoch": 0.6600998890122086, + "grad_norm": 0.4291469156742096, + "learning_rate": 9.568252655279438e-06, + "loss": 0.4844, + "step": 793 + }, + { + "epoch": 0.6609322974472808, + "grad_norm": 0.41984859108924866, + "learning_rate": 9.566280956187961e-06, + "loss": 0.4797, + "step": 794 + }, + { + "epoch": 0.6617647058823529, + "grad_norm": 0.43832656741142273, + "learning_rate": 9.564304969278714e-06, + "loss": 0.4559, + "step": 795 + }, + { + "epoch": 0.6625971143174251, + "grad_norm": 0.42272278666496277, + "learning_rate": 9.562324696407181e-06, + "loss": 0.4788, + "step": 796 + }, + { + "epoch": 0.6634295227524972, + "grad_norm": 0.4183189868927002, + "learning_rate": 9.560340139432877e-06, + "loss": 0.4894, + "step": 797 + }, + { + "epoch": 0.6642619311875694, + "grad_norm": 0.3984436094760895, + "learning_rate": 9.558351300219335e-06, + "loss": 0.4917, + "step": 798 + }, + { + "epoch": 0.6650943396226415, + "grad_norm": 0.37821829319000244, + "learning_rate": 9.556358180634105e-06, + "loss": 0.4486, + "step": 799 + }, + { + "epoch": 0.6659267480577137, + "grad_norm": 0.40212714672088623, + "learning_rate": 9.554360782548766e-06, + "loss": 0.4569, + "step": 800 + }, + { + "epoch": 0.6667591564927858, + "grad_norm": 0.421878457069397, + "learning_rate": 9.55235910783891e-06, + "loss": 0.4667, + "step": 801 + }, + { + "epoch": 0.667591564927858, + "grad_norm": 0.4031965732574463, + "learning_rate": 9.550353158384142e-06, + "loss": 0.4908, + "step": 802 + }, + { + "epoch": 0.66842397336293, + "grad_norm": 0.3970634937286377, + "learning_rate": 9.548342936068085e-06, + "loss": 0.4644, + "step": 803 + }, + { + "epoch": 0.6692563817980022, + "grad_norm": 0.437465637922287, + "learning_rate": 9.54632844277837e-06, + "loss": 0.4628, + "step": 804 + }, + { + "epoch": 0.6700887902330743, + "grad_norm": 0.3883856236934662, + "learning_rate": 9.544309680406648e-06, + "loss": 0.4712, + "step": 805 + }, + { + "epoch": 0.6709211986681465, + "grad_norm": 0.40419673919677734, + "learning_rate": 9.542286650848567e-06, + "loss": 0.4819, + "step": 806 + }, + { + "epoch": 0.6717536071032186, + "grad_norm": 0.4148818254470825, + "learning_rate": 9.540259356003787e-06, + "loss": 0.4714, + "step": 807 + }, + { + "epoch": 0.6725860155382908, + "grad_norm": 0.4469767212867737, + "learning_rate": 9.538227797775976e-06, + "loss": 0.4601, + "step": 808 + }, + { + "epoch": 0.6734184239733629, + "grad_norm": 0.37725868821144104, + "learning_rate": 9.536191978072802e-06, + "loss": 0.4671, + "step": 809 + }, + { + "epoch": 0.6742508324084351, + "grad_norm": 0.4010414183139801, + "learning_rate": 9.534151898805934e-06, + "loss": 0.4641, + "step": 810 + }, + { + "epoch": 0.6750832408435072, + "grad_norm": 0.37603193521499634, + "learning_rate": 9.532107561891044e-06, + "loss": 0.4483, + "step": 811 + }, + { + "epoch": 0.6759156492785794, + "grad_norm": 0.4010283350944519, + "learning_rate": 9.5300589692478e-06, + "loss": 0.4779, + "step": 812 + }, + { + "epoch": 0.6767480577136515, + "grad_norm": 0.3795294463634491, + "learning_rate": 9.528006122799864e-06, + "loss": 0.4757, + "step": 813 + }, + { + "epoch": 0.6775804661487237, + "grad_norm": 0.42904961109161377, + "learning_rate": 9.525949024474897e-06, + "loss": 0.4592, + "step": 814 + }, + { + "epoch": 0.6784128745837957, + "grad_norm": 0.392860472202301, + "learning_rate": 9.52388767620455e-06, + "loss": 0.477, + "step": 815 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.4124400019645691, + "learning_rate": 9.521822079924465e-06, + "loss": 0.4737, + "step": 816 + }, + { + "epoch": 0.68007769145394, + "grad_norm": 0.4388231039047241, + "learning_rate": 9.519752237574273e-06, + "loss": 0.4866, + "step": 817 + }, + { + "epoch": 0.6809100998890122, + "grad_norm": 0.372005820274353, + "learning_rate": 9.517678151097591e-06, + "loss": 0.4705, + "step": 818 + }, + { + "epoch": 0.6817425083240843, + "grad_norm": 0.4569007158279419, + "learning_rate": 9.515599822442025e-06, + "loss": 0.4756, + "step": 819 + }, + { + "epoch": 0.6825749167591565, + "grad_norm": 0.4299950897693634, + "learning_rate": 9.51351725355916e-06, + "loss": 0.4807, + "step": 820 + }, + { + "epoch": 0.6834073251942286, + "grad_norm": 0.3495566248893738, + "learning_rate": 9.511430446404566e-06, + "loss": 0.4593, + "step": 821 + }, + { + "epoch": 0.6842397336293008, + "grad_norm": 0.43988698720932007, + "learning_rate": 9.50933940293779e-06, + "loss": 0.4946, + "step": 822 + }, + { + "epoch": 0.685072142064373, + "grad_norm": 0.4119141399860382, + "learning_rate": 9.507244125122358e-06, + "loss": 0.4565, + "step": 823 + }, + { + "epoch": 0.6859045504994451, + "grad_norm": 0.40885767340660095, + "learning_rate": 9.505144614925776e-06, + "loss": 0.4624, + "step": 824 + }, + { + "epoch": 0.6867369589345172, + "grad_norm": 0.4196932017803192, + "learning_rate": 9.503040874319519e-06, + "loss": 0.4623, + "step": 825 + }, + { + "epoch": 0.6875693673695893, + "grad_norm": 0.40707823634147644, + "learning_rate": 9.500932905279034e-06, + "loss": 0.4807, + "step": 826 + }, + { + "epoch": 0.6884017758046614, + "grad_norm": 0.43840739130973816, + "learning_rate": 9.498820709783743e-06, + "loss": 0.4698, + "step": 827 + }, + { + "epoch": 0.6892341842397336, + "grad_norm": 0.3804190158843994, + "learning_rate": 9.496704289817035e-06, + "loss": 0.4404, + "step": 828 + }, + { + "epoch": 0.6900665926748057, + "grad_norm": 0.44267645478248596, + "learning_rate": 9.494583647366264e-06, + "loss": 0.4905, + "step": 829 + }, + { + "epoch": 0.6908990011098779, + "grad_norm": 0.41382384300231934, + "learning_rate": 9.492458784422751e-06, + "loss": 0.4689, + "step": 830 + }, + { + "epoch": 0.69173140954495, + "grad_norm": 0.42739763855934143, + "learning_rate": 9.49032970298178e-06, + "loss": 0.4782, + "step": 831 + }, + { + "epoch": 0.6925638179800222, + "grad_norm": 0.4234493672847748, + "learning_rate": 9.488196405042596e-06, + "loss": 0.4639, + "step": 832 + }, + { + "epoch": 0.6933962264150944, + "grad_norm": 0.3597807288169861, + "learning_rate": 9.486058892608401e-06, + "loss": 0.4541, + "step": 833 + }, + { + "epoch": 0.6942286348501665, + "grad_norm": 0.38074445724487305, + "learning_rate": 9.483917167686358e-06, + "loss": 0.4801, + "step": 834 + }, + { + "epoch": 0.6950610432852387, + "grad_norm": 0.38729429244995117, + "learning_rate": 9.481771232287585e-06, + "loss": 0.482, + "step": 835 + }, + { + "epoch": 0.6958934517203108, + "grad_norm": 0.45910367369651794, + "learning_rate": 9.479621088427152e-06, + "loss": 0.4814, + "step": 836 + }, + { + "epoch": 0.696725860155383, + "grad_norm": 0.4074380397796631, + "learning_rate": 9.47746673812408e-06, + "loss": 0.4817, + "step": 837 + }, + { + "epoch": 0.697558268590455, + "grad_norm": 0.4241724908351898, + "learning_rate": 9.475308183401347e-06, + "loss": 0.4727, + "step": 838 + }, + { + "epoch": 0.6983906770255272, + "grad_norm": 0.38416868448257446, + "learning_rate": 9.473145426285869e-06, + "loss": 0.456, + "step": 839 + }, + { + "epoch": 0.6992230854605993, + "grad_norm": 0.3833445608615875, + "learning_rate": 9.470978468808514e-06, + "loss": 0.4428, + "step": 840 + }, + { + "epoch": 0.7000554938956715, + "grad_norm": 0.39522784948349, + "learning_rate": 9.46880731300409e-06, + "loss": 0.435, + "step": 841 + }, + { + "epoch": 0.7008879023307436, + "grad_norm": 0.37949830293655396, + "learning_rate": 9.466631960911358e-06, + "loss": 0.4574, + "step": 842 + }, + { + "epoch": 0.7017203107658158, + "grad_norm": 0.3668852746486664, + "learning_rate": 9.464452414573004e-06, + "loss": 0.4629, + "step": 843 + }, + { + "epoch": 0.7025527192008879, + "grad_norm": 0.4042564332485199, + "learning_rate": 9.462268676035664e-06, + "loss": 0.4774, + "step": 844 + }, + { + "epoch": 0.7033851276359601, + "grad_norm": 0.4077381491661072, + "learning_rate": 9.460080747349907e-06, + "loss": 0.4776, + "step": 845 + }, + { + "epoch": 0.7042175360710322, + "grad_norm": 0.38546618819236755, + "learning_rate": 9.457888630570234e-06, + "loss": 0.4547, + "step": 846 + }, + { + "epoch": 0.7050499445061044, + "grad_norm": 0.3926650583744049, + "learning_rate": 9.455692327755082e-06, + "loss": 0.4741, + "step": 847 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.4238012135028839, + "learning_rate": 9.45349184096682e-06, + "loss": 0.4618, + "step": 848 + }, + { + "epoch": 0.7067147613762487, + "grad_norm": 0.42878350615501404, + "learning_rate": 9.451287172271741e-06, + "loss": 0.4404, + "step": 849 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 0.4180905222892761, + "learning_rate": 9.449078323740066e-06, + "loss": 0.4795, + "step": 850 + }, + { + "epoch": 0.7083795782463929, + "grad_norm": 0.40476009249687195, + "learning_rate": 9.446865297445947e-06, + "loss": 0.459, + "step": 851 + }, + { + "epoch": 0.709211986681465, + "grad_norm": 0.3711640238761902, + "learning_rate": 9.444648095467453e-06, + "loss": 0.4731, + "step": 852 + }, + { + "epoch": 0.7100443951165372, + "grad_norm": 0.3749478757381439, + "learning_rate": 9.442426719886572e-06, + "loss": 0.4579, + "step": 853 + }, + { + "epoch": 0.7108768035516093, + "grad_norm": 0.381773442029953, + "learning_rate": 9.440201172789218e-06, + "loss": 0.4455, + "step": 854 + }, + { + "epoch": 0.7117092119866815, + "grad_norm": 0.4290565848350525, + "learning_rate": 9.437971456265218e-06, + "loss": 0.4741, + "step": 855 + }, + { + "epoch": 0.7125416204217536, + "grad_norm": 0.3964015245437622, + "learning_rate": 9.435737572408316e-06, + "loss": 0.4771, + "step": 856 + }, + { + "epoch": 0.7133740288568258, + "grad_norm": 0.39722374081611633, + "learning_rate": 9.433499523316165e-06, + "loss": 0.4639, + "step": 857 + }, + { + "epoch": 0.7142064372918979, + "grad_norm": 0.39079853892326355, + "learning_rate": 9.431257311090336e-06, + "loss": 0.438, + "step": 858 + }, + { + "epoch": 0.7150388457269701, + "grad_norm": 0.3988889157772064, + "learning_rate": 9.429010937836302e-06, + "loss": 0.4471, + "step": 859 + }, + { + "epoch": 0.7158712541620422, + "grad_norm": 0.39545944333076477, + "learning_rate": 9.426760405663448e-06, + "loss": 0.4542, + "step": 860 + }, + { + "epoch": 0.7167036625971143, + "grad_norm": 0.39783865213394165, + "learning_rate": 9.424505716685064e-06, + "loss": 0.4667, + "step": 861 + }, + { + "epoch": 0.7175360710321864, + "grad_norm": 0.38103538751602173, + "learning_rate": 9.422246873018343e-06, + "loss": 0.4689, + "step": 862 + }, + { + "epoch": 0.7183684794672586, + "grad_norm": 0.3842844069004059, + "learning_rate": 9.419983876784378e-06, + "loss": 0.4659, + "step": 863 + }, + { + "epoch": 0.7192008879023307, + "grad_norm": 0.36254891753196716, + "learning_rate": 9.41771673010816e-06, + "loss": 0.437, + "step": 864 + }, + { + "epoch": 0.7200332963374029, + "grad_norm": 0.411409467458725, + "learning_rate": 9.415445435118581e-06, + "loss": 0.4671, + "step": 865 + }, + { + "epoch": 0.720865704772475, + "grad_norm": 0.4189411401748657, + "learning_rate": 9.41316999394843e-06, + "loss": 0.507, + "step": 866 + }, + { + "epoch": 0.7216981132075472, + "grad_norm": 0.448301762342453, + "learning_rate": 9.410890408734381e-06, + "loss": 0.4789, + "step": 867 + }, + { + "epoch": 0.7225305216426193, + "grad_norm": 0.39381828904151917, + "learning_rate": 9.408606681617006e-06, + "loss": 0.4514, + "step": 868 + }, + { + "epoch": 0.7233629300776915, + "grad_norm": 0.42399051785469055, + "learning_rate": 9.406318814740767e-06, + "loss": 0.4513, + "step": 869 + }, + { + "epoch": 0.7241953385127636, + "grad_norm": 0.39758798480033875, + "learning_rate": 9.404026810254007e-06, + "loss": 0.4539, + "step": 870 + }, + { + "epoch": 0.7250277469478358, + "grad_norm": 0.4223097562789917, + "learning_rate": 9.401730670308963e-06, + "loss": 0.4824, + "step": 871 + }, + { + "epoch": 0.7258601553829079, + "grad_norm": 0.42844802141189575, + "learning_rate": 9.399430397061746e-06, + "loss": 0.4759, + "step": 872 + }, + { + "epoch": 0.72669256381798, + "grad_norm": 0.4464619755744934, + "learning_rate": 9.397125992672358e-06, + "loss": 0.4437, + "step": 873 + }, + { + "epoch": 0.7275249722530521, + "grad_norm": 0.3978061079978943, + "learning_rate": 9.394817459304671e-06, + "loss": 0.4828, + "step": 874 + }, + { + "epoch": 0.7283573806881243, + "grad_norm": 0.4394519329071045, + "learning_rate": 9.392504799126439e-06, + "loss": 0.4746, + "step": 875 + }, + { + "epoch": 0.7291897891231964, + "grad_norm": 0.40214264392852783, + "learning_rate": 9.39018801430929e-06, + "loss": 0.4673, + "step": 876 + }, + { + "epoch": 0.7300221975582686, + "grad_norm": 0.40048447251319885, + "learning_rate": 9.387867107028727e-06, + "loss": 0.4793, + "step": 877 + }, + { + "epoch": 0.7308546059933407, + "grad_norm": 0.4119110703468323, + "learning_rate": 9.385542079464123e-06, + "loss": 0.4615, + "step": 878 + }, + { + "epoch": 0.7316870144284129, + "grad_norm": 0.39967912435531616, + "learning_rate": 9.383212933798718e-06, + "loss": 0.4664, + "step": 879 + }, + { + "epoch": 0.732519422863485, + "grad_norm": 0.39798152446746826, + "learning_rate": 9.38087967221962e-06, + "loss": 0.4575, + "step": 880 + }, + { + "epoch": 0.7333518312985572, + "grad_norm": 0.38400161266326904, + "learning_rate": 9.378542296917804e-06, + "loss": 0.456, + "step": 881 + }, + { + "epoch": 0.7341842397336293, + "grad_norm": 0.4055246412754059, + "learning_rate": 9.376200810088108e-06, + "loss": 0.4613, + "step": 882 + }, + { + "epoch": 0.7350166481687015, + "grad_norm": 0.3930290639400482, + "learning_rate": 9.373855213929227e-06, + "loss": 0.4699, + "step": 883 + }, + { + "epoch": 0.7358490566037735, + "grad_norm": 0.39786094427108765, + "learning_rate": 9.371505510643714e-06, + "loss": 0.483, + "step": 884 + }, + { + "epoch": 0.7366814650388457, + "grad_norm": 0.3793211579322815, + "learning_rate": 9.369151702437987e-06, + "loss": 0.4762, + "step": 885 + }, + { + "epoch": 0.7375138734739178, + "grad_norm": 0.38085439801216125, + "learning_rate": 9.366793791522308e-06, + "loss": 0.4535, + "step": 886 + }, + { + "epoch": 0.73834628190899, + "grad_norm": 0.371055543422699, + "learning_rate": 9.364431780110801e-06, + "loss": 0.4722, + "step": 887 + }, + { + "epoch": 0.7391786903440621, + "grad_norm": 0.3615517318248749, + "learning_rate": 9.362065670421434e-06, + "loss": 0.4177, + "step": 888 + }, + { + "epoch": 0.7400110987791343, + "grad_norm": 0.3889777660369873, + "learning_rate": 9.359695464676025e-06, + "loss": 0.455, + "step": 889 + }, + { + "epoch": 0.7408435072142064, + "grad_norm": 0.4410359561443329, + "learning_rate": 9.35732116510024e-06, + "loss": 0.4966, + "step": 890 + }, + { + "epoch": 0.7416759156492786, + "grad_norm": 0.42662596702575684, + "learning_rate": 9.354942773923588e-06, + "loss": 0.46, + "step": 891 + }, + { + "epoch": 0.7425083240843507, + "grad_norm": 0.394815593957901, + "learning_rate": 9.352560293379417e-06, + "loss": 0.4762, + "step": 892 + }, + { + "epoch": 0.7433407325194229, + "grad_norm": 0.39383867383003235, + "learning_rate": 9.350173725704922e-06, + "loss": 0.4519, + "step": 893 + }, + { + "epoch": 0.744173140954495, + "grad_norm": 0.40083327889442444, + "learning_rate": 9.34778307314113e-06, + "loss": 0.4345, + "step": 894 + }, + { + "epoch": 0.7450055493895672, + "grad_norm": 0.386772096157074, + "learning_rate": 9.345388337932906e-06, + "loss": 0.4519, + "step": 895 + }, + { + "epoch": 0.7458379578246392, + "grad_norm": 0.4634164273738861, + "learning_rate": 9.342989522328947e-06, + "loss": 0.4256, + "step": 896 + }, + { + "epoch": 0.7466703662597114, + "grad_norm": 0.4075722396373749, + "learning_rate": 9.340586628581783e-06, + "loss": 0.4548, + "step": 897 + }, + { + "epoch": 0.7475027746947835, + "grad_norm": 0.4286724925041199, + "learning_rate": 9.338179658947774e-06, + "loss": 0.4737, + "step": 898 + }, + { + "epoch": 0.7483351831298557, + "grad_norm": 0.43397256731987, + "learning_rate": 9.335768615687108e-06, + "loss": 0.4543, + "step": 899 + }, + { + "epoch": 0.7491675915649278, + "grad_norm": 0.40175217390060425, + "learning_rate": 9.333353501063796e-06, + "loss": 0.4702, + "step": 900 + }, + { + "epoch": 0.75, + "grad_norm": 0.3902556300163269, + "learning_rate": 9.330934317345673e-06, + "loss": 0.4734, + "step": 901 + }, + { + "epoch": 0.7508324084350722, + "grad_norm": 0.40193432569503784, + "learning_rate": 9.328511066804391e-06, + "loss": 0.4382, + "step": 902 + }, + { + "epoch": 0.7516648168701443, + "grad_norm": 0.4484616219997406, + "learning_rate": 9.32608375171543e-06, + "loss": 0.4704, + "step": 903 + }, + { + "epoch": 0.7524972253052165, + "grad_norm": 0.39739924669265747, + "learning_rate": 9.32365237435808e-06, + "loss": 0.4643, + "step": 904 + }, + { + "epoch": 0.7533296337402886, + "grad_norm": 0.42331647872924805, + "learning_rate": 9.321216937015446e-06, + "loss": 0.4584, + "step": 905 + }, + { + "epoch": 0.7541620421753608, + "grad_norm": 0.3581715524196625, + "learning_rate": 9.318777441974446e-06, + "loss": 0.467, + "step": 906 + }, + { + "epoch": 0.7549944506104328, + "grad_norm": 0.40118566155433655, + "learning_rate": 9.316333891525809e-06, + "loss": 0.443, + "step": 907 + }, + { + "epoch": 0.755826859045505, + "grad_norm": 0.4858897626399994, + "learning_rate": 9.313886287964072e-06, + "loss": 0.4666, + "step": 908 + }, + { + "epoch": 0.7566592674805771, + "grad_norm": 0.43074485659599304, + "learning_rate": 9.311434633587577e-06, + "loss": 0.4605, + "step": 909 + }, + { + "epoch": 0.7574916759156493, + "grad_norm": 0.4585159718990326, + "learning_rate": 9.308978930698472e-06, + "loss": 0.4605, + "step": 910 + }, + { + "epoch": 0.7583240843507214, + "grad_norm": 0.433880478143692, + "learning_rate": 9.306519181602704e-06, + "loss": 0.4644, + "step": 911 + }, + { + "epoch": 0.7591564927857936, + "grad_norm": 0.38200658559799194, + "learning_rate": 9.304055388610019e-06, + "loss": 0.4427, + "step": 912 + }, + { + "epoch": 0.7599889012208657, + "grad_norm": 0.49386703968048096, + "learning_rate": 9.301587554033965e-06, + "loss": 0.4637, + "step": 913 + }, + { + "epoch": 0.7608213096559379, + "grad_norm": 0.4780905544757843, + "learning_rate": 9.299115680191876e-06, + "loss": 0.4648, + "step": 914 + }, + { + "epoch": 0.76165371809101, + "grad_norm": 0.4210283160209656, + "learning_rate": 9.296639769404892e-06, + "loss": 0.4691, + "step": 915 + }, + { + "epoch": 0.7624861265260822, + "grad_norm": 0.4278091788291931, + "learning_rate": 9.294159823997933e-06, + "loss": 0.4551, + "step": 916 + }, + { + "epoch": 0.7633185349611543, + "grad_norm": 0.39724352955818176, + "learning_rate": 9.291675846299711e-06, + "loss": 0.4963, + "step": 917 + }, + { + "epoch": 0.7641509433962265, + "grad_norm": 0.4462328851222992, + "learning_rate": 9.289187838642724e-06, + "loss": 0.4781, + "step": 918 + }, + { + "epoch": 0.7649833518312985, + "grad_norm": 0.3846145570278168, + "learning_rate": 9.286695803363257e-06, + "loss": 0.442, + "step": 919 + }, + { + "epoch": 0.7658157602663707, + "grad_norm": 0.3947022557258606, + "learning_rate": 9.284199742801373e-06, + "loss": 0.4804, + "step": 920 + }, + { + "epoch": 0.7666481687014428, + "grad_norm": 0.4000520408153534, + "learning_rate": 9.281699659300917e-06, + "loss": 0.5051, + "step": 921 + }, + { + "epoch": 0.767480577136515, + "grad_norm": 0.4070630669593811, + "learning_rate": 9.279195555209513e-06, + "loss": 0.4547, + "step": 922 + }, + { + "epoch": 0.7683129855715871, + "grad_norm": 0.4421361982822418, + "learning_rate": 9.276687432878554e-06, + "loss": 0.4619, + "step": 923 + }, + { + "epoch": 0.7691453940066593, + "grad_norm": 0.404674232006073, + "learning_rate": 9.274175294663215e-06, + "loss": 0.462, + "step": 924 + }, + { + "epoch": 0.7699778024417314, + "grad_norm": 0.4667954444885254, + "learning_rate": 9.271659142922438e-06, + "loss": 0.4739, + "step": 925 + }, + { + "epoch": 0.7708102108768036, + "grad_norm": 0.3848412036895752, + "learning_rate": 9.26913898001893e-06, + "loss": 0.489, + "step": 926 + }, + { + "epoch": 0.7716426193118757, + "grad_norm": 0.4155692458152771, + "learning_rate": 9.26661480831917e-06, + "loss": 0.4522, + "step": 927 + }, + { + "epoch": 0.7724750277469479, + "grad_norm": 0.4222828447818756, + "learning_rate": 9.2640866301934e-06, + "loss": 0.4756, + "step": 928 + }, + { + "epoch": 0.77330743618202, + "grad_norm": 0.3686828911304474, + "learning_rate": 9.261554448015625e-06, + "loss": 0.4513, + "step": 929 + }, + { + "epoch": 0.7741398446170921, + "grad_norm": 0.4023358225822449, + "learning_rate": 9.259018264163604e-06, + "loss": 0.4447, + "step": 930 + }, + { + "epoch": 0.7749722530521642, + "grad_norm": 0.46268701553344727, + "learning_rate": 9.25647808101886e-06, + "loss": 0.4696, + "step": 931 + }, + { + "epoch": 0.7758046614872364, + "grad_norm": 0.39935389161109924, + "learning_rate": 9.253933900966672e-06, + "loss": 0.4549, + "step": 932 + }, + { + "epoch": 0.7766370699223085, + "grad_norm": 0.41673651337623596, + "learning_rate": 9.251385726396065e-06, + "loss": 0.4756, + "step": 933 + }, + { + "epoch": 0.7774694783573807, + "grad_norm": 0.47298237681388855, + "learning_rate": 9.248833559699824e-06, + "loss": 0.4617, + "step": 934 + }, + { + "epoch": 0.7783018867924528, + "grad_norm": 0.3737104833126068, + "learning_rate": 9.246277403274475e-06, + "loss": 0.437, + "step": 935 + }, + { + "epoch": 0.779134295227525, + "grad_norm": 0.47945085167884827, + "learning_rate": 9.243717259520296e-06, + "loss": 0.4657, + "step": 936 + }, + { + "epoch": 0.7799667036625971, + "grad_norm": 0.3635734021663666, + "learning_rate": 9.241153130841305e-06, + "loss": 0.4205, + "step": 937 + }, + { + "epoch": 0.7807991120976693, + "grad_norm": 0.4091576933860779, + "learning_rate": 9.238585019645265e-06, + "loss": 0.4579, + "step": 938 + }, + { + "epoch": 0.7816315205327414, + "grad_norm": 0.4312977194786072, + "learning_rate": 9.236012928343676e-06, + "loss": 0.4557, + "step": 939 + }, + { + "epoch": 0.7824639289678136, + "grad_norm": 0.44428691267967224, + "learning_rate": 9.233436859351778e-06, + "loss": 0.4538, + "step": 940 + }, + { + "epoch": 0.7832963374028857, + "grad_norm": 0.43025487661361694, + "learning_rate": 9.230856815088546e-06, + "loss": 0.4668, + "step": 941 + }, + { + "epoch": 0.7841287458379578, + "grad_norm": 0.42932865023612976, + "learning_rate": 9.228272797976685e-06, + "loss": 0.4588, + "step": 942 + }, + { + "epoch": 0.7849611542730299, + "grad_norm": 0.45379704236984253, + "learning_rate": 9.22568481044263e-06, + "loss": 0.4248, + "step": 943 + }, + { + "epoch": 0.7857935627081021, + "grad_norm": 0.5083206295967102, + "learning_rate": 9.223092854916552e-06, + "loss": 0.4797, + "step": 944 + }, + { + "epoch": 0.7866259711431742, + "grad_norm": 0.47013911604881287, + "learning_rate": 9.220496933832338e-06, + "loss": 0.4839, + "step": 945 + }, + { + "epoch": 0.7874583795782464, + "grad_norm": 0.38621267676353455, + "learning_rate": 9.217897049627605e-06, + "loss": 0.4352, + "step": 946 + }, + { + "epoch": 0.7882907880133185, + "grad_norm": 0.48977726697921753, + "learning_rate": 9.21529320474369e-06, + "loss": 0.4596, + "step": 947 + }, + { + "epoch": 0.7891231964483907, + "grad_norm": 0.4230138063430786, + "learning_rate": 9.212685401625649e-06, + "loss": 0.4623, + "step": 948 + }, + { + "epoch": 0.7899556048834628, + "grad_norm": 0.41531112790107727, + "learning_rate": 9.210073642722256e-06, + "loss": 0.4596, + "step": 949 + }, + { + "epoch": 0.790788013318535, + "grad_norm": 0.4705289602279663, + "learning_rate": 9.207457930485996e-06, + "loss": 0.4578, + "step": 950 + }, + { + "epoch": 0.7916204217536071, + "grad_norm": 0.4125988781452179, + "learning_rate": 9.20483826737307e-06, + "loss": 0.4498, + "step": 951 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.42193499207496643, + "learning_rate": 9.202214655843386e-06, + "loss": 0.447, + "step": 952 + }, + { + "epoch": 0.7932852386237513, + "grad_norm": 0.3790290057659149, + "learning_rate": 9.199587098360563e-06, + "loss": 0.4602, + "step": 953 + }, + { + "epoch": 0.7941176470588235, + "grad_norm": 0.37495550513267517, + "learning_rate": 9.196955597391923e-06, + "loss": 0.4458, + "step": 954 + }, + { + "epoch": 0.7949500554938956, + "grad_norm": 0.4026005268096924, + "learning_rate": 9.19432015540849e-06, + "loss": 0.4445, + "step": 955 + }, + { + "epoch": 0.7957824639289678, + "grad_norm": 0.40387800335884094, + "learning_rate": 9.191680774884992e-06, + "loss": 0.4688, + "step": 956 + }, + { + "epoch": 0.7966148723640399, + "grad_norm": 0.3699960708618164, + "learning_rate": 9.189037458299854e-06, + "loss": 0.4725, + "step": 957 + }, + { + "epoch": 0.7974472807991121, + "grad_norm": 0.41380584239959717, + "learning_rate": 9.186390208135194e-06, + "loss": 0.4589, + "step": 958 + }, + { + "epoch": 0.7982796892341842, + "grad_norm": 0.41292324662208557, + "learning_rate": 9.18373902687683e-06, + "loss": 0.4501, + "step": 959 + }, + { + "epoch": 0.7991120976692564, + "grad_norm": 0.3478350341320038, + "learning_rate": 9.181083917014262e-06, + "loss": 0.4391, + "step": 960 + }, + { + "epoch": 0.7999445061043285, + "grad_norm": 0.468945175409317, + "learning_rate": 9.17842488104069e-06, + "loss": 0.475, + "step": 961 + }, + { + "epoch": 0.8007769145394007, + "grad_norm": 0.40263715386390686, + "learning_rate": 9.175761921452992e-06, + "loss": 0.4416, + "step": 962 + }, + { + "epoch": 0.8016093229744728, + "grad_norm": 0.4122201204299927, + "learning_rate": 9.173095040751738e-06, + "loss": 0.4474, + "step": 963 + }, + { + "epoch": 0.802441731409545, + "grad_norm": 0.3991578221321106, + "learning_rate": 9.17042424144117e-06, + "loss": 0.4571, + "step": 964 + }, + { + "epoch": 0.803274139844617, + "grad_norm": 0.39560526609420776, + "learning_rate": 9.16774952602922e-06, + "loss": 0.4917, + "step": 965 + }, + { + "epoch": 0.8041065482796892, + "grad_norm": 0.40626710653305054, + "learning_rate": 9.165070897027487e-06, + "loss": 0.4676, + "step": 966 + }, + { + "epoch": 0.8049389567147613, + "grad_norm": 0.4202491343021393, + "learning_rate": 9.162388356951257e-06, + "loss": 0.454, + "step": 967 + }, + { + "epoch": 0.8057713651498335, + "grad_norm": 0.35746338963508606, + "learning_rate": 9.15970190831948e-06, + "loss": 0.4639, + "step": 968 + }, + { + "epoch": 0.8066037735849056, + "grad_norm": 0.3897421956062317, + "learning_rate": 9.157011553654776e-06, + "loss": 0.4548, + "step": 969 + }, + { + "epoch": 0.8074361820199778, + "grad_norm": 0.3858961760997772, + "learning_rate": 9.154317295483437e-06, + "loss": 0.4629, + "step": 970 + }, + { + "epoch": 0.80826859045505, + "grad_norm": 0.4071405827999115, + "learning_rate": 9.151619136335419e-06, + "loss": 0.4685, + "step": 971 + }, + { + "epoch": 0.8091009988901221, + "grad_norm": 0.41344863176345825, + "learning_rate": 9.14891707874434e-06, + "loss": 0.4563, + "step": 972 + }, + { + "epoch": 0.8099334073251943, + "grad_norm": 0.41553568840026855, + "learning_rate": 9.146211125247478e-06, + "loss": 0.4347, + "step": 973 + }, + { + "epoch": 0.8107658157602664, + "grad_norm": 0.44281312823295593, + "learning_rate": 9.143501278385773e-06, + "loss": 0.4563, + "step": 974 + }, + { + "epoch": 0.8115982241953386, + "grad_norm": 0.45483851432800293, + "learning_rate": 9.140787540703817e-06, + "loss": 0.4658, + "step": 975 + }, + { + "epoch": 0.8124306326304107, + "grad_norm": 0.382942795753479, + "learning_rate": 9.138069914749859e-06, + "loss": 0.4486, + "step": 976 + }, + { + "epoch": 0.8132630410654828, + "grad_norm": 0.42402827739715576, + "learning_rate": 9.135348403075795e-06, + "loss": 0.4853, + "step": 977 + }, + { + "epoch": 0.8140954495005549, + "grad_norm": 0.3964520990848541, + "learning_rate": 9.132623008237174e-06, + "loss": 0.4955, + "step": 978 + }, + { + "epoch": 0.814927857935627, + "grad_norm": 0.5112728476524353, + "learning_rate": 9.12989373279319e-06, + "loss": 0.4493, + "step": 979 + }, + { + "epoch": 0.8157602663706992, + "grad_norm": 0.4075373113155365, + "learning_rate": 9.127160579306678e-06, + "loss": 0.4551, + "step": 980 + }, + { + "epoch": 0.8165926748057714, + "grad_norm": 0.4562476873397827, + "learning_rate": 9.124423550344118e-06, + "loss": 0.4479, + "step": 981 + }, + { + "epoch": 0.8174250832408435, + "grad_norm": 0.4151056706905365, + "learning_rate": 9.12168264847563e-06, + "loss": 0.4721, + "step": 982 + }, + { + "epoch": 0.8182574916759157, + "grad_norm": 0.3973326086997986, + "learning_rate": 9.118937876274965e-06, + "loss": 0.4544, + "step": 983 + }, + { + "epoch": 0.8190899001109878, + "grad_norm": 0.4236065745353699, + "learning_rate": 9.116189236319515e-06, + "loss": 0.4596, + "step": 984 + }, + { + "epoch": 0.81992230854606, + "grad_norm": 0.3826847970485687, + "learning_rate": 9.113436731190302e-06, + "loss": 0.4686, + "step": 985 + }, + { + "epoch": 0.8207547169811321, + "grad_norm": 0.34965115785598755, + "learning_rate": 9.110680363471973e-06, + "loss": 0.4528, + "step": 986 + }, + { + "epoch": 0.8215871254162043, + "grad_norm": 0.40207812190055847, + "learning_rate": 9.10792013575281e-06, + "loss": 0.4559, + "step": 987 + }, + { + "epoch": 0.8224195338512763, + "grad_norm": 0.4016001522541046, + "learning_rate": 9.10515605062471e-06, + "loss": 0.4553, + "step": 988 + }, + { + "epoch": 0.8232519422863485, + "grad_norm": 0.44705718755722046, + "learning_rate": 9.102388110683201e-06, + "loss": 0.4915, + "step": 989 + }, + { + "epoch": 0.8240843507214206, + "grad_norm": 0.43472737073898315, + "learning_rate": 9.099616318527426e-06, + "loss": 0.4828, + "step": 990 + }, + { + "epoch": 0.8249167591564928, + "grad_norm": 0.42259252071380615, + "learning_rate": 9.096840676760146e-06, + "loss": 0.4427, + "step": 991 + }, + { + "epoch": 0.8257491675915649, + "grad_norm": 0.44350579380989075, + "learning_rate": 9.09406118798774e-06, + "loss": 0.4859, + "step": 992 + }, + { + "epoch": 0.8265815760266371, + "grad_norm": 0.39259374141693115, + "learning_rate": 9.091277854820191e-06, + "loss": 0.4398, + "step": 993 + }, + { + "epoch": 0.8274139844617092, + "grad_norm": 0.3800427317619324, + "learning_rate": 9.088490679871102e-06, + "loss": 0.4363, + "step": 994 + }, + { + "epoch": 0.8282463928967814, + "grad_norm": 0.4360864758491516, + "learning_rate": 9.085699665757679e-06, + "loss": 0.4651, + "step": 995 + }, + { + "epoch": 0.8290788013318535, + "grad_norm": 0.39429229497909546, + "learning_rate": 9.082904815100732e-06, + "loss": 0.4669, + "step": 996 + }, + { + "epoch": 0.8299112097669257, + "grad_norm": 0.4399055242538452, + "learning_rate": 9.080106130524675e-06, + "loss": 0.4396, + "step": 997 + }, + { + "epoch": 0.8307436182019978, + "grad_norm": 0.4866039454936981, + "learning_rate": 9.07730361465752e-06, + "loss": 0.4543, + "step": 998 + }, + { + "epoch": 0.83157602663707, + "grad_norm": 0.4812076687812805, + "learning_rate": 9.07449727013088e-06, + "loss": 0.4731, + "step": 999 + }, + { + "epoch": 0.832408435072142, + "grad_norm": 0.43150269985198975, + "learning_rate": 9.071687099579962e-06, + "loss": 0.4774, + "step": 1000 + }, + { + "epoch": 0.8332408435072142, + "grad_norm": 0.3944185674190521, + "learning_rate": 9.068873105643565e-06, + "loss": 0.4399, + "step": 1001 + }, + { + "epoch": 0.8340732519422863, + "grad_norm": 0.5351603031158447, + "learning_rate": 9.066055290964079e-06, + "loss": 0.4518, + "step": 1002 + }, + { + "epoch": 0.8349056603773585, + "grad_norm": 0.38343068957328796, + "learning_rate": 9.063233658187482e-06, + "loss": 0.4843, + "step": 1003 + }, + { + "epoch": 0.8357380688124306, + "grad_norm": 0.4362078607082367, + "learning_rate": 9.060408209963334e-06, + "loss": 0.4642, + "step": 1004 + }, + { + "epoch": 0.8365704772475028, + "grad_norm": 0.4815027117729187, + "learning_rate": 9.057578948944783e-06, + "loss": 0.4497, + "step": 1005 + }, + { + "epoch": 0.8374028856825749, + "grad_norm": 0.4579710066318512, + "learning_rate": 9.054745877788554e-06, + "loss": 0.4475, + "step": 1006 + }, + { + "epoch": 0.8382352941176471, + "grad_norm": 0.49727219343185425, + "learning_rate": 9.051908999154948e-06, + "loss": 0.4707, + "step": 1007 + }, + { + "epoch": 0.8390677025527192, + "grad_norm": 0.43605858087539673, + "learning_rate": 9.049068315707847e-06, + "loss": 0.4365, + "step": 1008 + }, + { + "epoch": 0.8399001109877914, + "grad_norm": 0.5100065469741821, + "learning_rate": 9.0462238301147e-06, + "loss": 0.4332, + "step": 1009 + }, + { + "epoch": 0.8407325194228635, + "grad_norm": 0.38812533020973206, + "learning_rate": 9.04337554504653e-06, + "loss": 0.4384, + "step": 1010 + }, + { + "epoch": 0.8415649278579356, + "grad_norm": 0.384883314371109, + "learning_rate": 9.040523463177928e-06, + "loss": 0.4663, + "step": 1011 + }, + { + "epoch": 0.8423973362930077, + "grad_norm": 0.44887587428092957, + "learning_rate": 9.037667587187045e-06, + "loss": 0.486, + "step": 1012 + }, + { + "epoch": 0.8432297447280799, + "grad_norm": 0.46977436542510986, + "learning_rate": 9.034807919755602e-06, + "loss": 0.4575, + "step": 1013 + }, + { + "epoch": 0.844062153163152, + "grad_norm": 0.3983987867832184, + "learning_rate": 9.031944463568877e-06, + "loss": 0.4532, + "step": 1014 + }, + { + "epoch": 0.8448945615982242, + "grad_norm": 0.4556075632572174, + "learning_rate": 9.029077221315703e-06, + "loss": 0.4685, + "step": 1015 + }, + { + "epoch": 0.8457269700332963, + "grad_norm": 0.38745394349098206, + "learning_rate": 9.026206195688472e-06, + "loss": 0.4608, + "step": 1016 + }, + { + "epoch": 0.8465593784683685, + "grad_norm": 0.4005940556526184, + "learning_rate": 9.023331389383126e-06, + "loss": 0.4628, + "step": 1017 + }, + { + "epoch": 0.8473917869034406, + "grad_norm": 0.39503729343414307, + "learning_rate": 9.02045280509916e-06, + "loss": 0.4688, + "step": 1018 + }, + { + "epoch": 0.8482241953385128, + "grad_norm": 0.4228137731552124, + "learning_rate": 9.017570445539616e-06, + "loss": 0.4594, + "step": 1019 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 0.36277511715888977, + "learning_rate": 9.014684313411077e-06, + "loss": 0.4274, + "step": 1020 + }, + { + "epoch": 0.8498890122086571, + "grad_norm": 0.3664073050022125, + "learning_rate": 9.011794411423675e-06, + "loss": 0.4592, + "step": 1021 + }, + { + "epoch": 0.8507214206437292, + "grad_norm": 0.3959209620952606, + "learning_rate": 9.008900742291075e-06, + "loss": 0.4583, + "step": 1022 + }, + { + "epoch": 0.8515538290788013, + "grad_norm": 0.3981209099292755, + "learning_rate": 9.006003308730487e-06, + "loss": 0.4543, + "step": 1023 + }, + { + "epoch": 0.8523862375138734, + "grad_norm": 0.33361658453941345, + "learning_rate": 9.003102113462647e-06, + "loss": 0.4526, + "step": 1024 + }, + { + "epoch": 0.8532186459489456, + "grad_norm": 0.37563925981521606, + "learning_rate": 9.000197159211834e-06, + "loss": 0.4657, + "step": 1025 + }, + { + "epoch": 0.8540510543840177, + "grad_norm": 0.36745545268058777, + "learning_rate": 8.997288448705846e-06, + "loss": 0.4451, + "step": 1026 + }, + { + "epoch": 0.8548834628190899, + "grad_norm": 0.3501058518886566, + "learning_rate": 8.994375984676014e-06, + "loss": 0.4413, + "step": 1027 + }, + { + "epoch": 0.855715871254162, + "grad_norm": 0.38974809646606445, + "learning_rate": 8.991459769857195e-06, + "loss": 0.4492, + "step": 1028 + }, + { + "epoch": 0.8565482796892342, + "grad_norm": 0.3752514719963074, + "learning_rate": 8.988539806987764e-06, + "loss": 0.4533, + "step": 1029 + }, + { + "epoch": 0.8573806881243063, + "grad_norm": 0.3805655241012573, + "learning_rate": 8.985616098809618e-06, + "loss": 0.4733, + "step": 1030 + }, + { + "epoch": 0.8582130965593785, + "grad_norm": 0.46047642827033997, + "learning_rate": 8.982688648068169e-06, + "loss": 0.4593, + "step": 1031 + }, + { + "epoch": 0.8590455049944506, + "grad_norm": 0.39984360337257385, + "learning_rate": 8.979757457512347e-06, + "loss": 0.4749, + "step": 1032 + }, + { + "epoch": 0.8598779134295228, + "grad_norm": 0.3868159353733063, + "learning_rate": 8.976822529894588e-06, + "loss": 0.4577, + "step": 1033 + }, + { + "epoch": 0.8607103218645948, + "grad_norm": 0.41874581575393677, + "learning_rate": 8.973883867970844e-06, + "loss": 0.4784, + "step": 1034 + }, + { + "epoch": 0.861542730299667, + "grad_norm": 0.3759986162185669, + "learning_rate": 8.970941474500565e-06, + "loss": 0.4554, + "step": 1035 + }, + { + "epoch": 0.8623751387347391, + "grad_norm": 0.4303062856197357, + "learning_rate": 8.967995352246714e-06, + "loss": 0.4724, + "step": 1036 + }, + { + "epoch": 0.8632075471698113, + "grad_norm": 0.420803040266037, + "learning_rate": 8.965045503975752e-06, + "loss": 0.486, + "step": 1037 + }, + { + "epoch": 0.8640399556048834, + "grad_norm": 0.41059285402297974, + "learning_rate": 8.962091932457635e-06, + "loss": 0.4549, + "step": 1038 + }, + { + "epoch": 0.8648723640399556, + "grad_norm": 0.3770391047000885, + "learning_rate": 8.959134640465821e-06, + "loss": 0.4783, + "step": 1039 + }, + { + "epoch": 0.8657047724750278, + "grad_norm": 0.5017075538635254, + "learning_rate": 8.956173630777255e-06, + "loss": 0.4476, + "step": 1040 + }, + { + "epoch": 0.8665371809100999, + "grad_norm": 0.33851176500320435, + "learning_rate": 8.953208906172384e-06, + "loss": 0.4677, + "step": 1041 + }, + { + "epoch": 0.867369589345172, + "grad_norm": 0.40573734045028687, + "learning_rate": 8.95024046943513e-06, + "loss": 0.4607, + "step": 1042 + }, + { + "epoch": 0.8682019977802442, + "grad_norm": 0.40829798579216003, + "learning_rate": 8.947268323352909e-06, + "loss": 0.4613, + "step": 1043 + }, + { + "epoch": 0.8690344062153164, + "grad_norm": 0.40306246280670166, + "learning_rate": 8.944292470716617e-06, + "loss": 0.487, + "step": 1044 + }, + { + "epoch": 0.8698668146503885, + "grad_norm": 0.4743926227092743, + "learning_rate": 8.941312914320636e-06, + "loss": 0.4623, + "step": 1045 + }, + { + "epoch": 0.8706992230854605, + "grad_norm": 0.388696551322937, + "learning_rate": 8.938329656962818e-06, + "loss": 0.4678, + "step": 1046 + }, + { + "epoch": 0.8715316315205327, + "grad_norm": 0.3969860076904297, + "learning_rate": 8.935342701444495e-06, + "loss": 0.437, + "step": 1047 + }, + { + "epoch": 0.8723640399556049, + "grad_norm": 0.3880145251750946, + "learning_rate": 8.932352050570467e-06, + "loss": 0.4702, + "step": 1048 + }, + { + "epoch": 0.873196448390677, + "grad_norm": 0.41687145829200745, + "learning_rate": 8.929357707149014e-06, + "loss": 0.4662, + "step": 1049 + }, + { + "epoch": 0.8740288568257492, + "grad_norm": 0.3698790967464447, + "learning_rate": 8.926359673991874e-06, + "loss": 0.4579, + "step": 1050 + }, + { + "epoch": 0.8748612652608213, + "grad_norm": 0.3788878917694092, + "learning_rate": 8.92335795391425e-06, + "loss": 0.4726, + "step": 1051 + }, + { + "epoch": 0.8756936736958935, + "grad_norm": 0.43458738923072815, + "learning_rate": 8.920352549734812e-06, + "loss": 0.4771, + "step": 1052 + }, + { + "epoch": 0.8765260821309656, + "grad_norm": 0.3463033139705658, + "learning_rate": 8.91734346427569e-06, + "loss": 0.4503, + "step": 1053 + }, + { + "epoch": 0.8773584905660378, + "grad_norm": 0.3873491585254669, + "learning_rate": 8.914330700362461e-06, + "loss": 0.4513, + "step": 1054 + }, + { + "epoch": 0.8781908990011099, + "grad_norm": 0.36356833577156067, + "learning_rate": 8.91131426082417e-06, + "loss": 0.4362, + "step": 1055 + }, + { + "epoch": 0.8790233074361821, + "grad_norm": 0.3933038115501404, + "learning_rate": 8.908294148493303e-06, + "loss": 0.4602, + "step": 1056 + }, + { + "epoch": 0.8798557158712541, + "grad_norm": 0.3321017026901245, + "learning_rate": 8.905270366205798e-06, + "loss": 0.4457, + "step": 1057 + }, + { + "epoch": 0.8806881243063263, + "grad_norm": 0.42186474800109863, + "learning_rate": 8.902242916801043e-06, + "loss": 0.4743, + "step": 1058 + }, + { + "epoch": 0.8815205327413984, + "grad_norm": 0.3869630694389343, + "learning_rate": 8.899211803121861e-06, + "loss": 0.432, + "step": 1059 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.3925645351409912, + "learning_rate": 8.896177028014524e-06, + "loss": 0.481, + "step": 1060 + }, + { + "epoch": 0.8831853496115427, + "grad_norm": 0.35652878880500793, + "learning_rate": 8.893138594328738e-06, + "loss": 0.4576, + "step": 1061 + }, + { + "epoch": 0.8840177580466149, + "grad_norm": 0.3799140453338623, + "learning_rate": 8.890096504917647e-06, + "loss": 0.4318, + "step": 1062 + }, + { + "epoch": 0.884850166481687, + "grad_norm": 0.43312981724739075, + "learning_rate": 8.887050762637825e-06, + "loss": 0.4647, + "step": 1063 + }, + { + "epoch": 0.8856825749167592, + "grad_norm": 0.3973630666732788, + "learning_rate": 8.884001370349275e-06, + "loss": 0.4598, + "step": 1064 + }, + { + "epoch": 0.8865149833518313, + "grad_norm": 0.3882988393306732, + "learning_rate": 8.880948330915435e-06, + "loss": 0.4827, + "step": 1065 + }, + { + "epoch": 0.8873473917869035, + "grad_norm": 0.3990275263786316, + "learning_rate": 8.877891647203157e-06, + "loss": 0.4571, + "step": 1066 + }, + { + "epoch": 0.8881798002219756, + "grad_norm": 0.38141873478889465, + "learning_rate": 8.874831322082725e-06, + "loss": 0.4471, + "step": 1067 + }, + { + "epoch": 0.8890122086570478, + "grad_norm": 0.3606696128845215, + "learning_rate": 8.871767358427835e-06, + "loss": 0.4216, + "step": 1068 + }, + { + "epoch": 0.8898446170921198, + "grad_norm": 0.41676607728004456, + "learning_rate": 8.868699759115604e-06, + "loss": 0.4574, + "step": 1069 + }, + { + "epoch": 0.890677025527192, + "grad_norm": 0.38336870074272156, + "learning_rate": 8.86562852702656e-06, + "loss": 0.4666, + "step": 1070 + }, + { + "epoch": 0.8915094339622641, + "grad_norm": 0.41286566853523254, + "learning_rate": 8.862553665044644e-06, + "loss": 0.4788, + "step": 1071 + }, + { + "epoch": 0.8923418423973363, + "grad_norm": 0.42297109961509705, + "learning_rate": 8.859475176057208e-06, + "loss": 0.4831, + "step": 1072 + }, + { + "epoch": 0.8931742508324084, + "grad_norm": 0.41496801376342773, + "learning_rate": 8.856393062955003e-06, + "loss": 0.4696, + "step": 1073 + }, + { + "epoch": 0.8940066592674806, + "grad_norm": 0.4308110177516937, + "learning_rate": 8.85330732863219e-06, + "loss": 0.452, + "step": 1074 + }, + { + "epoch": 0.8948390677025527, + "grad_norm": 0.4222694933414459, + "learning_rate": 8.850217975986326e-06, + "loss": 0.4499, + "step": 1075 + }, + { + "epoch": 0.8956714761376249, + "grad_norm": 0.43070679903030396, + "learning_rate": 8.84712500791837e-06, + "loss": 0.4717, + "step": 1076 + }, + { + "epoch": 0.896503884572697, + "grad_norm": 0.3741142153739929, + "learning_rate": 8.844028427332667e-06, + "loss": 0.4676, + "step": 1077 + }, + { + "epoch": 0.8973362930077692, + "grad_norm": 0.4449978470802307, + "learning_rate": 8.840928237136967e-06, + "loss": 0.4547, + "step": 1078 + }, + { + "epoch": 0.8981687014428413, + "grad_norm": 0.36339613795280457, + "learning_rate": 8.837824440242402e-06, + "loss": 0.4672, + "step": 1079 + }, + { + "epoch": 0.8990011098779135, + "grad_norm": 0.385516494512558, + "learning_rate": 8.834717039563488e-06, + "loss": 0.4796, + "step": 1080 + }, + { + "epoch": 0.8998335183129855, + "grad_norm": 0.3869156539440155, + "learning_rate": 8.83160603801813e-06, + "loss": 0.4498, + "step": 1081 + }, + { + "epoch": 0.9006659267480577, + "grad_norm": 0.3644091784954071, + "learning_rate": 8.828491438527614e-06, + "loss": 0.4296, + "step": 1082 + }, + { + "epoch": 0.9014983351831298, + "grad_norm": 0.3746865689754486, + "learning_rate": 8.825373244016604e-06, + "loss": 0.4595, + "step": 1083 + }, + { + "epoch": 0.902330743618202, + "grad_norm": 0.36787107586860657, + "learning_rate": 8.822251457413138e-06, + "loss": 0.4635, + "step": 1084 + }, + { + "epoch": 0.9031631520532741, + "grad_norm": 0.3880446255207062, + "learning_rate": 8.819126081648627e-06, + "loss": 0.4619, + "step": 1085 + }, + { + "epoch": 0.9039955604883463, + "grad_norm": 0.3723534047603607, + "learning_rate": 8.815997119657856e-06, + "loss": 0.4545, + "step": 1086 + }, + { + "epoch": 0.9048279689234184, + "grad_norm": 0.3879696726799011, + "learning_rate": 8.812864574378974e-06, + "loss": 0.474, + "step": 1087 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.36703255772590637, + "learning_rate": 8.809728448753496e-06, + "loss": 0.4609, + "step": 1088 + }, + { + "epoch": 0.9064927857935627, + "grad_norm": 0.3635331094264984, + "learning_rate": 8.8065887457263e-06, + "loss": 0.4472, + "step": 1089 + }, + { + "epoch": 0.9073251942286349, + "grad_norm": 0.3628758192062378, + "learning_rate": 8.803445468245618e-06, + "loss": 0.4376, + "step": 1090 + }, + { + "epoch": 0.908157602663707, + "grad_norm": 0.41849446296691895, + "learning_rate": 8.800298619263047e-06, + "loss": 0.4313, + "step": 1091 + }, + { + "epoch": 0.9089900110987791, + "grad_norm": 0.4343433976173401, + "learning_rate": 8.797148201733533e-06, + "loss": 0.4294, + "step": 1092 + }, + { + "epoch": 0.9098224195338512, + "grad_norm": 0.37368395924568176, + "learning_rate": 8.793994218615371e-06, + "loss": 0.4416, + "step": 1093 + }, + { + "epoch": 0.9106548279689234, + "grad_norm": 0.4110583961009979, + "learning_rate": 8.79083667287021e-06, + "loss": 0.4457, + "step": 1094 + }, + { + "epoch": 0.9114872364039955, + "grad_norm": 0.43879762291908264, + "learning_rate": 8.787675567463034e-06, + "loss": 0.4837, + "step": 1095 + }, + { + "epoch": 0.9123196448390677, + "grad_norm": 0.44292664527893066, + "learning_rate": 8.784510905362185e-06, + "loss": 0.4603, + "step": 1096 + }, + { + "epoch": 0.9131520532741398, + "grad_norm": 0.4232296943664551, + "learning_rate": 8.781342689539329e-06, + "loss": 0.4736, + "step": 1097 + }, + { + "epoch": 0.913984461709212, + "grad_norm": 0.3918737769126892, + "learning_rate": 8.778170922969478e-06, + "loss": 0.4531, + "step": 1098 + }, + { + "epoch": 0.9148168701442841, + "grad_norm": 0.4186196029186249, + "learning_rate": 8.774995608630979e-06, + "loss": 0.417, + "step": 1099 + }, + { + "epoch": 0.9156492785793563, + "grad_norm": 0.41225314140319824, + "learning_rate": 8.771816749505504e-06, + "loss": 0.4499, + "step": 1100 + }, + { + "epoch": 0.9164816870144284, + "grad_norm": 0.4248989522457123, + "learning_rate": 8.768634348578062e-06, + "loss": 0.4186, + "step": 1101 + }, + { + "epoch": 0.9173140954495006, + "grad_norm": 0.4327683746814728, + "learning_rate": 8.765448408836978e-06, + "loss": 0.4625, + "step": 1102 + }, + { + "epoch": 0.9181465038845728, + "grad_norm": 0.4192464053630829, + "learning_rate": 8.762258933273908e-06, + "loss": 0.4337, + "step": 1103 + }, + { + "epoch": 0.9189789123196448, + "grad_norm": 0.4388251304626465, + "learning_rate": 8.759065924883827e-06, + "loss": 0.4489, + "step": 1104 + }, + { + "epoch": 0.9198113207547169, + "grad_norm": 0.4132365882396698, + "learning_rate": 8.755869386665022e-06, + "loss": 0.4482, + "step": 1105 + }, + { + "epoch": 0.9206437291897891, + "grad_norm": 0.4240826964378357, + "learning_rate": 8.7526693216191e-06, + "loss": 0.4612, + "step": 1106 + }, + { + "epoch": 0.9214761376248612, + "grad_norm": 0.4309800863265991, + "learning_rate": 8.749465732750982e-06, + "loss": 0.4827, + "step": 1107 + }, + { + "epoch": 0.9223085460599334, + "grad_norm": 0.43188637495040894, + "learning_rate": 8.746258623068886e-06, + "loss": 0.4666, + "step": 1108 + }, + { + "epoch": 0.9231409544950056, + "grad_norm": 0.4251968264579773, + "learning_rate": 8.74304799558435e-06, + "loss": 0.4654, + "step": 1109 + }, + { + "epoch": 0.9239733629300777, + "grad_norm": 0.4286282956600189, + "learning_rate": 8.739833853312208e-06, + "loss": 0.4504, + "step": 1110 + }, + { + "epoch": 0.9248057713651499, + "grad_norm": 0.39924654364585876, + "learning_rate": 8.736616199270595e-06, + "loss": 0.4432, + "step": 1111 + }, + { + "epoch": 0.925638179800222, + "grad_norm": 0.44831129908561707, + "learning_rate": 8.733395036480946e-06, + "loss": 0.4497, + "step": 1112 + }, + { + "epoch": 0.9264705882352942, + "grad_norm": 0.38468286395072937, + "learning_rate": 8.73017036796799e-06, + "loss": 0.4564, + "step": 1113 + }, + { + "epoch": 0.9273029966703663, + "grad_norm": 0.380112886428833, + "learning_rate": 8.726942196759744e-06, + "loss": 0.4557, + "step": 1114 + }, + { + "epoch": 0.9281354051054383, + "grad_norm": 0.3891477584838867, + "learning_rate": 8.72371052588752e-06, + "loss": 0.4478, + "step": 1115 + }, + { + "epoch": 0.9289678135405105, + "grad_norm": 0.41235268115997314, + "learning_rate": 8.720475358385912e-06, + "loss": 0.4429, + "step": 1116 + }, + { + "epoch": 0.9298002219755827, + "grad_norm": 0.3957020938396454, + "learning_rate": 8.7172366972928e-06, + "loss": 0.4848, + "step": 1117 + }, + { + "epoch": 0.9306326304106548, + "grad_norm": 0.42292919754981995, + "learning_rate": 8.713994545649343e-06, + "loss": 0.466, + "step": 1118 + }, + { + "epoch": 0.931465038845727, + "grad_norm": 0.40453973412513733, + "learning_rate": 8.710748906499977e-06, + "loss": 0.4424, + "step": 1119 + }, + { + "epoch": 0.9322974472807991, + "grad_norm": 0.4147048592567444, + "learning_rate": 8.707499782892414e-06, + "loss": 0.471, + "step": 1120 + }, + { + "epoch": 0.9331298557158713, + "grad_norm": 0.38894933462142944, + "learning_rate": 8.704247177877643e-06, + "loss": 0.4822, + "step": 1121 + }, + { + "epoch": 0.9339622641509434, + "grad_norm": 0.39160868525505066, + "learning_rate": 8.700991094509909e-06, + "loss": 0.4814, + "step": 1122 + }, + { + "epoch": 0.9347946725860156, + "grad_norm": 0.362416535615921, + "learning_rate": 8.697731535846739e-06, + "loss": 0.4605, + "step": 1123 + }, + { + "epoch": 0.9356270810210877, + "grad_norm": 0.44123584032058716, + "learning_rate": 8.69446850494891e-06, + "loss": 0.4594, + "step": 1124 + }, + { + "epoch": 0.9364594894561599, + "grad_norm": 0.3935483992099762, + "learning_rate": 8.691202004880468e-06, + "loss": 0.4643, + "step": 1125 + }, + { + "epoch": 0.937291897891232, + "grad_norm": 0.37961041927337646, + "learning_rate": 8.687932038708712e-06, + "loss": 0.4547, + "step": 1126 + }, + { + "epoch": 0.9381243063263041, + "grad_norm": 0.4113844037055969, + "learning_rate": 8.684658609504199e-06, + "loss": 0.4683, + "step": 1127 + }, + { + "epoch": 0.9389567147613762, + "grad_norm": 0.4119846820831299, + "learning_rate": 8.681381720340736e-06, + "loss": 0.4563, + "step": 1128 + }, + { + "epoch": 0.9397891231964484, + "grad_norm": 0.3995191156864166, + "learning_rate": 8.67810137429538e-06, + "loss": 0.4544, + "step": 1129 + }, + { + "epoch": 0.9406215316315205, + "grad_norm": 0.403024822473526, + "learning_rate": 8.674817574448431e-06, + "loss": 0.4732, + "step": 1130 + }, + { + "epoch": 0.9414539400665927, + "grad_norm": 0.38387593626976013, + "learning_rate": 8.671530323883437e-06, + "loss": 0.4698, + "step": 1131 + }, + { + "epoch": 0.9422863485016648, + "grad_norm": 0.3703984022140503, + "learning_rate": 8.668239625687183e-06, + "loss": 0.4346, + "step": 1132 + }, + { + "epoch": 0.943118756936737, + "grad_norm": 0.35403862595558167, + "learning_rate": 8.664945482949691e-06, + "loss": 0.4518, + "step": 1133 + }, + { + "epoch": 0.9439511653718091, + "grad_norm": 0.3765757381916046, + "learning_rate": 8.661647898764221e-06, + "loss": 0.4547, + "step": 1134 + }, + { + "epoch": 0.9447835738068813, + "grad_norm": 0.35943272709846497, + "learning_rate": 8.658346876227261e-06, + "loss": 0.4333, + "step": 1135 + }, + { + "epoch": 0.9456159822419534, + "grad_norm": 0.36773252487182617, + "learning_rate": 8.655042418438529e-06, + "loss": 0.4498, + "step": 1136 + }, + { + "epoch": 0.9464483906770256, + "grad_norm": 0.3745127022266388, + "learning_rate": 8.651734528500968e-06, + "loss": 0.461, + "step": 1137 + }, + { + "epoch": 0.9472807991120976, + "grad_norm": 0.3643935024738312, + "learning_rate": 8.648423209520746e-06, + "loss": 0.4351, + "step": 1138 + }, + { + "epoch": 0.9481132075471698, + "grad_norm": 0.4308556616306305, + "learning_rate": 8.64510846460725e-06, + "loss": 0.4828, + "step": 1139 + }, + { + "epoch": 0.9489456159822419, + "grad_norm": 0.331564337015152, + "learning_rate": 8.641790296873081e-06, + "loss": 0.4513, + "step": 1140 + }, + { + "epoch": 0.9497780244173141, + "grad_norm": 0.3650985658168793, + "learning_rate": 8.638468709434057e-06, + "loss": 0.4507, + "step": 1141 + }, + { + "epoch": 0.9506104328523862, + "grad_norm": 0.4312916696071625, + "learning_rate": 8.63514370540921e-06, + "loss": 0.4469, + "step": 1142 + }, + { + "epoch": 0.9514428412874584, + "grad_norm": 0.3546101748943329, + "learning_rate": 8.631815287920773e-06, + "loss": 0.4594, + "step": 1143 + }, + { + "epoch": 0.9522752497225305, + "grad_norm": 0.3340010643005371, + "learning_rate": 8.62848346009419e-06, + "loss": 0.4506, + "step": 1144 + }, + { + "epoch": 0.9531076581576027, + "grad_norm": 0.4434676468372345, + "learning_rate": 8.625148225058107e-06, + "loss": 0.4825, + "step": 1145 + }, + { + "epoch": 0.9539400665926748, + "grad_norm": 0.39392799139022827, + "learning_rate": 8.621809585944366e-06, + "loss": 0.4532, + "step": 1146 + }, + { + "epoch": 0.954772475027747, + "grad_norm": 0.4085777699947357, + "learning_rate": 8.61846754588801e-06, + "loss": 0.4852, + "step": 1147 + }, + { + "epoch": 0.9556048834628191, + "grad_norm": 0.42657431960105896, + "learning_rate": 8.61512210802727e-06, + "loss": 0.4647, + "step": 1148 + }, + { + "epoch": 0.9564372918978913, + "grad_norm": 0.35785362124443054, + "learning_rate": 8.611773275503572e-06, + "loss": 0.4393, + "step": 1149 + }, + { + "epoch": 0.9572697003329633, + "grad_norm": 0.37917736172676086, + "learning_rate": 8.608421051461529e-06, + "loss": 0.4496, + "step": 1150 + }, + { + "epoch": 0.9581021087680355, + "grad_norm": 0.37420985102653503, + "learning_rate": 8.605065439048936e-06, + "loss": 0.4475, + "step": 1151 + }, + { + "epoch": 0.9589345172031076, + "grad_norm": 0.38180387020111084, + "learning_rate": 8.601706441416776e-06, + "loss": 0.4694, + "step": 1152 + }, + { + "epoch": 0.9597669256381798, + "grad_norm": 0.4098156988620758, + "learning_rate": 8.598344061719204e-06, + "loss": 0.4602, + "step": 1153 + }, + { + "epoch": 0.9605993340732519, + "grad_norm": 0.38434961438179016, + "learning_rate": 8.594978303113552e-06, + "loss": 0.4214, + "step": 1154 + }, + { + "epoch": 0.9614317425083241, + "grad_norm": 0.37275487184524536, + "learning_rate": 8.59160916876033e-06, + "loss": 0.4565, + "step": 1155 + }, + { + "epoch": 0.9622641509433962, + "grad_norm": 0.4662790298461914, + "learning_rate": 8.588236661823209e-06, + "loss": 0.4807, + "step": 1156 + }, + { + "epoch": 0.9630965593784684, + "grad_norm": 0.3571013808250427, + "learning_rate": 8.584860785469036e-06, + "loss": 0.4392, + "step": 1157 + }, + { + "epoch": 0.9639289678135405, + "grad_norm": 0.3700477182865143, + "learning_rate": 8.581481542867818e-06, + "loss": 0.4548, + "step": 1158 + }, + { + "epoch": 0.9647613762486127, + "grad_norm": 0.4246416687965393, + "learning_rate": 8.578098937192723e-06, + "loss": 0.4503, + "step": 1159 + }, + { + "epoch": 0.9655937846836848, + "grad_norm": 0.4045102894306183, + "learning_rate": 8.574712971620075e-06, + "loss": 0.4749, + "step": 1160 + }, + { + "epoch": 0.9664261931187569, + "grad_norm": 0.35602596402168274, + "learning_rate": 8.571323649329352e-06, + "loss": 0.4275, + "step": 1161 + }, + { + "epoch": 0.967258601553829, + "grad_norm": 0.49385765194892883, + "learning_rate": 8.567930973503196e-06, + "loss": 0.4623, + "step": 1162 + }, + { + "epoch": 0.9680910099889012, + "grad_norm": 0.4097510576248169, + "learning_rate": 8.564534947327381e-06, + "loss": 0.46, + "step": 1163 + }, + { + "epoch": 0.9689234184239733, + "grad_norm": 0.38444530963897705, + "learning_rate": 8.561135573990839e-06, + "loss": 0.441, + "step": 1164 + }, + { + "epoch": 0.9697558268590455, + "grad_norm": 0.43890970945358276, + "learning_rate": 8.55773285668564e-06, + "loss": 0.4766, + "step": 1165 + }, + { + "epoch": 0.9705882352941176, + "grad_norm": 0.37467607855796814, + "learning_rate": 8.554326798606994e-06, + "loss": 0.4743, + "step": 1166 + }, + { + "epoch": 0.9714206437291898, + "grad_norm": 0.3442972004413605, + "learning_rate": 8.55091740295325e-06, + "loss": 0.4558, + "step": 1167 + }, + { + "epoch": 0.9722530521642619, + "grad_norm": 0.3897690176963806, + "learning_rate": 8.547504672925892e-06, + "loss": 0.4647, + "step": 1168 + }, + { + "epoch": 0.9730854605993341, + "grad_norm": 0.33819976449012756, + "learning_rate": 8.544088611729533e-06, + "loss": 0.4507, + "step": 1169 + }, + { + "epoch": 0.9739178690344062, + "grad_norm": 0.3626435399055481, + "learning_rate": 8.540669222571911e-06, + "loss": 0.4365, + "step": 1170 + }, + { + "epoch": 0.9747502774694784, + "grad_norm": 0.3824900686740875, + "learning_rate": 8.537246508663894e-06, + "loss": 0.4761, + "step": 1171 + }, + { + "epoch": 0.9755826859045506, + "grad_norm": 0.3584446310997009, + "learning_rate": 8.533820473219472e-06, + "loss": 0.4595, + "step": 1172 + }, + { + "epoch": 0.9764150943396226, + "grad_norm": 0.39771831035614014, + "learning_rate": 8.53039111945575e-06, + "loss": 0.4779, + "step": 1173 + }, + { + "epoch": 0.9772475027746947, + "grad_norm": 0.37545955181121826, + "learning_rate": 8.526958450592952e-06, + "loss": 0.4357, + "step": 1174 + }, + { + "epoch": 0.9780799112097669, + "grad_norm": 0.3663276731967926, + "learning_rate": 8.523522469854415e-06, + "loss": 0.4777, + "step": 1175 + }, + { + "epoch": 0.978912319644839, + "grad_norm": 0.3252856135368347, + "learning_rate": 8.520083180466585e-06, + "loss": 0.4317, + "step": 1176 + }, + { + "epoch": 0.9797447280799112, + "grad_norm": 0.4126368463039398, + "learning_rate": 8.516640585659012e-06, + "loss": 0.4715, + "step": 1177 + }, + { + "epoch": 0.9805771365149833, + "grad_norm": 0.35552918910980225, + "learning_rate": 8.513194688664356e-06, + "loss": 0.4821, + "step": 1178 + }, + { + "epoch": 0.9814095449500555, + "grad_norm": 0.34943288564682007, + "learning_rate": 8.509745492718375e-06, + "loss": 0.4405, + "step": 1179 + }, + { + "epoch": 0.9822419533851277, + "grad_norm": 0.3988153636455536, + "learning_rate": 8.506293001059922e-06, + "loss": 0.4575, + "step": 1180 + }, + { + "epoch": 0.9830743618201998, + "grad_norm": 0.3627513647079468, + "learning_rate": 8.502837216930947e-06, + "loss": 0.4551, + "step": 1181 + }, + { + "epoch": 0.983906770255272, + "grad_norm": 0.3588287830352783, + "learning_rate": 8.499378143576496e-06, + "loss": 0.4544, + "step": 1182 + }, + { + "epoch": 0.9847391786903441, + "grad_norm": 0.3817938566207886, + "learning_rate": 8.495915784244694e-06, + "loss": 0.458, + "step": 1183 + }, + { + "epoch": 0.9855715871254163, + "grad_norm": 0.38958173990249634, + "learning_rate": 8.49245014218676e-06, + "loss": 0.4605, + "step": 1184 + }, + { + "epoch": 0.9864039955604883, + "grad_norm": 0.3958968222141266, + "learning_rate": 8.488981220656993e-06, + "loss": 0.4407, + "step": 1185 + }, + { + "epoch": 0.9872364039955605, + "grad_norm": 0.3889720141887665, + "learning_rate": 8.48550902291277e-06, + "loss": 0.4434, + "step": 1186 + }, + { + "epoch": 0.9880688124306326, + "grad_norm": 0.3535591959953308, + "learning_rate": 8.482033552214546e-06, + "loss": 0.4687, + "step": 1187 + }, + { + "epoch": 0.9889012208657048, + "grad_norm": 0.44984614849090576, + "learning_rate": 8.478554811825846e-06, + "loss": 0.4464, + "step": 1188 + }, + { + "epoch": 0.9897336293007769, + "grad_norm": 0.3912472724914551, + "learning_rate": 8.475072805013274e-06, + "loss": 0.4488, + "step": 1189 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 0.4023473858833313, + "learning_rate": 8.471587535046487e-06, + "loss": 0.4632, + "step": 1190 + }, + { + "epoch": 0.9913984461709212, + "grad_norm": 0.3754764795303345, + "learning_rate": 8.468099005198224e-06, + "loss": 0.4345, + "step": 1191 + }, + { + "epoch": 0.9922308546059934, + "grad_norm": 0.37986260652542114, + "learning_rate": 8.46460721874427e-06, + "loss": 0.4315, + "step": 1192 + }, + { + "epoch": 0.9930632630410655, + "grad_norm": 0.37531065940856934, + "learning_rate": 8.461112178963475e-06, + "loss": 0.4342, + "step": 1193 + }, + { + "epoch": 0.9938956714761377, + "grad_norm": 0.4208608865737915, + "learning_rate": 8.45761388913774e-06, + "loss": 0.4849, + "step": 1194 + }, + { + "epoch": 0.9947280799112098, + "grad_norm": 0.3419855237007141, + "learning_rate": 8.454112352552025e-06, + "loss": 0.4543, + "step": 1195 + }, + { + "epoch": 0.9955604883462819, + "grad_norm": 0.3961000144481659, + "learning_rate": 8.450607572494332e-06, + "loss": 0.4526, + "step": 1196 + }, + { + "epoch": 0.996392896781354, + "grad_norm": 0.42482149600982666, + "learning_rate": 8.447099552255708e-06, + "loss": 0.4498, + "step": 1197 + }, + { + "epoch": 0.9972253052164262, + "grad_norm": 0.3759855329990387, + "learning_rate": 8.44358829513025e-06, + "loss": 0.4251, + "step": 1198 + }, + { + "epoch": 0.9980577136514983, + "grad_norm": 0.3645038306713104, + "learning_rate": 8.44007380441509e-06, + "loss": 0.4439, + "step": 1199 + }, + { + "epoch": 0.9988901220865705, + "grad_norm": 0.46094900369644165, + "learning_rate": 8.436556083410392e-06, + "loss": 0.4726, + "step": 1200 + }, + { + "epoch": 0.9997225305216426, + "grad_norm": 0.42478302121162415, + "learning_rate": 8.433035135419358e-06, + "loss": 0.4445, + "step": 1201 + }, + { + "epoch": 1.0005549389567148, + "grad_norm": 0.7507914900779724, + "learning_rate": 8.429510963748224e-06, + "loss": 0.7647, + "step": 1202 + }, + { + "epoch": 1.001387347391787, + "grad_norm": 0.43571215867996216, + "learning_rate": 8.425983571706247e-06, + "loss": 0.4272, + "step": 1203 + }, + { + "epoch": 1.002219755826859, + "grad_norm": 0.4088655114173889, + "learning_rate": 8.422452962605709e-06, + "loss": 0.4526, + "step": 1204 + }, + { + "epoch": 1.0030521642619312, + "grad_norm": 0.34656473994255066, + "learning_rate": 8.418919139761914e-06, + "loss": 0.3969, + "step": 1205 + }, + { + "epoch": 1.0038845726970034, + "grad_norm": 0.4557936489582062, + "learning_rate": 8.415382106493183e-06, + "loss": 0.45, + "step": 1206 + }, + { + "epoch": 1.0047169811320755, + "grad_norm": 0.3934694528579712, + "learning_rate": 8.411841866120855e-06, + "loss": 0.4424, + "step": 1207 + }, + { + "epoch": 1.0055493895671477, + "grad_norm": 0.410007119178772, + "learning_rate": 8.408298421969275e-06, + "loss": 0.4463, + "step": 1208 + }, + { + "epoch": 1.0063817980022198, + "grad_norm": 0.3848797678947449, + "learning_rate": 8.4047517773658e-06, + "loss": 0.4238, + "step": 1209 + }, + { + "epoch": 1.007214206437292, + "grad_norm": 0.3674464821815491, + "learning_rate": 8.40120193564079e-06, + "loss": 0.3869, + "step": 1210 + }, + { + "epoch": 1.0080466148723641, + "grad_norm": 0.42686253786087036, + "learning_rate": 8.39764890012761e-06, + "loss": 0.4377, + "step": 1211 + }, + { + "epoch": 1.0088790233074363, + "grad_norm": 0.4299822151660919, + "learning_rate": 8.394092674162625e-06, + "loss": 0.4527, + "step": 1212 + }, + { + "epoch": 1.0097114317425082, + "grad_norm": 0.4144967496395111, + "learning_rate": 8.390533261085188e-06, + "loss": 0.4261, + "step": 1213 + }, + { + "epoch": 1.0105438401775804, + "grad_norm": 0.3909236192703247, + "learning_rate": 8.386970664237653e-06, + "loss": 0.4208, + "step": 1214 + }, + { + "epoch": 1.0113762486126525, + "grad_norm": 0.404340535402298, + "learning_rate": 8.383404886965361e-06, + "loss": 0.4307, + "step": 1215 + }, + { + "epoch": 1.0122086570477247, + "grad_norm": 0.39880749583244324, + "learning_rate": 8.37983593261664e-06, + "loss": 0.422, + "step": 1216 + }, + { + "epoch": 1.0130410654827968, + "grad_norm": 0.37474164366722107, + "learning_rate": 8.376263804542798e-06, + "loss": 0.4333, + "step": 1217 + }, + { + "epoch": 1.013873473917869, + "grad_norm": 0.39734119176864624, + "learning_rate": 8.372688506098128e-06, + "loss": 0.4147, + "step": 1218 + }, + { + "epoch": 1.0147058823529411, + "grad_norm": 0.3793570399284363, + "learning_rate": 8.369110040639899e-06, + "loss": 0.4257, + "step": 1219 + }, + { + "epoch": 1.0155382907880133, + "grad_norm": 0.38969290256500244, + "learning_rate": 8.365528411528348e-06, + "loss": 0.4657, + "step": 1220 + }, + { + "epoch": 1.0163706992230854, + "grad_norm": 0.33815810084342957, + "learning_rate": 8.361943622126694e-06, + "loss": 0.3868, + "step": 1221 + }, + { + "epoch": 1.0172031076581576, + "grad_norm": 0.4408543109893799, + "learning_rate": 8.358355675801112e-06, + "loss": 0.4481, + "step": 1222 + }, + { + "epoch": 1.0180355160932297, + "grad_norm": 0.3819434940814972, + "learning_rate": 8.354764575920747e-06, + "loss": 0.4484, + "step": 1223 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 0.3943055272102356, + "learning_rate": 8.351170325857705e-06, + "loss": 0.4066, + "step": 1224 + }, + { + "epoch": 1.019700332963374, + "grad_norm": 0.3979198634624481, + "learning_rate": 8.347572928987052e-06, + "loss": 0.4461, + "step": 1225 + }, + { + "epoch": 1.0205327413984462, + "grad_norm": 0.3459455370903015, + "learning_rate": 8.343972388686806e-06, + "loss": 0.3935, + "step": 1226 + }, + { + "epoch": 1.0213651498335183, + "grad_norm": 0.4697541892528534, + "learning_rate": 8.340368708337934e-06, + "loss": 0.478, + "step": 1227 + }, + { + "epoch": 1.0221975582685905, + "grad_norm": 0.392013818025589, + "learning_rate": 8.336761891324357e-06, + "loss": 0.4317, + "step": 1228 + }, + { + "epoch": 1.0230299667036626, + "grad_norm": 0.3422892987728119, + "learning_rate": 8.333151941032941e-06, + "loss": 0.3749, + "step": 1229 + }, + { + "epoch": 1.0238623751387348, + "grad_norm": 0.4656471312046051, + "learning_rate": 8.32953886085349e-06, + "loss": 0.4637, + "step": 1230 + }, + { + "epoch": 1.024694783573807, + "grad_norm": 0.4500276744365692, + "learning_rate": 8.325922654178752e-06, + "loss": 0.454, + "step": 1231 + }, + { + "epoch": 1.025527192008879, + "grad_norm": 0.3819878399372101, + "learning_rate": 8.322303324404408e-06, + "loss": 0.4158, + "step": 1232 + }, + { + "epoch": 1.0263596004439512, + "grad_norm": 0.4874471426010132, + "learning_rate": 8.318680874929068e-06, + "loss": 0.4479, + "step": 1233 + }, + { + "epoch": 1.0271920088790234, + "grad_norm": 0.39757636189460754, + "learning_rate": 8.315055309154283e-06, + "loss": 0.4527, + "step": 1234 + }, + { + "epoch": 1.0280244173140956, + "grad_norm": 0.3715905547142029, + "learning_rate": 8.311426630484513e-06, + "loss": 0.4181, + "step": 1235 + }, + { + "epoch": 1.0288568257491675, + "grad_norm": 0.44828590750694275, + "learning_rate": 8.30779484232716e-06, + "loss": 0.4454, + "step": 1236 + }, + { + "epoch": 1.0296892341842396, + "grad_norm": 0.4298873245716095, + "learning_rate": 8.304159948092532e-06, + "loss": 0.4382, + "step": 1237 + }, + { + "epoch": 1.0305216426193118, + "grad_norm": 0.36218270659446716, + "learning_rate": 8.30052195119386e-06, + "loss": 0.4058, + "step": 1238 + }, + { + "epoch": 1.031354051054384, + "grad_norm": 0.4292842149734497, + "learning_rate": 8.296880855047284e-06, + "loss": 0.4512, + "step": 1239 + }, + { + "epoch": 1.032186459489456, + "grad_norm": 0.3632907569408417, + "learning_rate": 8.293236663071859e-06, + "loss": 0.4331, + "step": 1240 + }, + { + "epoch": 1.0330188679245282, + "grad_norm": 0.3891299366950989, + "learning_rate": 8.289589378689548e-06, + "loss": 0.4313, + "step": 1241 + }, + { + "epoch": 1.0338512763596004, + "grad_norm": 0.37545377016067505, + "learning_rate": 8.28593900532521e-06, + "loss": 0.375, + "step": 1242 + }, + { + "epoch": 1.0346836847946725, + "grad_norm": 0.3906904458999634, + "learning_rate": 8.28228554640661e-06, + "loss": 0.4785, + "step": 1243 + }, + { + "epoch": 1.0355160932297447, + "grad_norm": 0.3426535725593567, + "learning_rate": 8.278629005364412e-06, + "loss": 0.387, + "step": 1244 + }, + { + "epoch": 1.0363485016648168, + "grad_norm": 0.4440067410469055, + "learning_rate": 8.274969385632173e-06, + "loss": 0.4773, + "step": 1245 + }, + { + "epoch": 1.037180910099889, + "grad_norm": 0.36117398738861084, + "learning_rate": 8.271306690646336e-06, + "loss": 0.4375, + "step": 1246 + }, + { + "epoch": 1.0380133185349611, + "grad_norm": 0.4358084499835968, + "learning_rate": 8.267640923846242e-06, + "loss": 0.4346, + "step": 1247 + }, + { + "epoch": 1.0388457269700333, + "grad_norm": 0.3763819932937622, + "learning_rate": 8.263972088674103e-06, + "loss": 0.4179, + "step": 1248 + }, + { + "epoch": 1.0396781354051055, + "grad_norm": 0.3596190810203552, + "learning_rate": 8.260300188575024e-06, + "loss": 0.4451, + "step": 1249 + }, + { + "epoch": 1.0405105438401776, + "grad_norm": 0.42740023136138916, + "learning_rate": 8.256625226996981e-06, + "loss": 0.4607, + "step": 1250 + }, + { + "epoch": 1.0413429522752498, + "grad_norm": 0.3612176775932312, + "learning_rate": 8.252947207390832e-06, + "loss": 0.4038, + "step": 1251 + }, + { + "epoch": 1.042175360710322, + "grad_norm": 0.3865748941898346, + "learning_rate": 8.249266133210296e-06, + "loss": 0.4395, + "step": 1252 + }, + { + "epoch": 1.043007769145394, + "grad_norm": 0.37287238240242004, + "learning_rate": 8.245582007911967e-06, + "loss": 0.4055, + "step": 1253 + }, + { + "epoch": 1.0438401775804662, + "grad_norm": 0.4149041771888733, + "learning_rate": 8.241894834955306e-06, + "loss": 0.421, + "step": 1254 + }, + { + "epoch": 1.0446725860155384, + "grad_norm": 0.42680850625038147, + "learning_rate": 8.238204617802633e-06, + "loss": 0.4522, + "step": 1255 + }, + { + "epoch": 1.0455049944506105, + "grad_norm": 0.3849968910217285, + "learning_rate": 8.234511359919125e-06, + "loss": 0.3983, + "step": 1256 + }, + { + "epoch": 1.0463374028856827, + "grad_norm": 0.405513197183609, + "learning_rate": 8.230815064772815e-06, + "loss": 0.4165, + "step": 1257 + }, + { + "epoch": 1.0471698113207548, + "grad_norm": 0.4122266471385956, + "learning_rate": 8.22711573583459e-06, + "loss": 0.4478, + "step": 1258 + }, + { + "epoch": 1.0480022197558267, + "grad_norm": 0.3673398196697235, + "learning_rate": 8.223413376578182e-06, + "loss": 0.4134, + "step": 1259 + }, + { + "epoch": 1.048834628190899, + "grad_norm": 0.40100428462028503, + "learning_rate": 8.219707990480177e-06, + "loss": 0.4296, + "step": 1260 + }, + { + "epoch": 1.049667036625971, + "grad_norm": 0.34723445773124695, + "learning_rate": 8.215999581019993e-06, + "loss": 0.4036, + "step": 1261 + }, + { + "epoch": 1.0504994450610432, + "grad_norm": 0.4086048901081085, + "learning_rate": 8.212288151679892e-06, + "loss": 0.4462, + "step": 1262 + }, + { + "epoch": 1.0513318534961154, + "grad_norm": 0.37759000062942505, + "learning_rate": 8.208573705944972e-06, + "loss": 0.4191, + "step": 1263 + }, + { + "epoch": 1.0521642619311875, + "grad_norm": 0.43619635701179504, + "learning_rate": 8.204856247303163e-06, + "loss": 0.4364, + "step": 1264 + }, + { + "epoch": 1.0529966703662597, + "grad_norm": 0.395222544670105, + "learning_rate": 8.201135779245222e-06, + "loss": 0.4511, + "step": 1265 + }, + { + "epoch": 1.0538290788013318, + "grad_norm": 0.4762343466281891, + "learning_rate": 8.197412305264735e-06, + "loss": 0.4423, + "step": 1266 + }, + { + "epoch": 1.054661487236404, + "grad_norm": 0.4282004237174988, + "learning_rate": 8.193685828858109e-06, + "loss": 0.4506, + "step": 1267 + }, + { + "epoch": 1.0554938956714761, + "grad_norm": 0.35303211212158203, + "learning_rate": 8.189956353524568e-06, + "loss": 0.4602, + "step": 1268 + }, + { + "epoch": 1.0563263041065483, + "grad_norm": 0.3769274652004242, + "learning_rate": 8.18622388276616e-06, + "loss": 0.4354, + "step": 1269 + }, + { + "epoch": 1.0571587125416204, + "grad_norm": 0.41085049510002136, + "learning_rate": 8.182488420087737e-06, + "loss": 0.4079, + "step": 1270 + }, + { + "epoch": 1.0579911209766926, + "grad_norm": 0.40207546949386597, + "learning_rate": 8.178749968996965e-06, + "loss": 0.4262, + "step": 1271 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.4222968518733978, + "learning_rate": 8.175008533004312e-06, + "loss": 0.4536, + "step": 1272 + }, + { + "epoch": 1.0596559378468369, + "grad_norm": 0.4397706091403961, + "learning_rate": 8.171264115623056e-06, + "loss": 0.4143, + "step": 1273 + }, + { + "epoch": 1.060488346281909, + "grad_norm": 0.37260058522224426, + "learning_rate": 8.167516720369268e-06, + "loss": 0.3961, + "step": 1274 + }, + { + "epoch": 1.0613207547169812, + "grad_norm": 0.4061874449253082, + "learning_rate": 8.163766350761819e-06, + "loss": 0.4376, + "step": 1275 + }, + { + "epoch": 1.0621531631520533, + "grad_norm": 0.4455544054508209, + "learning_rate": 8.160013010322372e-06, + "loss": 0.4101, + "step": 1276 + }, + { + "epoch": 1.0629855715871255, + "grad_norm": 0.36185914278030396, + "learning_rate": 8.156256702575378e-06, + "loss": 0.4202, + "step": 1277 + }, + { + "epoch": 1.0638179800221976, + "grad_norm": 0.4335435628890991, + "learning_rate": 8.152497431048076e-06, + "loss": 0.4351, + "step": 1278 + }, + { + "epoch": 1.0646503884572698, + "grad_norm": 0.3470878303050995, + "learning_rate": 8.148735199270487e-06, + "loss": 0.3819, + "step": 1279 + }, + { + "epoch": 1.065482796892342, + "grad_norm": 0.3797052800655365, + "learning_rate": 8.144970010775417e-06, + "loss": 0.4304, + "step": 1280 + }, + { + "epoch": 1.066315205327414, + "grad_norm": 0.44937318563461304, + "learning_rate": 8.141201869098439e-06, + "loss": 0.4613, + "step": 1281 + }, + { + "epoch": 1.067147613762486, + "grad_norm": 0.3713797628879547, + "learning_rate": 8.137430777777904e-06, + "loss": 0.4366, + "step": 1282 + }, + { + "epoch": 1.0679800221975582, + "grad_norm": 0.3958394527435303, + "learning_rate": 8.133656740354936e-06, + "loss": 0.4576, + "step": 1283 + }, + { + "epoch": 1.0688124306326303, + "grad_norm": 0.4216252267360687, + "learning_rate": 8.129879760373419e-06, + "loss": 0.4375, + "step": 1284 + }, + { + "epoch": 1.0696448390677025, + "grad_norm": 0.3756641745567322, + "learning_rate": 8.126099841380008e-06, + "loss": 0.442, + "step": 1285 + }, + { + "epoch": 1.0704772475027746, + "grad_norm": 0.35558274388313293, + "learning_rate": 8.122316986924108e-06, + "loss": 0.4089, + "step": 1286 + }, + { + "epoch": 1.0713096559378468, + "grad_norm": 0.4253517985343933, + "learning_rate": 8.118531200557888e-06, + "loss": 0.479, + "step": 1287 + }, + { + "epoch": 1.072142064372919, + "grad_norm": 0.3373982012271881, + "learning_rate": 8.114742485836267e-06, + "loss": 0.404, + "step": 1288 + }, + { + "epoch": 1.072974472807991, + "grad_norm": 0.41005459427833557, + "learning_rate": 8.110950846316915e-06, + "loss": 0.4544, + "step": 1289 + }, + { + "epoch": 1.0738068812430632, + "grad_norm": 0.3480115532875061, + "learning_rate": 8.107156285560249e-06, + "loss": 0.4235, + "step": 1290 + }, + { + "epoch": 1.0746392896781354, + "grad_norm": 0.373367041349411, + "learning_rate": 8.103358807129424e-06, + "loss": 0.4552, + "step": 1291 + }, + { + "epoch": 1.0754716981132075, + "grad_norm": 0.3816337585449219, + "learning_rate": 8.099558414590343e-06, + "loss": 0.4019, + "step": 1292 + }, + { + "epoch": 1.0763041065482797, + "grad_norm": 0.3785970211029053, + "learning_rate": 8.09575511151164e-06, + "loss": 0.4361, + "step": 1293 + }, + { + "epoch": 1.0771365149833518, + "grad_norm": 0.37990981340408325, + "learning_rate": 8.091948901464683e-06, + "loss": 0.4375, + "step": 1294 + }, + { + "epoch": 1.077968923418424, + "grad_norm": 0.36004897952079773, + "learning_rate": 8.088139788023568e-06, + "loss": 0.4331, + "step": 1295 + }, + { + "epoch": 1.0788013318534961, + "grad_norm": 0.38282451033592224, + "learning_rate": 8.084327774765121e-06, + "loss": 0.4659, + "step": 1296 + }, + { + "epoch": 1.0796337402885683, + "grad_norm": 0.3787173330783844, + "learning_rate": 8.08051286526889e-06, + "loss": 0.4041, + "step": 1297 + }, + { + "epoch": 1.0804661487236404, + "grad_norm": 0.366184264421463, + "learning_rate": 8.076695063117141e-06, + "loss": 0.4211, + "step": 1298 + }, + { + "epoch": 1.0812985571587126, + "grad_norm": 0.3390844166278839, + "learning_rate": 8.072874371894856e-06, + "loss": 0.4288, + "step": 1299 + }, + { + "epoch": 1.0821309655937847, + "grad_norm": 0.35803496837615967, + "learning_rate": 8.069050795189732e-06, + "loss": 0.4241, + "step": 1300 + }, + { + "epoch": 1.082963374028857, + "grad_norm": 0.3761143982410431, + "learning_rate": 8.065224336592175e-06, + "loss": 0.4112, + "step": 1301 + }, + { + "epoch": 1.083795782463929, + "grad_norm": 0.3917039930820465, + "learning_rate": 8.061394999695295e-06, + "loss": 0.398, + "step": 1302 + }, + { + "epoch": 1.0846281908990012, + "grad_norm": 0.35649728775024414, + "learning_rate": 8.057562788094909e-06, + "loss": 0.4079, + "step": 1303 + }, + { + "epoch": 1.0854605993340734, + "grad_norm": 0.4020186960697174, + "learning_rate": 8.053727705389527e-06, + "loss": 0.421, + "step": 1304 + }, + { + "epoch": 1.0862930077691453, + "grad_norm": 0.3835377097129822, + "learning_rate": 8.049889755180363e-06, + "loss": 0.4015, + "step": 1305 + }, + { + "epoch": 1.0871254162042177, + "grad_norm": 0.3527611196041107, + "learning_rate": 8.046048941071316e-06, + "loss": 0.4202, + "step": 1306 + }, + { + "epoch": 1.0879578246392896, + "grad_norm": 0.41720572113990784, + "learning_rate": 8.042205266668982e-06, + "loss": 0.494, + "step": 1307 + }, + { + "epoch": 1.0887902330743617, + "grad_norm": 0.4160442054271698, + "learning_rate": 8.038358735582632e-06, + "loss": 0.4654, + "step": 1308 + }, + { + "epoch": 1.0896226415094339, + "grad_norm": 0.3363052010536194, + "learning_rate": 8.034509351424231e-06, + "loss": 0.3651, + "step": 1309 + }, + { + "epoch": 1.090455049944506, + "grad_norm": 0.45190951228141785, + "learning_rate": 8.030657117808415e-06, + "loss": 0.4048, + "step": 1310 + }, + { + "epoch": 1.0912874583795782, + "grad_norm": 0.3866722881793976, + "learning_rate": 8.026802038352503e-06, + "loss": 0.4191, + "step": 1311 + }, + { + "epoch": 1.0921198668146503, + "grad_norm": 0.4028158187866211, + "learning_rate": 8.02294411667648e-06, + "loss": 0.3982, + "step": 1312 + }, + { + "epoch": 1.0929522752497225, + "grad_norm": 0.4108687937259674, + "learning_rate": 8.019083356403002e-06, + "loss": 0.4326, + "step": 1313 + }, + { + "epoch": 1.0937846836847946, + "grad_norm": 0.417467325925827, + "learning_rate": 8.015219761157387e-06, + "loss": 0.4195, + "step": 1314 + }, + { + "epoch": 1.0946170921198668, + "grad_norm": 0.3949446678161621, + "learning_rate": 8.011353334567625e-06, + "loss": 0.4035, + "step": 1315 + }, + { + "epoch": 1.095449500554939, + "grad_norm": 0.374812513589859, + "learning_rate": 8.007484080264355e-06, + "loss": 0.4318, + "step": 1316 + }, + { + "epoch": 1.096281908990011, + "grad_norm": 0.4128841459751129, + "learning_rate": 8.003612001880872e-06, + "loss": 0.436, + "step": 1317 + }, + { + "epoch": 1.0971143174250833, + "grad_norm": 0.4258720278739929, + "learning_rate": 7.99973710305313e-06, + "loss": 0.4393, + "step": 1318 + }, + { + "epoch": 1.0979467258601554, + "grad_norm": 0.38994690775871277, + "learning_rate": 7.995859387419726e-06, + "loss": 0.4135, + "step": 1319 + }, + { + "epoch": 1.0987791342952276, + "grad_norm": 0.3956961929798126, + "learning_rate": 7.9919788586219e-06, + "loss": 0.4261, + "step": 1320 + }, + { + "epoch": 1.0996115427302997, + "grad_norm": 0.4195059537887573, + "learning_rate": 7.988095520303539e-06, + "loss": 0.4351, + "step": 1321 + }, + { + "epoch": 1.1004439511653719, + "grad_norm": 0.3911471366882324, + "learning_rate": 7.984209376111165e-06, + "loss": 0.4434, + "step": 1322 + }, + { + "epoch": 1.101276359600444, + "grad_norm": 0.48853620886802673, + "learning_rate": 7.980320429693934e-06, + "loss": 0.4585, + "step": 1323 + }, + { + "epoch": 1.1021087680355162, + "grad_norm": 0.40332579612731934, + "learning_rate": 7.976428684703637e-06, + "loss": 0.3821, + "step": 1324 + }, + { + "epoch": 1.1029411764705883, + "grad_norm": 0.41588088870048523, + "learning_rate": 7.97253414479469e-06, + "loss": 0.469, + "step": 1325 + }, + { + "epoch": 1.1037735849056605, + "grad_norm": 0.405784010887146, + "learning_rate": 7.968636813624134e-06, + "loss": 0.3994, + "step": 1326 + }, + { + "epoch": 1.1046059933407326, + "grad_norm": 0.39917218685150146, + "learning_rate": 7.964736694851632e-06, + "loss": 0.4317, + "step": 1327 + }, + { + "epoch": 1.1054384017758045, + "grad_norm": 0.46947672963142395, + "learning_rate": 7.960833792139461e-06, + "loss": 0.4775, + "step": 1328 + }, + { + "epoch": 1.106270810210877, + "grad_norm": 0.42204639315605164, + "learning_rate": 7.95692810915252e-06, + "loss": 0.4228, + "step": 1329 + }, + { + "epoch": 1.1071032186459488, + "grad_norm": 0.41625940799713135, + "learning_rate": 7.953019649558309e-06, + "loss": 0.4016, + "step": 1330 + }, + { + "epoch": 1.107935627081021, + "grad_norm": 0.39135950803756714, + "learning_rate": 7.949108417026941e-06, + "loss": 0.4445, + "step": 1331 + }, + { + "epoch": 1.1087680355160932, + "grad_norm": 0.4039277136325836, + "learning_rate": 7.945194415231133e-06, + "loss": 0.4099, + "step": 1332 + }, + { + "epoch": 1.1096004439511653, + "grad_norm": 0.3570069670677185, + "learning_rate": 7.9412776478462e-06, + "loss": 0.4073, + "step": 1333 + }, + { + "epoch": 1.1104328523862375, + "grad_norm": 0.4042835235595703, + "learning_rate": 7.937358118550058e-06, + "loss": 0.4037, + "step": 1334 + }, + { + "epoch": 1.1112652608213096, + "grad_norm": 0.4284457266330719, + "learning_rate": 7.933435831023211e-06, + "loss": 0.4517, + "step": 1335 + }, + { + "epoch": 1.1120976692563818, + "grad_norm": 0.3967071771621704, + "learning_rate": 7.929510788948755e-06, + "loss": 0.4205, + "step": 1336 + }, + { + "epoch": 1.112930077691454, + "grad_norm": 0.3720296323299408, + "learning_rate": 7.925582996012375e-06, + "loss": 0.4472, + "step": 1337 + }, + { + "epoch": 1.113762486126526, + "grad_norm": 0.38390690088272095, + "learning_rate": 7.921652455902337e-06, + "loss": 0.3934, + "step": 1338 + }, + { + "epoch": 1.1145948945615982, + "grad_norm": 0.48500263690948486, + "learning_rate": 7.917719172309487e-06, + "loss": 0.4521, + "step": 1339 + }, + { + "epoch": 1.1154273029966704, + "grad_norm": 0.38441312313079834, + "learning_rate": 7.913783148927246e-06, + "loss": 0.4474, + "step": 1340 + }, + { + "epoch": 1.1162597114317425, + "grad_norm": 0.462807297706604, + "learning_rate": 7.909844389451611e-06, + "loss": 0.4397, + "step": 1341 + }, + { + "epoch": 1.1170921198668147, + "grad_norm": 0.38858792185783386, + "learning_rate": 7.905902897581145e-06, + "loss": 0.3972, + "step": 1342 + }, + { + "epoch": 1.1179245283018868, + "grad_norm": 0.3973788022994995, + "learning_rate": 7.901958677016977e-06, + "loss": 0.4265, + "step": 1343 + }, + { + "epoch": 1.118756936736959, + "grad_norm": 0.3690992593765259, + "learning_rate": 7.898011731462801e-06, + "loss": 0.4439, + "step": 1344 + }, + { + "epoch": 1.1195893451720311, + "grad_norm": 0.5391969680786133, + "learning_rate": 7.894062064624865e-06, + "loss": 0.4472, + "step": 1345 + }, + { + "epoch": 1.1204217536071033, + "grad_norm": 0.3407292664051056, + "learning_rate": 7.890109680211979e-06, + "loss": 0.3753, + "step": 1346 + }, + { + "epoch": 1.1212541620421754, + "grad_norm": 0.3949512541294098, + "learning_rate": 7.886154581935499e-06, + "loss": 0.459, + "step": 1347 + }, + { + "epoch": 1.1220865704772476, + "grad_norm": 0.3942648470401764, + "learning_rate": 7.88219677350933e-06, + "loss": 0.3809, + "step": 1348 + }, + { + "epoch": 1.1229189789123197, + "grad_norm": 0.4611280560493469, + "learning_rate": 7.878236258649927e-06, + "loss": 0.4762, + "step": 1349 + }, + { + "epoch": 1.1237513873473919, + "grad_norm": 0.37650808691978455, + "learning_rate": 7.874273041076283e-06, + "loss": 0.4166, + "step": 1350 + }, + { + "epoch": 1.1245837957824638, + "grad_norm": 0.4160518944263458, + "learning_rate": 7.870307124509926e-06, + "loss": 0.3948, + "step": 1351 + }, + { + "epoch": 1.1254162042175362, + "grad_norm": 0.39588451385498047, + "learning_rate": 7.86633851267492e-06, + "loss": 0.4322, + "step": 1352 + }, + { + "epoch": 1.1262486126526081, + "grad_norm": 0.3730158507823944, + "learning_rate": 7.862367209297864e-06, + "loss": 0.4327, + "step": 1353 + }, + { + "epoch": 1.1270810210876803, + "grad_norm": 0.43521034717559814, + "learning_rate": 7.85839321810788e-06, + "loss": 0.392, + "step": 1354 + }, + { + "epoch": 1.1279134295227524, + "grad_norm": 0.4675239622592926, + "learning_rate": 7.854416542836617e-06, + "loss": 0.4395, + "step": 1355 + }, + { + "epoch": 1.1287458379578246, + "grad_norm": 0.38083502650260925, + "learning_rate": 7.85043718721824e-06, + "loss": 0.417, + "step": 1356 + }, + { + "epoch": 1.1295782463928967, + "grad_norm": 0.46890681982040405, + "learning_rate": 7.846455154989437e-06, + "loss": 0.4463, + "step": 1357 + }, + { + "epoch": 1.1304106548279689, + "grad_norm": 0.4093262851238251, + "learning_rate": 7.842470449889403e-06, + "loss": 0.394, + "step": 1358 + }, + { + "epoch": 1.131243063263041, + "grad_norm": 0.35766276717185974, + "learning_rate": 7.838483075659846e-06, + "loss": 0.4444, + "step": 1359 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.4069897532463074, + "learning_rate": 7.83449303604498e-06, + "loss": 0.4139, + "step": 1360 + }, + { + "epoch": 1.1329078801331853, + "grad_norm": 0.4673629403114319, + "learning_rate": 7.830500334791525e-06, + "loss": 0.4268, + "step": 1361 + }, + { + "epoch": 1.1337402885682575, + "grad_norm": 0.4079434275627136, + "learning_rate": 7.826504975648696e-06, + "loss": 0.4431, + "step": 1362 + }, + { + "epoch": 1.1345726970033296, + "grad_norm": 0.4580193758010864, + "learning_rate": 7.822506962368204e-06, + "loss": 0.424, + "step": 1363 + }, + { + "epoch": 1.1354051054384018, + "grad_norm": 0.4982101023197174, + "learning_rate": 7.818506298704254e-06, + "loss": 0.4285, + "step": 1364 + }, + { + "epoch": 1.136237513873474, + "grad_norm": 0.3587871491909027, + "learning_rate": 7.814502988413539e-06, + "loss": 0.4058, + "step": 1365 + }, + { + "epoch": 1.137069922308546, + "grad_norm": 0.48973003029823303, + "learning_rate": 7.810497035255239e-06, + "loss": 0.4631, + "step": 1366 + }, + { + "epoch": 1.1379023307436182, + "grad_norm": 0.4257657825946808, + "learning_rate": 7.80648844299101e-06, + "loss": 0.4013, + "step": 1367 + }, + { + "epoch": 1.1387347391786904, + "grad_norm": 0.5077852010726929, + "learning_rate": 7.802477215384997e-06, + "loss": 0.4421, + "step": 1368 + }, + { + "epoch": 1.1395671476137625, + "grad_norm": 0.41671276092529297, + "learning_rate": 7.79846335620381e-06, + "loss": 0.4297, + "step": 1369 + }, + { + "epoch": 1.1403995560488347, + "grad_norm": 0.392742395401001, + "learning_rate": 7.794446869216527e-06, + "loss": 0.3575, + "step": 1370 + }, + { + "epoch": 1.1412319644839068, + "grad_norm": 0.504817545413971, + "learning_rate": 7.79042775819471e-06, + "loss": 0.4213, + "step": 1371 + }, + { + "epoch": 1.142064372918979, + "grad_norm": 0.4020238518714905, + "learning_rate": 7.786406026912368e-06, + "loss": 0.4363, + "step": 1372 + }, + { + "epoch": 1.1428967813540512, + "grad_norm": 0.49842900037765503, + "learning_rate": 7.782381679145979e-06, + "loss": 0.4386, + "step": 1373 + }, + { + "epoch": 1.143729189789123, + "grad_norm": 0.48727041482925415, + "learning_rate": 7.778354718674475e-06, + "loss": 0.463, + "step": 1374 + }, + { + "epoch": 1.1445615982241955, + "grad_norm": 0.39957672357559204, + "learning_rate": 7.774325149279243e-06, + "loss": 0.402, + "step": 1375 + }, + { + "epoch": 1.1453940066592674, + "grad_norm": 0.5904676914215088, + "learning_rate": 7.770292974744119e-06, + "loss": 0.4446, + "step": 1376 + }, + { + "epoch": 1.1462264150943395, + "grad_norm": 0.34512779116630554, + "learning_rate": 7.766258198855386e-06, + "loss": 0.4331, + "step": 1377 + }, + { + "epoch": 1.1470588235294117, + "grad_norm": 0.44855794310569763, + "learning_rate": 7.76222082540177e-06, + "loss": 0.4014, + "step": 1378 + }, + { + "epoch": 1.1478912319644838, + "grad_norm": 0.5417425632476807, + "learning_rate": 7.758180858174434e-06, + "loss": 0.4453, + "step": 1379 + }, + { + "epoch": 1.148723640399556, + "grad_norm": 0.3855729401111603, + "learning_rate": 7.754138300966978e-06, + "loss": 0.4654, + "step": 1380 + }, + { + "epoch": 1.1495560488346281, + "grad_norm": 0.4581325650215149, + "learning_rate": 7.750093157575433e-06, + "loss": 0.4398, + "step": 1381 + }, + { + "epoch": 1.1503884572697003, + "grad_norm": 0.4510229825973511, + "learning_rate": 7.746045431798264e-06, + "loss": 0.4312, + "step": 1382 + }, + { + "epoch": 1.1512208657047724, + "grad_norm": 0.3615362346172333, + "learning_rate": 7.74199512743635e-06, + "loss": 0.4297, + "step": 1383 + }, + { + "epoch": 1.1520532741398446, + "grad_norm": 0.4092762768268585, + "learning_rate": 7.737942248293001e-06, + "loss": 0.4472, + "step": 1384 + }, + { + "epoch": 1.1528856825749167, + "grad_norm": 0.46709978580474854, + "learning_rate": 7.733886798173945e-06, + "loss": 0.456, + "step": 1385 + }, + { + "epoch": 1.153718091009989, + "grad_norm": 0.3712662160396576, + "learning_rate": 7.729828780887313e-06, + "loss": 0.4129, + "step": 1386 + }, + { + "epoch": 1.154550499445061, + "grad_norm": 0.35037344694137573, + "learning_rate": 7.72576820024366e-06, + "loss": 0.4172, + "step": 1387 + }, + { + "epoch": 1.1553829078801332, + "grad_norm": 0.4096495807170868, + "learning_rate": 7.72170506005594e-06, + "loss": 0.4243, + "step": 1388 + }, + { + "epoch": 1.1562153163152054, + "grad_norm": 0.4149743914604187, + "learning_rate": 7.717639364139514e-06, + "loss": 0.4728, + "step": 1389 + }, + { + "epoch": 1.1570477247502775, + "grad_norm": 0.3861239552497864, + "learning_rate": 7.713571116312143e-06, + "loss": 0.4409, + "step": 1390 + }, + { + "epoch": 1.1578801331853497, + "grad_norm": 0.3818301558494568, + "learning_rate": 7.709500320393976e-06, + "loss": 0.415, + "step": 1391 + }, + { + "epoch": 1.1587125416204218, + "grad_norm": 0.3710886240005493, + "learning_rate": 7.70542698020757e-06, + "loss": 0.4251, + "step": 1392 + }, + { + "epoch": 1.159544950055494, + "grad_norm": 0.3728558421134949, + "learning_rate": 7.70135109957786e-06, + "loss": 0.4196, + "step": 1393 + }, + { + "epoch": 1.1603773584905661, + "grad_norm": 0.42186808586120605, + "learning_rate": 7.697272682332168e-06, + "loss": 0.4473, + "step": 1394 + }, + { + "epoch": 1.1612097669256383, + "grad_norm": 0.37886783480644226, + "learning_rate": 7.6931917323002e-06, + "loss": 0.4295, + "step": 1395 + }, + { + "epoch": 1.1620421753607104, + "grad_norm": 0.36047130823135376, + "learning_rate": 7.689108253314038e-06, + "loss": 0.4145, + "step": 1396 + }, + { + "epoch": 1.1628745837957823, + "grad_norm": 0.41782742738723755, + "learning_rate": 7.685022249208142e-06, + "loss": 0.4459, + "step": 1397 + }, + { + "epoch": 1.1637069922308547, + "grad_norm": 0.3524583578109741, + "learning_rate": 7.680933723819343e-06, + "loss": 0.4361, + "step": 1398 + }, + { + "epoch": 1.1645394006659266, + "grad_norm": 0.34524109959602356, + "learning_rate": 7.676842680986836e-06, + "loss": 0.405, + "step": 1399 + }, + { + "epoch": 1.1653718091009988, + "grad_norm": 0.40391451120376587, + "learning_rate": 7.67274912455218e-06, + "loss": 0.445, + "step": 1400 + }, + { + "epoch": 1.166204217536071, + "grad_norm": 0.3440330922603607, + "learning_rate": 7.6686530583593e-06, + "loss": 0.4126, + "step": 1401 + }, + { + "epoch": 1.167036625971143, + "grad_norm": 0.4131713807582855, + "learning_rate": 7.664554486254468e-06, + "loss": 0.4831, + "step": 1402 + }, + { + "epoch": 1.1678690344062153, + "grad_norm": 0.3954821527004242, + "learning_rate": 7.660453412086323e-06, + "loss": 0.4501, + "step": 1403 + }, + { + "epoch": 1.1687014428412874, + "grad_norm": 0.3479725420475006, + "learning_rate": 7.656349839705838e-06, + "loss": 0.4125, + "step": 1404 + }, + { + "epoch": 1.1695338512763596, + "grad_norm": 0.4041290581226349, + "learning_rate": 7.652243772966345e-06, + "loss": 0.3941, + "step": 1405 + }, + { + "epoch": 1.1703662597114317, + "grad_norm": 0.3935270607471466, + "learning_rate": 7.648135215723511e-06, + "loss": 0.4381, + "step": 1406 + }, + { + "epoch": 1.1711986681465039, + "grad_norm": 0.3363988399505615, + "learning_rate": 7.64402417183534e-06, + "loss": 0.3909, + "step": 1407 + }, + { + "epoch": 1.172031076581576, + "grad_norm": 0.358073353767395, + "learning_rate": 7.639910645162179e-06, + "loss": 0.4092, + "step": 1408 + }, + { + "epoch": 1.1728634850166482, + "grad_norm": 0.38340896368026733, + "learning_rate": 7.635794639566697e-06, + "loss": 0.4579, + "step": 1409 + }, + { + "epoch": 1.1736958934517203, + "grad_norm": 0.3633081912994385, + "learning_rate": 7.631676158913899e-06, + "loss": 0.4123, + "step": 1410 + }, + { + "epoch": 1.1745283018867925, + "grad_norm": 0.4083158075809479, + "learning_rate": 7.627555207071108e-06, + "loss": 0.451, + "step": 1411 + }, + { + "epoch": 1.1753607103218646, + "grad_norm": 0.3736763894557953, + "learning_rate": 7.623431787907971e-06, + "loss": 0.4061, + "step": 1412 + }, + { + "epoch": 1.1761931187569368, + "grad_norm": 0.3408348858356476, + "learning_rate": 7.61930590529645e-06, + "loss": 0.4175, + "step": 1413 + }, + { + "epoch": 1.177025527192009, + "grad_norm": 0.38261574506759644, + "learning_rate": 7.6151775631108245e-06, + "loss": 0.4458, + "step": 1414 + }, + { + "epoch": 1.177857935627081, + "grad_norm": 0.3570031225681305, + "learning_rate": 7.611046765227675e-06, + "loss": 0.3869, + "step": 1415 + }, + { + "epoch": 1.1786903440621532, + "grad_norm": 0.3958790600299835, + "learning_rate": 7.606913515525896e-06, + "loss": 0.4224, + "step": 1416 + }, + { + "epoch": 1.1795227524972254, + "grad_norm": 0.43983784317970276, + "learning_rate": 7.602777817886678e-06, + "loss": 0.456, + "step": 1417 + }, + { + "epoch": 1.1803551609322975, + "grad_norm": 0.3442942202091217, + "learning_rate": 7.59863967619352e-06, + "loss": 0.4149, + "step": 1418 + }, + { + "epoch": 1.1811875693673697, + "grad_norm": 0.3674313426017761, + "learning_rate": 7.594499094332204e-06, + "loss": 0.4029, + "step": 1419 + }, + { + "epoch": 1.1820199778024416, + "grad_norm": 0.4117499589920044, + "learning_rate": 7.59035607619081e-06, + "loss": 0.434, + "step": 1420 + }, + { + "epoch": 1.182852386237514, + "grad_norm": 0.3603156507015228, + "learning_rate": 7.586210625659707e-06, + "loss": 0.4318, + "step": 1421 + }, + { + "epoch": 1.183684794672586, + "grad_norm": 0.38503581285476685, + "learning_rate": 7.582062746631542e-06, + "loss": 0.4139, + "step": 1422 + }, + { + "epoch": 1.184517203107658, + "grad_norm": 0.43392035365104675, + "learning_rate": 7.577912443001247e-06, + "loss": 0.4058, + "step": 1423 + }, + { + "epoch": 1.1853496115427302, + "grad_norm": 0.3828240633010864, + "learning_rate": 7.573759718666031e-06, + "loss": 0.4342, + "step": 1424 + }, + { + "epoch": 1.1861820199778024, + "grad_norm": 0.39743468165397644, + "learning_rate": 7.569604577525376e-06, + "loss": 0.4351, + "step": 1425 + }, + { + "epoch": 1.1870144284128745, + "grad_norm": 0.417901873588562, + "learning_rate": 7.56544702348103e-06, + "loss": 0.4358, + "step": 1426 + }, + { + "epoch": 1.1878468368479467, + "grad_norm": 0.4021197259426117, + "learning_rate": 7.5612870604370106e-06, + "loss": 0.4402, + "step": 1427 + }, + { + "epoch": 1.1886792452830188, + "grad_norm": 0.32264193892478943, + "learning_rate": 7.557124692299593e-06, + "loss": 0.3862, + "step": 1428 + }, + { + "epoch": 1.189511653718091, + "grad_norm": 0.41591644287109375, + "learning_rate": 7.552959922977317e-06, + "loss": 0.4142, + "step": 1429 + }, + { + "epoch": 1.1903440621531631, + "grad_norm": 0.38096314668655396, + "learning_rate": 7.548792756380972e-06, + "loss": 0.4072, + "step": 1430 + }, + { + "epoch": 1.1911764705882353, + "grad_norm": 0.3876970410346985, + "learning_rate": 7.5446231964236025e-06, + "loss": 0.4199, + "step": 1431 + }, + { + "epoch": 1.1920088790233074, + "grad_norm": 0.36842256784439087, + "learning_rate": 7.540451247020495e-06, + "loss": 0.4071, + "step": 1432 + }, + { + "epoch": 1.1928412874583796, + "grad_norm": 0.3903951644897461, + "learning_rate": 7.536276912089187e-06, + "loss": 0.4546, + "step": 1433 + }, + { + "epoch": 1.1936736958934517, + "grad_norm": 0.38267070055007935, + "learning_rate": 7.53210019554945e-06, + "loss": 0.4082, + "step": 1434 + }, + { + "epoch": 1.1945061043285239, + "grad_norm": 0.3489694893360138, + "learning_rate": 7.527921101323292e-06, + "loss": 0.4278, + "step": 1435 + }, + { + "epoch": 1.195338512763596, + "grad_norm": 0.36168771982192993, + "learning_rate": 7.523739633334959e-06, + "loss": 0.4372, + "step": 1436 + }, + { + "epoch": 1.1961709211986682, + "grad_norm": 0.37979066371917725, + "learning_rate": 7.5195557955109225e-06, + "loss": 0.4514, + "step": 1437 + }, + { + "epoch": 1.1970033296337403, + "grad_norm": 0.3345673382282257, + "learning_rate": 7.515369591779876e-06, + "loss": 0.4163, + "step": 1438 + }, + { + "epoch": 1.1978357380688125, + "grad_norm": 0.3836076855659485, + "learning_rate": 7.511181026072741e-06, + "loss": 0.4598, + "step": 1439 + }, + { + "epoch": 1.1986681465038846, + "grad_norm": 0.37280598282814026, + "learning_rate": 7.5069901023226545e-06, + "loss": 0.4072, + "step": 1440 + }, + { + "epoch": 1.1995005549389568, + "grad_norm": 0.35891035199165344, + "learning_rate": 7.502796824464966e-06, + "loss": 0.4475, + "step": 1441 + }, + { + "epoch": 1.200332963374029, + "grad_norm": 0.3753512501716614, + "learning_rate": 7.498601196437238e-06, + "loss": 0.4583, + "step": 1442 + }, + { + "epoch": 1.2011653718091009, + "grad_norm": 0.33655253052711487, + "learning_rate": 7.494403222179235e-06, + "loss": 0.399, + "step": 1443 + }, + { + "epoch": 1.2019977802441733, + "grad_norm": 0.34640976786613464, + "learning_rate": 7.490202905632933e-06, + "loss": 0.401, + "step": 1444 + }, + { + "epoch": 1.2028301886792452, + "grad_norm": 0.34061723947525024, + "learning_rate": 7.4860002507425004e-06, + "loss": 0.408, + "step": 1445 + }, + { + "epoch": 1.2036625971143173, + "grad_norm": 0.36065739393234253, + "learning_rate": 7.481795261454304e-06, + "loss": 0.4472, + "step": 1446 + }, + { + "epoch": 1.2044950055493895, + "grad_norm": 0.33626216650009155, + "learning_rate": 7.477587941716904e-06, + "loss": 0.4088, + "step": 1447 + }, + { + "epoch": 1.2053274139844616, + "grad_norm": 0.3538858890533447, + "learning_rate": 7.4733782954810444e-06, + "loss": 0.4513, + "step": 1448 + }, + { + "epoch": 1.2061598224195338, + "grad_norm": 0.3653189539909363, + "learning_rate": 7.469166326699658e-06, + "loss": 0.4268, + "step": 1449 + }, + { + "epoch": 1.206992230854606, + "grad_norm": 0.3585223853588104, + "learning_rate": 7.4649520393278575e-06, + "loss": 0.3983, + "step": 1450 + }, + { + "epoch": 1.207824639289678, + "grad_norm": 0.38904839754104614, + "learning_rate": 7.460735437322933e-06, + "loss": 0.4459, + "step": 1451 + }, + { + "epoch": 1.2086570477247502, + "grad_norm": 0.34138140082359314, + "learning_rate": 7.456516524644347e-06, + "loss": 0.4143, + "step": 1452 + }, + { + "epoch": 1.2094894561598224, + "grad_norm": 0.3598451614379883, + "learning_rate": 7.452295305253731e-06, + "loss": 0.4108, + "step": 1453 + }, + { + "epoch": 1.2103218645948945, + "grad_norm": 0.35666725039482117, + "learning_rate": 7.448071783114887e-06, + "loss": 0.4157, + "step": 1454 + }, + { + "epoch": 1.2111542730299667, + "grad_norm": 0.42830565571784973, + "learning_rate": 7.443845962193775e-06, + "loss": 0.4488, + "step": 1455 + }, + { + "epoch": 1.2119866814650389, + "grad_norm": 0.3601066470146179, + "learning_rate": 7.439617846458513e-06, + "loss": 0.4221, + "step": 1456 + }, + { + "epoch": 1.212819089900111, + "grad_norm": 0.36113888025283813, + "learning_rate": 7.435387439879378e-06, + "loss": 0.368, + "step": 1457 + }, + { + "epoch": 1.2136514983351832, + "grad_norm": 0.4079122245311737, + "learning_rate": 7.431154746428794e-06, + "loss": 0.4567, + "step": 1458 + }, + { + "epoch": 1.2144839067702553, + "grad_norm": 0.3279145658016205, + "learning_rate": 7.4269197700813375e-06, + "loss": 0.401, + "step": 1459 + }, + { + "epoch": 1.2153163152053275, + "grad_norm": 0.3636506497859955, + "learning_rate": 7.4226825148137225e-06, + "loss": 0.4394, + "step": 1460 + }, + { + "epoch": 1.2161487236403996, + "grad_norm": 0.3573419153690338, + "learning_rate": 7.418442984604805e-06, + "loss": 0.3929, + "step": 1461 + }, + { + "epoch": 1.2169811320754718, + "grad_norm": 0.3526058793067932, + "learning_rate": 7.414201183435581e-06, + "loss": 0.4305, + "step": 1462 + }, + { + "epoch": 1.217813540510544, + "grad_norm": 0.34084025025367737, + "learning_rate": 7.409957115289175e-06, + "loss": 0.4018, + "step": 1463 + }, + { + "epoch": 1.218645948945616, + "grad_norm": 0.3763788342475891, + "learning_rate": 7.40571078415084e-06, + "loss": 0.4106, + "step": 1464 + }, + { + "epoch": 1.2194783573806882, + "grad_norm": 0.37996524572372437, + "learning_rate": 7.401462194007957e-06, + "loss": 0.4664, + "step": 1465 + }, + { + "epoch": 1.2203107658157601, + "grad_norm": 0.3957565724849701, + "learning_rate": 7.397211348850025e-06, + "loss": 0.4947, + "step": 1466 + }, + { + "epoch": 1.2211431742508325, + "grad_norm": 0.33967241644859314, + "learning_rate": 7.392958252668663e-06, + "loss": 0.3549, + "step": 1467 + }, + { + "epoch": 1.2219755826859044, + "grad_norm": 0.37266120314598083, + "learning_rate": 7.388702909457603e-06, + "loss": 0.4341, + "step": 1468 + }, + { + "epoch": 1.2228079911209766, + "grad_norm": 0.37723714113235474, + "learning_rate": 7.384445323212687e-06, + "loss": 0.4435, + "step": 1469 + }, + { + "epoch": 1.2236403995560488, + "grad_norm": 0.3732614815235138, + "learning_rate": 7.380185497931862e-06, + "loss": 0.4508, + "step": 1470 + }, + { + "epoch": 1.224472807991121, + "grad_norm": 0.35898712277412415, + "learning_rate": 7.375923437615179e-06, + "loss": 0.4299, + "step": 1471 + }, + { + "epoch": 1.225305216426193, + "grad_norm": 0.37605753540992737, + "learning_rate": 7.371659146264787e-06, + "loss": 0.4495, + "step": 1472 + }, + { + "epoch": 1.2261376248612652, + "grad_norm": 0.35970860719680786, + "learning_rate": 7.367392627884931e-06, + "loss": 0.3909, + "step": 1473 + }, + { + "epoch": 1.2269700332963374, + "grad_norm": 0.39131295680999756, + "learning_rate": 7.363123886481947e-06, + "loss": 0.4308, + "step": 1474 + }, + { + "epoch": 1.2278024417314095, + "grad_norm": 0.38385316729545593, + "learning_rate": 7.3588529260642564e-06, + "loss": 0.4483, + "step": 1475 + }, + { + "epoch": 1.2286348501664817, + "grad_norm": 0.35251525044441223, + "learning_rate": 7.3545797506423655e-06, + "loss": 0.4427, + "step": 1476 + }, + { + "epoch": 1.2294672586015538, + "grad_norm": 0.3673636317253113, + "learning_rate": 7.3503043642288614e-06, + "loss": 0.4065, + "step": 1477 + }, + { + "epoch": 1.230299667036626, + "grad_norm": 0.3953494727611542, + "learning_rate": 7.3460267708384084e-06, + "loss": 0.4341, + "step": 1478 + }, + { + "epoch": 1.2311320754716981, + "grad_norm": 0.34204229712486267, + "learning_rate": 7.3417469744877375e-06, + "loss": 0.4064, + "step": 1479 + }, + { + "epoch": 1.2319644839067703, + "grad_norm": 0.35544899106025696, + "learning_rate": 7.337464979195658e-06, + "loss": 0.4086, + "step": 1480 + }, + { + "epoch": 1.2327968923418424, + "grad_norm": 0.37943002581596375, + "learning_rate": 7.333180788983034e-06, + "loss": 0.4459, + "step": 1481 + }, + { + "epoch": 1.2336293007769146, + "grad_norm": 0.33336302638053894, + "learning_rate": 7.328894407872797e-06, + "loss": 0.409, + "step": 1482 + }, + { + "epoch": 1.2344617092119867, + "grad_norm": 0.3566553294658661, + "learning_rate": 7.324605839889936e-06, + "loss": 0.4377, + "step": 1483 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.39889898896217346, + "learning_rate": 7.320315089061486e-06, + "loss": 0.4603, + "step": 1484 + }, + { + "epoch": 1.236126526082131, + "grad_norm": 0.4692656695842743, + "learning_rate": 7.3160221594165415e-06, + "loss": 0.4108, + "step": 1485 + }, + { + "epoch": 1.2369589345172032, + "grad_norm": 0.39253804087638855, + "learning_rate": 7.3117270549862385e-06, + "loss": 0.4393, + "step": 1486 + }, + { + "epoch": 1.2377913429522753, + "grad_norm": 0.4060220718383789, + "learning_rate": 7.3074297798037515e-06, + "loss": 0.423, + "step": 1487 + }, + { + "epoch": 1.2386237513873475, + "grad_norm": 0.32819664478302, + "learning_rate": 7.303130337904303e-06, + "loss": 0.3855, + "step": 1488 + }, + { + "epoch": 1.2394561598224194, + "grad_norm": 0.36876818537712097, + "learning_rate": 7.298828733325138e-06, + "loss": 0.4056, + "step": 1489 + }, + { + "epoch": 1.2402885682574918, + "grad_norm": 0.37898144125938416, + "learning_rate": 7.294524970105543e-06, + "loss": 0.3985, + "step": 1490 + }, + { + "epoch": 1.2411209766925637, + "grad_norm": 0.3846915662288666, + "learning_rate": 7.290219052286826e-06, + "loss": 0.4752, + "step": 1491 + }, + { + "epoch": 1.2419533851276359, + "grad_norm": 0.4225359857082367, + "learning_rate": 7.285910983912317e-06, + "loss": 0.4152, + "step": 1492 + }, + { + "epoch": 1.242785793562708, + "grad_norm": 0.3616348206996918, + "learning_rate": 7.281600769027371e-06, + "loss": 0.4244, + "step": 1493 + }, + { + "epoch": 1.2436182019977802, + "grad_norm": 0.35701385140419006, + "learning_rate": 7.277288411679352e-06, + "loss": 0.4284, + "step": 1494 + }, + { + "epoch": 1.2444506104328523, + "grad_norm": 0.37644630670547485, + "learning_rate": 7.272973915917642e-06, + "loss": 0.4185, + "step": 1495 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 0.3375144898891449, + "learning_rate": 7.268657285793625e-06, + "loss": 0.3967, + "step": 1496 + }, + { + "epoch": 1.2461154273029966, + "grad_norm": 0.36483046412467957, + "learning_rate": 7.264338525360695e-06, + "loss": 0.4346, + "step": 1497 + }, + { + "epoch": 1.2469478357380688, + "grad_norm": 0.364069402217865, + "learning_rate": 7.260017638674244e-06, + "loss": 0.4308, + "step": 1498 + }, + { + "epoch": 1.247780244173141, + "grad_norm": 0.3628464639186859, + "learning_rate": 7.255694629791659e-06, + "loss": 0.4345, + "step": 1499 + }, + { + "epoch": 1.248612652608213, + "grad_norm": 0.33808740973472595, + "learning_rate": 7.251369502772318e-06, + "loss": 0.405, + "step": 1500 + }, + { + "epoch": 1.2494450610432852, + "grad_norm": 0.4051111042499542, + "learning_rate": 7.247042261677597e-06, + "loss": 0.4147, + "step": 1501 + }, + { + "epoch": 1.2502774694783574, + "grad_norm": 0.34634676575660706, + "learning_rate": 7.242712910570846e-06, + "loss": 0.4635, + "step": 1502 + }, + { + "epoch": 1.2511098779134295, + "grad_norm": 0.42578983306884766, + "learning_rate": 7.238381453517405e-06, + "loss": 0.4309, + "step": 1503 + }, + { + "epoch": 1.2519422863485017, + "grad_norm": 0.38502374291419983, + "learning_rate": 7.234047894584586e-06, + "loss": 0.4128, + "step": 1504 + }, + { + "epoch": 1.2527746947835738, + "grad_norm": 0.37332308292388916, + "learning_rate": 7.229712237841679e-06, + "loss": 0.4229, + "step": 1505 + }, + { + "epoch": 1.253607103218646, + "grad_norm": 0.3640212118625641, + "learning_rate": 7.225374487359937e-06, + "loss": 0.4239, + "step": 1506 + }, + { + "epoch": 1.2544395116537181, + "grad_norm": 0.3998686671257019, + "learning_rate": 7.221034647212588e-06, + "loss": 0.3917, + "step": 1507 + }, + { + "epoch": 1.2552719200887903, + "grad_norm": 0.3780185282230377, + "learning_rate": 7.216692721474816e-06, + "loss": 0.4168, + "step": 1508 + }, + { + "epoch": 1.2561043285238624, + "grad_norm": 0.3652763068675995, + "learning_rate": 7.212348714223767e-06, + "loss": 0.4179, + "step": 1509 + }, + { + "epoch": 1.2569367369589346, + "grad_norm": 0.3468812108039856, + "learning_rate": 7.208002629538537e-06, + "loss": 0.3956, + "step": 1510 + }, + { + "epoch": 1.2577691453940067, + "grad_norm": 0.46192142367362976, + "learning_rate": 7.203654471500179e-06, + "loss": 0.4355, + "step": 1511 + }, + { + "epoch": 1.2586015538290787, + "grad_norm": 0.336422324180603, + "learning_rate": 7.199304244191687e-06, + "loss": 0.4085, + "step": 1512 + }, + { + "epoch": 1.259433962264151, + "grad_norm": 0.3533170521259308, + "learning_rate": 7.1949519516980005e-06, + "loss": 0.4076, + "step": 1513 + }, + { + "epoch": 1.260266370699223, + "grad_norm": 0.34538713097572327, + "learning_rate": 7.190597598106001e-06, + "loss": 0.3822, + "step": 1514 + }, + { + "epoch": 1.2610987791342954, + "grad_norm": 0.35250788927078247, + "learning_rate": 7.186241187504499e-06, + "loss": 0.4416, + "step": 1515 + }, + { + "epoch": 1.2619311875693673, + "grad_norm": 0.38682812452316284, + "learning_rate": 7.1818827239842446e-06, + "loss": 0.4295, + "step": 1516 + }, + { + "epoch": 1.2627635960044394, + "grad_norm": 0.3738081455230713, + "learning_rate": 7.177522211637906e-06, + "loss": 0.4203, + "step": 1517 + }, + { + "epoch": 1.2635960044395116, + "grad_norm": 0.39243027567863464, + "learning_rate": 7.173159654560087e-06, + "loss": 0.4994, + "step": 1518 + }, + { + "epoch": 1.2644284128745837, + "grad_norm": 0.337700754404068, + "learning_rate": 7.168795056847301e-06, + "loss": 0.407, + "step": 1519 + }, + { + "epoch": 1.265260821309656, + "grad_norm": 0.4204353094100952, + "learning_rate": 7.164428422597982e-06, + "loss": 0.4189, + "step": 1520 + }, + { + "epoch": 1.266093229744728, + "grad_norm": 0.38597655296325684, + "learning_rate": 7.1600597559124765e-06, + "loss": 0.4476, + "step": 1521 + }, + { + "epoch": 1.2669256381798002, + "grad_norm": 0.32296887040138245, + "learning_rate": 7.155689060893038e-06, + "loss": 0.3669, + "step": 1522 + }, + { + "epoch": 1.2677580466148723, + "grad_norm": 0.35379377007484436, + "learning_rate": 7.151316341643828e-06, + "loss": 0.4346, + "step": 1523 + }, + { + "epoch": 1.2685904550499445, + "grad_norm": 0.5642543435096741, + "learning_rate": 7.146941602270905e-06, + "loss": 0.4237, + "step": 1524 + }, + { + "epoch": 1.2694228634850167, + "grad_norm": 0.4288339614868164, + "learning_rate": 7.142564846882227e-06, + "loss": 0.3797, + "step": 1525 + }, + { + "epoch": 1.2702552719200888, + "grad_norm": 0.36099711060523987, + "learning_rate": 7.1381860795876415e-06, + "loss": 0.4519, + "step": 1526 + }, + { + "epoch": 1.271087680355161, + "grad_norm": 0.4087908864021301, + "learning_rate": 7.13380530449889e-06, + "loss": 0.4499, + "step": 1527 + }, + { + "epoch": 1.271920088790233, + "grad_norm": 0.37486109137535095, + "learning_rate": 7.129422525729594e-06, + "loss": 0.4141, + "step": 1528 + }, + { + "epoch": 1.2727524972253053, + "grad_norm": 0.3277265429496765, + "learning_rate": 7.125037747395264e-06, + "loss": 0.3747, + "step": 1529 + }, + { + "epoch": 1.2735849056603774, + "grad_norm": 0.36414340138435364, + "learning_rate": 7.120650973613279e-06, + "loss": 0.4181, + "step": 1530 + }, + { + "epoch": 1.2744173140954496, + "grad_norm": 0.3803533911705017, + "learning_rate": 7.116262208502901e-06, + "loss": 0.429, + "step": 1531 + }, + { + "epoch": 1.2752497225305217, + "grad_norm": 0.3974792957305908, + "learning_rate": 7.111871456185253e-06, + "loss": 0.4555, + "step": 1532 + }, + { + "epoch": 1.2760821309655939, + "grad_norm": 0.35720840096473694, + "learning_rate": 7.107478720783332e-06, + "loss": 0.4415, + "step": 1533 + }, + { + "epoch": 1.276914539400666, + "grad_norm": 0.4027242362499237, + "learning_rate": 7.1030840064219906e-06, + "loss": 0.4108, + "step": 1534 + }, + { + "epoch": 1.277746947835738, + "grad_norm": 0.38121697306632996, + "learning_rate": 7.098687317227943e-06, + "loss": 0.4387, + "step": 1535 + }, + { + "epoch": 1.2785793562708103, + "grad_norm": 0.37340235710144043, + "learning_rate": 7.09428865732976e-06, + "loss": 0.4174, + "step": 1536 + }, + { + "epoch": 1.2794117647058822, + "grad_norm": 0.4037769138813019, + "learning_rate": 7.089888030857857e-06, + "loss": 0.4613, + "step": 1537 + }, + { + "epoch": 1.2802441731409546, + "grad_norm": 0.33645448088645935, + "learning_rate": 7.0854854419445e-06, + "loss": 0.3608, + "step": 1538 + }, + { + "epoch": 1.2810765815760266, + "grad_norm": 0.4041195511817932, + "learning_rate": 7.0810808947237975e-06, + "loss": 0.4305, + "step": 1539 + }, + { + "epoch": 1.2819089900110987, + "grad_norm": 0.34097573161125183, + "learning_rate": 7.076674393331697e-06, + "loss": 0.4156, + "step": 1540 + }, + { + "epoch": 1.2827413984461709, + "grad_norm": 0.40564876794815063, + "learning_rate": 7.0722659419059806e-06, + "loss": 0.4365, + "step": 1541 + }, + { + "epoch": 1.283573806881243, + "grad_norm": 0.39337435364723206, + "learning_rate": 7.0678555445862605e-06, + "loss": 0.4482, + "step": 1542 + }, + { + "epoch": 1.2844062153163152, + "grad_norm": 0.3427252173423767, + "learning_rate": 7.063443205513975e-06, + "loss": 0.3806, + "step": 1543 + }, + { + "epoch": 1.2852386237513873, + "grad_norm": 0.4486899673938751, + "learning_rate": 7.059028928832394e-06, + "loss": 0.4238, + "step": 1544 + }, + { + "epoch": 1.2860710321864595, + "grad_norm": 0.37656182050704956, + "learning_rate": 7.054612718686593e-06, + "loss": 0.4165, + "step": 1545 + }, + { + "epoch": 1.2869034406215316, + "grad_norm": 0.37508389353752136, + "learning_rate": 7.0501945792234776e-06, + "loss": 0.4368, + "step": 1546 + }, + { + "epoch": 1.2877358490566038, + "grad_norm": 0.385955274105072, + "learning_rate": 7.045774514591753e-06, + "loss": 0.4263, + "step": 1547 + }, + { + "epoch": 1.288568257491676, + "grad_norm": 0.3990999162197113, + "learning_rate": 7.041352528941939e-06, + "loss": 0.4219, + "step": 1548 + }, + { + "epoch": 1.289400665926748, + "grad_norm": 0.43571391701698303, + "learning_rate": 7.036928626426358e-06, + "loss": 0.4525, + "step": 1549 + }, + { + "epoch": 1.2902330743618202, + "grad_norm": 0.40208232402801514, + "learning_rate": 7.0325028111991325e-06, + "loss": 0.4166, + "step": 1550 + }, + { + "epoch": 1.2910654827968924, + "grad_norm": 0.5420854687690735, + "learning_rate": 7.02807508741618e-06, + "loss": 0.4908, + "step": 1551 + }, + { + "epoch": 1.2918978912319645, + "grad_norm": 0.3404446840286255, + "learning_rate": 7.0236454592352065e-06, + "loss": 0.3513, + "step": 1552 + }, + { + "epoch": 1.2927302996670367, + "grad_norm": 0.37210413813591003, + "learning_rate": 7.019213930815718e-06, + "loss": 0.4292, + "step": 1553 + }, + { + "epoch": 1.2935627081021088, + "grad_norm": 0.4521826207637787, + "learning_rate": 7.01478050631899e-06, + "loss": 0.4314, + "step": 1554 + }, + { + "epoch": 1.294395116537181, + "grad_norm": 0.3963296413421631, + "learning_rate": 7.010345189908092e-06, + "loss": 0.4345, + "step": 1555 + }, + { + "epoch": 1.2952275249722531, + "grad_norm": 0.3517422378063202, + "learning_rate": 7.0059079857478596e-06, + "loss": 0.4088, + "step": 1556 + }, + { + "epoch": 1.2960599334073253, + "grad_norm": 0.4621788263320923, + "learning_rate": 7.001468898004907e-06, + "loss": 0.4385, + "step": 1557 + }, + { + "epoch": 1.2968923418423972, + "grad_norm": 0.41421958804130554, + "learning_rate": 6.997027930847614e-06, + "loss": 0.4428, + "step": 1558 + }, + { + "epoch": 1.2977247502774696, + "grad_norm": 0.32892701029777527, + "learning_rate": 6.992585088446129e-06, + "loss": 0.4213, + "step": 1559 + }, + { + "epoch": 1.2985571587125415, + "grad_norm": 0.37421393394470215, + "learning_rate": 6.988140374972357e-06, + "loss": 0.3801, + "step": 1560 + }, + { + "epoch": 1.2993895671476139, + "grad_norm": 0.4647720456123352, + "learning_rate": 6.983693794599959e-06, + "loss": 0.4472, + "step": 1561 + }, + { + "epoch": 1.3002219755826858, + "grad_norm": 0.3581669330596924, + "learning_rate": 6.979245351504358e-06, + "loss": 0.4224, + "step": 1562 + }, + { + "epoch": 1.301054384017758, + "grad_norm": 0.37526553869247437, + "learning_rate": 6.974795049862715e-06, + "loss": 0.4071, + "step": 1563 + }, + { + "epoch": 1.3018867924528301, + "grad_norm": 0.3938126266002655, + "learning_rate": 6.970342893853943e-06, + "loss": 0.4474, + "step": 1564 + }, + { + "epoch": 1.3027192008879023, + "grad_norm": 0.33351486921310425, + "learning_rate": 6.965888887658695e-06, + "loss": 0.3736, + "step": 1565 + }, + { + "epoch": 1.3035516093229744, + "grad_norm": 0.4295842945575714, + "learning_rate": 6.961433035459361e-06, + "loss": 0.4689, + "step": 1566 + }, + { + "epoch": 1.3043840177580466, + "grad_norm": 0.3872586786746979, + "learning_rate": 6.956975341440061e-06, + "loss": 0.4328, + "step": 1567 + }, + { + "epoch": 1.3052164261931187, + "grad_norm": 0.3561748266220093, + "learning_rate": 6.952515809786652e-06, + "loss": 0.4492, + "step": 1568 + }, + { + "epoch": 1.3060488346281909, + "grad_norm": 0.41043657064437866, + "learning_rate": 6.948054444686709e-06, + "loss": 0.4037, + "step": 1569 + }, + { + "epoch": 1.306881243063263, + "grad_norm": 0.4166451394557953, + "learning_rate": 6.943591250329534e-06, + "loss": 0.4192, + "step": 1570 + }, + { + "epoch": 1.3077136514983352, + "grad_norm": 0.37019088864326477, + "learning_rate": 6.939126230906144e-06, + "loss": 0.4187, + "step": 1571 + }, + { + "epoch": 1.3085460599334073, + "grad_norm": 0.3775978684425354, + "learning_rate": 6.934659390609271e-06, + "loss": 0.4589, + "step": 1572 + }, + { + "epoch": 1.3093784683684795, + "grad_norm": 0.4076300859451294, + "learning_rate": 6.930190733633355e-06, + "loss": 0.437, + "step": 1573 + }, + { + "epoch": 1.3102108768035516, + "grad_norm": 0.36930978298187256, + "learning_rate": 6.925720264174543e-06, + "loss": 0.3679, + "step": 1574 + }, + { + "epoch": 1.3110432852386238, + "grad_norm": 0.40303748846054077, + "learning_rate": 6.921247986430686e-06, + "loss": 0.4646, + "step": 1575 + }, + { + "epoch": 1.311875693673696, + "grad_norm": 0.3812013566493988, + "learning_rate": 6.9167739046013305e-06, + "loss": 0.4285, + "step": 1576 + }, + { + "epoch": 1.312708102108768, + "grad_norm": 0.4709984064102173, + "learning_rate": 6.912298022887716e-06, + "loss": 0.4492, + "step": 1577 + }, + { + "epoch": 1.3135405105438402, + "grad_norm": 0.33343759179115295, + "learning_rate": 6.907820345492775e-06, + "loss": 0.3764, + "step": 1578 + }, + { + "epoch": 1.3143729189789124, + "grad_norm": 0.4059264361858368, + "learning_rate": 6.903340876621125e-06, + "loss": 0.4234, + "step": 1579 + }, + { + "epoch": 1.3152053274139845, + "grad_norm": 0.39759019017219543, + "learning_rate": 6.8988596204790655e-06, + "loss": 0.4593, + "step": 1580 + }, + { + "epoch": 1.3160377358490565, + "grad_norm": 0.3702462315559387, + "learning_rate": 6.894376581274578e-06, + "loss": 0.4284, + "step": 1581 + }, + { + "epoch": 1.3168701442841289, + "grad_norm": 0.3988160490989685, + "learning_rate": 6.889891763217307e-06, + "loss": 0.4043, + "step": 1582 + }, + { + "epoch": 1.3177025527192008, + "grad_norm": 0.3909897208213806, + "learning_rate": 6.8854051705185825e-06, + "loss": 0.4282, + "step": 1583 + }, + { + "epoch": 1.3185349611542732, + "grad_norm": 0.37735849618911743, + "learning_rate": 6.880916807391388e-06, + "loss": 0.4185, + "step": 1584 + }, + { + "epoch": 1.319367369589345, + "grad_norm": 0.4028033912181854, + "learning_rate": 6.876426678050379e-06, + "loss": 0.427, + "step": 1585 + }, + { + "epoch": 1.3201997780244172, + "grad_norm": 0.3585635721683502, + "learning_rate": 6.871934786711866e-06, + "loss": 0.401, + "step": 1586 + }, + { + "epoch": 1.3210321864594894, + "grad_norm": 0.36207467317581177, + "learning_rate": 6.86744113759381e-06, + "loss": 0.4119, + "step": 1587 + }, + { + "epoch": 1.3218645948945615, + "grad_norm": 0.3779536485671997, + "learning_rate": 6.862945734915829e-06, + "loss": 0.4531, + "step": 1588 + }, + { + "epoch": 1.3226970033296337, + "grad_norm": 0.38031184673309326, + "learning_rate": 6.858448582899183e-06, + "loss": 0.4332, + "step": 1589 + }, + { + "epoch": 1.3235294117647058, + "grad_norm": 0.3429410755634308, + "learning_rate": 6.8539496857667785e-06, + "loss": 0.3944, + "step": 1590 + }, + { + "epoch": 1.324361820199778, + "grad_norm": 0.3546883463859558, + "learning_rate": 6.849449047743158e-06, + "loss": 0.4342, + "step": 1591 + }, + { + "epoch": 1.3251942286348501, + "grad_norm": 0.39290136098861694, + "learning_rate": 6.844946673054498e-06, + "loss": 0.431, + "step": 1592 + }, + { + "epoch": 1.3260266370699223, + "grad_norm": 0.3893829584121704, + "learning_rate": 6.840442565928609e-06, + "loss": 0.4844, + "step": 1593 + }, + { + "epoch": 1.3268590455049944, + "grad_norm": 0.344115287065506, + "learning_rate": 6.8359367305949256e-06, + "loss": 0.4036, + "step": 1594 + }, + { + "epoch": 1.3276914539400666, + "grad_norm": 0.3980282247066498, + "learning_rate": 6.831429171284506e-06, + "loss": 0.4888, + "step": 1595 + }, + { + "epoch": 1.3285238623751388, + "grad_norm": 0.36405783891677856, + "learning_rate": 6.8269198922300274e-06, + "loss": 0.4067, + "step": 1596 + }, + { + "epoch": 1.329356270810211, + "grad_norm": 0.3426407277584076, + "learning_rate": 6.822408897665782e-06, + "loss": 0.3926, + "step": 1597 + }, + { + "epoch": 1.330188679245283, + "grad_norm": 0.35977932810783386, + "learning_rate": 6.817896191827673e-06, + "loss": 0.4423, + "step": 1598 + }, + { + "epoch": 1.3310210876803552, + "grad_norm": 0.34656643867492676, + "learning_rate": 6.81338177895321e-06, + "loss": 0.4234, + "step": 1599 + }, + { + "epoch": 1.3318534961154274, + "grad_norm": 0.4227568507194519, + "learning_rate": 6.808865663281504e-06, + "loss": 0.4866, + "step": 1600 + }, + { + "epoch": 1.3326859045504995, + "grad_norm": 0.3399163782596588, + "learning_rate": 6.8043478490532695e-06, + "loss": 0.4099, + "step": 1601 + }, + { + "epoch": 1.3335183129855717, + "grad_norm": 0.37561044096946716, + "learning_rate": 6.799828340510811e-06, + "loss": 0.4149, + "step": 1602 + }, + { + "epoch": 1.3343507214206438, + "grad_norm": 0.40371450781822205, + "learning_rate": 6.795307141898027e-06, + "loss": 0.3866, + "step": 1603 + }, + { + "epoch": 1.3351831298557157, + "grad_norm": 0.4308793842792511, + "learning_rate": 6.790784257460403e-06, + "loss": 0.4635, + "step": 1604 + }, + { + "epoch": 1.3360155382907881, + "grad_norm": 0.33165696263313293, + "learning_rate": 6.786259691445005e-06, + "loss": 0.3694, + "step": 1605 + }, + { + "epoch": 1.33684794672586, + "grad_norm": 0.37500032782554626, + "learning_rate": 6.781733448100482e-06, + "loss": 0.4279, + "step": 1606 + }, + { + "epoch": 1.3376803551609324, + "grad_norm": 0.3972361981868744, + "learning_rate": 6.777205531677052e-06, + "loss": 0.4096, + "step": 1607 + }, + { + "epoch": 1.3385127635960044, + "grad_norm": 0.41537413001060486, + "learning_rate": 6.772675946426511e-06, + "loss": 0.4562, + "step": 1608 + }, + { + "epoch": 1.3393451720310765, + "grad_norm": 0.3932049870491028, + "learning_rate": 6.768144696602219e-06, + "loss": 0.4244, + "step": 1609 + }, + { + "epoch": 1.3401775804661487, + "grad_norm": 0.3874600827693939, + "learning_rate": 6.763611786459097e-06, + "loss": 0.4485, + "step": 1610 + }, + { + "epoch": 1.3410099889012208, + "grad_norm": 0.3644678294658661, + "learning_rate": 6.759077220253628e-06, + "loss": 0.3804, + "step": 1611 + }, + { + "epoch": 1.341842397336293, + "grad_norm": 0.36026453971862793, + "learning_rate": 6.7545410022438495e-06, + "loss": 0.4498, + "step": 1612 + }, + { + "epoch": 1.342674805771365, + "grad_norm": 0.3394271433353424, + "learning_rate": 6.750003136689349e-06, + "loss": 0.4059, + "step": 1613 + }, + { + "epoch": 1.3435072142064373, + "grad_norm": 0.3929471969604492, + "learning_rate": 6.745463627851261e-06, + "loss": 0.4259, + "step": 1614 + }, + { + "epoch": 1.3443396226415094, + "grad_norm": 0.34686771035194397, + "learning_rate": 6.740922479992264e-06, + "loss": 0.4158, + "step": 1615 + }, + { + "epoch": 1.3451720310765816, + "grad_norm": 0.37497010827064514, + "learning_rate": 6.736379697376578e-06, + "loss": 0.3876, + "step": 1616 + }, + { + "epoch": 1.3460044395116537, + "grad_norm": 0.4034978449344635, + "learning_rate": 6.731835284269952e-06, + "loss": 0.4623, + "step": 1617 + }, + { + "epoch": 1.3468368479467259, + "grad_norm": 0.37284499406814575, + "learning_rate": 6.727289244939671e-06, + "loss": 0.3982, + "step": 1618 + }, + { + "epoch": 1.347669256381798, + "grad_norm": 0.37045595049858093, + "learning_rate": 6.722741583654545e-06, + "loss": 0.4012, + "step": 1619 + }, + { + "epoch": 1.3485016648168702, + "grad_norm": 0.39600858092308044, + "learning_rate": 6.718192304684909e-06, + "loss": 0.4241, + "step": 1620 + }, + { + "epoch": 1.3493340732519423, + "grad_norm": 0.35542434453964233, + "learning_rate": 6.713641412302614e-06, + "loss": 0.4276, + "step": 1621 + }, + { + "epoch": 1.3501664816870145, + "grad_norm": 0.40306901931762695, + "learning_rate": 6.7090889107810275e-06, + "loss": 0.4232, + "step": 1622 + }, + { + "epoch": 1.3509988901220866, + "grad_norm": 0.4040415287017822, + "learning_rate": 6.704534804395029e-06, + "loss": 0.428, + "step": 1623 + }, + { + "epoch": 1.3518312985571588, + "grad_norm": 0.41643235087394714, + "learning_rate": 6.699979097421004e-06, + "loss": 0.4166, + "step": 1624 + }, + { + "epoch": 1.352663706992231, + "grad_norm": 0.38821861147880554, + "learning_rate": 6.695421794136843e-06, + "loss": 0.4237, + "step": 1625 + }, + { + "epoch": 1.353496115427303, + "grad_norm": 0.42019572854042053, + "learning_rate": 6.690862898821928e-06, + "loss": 0.4018, + "step": 1626 + }, + { + "epoch": 1.354328523862375, + "grad_norm": 0.41116052865982056, + "learning_rate": 6.686302415757149e-06, + "loss": 0.4343, + "step": 1627 + }, + { + "epoch": 1.3551609322974474, + "grad_norm": 0.346463143825531, + "learning_rate": 6.681740349224873e-06, + "loss": 0.4361, + "step": 1628 + }, + { + "epoch": 1.3559933407325193, + "grad_norm": 0.4325784742832184, + "learning_rate": 6.677176703508963e-06, + "loss": 0.4472, + "step": 1629 + }, + { + "epoch": 1.3568257491675917, + "grad_norm": 0.3932659924030304, + "learning_rate": 6.672611482894763e-06, + "loss": 0.4421, + "step": 1630 + }, + { + "epoch": 1.3576581576026636, + "grad_norm": 0.3726898729801178, + "learning_rate": 6.668044691669094e-06, + "loss": 0.4374, + "step": 1631 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 0.3914749324321747, + "learning_rate": 6.663476334120254e-06, + "loss": 0.4275, + "step": 1632 + }, + { + "epoch": 1.359322974472808, + "grad_norm": 0.3771442174911499, + "learning_rate": 6.658906414538009e-06, + "loss": 0.4356, + "step": 1633 + }, + { + "epoch": 1.36015538290788, + "grad_norm": 0.41853395104408264, + "learning_rate": 6.6543349372135946e-06, + "loss": 0.4489, + "step": 1634 + }, + { + "epoch": 1.3609877913429522, + "grad_norm": 0.3918192982673645, + "learning_rate": 6.649761906439708e-06, + "loss": 0.4469, + "step": 1635 + }, + { + "epoch": 1.3618201997780244, + "grad_norm": 0.3729172348976135, + "learning_rate": 6.6451873265105045e-06, + "loss": 0.4394, + "step": 1636 + }, + { + "epoch": 1.3626526082130965, + "grad_norm": 0.45446979999542236, + "learning_rate": 6.6406112017215966e-06, + "loss": 0.4286, + "step": 1637 + }, + { + "epoch": 1.3634850166481687, + "grad_norm": 0.4050742983818054, + "learning_rate": 6.6360335363700435e-06, + "loss": 0.4214, + "step": 1638 + }, + { + "epoch": 1.3643174250832408, + "grad_norm": 0.3428737223148346, + "learning_rate": 6.631454334754353e-06, + "loss": 0.4248, + "step": 1639 + }, + { + "epoch": 1.365149833518313, + "grad_norm": 0.44034427404403687, + "learning_rate": 6.626873601174478e-06, + "loss": 0.4406, + "step": 1640 + }, + { + "epoch": 1.3659822419533851, + "grad_norm": 0.38823384046554565, + "learning_rate": 6.622291339931806e-06, + "loss": 0.4313, + "step": 1641 + }, + { + "epoch": 1.3668146503884573, + "grad_norm": 0.36457860469818115, + "learning_rate": 6.61770755532916e-06, + "loss": 0.4313, + "step": 1642 + }, + { + "epoch": 1.3676470588235294, + "grad_norm": 0.3769315481185913, + "learning_rate": 6.613122251670795e-06, + "loss": 0.4295, + "step": 1643 + }, + { + "epoch": 1.3684794672586016, + "grad_norm": 0.35854166746139526, + "learning_rate": 6.608535433262391e-06, + "loss": 0.4386, + "step": 1644 + }, + { + "epoch": 1.3693118756936737, + "grad_norm": 0.33545204997062683, + "learning_rate": 6.60394710441105e-06, + "loss": 0.3737, + "step": 1645 + }, + { + "epoch": 1.370144284128746, + "grad_norm": 0.37274548411369324, + "learning_rate": 6.599357269425294e-06, + "loss": 0.4362, + "step": 1646 + }, + { + "epoch": 1.370976692563818, + "grad_norm": 0.3416080176830292, + "learning_rate": 6.594765932615059e-06, + "loss": 0.4346, + "step": 1647 + }, + { + "epoch": 1.3718091009988902, + "grad_norm": 0.3786942660808563, + "learning_rate": 6.59017309829169e-06, + "loss": 0.4237, + "step": 1648 + }, + { + "epoch": 1.3726415094339623, + "grad_norm": 0.36071640253067017, + "learning_rate": 6.585578770767939e-06, + "loss": 0.4231, + "step": 1649 + }, + { + "epoch": 1.3734739178690343, + "grad_norm": 0.3784758746623993, + "learning_rate": 6.5809829543579595e-06, + "loss": 0.458, + "step": 1650 + }, + { + "epoch": 1.3743063263041067, + "grad_norm": 0.31035900115966797, + "learning_rate": 6.576385653377303e-06, + "loss": 0.3688, + "step": 1651 + }, + { + "epoch": 1.3751387347391786, + "grad_norm": 0.3701719641685486, + "learning_rate": 6.5717868721429175e-06, + "loss": 0.4114, + "step": 1652 + }, + { + "epoch": 1.375971143174251, + "grad_norm": 0.42909175157546997, + "learning_rate": 6.56718661497314e-06, + "loss": 0.4538, + "step": 1653 + }, + { + "epoch": 1.3768035516093229, + "grad_norm": 0.3627909719944, + "learning_rate": 6.562584886187687e-06, + "loss": 0.4097, + "step": 1654 + }, + { + "epoch": 1.377635960044395, + "grad_norm": 0.36822018027305603, + "learning_rate": 6.557981690107669e-06, + "loss": 0.4246, + "step": 1655 + }, + { + "epoch": 1.3784683684794672, + "grad_norm": 0.3948631286621094, + "learning_rate": 6.553377031055564e-06, + "loss": 0.4232, + "step": 1656 + }, + { + "epoch": 1.3793007769145393, + "grad_norm": 0.4063318371772766, + "learning_rate": 6.5487709133552275e-06, + "loss": 0.4494, + "step": 1657 + }, + { + "epoch": 1.3801331853496115, + "grad_norm": 0.36222216486930847, + "learning_rate": 6.544163341331886e-06, + "loss": 0.3822, + "step": 1658 + }, + { + "epoch": 1.3809655937846836, + "grad_norm": 0.37411925196647644, + "learning_rate": 6.539554319312129e-06, + "loss": 0.4213, + "step": 1659 + }, + { + "epoch": 1.3817980022197558, + "grad_norm": 0.39759203791618347, + "learning_rate": 6.534943851623911e-06, + "loss": 0.4124, + "step": 1660 + }, + { + "epoch": 1.382630410654828, + "grad_norm": 0.3889487087726593, + "learning_rate": 6.530331942596539e-06, + "loss": 0.4449, + "step": 1661 + }, + { + "epoch": 1.3834628190899, + "grad_norm": 0.3533934950828552, + "learning_rate": 6.525718596560679e-06, + "loss": 0.4068, + "step": 1662 + }, + { + "epoch": 1.3842952275249722, + "grad_norm": 0.4298970699310303, + "learning_rate": 6.521103817848342e-06, + "loss": 0.4447, + "step": 1663 + }, + { + "epoch": 1.3851276359600444, + "grad_norm": 0.3471847176551819, + "learning_rate": 6.516487610792888e-06, + "loss": 0.4082, + "step": 1664 + }, + { + "epoch": 1.3859600443951166, + "grad_norm": 0.3483141362667084, + "learning_rate": 6.511869979729013e-06, + "loss": 0.4342, + "step": 1665 + }, + { + "epoch": 1.3867924528301887, + "grad_norm": 0.34657421708106995, + "learning_rate": 6.507250928992757e-06, + "loss": 0.3625, + "step": 1666 + }, + { + "epoch": 1.3876248612652609, + "grad_norm": 0.3735605478286743, + "learning_rate": 6.5026304629214846e-06, + "loss": 0.4006, + "step": 1667 + }, + { + "epoch": 1.388457269700333, + "grad_norm": 0.35017845034599304, + "learning_rate": 6.498008585853901e-06, + "loss": 0.4373, + "step": 1668 + }, + { + "epoch": 1.3892896781354052, + "grad_norm": 0.3881807029247284, + "learning_rate": 6.493385302130023e-06, + "loss": 0.3704, + "step": 1669 + }, + { + "epoch": 1.3901220865704773, + "grad_norm": 0.3898671567440033, + "learning_rate": 6.488760616091201e-06, + "loss": 0.4166, + "step": 1670 + }, + { + "epoch": 1.3909544950055495, + "grad_norm": 0.3844543695449829, + "learning_rate": 6.484134532080091e-06, + "loss": 0.444, + "step": 1671 + }, + { + "epoch": 1.3917869034406216, + "grad_norm": 0.3733096718788147, + "learning_rate": 6.479507054440671e-06, + "loss": 0.4312, + "step": 1672 + }, + { + "epoch": 1.3926193118756935, + "grad_norm": 0.3414270281791687, + "learning_rate": 6.474878187518221e-06, + "loss": 0.4285, + "step": 1673 + }, + { + "epoch": 1.393451720310766, + "grad_norm": 0.3758377134799957, + "learning_rate": 6.470247935659328e-06, + "loss": 0.4341, + "step": 1674 + }, + { + "epoch": 1.3942841287458378, + "grad_norm": 0.35513973236083984, + "learning_rate": 6.465616303211881e-06, + "loss": 0.4031, + "step": 1675 + }, + { + "epoch": 1.3951165371809102, + "grad_norm": 0.36078712344169617, + "learning_rate": 6.460983294525064e-06, + "loss": 0.4193, + "step": 1676 + }, + { + "epoch": 1.3959489456159822, + "grad_norm": 0.3634653091430664, + "learning_rate": 6.456348913949352e-06, + "loss": 0.4201, + "step": 1677 + }, + { + "epoch": 1.3967813540510543, + "grad_norm": 0.36076661944389343, + "learning_rate": 6.451713165836511e-06, + "loss": 0.3709, + "step": 1678 + }, + { + "epoch": 1.3976137624861265, + "grad_norm": 0.3891527056694031, + "learning_rate": 6.447076054539588e-06, + "loss": 0.4665, + "step": 1679 + }, + { + "epoch": 1.3984461709211986, + "grad_norm": 0.32691043615341187, + "learning_rate": 6.442437584412912e-06, + "loss": 0.3864, + "step": 1680 + }, + { + "epoch": 1.3992785793562708, + "grad_norm": 0.37780606746673584, + "learning_rate": 6.43779775981209e-06, + "loss": 0.4385, + "step": 1681 + }, + { + "epoch": 1.400110987791343, + "grad_norm": 0.3391205966472626, + "learning_rate": 6.433156585093994e-06, + "loss": 0.4281, + "step": 1682 + }, + { + "epoch": 1.400943396226415, + "grad_norm": 0.32850903272628784, + "learning_rate": 6.4285140646167735e-06, + "loss": 0.4434, + "step": 1683 + }, + { + "epoch": 1.4017758046614872, + "grad_norm": 0.3246311843395233, + "learning_rate": 6.423870202739831e-06, + "loss": 0.3899, + "step": 1684 + }, + { + "epoch": 1.4026082130965594, + "grad_norm": 0.36959534883499146, + "learning_rate": 6.41922500382384e-06, + "loss": 0.4113, + "step": 1685 + }, + { + "epoch": 1.4034406215316315, + "grad_norm": 0.3844015598297119, + "learning_rate": 6.414578472230719e-06, + "loss": 0.4143, + "step": 1686 + }, + { + "epoch": 1.4042730299667037, + "grad_norm": 0.3275579512119293, + "learning_rate": 6.409930612323646e-06, + "loss": 0.4199, + "step": 1687 + }, + { + "epoch": 1.4051054384017758, + "grad_norm": 0.388668030500412, + "learning_rate": 6.405281428467041e-06, + "loss": 0.4404, + "step": 1688 + }, + { + "epoch": 1.405937846836848, + "grad_norm": 0.375386506319046, + "learning_rate": 6.400630925026568e-06, + "loss": 0.4471, + "step": 1689 + }, + { + "epoch": 1.4067702552719201, + "grad_norm": 0.3668792247772217, + "learning_rate": 6.395979106369132e-06, + "loss": 0.4422, + "step": 1690 + }, + { + "epoch": 1.4076026637069923, + "grad_norm": 0.34209975600242615, + "learning_rate": 6.391325976862872e-06, + "loss": 0.3815, + "step": 1691 + }, + { + "epoch": 1.4084350721420644, + "grad_norm": 0.33450645208358765, + "learning_rate": 6.386671540877162e-06, + "loss": 0.3906, + "step": 1692 + }, + { + "epoch": 1.4092674805771366, + "grad_norm": 0.34266212582588196, + "learning_rate": 6.382015802782592e-06, + "loss": 0.3992, + "step": 1693 + }, + { + "epoch": 1.4100998890122087, + "grad_norm": 0.38996613025665283, + "learning_rate": 6.377358766950987e-06, + "loss": 0.4352, + "step": 1694 + }, + { + "epoch": 1.4109322974472809, + "grad_norm": 0.3618418872356415, + "learning_rate": 6.372700437755381e-06, + "loss": 0.4297, + "step": 1695 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.36019423604011536, + "learning_rate": 6.368040819570032e-06, + "loss": 0.4046, + "step": 1696 + }, + { + "epoch": 1.4125971143174252, + "grad_norm": 0.3539135158061981, + "learning_rate": 6.3633799167703954e-06, + "loss": 0.4039, + "step": 1697 + }, + { + "epoch": 1.4134295227524971, + "grad_norm": 0.37196677923202515, + "learning_rate": 6.35871773373315e-06, + "loss": 0.4241, + "step": 1698 + }, + { + "epoch": 1.4142619311875695, + "grad_norm": 0.33772680163383484, + "learning_rate": 6.3540542748361585e-06, + "loss": 0.4144, + "step": 1699 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 0.36471793055534363, + "learning_rate": 6.349389544458497e-06, + "loss": 0.4414, + "step": 1700 + }, + { + "epoch": 1.4159267480577136, + "grad_norm": 0.39760175347328186, + "learning_rate": 6.3447235469804255e-06, + "loss": 0.4318, + "step": 1701 + }, + { + "epoch": 1.4167591564927857, + "grad_norm": 0.34777477383613586, + "learning_rate": 6.3400562867833984e-06, + "loss": 0.406, + "step": 1702 + }, + { + "epoch": 1.4175915649278579, + "grad_norm": 0.37017935514450073, + "learning_rate": 6.335387768250054e-06, + "loss": 0.4616, + "step": 1703 + }, + { + "epoch": 1.41842397336293, + "grad_norm": 0.36705178022384644, + "learning_rate": 6.330717995764215e-06, + "loss": 0.4275, + "step": 1704 + }, + { + "epoch": 1.4192563817980022, + "grad_norm": 0.34226226806640625, + "learning_rate": 6.326046973710878e-06, + "loss": 0.428, + "step": 1705 + }, + { + "epoch": 1.4200887902330743, + "grad_norm": 0.3641026020050049, + "learning_rate": 6.321374706476212e-06, + "loss": 0.3847, + "step": 1706 + }, + { + "epoch": 1.4209211986681465, + "grad_norm": 0.39205509424209595, + "learning_rate": 6.316701198447562e-06, + "loss": 0.4306, + "step": 1707 + }, + { + "epoch": 1.4217536071032186, + "grad_norm": 0.32389676570892334, + "learning_rate": 6.312026454013431e-06, + "loss": 0.4076, + "step": 1708 + }, + { + "epoch": 1.4225860155382908, + "grad_norm": 0.35079190135002136, + "learning_rate": 6.3073504775634884e-06, + "loss": 0.4088, + "step": 1709 + }, + { + "epoch": 1.423418423973363, + "grad_norm": 0.3681551218032837, + "learning_rate": 6.302673273488556e-06, + "loss": 0.4128, + "step": 1710 + }, + { + "epoch": 1.424250832408435, + "grad_norm": 0.3662969470024109, + "learning_rate": 6.297994846180611e-06, + "loss": 0.4279, + "step": 1711 + }, + { + "epoch": 1.4250832408435072, + "grad_norm": 0.5505783557891846, + "learning_rate": 6.293315200032777e-06, + "loss": 0.4447, + "step": 1712 + }, + { + "epoch": 1.4259156492785794, + "grad_norm": 0.35179439187049866, + "learning_rate": 6.288634339439328e-06, + "loss": 0.4202, + "step": 1713 + }, + { + "epoch": 1.4267480577136515, + "grad_norm": 0.3551366329193115, + "learning_rate": 6.283952268795669e-06, + "loss": 0.435, + "step": 1714 + }, + { + "epoch": 1.4275804661487237, + "grad_norm": 0.4332524240016937, + "learning_rate": 6.279268992498349e-06, + "loss": 0.4654, + "step": 1715 + }, + { + "epoch": 1.4284128745837958, + "grad_norm": 0.35650166869163513, + "learning_rate": 6.274584514945046e-06, + "loss": 0.4416, + "step": 1716 + }, + { + "epoch": 1.429245283018868, + "grad_norm": 0.3940298855304718, + "learning_rate": 6.269898840534566e-06, + "loss": 0.4315, + "step": 1717 + }, + { + "epoch": 1.4300776914539401, + "grad_norm": 0.3638506829738617, + "learning_rate": 6.26521197366684e-06, + "loss": 0.4081, + "step": 1718 + }, + { + "epoch": 1.430910099889012, + "grad_norm": 0.4078425168991089, + "learning_rate": 6.2605239187429175e-06, + "loss": 0.472, + "step": 1719 + }, + { + "epoch": 1.4317425083240845, + "grad_norm": 0.42004722356796265, + "learning_rate": 6.255834680164966e-06, + "loss": 0.3941, + "step": 1720 + }, + { + "epoch": 1.4325749167591564, + "grad_norm": 0.38885217905044556, + "learning_rate": 6.2511442623362585e-06, + "loss": 0.4549, + "step": 1721 + }, + { + "epoch": 1.4334073251942288, + "grad_norm": 0.3824394643306732, + "learning_rate": 6.246452669661184e-06, + "loss": 0.4207, + "step": 1722 + }, + { + "epoch": 1.4342397336293007, + "grad_norm": 0.39993953704833984, + "learning_rate": 6.241759906545226e-06, + "loss": 0.4386, + "step": 1723 + }, + { + "epoch": 1.435072142064373, + "grad_norm": 0.34289178252220154, + "learning_rate": 6.237065977394976e-06, + "loss": 0.4224, + "step": 1724 + }, + { + "epoch": 1.435904550499445, + "grad_norm": 0.3350009024143219, + "learning_rate": 6.23237088661811e-06, + "loss": 0.4408, + "step": 1725 + }, + { + "epoch": 1.4367369589345171, + "grad_norm": 0.337027907371521, + "learning_rate": 6.227674638623406e-06, + "loss": 0.3893, + "step": 1726 + }, + { + "epoch": 1.4375693673695893, + "grad_norm": 0.37013328075408936, + "learning_rate": 6.22297723782072e-06, + "loss": 0.4376, + "step": 1727 + }, + { + "epoch": 1.4384017758046614, + "grad_norm": 0.3607780337333679, + "learning_rate": 6.218278688620994e-06, + "loss": 0.4236, + "step": 1728 + }, + { + "epoch": 1.4392341842397336, + "grad_norm": 0.3762837052345276, + "learning_rate": 6.213578995436248e-06, + "loss": 0.4209, + "step": 1729 + }, + { + "epoch": 1.4400665926748057, + "grad_norm": 0.365888774394989, + "learning_rate": 6.208878162679577e-06, + "loss": 0.4778, + "step": 1730 + }, + { + "epoch": 1.440899001109878, + "grad_norm": 0.37897226214408875, + "learning_rate": 6.204176194765143e-06, + "loss": 0.4331, + "step": 1731 + }, + { + "epoch": 1.44173140954495, + "grad_norm": 0.3936547040939331, + "learning_rate": 6.199473096108179e-06, + "loss": 0.4007, + "step": 1732 + }, + { + "epoch": 1.4425638179800222, + "grad_norm": 0.33423808217048645, + "learning_rate": 6.194768871124976e-06, + "loss": 0.3973, + "step": 1733 + }, + { + "epoch": 1.4433962264150944, + "grad_norm": 0.395235538482666, + "learning_rate": 6.190063524232883e-06, + "loss": 0.4326, + "step": 1734 + }, + { + "epoch": 1.4442286348501665, + "grad_norm": 0.35545146465301514, + "learning_rate": 6.1853570598503045e-06, + "loss": 0.3881, + "step": 1735 + }, + { + "epoch": 1.4450610432852387, + "grad_norm": 0.36360153555870056, + "learning_rate": 6.18064948239669e-06, + "loss": 0.3702, + "step": 1736 + }, + { + "epoch": 1.4458934517203108, + "grad_norm": 0.4225115478038788, + "learning_rate": 6.175940796292541e-06, + "loss": 0.4497, + "step": 1737 + }, + { + "epoch": 1.446725860155383, + "grad_norm": 0.3660155236721039, + "learning_rate": 6.171231005959393e-06, + "loss": 0.4226, + "step": 1738 + }, + { + "epoch": 1.447558268590455, + "grad_norm": 0.38432952761650085, + "learning_rate": 6.166520115819825e-06, + "loss": 0.4009, + "step": 1739 + }, + { + "epoch": 1.4483906770255273, + "grad_norm": 0.3980556130409241, + "learning_rate": 6.161808130297442e-06, + "loss": 0.4006, + "step": 1740 + }, + { + "epoch": 1.4492230854605994, + "grad_norm": 0.34962958097457886, + "learning_rate": 6.157095053816882e-06, + "loss": 0.4289, + "step": 1741 + }, + { + "epoch": 1.4500554938956713, + "grad_norm": 0.3962990939617157, + "learning_rate": 6.152380890803806e-06, + "loss": 0.4005, + "step": 1742 + }, + { + "epoch": 1.4508879023307437, + "grad_norm": 0.3911704123020172, + "learning_rate": 6.147665645684897e-06, + "loss": 0.4301, + "step": 1743 + }, + { + "epoch": 1.4517203107658156, + "grad_norm": 0.35797592997550964, + "learning_rate": 6.142949322887852e-06, + "loss": 0.4265, + "step": 1744 + }, + { + "epoch": 1.452552719200888, + "grad_norm": 0.39295870065689087, + "learning_rate": 6.138231926841381e-06, + "loss": 0.3893, + "step": 1745 + }, + { + "epoch": 1.45338512763596, + "grad_norm": 0.387833833694458, + "learning_rate": 6.1335134619751994e-06, + "loss": 0.4233, + "step": 1746 + }, + { + "epoch": 1.4542175360710323, + "grad_norm": 0.3874453902244568, + "learning_rate": 6.128793932720031e-06, + "loss": 0.4878, + "step": 1747 + }, + { + "epoch": 1.4550499445061043, + "grad_norm": 0.4330272078514099, + "learning_rate": 6.1240733435075946e-06, + "loss": 0.4074, + "step": 1748 + }, + { + "epoch": 1.4558823529411764, + "grad_norm": 0.3374917507171631, + "learning_rate": 6.119351698770607e-06, + "loss": 0.4121, + "step": 1749 + }, + { + "epoch": 1.4567147613762486, + "grad_norm": 0.37870532274246216, + "learning_rate": 6.1146290029427755e-06, + "loss": 0.4317, + "step": 1750 + }, + { + "epoch": 1.4575471698113207, + "grad_norm": 0.42150020599365234, + "learning_rate": 6.1099052604587935e-06, + "loss": 0.4376, + "step": 1751 + }, + { + "epoch": 1.4583795782463929, + "grad_norm": 0.3680340051651001, + "learning_rate": 6.105180475754341e-06, + "loss": 0.4167, + "step": 1752 + }, + { + "epoch": 1.459211986681465, + "grad_norm": 0.32261571288108826, + "learning_rate": 6.100454653266068e-06, + "loss": 0.3828, + "step": 1753 + }, + { + "epoch": 1.4600443951165372, + "grad_norm": 0.3715158700942993, + "learning_rate": 6.095727797431607e-06, + "loss": 0.4435, + "step": 1754 + }, + { + "epoch": 1.4608768035516093, + "grad_norm": 0.3581849932670593, + "learning_rate": 6.0909999126895605e-06, + "loss": 0.431, + "step": 1755 + }, + { + "epoch": 1.4617092119866815, + "grad_norm": 0.3825138509273529, + "learning_rate": 6.086271003479492e-06, + "loss": 0.4651, + "step": 1756 + }, + { + "epoch": 1.4625416204217536, + "grad_norm": 0.38319095969200134, + "learning_rate": 6.081541074241932e-06, + "loss": 0.4446, + "step": 1757 + }, + { + "epoch": 1.4633740288568258, + "grad_norm": 0.34215807914733887, + "learning_rate": 6.076810129418367e-06, + "loss": 0.3892, + "step": 1758 + }, + { + "epoch": 1.464206437291898, + "grad_norm": 0.3696172833442688, + "learning_rate": 6.072078173451235e-06, + "loss": 0.4422, + "step": 1759 + }, + { + "epoch": 1.46503884572697, + "grad_norm": 0.42305469512939453, + "learning_rate": 6.067345210783927e-06, + "loss": 0.4343, + "step": 1760 + }, + { + "epoch": 1.4658712541620422, + "grad_norm": 0.4272174537181854, + "learning_rate": 6.062611245860778e-06, + "loss": 0.416, + "step": 1761 + }, + { + "epoch": 1.4667036625971144, + "grad_norm": 0.33553504943847656, + "learning_rate": 6.057876283127062e-06, + "loss": 0.3888, + "step": 1762 + }, + { + "epoch": 1.4675360710321865, + "grad_norm": 0.4211355149745941, + "learning_rate": 6.053140327028996e-06, + "loss": 0.4565, + "step": 1763 + }, + { + "epoch": 1.4683684794672587, + "grad_norm": 0.4059634208679199, + "learning_rate": 6.048403382013721e-06, + "loss": 0.4007, + "step": 1764 + }, + { + "epoch": 1.4692008879023306, + "grad_norm": 0.3843512535095215, + "learning_rate": 6.043665452529315e-06, + "loss": 0.4307, + "step": 1765 + }, + { + "epoch": 1.470033296337403, + "grad_norm": 0.44470739364624023, + "learning_rate": 6.038926543024774e-06, + "loss": 0.4408, + "step": 1766 + }, + { + "epoch": 1.470865704772475, + "grad_norm": 0.3952719271183014, + "learning_rate": 6.034186657950019e-06, + "loss": 0.3943, + "step": 1767 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 0.4131404161453247, + "learning_rate": 6.029445801755884e-06, + "loss": 0.4505, + "step": 1768 + }, + { + "epoch": 1.4725305216426192, + "grad_norm": 0.4458027184009552, + "learning_rate": 6.024703978894118e-06, + "loss": 0.4549, + "step": 1769 + }, + { + "epoch": 1.4733629300776916, + "grad_norm": 0.3414630889892578, + "learning_rate": 6.019961193817371e-06, + "loss": 0.3777, + "step": 1770 + }, + { + "epoch": 1.4741953385127635, + "grad_norm": 0.3965328633785248, + "learning_rate": 6.015217450979206e-06, + "loss": 0.4192, + "step": 1771 + }, + { + "epoch": 1.4750277469478357, + "grad_norm": 0.4391860365867615, + "learning_rate": 6.010472754834078e-06, + "loss": 0.471, + "step": 1772 + }, + { + "epoch": 1.4758601553829078, + "grad_norm": 0.34298470616340637, + "learning_rate": 6.00572710983734e-06, + "loss": 0.4105, + "step": 1773 + }, + { + "epoch": 1.47669256381798, + "grad_norm": 0.37611621618270874, + "learning_rate": 6.000980520445237e-06, + "loss": 0.3869, + "step": 1774 + }, + { + "epoch": 1.4775249722530521, + "grad_norm": 0.3933986723423004, + "learning_rate": 5.9962329911148985e-06, + "loss": 0.4365, + "step": 1775 + }, + { + "epoch": 1.4783573806881243, + "grad_norm": 0.36345475912094116, + "learning_rate": 5.991484526304338e-06, + "loss": 0.401, + "step": 1776 + }, + { + "epoch": 1.4791897891231964, + "grad_norm": 0.4031011462211609, + "learning_rate": 5.986735130472449e-06, + "loss": 0.4051, + "step": 1777 + }, + { + "epoch": 1.4800221975582686, + "grad_norm": 0.3900188207626343, + "learning_rate": 5.981984808078993e-06, + "loss": 0.4398, + "step": 1778 + }, + { + "epoch": 1.4808546059933407, + "grad_norm": 0.3524153232574463, + "learning_rate": 5.97723356358461e-06, + "loss": 0.4145, + "step": 1779 + }, + { + "epoch": 1.4816870144284129, + "grad_norm": 0.3924466669559479, + "learning_rate": 5.972481401450798e-06, + "loss": 0.4317, + "step": 1780 + }, + { + "epoch": 1.482519422863485, + "grad_norm": 0.33220553398132324, + "learning_rate": 5.967728326139926e-06, + "loss": 0.4181, + "step": 1781 + }, + { + "epoch": 1.4833518312985572, + "grad_norm": 0.39167076349258423, + "learning_rate": 5.962974342115209e-06, + "loss": 0.4228, + "step": 1782 + }, + { + "epoch": 1.4841842397336293, + "grad_norm": 0.3629782497882843, + "learning_rate": 5.9582194538407235e-06, + "loss": 0.4085, + "step": 1783 + }, + { + "epoch": 1.4850166481687015, + "grad_norm": 0.3289967179298401, + "learning_rate": 5.9534636657813935e-06, + "loss": 0.4189, + "step": 1784 + }, + { + "epoch": 1.4858490566037736, + "grad_norm": 0.36576929688453674, + "learning_rate": 5.948706982402987e-06, + "loss": 0.452, + "step": 1785 + }, + { + "epoch": 1.4866814650388458, + "grad_norm": 0.3725149631500244, + "learning_rate": 5.9439494081721125e-06, + "loss": 0.4378, + "step": 1786 + }, + { + "epoch": 1.487513873473918, + "grad_norm": 0.3603670299053192, + "learning_rate": 5.939190947556216e-06, + "loss": 0.4281, + "step": 1787 + }, + { + "epoch": 1.4883462819089899, + "grad_norm": 0.35488083958625793, + "learning_rate": 5.934431605023575e-06, + "loss": 0.4333, + "step": 1788 + }, + { + "epoch": 1.4891786903440623, + "grad_norm": 0.33910173177719116, + "learning_rate": 5.929671385043296e-06, + "loss": 0.3941, + "step": 1789 + }, + { + "epoch": 1.4900110987791342, + "grad_norm": 0.4183632433414459, + "learning_rate": 5.924910292085308e-06, + "loss": 0.4207, + "step": 1790 + }, + { + "epoch": 1.4908435072142066, + "grad_norm": 0.3865908086299896, + "learning_rate": 5.920148330620362e-06, + "loss": 0.4192, + "step": 1791 + }, + { + "epoch": 1.4916759156492785, + "grad_norm": 0.34254950284957886, + "learning_rate": 5.915385505120024e-06, + "loss": 0.4404, + "step": 1792 + }, + { + "epoch": 1.4925083240843509, + "grad_norm": 0.399553120136261, + "learning_rate": 5.9106218200566646e-06, + "loss": 0.4519, + "step": 1793 + }, + { + "epoch": 1.4933407325194228, + "grad_norm": 0.33408740162849426, + "learning_rate": 5.905857279903475e-06, + "loss": 0.3971, + "step": 1794 + }, + { + "epoch": 1.494173140954495, + "grad_norm": 0.3779832422733307, + "learning_rate": 5.9010918891344375e-06, + "loss": 0.4478, + "step": 1795 + }, + { + "epoch": 1.495005549389567, + "grad_norm": 0.3475707471370697, + "learning_rate": 5.896325652224339e-06, + "loss": 0.4329, + "step": 1796 + }, + { + "epoch": 1.4958379578246392, + "grad_norm": 0.3472834527492523, + "learning_rate": 5.891558573648759e-06, + "loss": 0.4036, + "step": 1797 + }, + { + "epoch": 1.4966703662597114, + "grad_norm": 0.36704128980636597, + "learning_rate": 5.886790657884067e-06, + "loss": 0.4479, + "step": 1798 + }, + { + "epoch": 1.4975027746947835, + "grad_norm": 0.39827099442481995, + "learning_rate": 5.8820219094074215e-06, + "loss": 0.4401, + "step": 1799 + }, + { + "epoch": 1.4983351831298557, + "grad_norm": 0.37675169110298157, + "learning_rate": 5.877252332696759e-06, + "loss": 0.439, + "step": 1800 + }, + { + "epoch": 1.4991675915649278, + "grad_norm": 0.3367178738117218, + "learning_rate": 5.8724819322307955e-06, + "loss": 0.4434, + "step": 1801 + }, + { + "epoch": 1.5, + "grad_norm": 0.3833068013191223, + "learning_rate": 5.8677107124890206e-06, + "loss": 0.4348, + "step": 1802 + }, + { + "epoch": 1.5008324084350722, + "grad_norm": 0.333822101354599, + "learning_rate": 5.862938677951695e-06, + "loss": 0.4245, + "step": 1803 + }, + { + "epoch": 1.5016648168701443, + "grad_norm": 0.33005255460739136, + "learning_rate": 5.85816583309984e-06, + "loss": 0.4149, + "step": 1804 + }, + { + "epoch": 1.5024972253052165, + "grad_norm": 0.3507244288921356, + "learning_rate": 5.853392182415244e-06, + "loss": 0.4029, + "step": 1805 + }, + { + "epoch": 1.5033296337402886, + "grad_norm": 0.34437042474746704, + "learning_rate": 5.848617730380444e-06, + "loss": 0.3865, + "step": 1806 + }, + { + "epoch": 1.5041620421753608, + "grad_norm": 0.33490440249443054, + "learning_rate": 5.843842481478739e-06, + "loss": 0.4468, + "step": 1807 + }, + { + "epoch": 1.504994450610433, + "grad_norm": 0.34904393553733826, + "learning_rate": 5.839066440194165e-06, + "loss": 0.4286, + "step": 1808 + }, + { + "epoch": 1.5058268590455048, + "grad_norm": 0.3498096168041229, + "learning_rate": 5.834289611011515e-06, + "loss": 0.4244, + "step": 1809 + }, + { + "epoch": 1.5066592674805772, + "grad_norm": 0.3397802710533142, + "learning_rate": 5.82951199841631e-06, + "loss": 0.4203, + "step": 1810 + }, + { + "epoch": 1.5074916759156491, + "grad_norm": 0.34732288122177124, + "learning_rate": 5.824733606894818e-06, + "loss": 0.4276, + "step": 1811 + }, + { + "epoch": 1.5083240843507215, + "grad_norm": 0.3793588876724243, + "learning_rate": 5.819954440934026e-06, + "loss": 0.4587, + "step": 1812 + }, + { + "epoch": 1.5091564927857934, + "grad_norm": 0.339600145816803, + "learning_rate": 5.815174505021659e-06, + "loss": 0.3988, + "step": 1813 + }, + { + "epoch": 1.5099889012208658, + "grad_norm": 0.37940147519111633, + "learning_rate": 5.810393803646157e-06, + "loss": 0.4525, + "step": 1814 + }, + { + "epoch": 1.5108213096559377, + "grad_norm": 0.36495909094810486, + "learning_rate": 5.805612341296685e-06, + "loss": 0.3835, + "step": 1815 + }, + { + "epoch": 1.5116537180910101, + "grad_norm": 0.38348937034606934, + "learning_rate": 5.800830122463117e-06, + "loss": 0.4352, + "step": 1816 + }, + { + "epoch": 1.512486126526082, + "grad_norm": 0.35472238063812256, + "learning_rate": 5.7960471516360435e-06, + "loss": 0.4223, + "step": 1817 + }, + { + "epoch": 1.5133185349611544, + "grad_norm": 0.4020749032497406, + "learning_rate": 5.791263433306758e-06, + "loss": 0.4726, + "step": 1818 + }, + { + "epoch": 1.5141509433962264, + "grad_norm": 0.3420332372188568, + "learning_rate": 5.786478971967249e-06, + "loss": 0.3763, + "step": 1819 + }, + { + "epoch": 1.5149833518312985, + "grad_norm": 0.35677722096443176, + "learning_rate": 5.781693772110219e-06, + "loss": 0.4016, + "step": 1820 + }, + { + "epoch": 1.5158157602663707, + "grad_norm": 0.3244923949241638, + "learning_rate": 5.776907838229049e-06, + "loss": 0.4264, + "step": 1821 + }, + { + "epoch": 1.5166481687014428, + "grad_norm": 0.3919678032398224, + "learning_rate": 5.772121174817816e-06, + "loss": 0.451, + "step": 1822 + }, + { + "epoch": 1.517480577136515, + "grad_norm": 0.3128311336040497, + "learning_rate": 5.767333786371279e-06, + "loss": 0.3684, + "step": 1823 + }, + { + "epoch": 1.5183129855715871, + "grad_norm": 0.3211652338504791, + "learning_rate": 5.762545677384884e-06, + "loss": 0.4176, + "step": 1824 + }, + { + "epoch": 1.5191453940066593, + "grad_norm": 0.3597281277179718, + "learning_rate": 5.757756852354743e-06, + "loss": 0.4235, + "step": 1825 + }, + { + "epoch": 1.5199778024417314, + "grad_norm": 0.3358439803123474, + "learning_rate": 5.752967315777653e-06, + "loss": 0.3894, + "step": 1826 + }, + { + "epoch": 1.5208102108768036, + "grad_norm": 0.3780466616153717, + "learning_rate": 5.748177072151068e-06, + "loss": 0.4414, + "step": 1827 + }, + { + "epoch": 1.5216426193118757, + "grad_norm": 0.341908723115921, + "learning_rate": 5.743386125973112e-06, + "loss": 0.4439, + "step": 1828 + }, + { + "epoch": 1.5224750277469479, + "grad_norm": 0.3324446976184845, + "learning_rate": 5.738594481742568e-06, + "loss": 0.4143, + "step": 1829 + }, + { + "epoch": 1.52330743618202, + "grad_norm": 0.37258070707321167, + "learning_rate": 5.733802143958872e-06, + "loss": 0.4369, + "step": 1830 + }, + { + "epoch": 1.5241398446170922, + "grad_norm": 0.34372997283935547, + "learning_rate": 5.729009117122117e-06, + "loss": 0.4175, + "step": 1831 + }, + { + "epoch": 1.524972253052164, + "grad_norm": 0.3553544580936432, + "learning_rate": 5.724215405733033e-06, + "loss": 0.4233, + "step": 1832 + }, + { + "epoch": 1.5258046614872365, + "grad_norm": 0.34760257601737976, + "learning_rate": 5.7194210142930065e-06, + "loss": 0.4174, + "step": 1833 + }, + { + "epoch": 1.5266370699223084, + "grad_norm": 0.37886732816696167, + "learning_rate": 5.714625947304048e-06, + "loss": 0.4328, + "step": 1834 + }, + { + "epoch": 1.5274694783573808, + "grad_norm": 0.35653242468833923, + "learning_rate": 5.709830209268814e-06, + "loss": 0.4064, + "step": 1835 + }, + { + "epoch": 1.5283018867924527, + "grad_norm": 0.4173387885093689, + "learning_rate": 5.705033804690583e-06, + "loss": 0.4177, + "step": 1836 + }, + { + "epoch": 1.529134295227525, + "grad_norm": 0.33839157223701477, + "learning_rate": 5.7002367380732685e-06, + "loss": 0.3887, + "step": 1837 + }, + { + "epoch": 1.529966703662597, + "grad_norm": 0.35368192195892334, + "learning_rate": 5.695439013921391e-06, + "loss": 0.3985, + "step": 1838 + }, + { + "epoch": 1.5307991120976694, + "grad_norm": 0.3887537121772766, + "learning_rate": 5.6906406367401075e-06, + "loss": 0.4437, + "step": 1839 + }, + { + "epoch": 1.5316315205327413, + "grad_norm": 0.3811011016368866, + "learning_rate": 5.6858416110351715e-06, + "loss": 0.4423, + "step": 1840 + }, + { + "epoch": 1.5324639289678137, + "grad_norm": 0.33723312616348267, + "learning_rate": 5.681041941312954e-06, + "loss": 0.411, + "step": 1841 + }, + { + "epoch": 1.5332963374028856, + "grad_norm": 0.4133073389530182, + "learning_rate": 5.676241632080429e-06, + "loss": 0.4312, + "step": 1842 + }, + { + "epoch": 1.5341287458379578, + "grad_norm": 0.3302142918109894, + "learning_rate": 5.6714406878451715e-06, + "loss": 0.3994, + "step": 1843 + }, + { + "epoch": 1.53496115427303, + "grad_norm": 0.36849483847618103, + "learning_rate": 5.666639113115351e-06, + "loss": 0.4616, + "step": 1844 + }, + { + "epoch": 1.535793562708102, + "grad_norm": 0.38487333059310913, + "learning_rate": 5.661836912399731e-06, + "loss": 0.4145, + "step": 1845 + }, + { + "epoch": 1.5366259711431742, + "grad_norm": 0.37898313999176025, + "learning_rate": 5.657034090207663e-06, + "loss": 0.4573, + "step": 1846 + }, + { + "epoch": 1.5374583795782464, + "grad_norm": 0.35560891032218933, + "learning_rate": 5.652230651049077e-06, + "loss": 0.4162, + "step": 1847 + }, + { + "epoch": 1.5382907880133185, + "grad_norm": 0.3300834894180298, + "learning_rate": 5.647426599434493e-06, + "loss": 0.4577, + "step": 1848 + }, + { + "epoch": 1.5391231964483907, + "grad_norm": 0.32240939140319824, + "learning_rate": 5.642621939874995e-06, + "loss": 0.3774, + "step": 1849 + }, + { + "epoch": 1.5399556048834628, + "grad_norm": 0.41031232476234436, + "learning_rate": 5.637816676882244e-06, + "loss": 0.4475, + "step": 1850 + }, + { + "epoch": 1.540788013318535, + "grad_norm": 0.32196757197380066, + "learning_rate": 5.633010814968465e-06, + "loss": 0.4096, + "step": 1851 + }, + { + "epoch": 1.5416204217536071, + "grad_norm": 0.38092145323753357, + "learning_rate": 5.628204358646448e-06, + "loss": 0.4481, + "step": 1852 + }, + { + "epoch": 1.5424528301886793, + "grad_norm": 0.34755241870880127, + "learning_rate": 5.623397312429537e-06, + "loss": 0.431, + "step": 1853 + }, + { + "epoch": 1.5432852386237514, + "grad_norm": 0.33400028944015503, + "learning_rate": 5.618589680831636e-06, + "loss": 0.3853, + "step": 1854 + }, + { + "epoch": 1.5441176470588234, + "grad_norm": 0.35710418224334717, + "learning_rate": 5.6137814683671935e-06, + "loss": 0.4172, + "step": 1855 + }, + { + "epoch": 1.5449500554938957, + "grad_norm": 0.37034469842910767, + "learning_rate": 5.608972679551205e-06, + "loss": 0.3994, + "step": 1856 + }, + { + "epoch": 1.5457824639289677, + "grad_norm": 0.33173617720603943, + "learning_rate": 5.604163318899207e-06, + "loss": 0.4299, + "step": 1857 + }, + { + "epoch": 1.54661487236404, + "grad_norm": 0.3156852424144745, + "learning_rate": 5.599353390927275e-06, + "loss": 0.4213, + "step": 1858 + }, + { + "epoch": 1.547447280799112, + "grad_norm": 0.3290148377418518, + "learning_rate": 5.594542900152015e-06, + "loss": 0.4192, + "step": 1859 + }, + { + "epoch": 1.5482796892341844, + "grad_norm": 0.3706801235675812, + "learning_rate": 5.589731851090559e-06, + "loss": 0.4677, + "step": 1860 + }, + { + "epoch": 1.5491120976692563, + "grad_norm": 0.33532100915908813, + "learning_rate": 5.584920248260572e-06, + "loss": 0.4131, + "step": 1861 + }, + { + "epoch": 1.5499445061043287, + "grad_norm": 0.3689619302749634, + "learning_rate": 5.580108096180229e-06, + "loss": 0.4205, + "step": 1862 + }, + { + "epoch": 1.5507769145394006, + "grad_norm": 0.32412606477737427, + "learning_rate": 5.575295399368228e-06, + "loss": 0.4127, + "step": 1863 + }, + { + "epoch": 1.551609322974473, + "grad_norm": 0.363091379404068, + "learning_rate": 5.570482162343772e-06, + "loss": 0.4362, + "step": 1864 + }, + { + "epoch": 1.552441731409545, + "grad_norm": 0.37017905712127686, + "learning_rate": 5.5656683896265786e-06, + "loss": 0.4014, + "step": 1865 + }, + { + "epoch": 1.553274139844617, + "grad_norm": 0.3684568405151367, + "learning_rate": 5.560854085736861e-06, + "loss": 0.4703, + "step": 1866 + }, + { + "epoch": 1.5541065482796892, + "grad_norm": 0.32582852244377136, + "learning_rate": 5.556039255195338e-06, + "loss": 0.4202, + "step": 1867 + }, + { + "epoch": 1.5549389567147613, + "grad_norm": 0.37376418709754944, + "learning_rate": 5.551223902523218e-06, + "loss": 0.4237, + "step": 1868 + }, + { + "epoch": 1.5557713651498335, + "grad_norm": 0.3530517518520355, + "learning_rate": 5.546408032242202e-06, + "loss": 0.4311, + "step": 1869 + }, + { + "epoch": 1.5566037735849056, + "grad_norm": 0.34218358993530273, + "learning_rate": 5.541591648874476e-06, + "loss": 0.4404, + "step": 1870 + }, + { + "epoch": 1.5574361820199778, + "grad_norm": 0.31669047474861145, + "learning_rate": 5.53677475694271e-06, + "loss": 0.4103, + "step": 1871 + }, + { + "epoch": 1.55826859045505, + "grad_norm": 0.3490392863750458, + "learning_rate": 5.531957360970048e-06, + "loss": 0.3915, + "step": 1872 + }, + { + "epoch": 1.559100998890122, + "grad_norm": 0.3820657730102539, + "learning_rate": 5.527139465480109e-06, + "loss": 0.4277, + "step": 1873 + }, + { + "epoch": 1.5599334073251943, + "grad_norm": 0.35189470648765564, + "learning_rate": 5.5223210749969845e-06, + "loss": 0.4014, + "step": 1874 + }, + { + "epoch": 1.5607658157602664, + "grad_norm": 0.36156129837036133, + "learning_rate": 5.5175021940452225e-06, + "loss": 0.413, + "step": 1875 + }, + { + "epoch": 1.5615982241953386, + "grad_norm": 0.32655417919158936, + "learning_rate": 5.512682827149841e-06, + "loss": 0.4177, + "step": 1876 + }, + { + "epoch": 1.5624306326304107, + "grad_norm": 0.3407149314880371, + "learning_rate": 5.507862978836306e-06, + "loss": 0.4112, + "step": 1877 + }, + { + "epoch": 1.5632630410654826, + "grad_norm": 0.41151124238967896, + "learning_rate": 5.503042653630543e-06, + "loss": 0.4631, + "step": 1878 + }, + { + "epoch": 1.564095449500555, + "grad_norm": 0.3386502265930176, + "learning_rate": 5.49822185605892e-06, + "loss": 0.3822, + "step": 1879 + }, + { + "epoch": 1.564927857935627, + "grad_norm": 0.3886801302433014, + "learning_rate": 5.4934005906482525e-06, + "loss": 0.4602, + "step": 1880 + }, + { + "epoch": 1.5657602663706993, + "grad_norm": 0.3777741491794586, + "learning_rate": 5.488578861925788e-06, + "loss": 0.4215, + "step": 1881 + }, + { + "epoch": 1.5665926748057712, + "grad_norm": 0.3248192369937897, + "learning_rate": 5.4837566744192196e-06, + "loss": 0.3973, + "step": 1882 + }, + { + "epoch": 1.5674250832408436, + "grad_norm": 0.3611052930355072, + "learning_rate": 5.478934032656663e-06, + "loss": 0.4408, + "step": 1883 + }, + { + "epoch": 1.5682574916759155, + "grad_norm": 0.35995787382125854, + "learning_rate": 5.4741109411666635e-06, + "loss": 0.401, + "step": 1884 + }, + { + "epoch": 1.569089900110988, + "grad_norm": 0.39710691571235657, + "learning_rate": 5.46928740447819e-06, + "loss": 0.4711, + "step": 1885 + }, + { + "epoch": 1.5699223085460599, + "grad_norm": 0.34556522965431213, + "learning_rate": 5.464463427120626e-06, + "loss": 0.4347, + "step": 1886 + }, + { + "epoch": 1.5707547169811322, + "grad_norm": 0.3444367051124573, + "learning_rate": 5.459639013623772e-06, + "loss": 0.3928, + "step": 1887 + }, + { + "epoch": 1.5715871254162042, + "grad_norm": 0.3429131805896759, + "learning_rate": 5.454814168517836e-06, + "loss": 0.4362, + "step": 1888 + }, + { + "epoch": 1.5724195338512763, + "grad_norm": 0.3870256841182709, + "learning_rate": 5.449988896333431e-06, + "loss": 0.4537, + "step": 1889 + }, + { + "epoch": 1.5732519422863485, + "grad_norm": 0.3829275071620941, + "learning_rate": 5.445163201601575e-06, + "loss": 0.4134, + "step": 1890 + }, + { + "epoch": 1.5740843507214206, + "grad_norm": 0.37084445357322693, + "learning_rate": 5.440337088853679e-06, + "loss": 0.3903, + "step": 1891 + }, + { + "epoch": 1.5749167591564928, + "grad_norm": 0.36003977060317993, + "learning_rate": 5.435510562621544e-06, + "loss": 0.4641, + "step": 1892 + }, + { + "epoch": 1.575749167591565, + "grad_norm": 0.4320586025714874, + "learning_rate": 5.4306836274373675e-06, + "loss": 0.4261, + "step": 1893 + }, + { + "epoch": 1.576581576026637, + "grad_norm": 0.35853341221809387, + "learning_rate": 5.425856287833723e-06, + "loss": 0.4363, + "step": 1894 + }, + { + "epoch": 1.5774139844617092, + "grad_norm": 0.3714522123336792, + "learning_rate": 5.421028548343568e-06, + "loss": 0.4157, + "step": 1895 + }, + { + "epoch": 1.5782463928967814, + "grad_norm": 0.37098610401153564, + "learning_rate": 5.4162004135002336e-06, + "loss": 0.4233, + "step": 1896 + }, + { + "epoch": 1.5790788013318535, + "grad_norm": 0.35609301924705505, + "learning_rate": 5.411371887837423e-06, + "loss": 0.4317, + "step": 1897 + }, + { + "epoch": 1.5799112097669257, + "grad_norm": 0.31282979249954224, + "learning_rate": 5.406542975889209e-06, + "loss": 0.3921, + "step": 1898 + }, + { + "epoch": 1.5807436182019978, + "grad_norm": 0.34338200092315674, + "learning_rate": 5.40171368219002e-06, + "loss": 0.4239, + "step": 1899 + }, + { + "epoch": 1.58157602663707, + "grad_norm": 0.357689768075943, + "learning_rate": 5.396884011274651e-06, + "loss": 0.4402, + "step": 1900 + }, + { + "epoch": 1.582408435072142, + "grad_norm": 0.3389754891395569, + "learning_rate": 5.3920539676782455e-06, + "loss": 0.3947, + "step": 1901 + }, + { + "epoch": 1.5832408435072143, + "grad_norm": 0.348406046628952, + "learning_rate": 5.387223555936301e-06, + "loss": 0.4213, + "step": 1902 + }, + { + "epoch": 1.5840732519422862, + "grad_norm": 0.3109308183193207, + "learning_rate": 5.382392780584655e-06, + "loss": 0.3912, + "step": 1903 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 0.3686921298503876, + "learning_rate": 5.377561646159495e-06, + "loss": 0.4632, + "step": 1904 + }, + { + "epoch": 1.5857380688124305, + "grad_norm": 0.33451026678085327, + "learning_rate": 5.372730157197338e-06, + "loss": 0.3807, + "step": 1905 + }, + { + "epoch": 1.5865704772475029, + "grad_norm": 0.3415146768093109, + "learning_rate": 5.367898318235037e-06, + "loss": 0.4282, + "step": 1906 + }, + { + "epoch": 1.5874028856825748, + "grad_norm": 0.3252672553062439, + "learning_rate": 5.363066133809773e-06, + "loss": 0.4125, + "step": 1907 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.35067319869995117, + "learning_rate": 5.3582336084590535e-06, + "loss": 0.399, + "step": 1908 + }, + { + "epoch": 1.5890677025527191, + "grad_norm": 0.33104994893074036, + "learning_rate": 5.3534007467207024e-06, + "loss": 0.4203, + "step": 1909 + }, + { + "epoch": 1.5899001109877915, + "grad_norm": 0.33715182542800903, + "learning_rate": 5.348567553132862e-06, + "loss": 0.4237, + "step": 1910 + }, + { + "epoch": 1.5907325194228634, + "grad_norm": 0.3445633053779602, + "learning_rate": 5.343734032233986e-06, + "loss": 0.4466, + "step": 1911 + }, + { + "epoch": 1.5915649278579356, + "grad_norm": 0.3408707082271576, + "learning_rate": 5.338900188562836e-06, + "loss": 0.3845, + "step": 1912 + }, + { + "epoch": 1.5923973362930077, + "grad_norm": 0.3361123204231262, + "learning_rate": 5.334066026658475e-06, + "loss": 0.4134, + "step": 1913 + }, + { + "epoch": 1.5932297447280799, + "grad_norm": 0.3556373119354248, + "learning_rate": 5.329231551060264e-06, + "loss": 0.416, + "step": 1914 + }, + { + "epoch": 1.594062153163152, + "grad_norm": 0.34920355677604675, + "learning_rate": 5.324396766307863e-06, + "loss": 0.4377, + "step": 1915 + }, + { + "epoch": 1.5948945615982242, + "grad_norm": 0.3253536820411682, + "learning_rate": 5.31956167694122e-06, + "loss": 0.4011, + "step": 1916 + }, + { + "epoch": 1.5957269700332963, + "grad_norm": 0.35277408361434937, + "learning_rate": 5.314726287500565e-06, + "loss": 0.4428, + "step": 1917 + }, + { + "epoch": 1.5965593784683685, + "grad_norm": 0.36568132042884827, + "learning_rate": 5.309890602526416e-06, + "loss": 0.438, + "step": 1918 + }, + { + "epoch": 1.5973917869034406, + "grad_norm": 0.31992262601852417, + "learning_rate": 5.305054626559565e-06, + "loss": 0.3983, + "step": 1919 + }, + { + "epoch": 1.5982241953385128, + "grad_norm": 0.3432292640209198, + "learning_rate": 5.30021836414108e-06, + "loss": 0.446, + "step": 1920 + }, + { + "epoch": 1.599056603773585, + "grad_norm": 0.3381594121456146, + "learning_rate": 5.295381819812293e-06, + "loss": 0.43, + "step": 1921 + }, + { + "epoch": 1.599889012208657, + "grad_norm": 0.32569068670272827, + "learning_rate": 5.290544998114805e-06, + "loss": 0.4224, + "step": 1922 + }, + { + "epoch": 1.6007214206437292, + "grad_norm": 0.32701462507247925, + "learning_rate": 5.2857079035904764e-06, + "loss": 0.4351, + "step": 1923 + }, + { + "epoch": 1.6015538290788012, + "grad_norm": 0.32411783933639526, + "learning_rate": 5.280870540781425e-06, + "loss": 0.4015, + "step": 1924 + }, + { + "epoch": 1.6023862375138735, + "grad_norm": 0.35999277234077454, + "learning_rate": 5.2760329142300174e-06, + "loss": 0.4268, + "step": 1925 + }, + { + "epoch": 1.6032186459489455, + "grad_norm": 0.3214718997478485, + "learning_rate": 5.271195028478871e-06, + "loss": 0.3986, + "step": 1926 + }, + { + "epoch": 1.6040510543840178, + "grad_norm": 0.3498198986053467, + "learning_rate": 5.266356888070843e-06, + "loss": 0.4462, + "step": 1927 + }, + { + "epoch": 1.6048834628190898, + "grad_norm": 0.3669768273830414, + "learning_rate": 5.261518497549033e-06, + "loss": 0.4052, + "step": 1928 + }, + { + "epoch": 1.6057158712541622, + "grad_norm": 0.3158254623413086, + "learning_rate": 5.256679861456776e-06, + "loss": 0.3936, + "step": 1929 + }, + { + "epoch": 1.606548279689234, + "grad_norm": 0.33665063977241516, + "learning_rate": 5.251840984337634e-06, + "loss": 0.4338, + "step": 1930 + }, + { + "epoch": 1.6073806881243065, + "grad_norm": 0.37514346837997437, + "learning_rate": 5.247001870735398e-06, + "loss": 0.464, + "step": 1931 + }, + { + "epoch": 1.6082130965593784, + "grad_norm": 0.33054375648498535, + "learning_rate": 5.242162525194082e-06, + "loss": 0.3726, + "step": 1932 + }, + { + "epoch": 1.6090455049944508, + "grad_norm": 0.3735671937465668, + "learning_rate": 5.237322952257915e-06, + "loss": 0.4846, + "step": 1933 + }, + { + "epoch": 1.6098779134295227, + "grad_norm": 0.30594101548194885, + "learning_rate": 5.232483156471339e-06, + "loss": 0.3673, + "step": 1934 + }, + { + "epoch": 1.6107103218645948, + "grad_norm": 0.3897792100906372, + "learning_rate": 5.227643142379009e-06, + "loss": 0.4665, + "step": 1935 + }, + { + "epoch": 1.611542730299667, + "grad_norm": 0.3239308297634125, + "learning_rate": 5.222802914525782e-06, + "loss": 0.4004, + "step": 1936 + }, + { + "epoch": 1.6123751387347391, + "grad_norm": 0.33833977580070496, + "learning_rate": 5.217962477456718e-06, + "loss": 0.4278, + "step": 1937 + }, + { + "epoch": 1.6132075471698113, + "grad_norm": 0.3144145905971527, + "learning_rate": 5.21312183571707e-06, + "loss": 0.3737, + "step": 1938 + }, + { + "epoch": 1.6140399556048834, + "grad_norm": 0.34893402457237244, + "learning_rate": 5.208280993852287e-06, + "loss": 0.4249, + "step": 1939 + }, + { + "epoch": 1.6148723640399556, + "grad_norm": 0.32780539989471436, + "learning_rate": 5.203439956408005e-06, + "loss": 0.4224, + "step": 1940 + }, + { + "epoch": 1.6157047724750278, + "grad_norm": 0.3463350534439087, + "learning_rate": 5.198598727930041e-06, + "loss": 0.395, + "step": 1941 + }, + { + "epoch": 1.6165371809101, + "grad_norm": 0.3848700225353241, + "learning_rate": 5.193757312964394e-06, + "loss": 0.3988, + "step": 1942 + }, + { + "epoch": 1.617369589345172, + "grad_norm": 0.3813340961933136, + "learning_rate": 5.188915716057238e-06, + "loss": 0.4296, + "step": 1943 + }, + { + "epoch": 1.6182019977802442, + "grad_norm": 0.396963894367218, + "learning_rate": 5.184073941754916e-06, + "loss": 0.473, + "step": 1944 + }, + { + "epoch": 1.6190344062153164, + "grad_norm": 0.34140685200691223, + "learning_rate": 5.1792319946039405e-06, + "loss": 0.3853, + "step": 1945 + }, + { + "epoch": 1.6198668146503885, + "grad_norm": 0.3750185966491699, + "learning_rate": 5.174389879150985e-06, + "loss": 0.4811, + "step": 1946 + }, + { + "epoch": 1.6206992230854604, + "grad_norm": 0.28287845849990845, + "learning_rate": 5.169547599942877e-06, + "loss": 0.3697, + "step": 1947 + }, + { + "epoch": 1.6215316315205328, + "grad_norm": 0.3522551357746124, + "learning_rate": 5.164705161526605e-06, + "loss": 0.482, + "step": 1948 + }, + { + "epoch": 1.6223640399556047, + "grad_norm": 0.35294002294540405, + "learning_rate": 5.159862568449302e-06, + "loss": 0.4473, + "step": 1949 + }, + { + "epoch": 1.6231964483906771, + "grad_norm": 0.34549230337142944, + "learning_rate": 5.155019825258251e-06, + "loss": 0.3954, + "step": 1950 + }, + { + "epoch": 1.624028856825749, + "grad_norm": 0.3237864673137665, + "learning_rate": 5.1501769365008654e-06, + "loss": 0.4288, + "step": 1951 + }, + { + "epoch": 1.6248612652608214, + "grad_norm": 0.3312610685825348, + "learning_rate": 5.14533390672471e-06, + "loss": 0.4135, + "step": 1952 + }, + { + "epoch": 1.6256936736958933, + "grad_norm": 0.39398959279060364, + "learning_rate": 5.140490740477471e-06, + "loss": 0.4262, + "step": 1953 + }, + { + "epoch": 1.6265260821309657, + "grad_norm": 0.30289697647094727, + "learning_rate": 5.135647442306966e-06, + "loss": 0.3799, + "step": 1954 + }, + { + "epoch": 1.6273584905660377, + "grad_norm": 0.3318493366241455, + "learning_rate": 5.130804016761138e-06, + "loss": 0.436, + "step": 1955 + }, + { + "epoch": 1.62819089900111, + "grad_norm": 0.3526514172554016, + "learning_rate": 5.1259604683880485e-06, + "loss": 0.4636, + "step": 1956 + }, + { + "epoch": 1.629023307436182, + "grad_norm": 0.3857317864894867, + "learning_rate": 5.121116801735873e-06, + "loss": 0.443, + "step": 1957 + }, + { + "epoch": 1.629855715871254, + "grad_norm": 0.3597477078437805, + "learning_rate": 5.1162730213529e-06, + "loss": 0.4177, + "step": 1958 + }, + { + "epoch": 1.6306881243063263, + "grad_norm": 0.34153300523757935, + "learning_rate": 5.1114291317875244e-06, + "loss": 0.4612, + "step": 1959 + }, + { + "epoch": 1.6315205327413984, + "grad_norm": 0.3484551012516022, + "learning_rate": 5.1065851375882425e-06, + "loss": 0.4088, + "step": 1960 + }, + { + "epoch": 1.6323529411764706, + "grad_norm": 0.36641940474510193, + "learning_rate": 5.101741043303651e-06, + "loss": 0.4182, + "step": 1961 + }, + { + "epoch": 1.6331853496115427, + "grad_norm": 0.3681751787662506, + "learning_rate": 5.096896853482437e-06, + "loss": 0.4161, + "step": 1962 + }, + { + "epoch": 1.6340177580466149, + "grad_norm": 0.3790402114391327, + "learning_rate": 5.092052572673383e-06, + "loss": 0.4297, + "step": 1963 + }, + { + "epoch": 1.634850166481687, + "grad_norm": 0.3269292116165161, + "learning_rate": 5.087208205425349e-06, + "loss": 0.3941, + "step": 1964 + }, + { + "epoch": 1.6356825749167592, + "grad_norm": 0.3894573748111725, + "learning_rate": 5.082363756287285e-06, + "loss": 0.4084, + "step": 1965 + }, + { + "epoch": 1.6365149833518313, + "grad_norm": 0.39526158571243286, + "learning_rate": 5.077519229808211e-06, + "loss": 0.4229, + "step": 1966 + }, + { + "epoch": 1.6373473917869035, + "grad_norm": 0.3372214436531067, + "learning_rate": 5.072674630537223e-06, + "loss": 0.4109, + "step": 1967 + }, + { + "epoch": 1.6381798002219756, + "grad_norm": 0.4343195855617523, + "learning_rate": 5.067829963023485e-06, + "loss": 0.4377, + "step": 1968 + }, + { + "epoch": 1.6390122086570478, + "grad_norm": 0.3801560699939728, + "learning_rate": 5.062985231816225e-06, + "loss": 0.4452, + "step": 1969 + }, + { + "epoch": 1.6398446170921197, + "grad_norm": 0.3717063367366791, + "learning_rate": 5.0581404414647276e-06, + "loss": 0.4361, + "step": 1970 + }, + { + "epoch": 1.640677025527192, + "grad_norm": 0.3857610523700714, + "learning_rate": 5.053295596518337e-06, + "loss": 0.4063, + "step": 1971 + }, + { + "epoch": 1.641509433962264, + "grad_norm": 0.3806234300136566, + "learning_rate": 5.04845070152645e-06, + "loss": 0.4423, + "step": 1972 + }, + { + "epoch": 1.6423418423973364, + "grad_norm": 0.39662259817123413, + "learning_rate": 5.043605761038505e-06, + "loss": 0.4482, + "step": 1973 + }, + { + "epoch": 1.6431742508324083, + "grad_norm": 0.3193778395652771, + "learning_rate": 5.038760779603989e-06, + "loss": 0.3858, + "step": 1974 + }, + { + "epoch": 1.6440066592674807, + "grad_norm": 0.35678938031196594, + "learning_rate": 5.033915761772419e-06, + "loss": 0.4691, + "step": 1975 + }, + { + "epoch": 1.6448390677025526, + "grad_norm": 0.3585291802883148, + "learning_rate": 5.029070712093357e-06, + "loss": 0.3971, + "step": 1976 + }, + { + "epoch": 1.645671476137625, + "grad_norm": 0.3424661457538605, + "learning_rate": 5.024225635116386e-06, + "loss": 0.4139, + "step": 1977 + }, + { + "epoch": 1.646503884572697, + "grad_norm": 0.35479363799095154, + "learning_rate": 5.01938053539112e-06, + "loss": 0.4225, + "step": 1978 + }, + { + "epoch": 1.6473362930077693, + "grad_norm": 0.3378525972366333, + "learning_rate": 5.014535417467191e-06, + "loss": 0.4122, + "step": 1979 + }, + { + "epoch": 1.6481687014428412, + "grad_norm": 0.36042535305023193, + "learning_rate": 5.009690285894252e-06, + "loss": 0.4601, + "step": 1980 + }, + { + "epoch": 1.6490011098779136, + "grad_norm": 0.3910295069217682, + "learning_rate": 5.004845145221965e-06, + "loss": 0.4279, + "step": 1981 + }, + { + "epoch": 1.6498335183129855, + "grad_norm": 0.3552689552307129, + "learning_rate": 5e-06, + "loss": 0.4187, + "step": 1982 + }, + { + "epoch": 1.6506659267480577, + "grad_norm": 0.33062514662742615, + "learning_rate": 4.995154854778036e-06, + "loss": 0.4125, + "step": 1983 + }, + { + "epoch": 1.6514983351831298, + "grad_norm": 0.3373256325721741, + "learning_rate": 4.99030971410575e-06, + "loss": 0.444, + "step": 1984 + }, + { + "epoch": 1.652330743618202, + "grad_norm": 0.3146938383579254, + "learning_rate": 4.9854645825328096e-06, + "loss": 0.3551, + "step": 1985 + }, + { + "epoch": 1.6531631520532741, + "grad_norm": 0.34619879722595215, + "learning_rate": 4.980619464608881e-06, + "loss": 0.4253, + "step": 1986 + }, + { + "epoch": 1.6539955604883463, + "grad_norm": 0.3549228012561798, + "learning_rate": 4.975774364883617e-06, + "loss": 0.434, + "step": 1987 + }, + { + "epoch": 1.6548279689234184, + "grad_norm": 0.3266083896160126, + "learning_rate": 4.9709292879066464e-06, + "loss": 0.4061, + "step": 1988 + }, + { + "epoch": 1.6556603773584906, + "grad_norm": 0.30864620208740234, + "learning_rate": 4.966084238227582e-06, + "loss": 0.4607, + "step": 1989 + }, + { + "epoch": 1.6564927857935627, + "grad_norm": 0.357719361782074, + "learning_rate": 4.961239220396014e-06, + "loss": 0.4443, + "step": 1990 + }, + { + "epoch": 1.657325194228635, + "grad_norm": 0.36556276679039, + "learning_rate": 4.956394238961497e-06, + "loss": 0.3715, + "step": 1991 + }, + { + "epoch": 1.658157602663707, + "grad_norm": 0.3319959342479706, + "learning_rate": 4.951549298473552e-06, + "loss": 0.4252, + "step": 1992 + }, + { + "epoch": 1.658990011098779, + "grad_norm": 0.35500237345695496, + "learning_rate": 4.946704403481663e-06, + "loss": 0.4447, + "step": 1993 + }, + { + "epoch": 1.6598224195338513, + "grad_norm": 0.38562414050102234, + "learning_rate": 4.941859558535275e-06, + "loss": 0.4528, + "step": 1994 + }, + { + "epoch": 1.6606548279689233, + "grad_norm": 0.3319378197193146, + "learning_rate": 4.937014768183778e-06, + "loss": 0.3608, + "step": 1995 + }, + { + "epoch": 1.6614872364039956, + "grad_norm": 0.34083935618400574, + "learning_rate": 4.9321700369765165e-06, + "loss": 0.4559, + "step": 1996 + }, + { + "epoch": 1.6623196448390676, + "grad_norm": 0.3207685947418213, + "learning_rate": 4.927325369462777e-06, + "loss": 0.375, + "step": 1997 + }, + { + "epoch": 1.66315205327414, + "grad_norm": 0.3647385835647583, + "learning_rate": 4.92248077019179e-06, + "loss": 0.4085, + "step": 1998 + }, + { + "epoch": 1.6639844617092119, + "grad_norm": 0.3710618317127228, + "learning_rate": 4.917636243712716e-06, + "loss": 0.4605, + "step": 1999 + }, + { + "epoch": 1.6648168701442843, + "grad_norm": 0.35661581158638, + "learning_rate": 4.912791794574653e-06, + "loss": 0.407, + "step": 2000 + }, + { + "epoch": 1.6656492785793562, + "grad_norm": 0.39443153142929077, + "learning_rate": 4.9079474273266195e-06, + "loss": 0.4512, + "step": 2001 + }, + { + "epoch": 1.6664816870144286, + "grad_norm": 0.36028945446014404, + "learning_rate": 4.903103146517564e-06, + "loss": 0.4333, + "step": 2002 + }, + { + "epoch": 1.6673140954495005, + "grad_norm": 0.3345014154911041, + "learning_rate": 4.898258956696351e-06, + "loss": 0.3976, + "step": 2003 + }, + { + "epoch": 1.6681465038845729, + "grad_norm": 0.3746415376663208, + "learning_rate": 4.893414862411759e-06, + "loss": 0.4256, + "step": 2004 + }, + { + "epoch": 1.6689789123196448, + "grad_norm": 0.35627481341362, + "learning_rate": 4.888570868212478e-06, + "loss": 0.4167, + "step": 2005 + }, + { + "epoch": 1.669811320754717, + "grad_norm": 0.30849483609199524, + "learning_rate": 4.883726978647101e-06, + "loss": 0.3915, + "step": 2006 + }, + { + "epoch": 1.670643729189789, + "grad_norm": 0.3932332396507263, + "learning_rate": 4.878883198264129e-06, + "loss": 0.461, + "step": 2007 + }, + { + "epoch": 1.6714761376248612, + "grad_norm": 0.32576408982276917, + "learning_rate": 4.874039531611954e-06, + "loss": 0.4585, + "step": 2008 + }, + { + "epoch": 1.6723085460599334, + "grad_norm": 0.3283080756664276, + "learning_rate": 4.8691959832388635e-06, + "loss": 0.3801, + "step": 2009 + }, + { + "epoch": 1.6731409544950056, + "grad_norm": 0.40403053164482117, + "learning_rate": 4.864352557693035e-06, + "loss": 0.4624, + "step": 2010 + }, + { + "epoch": 1.6739733629300777, + "grad_norm": 0.3406950831413269, + "learning_rate": 4.859509259522531e-06, + "loss": 0.4116, + "step": 2011 + }, + { + "epoch": 1.6748057713651499, + "grad_norm": 0.3636447787284851, + "learning_rate": 4.854666093275291e-06, + "loss": 0.4214, + "step": 2012 + }, + { + "epoch": 1.675638179800222, + "grad_norm": 0.34086188673973083, + "learning_rate": 4.849823063499136e-06, + "loss": 0.4077, + "step": 2013 + }, + { + "epoch": 1.6764705882352942, + "grad_norm": 0.345948189496994, + "learning_rate": 4.844980174741752e-06, + "loss": 0.4133, + "step": 2014 + }, + { + "epoch": 1.6773029966703663, + "grad_norm": 0.339345782995224, + "learning_rate": 4.840137431550698e-06, + "loss": 0.431, + "step": 2015 + }, + { + "epoch": 1.6781354051054382, + "grad_norm": 0.3828071057796478, + "learning_rate": 4.835294838473396e-06, + "loss": 0.4574, + "step": 2016 + }, + { + "epoch": 1.6789678135405106, + "grad_norm": 0.3178552985191345, + "learning_rate": 4.8304524000571255e-06, + "loss": 0.4198, + "step": 2017 + }, + { + "epoch": 1.6798002219755825, + "grad_norm": 0.35455217957496643, + "learning_rate": 4.825610120849018e-06, + "loss": 0.4129, + "step": 2018 + }, + { + "epoch": 1.680632630410655, + "grad_norm": 0.35618969798088074, + "learning_rate": 4.8207680053960594e-06, + "loss": 0.4373, + "step": 2019 + }, + { + "epoch": 1.6814650388457268, + "grad_norm": 0.33656421303749084, + "learning_rate": 4.815926058245085e-06, + "loss": 0.4209, + "step": 2020 + }, + { + "epoch": 1.6822974472807992, + "grad_norm": 0.34791067242622375, + "learning_rate": 4.811084283942764e-06, + "loss": 0.4398, + "step": 2021 + }, + { + "epoch": 1.6831298557158711, + "grad_norm": 0.32604023814201355, + "learning_rate": 4.806242687035608e-06, + "loss": 0.4346, + "step": 2022 + }, + { + "epoch": 1.6839622641509435, + "grad_norm": 0.3848741352558136, + "learning_rate": 4.80140127206996e-06, + "loss": 0.4152, + "step": 2023 + }, + { + "epoch": 1.6847946725860155, + "grad_norm": 0.34885889291763306, + "learning_rate": 4.796560043591996e-06, + "loss": 0.3928, + "step": 2024 + }, + { + "epoch": 1.6856270810210878, + "grad_norm": 0.3154467046260834, + "learning_rate": 4.791719006147714e-06, + "loss": 0.4268, + "step": 2025 + }, + { + "epoch": 1.6864594894561598, + "grad_norm": 0.34713122248649597, + "learning_rate": 4.7868781642829326e-06, + "loss": 0.4561, + "step": 2026 + }, + { + "epoch": 1.6872918978912321, + "grad_norm": 0.2982744872570038, + "learning_rate": 4.782037522543283e-06, + "loss": 0.3945, + "step": 2027 + }, + { + "epoch": 1.688124306326304, + "grad_norm": 0.33279117941856384, + "learning_rate": 4.777197085474219e-06, + "loss": 0.4445, + "step": 2028 + }, + { + "epoch": 1.6889567147613762, + "grad_norm": 0.3618778586387634, + "learning_rate": 4.772356857620992e-06, + "loss": 0.4938, + "step": 2029 + }, + { + "epoch": 1.6897891231964484, + "grad_norm": 0.311985582113266, + "learning_rate": 4.767516843528664e-06, + "loss": 0.4027, + "step": 2030 + }, + { + "epoch": 1.6906215316315205, + "grad_norm": 0.30986541509628296, + "learning_rate": 4.762677047742088e-06, + "loss": 0.3541, + "step": 2031 + }, + { + "epoch": 1.6914539400665927, + "grad_norm": 0.3394172191619873, + "learning_rate": 4.757837474805918e-06, + "loss": 0.4316, + "step": 2032 + }, + { + "epoch": 1.6922863485016648, + "grad_norm": 0.331486314535141, + "learning_rate": 4.7529981292646025e-06, + "loss": 0.4357, + "step": 2033 + }, + { + "epoch": 1.693118756936737, + "grad_norm": 0.3562561869621277, + "learning_rate": 4.748159015662367e-06, + "loss": 0.4528, + "step": 2034 + }, + { + "epoch": 1.6939511653718091, + "grad_norm": 0.3469091057777405, + "learning_rate": 4.743320138543225e-06, + "loss": 0.4298, + "step": 2035 + }, + { + "epoch": 1.6947835738068813, + "grad_norm": 0.3128523528575897, + "learning_rate": 4.738481502450967e-06, + "loss": 0.43, + "step": 2036 + }, + { + "epoch": 1.6956159822419534, + "grad_norm": 0.36427655816078186, + "learning_rate": 4.733643111929159e-06, + "loss": 0.4474, + "step": 2037 + }, + { + "epoch": 1.6964483906770256, + "grad_norm": 0.32562559843063354, + "learning_rate": 4.728804971521132e-06, + "loss": 0.4002, + "step": 2038 + }, + { + "epoch": 1.6972807991120975, + "grad_norm": 0.3482353091239929, + "learning_rate": 4.723967085769985e-06, + "loss": 0.3841, + "step": 2039 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.3613423705101013, + "learning_rate": 4.719129459218575e-06, + "loss": 0.4263, + "step": 2040 + }, + { + "epoch": 1.6989456159822418, + "grad_norm": 0.3004484474658966, + "learning_rate": 4.714292096409524e-06, + "loss": 0.389, + "step": 2041 + }, + { + "epoch": 1.6997780244173142, + "grad_norm": 0.3427788317203522, + "learning_rate": 4.709455001885196e-06, + "loss": 0.4658, + "step": 2042 + }, + { + "epoch": 1.7006104328523861, + "grad_norm": 0.3731909394264221, + "learning_rate": 4.704618180187709e-06, + "loss": 0.4452, + "step": 2043 + }, + { + "epoch": 1.7014428412874585, + "grad_norm": 0.34222519397735596, + "learning_rate": 4.699781635858923e-06, + "loss": 0.4187, + "step": 2044 + }, + { + "epoch": 1.7022752497225304, + "grad_norm": 0.32763800024986267, + "learning_rate": 4.694945373440435e-06, + "loss": 0.4081, + "step": 2045 + }, + { + "epoch": 1.7031076581576028, + "grad_norm": 0.3098640739917755, + "learning_rate": 4.690109397473586e-06, + "loss": 0.3837, + "step": 2046 + }, + { + "epoch": 1.7039400665926747, + "grad_norm": 0.3192623555660248, + "learning_rate": 4.685273712499436e-06, + "loss": 0.429, + "step": 2047 + }, + { + "epoch": 1.704772475027747, + "grad_norm": 0.34456512331962585, + "learning_rate": 4.680438323058783e-06, + "loss": 0.4263, + "step": 2048 + }, + { + "epoch": 1.705604883462819, + "grad_norm": 0.3312320411205292, + "learning_rate": 4.675603233692137e-06, + "loss": 0.4054, + "step": 2049 + }, + { + "epoch": 1.7064372918978914, + "grad_norm": 0.33208805322647095, + "learning_rate": 4.670768448939737e-06, + "loss": 0.3949, + "step": 2050 + }, + { + "epoch": 1.7072697003329633, + "grad_norm": 0.3596723973751068, + "learning_rate": 4.665933973341527e-06, + "loss": 0.4395, + "step": 2051 + }, + { + "epoch": 1.7081021087680355, + "grad_norm": 0.3322398066520691, + "learning_rate": 4.661099811437166e-06, + "loss": 0.399, + "step": 2052 + }, + { + "epoch": 1.7089345172031076, + "grad_norm": 0.3125096261501312, + "learning_rate": 4.656265967766014e-06, + "loss": 0.4142, + "step": 2053 + }, + { + "epoch": 1.7097669256381798, + "grad_norm": 0.3498634696006775, + "learning_rate": 4.651432446867139e-06, + "loss": 0.4149, + "step": 2054 + }, + { + "epoch": 1.710599334073252, + "grad_norm": 0.34815841913223267, + "learning_rate": 4.646599253279299e-06, + "loss": 0.4006, + "step": 2055 + }, + { + "epoch": 1.711431742508324, + "grad_norm": 0.3256421983242035, + "learning_rate": 4.641766391540949e-06, + "loss": 0.4225, + "step": 2056 + }, + { + "epoch": 1.7122641509433962, + "grad_norm": 0.3041287362575531, + "learning_rate": 4.636933866190228e-06, + "loss": 0.4198, + "step": 2057 + }, + { + "epoch": 1.7130965593784684, + "grad_norm": 0.3252321481704712, + "learning_rate": 4.632101681764964e-06, + "loss": 0.4309, + "step": 2058 + }, + { + "epoch": 1.7139289678135405, + "grad_norm": 0.3307684361934662, + "learning_rate": 4.627269842802664e-06, + "loss": 0.4331, + "step": 2059 + }, + { + "epoch": 1.7147613762486127, + "grad_norm": 0.34094369411468506, + "learning_rate": 4.622438353840506e-06, + "loss": 0.4036, + "step": 2060 + }, + { + "epoch": 1.7155937846836848, + "grad_norm": 0.31734779477119446, + "learning_rate": 4.617607219415346e-06, + "loss": 0.4096, + "step": 2061 + }, + { + "epoch": 1.7164261931187568, + "grad_norm": 0.3557928502559662, + "learning_rate": 4.6127764440637e-06, + "loss": 0.4343, + "step": 2062 + }, + { + "epoch": 1.7172586015538291, + "grad_norm": 0.35101959109306335, + "learning_rate": 4.607946032321755e-06, + "loss": 0.418, + "step": 2063 + }, + { + "epoch": 1.718091009988901, + "grad_norm": 0.35029613971710205, + "learning_rate": 4.603115988725351e-06, + "loss": 0.4141, + "step": 2064 + }, + { + "epoch": 1.7189234184239734, + "grad_norm": 0.333061546087265, + "learning_rate": 4.598286317809983e-06, + "loss": 0.4064, + "step": 2065 + }, + { + "epoch": 1.7197558268590454, + "grad_norm": 0.32927843928337097, + "learning_rate": 4.593457024110792e-06, + "loss": 0.4149, + "step": 2066 + }, + { + "epoch": 1.7205882352941178, + "grad_norm": 0.33346185088157654, + "learning_rate": 4.588628112162578e-06, + "loss": 0.4092, + "step": 2067 + }, + { + "epoch": 1.7214206437291897, + "grad_norm": 0.32560572028160095, + "learning_rate": 4.583799586499768e-06, + "loss": 0.4182, + "step": 2068 + }, + { + "epoch": 1.722253052164262, + "grad_norm": 0.3583771586418152, + "learning_rate": 4.578971451656435e-06, + "loss": 0.4472, + "step": 2069 + }, + { + "epoch": 1.723085460599334, + "grad_norm": 0.3493313491344452, + "learning_rate": 4.574143712166279e-06, + "loss": 0.4249, + "step": 2070 + }, + { + "epoch": 1.7239178690344064, + "grad_norm": 0.3568575978279114, + "learning_rate": 4.569316372562634e-06, + "loss": 0.4193, + "step": 2071 + }, + { + "epoch": 1.7247502774694783, + "grad_norm": 0.33443883061408997, + "learning_rate": 4.564489437378457e-06, + "loss": 0.4556, + "step": 2072 + }, + { + "epoch": 1.7255826859045507, + "grad_norm": 0.31167763471603394, + "learning_rate": 4.559662911146324e-06, + "loss": 0.3944, + "step": 2073 + }, + { + "epoch": 1.7264150943396226, + "grad_norm": 0.32042884826660156, + "learning_rate": 4.554836798398425e-06, + "loss": 0.4126, + "step": 2074 + }, + { + "epoch": 1.7272475027746947, + "grad_norm": 0.33394762873649597, + "learning_rate": 4.550011103666568e-06, + "loss": 0.4307, + "step": 2075 + }, + { + "epoch": 1.728079911209767, + "grad_norm": 0.3465491831302643, + "learning_rate": 4.545185831482166e-06, + "loss": 0.4582, + "step": 2076 + }, + { + "epoch": 1.728912319644839, + "grad_norm": 0.30535024404525757, + "learning_rate": 4.5403609863762295e-06, + "loss": 0.3633, + "step": 2077 + }, + { + "epoch": 1.7297447280799112, + "grad_norm": 0.3142050802707672, + "learning_rate": 4.535536572879376e-06, + "loss": 0.4242, + "step": 2078 + }, + { + "epoch": 1.7305771365149833, + "grad_norm": 0.3482121229171753, + "learning_rate": 4.53071259552181e-06, + "loss": 0.447, + "step": 2079 + }, + { + "epoch": 1.7314095449500555, + "grad_norm": 0.3179130256175995, + "learning_rate": 4.525889058833337e-06, + "loss": 0.4052, + "step": 2080 + }, + { + "epoch": 1.7322419533851277, + "grad_norm": 0.2993527352809906, + "learning_rate": 4.5210659673433386e-06, + "loss": 0.3627, + "step": 2081 + }, + { + "epoch": 1.7330743618201998, + "grad_norm": 0.3593730032444, + "learning_rate": 4.516243325580782e-06, + "loss": 0.4697, + "step": 2082 + }, + { + "epoch": 1.733906770255272, + "grad_norm": 0.3211425542831421, + "learning_rate": 4.511421138074213e-06, + "loss": 0.401, + "step": 2083 + }, + { + "epoch": 1.734739178690344, + "grad_norm": 0.33504122495651245, + "learning_rate": 4.50659940935175e-06, + "loss": 0.4107, + "step": 2084 + }, + { + "epoch": 1.7355715871254163, + "grad_norm": 0.36016252636909485, + "learning_rate": 4.5017781439410806e-06, + "loss": 0.4275, + "step": 2085 + }, + { + "epoch": 1.7364039955604884, + "grad_norm": 0.3467753827571869, + "learning_rate": 4.496957346369458e-06, + "loss": 0.4255, + "step": 2086 + }, + { + "epoch": 1.7372364039955603, + "grad_norm": 0.37390729784965515, + "learning_rate": 4.492137021163694e-06, + "loss": 0.4342, + "step": 2087 + }, + { + "epoch": 1.7380688124306327, + "grad_norm": 0.38485151529312134, + "learning_rate": 4.4873171728501604e-06, + "loss": 0.3993, + "step": 2088 + }, + { + "epoch": 1.7389012208657046, + "grad_norm": 0.37274548411369324, + "learning_rate": 4.482497805954779e-06, + "loss": 0.4139, + "step": 2089 + }, + { + "epoch": 1.739733629300777, + "grad_norm": 0.3811771273612976, + "learning_rate": 4.477678925003018e-06, + "loss": 0.4109, + "step": 2090 + }, + { + "epoch": 1.740566037735849, + "grad_norm": 0.37200403213500977, + "learning_rate": 4.472860534519893e-06, + "loss": 0.4366, + "step": 2091 + }, + { + "epoch": 1.7413984461709213, + "grad_norm": 0.3533194065093994, + "learning_rate": 4.468042639029952e-06, + "loss": 0.3646, + "step": 2092 + }, + { + "epoch": 1.7422308546059933, + "grad_norm": 0.3777799904346466, + "learning_rate": 4.463225243057292e-06, + "loss": 0.4576, + "step": 2093 + }, + { + "epoch": 1.7430632630410656, + "grad_norm": 0.34515514969825745, + "learning_rate": 4.458408351125525e-06, + "loss": 0.421, + "step": 2094 + }, + { + "epoch": 1.7438956714761376, + "grad_norm": 0.340119868516922, + "learning_rate": 4.453591967757801e-06, + "loss": 0.41, + "step": 2095 + }, + { + "epoch": 1.74472807991121, + "grad_norm": 0.3491295576095581, + "learning_rate": 4.4487760974767835e-06, + "loss": 0.4169, + "step": 2096 + }, + { + "epoch": 1.7455604883462819, + "grad_norm": 0.35444924235343933, + "learning_rate": 4.4439607448046636e-06, + "loss": 0.4326, + "step": 2097 + }, + { + "epoch": 1.746392896781354, + "grad_norm": 0.36455461382865906, + "learning_rate": 4.43914591426314e-06, + "loss": 0.3829, + "step": 2098 + }, + { + "epoch": 1.7472253052164262, + "grad_norm": 0.3415604531764984, + "learning_rate": 4.434331610373424e-06, + "loss": 0.4204, + "step": 2099 + }, + { + "epoch": 1.7480577136514983, + "grad_norm": 0.3621603846549988, + "learning_rate": 4.4295178376562285e-06, + "loss": 0.4484, + "step": 2100 + }, + { + "epoch": 1.7488901220865705, + "grad_norm": 0.3243803381919861, + "learning_rate": 4.424704600631774e-06, + "loss": 0.3944, + "step": 2101 + }, + { + "epoch": 1.7497225305216426, + "grad_norm": 0.32423099875450134, + "learning_rate": 4.419891903819773e-06, + "loss": 0.3849, + "step": 2102 + }, + { + "epoch": 1.7505549389567148, + "grad_norm": 0.35436517000198364, + "learning_rate": 4.4150797517394295e-06, + "loss": 0.4817, + "step": 2103 + }, + { + "epoch": 1.751387347391787, + "grad_norm": 0.3532561659812927, + "learning_rate": 4.410268148909441e-06, + "loss": 0.448, + "step": 2104 + }, + { + "epoch": 1.752219755826859, + "grad_norm": 0.360914409160614, + "learning_rate": 4.405457099847986e-06, + "loss": 0.4412, + "step": 2105 + }, + { + "epoch": 1.7530521642619312, + "grad_norm": 0.34743252396583557, + "learning_rate": 4.400646609072727e-06, + "loss": 0.4094, + "step": 2106 + }, + { + "epoch": 1.7538845726970034, + "grad_norm": 0.3823837339878082, + "learning_rate": 4.395836681100794e-06, + "loss": 0.4252, + "step": 2107 + }, + { + "epoch": 1.7547169811320755, + "grad_norm": 0.36400553584098816, + "learning_rate": 4.391027320448798e-06, + "loss": 0.4383, + "step": 2108 + }, + { + "epoch": 1.7555493895671477, + "grad_norm": 0.3543497920036316, + "learning_rate": 4.386218531632808e-06, + "loss": 0.3985, + "step": 2109 + }, + { + "epoch": 1.7563817980022196, + "grad_norm": 0.33355042338371277, + "learning_rate": 4.3814103191683655e-06, + "loss": 0.447, + "step": 2110 + }, + { + "epoch": 1.757214206437292, + "grad_norm": 0.31347766518592834, + "learning_rate": 4.376602687570464e-06, + "loss": 0.4034, + "step": 2111 + }, + { + "epoch": 1.758046614872364, + "grad_norm": 0.3570155203342438, + "learning_rate": 4.371795641353555e-06, + "loss": 0.3879, + "step": 2112 + }, + { + "epoch": 1.7588790233074363, + "grad_norm": 0.38342198729515076, + "learning_rate": 4.366989185031536e-06, + "loss": 0.4377, + "step": 2113 + }, + { + "epoch": 1.7597114317425082, + "grad_norm": 0.3486819565296173, + "learning_rate": 4.362183323117757e-06, + "loss": 0.4352, + "step": 2114 + }, + { + "epoch": 1.7605438401775806, + "grad_norm": 0.3399284780025482, + "learning_rate": 4.357378060125007e-06, + "loss": 0.4193, + "step": 2115 + }, + { + "epoch": 1.7613762486126525, + "grad_norm": 0.36731529235839844, + "learning_rate": 4.3525734005655085e-06, + "loss": 0.4648, + "step": 2116 + }, + { + "epoch": 1.762208657047725, + "grad_norm": 0.3027511537075043, + "learning_rate": 4.347769348950922e-06, + "loss": 0.4082, + "step": 2117 + }, + { + "epoch": 1.7630410654827968, + "grad_norm": 0.31376221776008606, + "learning_rate": 4.342965909792338e-06, + "loss": 0.4031, + "step": 2118 + }, + { + "epoch": 1.7638734739178692, + "grad_norm": 0.31292369961738586, + "learning_rate": 4.338163087600271e-06, + "loss": 0.4102, + "step": 2119 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.29499831795692444, + "learning_rate": 4.33336088688465e-06, + "loss": 0.3865, + "step": 2120 + }, + { + "epoch": 1.7655382907880133, + "grad_norm": 0.34691059589385986, + "learning_rate": 4.328559312154831e-06, + "loss": 0.4631, + "step": 2121 + }, + { + "epoch": 1.7663706992230854, + "grad_norm": 0.32868635654449463, + "learning_rate": 4.323758367919572e-06, + "loss": 0.4256, + "step": 2122 + }, + { + "epoch": 1.7672031076581576, + "grad_norm": 0.33149829506874084, + "learning_rate": 4.318958058687047e-06, + "loss": 0.4242, + "step": 2123 + }, + { + "epoch": 1.7680355160932297, + "grad_norm": 0.3130408823490143, + "learning_rate": 4.31415838896483e-06, + "loss": 0.4296, + "step": 2124 + }, + { + "epoch": 1.7688679245283019, + "grad_norm": 0.3201256990432739, + "learning_rate": 4.309359363259895e-06, + "loss": 0.4092, + "step": 2125 + }, + { + "epoch": 1.769700332963374, + "grad_norm": 0.3389187455177307, + "learning_rate": 4.304560986078609e-06, + "loss": 0.4557, + "step": 2126 + }, + { + "epoch": 1.7705327413984462, + "grad_norm": 0.32760846614837646, + "learning_rate": 4.299763261926734e-06, + "loss": 0.4082, + "step": 2127 + }, + { + "epoch": 1.7713651498335183, + "grad_norm": 0.3498232960700989, + "learning_rate": 4.294966195309418e-06, + "loss": 0.4329, + "step": 2128 + }, + { + "epoch": 1.7721975582685905, + "grad_norm": 0.3230178952217102, + "learning_rate": 4.2901697907311876e-06, + "loss": 0.4315, + "step": 2129 + }, + { + "epoch": 1.7730299667036626, + "grad_norm": 0.3200167715549469, + "learning_rate": 4.285374052695953e-06, + "loss": 0.3921, + "step": 2130 + }, + { + "epoch": 1.7738623751387348, + "grad_norm": 0.3524058163166046, + "learning_rate": 4.280578985706995e-06, + "loss": 0.4731, + "step": 2131 + }, + { + "epoch": 1.774694783573807, + "grad_norm": 0.2892106771469116, + "learning_rate": 4.2757845942669674e-06, + "loss": 0.3912, + "step": 2132 + }, + { + "epoch": 1.7755271920088789, + "grad_norm": 0.3491550087928772, + "learning_rate": 4.270990882877885e-06, + "loss": 0.4164, + "step": 2133 + }, + { + "epoch": 1.7763596004439512, + "grad_norm": 0.3432190418243408, + "learning_rate": 4.2661978560411274e-06, + "loss": 0.4305, + "step": 2134 + }, + { + "epoch": 1.7771920088790232, + "grad_norm": 0.2854191064834595, + "learning_rate": 4.261405518257434e-06, + "loss": 0.3578, + "step": 2135 + }, + { + "epoch": 1.7780244173140956, + "grad_norm": 0.36918261647224426, + "learning_rate": 4.25661387402689e-06, + "loss": 0.4849, + "step": 2136 + }, + { + "epoch": 1.7788568257491675, + "grad_norm": 0.3176991939544678, + "learning_rate": 4.251822927848934e-06, + "loss": 0.3844, + "step": 2137 + }, + { + "epoch": 1.7796892341842399, + "grad_norm": 0.3359120488166809, + "learning_rate": 4.24703268422235e-06, + "loss": 0.4387, + "step": 2138 + }, + { + "epoch": 1.7805216426193118, + "grad_norm": 0.3274317681789398, + "learning_rate": 4.242243147645257e-06, + "loss": 0.4009, + "step": 2139 + }, + { + "epoch": 1.7813540510543842, + "grad_norm": 0.3508370518684387, + "learning_rate": 4.237454322615118e-06, + "loss": 0.4086, + "step": 2140 + }, + { + "epoch": 1.782186459489456, + "grad_norm": 0.3434849977493286, + "learning_rate": 4.232666213628722e-06, + "loss": 0.4478, + "step": 2141 + }, + { + "epoch": 1.7830188679245285, + "grad_norm": 0.32151880860328674, + "learning_rate": 4.227878825182186e-06, + "loss": 0.3955, + "step": 2142 + }, + { + "epoch": 1.7838512763596004, + "grad_norm": 0.40656089782714844, + "learning_rate": 4.223092161770952e-06, + "loss": 0.4138, + "step": 2143 + }, + { + "epoch": 1.7846836847946725, + "grad_norm": 0.3283717930316925, + "learning_rate": 4.218306227889782e-06, + "loss": 0.4156, + "step": 2144 + }, + { + "epoch": 1.7855160932297447, + "grad_norm": 0.32961905002593994, + "learning_rate": 4.213521028032751e-06, + "loss": 0.3817, + "step": 2145 + }, + { + "epoch": 1.7863485016648168, + "grad_norm": 0.35188883543014526, + "learning_rate": 4.2087365666932456e-06, + "loss": 0.4086, + "step": 2146 + }, + { + "epoch": 1.787180910099889, + "grad_norm": 0.36771219968795776, + "learning_rate": 4.203952848363957e-06, + "loss": 0.4582, + "step": 2147 + }, + { + "epoch": 1.7880133185349611, + "grad_norm": 0.305984765291214, + "learning_rate": 4.199169877536884e-06, + "loss": 0.371, + "step": 2148 + }, + { + "epoch": 1.7888457269700333, + "grad_norm": 0.3949022591114044, + "learning_rate": 4.194387658703317e-06, + "loss": 0.4562, + "step": 2149 + }, + { + "epoch": 1.7896781354051055, + "grad_norm": 0.3683740496635437, + "learning_rate": 4.189606196353844e-06, + "loss": 0.4009, + "step": 2150 + }, + { + "epoch": 1.7905105438401776, + "grad_norm": 0.32549092173576355, + "learning_rate": 4.184825494978342e-06, + "loss": 0.4445, + "step": 2151 + }, + { + "epoch": 1.7913429522752498, + "grad_norm": 0.35574018955230713, + "learning_rate": 4.180045559065974e-06, + "loss": 0.4337, + "step": 2152 + }, + { + "epoch": 1.792175360710322, + "grad_norm": 0.3322107195854187, + "learning_rate": 4.175266393105183e-06, + "loss": 0.3973, + "step": 2153 + }, + { + "epoch": 1.793007769145394, + "grad_norm": 0.3085898458957672, + "learning_rate": 4.1704880015836905e-06, + "loss": 0.4016, + "step": 2154 + }, + { + "epoch": 1.7938401775804662, + "grad_norm": 0.3286598026752472, + "learning_rate": 4.165710388988487e-06, + "loss": 0.4377, + "step": 2155 + }, + { + "epoch": 1.7946725860155381, + "grad_norm": 0.32184141874313354, + "learning_rate": 4.1609335598058355e-06, + "loss": 0.3827, + "step": 2156 + }, + { + "epoch": 1.7955049944506105, + "grad_norm": 0.3248361051082611, + "learning_rate": 4.156157518521264e-06, + "loss": 0.4447, + "step": 2157 + }, + { + "epoch": 1.7963374028856824, + "grad_norm": 0.35561513900756836, + "learning_rate": 4.151382269619558e-06, + "loss": 0.4322, + "step": 2158 + }, + { + "epoch": 1.7971698113207548, + "grad_norm": 0.33568158745765686, + "learning_rate": 4.146607817584759e-06, + "loss": 0.417, + "step": 2159 + }, + { + "epoch": 1.7980022197558267, + "grad_norm": 0.325347363948822, + "learning_rate": 4.14183416690016e-06, + "loss": 0.4263, + "step": 2160 + }, + { + "epoch": 1.7988346281908991, + "grad_norm": 0.31573471426963806, + "learning_rate": 4.137061322048307e-06, + "loss": 0.4142, + "step": 2161 + }, + { + "epoch": 1.799667036625971, + "grad_norm": 0.33735018968582153, + "learning_rate": 4.13228928751098e-06, + "loss": 0.4247, + "step": 2162 + }, + { + "epoch": 1.8004994450610434, + "grad_norm": 0.3238171339035034, + "learning_rate": 4.127518067769206e-06, + "loss": 0.3978, + "step": 2163 + }, + { + "epoch": 1.8013318534961154, + "grad_norm": 0.33231449127197266, + "learning_rate": 4.122747667303242e-06, + "loss": 0.4361, + "step": 2164 + }, + { + "epoch": 1.8021642619311877, + "grad_norm": 0.32602575421333313, + "learning_rate": 4.11797809059258e-06, + "loss": 0.3914, + "step": 2165 + }, + { + "epoch": 1.8029966703662597, + "grad_norm": 0.29990747570991516, + "learning_rate": 4.1132093421159335e-06, + "loss": 0.3839, + "step": 2166 + }, + { + "epoch": 1.8038290788013318, + "grad_norm": 0.3527059257030487, + "learning_rate": 4.108441426351243e-06, + "loss": 0.4338, + "step": 2167 + }, + { + "epoch": 1.804661487236404, + "grad_norm": 0.3696962893009186, + "learning_rate": 4.103674347775663e-06, + "loss": 0.4063, + "step": 2168 + }, + { + "epoch": 1.8054938956714761, + "grad_norm": 0.3006901144981384, + "learning_rate": 4.098908110865563e-06, + "loss": 0.3799, + "step": 2169 + }, + { + "epoch": 1.8063263041065483, + "grad_norm": 0.350769579410553, + "learning_rate": 4.094142720096526e-06, + "loss": 0.4381, + "step": 2170 + }, + { + "epoch": 1.8071587125416204, + "grad_norm": 0.3283845782279968, + "learning_rate": 4.089378179943336e-06, + "loss": 0.4216, + "step": 2171 + }, + { + "epoch": 1.8079911209766926, + "grad_norm": 0.2863730490207672, + "learning_rate": 4.084614494879979e-06, + "loss": 0.4155, + "step": 2172 + }, + { + "epoch": 1.8088235294117647, + "grad_norm": 0.3665149211883545, + "learning_rate": 4.079851669379638e-06, + "loss": 0.4351, + "step": 2173 + }, + { + "epoch": 1.8096559378468369, + "grad_norm": 0.39417773485183716, + "learning_rate": 4.0750897079146924e-06, + "loss": 0.4559, + "step": 2174 + }, + { + "epoch": 1.810488346281909, + "grad_norm": 0.3096025288105011, + "learning_rate": 4.070328614956705e-06, + "loss": 0.4012, + "step": 2175 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 0.3567507863044739, + "learning_rate": 4.065568394976426e-06, + "loss": 0.416, + "step": 2176 + }, + { + "epoch": 1.8121531631520533, + "grad_norm": 0.29146334528923035, + "learning_rate": 4.060809052443784e-06, + "loss": 0.3917, + "step": 2177 + }, + { + "epoch": 1.8129855715871255, + "grad_norm": 0.3747054636478424, + "learning_rate": 4.056050591827888e-06, + "loss": 0.4189, + "step": 2178 + }, + { + "epoch": 1.8138179800221974, + "grad_norm": 0.3492047190666199, + "learning_rate": 4.051293017597014e-06, + "loss": 0.4179, + "step": 2179 + }, + { + "epoch": 1.8146503884572698, + "grad_norm": 0.3406948745250702, + "learning_rate": 4.046536334218609e-06, + "loss": 0.421, + "step": 2180 + }, + { + "epoch": 1.8154827968923417, + "grad_norm": 0.34447532892227173, + "learning_rate": 4.0417805461592764e-06, + "loss": 0.4349, + "step": 2181 + }, + { + "epoch": 1.816315205327414, + "grad_norm": 0.3566656708717346, + "learning_rate": 4.037025657884793e-06, + "loss": 0.4326, + "step": 2182 + }, + { + "epoch": 1.817147613762486, + "grad_norm": 0.3476259112358093, + "learning_rate": 4.032271673860077e-06, + "loss": 0.4038, + "step": 2183 + }, + { + "epoch": 1.8179800221975584, + "grad_norm": 0.3235064446926117, + "learning_rate": 4.0275185985492025e-06, + "loss": 0.412, + "step": 2184 + }, + { + "epoch": 1.8188124306326303, + "grad_norm": 0.37167060375213623, + "learning_rate": 4.022766436415392e-06, + "loss": 0.4514, + "step": 2185 + }, + { + "epoch": 1.8196448390677027, + "grad_norm": 0.31849905848503113, + "learning_rate": 4.018015191921008e-06, + "loss": 0.4254, + "step": 2186 + }, + { + "epoch": 1.8204772475027746, + "grad_norm": 0.2981938123703003, + "learning_rate": 4.013264869527553e-06, + "loss": 0.379, + "step": 2187 + }, + { + "epoch": 1.821309655937847, + "grad_norm": 0.36852484941482544, + "learning_rate": 4.008515473695663e-06, + "loss": 0.4213, + "step": 2188 + }, + { + "epoch": 1.822142064372919, + "grad_norm": 0.37913084030151367, + "learning_rate": 4.003767008885102e-06, + "loss": 0.4507, + "step": 2189 + }, + { + "epoch": 1.822974472807991, + "grad_norm": 0.30263346433639526, + "learning_rate": 3.999019479554764e-06, + "loss": 0.3763, + "step": 2190 + }, + { + "epoch": 1.8238068812430632, + "grad_norm": 0.33267539739608765, + "learning_rate": 3.9942728901626605e-06, + "loss": 0.4024, + "step": 2191 + }, + { + "epoch": 1.8246392896781354, + "grad_norm": 0.32982137799263, + "learning_rate": 3.989527245165924e-06, + "loss": 0.4079, + "step": 2192 + }, + { + "epoch": 1.8254716981132075, + "grad_norm": 0.35067951679229736, + "learning_rate": 3.984782549020797e-06, + "loss": 0.4182, + "step": 2193 + }, + { + "epoch": 1.8263041065482797, + "grad_norm": 0.328118234872818, + "learning_rate": 3.980038806182629e-06, + "loss": 0.4588, + "step": 2194 + }, + { + "epoch": 1.8271365149833518, + "grad_norm": 0.37364038825035095, + "learning_rate": 3.975296021105885e-06, + "loss": 0.4546, + "step": 2195 + }, + { + "epoch": 1.827968923418424, + "grad_norm": 0.3203776180744171, + "learning_rate": 3.970554198244116e-06, + "loss": 0.4091, + "step": 2196 + }, + { + "epoch": 1.8288013318534961, + "grad_norm": 0.36616021394729614, + "learning_rate": 3.965813342049983e-06, + "loss": 0.4341, + "step": 2197 + }, + { + "epoch": 1.8296337402885683, + "grad_norm": 0.3346719741821289, + "learning_rate": 3.961073456975227e-06, + "loss": 0.4067, + "step": 2198 + }, + { + "epoch": 1.8304661487236404, + "grad_norm": 0.3459213674068451, + "learning_rate": 3.956334547470686e-06, + "loss": 0.3825, + "step": 2199 + }, + { + "epoch": 1.8312985571587126, + "grad_norm": 0.336300253868103, + "learning_rate": 3.95159661798628e-06, + "loss": 0.4056, + "step": 2200 + }, + { + "epoch": 1.8321309655937847, + "grad_norm": 0.3746432662010193, + "learning_rate": 3.946859672971006e-06, + "loss": 0.4649, + "step": 2201 + }, + { + "epoch": 1.8329633740288567, + "grad_norm": 0.3622547388076782, + "learning_rate": 3.9421237168729386e-06, + "loss": 0.3967, + "step": 2202 + }, + { + "epoch": 1.833795782463929, + "grad_norm": 0.3507779836654663, + "learning_rate": 3.937388754139223e-06, + "loss": 0.4041, + "step": 2203 + }, + { + "epoch": 1.834628190899001, + "grad_norm": 0.37171730399131775, + "learning_rate": 3.9326547892160746e-06, + "loss": 0.4587, + "step": 2204 + }, + { + "epoch": 1.8354605993340734, + "grad_norm": 0.3501371443271637, + "learning_rate": 3.927921826548767e-06, + "loss": 0.4174, + "step": 2205 + }, + { + "epoch": 1.8362930077691453, + "grad_norm": 0.3328426778316498, + "learning_rate": 3.923189870581636e-06, + "loss": 0.4201, + "step": 2206 + }, + { + "epoch": 1.8371254162042177, + "grad_norm": 0.3560123145580292, + "learning_rate": 3.918458925758068e-06, + "loss": 0.4023, + "step": 2207 + }, + { + "epoch": 1.8379578246392896, + "grad_norm": 0.35025811195373535, + "learning_rate": 3.9137289965205086e-06, + "loss": 0.4353, + "step": 2208 + }, + { + "epoch": 1.838790233074362, + "grad_norm": 0.29967784881591797, + "learning_rate": 3.909000087310441e-06, + "loss": 0.4074, + "step": 2209 + }, + { + "epoch": 1.8396226415094339, + "grad_norm": 0.35869383811950684, + "learning_rate": 3.9042722025683945e-06, + "loss": 0.3998, + "step": 2210 + }, + { + "epoch": 1.8404550499445063, + "grad_norm": 0.3381917178630829, + "learning_rate": 3.899545346733933e-06, + "loss": 0.4623, + "step": 2211 + }, + { + "epoch": 1.8412874583795782, + "grad_norm": 0.30779141187667847, + "learning_rate": 3.894819524245661e-06, + "loss": 0.395, + "step": 2212 + }, + { + "epoch": 1.8421198668146503, + "grad_norm": 0.35666483640670776, + "learning_rate": 3.890094739541207e-06, + "loss": 0.3997, + "step": 2213 + }, + { + "epoch": 1.8429522752497225, + "grad_norm": 0.3682236969470978, + "learning_rate": 3.885370997057225e-06, + "loss": 0.4708, + "step": 2214 + }, + { + "epoch": 1.8437846836847946, + "grad_norm": 0.3033082187175751, + "learning_rate": 3.880648301229394e-06, + "loss": 0.368, + "step": 2215 + }, + { + "epoch": 1.8446170921198668, + "grad_norm": 0.335415244102478, + "learning_rate": 3.875926656492406e-06, + "loss": 0.4432, + "step": 2216 + }, + { + "epoch": 1.845449500554939, + "grad_norm": 0.35151904821395874, + "learning_rate": 3.871206067279971e-06, + "loss": 0.4582, + "step": 2217 + }, + { + "epoch": 1.846281908990011, + "grad_norm": 0.32559242844581604, + "learning_rate": 3.866486538024802e-06, + "loss": 0.4067, + "step": 2218 + }, + { + "epoch": 1.8471143174250833, + "grad_norm": 0.3076154589653015, + "learning_rate": 3.861768073158623e-06, + "loss": 0.4061, + "step": 2219 + }, + { + "epoch": 1.8479467258601554, + "grad_norm": 0.29312387108802795, + "learning_rate": 3.8570506771121484e-06, + "loss": 0.3832, + "step": 2220 + }, + { + "epoch": 1.8487791342952276, + "grad_norm": 0.3544323742389679, + "learning_rate": 3.852334354315104e-06, + "loss": 0.4602, + "step": 2221 + }, + { + "epoch": 1.8496115427302997, + "grad_norm": 0.3405257761478424, + "learning_rate": 3.847619109196195e-06, + "loss": 0.4202, + "step": 2222 + }, + { + "epoch": 1.8504439511653719, + "grad_norm": 0.35631346702575684, + "learning_rate": 3.842904946183121e-06, + "loss": 0.4349, + "step": 2223 + }, + { + "epoch": 1.851276359600444, + "grad_norm": 0.35186129808425903, + "learning_rate": 3.83819186970256e-06, + "loss": 0.4067, + "step": 2224 + }, + { + "epoch": 1.852108768035516, + "grad_norm": 0.3434910476207733, + "learning_rate": 3.833479884180177e-06, + "loss": 0.4053, + "step": 2225 + }, + { + "epoch": 1.8529411764705883, + "grad_norm": 0.3569307029247284, + "learning_rate": 3.828768994040608e-06, + "loss": 0.4356, + "step": 2226 + }, + { + "epoch": 1.8537735849056602, + "grad_norm": 0.33176884055137634, + "learning_rate": 3.824059203707461e-06, + "loss": 0.4422, + "step": 2227 + }, + { + "epoch": 1.8546059933407326, + "grad_norm": 0.2953312397003174, + "learning_rate": 3.81935051760331e-06, + "loss": 0.3875, + "step": 2228 + }, + { + "epoch": 1.8554384017758045, + "grad_norm": 0.336987167596817, + "learning_rate": 3.8146429401496963e-06, + "loss": 0.4396, + "step": 2229 + }, + { + "epoch": 1.856270810210877, + "grad_norm": 0.3004775643348694, + "learning_rate": 3.8099364757671188e-06, + "loss": 0.4045, + "step": 2230 + }, + { + "epoch": 1.8571032186459488, + "grad_norm": 0.3074621856212616, + "learning_rate": 3.8052311288750255e-06, + "loss": 0.3944, + "step": 2231 + }, + { + "epoch": 1.8579356270810212, + "grad_norm": 0.31900620460510254, + "learning_rate": 3.800526903891823e-06, + "loss": 0.4074, + "step": 2232 + }, + { + "epoch": 1.8587680355160932, + "grad_norm": 0.3192499876022339, + "learning_rate": 3.795823805234857e-06, + "loss": 0.4026, + "step": 2233 + }, + { + "epoch": 1.8596004439511655, + "grad_norm": 0.3406783640384674, + "learning_rate": 3.791121837320425e-06, + "loss": 0.4335, + "step": 2234 + }, + { + "epoch": 1.8604328523862375, + "grad_norm": 0.33335891366004944, + "learning_rate": 3.786421004563753e-06, + "loss": 0.4305, + "step": 2235 + }, + { + "epoch": 1.8612652608213096, + "grad_norm": 0.30208635330200195, + "learning_rate": 3.7817213113790088e-06, + "loss": 0.4136, + "step": 2236 + }, + { + "epoch": 1.8620976692563818, + "grad_norm": 0.3383508026599884, + "learning_rate": 3.7770227621792815e-06, + "loss": 0.4167, + "step": 2237 + }, + { + "epoch": 1.862930077691454, + "grad_norm": 0.32437726855278015, + "learning_rate": 3.7723253613765954e-06, + "loss": 0.4547, + "step": 2238 + }, + { + "epoch": 1.863762486126526, + "grad_norm": 0.3173195719718933, + "learning_rate": 3.767629113381891e-06, + "loss": 0.4056, + "step": 2239 + }, + { + "epoch": 1.8645948945615982, + "grad_norm": 0.3287826180458069, + "learning_rate": 3.762934022605027e-06, + "loss": 0.4125, + "step": 2240 + }, + { + "epoch": 1.8654273029966704, + "grad_norm": 0.33371874690055847, + "learning_rate": 3.758240093454775e-06, + "loss": 0.4262, + "step": 2241 + }, + { + "epoch": 1.8662597114317425, + "grad_norm": 0.3588714599609375, + "learning_rate": 3.7535473303388175e-06, + "loss": 0.3972, + "step": 2242 + }, + { + "epoch": 1.8670921198668147, + "grad_norm": 0.32598569989204407, + "learning_rate": 3.7488557376637436e-06, + "loss": 0.3772, + "step": 2243 + }, + { + "epoch": 1.8679245283018868, + "grad_norm": 0.3502293825149536, + "learning_rate": 3.744165319835037e-06, + "loss": 0.4538, + "step": 2244 + }, + { + "epoch": 1.868756936736959, + "grad_norm": 0.31929466128349304, + "learning_rate": 3.739476081257085e-06, + "loss": 0.4287, + "step": 2245 + }, + { + "epoch": 1.8695893451720311, + "grad_norm": 0.366178959608078, + "learning_rate": 3.7347880263331603e-06, + "loss": 0.442, + "step": 2246 + }, + { + "epoch": 1.8704217536071033, + "grad_norm": 0.40710756182670593, + "learning_rate": 3.730101159465435e-06, + "loss": 0.441, + "step": 2247 + }, + { + "epoch": 1.8712541620421752, + "grad_norm": 0.3121664226055145, + "learning_rate": 3.725415485054955e-06, + "loss": 0.3975, + "step": 2248 + }, + { + "epoch": 1.8720865704772476, + "grad_norm": 0.3380068242549896, + "learning_rate": 3.7207310075016533e-06, + "loss": 0.4533, + "step": 2249 + }, + { + "epoch": 1.8729189789123195, + "grad_norm": 0.35721099376678467, + "learning_rate": 3.716047731204332e-06, + "loss": 0.4422, + "step": 2250 + }, + { + "epoch": 1.8737513873473919, + "grad_norm": 0.35127249360084534, + "learning_rate": 3.711365660560674e-06, + "loss": 0.4263, + "step": 2251 + }, + { + "epoch": 1.8745837957824638, + "grad_norm": 0.36538636684417725, + "learning_rate": 3.706684799967224e-06, + "loss": 0.4003, + "step": 2252 + }, + { + "epoch": 1.8754162042175362, + "grad_norm": 0.3314541280269623, + "learning_rate": 3.702005153819391e-06, + "loss": 0.369, + "step": 2253 + }, + { + "epoch": 1.8762486126526081, + "grad_norm": 0.35317155718803406, + "learning_rate": 3.6973267265114456e-06, + "loss": 0.4318, + "step": 2254 + }, + { + "epoch": 1.8770810210876805, + "grad_norm": 0.397045373916626, + "learning_rate": 3.6926495224365124e-06, + "loss": 0.4304, + "step": 2255 + }, + { + "epoch": 1.8779134295227524, + "grad_norm": 0.34217485785484314, + "learning_rate": 3.6879735459865708e-06, + "loss": 0.3961, + "step": 2256 + }, + { + "epoch": 1.8787458379578248, + "grad_norm": 0.3433960974216461, + "learning_rate": 3.68329880155244e-06, + "loss": 0.4528, + "step": 2257 + }, + { + "epoch": 1.8795782463928967, + "grad_norm": 0.36350932717323303, + "learning_rate": 3.6786252935237886e-06, + "loss": 0.4229, + "step": 2258 + }, + { + "epoch": 1.8804106548279689, + "grad_norm": 0.3405548334121704, + "learning_rate": 3.6739530262891245e-06, + "loss": 0.4055, + "step": 2259 + }, + { + "epoch": 1.881243063263041, + "grad_norm": 0.3336378037929535, + "learning_rate": 3.669282004235787e-06, + "loss": 0.3886, + "step": 2260 + }, + { + "epoch": 1.8820754716981132, + "grad_norm": 0.3233022093772888, + "learning_rate": 3.6646122317499465e-06, + "loss": 0.4265, + "step": 2261 + }, + { + "epoch": 1.8829078801331853, + "grad_norm": 0.3442718982696533, + "learning_rate": 3.6599437132166036e-06, + "loss": 0.4322, + "step": 2262 + }, + { + "epoch": 1.8837402885682575, + "grad_norm": 0.3712242841720581, + "learning_rate": 3.655276453019575e-06, + "loss": 0.4382, + "step": 2263 + }, + { + "epoch": 1.8845726970033296, + "grad_norm": 0.3307363986968994, + "learning_rate": 3.650610455541504e-06, + "loss": 0.387, + "step": 2264 + }, + { + "epoch": 1.8854051054384018, + "grad_norm": 0.3208552896976471, + "learning_rate": 3.6459457251638423e-06, + "loss": 0.4071, + "step": 2265 + }, + { + "epoch": 1.886237513873474, + "grad_norm": 0.345205157995224, + "learning_rate": 3.641282266266853e-06, + "loss": 0.4206, + "step": 2266 + }, + { + "epoch": 1.887069922308546, + "grad_norm": 0.4148430824279785, + "learning_rate": 3.636620083229604e-06, + "loss": 0.445, + "step": 2267 + }, + { + "epoch": 1.8879023307436182, + "grad_norm": 0.33523353934288025, + "learning_rate": 3.6319591804299703e-06, + "loss": 0.389, + "step": 2268 + }, + { + "epoch": 1.8887347391786904, + "grad_norm": 0.3195513188838959, + "learning_rate": 3.6272995622446204e-06, + "loss": 0.4133, + "step": 2269 + }, + { + "epoch": 1.8895671476137625, + "grad_norm": 0.36299553513526917, + "learning_rate": 3.622641233049016e-06, + "loss": 0.4297, + "step": 2270 + }, + { + "epoch": 1.8903995560488345, + "grad_norm": 0.3698355555534363, + "learning_rate": 3.617984197217409e-06, + "loss": 0.4338, + "step": 2271 + }, + { + "epoch": 1.8912319644839068, + "grad_norm": 0.332303524017334, + "learning_rate": 3.6133284591228403e-06, + "loss": 0.3874, + "step": 2272 + }, + { + "epoch": 1.8920643729189788, + "grad_norm": 0.34952232241630554, + "learning_rate": 3.608674023137129e-06, + "loss": 0.425, + "step": 2273 + }, + { + "epoch": 1.8928967813540512, + "grad_norm": 0.3578547239303589, + "learning_rate": 3.6040208936308697e-06, + "loss": 0.4576, + "step": 2274 + }, + { + "epoch": 1.893729189789123, + "grad_norm": 0.3056720793247223, + "learning_rate": 3.599369074973433e-06, + "loss": 0.4044, + "step": 2275 + }, + { + "epoch": 1.8945615982241955, + "grad_norm": 0.3444676101207733, + "learning_rate": 3.5947185715329614e-06, + "loss": 0.3809, + "step": 2276 + }, + { + "epoch": 1.8953940066592674, + "grad_norm": 0.3490372896194458, + "learning_rate": 3.5900693876763556e-06, + "loss": 0.4221, + "step": 2277 + }, + { + "epoch": 1.8962264150943398, + "grad_norm": 0.3269912600517273, + "learning_rate": 3.585421527769283e-06, + "loss": 0.4139, + "step": 2278 + }, + { + "epoch": 1.8970588235294117, + "grad_norm": 0.3209238350391388, + "learning_rate": 3.580774996176162e-06, + "loss": 0.3649, + "step": 2279 + }, + { + "epoch": 1.897891231964484, + "grad_norm": 0.3572242259979248, + "learning_rate": 3.5761297972601695e-06, + "loss": 0.4546, + "step": 2280 + }, + { + "epoch": 1.898723640399556, + "grad_norm": 0.31195253133773804, + "learning_rate": 3.5714859353832286e-06, + "loss": 0.4141, + "step": 2281 + }, + { + "epoch": 1.8995560488346281, + "grad_norm": 0.3187641501426697, + "learning_rate": 3.5668434149060076e-06, + "loss": 0.4276, + "step": 2282 + }, + { + "epoch": 1.9003884572697003, + "grad_norm": 0.34391510486602783, + "learning_rate": 3.562202240187913e-06, + "loss": 0.4086, + "step": 2283 + }, + { + "epoch": 1.9012208657047724, + "grad_norm": 0.35638102889060974, + "learning_rate": 3.5575624155870885e-06, + "loss": 0.4511, + "step": 2284 + }, + { + "epoch": 1.9020532741398446, + "grad_norm": 0.30889418721199036, + "learning_rate": 3.552923945460413e-06, + "loss": 0.417, + "step": 2285 + }, + { + "epoch": 1.9028856825749167, + "grad_norm": 0.3231523036956787, + "learning_rate": 3.548286834163491e-06, + "loss": 0.3931, + "step": 2286 + }, + { + "epoch": 1.903718091009989, + "grad_norm": 0.3281930983066559, + "learning_rate": 3.543651086050649e-06, + "loss": 0.3971, + "step": 2287 + }, + { + "epoch": 1.904550499445061, + "grad_norm": 0.3393603563308716, + "learning_rate": 3.5390167054749363e-06, + "loss": 0.3878, + "step": 2288 + }, + { + "epoch": 1.9053829078801332, + "grad_norm": 0.33854469656944275, + "learning_rate": 3.5343836967881194e-06, + "loss": 0.4232, + "step": 2289 + }, + { + "epoch": 1.9062153163152054, + "grad_norm": 0.3317667245864868, + "learning_rate": 3.529752064340673e-06, + "loss": 0.3732, + "step": 2290 + }, + { + "epoch": 1.9070477247502775, + "grad_norm": 0.31549349427223206, + "learning_rate": 3.5251218124817803e-06, + "loss": 0.4203, + "step": 2291 + }, + { + "epoch": 1.9078801331853497, + "grad_norm": 0.3274786174297333, + "learning_rate": 3.5204929455593316e-06, + "loss": 0.4142, + "step": 2292 + }, + { + "epoch": 1.9087125416204218, + "grad_norm": 0.3583471477031708, + "learning_rate": 3.51586546791991e-06, + "loss": 0.4494, + "step": 2293 + }, + { + "epoch": 1.9095449500554937, + "grad_norm": 0.34534788131713867, + "learning_rate": 3.511239383908801e-06, + "loss": 0.4386, + "step": 2294 + }, + { + "epoch": 1.9103773584905661, + "grad_norm": 0.34133318066596985, + "learning_rate": 3.5066146978699785e-06, + "loss": 0.4227, + "step": 2295 + }, + { + "epoch": 1.911209766925638, + "grad_norm": 0.3159574568271637, + "learning_rate": 3.501991414146102e-06, + "loss": 0.42, + "step": 2296 + }, + { + "epoch": 1.9120421753607104, + "grad_norm": 0.321329265832901, + "learning_rate": 3.4973695370785154e-06, + "loss": 0.391, + "step": 2297 + }, + { + "epoch": 1.9128745837957823, + "grad_norm": 0.3425721526145935, + "learning_rate": 3.4927490710072454e-06, + "loss": 0.427, + "step": 2298 + }, + { + "epoch": 1.9137069922308547, + "grad_norm": 0.34851303696632385, + "learning_rate": 3.488130020270989e-06, + "loss": 0.4335, + "step": 2299 + }, + { + "epoch": 1.9145394006659266, + "grad_norm": 0.26636067032814026, + "learning_rate": 3.4835123892071145e-06, + "loss": 0.3935, + "step": 2300 + }, + { + "epoch": 1.915371809100999, + "grad_norm": 0.30461832880973816, + "learning_rate": 3.4788961821516576e-06, + "loss": 0.3859, + "step": 2301 + }, + { + "epoch": 1.916204217536071, + "grad_norm": 0.3343258500099182, + "learning_rate": 3.4742814034393224e-06, + "loss": 0.4512, + "step": 2302 + }, + { + "epoch": 1.9170366259711433, + "grad_norm": 0.3263416588306427, + "learning_rate": 3.4696680574034613e-06, + "loss": 0.4181, + "step": 2303 + }, + { + "epoch": 1.9178690344062153, + "grad_norm": 0.30046147108078003, + "learning_rate": 3.46505614837609e-06, + "loss": 0.3963, + "step": 2304 + }, + { + "epoch": 1.9187014428412874, + "grad_norm": 0.3026406764984131, + "learning_rate": 3.4604456806878704e-06, + "loss": 0.4206, + "step": 2305 + }, + { + "epoch": 1.9195338512763596, + "grad_norm": 0.30180978775024414, + "learning_rate": 3.4558366586681152e-06, + "loss": 0.4089, + "step": 2306 + }, + { + "epoch": 1.9203662597114317, + "grad_norm": 0.2909837067127228, + "learning_rate": 3.451229086644774e-06, + "loss": 0.3858, + "step": 2307 + }, + { + "epoch": 1.9211986681465039, + "grad_norm": 0.30994731187820435, + "learning_rate": 3.4466229689444384e-06, + "loss": 0.3947, + "step": 2308 + }, + { + "epoch": 1.922031076581576, + "grad_norm": 0.3551662862300873, + "learning_rate": 3.442018309892333e-06, + "loss": 0.4318, + "step": 2309 + }, + { + "epoch": 1.9228634850166482, + "grad_norm": 0.3257605731487274, + "learning_rate": 3.4374151138123135e-06, + "loss": 0.4569, + "step": 2310 + }, + { + "epoch": 1.9236958934517203, + "grad_norm": 0.28167983889579773, + "learning_rate": 3.432813385026862e-06, + "loss": 0.3689, + "step": 2311 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 0.36538660526275635, + "learning_rate": 3.4282131278570833e-06, + "loss": 0.4454, + "step": 2312 + }, + { + "epoch": 1.9253607103218646, + "grad_norm": 0.33179739117622375, + "learning_rate": 3.423614346622698e-06, + "loss": 0.3962, + "step": 2313 + }, + { + "epoch": 1.9261931187569368, + "grad_norm": 0.31999471783638, + "learning_rate": 3.4190170456420413e-06, + "loss": 0.3943, + "step": 2314 + }, + { + "epoch": 1.927025527192009, + "grad_norm": 0.3487468659877777, + "learning_rate": 3.4144212292320634e-06, + "loss": 0.4468, + "step": 2315 + }, + { + "epoch": 1.927857935627081, + "grad_norm": 0.3570094406604767, + "learning_rate": 3.409826901708312e-06, + "loss": 0.4241, + "step": 2316 + }, + { + "epoch": 1.928690344062153, + "grad_norm": 0.354623407125473, + "learning_rate": 3.4052340673849426e-06, + "loss": 0.4656, + "step": 2317 + }, + { + "epoch": 1.9295227524972254, + "grad_norm": 0.29796338081359863, + "learning_rate": 3.400642730574706e-06, + "loss": 0.422, + "step": 2318 + }, + { + "epoch": 1.9303551609322973, + "grad_norm": 0.32531046867370605, + "learning_rate": 3.3960528955889516e-06, + "loss": 0.417, + "step": 2319 + }, + { + "epoch": 1.9311875693673697, + "grad_norm": 0.3640810251235962, + "learning_rate": 3.391464566737611e-06, + "loss": 0.4625, + "step": 2320 + }, + { + "epoch": 1.9320199778024416, + "grad_norm": 0.3448679745197296, + "learning_rate": 3.386877748329208e-06, + "loss": 0.4093, + "step": 2321 + }, + { + "epoch": 1.932852386237514, + "grad_norm": 0.3299039900302887, + "learning_rate": 3.382292444670843e-06, + "loss": 0.4068, + "step": 2322 + }, + { + "epoch": 1.933684794672586, + "grad_norm": 0.3216043710708618, + "learning_rate": 3.3777086600681954e-06, + "loss": 0.4023, + "step": 2323 + }, + { + "epoch": 1.9345172031076583, + "grad_norm": 0.3004588484764099, + "learning_rate": 3.3731263988255223e-06, + "loss": 0.3881, + "step": 2324 + }, + { + "epoch": 1.9353496115427302, + "grad_norm": 0.3696260452270508, + "learning_rate": 3.3685456652456484e-06, + "loss": 0.4373, + "step": 2325 + }, + { + "epoch": 1.9361820199778026, + "grad_norm": 0.35806453227996826, + "learning_rate": 3.3639664636299586e-06, + "loss": 0.4515, + "step": 2326 + }, + { + "epoch": 1.9370144284128745, + "grad_norm": 0.28217509388923645, + "learning_rate": 3.3593887982784047e-06, + "loss": 0.3911, + "step": 2327 + }, + { + "epoch": 1.9378468368479467, + "grad_norm": 0.3391430974006653, + "learning_rate": 3.354812673489497e-06, + "loss": 0.4178, + "step": 2328 + }, + { + "epoch": 1.9386792452830188, + "grad_norm": 0.3683750629425049, + "learning_rate": 3.3502380935602942e-06, + "loss": 0.4342, + "step": 2329 + }, + { + "epoch": 1.939511653718091, + "grad_norm": 0.34491217136383057, + "learning_rate": 3.3456650627864075e-06, + "loss": 0.389, + "step": 2330 + }, + { + "epoch": 1.9403440621531631, + "grad_norm": 0.3198521137237549, + "learning_rate": 3.341093585461992e-06, + "loss": 0.3983, + "step": 2331 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.34554436802864075, + "learning_rate": 3.336523665879748e-06, + "loss": 0.4338, + "step": 2332 + }, + { + "epoch": 1.9420088790233074, + "grad_norm": 0.32669246196746826, + "learning_rate": 3.331955308330907e-06, + "loss": 0.3832, + "step": 2333 + }, + { + "epoch": 1.9428412874583796, + "grad_norm": 0.3620488941669464, + "learning_rate": 3.327388517105239e-06, + "loss": 0.493, + "step": 2334 + }, + { + "epoch": 1.9436736958934517, + "grad_norm": 0.2980594038963318, + "learning_rate": 3.3228232964910377e-06, + "loss": 0.4098, + "step": 2335 + }, + { + "epoch": 1.9445061043285239, + "grad_norm": 0.32015350461006165, + "learning_rate": 3.3182596507751288e-06, + "loss": 0.3994, + "step": 2336 + }, + { + "epoch": 1.945338512763596, + "grad_norm": 0.32040566205978394, + "learning_rate": 3.313697584242853e-06, + "loss": 0.4373, + "step": 2337 + }, + { + "epoch": 1.9461709211986682, + "grad_norm": 0.3319530785083771, + "learning_rate": 3.309137101178073e-06, + "loss": 0.4209, + "step": 2338 + }, + { + "epoch": 1.9470033296337403, + "grad_norm": 0.3502316474914551, + "learning_rate": 3.3045782058631597e-06, + "loss": 0.4313, + "step": 2339 + }, + { + "epoch": 1.9478357380688123, + "grad_norm": 0.3295128047466278, + "learning_rate": 3.3000209025789965e-06, + "loss": 0.429, + "step": 2340 + }, + { + "epoch": 1.9486681465038846, + "grad_norm": 0.30817556381225586, + "learning_rate": 3.295465195604972e-06, + "loss": 0.3908, + "step": 2341 + }, + { + "epoch": 1.9495005549389566, + "grad_norm": 0.34200671315193176, + "learning_rate": 3.2909110892189745e-06, + "loss": 0.4588, + "step": 2342 + }, + { + "epoch": 1.950332963374029, + "grad_norm": 0.3436935842037201, + "learning_rate": 3.286358587697388e-06, + "loss": 0.4262, + "step": 2343 + }, + { + "epoch": 1.9511653718091009, + "grad_norm": 0.29214903712272644, + "learning_rate": 3.2818076953150917e-06, + "loss": 0.4161, + "step": 2344 + }, + { + "epoch": 1.9519977802441733, + "grad_norm": 0.2840372920036316, + "learning_rate": 3.277258416345456e-06, + "loss": 0.3877, + "step": 2345 + }, + { + "epoch": 1.9528301886792452, + "grad_norm": 0.34386762976646423, + "learning_rate": 3.2727107550603305e-06, + "loss": 0.4524, + "step": 2346 + }, + { + "epoch": 1.9536625971143176, + "grad_norm": 0.3042285740375519, + "learning_rate": 3.26816471573005e-06, + "loss": 0.4206, + "step": 2347 + }, + { + "epoch": 1.9544950055493895, + "grad_norm": 0.28492388129234314, + "learning_rate": 3.2636203026234236e-06, + "loss": 0.3565, + "step": 2348 + }, + { + "epoch": 1.9553274139844619, + "grad_norm": 0.33080917596817017, + "learning_rate": 3.2590775200077364e-06, + "loss": 0.4431, + "step": 2349 + }, + { + "epoch": 1.9561598224195338, + "grad_norm": 0.3305113613605499, + "learning_rate": 3.25453637214874e-06, + "loss": 0.4194, + "step": 2350 + }, + { + "epoch": 1.956992230854606, + "grad_norm": 0.3372589647769928, + "learning_rate": 3.249996863310654e-06, + "loss": 0.4155, + "step": 2351 + }, + { + "epoch": 1.957824639289678, + "grad_norm": 0.33676356077194214, + "learning_rate": 3.2454589977561513e-06, + "loss": 0.3825, + "step": 2352 + }, + { + "epoch": 1.9586570477247502, + "grad_norm": 0.37880995869636536, + "learning_rate": 3.2409227797463727e-06, + "loss": 0.4373, + "step": 2353 + }, + { + "epoch": 1.9594894561598224, + "grad_norm": 0.3162456750869751, + "learning_rate": 3.236388213540904e-06, + "loss": 0.3984, + "step": 2354 + }, + { + "epoch": 1.9603218645948945, + "grad_norm": 0.33563530445098877, + "learning_rate": 3.231855303397783e-06, + "loss": 0.4313, + "step": 2355 + }, + { + "epoch": 1.9611542730299667, + "grad_norm": 0.31390097737312317, + "learning_rate": 3.2273240535734895e-06, + "loss": 0.4177, + "step": 2356 + }, + { + "epoch": 1.9619866814650389, + "grad_norm": 0.3237724006175995, + "learning_rate": 3.2227944683229484e-06, + "loss": 0.4041, + "step": 2357 + }, + { + "epoch": 1.962819089900111, + "grad_norm": 0.32047805190086365, + "learning_rate": 3.2182665518995203e-06, + "loss": 0.4324, + "step": 2358 + }, + { + "epoch": 1.9636514983351832, + "grad_norm": 0.2876075506210327, + "learning_rate": 3.2137403085549962e-06, + "loss": 0.3837, + "step": 2359 + }, + { + "epoch": 1.9644839067702553, + "grad_norm": 0.2909439504146576, + "learning_rate": 3.2092157425395996e-06, + "loss": 0.4206, + "step": 2360 + }, + { + "epoch": 1.9653163152053275, + "grad_norm": 0.29555991291999817, + "learning_rate": 3.2046928581019744e-06, + "loss": 0.3688, + "step": 2361 + }, + { + "epoch": 1.9661487236403996, + "grad_norm": 0.35118407011032104, + "learning_rate": 3.20017165948919e-06, + "loss": 0.4504, + "step": 2362 + }, + { + "epoch": 1.9669811320754715, + "grad_norm": 0.309773325920105, + "learning_rate": 3.195652150946732e-06, + "loss": 0.3804, + "step": 2363 + }, + { + "epoch": 1.967813540510544, + "grad_norm": 0.32229289412498474, + "learning_rate": 3.1911343367184977e-06, + "loss": 0.4497, + "step": 2364 + }, + { + "epoch": 1.9686459489456158, + "grad_norm": 0.30695047974586487, + "learning_rate": 3.1866182210467923e-06, + "loss": 0.4304, + "step": 2365 + }, + { + "epoch": 1.9694783573806882, + "grad_norm": 0.33170729875564575, + "learning_rate": 3.1821038081723283e-06, + "loss": 0.4071, + "step": 2366 + }, + { + "epoch": 1.9703107658157601, + "grad_norm": 0.30925998091697693, + "learning_rate": 3.1775911023342197e-06, + "loss": 0.3772, + "step": 2367 + }, + { + "epoch": 1.9711431742508325, + "grad_norm": 0.3299291729927063, + "learning_rate": 3.1730801077699747e-06, + "loss": 0.4137, + "step": 2368 + }, + { + "epoch": 1.9719755826859044, + "grad_norm": 0.3230315148830414, + "learning_rate": 3.168570828715496e-06, + "loss": 0.4289, + "step": 2369 + }, + { + "epoch": 1.9728079911209768, + "grad_norm": 0.33906981348991394, + "learning_rate": 3.1640632694050753e-06, + "loss": 0.4349, + "step": 2370 + }, + { + "epoch": 1.9736403995560488, + "grad_norm": 0.31247228384017944, + "learning_rate": 3.159557434071393e-06, + "loss": 0.4172, + "step": 2371 + }, + { + "epoch": 1.9744728079911211, + "grad_norm": 0.31131434440612793, + "learning_rate": 3.155053326945503e-06, + "loss": 0.4167, + "step": 2372 + }, + { + "epoch": 1.975305216426193, + "grad_norm": 0.3348797857761383, + "learning_rate": 3.1505509522568444e-06, + "loss": 0.4398, + "step": 2373 + }, + { + "epoch": 1.9761376248612652, + "grad_norm": 0.33107826113700867, + "learning_rate": 3.1460503142332227e-06, + "loss": 0.394, + "step": 2374 + }, + { + "epoch": 1.9769700332963374, + "grad_norm": 0.30621546506881714, + "learning_rate": 3.1415514171008176e-06, + "loss": 0.4005, + "step": 2375 + }, + { + "epoch": 1.9778024417314095, + "grad_norm": 0.3266296088695526, + "learning_rate": 3.137054265084173e-06, + "loss": 0.4249, + "step": 2376 + }, + { + "epoch": 1.9786348501664817, + "grad_norm": 0.3151954412460327, + "learning_rate": 3.1325588624061925e-06, + "loss": 0.3982, + "step": 2377 + }, + { + "epoch": 1.9794672586015538, + "grad_norm": 0.32240432500839233, + "learning_rate": 3.128065213288136e-06, + "loss": 0.3918, + "step": 2378 + }, + { + "epoch": 1.980299667036626, + "grad_norm": 0.3490481376647949, + "learning_rate": 3.123573321949621e-06, + "loss": 0.4313, + "step": 2379 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 0.3069087862968445, + "learning_rate": 3.119083192608614e-06, + "loss": 0.3813, + "step": 2380 + }, + { + "epoch": 1.9819644839067703, + "grad_norm": 0.3291850984096527, + "learning_rate": 3.114594829481421e-06, + "loss": 0.4728, + "step": 2381 + }, + { + "epoch": 1.9827968923418424, + "grad_norm": 0.33388224244117737, + "learning_rate": 3.110108236782694e-06, + "loss": 0.4257, + "step": 2382 + }, + { + "epoch": 1.9836293007769146, + "grad_norm": 0.31437182426452637, + "learning_rate": 3.105623418725424e-06, + "loss": 0.3869, + "step": 2383 + }, + { + "epoch": 1.9844617092119867, + "grad_norm": 0.31633225083351135, + "learning_rate": 3.101140379520935e-06, + "loss": 0.3808, + "step": 2384 + }, + { + "epoch": 1.9852941176470589, + "grad_norm": 0.3501662015914917, + "learning_rate": 3.0966591233788757e-06, + "loss": 0.4576, + "step": 2385 + }, + { + "epoch": 1.9861265260821308, + "grad_norm": 0.3176516592502594, + "learning_rate": 3.092179654507227e-06, + "loss": 0.4181, + "step": 2386 + }, + { + "epoch": 1.9869589345172032, + "grad_norm": 0.354187935590744, + "learning_rate": 3.0877019771122848e-06, + "loss": 0.4359, + "step": 2387 + }, + { + "epoch": 1.987791342952275, + "grad_norm": 0.32240673899650574, + "learning_rate": 3.0832260953986716e-06, + "loss": 0.3794, + "step": 2388 + }, + { + "epoch": 1.9886237513873475, + "grad_norm": 0.3612145781517029, + "learning_rate": 3.078752013569315e-06, + "loss": 0.4573, + "step": 2389 + }, + { + "epoch": 1.9894561598224194, + "grad_norm": 0.29033204913139343, + "learning_rate": 3.0742797358254584e-06, + "loss": 0.4075, + "step": 2390 + }, + { + "epoch": 1.9902885682574918, + "grad_norm": 0.3590858280658722, + "learning_rate": 3.069809266366647e-06, + "loss": 0.4718, + "step": 2391 + }, + { + "epoch": 1.9911209766925637, + "grad_norm": 0.3385516405105591, + "learning_rate": 3.06534060939073e-06, + "loss": 0.4209, + "step": 2392 + }, + { + "epoch": 1.991953385127636, + "grad_norm": 0.32453739643096924, + "learning_rate": 3.060873769093858e-06, + "loss": 0.3889, + "step": 2393 + }, + { + "epoch": 1.992785793562708, + "grad_norm": 0.3326762616634369, + "learning_rate": 3.0564087496704676e-06, + "loss": 0.4476, + "step": 2394 + }, + { + "epoch": 1.9936182019977804, + "grad_norm": 0.30887943506240845, + "learning_rate": 3.0519455553132914e-06, + "loss": 0.422, + "step": 2395 + }, + { + "epoch": 1.9944506104328523, + "grad_norm": 0.32724782824516296, + "learning_rate": 3.047484190213349e-06, + "loss": 0.4304, + "step": 2396 + }, + { + "epoch": 1.9952830188679245, + "grad_norm": 0.3449188768863678, + "learning_rate": 3.0430246585599402e-06, + "loss": 0.4361, + "step": 2397 + }, + { + "epoch": 1.9961154273029966, + "grad_norm": 0.3099921941757202, + "learning_rate": 3.0385669645406413e-06, + "loss": 0.378, + "step": 2398 + }, + { + "epoch": 1.9969478357380688, + "grad_norm": 0.3242183327674866, + "learning_rate": 3.034111112341307e-06, + "loss": 0.4265, + "step": 2399 + }, + { + "epoch": 1.997780244173141, + "grad_norm": 0.3651365637779236, + "learning_rate": 3.029657106146057e-06, + "loss": 0.4341, + "step": 2400 + }, + { + "epoch": 1.998612652608213, + "grad_norm": 0.32321080565452576, + "learning_rate": 3.025204950137286e-06, + "loss": 0.4153, + "step": 2401 + }, + { + "epoch": 1.9994450610432852, + "grad_norm": 0.3187124729156494, + "learning_rate": 3.020754648495644e-06, + "loss": 0.3873, + "step": 2402 + }, + { + "epoch": 2.000277469478357, + "grad_norm": 0.69718337059021, + "learning_rate": 3.0163062054000424e-06, + "loss": 0.683, + "step": 2403 + }, + { + "epoch": 2.0011098779134295, + "grad_norm": 0.33480337262153625, + "learning_rate": 3.0118596250276453e-06, + "loss": 0.4026, + "step": 2404 + }, + { + "epoch": 2.0019422863485015, + "grad_norm": 0.3858184814453125, + "learning_rate": 3.0074149115538725e-06, + "loss": 0.4126, + "step": 2405 + }, + { + "epoch": 2.002774694783574, + "grad_norm": 0.3335935175418854, + "learning_rate": 3.0029720691523873e-06, + "loss": 0.372, + "step": 2406 + }, + { + "epoch": 2.0036071032186458, + "grad_norm": 0.3692198693752289, + "learning_rate": 2.9985311019950945e-06, + "loss": 0.4148, + "step": 2407 + }, + { + "epoch": 2.004439511653718, + "grad_norm": 0.333741158246994, + "learning_rate": 2.9940920142521413e-06, + "loss": 0.3529, + "step": 2408 + }, + { + "epoch": 2.00527192008879, + "grad_norm": 0.3616425395011902, + "learning_rate": 2.9896548100919087e-06, + "loss": 0.3824, + "step": 2409 + }, + { + "epoch": 2.0061043285238624, + "grad_norm": 0.34554699063301086, + "learning_rate": 2.985219493681011e-06, + "loss": 0.3984, + "step": 2410 + }, + { + "epoch": 2.0069367369589344, + "grad_norm": 0.35826876759529114, + "learning_rate": 2.980786069184285e-06, + "loss": 0.4494, + "step": 2411 + }, + { + "epoch": 2.0077691453940067, + "grad_norm": 0.31827691197395325, + "learning_rate": 2.976354540764793e-06, + "loss": 0.3831, + "step": 2412 + }, + { + "epoch": 2.0086015538290787, + "grad_norm": 0.3482878506183624, + "learning_rate": 2.971924912583822e-06, + "loss": 0.3803, + "step": 2413 + }, + { + "epoch": 2.009433962264151, + "grad_norm": 0.3719305098056793, + "learning_rate": 2.9674971888008696e-06, + "loss": 0.4542, + "step": 2414 + }, + { + "epoch": 2.010266370699223, + "grad_norm": 0.30256187915802, + "learning_rate": 2.9630713735736428e-06, + "loss": 0.3519, + "step": 2415 + }, + { + "epoch": 2.0110987791342954, + "grad_norm": 0.3034214973449707, + "learning_rate": 2.9586474710580627e-06, + "loss": 0.3865, + "step": 2416 + }, + { + "epoch": 2.0119311875693673, + "grad_norm": 0.3587876260280609, + "learning_rate": 2.954225485408248e-06, + "loss": 0.4371, + "step": 2417 + }, + { + "epoch": 2.0127635960044397, + "grad_norm": 0.35651448369026184, + "learning_rate": 2.9498054207765237e-06, + "loss": 0.3977, + "step": 2418 + }, + { + "epoch": 2.0135960044395116, + "grad_norm": 0.2968187630176544, + "learning_rate": 2.945387281313408e-06, + "loss": 0.3755, + "step": 2419 + }, + { + "epoch": 2.014428412874584, + "grad_norm": 0.34130021929740906, + "learning_rate": 2.940971071167608e-06, + "loss": 0.4274, + "step": 2420 + }, + { + "epoch": 2.015260821309656, + "grad_norm": 0.3059341609477997, + "learning_rate": 2.936556794486024e-06, + "loss": 0.3419, + "step": 2421 + }, + { + "epoch": 2.0160932297447283, + "grad_norm": 0.3298993408679962, + "learning_rate": 2.932144455413741e-06, + "loss": 0.4018, + "step": 2422 + }, + { + "epoch": 2.0169256381798, + "grad_norm": 0.3346230387687683, + "learning_rate": 2.9277340580940215e-06, + "loss": 0.4095, + "step": 2423 + }, + { + "epoch": 2.0177580466148726, + "grad_norm": 0.35374805331230164, + "learning_rate": 2.9233256066683047e-06, + "loss": 0.3878, + "step": 2424 + }, + { + "epoch": 2.0185904550499445, + "grad_norm": 0.2975861430168152, + "learning_rate": 2.9189191052762038e-06, + "loss": 0.3523, + "step": 2425 + }, + { + "epoch": 2.0194228634850164, + "grad_norm": 0.3865160048007965, + "learning_rate": 2.914514558055502e-06, + "loss": 0.4404, + "step": 2426 + }, + { + "epoch": 2.020255271920089, + "grad_norm": 0.2803787887096405, + "learning_rate": 2.9101119691421453e-06, + "loss": 0.3509, + "step": 2427 + }, + { + "epoch": 2.0210876803551607, + "grad_norm": 0.35009345412254333, + "learning_rate": 2.905711342670242e-06, + "loss": 0.3977, + "step": 2428 + }, + { + "epoch": 2.021920088790233, + "grad_norm": 0.3857073485851288, + "learning_rate": 2.901312682772058e-06, + "loss": 0.4167, + "step": 2429 + }, + { + "epoch": 2.022752497225305, + "grad_norm": 0.3246917426586151, + "learning_rate": 2.896915993578011e-06, + "loss": 0.3457, + "step": 2430 + }, + { + "epoch": 2.0235849056603774, + "grad_norm": 0.32466015219688416, + "learning_rate": 2.8925212792166694e-06, + "loss": 0.3807, + "step": 2431 + }, + { + "epoch": 2.0244173140954493, + "grad_norm": 0.3205777704715729, + "learning_rate": 2.8881285438147477e-06, + "loss": 0.3801, + "step": 2432 + }, + { + "epoch": 2.0252497225305217, + "grad_norm": 0.31383705139160156, + "learning_rate": 2.8837377914971003e-06, + "loss": 0.3692, + "step": 2433 + }, + { + "epoch": 2.0260821309655936, + "grad_norm": 0.36327627301216125, + "learning_rate": 2.8793490263867212e-06, + "loss": 0.395, + "step": 2434 + }, + { + "epoch": 2.026914539400666, + "grad_norm": 0.33775612711906433, + "learning_rate": 2.8749622526047373e-06, + "loss": 0.3764, + "step": 2435 + }, + { + "epoch": 2.027746947835738, + "grad_norm": 0.31958720088005066, + "learning_rate": 2.8705774742704063e-06, + "loss": 0.4473, + "step": 2436 + }, + { + "epoch": 2.0285793562708103, + "grad_norm": 0.3061386048793793, + "learning_rate": 2.8661946955011145e-06, + "loss": 0.4101, + "step": 2437 + }, + { + "epoch": 2.0294117647058822, + "grad_norm": 0.27900204062461853, + "learning_rate": 2.8618139204123597e-06, + "loss": 0.3726, + "step": 2438 + }, + { + "epoch": 2.0302441731409546, + "grad_norm": 0.32336297631263733, + "learning_rate": 2.8574351531177747e-06, + "loss": 0.403, + "step": 2439 + }, + { + "epoch": 2.0310765815760266, + "grad_norm": 0.3574887216091156, + "learning_rate": 2.853058397729095e-06, + "loss": 0.354, + "step": 2440 + }, + { + "epoch": 2.031908990011099, + "grad_norm": 0.3319253623485565, + "learning_rate": 2.8486836583561737e-06, + "loss": 0.4185, + "step": 2441 + }, + { + "epoch": 2.032741398446171, + "grad_norm": 0.31709638237953186, + "learning_rate": 2.8443109391069616e-06, + "loss": 0.3966, + "step": 2442 + }, + { + "epoch": 2.0335738068812432, + "grad_norm": 0.28898531198501587, + "learning_rate": 2.8399402440875248e-06, + "loss": 0.3537, + "step": 2443 + }, + { + "epoch": 2.034406215316315, + "grad_norm": 0.32134294509887695, + "learning_rate": 2.835571577402021e-06, + "loss": 0.3991, + "step": 2444 + }, + { + "epoch": 2.0352386237513875, + "grad_norm": 0.3410552144050598, + "learning_rate": 2.831204943152701e-06, + "loss": 0.4234, + "step": 2445 + }, + { + "epoch": 2.0360710321864595, + "grad_norm": 0.31080541014671326, + "learning_rate": 2.8268403454399154e-06, + "loss": 0.3578, + "step": 2446 + }, + { + "epoch": 2.036903440621532, + "grad_norm": 0.3171520531177521, + "learning_rate": 2.8224777883620926e-06, + "loss": 0.3696, + "step": 2447 + }, + { + "epoch": 2.0377358490566038, + "grad_norm": 0.3251802921295166, + "learning_rate": 2.8181172760157575e-06, + "loss": 0.4128, + "step": 2448 + }, + { + "epoch": 2.0385682574916757, + "grad_norm": 0.3506574034690857, + "learning_rate": 2.8137588124955017e-06, + "loss": 0.3945, + "step": 2449 + }, + { + "epoch": 2.039400665926748, + "grad_norm": 0.32733920216560364, + "learning_rate": 2.8094024018940012e-06, + "loss": 0.4038, + "step": 2450 + }, + { + "epoch": 2.04023307436182, + "grad_norm": 0.3144792914390564, + "learning_rate": 2.8050480483020003e-06, + "loss": 0.4005, + "step": 2451 + }, + { + "epoch": 2.0410654827968924, + "grad_norm": 0.27931925654411316, + "learning_rate": 2.8006957558083147e-06, + "loss": 0.3664, + "step": 2452 + }, + { + "epoch": 2.0418978912319643, + "grad_norm": 0.3300316333770752, + "learning_rate": 2.7963455284998225e-06, + "loss": 0.438, + "step": 2453 + }, + { + "epoch": 2.0427302996670367, + "grad_norm": 0.31316685676574707, + "learning_rate": 2.7919973704614632e-06, + "loss": 0.3973, + "step": 2454 + }, + { + "epoch": 2.0435627081021086, + "grad_norm": 0.3359510600566864, + "learning_rate": 2.7876512857762343e-06, + "loss": 0.3725, + "step": 2455 + }, + { + "epoch": 2.044395116537181, + "grad_norm": 0.3232194483280182, + "learning_rate": 2.7833072785251846e-06, + "loss": 0.4057, + "step": 2456 + }, + { + "epoch": 2.045227524972253, + "grad_norm": 0.31120437383651733, + "learning_rate": 2.778965352787413e-06, + "loss": 0.3684, + "step": 2457 + }, + { + "epoch": 2.0460599334073253, + "grad_norm": 0.32518357038497925, + "learning_rate": 2.774625512640064e-06, + "loss": 0.4007, + "step": 2458 + }, + { + "epoch": 2.046892341842397, + "grad_norm": 0.3096177875995636, + "learning_rate": 2.7702877621583234e-06, + "loss": 0.3879, + "step": 2459 + }, + { + "epoch": 2.0477247502774696, + "grad_norm": 0.3457062840461731, + "learning_rate": 2.7659521054154147e-06, + "loss": 0.4057, + "step": 2460 + }, + { + "epoch": 2.0485571587125415, + "grad_norm": 0.33439356088638306, + "learning_rate": 2.7616185464825963e-06, + "loss": 0.4092, + "step": 2461 + }, + { + "epoch": 2.049389567147614, + "grad_norm": 0.33163490891456604, + "learning_rate": 2.7572870894291542e-06, + "loss": 0.4207, + "step": 2462 + }, + { + "epoch": 2.050221975582686, + "grad_norm": 0.3037160336971283, + "learning_rate": 2.752957738322406e-06, + "loss": 0.3703, + "step": 2463 + }, + { + "epoch": 2.051054384017758, + "grad_norm": 0.3286396563053131, + "learning_rate": 2.748630497227682e-06, + "loss": 0.4138, + "step": 2464 + }, + { + "epoch": 2.05188679245283, + "grad_norm": 0.3129012882709503, + "learning_rate": 2.744305370208342e-06, + "loss": 0.3747, + "step": 2465 + }, + { + "epoch": 2.0527192008879025, + "grad_norm": 0.3447505831718445, + "learning_rate": 2.7399823613257565e-06, + "loss": 0.3802, + "step": 2466 + }, + { + "epoch": 2.0535516093229744, + "grad_norm": 0.32165050506591797, + "learning_rate": 2.7356614746393063e-06, + "loss": 0.4049, + "step": 2467 + }, + { + "epoch": 2.054384017758047, + "grad_norm": 0.31354621052742004, + "learning_rate": 2.7313427142063742e-06, + "loss": 0.3912, + "step": 2468 + }, + { + "epoch": 2.0552164261931187, + "grad_norm": 0.3174445331096649, + "learning_rate": 2.7270260840823588e-06, + "loss": 0.3831, + "step": 2469 + }, + { + "epoch": 2.056048834628191, + "grad_norm": 0.3428478538990021, + "learning_rate": 2.72271158832065e-06, + "loss": 0.3866, + "step": 2470 + }, + { + "epoch": 2.056881243063263, + "grad_norm": 0.35326361656188965, + "learning_rate": 2.718399230972632e-06, + "loss": 0.4147, + "step": 2471 + }, + { + "epoch": 2.057713651498335, + "grad_norm": 0.31062963604927063, + "learning_rate": 2.714089016087683e-06, + "loss": 0.3806, + "step": 2472 + }, + { + "epoch": 2.0585460599334073, + "grad_norm": 0.3279687166213989, + "learning_rate": 2.7097809477131754e-06, + "loss": 0.4389, + "step": 2473 + }, + { + "epoch": 2.0593784683684793, + "grad_norm": 0.33881494402885437, + "learning_rate": 2.705475029894459e-06, + "loss": 0.4214, + "step": 2474 + }, + { + "epoch": 2.0602108768035516, + "grad_norm": 0.3012351989746094, + "learning_rate": 2.7011712666748636e-06, + "loss": 0.3647, + "step": 2475 + }, + { + "epoch": 2.0610432852386236, + "grad_norm": 0.36581099033355713, + "learning_rate": 2.696869662095698e-06, + "loss": 0.4215, + "step": 2476 + }, + { + "epoch": 2.061875693673696, + "grad_norm": 0.3162044286727905, + "learning_rate": 2.6925702201962493e-06, + "loss": 0.4061, + "step": 2477 + }, + { + "epoch": 2.062708102108768, + "grad_norm": 0.30521532893180847, + "learning_rate": 2.6882729450137636e-06, + "loss": 0.368, + "step": 2478 + }, + { + "epoch": 2.0635405105438402, + "grad_norm": 0.36388954520225525, + "learning_rate": 2.6839778405834593e-06, + "loss": 0.3992, + "step": 2479 + }, + { + "epoch": 2.064372918978912, + "grad_norm": 0.3033269941806793, + "learning_rate": 2.6796849109385147e-06, + "loss": 0.3603, + "step": 2480 + }, + { + "epoch": 2.0652053274139845, + "grad_norm": 0.32970061898231506, + "learning_rate": 2.6753941601100662e-06, + "loss": 0.4103, + "step": 2481 + }, + { + "epoch": 2.0660377358490565, + "grad_norm": 0.3365088105201721, + "learning_rate": 2.6711055921272033e-06, + "loss": 0.43, + "step": 2482 + }, + { + "epoch": 2.066870144284129, + "grad_norm": 0.3210076689720154, + "learning_rate": 2.6668192110169664e-06, + "loss": 0.3518, + "step": 2483 + }, + { + "epoch": 2.067702552719201, + "grad_norm": 0.37223634123802185, + "learning_rate": 2.6625350208043432e-06, + "loss": 0.3965, + "step": 2484 + }, + { + "epoch": 2.068534961154273, + "grad_norm": 0.3405874967575073, + "learning_rate": 2.658253025512263e-06, + "loss": 0.401, + "step": 2485 + }, + { + "epoch": 2.069367369589345, + "grad_norm": 0.32502299547195435, + "learning_rate": 2.6539732291615937e-06, + "loss": 0.365, + "step": 2486 + }, + { + "epoch": 2.0701997780244175, + "grad_norm": 0.36859750747680664, + "learning_rate": 2.6496956357711402e-06, + "loss": 0.4403, + "step": 2487 + }, + { + "epoch": 2.0710321864594894, + "grad_norm": 0.3416479527950287, + "learning_rate": 2.6454202493576366e-06, + "loss": 0.3544, + "step": 2488 + }, + { + "epoch": 2.0718645948945618, + "grad_norm": 0.3147336542606354, + "learning_rate": 2.641147073935746e-06, + "loss": 0.3804, + "step": 2489 + }, + { + "epoch": 2.0726970033296337, + "grad_norm": 0.3625737428665161, + "learning_rate": 2.6368761135180544e-06, + "loss": 0.4079, + "step": 2490 + }, + { + "epoch": 2.073529411764706, + "grad_norm": 0.3262125849723816, + "learning_rate": 2.632607372115069e-06, + "loss": 0.4015, + "step": 2491 + }, + { + "epoch": 2.074361820199778, + "grad_norm": 0.3019062280654907, + "learning_rate": 2.628340853735213e-06, + "loss": 0.3562, + "step": 2492 + }, + { + "epoch": 2.0751942286348504, + "grad_norm": 0.35128647089004517, + "learning_rate": 2.624076562384823e-06, + "loss": 0.416, + "step": 2493 + }, + { + "epoch": 2.0760266370699223, + "grad_norm": 0.3422330915927887, + "learning_rate": 2.619814502068139e-06, + "loss": 0.3876, + "step": 2494 + }, + { + "epoch": 2.0768590455049942, + "grad_norm": 0.3407348692417145, + "learning_rate": 2.6155546767873136e-06, + "loss": 0.4153, + "step": 2495 + }, + { + "epoch": 2.0776914539400666, + "grad_norm": 0.3248727023601532, + "learning_rate": 2.611297090542399e-06, + "loss": 0.3867, + "step": 2496 + }, + { + "epoch": 2.0785238623751385, + "grad_norm": 0.2963961958885193, + "learning_rate": 2.607041747331339e-06, + "loss": 0.372, + "step": 2497 + }, + { + "epoch": 2.079356270810211, + "grad_norm": 0.3318076431751251, + "learning_rate": 2.6027886511499756e-06, + "loss": 0.3884, + "step": 2498 + }, + { + "epoch": 2.080188679245283, + "grad_norm": 0.32221075892448425, + "learning_rate": 2.598537805992044e-06, + "loss": 0.3999, + "step": 2499 + }, + { + "epoch": 2.081021087680355, + "grad_norm": 0.3247624635696411, + "learning_rate": 2.5942892158491626e-06, + "loss": 0.4089, + "step": 2500 + }, + { + "epoch": 2.081853496115427, + "grad_norm": 0.33710792660713196, + "learning_rate": 2.590042884710828e-06, + "loss": 0.4056, + "step": 2501 + }, + { + "epoch": 2.0826859045504995, + "grad_norm": 0.3118056058883667, + "learning_rate": 2.585798816564419e-06, + "loss": 0.4116, + "step": 2502 + }, + { + "epoch": 2.0835183129855714, + "grad_norm": 0.2979108691215515, + "learning_rate": 2.5815570153951942e-06, + "loss": 0.3746, + "step": 2503 + }, + { + "epoch": 2.084350721420644, + "grad_norm": 0.33021363615989685, + "learning_rate": 2.5773174851862796e-06, + "loss": 0.394, + "step": 2504 + }, + { + "epoch": 2.0851831298557157, + "grad_norm": 0.32653099298477173, + "learning_rate": 2.573080229918664e-06, + "loss": 0.4189, + "step": 2505 + }, + { + "epoch": 2.086015538290788, + "grad_norm": 0.3247196674346924, + "learning_rate": 2.568845253571204e-06, + "loss": 0.3851, + "step": 2506 + }, + { + "epoch": 2.08684794672586, + "grad_norm": 0.3410961329936981, + "learning_rate": 2.564612560120623e-06, + "loss": 0.4209, + "step": 2507 + }, + { + "epoch": 2.0876803551609324, + "grad_norm": 0.3214920163154602, + "learning_rate": 2.5603821535414874e-06, + "loss": 0.3618, + "step": 2508 + }, + { + "epoch": 2.0885127635960044, + "grad_norm": 0.34245097637176514, + "learning_rate": 2.556154037806226e-06, + "loss": 0.4046, + "step": 2509 + }, + { + "epoch": 2.0893451720310767, + "grad_norm": 0.3048022389411926, + "learning_rate": 2.5519282168851134e-06, + "loss": 0.3755, + "step": 2510 + }, + { + "epoch": 2.0901775804661487, + "grad_norm": 0.3066607415676117, + "learning_rate": 2.547704694746269e-06, + "loss": 0.3606, + "step": 2511 + }, + { + "epoch": 2.091009988901221, + "grad_norm": 0.30674123764038086, + "learning_rate": 2.543483475355654e-06, + "loss": 0.3706, + "step": 2512 + }, + { + "epoch": 2.091842397336293, + "grad_norm": 0.32365816831588745, + "learning_rate": 2.5392645626770686e-06, + "loss": 0.3973, + "step": 2513 + }, + { + "epoch": 2.0926748057713653, + "grad_norm": 0.3154730498790741, + "learning_rate": 2.5350479606721433e-06, + "loss": 0.4019, + "step": 2514 + }, + { + "epoch": 2.0935072142064373, + "grad_norm": 0.28480201959609985, + "learning_rate": 2.5308336733003435e-06, + "loss": 0.362, + "step": 2515 + }, + { + "epoch": 2.0943396226415096, + "grad_norm": 0.3131362497806549, + "learning_rate": 2.5266217045189572e-06, + "loss": 0.4336, + "step": 2516 + }, + { + "epoch": 2.0951720310765816, + "grad_norm": 0.3055063784122467, + "learning_rate": 2.522412058283098e-06, + "loss": 0.3658, + "step": 2517 + }, + { + "epoch": 2.0960044395116535, + "grad_norm": 0.3128129839897156, + "learning_rate": 2.5182047385456967e-06, + "loss": 0.3963, + "step": 2518 + }, + { + "epoch": 2.096836847946726, + "grad_norm": 0.3318641483783722, + "learning_rate": 2.513999749257501e-06, + "loss": 0.4023, + "step": 2519 + }, + { + "epoch": 2.097669256381798, + "grad_norm": 0.30515703558921814, + "learning_rate": 2.509797094367068e-06, + "loss": 0.3823, + "step": 2520 + }, + { + "epoch": 2.09850166481687, + "grad_norm": 0.3311365842819214, + "learning_rate": 2.505596777820766e-06, + "loss": 0.3941, + "step": 2521 + }, + { + "epoch": 2.099334073251942, + "grad_norm": 0.3067466914653778, + "learning_rate": 2.5013988035627656e-06, + "loss": 0.4081, + "step": 2522 + }, + { + "epoch": 2.1001664816870145, + "grad_norm": 0.344066858291626, + "learning_rate": 2.4972031755350366e-06, + "loss": 0.3707, + "step": 2523 + }, + { + "epoch": 2.1009988901220864, + "grad_norm": 0.3539952039718628, + "learning_rate": 2.493009897677346e-06, + "loss": 0.4235, + "step": 2524 + }, + { + "epoch": 2.101831298557159, + "grad_norm": 0.34413817524909973, + "learning_rate": 2.4888189739272587e-06, + "loss": 0.382, + "step": 2525 + }, + { + "epoch": 2.1026637069922307, + "grad_norm": 0.30042779445648193, + "learning_rate": 2.484630408220126e-06, + "loss": 0.3718, + "step": 2526 + }, + { + "epoch": 2.103496115427303, + "grad_norm": 0.3354708254337311, + "learning_rate": 2.480444204489081e-06, + "loss": 0.4273, + "step": 2527 + }, + { + "epoch": 2.104328523862375, + "grad_norm": 0.31707751750946045, + "learning_rate": 2.476260366665041e-06, + "loss": 0.4271, + "step": 2528 + }, + { + "epoch": 2.1051609322974474, + "grad_norm": 0.28172051906585693, + "learning_rate": 2.472078898676708e-06, + "loss": 0.3713, + "step": 2529 + }, + { + "epoch": 2.1059933407325193, + "grad_norm": 0.31706756353378296, + "learning_rate": 2.467899804450553e-06, + "loss": 0.4136, + "step": 2530 + }, + { + "epoch": 2.1068257491675917, + "grad_norm": 0.31650689244270325, + "learning_rate": 2.463723087910815e-06, + "loss": 0.3989, + "step": 2531 + }, + { + "epoch": 2.1076581576026636, + "grad_norm": 0.3059511184692383, + "learning_rate": 2.4595487529795044e-06, + "loss": 0.3801, + "step": 2532 + }, + { + "epoch": 2.108490566037736, + "grad_norm": 0.30133718252182007, + "learning_rate": 2.4553768035763996e-06, + "loss": 0.3809, + "step": 2533 + }, + { + "epoch": 2.109322974472808, + "grad_norm": 0.3258849084377289, + "learning_rate": 2.451207243619029e-06, + "loss": 0.4455, + "step": 2534 + }, + { + "epoch": 2.1101553829078803, + "grad_norm": 0.29617834091186523, + "learning_rate": 2.447040077022685e-06, + "loss": 0.3728, + "step": 2535 + }, + { + "epoch": 2.1109877913429522, + "grad_norm": 0.3328491747379303, + "learning_rate": 2.4428753077004067e-06, + "loss": 0.3945, + "step": 2536 + }, + { + "epoch": 2.1118201997780246, + "grad_norm": 0.3035510778427124, + "learning_rate": 2.438712939562992e-06, + "loss": 0.3755, + "step": 2537 + }, + { + "epoch": 2.1126526082130965, + "grad_norm": 0.31253474950790405, + "learning_rate": 2.434552976518971e-06, + "loss": 0.3796, + "step": 2538 + }, + { + "epoch": 2.113485016648169, + "grad_norm": 0.35178011655807495, + "learning_rate": 2.430395422474625e-06, + "loss": 0.4061, + "step": 2539 + }, + { + "epoch": 2.114317425083241, + "grad_norm": 0.3211081326007843, + "learning_rate": 2.426240281333969e-06, + "loss": 0.3668, + "step": 2540 + }, + { + "epoch": 2.1151498335183128, + "grad_norm": 0.35287511348724365, + "learning_rate": 2.422087556998754e-06, + "loss": 0.4332, + "step": 2541 + }, + { + "epoch": 2.115982241953385, + "grad_norm": 0.31688159704208374, + "learning_rate": 2.41793725336846e-06, + "loss": 0.392, + "step": 2542 + }, + { + "epoch": 2.116814650388457, + "grad_norm": 0.3495129644870758, + "learning_rate": 2.4137893743402954e-06, + "loss": 0.3871, + "step": 2543 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.29781919717788696, + "learning_rate": 2.409643923809191e-06, + "loss": 0.366, + "step": 2544 + }, + { + "epoch": 2.1184794672586014, + "grad_norm": 0.3296952545642853, + "learning_rate": 2.4055009056677977e-06, + "loss": 0.4213, + "step": 2545 + }, + { + "epoch": 2.1193118756936737, + "grad_norm": 0.3212566077709198, + "learning_rate": 2.4013603238064814e-06, + "loss": 0.3989, + "step": 2546 + }, + { + "epoch": 2.1201442841287457, + "grad_norm": 0.31362348794937134, + "learning_rate": 2.397222182113322e-06, + "loss": 0.364, + "step": 2547 + }, + { + "epoch": 2.120976692563818, + "grad_norm": 0.3248312473297119, + "learning_rate": 2.393086484474108e-06, + "loss": 0.3903, + "step": 2548 + }, + { + "epoch": 2.12180910099889, + "grad_norm": 0.3590147793292999, + "learning_rate": 2.3889532347723266e-06, + "loss": 0.418, + "step": 2549 + }, + { + "epoch": 2.1226415094339623, + "grad_norm": 0.3201029300689697, + "learning_rate": 2.384822436889177e-06, + "loss": 0.4164, + "step": 2550 + }, + { + "epoch": 2.1234739178690343, + "grad_norm": 0.2909408211708069, + "learning_rate": 2.3806940947035497e-06, + "loss": 0.35, + "step": 2551 + }, + { + "epoch": 2.1243063263041067, + "grad_norm": 0.29795750975608826, + "learning_rate": 2.3765682120920315e-06, + "loss": 0.3996, + "step": 2552 + }, + { + "epoch": 2.1251387347391786, + "grad_norm": 0.31154143810272217, + "learning_rate": 2.3724447929288925e-06, + "loss": 0.3876, + "step": 2553 + }, + { + "epoch": 2.125971143174251, + "grad_norm": 0.33874204754829407, + "learning_rate": 2.368323841086102e-06, + "loss": 0.3969, + "step": 2554 + }, + { + "epoch": 2.126803551609323, + "grad_norm": 0.3370445966720581, + "learning_rate": 2.3642053604333032e-06, + "loss": 0.4318, + "step": 2555 + }, + { + "epoch": 2.1276359600443953, + "grad_norm": 0.30151233077049255, + "learning_rate": 2.3600893548378238e-06, + "loss": 0.3805, + "step": 2556 + }, + { + "epoch": 2.128468368479467, + "grad_norm": 0.3622559607028961, + "learning_rate": 2.3559758281646615e-06, + "loss": 0.418, + "step": 2557 + }, + { + "epoch": 2.1293007769145396, + "grad_norm": 0.307041734457016, + "learning_rate": 2.35186478427649e-06, + "loss": 0.3494, + "step": 2558 + }, + { + "epoch": 2.1301331853496115, + "grad_norm": 0.2808810770511627, + "learning_rate": 2.3477562270336564e-06, + "loss": 0.4012, + "step": 2559 + }, + { + "epoch": 2.130965593784684, + "grad_norm": 0.2891574800014496, + "learning_rate": 2.343650160294163e-06, + "loss": 0.358, + "step": 2560 + }, + { + "epoch": 2.131798002219756, + "grad_norm": 0.32515814900398254, + "learning_rate": 2.3395465879136795e-06, + "loss": 0.4128, + "step": 2561 + }, + { + "epoch": 2.132630410654828, + "grad_norm": 0.30551859736442566, + "learning_rate": 2.3354455137455312e-06, + "loss": 0.3673, + "step": 2562 + }, + { + "epoch": 2.1334628190899, + "grad_norm": 0.3107419013977051, + "learning_rate": 2.3313469416407037e-06, + "loss": 0.3966, + "step": 2563 + }, + { + "epoch": 2.134295227524972, + "grad_norm": 0.3009784519672394, + "learning_rate": 2.3272508754478224e-06, + "loss": 0.3692, + "step": 2564 + }, + { + "epoch": 2.1351276359600444, + "grad_norm": 0.3275810778141022, + "learning_rate": 2.3231573190131666e-06, + "loss": 0.4089, + "step": 2565 + }, + { + "epoch": 2.1359600443951163, + "grad_norm": 0.32291144132614136, + "learning_rate": 2.3190662761806586e-06, + "loss": 0.363, + "step": 2566 + }, + { + "epoch": 2.1367924528301887, + "grad_norm": 0.32055914402008057, + "learning_rate": 2.3149777507918587e-06, + "loss": 0.4111, + "step": 2567 + }, + { + "epoch": 2.1376248612652606, + "grad_norm": 0.31367000937461853, + "learning_rate": 2.310891746685963e-06, + "loss": 0.3907, + "step": 2568 + }, + { + "epoch": 2.138457269700333, + "grad_norm": 0.31795892119407654, + "learning_rate": 2.3068082676998022e-06, + "loss": 0.4011, + "step": 2569 + }, + { + "epoch": 2.139289678135405, + "grad_norm": 0.30362680554389954, + "learning_rate": 2.3027273176678337e-06, + "loss": 0.3924, + "step": 2570 + }, + { + "epoch": 2.1401220865704773, + "grad_norm": 0.3192872405052185, + "learning_rate": 2.298648900422141e-06, + "loss": 0.382, + "step": 2571 + }, + { + "epoch": 2.1409544950055492, + "grad_norm": 0.33818933367729187, + "learning_rate": 2.2945730197924303e-06, + "loss": 0.4072, + "step": 2572 + }, + { + "epoch": 2.1417869034406216, + "grad_norm": 0.3056696951389313, + "learning_rate": 2.2904996796060243e-06, + "loss": 0.3397, + "step": 2573 + }, + { + "epoch": 2.1426193118756935, + "grad_norm": 0.3521409034729004, + "learning_rate": 2.2864288836878616e-06, + "loss": 0.4124, + "step": 2574 + }, + { + "epoch": 2.143451720310766, + "grad_norm": 0.3235545754432678, + "learning_rate": 2.2823606358604868e-06, + "loss": 0.3854, + "step": 2575 + }, + { + "epoch": 2.144284128745838, + "grad_norm": 0.33744242787361145, + "learning_rate": 2.278294939944061e-06, + "loss": 0.3948, + "step": 2576 + }, + { + "epoch": 2.14511653718091, + "grad_norm": 0.30026137828826904, + "learning_rate": 2.2742317997563407e-06, + "loss": 0.3687, + "step": 2577 + }, + { + "epoch": 2.145948945615982, + "grad_norm": 0.3329930007457733, + "learning_rate": 2.2701712191126895e-06, + "loss": 0.4255, + "step": 2578 + }, + { + "epoch": 2.1467813540510545, + "grad_norm": 0.3148747682571411, + "learning_rate": 2.266113201826057e-06, + "loss": 0.3728, + "step": 2579 + }, + { + "epoch": 2.1476137624861265, + "grad_norm": 0.300738126039505, + "learning_rate": 2.2620577517069986e-06, + "loss": 0.3788, + "step": 2580 + }, + { + "epoch": 2.148446170921199, + "grad_norm": 0.31569600105285645, + "learning_rate": 2.2580048725636506e-06, + "loss": 0.4349, + "step": 2581 + }, + { + "epoch": 2.1492785793562708, + "grad_norm": 0.3264215290546417, + "learning_rate": 2.2539545682017394e-06, + "loss": 0.3995, + "step": 2582 + }, + { + "epoch": 2.150110987791343, + "grad_norm": 0.3219297230243683, + "learning_rate": 2.2499068424245667e-06, + "loss": 0.3659, + "step": 2583 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 0.32074615359306335, + "learning_rate": 2.245861699033023e-06, + "loss": 0.3935, + "step": 2584 + }, + { + "epoch": 2.1517758046614874, + "grad_norm": 0.31240931153297424, + "learning_rate": 2.2418191418255684e-06, + "loss": 0.4093, + "step": 2585 + }, + { + "epoch": 2.1526082130965594, + "grad_norm": 0.30113837122917175, + "learning_rate": 2.2377791745982323e-06, + "loss": 0.367, + "step": 2586 + }, + { + "epoch": 2.1534406215316313, + "grad_norm": 0.3172069489955902, + "learning_rate": 2.2337418011446154e-06, + "loss": 0.3933, + "step": 2587 + }, + { + "epoch": 2.1542730299667037, + "grad_norm": 0.3283001780509949, + "learning_rate": 2.229707025255881e-06, + "loss": 0.4171, + "step": 2588 + }, + { + "epoch": 2.1551054384017756, + "grad_norm": 0.30055010318756104, + "learning_rate": 2.225674850720759e-06, + "loss": 0.3517, + "step": 2589 + }, + { + "epoch": 2.155937846836848, + "grad_norm": 0.33835846185684204, + "learning_rate": 2.2216452813255273e-06, + "loss": 0.3955, + "step": 2590 + }, + { + "epoch": 2.15677025527192, + "grad_norm": 0.3244904577732086, + "learning_rate": 2.2176183208540236e-06, + "loss": 0.4064, + "step": 2591 + }, + { + "epoch": 2.1576026637069923, + "grad_norm": 0.32495352625846863, + "learning_rate": 2.2135939730876344e-06, + "loss": 0.4063, + "step": 2592 + }, + { + "epoch": 2.158435072142064, + "grad_norm": 0.32691752910614014, + "learning_rate": 2.2095722418052916e-06, + "loss": 0.3824, + "step": 2593 + }, + { + "epoch": 2.1592674805771366, + "grad_norm": 0.33834582567214966, + "learning_rate": 2.2055531307834734e-06, + "loss": 0.3959, + "step": 2594 + }, + { + "epoch": 2.1600998890122085, + "grad_norm": 0.3010924756526947, + "learning_rate": 2.2015366437961932e-06, + "loss": 0.3838, + "step": 2595 + }, + { + "epoch": 2.160932297447281, + "grad_norm": 0.3164057731628418, + "learning_rate": 2.197522784615004e-06, + "loss": 0.3656, + "step": 2596 + }, + { + "epoch": 2.161764705882353, + "grad_norm": 0.3379717171192169, + "learning_rate": 2.1935115570089897e-06, + "loss": 0.4121, + "step": 2597 + }, + { + "epoch": 2.162597114317425, + "grad_norm": 0.2782793939113617, + "learning_rate": 2.189502964744763e-06, + "loss": 0.3191, + "step": 2598 + }, + { + "epoch": 2.163429522752497, + "grad_norm": 0.2974027991294861, + "learning_rate": 2.1854970115864623e-06, + "loss": 0.4204, + "step": 2599 + }, + { + "epoch": 2.1642619311875695, + "grad_norm": 0.2943418622016907, + "learning_rate": 2.1814937012957476e-06, + "loss": 0.4057, + "step": 2600 + }, + { + "epoch": 2.1650943396226414, + "grad_norm": 0.30796945095062256, + "learning_rate": 2.1774930376317976e-06, + "loss": 0.3434, + "step": 2601 + }, + { + "epoch": 2.165926748057714, + "grad_norm": 0.35572606325149536, + "learning_rate": 2.1734950243513054e-06, + "loss": 0.4329, + "step": 2602 + }, + { + "epoch": 2.1667591564927857, + "grad_norm": 0.30681103467941284, + "learning_rate": 2.1694996652084752e-06, + "loss": 0.3559, + "step": 2603 + }, + { + "epoch": 2.167591564927858, + "grad_norm": 0.30400606989860535, + "learning_rate": 2.165506963955022e-06, + "loss": 0.3995, + "step": 2604 + }, + { + "epoch": 2.16842397336293, + "grad_norm": 0.30469441413879395, + "learning_rate": 2.1615169243401557e-06, + "loss": 0.3993, + "step": 2605 + }, + { + "epoch": 2.1692563817980024, + "grad_norm": 0.3207082748413086, + "learning_rate": 2.1575295501105987e-06, + "loss": 0.4184, + "step": 2606 + }, + { + "epoch": 2.1700887902330743, + "grad_norm": 0.30796340107917786, + "learning_rate": 2.1535448450105644e-06, + "loss": 0.3651, + "step": 2607 + }, + { + "epoch": 2.1709211986681467, + "grad_norm": 0.3026013672351837, + "learning_rate": 2.1495628127817618e-06, + "loss": 0.3816, + "step": 2608 + }, + { + "epoch": 2.1717536071032186, + "grad_norm": 0.33026769757270813, + "learning_rate": 2.1455834571633836e-06, + "loss": 0.4104, + "step": 2609 + }, + { + "epoch": 2.1725860155382906, + "grad_norm": 0.32119134068489075, + "learning_rate": 2.14160678189212e-06, + "loss": 0.4247, + "step": 2610 + }, + { + "epoch": 2.173418423973363, + "grad_norm": 0.30723845958709717, + "learning_rate": 2.1376327907021385e-06, + "loss": 0.3703, + "step": 2611 + }, + { + "epoch": 2.1742508324084353, + "grad_norm": 0.29472365975379944, + "learning_rate": 2.133661487325082e-06, + "loss": 0.3878, + "step": 2612 + }, + { + "epoch": 2.1750832408435072, + "grad_norm": 0.3001120090484619, + "learning_rate": 2.1296928754900753e-06, + "loss": 0.3795, + "step": 2613 + }, + { + "epoch": 2.175915649278579, + "grad_norm": 0.31931760907173157, + "learning_rate": 2.125726958923718e-06, + "loss": 0.3901, + "step": 2614 + }, + { + "epoch": 2.1767480577136515, + "grad_norm": 0.3011782765388489, + "learning_rate": 2.1217637413500735e-06, + "loss": 0.3744, + "step": 2615 + }, + { + "epoch": 2.1775804661487235, + "grad_norm": 0.3152015507221222, + "learning_rate": 2.1178032264906704e-06, + "loss": 0.382, + "step": 2616 + }, + { + "epoch": 2.178412874583796, + "grad_norm": 0.3104303479194641, + "learning_rate": 2.1138454180645035e-06, + "loss": 0.3745, + "step": 2617 + }, + { + "epoch": 2.1792452830188678, + "grad_norm": 0.32811030745506287, + "learning_rate": 2.109890319788023e-06, + "loss": 0.3892, + "step": 2618 + }, + { + "epoch": 2.18007769145394, + "grad_norm": 0.32053300738334656, + "learning_rate": 2.105937935375136e-06, + "loss": 0.372, + "step": 2619 + }, + { + "epoch": 2.180910099889012, + "grad_norm": 0.3680615723133087, + "learning_rate": 2.1019882685372016e-06, + "loss": 0.4414, + "step": 2620 + }, + { + "epoch": 2.1817425083240845, + "grad_norm": 0.29054370522499084, + "learning_rate": 2.0980413229830248e-06, + "loss": 0.3299, + "step": 2621 + }, + { + "epoch": 2.1825749167591564, + "grad_norm": 0.31951627135276794, + "learning_rate": 2.094097102418857e-06, + "loss": 0.3982, + "step": 2622 + }, + { + "epoch": 2.1834073251942288, + "grad_norm": 0.3075707256793976, + "learning_rate": 2.09015561054839e-06, + "loss": 0.3976, + "step": 2623 + }, + { + "epoch": 2.1842397336293007, + "grad_norm": 0.28511252999305725, + "learning_rate": 2.0862168510727545e-06, + "loss": 0.3907, + "step": 2624 + }, + { + "epoch": 2.185072142064373, + "grad_norm": 0.2891198694705963, + "learning_rate": 2.0822808276905144e-06, + "loss": 0.3904, + "step": 2625 + }, + { + "epoch": 2.185904550499445, + "grad_norm": 0.3299737870693207, + "learning_rate": 2.0783475440976635e-06, + "loss": 0.3974, + "step": 2626 + }, + { + "epoch": 2.1867369589345174, + "grad_norm": 0.3298386335372925, + "learning_rate": 2.0744170039876255e-06, + "loss": 0.3923, + "step": 2627 + }, + { + "epoch": 2.1875693673695893, + "grad_norm": 0.295673131942749, + "learning_rate": 2.0704892110512458e-06, + "loss": 0.3811, + "step": 2628 + }, + { + "epoch": 2.1884017758046617, + "grad_norm": 0.3085336983203888, + "learning_rate": 2.0665641689767902e-06, + "loss": 0.392, + "step": 2629 + }, + { + "epoch": 2.1892341842397336, + "grad_norm": 0.3050670921802521, + "learning_rate": 2.0626418814499428e-06, + "loss": 0.3935, + "step": 2630 + }, + { + "epoch": 2.190066592674806, + "grad_norm": 0.32354554533958435, + "learning_rate": 2.0587223521537996e-06, + "loss": 0.4429, + "step": 2631 + }, + { + "epoch": 2.190899001109878, + "grad_norm": 0.3790428638458252, + "learning_rate": 2.0548055847688676e-06, + "loss": 0.3709, + "step": 2632 + }, + { + "epoch": 2.19173140954495, + "grad_norm": 0.2934742271900177, + "learning_rate": 2.0508915829730595e-06, + "loss": 0.3677, + "step": 2633 + }, + { + "epoch": 2.192563817980022, + "grad_norm": 0.30265501141548157, + "learning_rate": 2.046980350441694e-06, + "loss": 0.4166, + "step": 2634 + }, + { + "epoch": 2.1933962264150946, + "grad_norm": 0.3148394525051117, + "learning_rate": 2.0430718908474813e-06, + "loss": 0.3848, + "step": 2635 + }, + { + "epoch": 2.1942286348501665, + "grad_norm": 0.35043779015541077, + "learning_rate": 2.0391662078605383e-06, + "loss": 0.4055, + "step": 2636 + }, + { + "epoch": 2.1950610432852384, + "grad_norm": 0.27816712856292725, + "learning_rate": 2.0352633051483705e-06, + "loss": 0.3301, + "step": 2637 + }, + { + "epoch": 2.195893451720311, + "grad_norm": 0.3395947813987732, + "learning_rate": 2.0313631863758677e-06, + "loss": 0.4272, + "step": 2638 + }, + { + "epoch": 2.1967258601553827, + "grad_norm": 0.3466813564300537, + "learning_rate": 2.02746585520531e-06, + "loss": 0.3971, + "step": 2639 + }, + { + "epoch": 2.197558268590455, + "grad_norm": 0.32729557156562805, + "learning_rate": 2.0235713152963627e-06, + "loss": 0.3828, + "step": 2640 + }, + { + "epoch": 2.198390677025527, + "grad_norm": 0.35937994718551636, + "learning_rate": 2.019679570306068e-06, + "loss": 0.3956, + "step": 2641 + }, + { + "epoch": 2.1992230854605994, + "grad_norm": 0.2958609163761139, + "learning_rate": 2.0157906238888376e-06, + "loss": 0.3495, + "step": 2642 + }, + { + "epoch": 2.2000554938956713, + "grad_norm": 0.31037241220474243, + "learning_rate": 2.0119044796964614e-06, + "loss": 0.3748, + "step": 2643 + }, + { + "epoch": 2.2008879023307437, + "grad_norm": 0.3517056107521057, + "learning_rate": 2.008021141378102e-06, + "loss": 0.4095, + "step": 2644 + }, + { + "epoch": 2.2017203107658156, + "grad_norm": 0.3355526030063629, + "learning_rate": 2.0041406125802764e-06, + "loss": 0.3765, + "step": 2645 + }, + { + "epoch": 2.202552719200888, + "grad_norm": 0.32547831535339355, + "learning_rate": 2.0002628969468713e-06, + "loss": 0.4366, + "step": 2646 + }, + { + "epoch": 2.20338512763596, + "grad_norm": 0.2848454415798187, + "learning_rate": 1.9963879981191288e-06, + "loss": 0.3593, + "step": 2647 + }, + { + "epoch": 2.2042175360710323, + "grad_norm": 0.3068901598453522, + "learning_rate": 1.9925159197356475e-06, + "loss": 0.38, + "step": 2648 + }, + { + "epoch": 2.2050499445061043, + "grad_norm": 0.33143675327301025, + "learning_rate": 1.9886466654323765e-06, + "loss": 0.4175, + "step": 2649 + }, + { + "epoch": 2.2058823529411766, + "grad_norm": 0.323045551776886, + "learning_rate": 1.9847802388426137e-06, + "loss": 0.4157, + "step": 2650 + }, + { + "epoch": 2.2067147613762486, + "grad_norm": 0.27590107917785645, + "learning_rate": 1.9809166435970006e-06, + "loss": 0.3653, + "step": 2651 + }, + { + "epoch": 2.207547169811321, + "grad_norm": 0.3093126714229584, + "learning_rate": 1.9770558833235215e-06, + "loss": 0.4101, + "step": 2652 + }, + { + "epoch": 2.208379578246393, + "grad_norm": 0.32659175992012024, + "learning_rate": 1.973197961647498e-06, + "loss": 0.4096, + "step": 2653 + }, + { + "epoch": 2.2092119866814652, + "grad_norm": 0.29490897059440613, + "learning_rate": 1.969342882191585e-06, + "loss": 0.3598, + "step": 2654 + }, + { + "epoch": 2.210044395116537, + "grad_norm": 0.2893614172935486, + "learning_rate": 1.9654906485757707e-06, + "loss": 0.3784, + "step": 2655 + }, + { + "epoch": 2.210876803551609, + "grad_norm": 0.2968060374259949, + "learning_rate": 1.9616412644173697e-06, + "loss": 0.3807, + "step": 2656 + }, + { + "epoch": 2.2117092119866815, + "grad_norm": 0.2902612090110779, + "learning_rate": 1.957794733331021e-06, + "loss": 0.3769, + "step": 2657 + }, + { + "epoch": 2.212541620421754, + "grad_norm": 0.30103838443756104, + "learning_rate": 1.9539510589286848e-06, + "loss": 0.3738, + "step": 2658 + }, + { + "epoch": 2.2133740288568258, + "grad_norm": 0.3171117901802063, + "learning_rate": 1.950110244819638e-06, + "loss": 0.3899, + "step": 2659 + }, + { + "epoch": 2.2142064372918977, + "grad_norm": 0.32826194167137146, + "learning_rate": 1.9462722946104727e-06, + "loss": 0.3894, + "step": 2660 + }, + { + "epoch": 2.21503884572697, + "grad_norm": 0.32006773352622986, + "learning_rate": 1.942437211905092e-06, + "loss": 0.3982, + "step": 2661 + }, + { + "epoch": 2.215871254162042, + "grad_norm": 0.33344799280166626, + "learning_rate": 1.9386050003047047e-06, + "loss": 0.3706, + "step": 2662 + }, + { + "epoch": 2.2167036625971144, + "grad_norm": 0.30667856335639954, + "learning_rate": 1.9347756634078273e-06, + "loss": 0.4086, + "step": 2663 + }, + { + "epoch": 2.2175360710321863, + "grad_norm": 0.31673869490623474, + "learning_rate": 1.93094920481027e-06, + "loss": 0.4003, + "step": 2664 + }, + { + "epoch": 2.2183684794672587, + "grad_norm": 0.3328154981136322, + "learning_rate": 1.9271256281051443e-06, + "loss": 0.4012, + "step": 2665 + }, + { + "epoch": 2.2192008879023306, + "grad_norm": 0.32742953300476074, + "learning_rate": 1.92330493688286e-06, + "loss": 0.4188, + "step": 2666 + }, + { + "epoch": 2.220033296337403, + "grad_norm": 0.27765265107154846, + "learning_rate": 1.9194871347311115e-06, + "loss": 0.39, + "step": 2667 + }, + { + "epoch": 2.220865704772475, + "grad_norm": 0.28916940093040466, + "learning_rate": 1.91567222523488e-06, + "loss": 0.389, + "step": 2668 + }, + { + "epoch": 2.2216981132075473, + "grad_norm": 0.2962568402290344, + "learning_rate": 1.9118602119764325e-06, + "loss": 0.3644, + "step": 2669 + }, + { + "epoch": 2.222530521642619, + "grad_norm": 0.33371245861053467, + "learning_rate": 1.90805109853532e-06, + "loss": 0.4107, + "step": 2670 + }, + { + "epoch": 2.2233629300776916, + "grad_norm": 0.3214453160762787, + "learning_rate": 1.9042448884883618e-06, + "loss": 0.4116, + "step": 2671 + }, + { + "epoch": 2.2241953385127635, + "grad_norm": 0.2842710614204407, + "learning_rate": 1.9004415854096586e-06, + "loss": 0.3673, + "step": 2672 + }, + { + "epoch": 2.225027746947836, + "grad_norm": 0.3369787633419037, + "learning_rate": 1.8966411928705757e-06, + "loss": 0.3995, + "step": 2673 + }, + { + "epoch": 2.225860155382908, + "grad_norm": 0.3225659132003784, + "learning_rate": 1.8928437144397538e-06, + "loss": 0.403, + "step": 2674 + }, + { + "epoch": 2.22669256381798, + "grad_norm": 0.32059574127197266, + "learning_rate": 1.8890491536830863e-06, + "loss": 0.356, + "step": 2675 + }, + { + "epoch": 2.227524972253052, + "grad_norm": 0.3181704580783844, + "learning_rate": 1.8852575141637347e-06, + "loss": 0.4074, + "step": 2676 + }, + { + "epoch": 2.2283573806881245, + "grad_norm": 0.317874938249588, + "learning_rate": 1.8814687994421138e-06, + "loss": 0.3938, + "step": 2677 + }, + { + "epoch": 2.2291897891231964, + "grad_norm": 0.3188163638114929, + "learning_rate": 1.8776830130758939e-06, + "loss": 0.3644, + "step": 2678 + }, + { + "epoch": 2.2300221975582684, + "grad_norm": 0.3100307285785675, + "learning_rate": 1.873900158619994e-06, + "loss": 0.4062, + "step": 2679 + }, + { + "epoch": 2.2308546059933407, + "grad_norm": 0.3106209635734558, + "learning_rate": 1.8701202396265815e-06, + "loss": 0.3857, + "step": 2680 + }, + { + "epoch": 2.231687014428413, + "grad_norm": 0.35905230045318604, + "learning_rate": 1.866343259645066e-06, + "loss": 0.4164, + "step": 2681 + }, + { + "epoch": 2.232519422863485, + "grad_norm": 0.28830498456954956, + "learning_rate": 1.8625692222220977e-06, + "loss": 0.3477, + "step": 2682 + }, + { + "epoch": 2.233351831298557, + "grad_norm": 0.30920305848121643, + "learning_rate": 1.8587981309015635e-06, + "loss": 0.4061, + "step": 2683 + }, + { + "epoch": 2.2341842397336293, + "grad_norm": 0.3114646375179291, + "learning_rate": 1.8550299892245854e-06, + "loss": 0.3915, + "step": 2684 + }, + { + "epoch": 2.2350166481687013, + "grad_norm": 0.3108902871608734, + "learning_rate": 1.851264800729513e-06, + "loss": 0.3879, + "step": 2685 + }, + { + "epoch": 2.2358490566037736, + "grad_norm": 0.2933717370033264, + "learning_rate": 1.8475025689519256e-06, + "loss": 0.3962, + "step": 2686 + }, + { + "epoch": 2.2366814650388456, + "grad_norm": 0.29493871331214905, + "learning_rate": 1.8437432974246238e-06, + "loss": 0.3765, + "step": 2687 + }, + { + "epoch": 2.237513873473918, + "grad_norm": 0.31428149342536926, + "learning_rate": 1.8399869896776296e-06, + "loss": 0.4303, + "step": 2688 + }, + { + "epoch": 2.23834628190899, + "grad_norm": 0.30307847261428833, + "learning_rate": 1.8362336492381832e-06, + "loss": 0.3772, + "step": 2689 + }, + { + "epoch": 2.2391786903440623, + "grad_norm": 0.31176915764808655, + "learning_rate": 1.8324832796307323e-06, + "loss": 0.3836, + "step": 2690 + }, + { + "epoch": 2.240011098779134, + "grad_norm": 0.2853330075740814, + "learning_rate": 1.8287358843769448e-06, + "loss": 0.3783, + "step": 2691 + }, + { + "epoch": 2.2408435072142066, + "grad_norm": 0.28568682074546814, + "learning_rate": 1.8249914669956886e-06, + "loss": 0.392, + "step": 2692 + }, + { + "epoch": 2.2416759156492785, + "grad_norm": 0.2875567674636841, + "learning_rate": 1.8212500310030385e-06, + "loss": 0.4076, + "step": 2693 + }, + { + "epoch": 2.242508324084351, + "grad_norm": 0.28917601704597473, + "learning_rate": 1.8175115799122656e-06, + "loss": 0.3889, + "step": 2694 + }, + { + "epoch": 2.243340732519423, + "grad_norm": 0.29513514041900635, + "learning_rate": 1.8137761172338404e-06, + "loss": 0.4111, + "step": 2695 + }, + { + "epoch": 2.244173140954495, + "grad_norm": 0.2941649854183197, + "learning_rate": 1.810043646475431e-06, + "loss": 0.3975, + "step": 2696 + }, + { + "epoch": 2.245005549389567, + "grad_norm": 0.28224435448646545, + "learning_rate": 1.8063141711418941e-06, + "loss": 0.3946, + "step": 2697 + }, + { + "epoch": 2.2458379578246395, + "grad_norm": 0.286742627620697, + "learning_rate": 1.8025876947352677e-06, + "loss": 0.3954, + "step": 2698 + }, + { + "epoch": 2.2466703662597114, + "grad_norm": 0.3206160366535187, + "learning_rate": 1.7988642207547784e-06, + "loss": 0.4243, + "step": 2699 + }, + { + "epoch": 2.2475027746947838, + "grad_norm": 0.3006853759288788, + "learning_rate": 1.795143752696839e-06, + "loss": 0.3759, + "step": 2700 + }, + { + "epoch": 2.2483351831298557, + "grad_norm": 0.29394397139549255, + "learning_rate": 1.7914262940550292e-06, + "loss": 0.389, + "step": 2701 + }, + { + "epoch": 2.2491675915649276, + "grad_norm": 0.3121004104614258, + "learning_rate": 1.7877118483201095e-06, + "loss": 0.3977, + "step": 2702 + }, + { + "epoch": 2.25, + "grad_norm": 0.3045893609523773, + "learning_rate": 1.784000418980007e-06, + "loss": 0.4071, + "step": 2703 + }, + { + "epoch": 2.2508324084350724, + "grad_norm": 0.30945533514022827, + "learning_rate": 1.7802920095198246e-06, + "loss": 0.3923, + "step": 2704 + }, + { + "epoch": 2.2516648168701443, + "grad_norm": 0.28781136870384216, + "learning_rate": 1.7765866234218187e-06, + "loss": 0.3648, + "step": 2705 + }, + { + "epoch": 2.2524972253052162, + "grad_norm": 0.32264959812164307, + "learning_rate": 1.7728842641654125e-06, + "loss": 0.4221, + "step": 2706 + }, + { + "epoch": 2.2533296337402886, + "grad_norm": 0.30960628390312195, + "learning_rate": 1.7691849352271872e-06, + "loss": 0.3859, + "step": 2707 + }, + { + "epoch": 2.2541620421753605, + "grad_norm": 0.3129541873931885, + "learning_rate": 1.7654886400808774e-06, + "loss": 0.3869, + "step": 2708 + }, + { + "epoch": 2.254994450610433, + "grad_norm": 0.2916148900985718, + "learning_rate": 1.7617953821973682e-06, + "loss": 0.363, + "step": 2709 + }, + { + "epoch": 2.255826859045505, + "grad_norm": 0.3174515664577484, + "learning_rate": 1.758105165044694e-06, + "loss": 0.4119, + "step": 2710 + }, + { + "epoch": 2.256659267480577, + "grad_norm": 0.28080281615257263, + "learning_rate": 1.7544179920880333e-06, + "loss": 0.3623, + "step": 2711 + }, + { + "epoch": 2.257491675915649, + "grad_norm": 0.31006425619125366, + "learning_rate": 1.7507338667897062e-06, + "loss": 0.4584, + "step": 2712 + }, + { + "epoch": 2.2583240843507215, + "grad_norm": 0.3159677982330322, + "learning_rate": 1.7470527926091702e-06, + "loss": 0.3642, + "step": 2713 + }, + { + "epoch": 2.2591564927857934, + "grad_norm": 0.34636712074279785, + "learning_rate": 1.7433747730030188e-06, + "loss": 0.3917, + "step": 2714 + }, + { + "epoch": 2.259988901220866, + "grad_norm": 0.28924471139907837, + "learning_rate": 1.7396998114249786e-06, + "loss": 0.3665, + "step": 2715 + }, + { + "epoch": 2.2608213096559377, + "grad_norm": 0.3206512928009033, + "learning_rate": 1.7360279113258977e-06, + "loss": 0.3851, + "step": 2716 + }, + { + "epoch": 2.26165371809101, + "grad_norm": 0.30951130390167236, + "learning_rate": 1.7323590761537595e-06, + "loss": 0.4265, + "step": 2717 + }, + { + "epoch": 2.262486126526082, + "grad_norm": 0.318645179271698, + "learning_rate": 1.7286933093536634e-06, + "loss": 0.3968, + "step": 2718 + }, + { + "epoch": 2.2633185349611544, + "grad_norm": 0.31277957558631897, + "learning_rate": 1.7250306143678292e-06, + "loss": 0.3947, + "step": 2719 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.30135253071784973, + "learning_rate": 1.7213709946355879e-06, + "loss": 0.3715, + "step": 2720 + }, + { + "epoch": 2.2649833518312987, + "grad_norm": 0.3124295771121979, + "learning_rate": 1.7177144535933903e-06, + "loss": 0.4376, + "step": 2721 + }, + { + "epoch": 2.2658157602663707, + "grad_norm": 0.30148327350616455, + "learning_rate": 1.7140609946747915e-06, + "loss": 0.382, + "step": 2722 + }, + { + "epoch": 2.266648168701443, + "grad_norm": 0.32914572954177856, + "learning_rate": 1.7104106213104554e-06, + "loss": 0.4407, + "step": 2723 + }, + { + "epoch": 2.267480577136515, + "grad_norm": 0.3174681067466736, + "learning_rate": 1.7067633369281422e-06, + "loss": 0.3585, + "step": 2724 + }, + { + "epoch": 2.268312985571587, + "grad_norm": 0.29542532563209534, + "learning_rate": 1.7031191449527162e-06, + "loss": 0.3803, + "step": 2725 + }, + { + "epoch": 2.2691453940066593, + "grad_norm": 0.2985629439353943, + "learning_rate": 1.699478048806143e-06, + "loss": 0.3619, + "step": 2726 + }, + { + "epoch": 2.2699778024417316, + "grad_norm": 0.3362717628479004, + "learning_rate": 1.6958400519074696e-06, + "loss": 0.3884, + "step": 2727 + }, + { + "epoch": 2.2708102108768036, + "grad_norm": 0.33345919847488403, + "learning_rate": 1.6922051576728415e-06, + "loss": 0.4431, + "step": 2728 + }, + { + "epoch": 2.2716426193118755, + "grad_norm": 0.2820887267589569, + "learning_rate": 1.6885733695154855e-06, + "loss": 0.319, + "step": 2729 + }, + { + "epoch": 2.272475027746948, + "grad_norm": 0.2951180636882782, + "learning_rate": 1.6849446908457201e-06, + "loss": 0.3953, + "step": 2730 + }, + { + "epoch": 2.27330743618202, + "grad_norm": 0.2896723747253418, + "learning_rate": 1.6813191250709326e-06, + "loss": 0.4086, + "step": 2731 + }, + { + "epoch": 2.274139844617092, + "grad_norm": 0.29083332419395447, + "learning_rate": 1.6776966755955941e-06, + "loss": 0.3748, + "step": 2732 + }, + { + "epoch": 2.274972253052164, + "grad_norm": 0.3114146888256073, + "learning_rate": 1.674077345821249e-06, + "loss": 0.4199, + "step": 2733 + }, + { + "epoch": 2.2758046614872365, + "grad_norm": 0.311087965965271, + "learning_rate": 1.6704611391465103e-06, + "loss": 0.4003, + "step": 2734 + }, + { + "epoch": 2.2766370699223084, + "grad_norm": 0.3135244846343994, + "learning_rate": 1.6668480589670604e-06, + "loss": 0.3827, + "step": 2735 + }, + { + "epoch": 2.277469478357381, + "grad_norm": 0.3102346956729889, + "learning_rate": 1.6632381086756439e-06, + "loss": 0.3987, + "step": 2736 + }, + { + "epoch": 2.2783018867924527, + "grad_norm": 0.2921448349952698, + "learning_rate": 1.6596312916620677e-06, + "loss": 0.3694, + "step": 2737 + }, + { + "epoch": 2.279134295227525, + "grad_norm": 0.31638967990875244, + "learning_rate": 1.6560276113131968e-06, + "loss": 0.4139, + "step": 2738 + }, + { + "epoch": 2.279966703662597, + "grad_norm": 0.29766252636909485, + "learning_rate": 1.6524270710129491e-06, + "loss": 0.3582, + "step": 2739 + }, + { + "epoch": 2.2807991120976694, + "grad_norm": 0.308046817779541, + "learning_rate": 1.6488296741422955e-06, + "loss": 0.3882, + "step": 2740 + }, + { + "epoch": 2.2816315205327413, + "grad_norm": 0.30890682339668274, + "learning_rate": 1.6452354240792561e-06, + "loss": 0.4078, + "step": 2741 + }, + { + "epoch": 2.2824639289678137, + "grad_norm": 0.31466445326805115, + "learning_rate": 1.64164432419889e-06, + "loss": 0.3866, + "step": 2742 + }, + { + "epoch": 2.2832963374028856, + "grad_norm": 0.3218589425086975, + "learning_rate": 1.6380563778733078e-06, + "loss": 0.3788, + "step": 2743 + }, + { + "epoch": 2.284128745837958, + "grad_norm": 0.3170068860054016, + "learning_rate": 1.6344715884716517e-06, + "loss": 0.3912, + "step": 2744 + }, + { + "epoch": 2.28496115427303, + "grad_norm": 0.28382614254951477, + "learning_rate": 1.630889959360104e-06, + "loss": 0.3923, + "step": 2745 + }, + { + "epoch": 2.2857935627081023, + "grad_norm": 0.3070010840892792, + "learning_rate": 1.627311493901872e-06, + "loss": 0.4414, + "step": 2746 + }, + { + "epoch": 2.2866259711431742, + "grad_norm": 0.2942955493927002, + "learning_rate": 1.6237361954572023e-06, + "loss": 0.3476, + "step": 2747 + }, + { + "epoch": 2.287458379578246, + "grad_norm": 0.2962301969528198, + "learning_rate": 1.6201640673833613e-06, + "loss": 0.3807, + "step": 2748 + }, + { + "epoch": 2.2882907880133185, + "grad_norm": 0.3063211441040039, + "learning_rate": 1.6165951130346408e-06, + "loss": 0.4005, + "step": 2749 + }, + { + "epoch": 2.289123196448391, + "grad_norm": 0.30910301208496094, + "learning_rate": 1.6130293357623473e-06, + "loss": 0.4213, + "step": 2750 + }, + { + "epoch": 2.289955604883463, + "grad_norm": 0.3249872922897339, + "learning_rate": 1.6094667389148128e-06, + "loss": 0.3776, + "step": 2751 + }, + { + "epoch": 2.2907880133185348, + "grad_norm": 0.3110400140285492, + "learning_rate": 1.605907325837378e-06, + "loss": 0.4065, + "step": 2752 + }, + { + "epoch": 2.291620421753607, + "grad_norm": 0.28765764832496643, + "learning_rate": 1.6023510998723906e-06, + "loss": 0.3952, + "step": 2753 + }, + { + "epoch": 2.292452830188679, + "grad_norm": 0.3131961524486542, + "learning_rate": 1.598798064359211e-06, + "loss": 0.4005, + "step": 2754 + }, + { + "epoch": 2.2932852386237514, + "grad_norm": 0.30940961837768555, + "learning_rate": 1.5952482226342003e-06, + "loss": 0.3693, + "step": 2755 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 0.32620513439178467, + "learning_rate": 1.5917015780307265e-06, + "loss": 0.418, + "step": 2756 + }, + { + "epoch": 2.2949500554938957, + "grad_norm": 0.2872724235057831, + "learning_rate": 1.5881581338791462e-06, + "loss": 0.343, + "step": 2757 + }, + { + "epoch": 2.2957824639289677, + "grad_norm": 0.28437381982803345, + "learning_rate": 1.5846178935068173e-06, + "loss": 0.3797, + "step": 2758 + }, + { + "epoch": 2.29661487236404, + "grad_norm": 0.29751530289649963, + "learning_rate": 1.5810808602380872e-06, + "loss": 0.4177, + "step": 2759 + }, + { + "epoch": 2.297447280799112, + "grad_norm": 0.29856476187705994, + "learning_rate": 1.5775470373942926e-06, + "loss": 0.3655, + "step": 2760 + }, + { + "epoch": 2.2982796892341844, + "grad_norm": 0.3321475684642792, + "learning_rate": 1.5740164282937548e-06, + "loss": 0.4332, + "step": 2761 + }, + { + "epoch": 2.2991120976692563, + "grad_norm": 0.2865569591522217, + "learning_rate": 1.5704890362517772e-06, + "loss": 0.3488, + "step": 2762 + }, + { + "epoch": 2.2999445061043287, + "grad_norm": 0.30639970302581787, + "learning_rate": 1.5669648645806428e-06, + "loss": 0.3751, + "step": 2763 + }, + { + "epoch": 2.3007769145394006, + "grad_norm": 0.3253311812877655, + "learning_rate": 1.5634439165896103e-06, + "loss": 0.3768, + "step": 2764 + }, + { + "epoch": 2.301609322974473, + "grad_norm": 0.33678269386291504, + "learning_rate": 1.5599261955849126e-06, + "loss": 0.3722, + "step": 2765 + }, + { + "epoch": 2.302441731409545, + "grad_norm": 0.32921579480171204, + "learning_rate": 1.5564117048697503e-06, + "loss": 0.425, + "step": 2766 + }, + { + "epoch": 2.3032741398446173, + "grad_norm": 0.29400959610939026, + "learning_rate": 1.5529004477442921e-06, + "loss": 0.3497, + "step": 2767 + }, + { + "epoch": 2.304106548279689, + "grad_norm": 0.30234500765800476, + "learning_rate": 1.5493924275056699e-06, + "loss": 0.3947, + "step": 2768 + }, + { + "epoch": 2.3049389567147616, + "grad_norm": 0.29784688353538513, + "learning_rate": 1.5458876474479757e-06, + "loss": 0.3688, + "step": 2769 + }, + { + "epoch": 2.3057713651498335, + "grad_norm": 0.3415769636631012, + "learning_rate": 1.5423861108622601e-06, + "loss": 0.4432, + "step": 2770 + }, + { + "epoch": 2.3066037735849054, + "grad_norm": 0.2958844304084778, + "learning_rate": 1.5388878210365283e-06, + "loss": 0.3377, + "step": 2771 + }, + { + "epoch": 2.307436182019978, + "grad_norm": 0.3629142940044403, + "learning_rate": 1.5353927812557306e-06, + "loss": 0.4083, + "step": 2772 + }, + { + "epoch": 2.30826859045505, + "grad_norm": 0.3380175530910492, + "learning_rate": 1.5319009948017765e-06, + "loss": 0.3929, + "step": 2773 + }, + { + "epoch": 2.309100998890122, + "grad_norm": 0.2930530607700348, + "learning_rate": 1.528412464953512e-06, + "loss": 0.389, + "step": 2774 + }, + { + "epoch": 2.309933407325194, + "grad_norm": 0.3223347067832947, + "learning_rate": 1.5249271949867294e-06, + "loss": 0.4326, + "step": 2775 + }, + { + "epoch": 2.3107658157602664, + "grad_norm": 0.2680168151855469, + "learning_rate": 1.5214451881741544e-06, + "loss": 0.3597, + "step": 2776 + }, + { + "epoch": 2.3115982241953383, + "grad_norm": 0.29601550102233887, + "learning_rate": 1.5179664477854556e-06, + "loss": 0.3948, + "step": 2777 + }, + { + "epoch": 2.3124306326304107, + "grad_norm": 0.31549885869026184, + "learning_rate": 1.5144909770872324e-06, + "loss": 0.4028, + "step": 2778 + }, + { + "epoch": 2.3132630410654826, + "grad_norm": 0.3089504837989807, + "learning_rate": 1.5110187793430086e-06, + "loss": 0.3954, + "step": 2779 + }, + { + "epoch": 2.314095449500555, + "grad_norm": 0.3053048551082611, + "learning_rate": 1.5075498578132398e-06, + "loss": 0.4161, + "step": 2780 + }, + { + "epoch": 2.314927857935627, + "grad_norm": 0.3325631320476532, + "learning_rate": 1.504084215755306e-06, + "loss": 0.4186, + "step": 2781 + }, + { + "epoch": 2.3157602663706993, + "grad_norm": 0.2945425510406494, + "learning_rate": 1.5006218564235058e-06, + "loss": 0.3561, + "step": 2782 + }, + { + "epoch": 2.3165926748057712, + "grad_norm": 0.35173118114471436, + "learning_rate": 1.4971627830690533e-06, + "loss": 0.4315, + "step": 2783 + }, + { + "epoch": 2.3174250832408436, + "grad_norm": 0.326571524143219, + "learning_rate": 1.4937069989400782e-06, + "loss": 0.3789, + "step": 2784 + }, + { + "epoch": 2.3182574916759155, + "grad_norm": 0.32909318804740906, + "learning_rate": 1.4902545072816266e-06, + "loss": 0.4114, + "step": 2785 + }, + { + "epoch": 2.319089900110988, + "grad_norm": 0.3058091998100281, + "learning_rate": 1.4868053113356446e-06, + "loss": 0.3674, + "step": 2786 + }, + { + "epoch": 2.31992230854606, + "grad_norm": 0.31880733370780945, + "learning_rate": 1.483359414340989e-06, + "loss": 0.4133, + "step": 2787 + }, + { + "epoch": 2.3207547169811322, + "grad_norm": 0.28259992599487305, + "learning_rate": 1.4799168195334174e-06, + "loss": 0.3619, + "step": 2788 + }, + { + "epoch": 2.321587125416204, + "grad_norm": 0.33174195885658264, + "learning_rate": 1.4764775301455859e-06, + "loss": 0.41, + "step": 2789 + }, + { + "epoch": 2.3224195338512765, + "grad_norm": 0.3177911043167114, + "learning_rate": 1.4730415494070482e-06, + "loss": 0.3861, + "step": 2790 + }, + { + "epoch": 2.3232519422863485, + "grad_norm": 0.2958433926105499, + "learning_rate": 1.4696088805442505e-06, + "loss": 0.3628, + "step": 2791 + }, + { + "epoch": 2.324084350721421, + "grad_norm": 0.2952271103858948, + "learning_rate": 1.466179526780529e-06, + "loss": 0.3991, + "step": 2792 + }, + { + "epoch": 2.3249167591564928, + "grad_norm": 0.3214179575443268, + "learning_rate": 1.4627534913361064e-06, + "loss": 0.409, + "step": 2793 + }, + { + "epoch": 2.3257491675915647, + "grad_norm": 0.31064051389694214, + "learning_rate": 1.4593307774280895e-06, + "loss": 0.4192, + "step": 2794 + }, + { + "epoch": 2.326581576026637, + "grad_norm": 0.30104267597198486, + "learning_rate": 1.4559113882704683e-06, + "loss": 0.3648, + "step": 2795 + }, + { + "epoch": 2.3274139844617094, + "grad_norm": 0.3028804659843445, + "learning_rate": 1.4524953270741077e-06, + "loss": 0.3694, + "step": 2796 + }, + { + "epoch": 2.3282463928967814, + "grad_norm": 0.309151828289032, + "learning_rate": 1.4490825970467493e-06, + "loss": 0.4093, + "step": 2797 + }, + { + "epoch": 2.3290788013318533, + "grad_norm": 0.31857770681381226, + "learning_rate": 1.4456732013930064e-06, + "loss": 0.3984, + "step": 2798 + }, + { + "epoch": 2.3299112097669257, + "grad_norm": 0.3101446032524109, + "learning_rate": 1.442267143314361e-06, + "loss": 0.3722, + "step": 2799 + }, + { + "epoch": 2.3307436182019976, + "grad_norm": 0.29778382182121277, + "learning_rate": 1.4388644260091617e-06, + "loss": 0.3865, + "step": 2800 + }, + { + "epoch": 2.33157602663707, + "grad_norm": 0.309423565864563, + "learning_rate": 1.435465052672621e-06, + "loss": 0.3819, + "step": 2801 + }, + { + "epoch": 2.332408435072142, + "grad_norm": 0.301540732383728, + "learning_rate": 1.432069026496805e-06, + "loss": 0.3653, + "step": 2802 + }, + { + "epoch": 2.3332408435072143, + "grad_norm": 0.28494688868522644, + "learning_rate": 1.4286763506706474e-06, + "loss": 0.3809, + "step": 2803 + }, + { + "epoch": 2.334073251942286, + "grad_norm": 0.2828214764595032, + "learning_rate": 1.425287028379929e-06, + "loss": 0.4076, + "step": 2804 + }, + { + "epoch": 2.3349056603773586, + "grad_norm": 0.3035745620727539, + "learning_rate": 1.4219010628072806e-06, + "loss": 0.4237, + "step": 2805 + }, + { + "epoch": 2.3357380688124305, + "grad_norm": 0.30221620202064514, + "learning_rate": 1.418518457132182e-06, + "loss": 0.4185, + "step": 2806 + }, + { + "epoch": 2.336570477247503, + "grad_norm": 0.29032158851623535, + "learning_rate": 1.4151392145309634e-06, + "loss": 0.4039, + "step": 2807 + }, + { + "epoch": 2.337402885682575, + "grad_norm": 0.3030306398868561, + "learning_rate": 1.4117633381767925e-06, + "loss": 0.4108, + "step": 2808 + }, + { + "epoch": 2.338235294117647, + "grad_norm": 0.3300309181213379, + "learning_rate": 1.4083908312396727e-06, + "loss": 0.4123, + "step": 2809 + }, + { + "epoch": 2.339067702552719, + "grad_norm": 0.3179803490638733, + "learning_rate": 1.4050216968864477e-06, + "loss": 0.3796, + "step": 2810 + }, + { + "epoch": 2.3399001109877915, + "grad_norm": 0.28045690059661865, + "learning_rate": 1.401655938280798e-06, + "loss": 0.368, + "step": 2811 + }, + { + "epoch": 2.3407325194228634, + "grad_norm": 0.32688331604003906, + "learning_rate": 1.3982935585832253e-06, + "loss": 0.3847, + "step": 2812 + }, + { + "epoch": 2.341564927857936, + "grad_norm": 0.29030436277389526, + "learning_rate": 1.3949345609510645e-06, + "loss": 0.3711, + "step": 2813 + }, + { + "epoch": 2.3423973362930077, + "grad_norm": 0.33373093605041504, + "learning_rate": 1.3915789485384718e-06, + "loss": 0.4009, + "step": 2814 + }, + { + "epoch": 2.34322974472808, + "grad_norm": 0.3265412151813507, + "learning_rate": 1.3882267244964304e-06, + "loss": 0.4302, + "step": 2815 + }, + { + "epoch": 2.344062153163152, + "grad_norm": 0.30916792154312134, + "learning_rate": 1.3848778919727324e-06, + "loss": 0.392, + "step": 2816 + }, + { + "epoch": 2.344894561598224, + "grad_norm": 0.30912598967552185, + "learning_rate": 1.3815324541119924e-06, + "loss": 0.3968, + "step": 2817 + }, + { + "epoch": 2.3457269700332963, + "grad_norm": 0.31527265906333923, + "learning_rate": 1.3781904140556352e-06, + "loss": 0.3742, + "step": 2818 + }, + { + "epoch": 2.3465593784683687, + "grad_norm": 0.2920280694961548, + "learning_rate": 1.3748517749418944e-06, + "loss": 0.3646, + "step": 2819 + }, + { + "epoch": 2.3473917869034406, + "grad_norm": 0.3410710394382477, + "learning_rate": 1.3715165399058106e-06, + "loss": 0.426, + "step": 2820 + }, + { + "epoch": 2.3482241953385126, + "grad_norm": 0.3250875771045685, + "learning_rate": 1.368184712079228e-06, + "loss": 0.417, + "step": 2821 + }, + { + "epoch": 2.349056603773585, + "grad_norm": 0.2889362573623657, + "learning_rate": 1.3648562945907916e-06, + "loss": 0.3865, + "step": 2822 + }, + { + "epoch": 2.349889012208657, + "grad_norm": 0.3278926610946655, + "learning_rate": 1.3615312905659434e-06, + "loss": 0.4317, + "step": 2823 + }, + { + "epoch": 2.3507214206437292, + "grad_norm": 0.3064905107021332, + "learning_rate": 1.3582097031269208e-06, + "loss": 0.3656, + "step": 2824 + }, + { + "epoch": 2.351553829078801, + "grad_norm": 0.31251177191734314, + "learning_rate": 1.3548915353927516e-06, + "loss": 0.415, + "step": 2825 + }, + { + "epoch": 2.3523862375138735, + "grad_norm": 0.3331948220729828, + "learning_rate": 1.3515767904792548e-06, + "loss": 0.3624, + "step": 2826 + }, + { + "epoch": 2.3532186459489455, + "grad_norm": 0.31363222002983093, + "learning_rate": 1.3482654714990323e-06, + "loss": 0.3953, + "step": 2827 + }, + { + "epoch": 2.354051054384018, + "grad_norm": 0.30317604541778564, + "learning_rate": 1.3449575815614719e-06, + "loss": 0.4045, + "step": 2828 + }, + { + "epoch": 2.35488346281909, + "grad_norm": 0.30388328433036804, + "learning_rate": 1.3416531237727398e-06, + "loss": 0.4031, + "step": 2829 + }, + { + "epoch": 2.355715871254162, + "grad_norm": 0.30564767122268677, + "learning_rate": 1.338352101235781e-06, + "loss": 0.3676, + "step": 2830 + }, + { + "epoch": 2.356548279689234, + "grad_norm": 0.3132300078868866, + "learning_rate": 1.3350545170503087e-06, + "loss": 0.4245, + "step": 2831 + }, + { + "epoch": 2.3573806881243065, + "grad_norm": 0.3112890124320984, + "learning_rate": 1.3317603743128177e-06, + "loss": 0.3984, + "step": 2832 + }, + { + "epoch": 2.3582130965593784, + "grad_norm": 0.32551440596580505, + "learning_rate": 1.3284696761165634e-06, + "loss": 0.4194, + "step": 2833 + }, + { + "epoch": 2.3590455049944508, + "grad_norm": 0.3218403160572052, + "learning_rate": 1.3251824255515704e-06, + "loss": 0.4249, + "step": 2834 + }, + { + "epoch": 2.3598779134295227, + "grad_norm": 0.2741534113883972, + "learning_rate": 1.3218986257046217e-06, + "loss": 0.3283, + "step": 2835 + }, + { + "epoch": 2.360710321864595, + "grad_norm": 0.33001041412353516, + "learning_rate": 1.3186182796592634e-06, + "loss": 0.4112, + "step": 2836 + }, + { + "epoch": 2.361542730299667, + "grad_norm": 0.29912424087524414, + "learning_rate": 1.3153413904958024e-06, + "loss": 0.3664, + "step": 2837 + }, + { + "epoch": 2.3623751387347394, + "grad_norm": 0.2997702360153198, + "learning_rate": 1.3120679612912896e-06, + "loss": 0.3769, + "step": 2838 + }, + { + "epoch": 2.3632075471698113, + "grad_norm": 0.295955628156662, + "learning_rate": 1.308797995119534e-06, + "loss": 0.3931, + "step": 2839 + }, + { + "epoch": 2.3640399556048832, + "grad_norm": 0.3053940236568451, + "learning_rate": 1.30553149505109e-06, + "loss": 0.3741, + "step": 2840 + }, + { + "epoch": 2.3648723640399556, + "grad_norm": 0.3083863854408264, + "learning_rate": 1.302268464153263e-06, + "loss": 0.4192, + "step": 2841 + }, + { + "epoch": 2.365704772475028, + "grad_norm": 0.3068256378173828, + "learning_rate": 1.2990089054900918e-06, + "loss": 0.4053, + "step": 2842 + }, + { + "epoch": 2.3665371809101, + "grad_norm": 0.3323042392730713, + "learning_rate": 1.2957528221223591e-06, + "loss": 0.44, + "step": 2843 + }, + { + "epoch": 2.367369589345172, + "grad_norm": 0.29590079188346863, + "learning_rate": 1.2925002171075846e-06, + "loss": 0.3591, + "step": 2844 + }, + { + "epoch": 2.368201997780244, + "grad_norm": 0.3084840476512909, + "learning_rate": 1.2892510935000252e-06, + "loss": 0.4094, + "step": 2845 + }, + { + "epoch": 2.369034406215316, + "grad_norm": 0.2761549651622772, + "learning_rate": 1.2860054543506595e-06, + "loss": 0.349, + "step": 2846 + }, + { + "epoch": 2.3698668146503885, + "grad_norm": 0.30774176120758057, + "learning_rate": 1.2827633027072017e-06, + "loss": 0.4483, + "step": 2847 + }, + { + "epoch": 2.3706992230854604, + "grad_norm": 0.2745443284511566, + "learning_rate": 1.2795246416140895e-06, + "loss": 0.3591, + "step": 2848 + }, + { + "epoch": 2.371531631520533, + "grad_norm": 0.3255792558193207, + "learning_rate": 1.2762894741124814e-06, + "loss": 0.4007, + "step": 2849 + }, + { + "epoch": 2.3723640399556047, + "grad_norm": 0.34070661664009094, + "learning_rate": 1.273057803240257e-06, + "loss": 0.4215, + "step": 2850 + }, + { + "epoch": 2.373196448390677, + "grad_norm": 0.31039872765541077, + "learning_rate": 1.2698296320320113e-06, + "loss": 0.3986, + "step": 2851 + }, + { + "epoch": 2.374028856825749, + "grad_norm": 0.2800928056240082, + "learning_rate": 1.2666049635190535e-06, + "loss": 0.3828, + "step": 2852 + }, + { + "epoch": 2.3748612652608214, + "grad_norm": 0.28755927085876465, + "learning_rate": 1.2633838007294048e-06, + "loss": 0.3997, + "step": 2853 + }, + { + "epoch": 2.3756936736958933, + "grad_norm": 0.28583160042762756, + "learning_rate": 1.260166146687793e-06, + "loss": 0.3762, + "step": 2854 + }, + { + "epoch": 2.3765260821309657, + "grad_norm": 0.33797693252563477, + "learning_rate": 1.2569520044156509e-06, + "loss": 0.415, + "step": 2855 + }, + { + "epoch": 2.3773584905660377, + "grad_norm": 0.31852585077285767, + "learning_rate": 1.2537413769311163e-06, + "loss": 0.4078, + "step": 2856 + }, + { + "epoch": 2.37819089900111, + "grad_norm": 0.3200073838233948, + "learning_rate": 1.25053426724902e-06, + "loss": 0.3974, + "step": 2857 + }, + { + "epoch": 2.379023307436182, + "grad_norm": 0.3143557608127594, + "learning_rate": 1.247330678380899e-06, + "loss": 0.3691, + "step": 2858 + }, + { + "epoch": 2.3798557158712543, + "grad_norm": 0.30389317870140076, + "learning_rate": 1.2441306133349785e-06, + "loss": 0.4205, + "step": 2859 + }, + { + "epoch": 2.3806881243063263, + "grad_norm": 0.2985035181045532, + "learning_rate": 1.2409340751161753e-06, + "loss": 0.3629, + "step": 2860 + }, + { + "epoch": 2.3815205327413986, + "grad_norm": 0.3365674316883087, + "learning_rate": 1.2377410667260914e-06, + "loss": 0.4372, + "step": 2861 + }, + { + "epoch": 2.3823529411764706, + "grad_norm": 0.31339481472969055, + "learning_rate": 1.2345515911630223e-06, + "loss": 0.4141, + "step": 2862 + }, + { + "epoch": 2.3831853496115425, + "grad_norm": 0.3070381283760071, + "learning_rate": 1.2313656514219408e-06, + "loss": 0.3372, + "step": 2863 + }, + { + "epoch": 2.384017758046615, + "grad_norm": 0.2941051423549652, + "learning_rate": 1.2281832504944967e-06, + "loss": 0.383, + "step": 2864 + }, + { + "epoch": 2.3848501664816872, + "grad_norm": 0.2965616285800934, + "learning_rate": 1.2250043913690235e-06, + "loss": 0.4028, + "step": 2865 + }, + { + "epoch": 2.385682574916759, + "grad_norm": 0.30054527521133423, + "learning_rate": 1.2218290770305218e-06, + "loss": 0.3639, + "step": 2866 + }, + { + "epoch": 2.386514983351831, + "grad_norm": 0.3061928451061249, + "learning_rate": 1.2186573104606735e-06, + "loss": 0.4202, + "step": 2867 + }, + { + "epoch": 2.3873473917869035, + "grad_norm": 0.29752233624458313, + "learning_rate": 1.2154890946378178e-06, + "loss": 0.3646, + "step": 2868 + }, + { + "epoch": 2.3881798002219754, + "grad_norm": 0.3129211664199829, + "learning_rate": 1.2123244325369665e-06, + "loss": 0.4046, + "step": 2869 + }, + { + "epoch": 2.3890122086570478, + "grad_norm": 0.3117455244064331, + "learning_rate": 1.2091633271297916e-06, + "loss": 0.4251, + "step": 2870 + }, + { + "epoch": 2.3898446170921197, + "grad_norm": 0.2949092984199524, + "learning_rate": 1.20600578138463e-06, + "loss": 0.3705, + "step": 2871 + }, + { + "epoch": 2.390677025527192, + "grad_norm": 0.31923535466194153, + "learning_rate": 1.2028517982664683e-06, + "loss": 0.4063, + "step": 2872 + }, + { + "epoch": 2.391509433962264, + "grad_norm": 0.30522382259368896, + "learning_rate": 1.1997013807369535e-06, + "loss": 0.3575, + "step": 2873 + }, + { + "epoch": 2.3923418423973364, + "grad_norm": 0.29187336564064026, + "learning_rate": 1.196554531754383e-06, + "loss": 0.3778, + "step": 2874 + }, + { + "epoch": 2.3931742508324083, + "grad_norm": 0.3098593056201935, + "learning_rate": 1.193411254273703e-06, + "loss": 0.4229, + "step": 2875 + }, + { + "epoch": 2.3940066592674807, + "grad_norm": 0.2713955342769623, + "learning_rate": 1.1902715512465057e-06, + "loss": 0.3677, + "step": 2876 + }, + { + "epoch": 2.3948390677025526, + "grad_norm": 0.26880118250846863, + "learning_rate": 1.1871354256210277e-06, + "loss": 0.4, + "step": 2877 + }, + { + "epoch": 2.395671476137625, + "grad_norm": 0.31260430812835693, + "learning_rate": 1.1840028803421455e-06, + "loss": 0.4367, + "step": 2878 + }, + { + "epoch": 2.396503884572697, + "grad_norm": 0.28427496552467346, + "learning_rate": 1.1808739183513745e-06, + "loss": 0.3538, + "step": 2879 + }, + { + "epoch": 2.3973362930077693, + "grad_norm": 0.32038092613220215, + "learning_rate": 1.1777485425868639e-06, + "loss": 0.368, + "step": 2880 + }, + { + "epoch": 2.398168701442841, + "grad_norm": 0.32951870560646057, + "learning_rate": 1.1746267559833973e-06, + "loss": 0.3891, + "step": 2881 + }, + { + "epoch": 2.3990011098779136, + "grad_norm": 0.2811376452445984, + "learning_rate": 1.1715085614723881e-06, + "loss": 0.375, + "step": 2882 + }, + { + "epoch": 2.3998335183129855, + "grad_norm": 0.29977062344551086, + "learning_rate": 1.1683939619818708e-06, + "loss": 0.4002, + "step": 2883 + }, + { + "epoch": 2.400665926748058, + "grad_norm": 0.303155779838562, + "learning_rate": 1.1652829604365135e-06, + "loss": 0.3454, + "step": 2884 + }, + { + "epoch": 2.40149833518313, + "grad_norm": 0.3123762309551239, + "learning_rate": 1.1621755597575996e-06, + "loss": 0.4271, + "step": 2885 + }, + { + "epoch": 2.4023307436182018, + "grad_norm": 0.28050366044044495, + "learning_rate": 1.1590717628630337e-06, + "loss": 0.3879, + "step": 2886 + }, + { + "epoch": 2.403163152053274, + "grad_norm": 0.2736967206001282, + "learning_rate": 1.155971572667332e-06, + "loss": 0.3857, + "step": 2887 + }, + { + "epoch": 2.4039955604883465, + "grad_norm": 0.3153398334980011, + "learning_rate": 1.1528749920816319e-06, + "loss": 0.391, + "step": 2888 + }, + { + "epoch": 2.4048279689234184, + "grad_norm": 0.3044677972793579, + "learning_rate": 1.1497820240136753e-06, + "loss": 0.4164, + "step": 2889 + }, + { + "epoch": 2.4056603773584904, + "grad_norm": 0.3012680411338806, + "learning_rate": 1.1466926713678117e-06, + "loss": 0.382, + "step": 2890 + }, + { + "epoch": 2.4064927857935627, + "grad_norm": 0.32723140716552734, + "learning_rate": 1.143606937044997e-06, + "loss": 0.4242, + "step": 2891 + }, + { + "epoch": 2.4073251942286347, + "grad_norm": 0.3037180006504059, + "learning_rate": 1.140524823942793e-06, + "loss": 0.4052, + "step": 2892 + }, + { + "epoch": 2.408157602663707, + "grad_norm": 0.300297349691391, + "learning_rate": 1.137446334955357e-06, + "loss": 0.3693, + "step": 2893 + }, + { + "epoch": 2.408990011098779, + "grad_norm": 0.30542272329330444, + "learning_rate": 1.1343714729734424e-06, + "loss": 0.3466, + "step": 2894 + }, + { + "epoch": 2.4098224195338513, + "grad_norm": 0.32545021176338196, + "learning_rate": 1.1313002408843986e-06, + "loss": 0.3899, + "step": 2895 + }, + { + "epoch": 2.4106548279689233, + "grad_norm": 0.30949637293815613, + "learning_rate": 1.1282326415721657e-06, + "loss": 0.4122, + "step": 2896 + }, + { + "epoch": 2.4114872364039956, + "grad_norm": 0.27907073497772217, + "learning_rate": 1.1251686779172772e-06, + "loss": 0.3984, + "step": 2897 + }, + { + "epoch": 2.4123196448390676, + "grad_norm": 0.3177613914012909, + "learning_rate": 1.122108352796844e-06, + "loss": 0.3756, + "step": 2898 + }, + { + "epoch": 2.41315205327414, + "grad_norm": 0.31491708755493164, + "learning_rate": 1.119051669084567e-06, + "loss": 0.4117, + "step": 2899 + }, + { + "epoch": 2.413984461709212, + "grad_norm": 0.2828831374645233, + "learning_rate": 1.1159986296507259e-06, + "loss": 0.3907, + "step": 2900 + }, + { + "epoch": 2.4148168701442843, + "grad_norm": 0.281389057636261, + "learning_rate": 1.112949237362177e-06, + "loss": 0.3957, + "step": 2901 + }, + { + "epoch": 2.415649278579356, + "grad_norm": 0.29541143774986267, + "learning_rate": 1.1099034950823539e-06, + "loss": 0.3692, + "step": 2902 + }, + { + "epoch": 2.4164816870144286, + "grad_norm": 0.3205885887145996, + "learning_rate": 1.1068614056712624e-06, + "loss": 0.4481, + "step": 2903 + }, + { + "epoch": 2.4173140954495005, + "grad_norm": 0.3070029020309448, + "learning_rate": 1.103822971985477e-06, + "loss": 0.3735, + "step": 2904 + }, + { + "epoch": 2.418146503884573, + "grad_norm": 0.31543856859207153, + "learning_rate": 1.1007881968781403e-06, + "loss": 0.3967, + "step": 2905 + }, + { + "epoch": 2.418978912319645, + "grad_norm": 0.29121309518814087, + "learning_rate": 1.0977570831989593e-06, + "loss": 0.3626, + "step": 2906 + }, + { + "epoch": 2.419811320754717, + "grad_norm": 0.3155393600463867, + "learning_rate": 1.0947296337942026e-06, + "loss": 0.4054, + "step": 2907 + }, + { + "epoch": 2.420643729189789, + "grad_norm": 0.3240245282649994, + "learning_rate": 1.091705851506698e-06, + "loss": 0.4109, + "step": 2908 + }, + { + "epoch": 2.421476137624861, + "grad_norm": 0.27017274498939514, + "learning_rate": 1.088685739175831e-06, + "loss": 0.3569, + "step": 2909 + }, + { + "epoch": 2.4223085460599334, + "grad_norm": 0.3277089595794678, + "learning_rate": 1.085669299637539e-06, + "loss": 0.4304, + "step": 2910 + }, + { + "epoch": 2.4231409544950058, + "grad_norm": 0.30966663360595703, + "learning_rate": 1.0826565357243125e-06, + "loss": 0.3984, + "step": 2911 + }, + { + "epoch": 2.4239733629300777, + "grad_norm": 0.28824105858802795, + "learning_rate": 1.0796474502651893e-06, + "loss": 0.3827, + "step": 2912 + }, + { + "epoch": 2.4248057713651496, + "grad_norm": 0.29764944314956665, + "learning_rate": 1.0766420460857507e-06, + "loss": 0.3925, + "step": 2913 + }, + { + "epoch": 2.425638179800222, + "grad_norm": 0.31288978457450867, + "learning_rate": 1.0736403260081279e-06, + "loss": 0.3844, + "step": 2914 + }, + { + "epoch": 2.426470588235294, + "grad_norm": 0.2968754172325134, + "learning_rate": 1.070642292850987e-06, + "loss": 0.3646, + "step": 2915 + }, + { + "epoch": 2.4273029966703663, + "grad_norm": 0.3165760636329651, + "learning_rate": 1.067647949429534e-06, + "loss": 0.4002, + "step": 2916 + }, + { + "epoch": 2.4281354051054382, + "grad_norm": 0.2920931875705719, + "learning_rate": 1.0646572985555071e-06, + "loss": 0.3664, + "step": 2917 + }, + { + "epoch": 2.4289678135405106, + "grad_norm": 0.2827328145503998, + "learning_rate": 1.0616703430371833e-06, + "loss": 0.4066, + "step": 2918 + }, + { + "epoch": 2.4298002219755825, + "grad_norm": 0.29017403721809387, + "learning_rate": 1.0586870856793657e-06, + "loss": 0.3744, + "step": 2919 + }, + { + "epoch": 2.430632630410655, + "grad_norm": 0.3129844665527344, + "learning_rate": 1.0557075292833836e-06, + "loss": 0.4184, + "step": 2920 + }, + { + "epoch": 2.431465038845727, + "grad_norm": 0.3073255121707916, + "learning_rate": 1.052731676647092e-06, + "loss": 0.3929, + "step": 2921 + }, + { + "epoch": 2.432297447280799, + "grad_norm": 0.2809605300426483, + "learning_rate": 1.049759530564871e-06, + "loss": 0.3918, + "step": 2922 + }, + { + "epoch": 2.433129855715871, + "grad_norm": 0.29906994104385376, + "learning_rate": 1.0467910938276182e-06, + "loss": 0.4187, + "step": 2923 + }, + { + "epoch": 2.4339622641509435, + "grad_norm": 0.2949870228767395, + "learning_rate": 1.0438263692227452e-06, + "loss": 0.3776, + "step": 2924 + }, + { + "epoch": 2.4347946725860155, + "grad_norm": 0.33523961901664734, + "learning_rate": 1.0408653595341812e-06, + "loss": 0.4238, + "step": 2925 + }, + { + "epoch": 2.435627081021088, + "grad_norm": 0.3024636507034302, + "learning_rate": 1.0379080675423664e-06, + "loss": 0.4127, + "step": 2926 + }, + { + "epoch": 2.4364594894561598, + "grad_norm": 0.2905980944633484, + "learning_rate": 1.0349544960242496e-06, + "loss": 0.3608, + "step": 2927 + }, + { + "epoch": 2.437291897891232, + "grad_norm": 0.3159032464027405, + "learning_rate": 1.0320046477532864e-06, + "loss": 0.4126, + "step": 2928 + }, + { + "epoch": 2.438124306326304, + "grad_norm": 0.3154861629009247, + "learning_rate": 1.0290585254994356e-06, + "loss": 0.4027, + "step": 2929 + }, + { + "epoch": 2.4389567147613764, + "grad_norm": 0.29416272044181824, + "learning_rate": 1.0261161320291586e-06, + "loss": 0.4003, + "step": 2930 + }, + { + "epoch": 2.4397891231964484, + "grad_norm": 0.30298373103141785, + "learning_rate": 1.0231774701054126e-06, + "loss": 0.3521, + "step": 2931 + }, + { + "epoch": 2.4406215316315203, + "grad_norm": 0.3236266076564789, + "learning_rate": 1.020242542487654e-06, + "loss": 0.4003, + "step": 2932 + }, + { + "epoch": 2.4414539400665927, + "grad_norm": 0.2998720407485962, + "learning_rate": 1.017311351931831e-06, + "loss": 0.3764, + "step": 2933 + }, + { + "epoch": 2.442286348501665, + "grad_norm": 0.36174139380455017, + "learning_rate": 1.0143839011903822e-06, + "loss": 0.3811, + "step": 2934 + }, + { + "epoch": 2.443118756936737, + "grad_norm": 0.306686133146286, + "learning_rate": 1.0114601930122363e-06, + "loss": 0.3726, + "step": 2935 + }, + { + "epoch": 2.443951165371809, + "grad_norm": 0.3181160092353821, + "learning_rate": 1.0085402301428055e-06, + "loss": 0.4274, + "step": 2936 + }, + { + "epoch": 2.4447835738068813, + "grad_norm": 0.3226320743560791, + "learning_rate": 1.005624015323986e-06, + "loss": 0.4226, + "step": 2937 + }, + { + "epoch": 2.445615982241953, + "grad_norm": 0.28478971123695374, + "learning_rate": 1.0027115512941549e-06, + "loss": 0.3913, + "step": 2938 + }, + { + "epoch": 2.4464483906770256, + "grad_norm": 0.2990976572036743, + "learning_rate": 9.998028407881672e-07, + "loss": 0.3943, + "step": 2939 + }, + { + "epoch": 2.4472807991120975, + "grad_norm": 0.326755166053772, + "learning_rate": 9.96897886537353e-07, + "loss": 0.4393, + "step": 2940 + }, + { + "epoch": 2.44811320754717, + "grad_norm": 0.29832005500793457, + "learning_rate": 9.939966912695143e-07, + "loss": 0.3817, + "step": 2941 + }, + { + "epoch": 2.448945615982242, + "grad_norm": 0.3191179037094116, + "learning_rate": 9.910992577089269e-07, + "loss": 0.3915, + "step": 2942 + }, + { + "epoch": 2.449778024417314, + "grad_norm": 0.29789450764656067, + "learning_rate": 9.882055885763264e-07, + "loss": 0.4053, + "step": 2943 + }, + { + "epoch": 2.450610432852386, + "grad_norm": 0.3055591285228729, + "learning_rate": 9.853156865889234e-07, + "loss": 0.4068, + "step": 2944 + }, + { + "epoch": 2.4514428412874585, + "grad_norm": 0.2884024381637573, + "learning_rate": 9.824295544603863e-07, + "loss": 0.3908, + "step": 2945 + }, + { + "epoch": 2.4522752497225304, + "grad_norm": 0.29266828298568726, + "learning_rate": 9.795471949008411e-07, + "loss": 0.3763, + "step": 2946 + }, + { + "epoch": 2.453107658157603, + "grad_norm": 0.28186294436454773, + "learning_rate": 9.766686106168744e-07, + "loss": 0.3785, + "step": 2947 + }, + { + "epoch": 2.4539400665926747, + "grad_norm": 0.2927301228046417, + "learning_rate": 9.73793804311529e-07, + "loss": 0.4081, + "step": 2948 + }, + { + "epoch": 2.454772475027747, + "grad_norm": 0.30842170119285583, + "learning_rate": 9.70922778684299e-07, + "loss": 0.4011, + "step": 2949 + }, + { + "epoch": 2.455604883462819, + "grad_norm": 0.30753591656684875, + "learning_rate": 9.680555364311251e-07, + "loss": 0.391, + "step": 2950 + }, + { + "epoch": 2.4564372918978914, + "grad_norm": 0.3024488091468811, + "learning_rate": 9.651920802443971e-07, + "loss": 0.3796, + "step": 2951 + }, + { + "epoch": 2.4572697003329633, + "grad_norm": 0.29086968302726746, + "learning_rate": 9.623324128129557e-07, + "loss": 0.3864, + "step": 2952 + }, + { + "epoch": 2.4581021087680357, + "grad_norm": 0.28945299983024597, + "learning_rate": 9.594765368220737e-07, + "loss": 0.3656, + "step": 2953 + }, + { + "epoch": 2.4589345172031076, + "grad_norm": 0.30900830030441284, + "learning_rate": 9.56624454953471e-07, + "loss": 0.4122, + "step": 2954 + }, + { + "epoch": 2.4597669256381796, + "grad_norm": 0.3098362982273102, + "learning_rate": 9.537761698853016e-07, + "loss": 0.3975, + "step": 2955 + }, + { + "epoch": 2.460599334073252, + "grad_norm": 0.2963133454322815, + "learning_rate": 9.509316842921551e-07, + "loss": 0.3775, + "step": 2956 + }, + { + "epoch": 2.4614317425083243, + "grad_norm": 0.28471919894218445, + "learning_rate": 9.480910008450534e-07, + "loss": 0.3706, + "step": 2957 + }, + { + "epoch": 2.4622641509433962, + "grad_norm": 0.32631343603134155, + "learning_rate": 9.452541222114481e-07, + "loss": 0.4478, + "step": 2958 + }, + { + "epoch": 2.463096559378468, + "grad_norm": 0.2862439751625061, + "learning_rate": 9.424210510552179e-07, + "loss": 0.35, + "step": 2959 + }, + { + "epoch": 2.4639289678135405, + "grad_norm": 0.2939037084579468, + "learning_rate": 9.395917900366663e-07, + "loss": 0.4093, + "step": 2960 + }, + { + "epoch": 2.4647613762486125, + "grad_norm": 0.27758315205574036, + "learning_rate": 9.36766341812519e-07, + "loss": 0.3851, + "step": 2961 + }, + { + "epoch": 2.465593784683685, + "grad_norm": 0.3188072443008423, + "learning_rate": 9.33944709035921e-07, + "loss": 0.4501, + "step": 2962 + }, + { + "epoch": 2.4664261931187568, + "grad_norm": 0.29086801409721375, + "learning_rate": 9.31126894356435e-07, + "loss": 0.386, + "step": 2963 + }, + { + "epoch": 2.467258601553829, + "grad_norm": 0.30861911177635193, + "learning_rate": 9.283129004200381e-07, + "loss": 0.4013, + "step": 2964 + }, + { + "epoch": 2.468091009988901, + "grad_norm": 0.31104356050491333, + "learning_rate": 9.255027298691205e-07, + "loss": 0.3772, + "step": 2965 + }, + { + "epoch": 2.4689234184239734, + "grad_norm": 0.3085804879665375, + "learning_rate": 9.226963853424815e-07, + "loss": 0.3887, + "step": 2966 + }, + { + "epoch": 2.4697558268590454, + "grad_norm": 0.3194611072540283, + "learning_rate": 9.198938694753268e-07, + "loss": 0.4214, + "step": 2967 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.30297356843948364, + "learning_rate": 9.170951848992693e-07, + "loss": 0.4159, + "step": 2968 + }, + { + "epoch": 2.4714206437291897, + "grad_norm": 0.3000932037830353, + "learning_rate": 9.143003342423212e-07, + "loss": 0.4052, + "step": 2969 + }, + { + "epoch": 2.472253052164262, + "grad_norm": 0.300144225358963, + "learning_rate": 9.115093201288977e-07, + "loss": 0.4005, + "step": 2970 + }, + { + "epoch": 2.473085460599334, + "grad_norm": 0.304669588804245, + "learning_rate": 9.0872214517981e-07, + "loss": 0.3844, + "step": 2971 + }, + { + "epoch": 2.4739178690344064, + "grad_norm": 0.2950827181339264, + "learning_rate": 9.059388120122626e-07, + "loss": 0.3914, + "step": 2972 + }, + { + "epoch": 2.4747502774694783, + "grad_norm": 0.2657037079334259, + "learning_rate": 9.031593232398539e-07, + "loss": 0.3532, + "step": 2973 + }, + { + "epoch": 2.4755826859045507, + "grad_norm": 0.3045920133590698, + "learning_rate": 9.003836814725742e-07, + "loss": 0.4007, + "step": 2974 + }, + { + "epoch": 2.4764150943396226, + "grad_norm": 0.295762300491333, + "learning_rate": 8.976118893168006e-07, + "loss": 0.3683, + "step": 2975 + }, + { + "epoch": 2.477247502774695, + "grad_norm": 0.29629120230674744, + "learning_rate": 8.94843949375292e-07, + "loss": 0.4194, + "step": 2976 + }, + { + "epoch": 2.478079911209767, + "grad_norm": 0.28778138756752014, + "learning_rate": 8.920798642471918e-07, + "loss": 0.3705, + "step": 2977 + }, + { + "epoch": 2.478912319644839, + "grad_norm": 0.32166433334350586, + "learning_rate": 8.893196365280282e-07, + "loss": 0.4143, + "step": 2978 + }, + { + "epoch": 2.479744728079911, + "grad_norm": 0.301952987909317, + "learning_rate": 8.865632688097004e-07, + "loss": 0.381, + "step": 2979 + }, + { + "epoch": 2.4805771365149836, + "grad_norm": 0.2990775406360626, + "learning_rate": 8.83810763680486e-07, + "loss": 0.3671, + "step": 2980 + }, + { + "epoch": 2.4814095449500555, + "grad_norm": 0.31099602580070496, + "learning_rate": 8.810621237250355e-07, + "loss": 0.4344, + "step": 2981 + }, + { + "epoch": 2.4822419533851274, + "grad_norm": 0.2816608250141144, + "learning_rate": 8.783173515243725e-07, + "loss": 0.3796, + "step": 2982 + }, + { + "epoch": 2.4830743618202, + "grad_norm": 0.27356284856796265, + "learning_rate": 8.755764496558838e-07, + "loss": 0.3578, + "step": 2983 + }, + { + "epoch": 2.4839067702552717, + "grad_norm": 0.32336634397506714, + "learning_rate": 8.728394206933239e-07, + "loss": 0.4262, + "step": 2984 + }, + { + "epoch": 2.484739178690344, + "grad_norm": 0.3039683699607849, + "learning_rate": 8.701062672068122e-07, + "loss": 0.3967, + "step": 2985 + }, + { + "epoch": 2.485571587125416, + "grad_norm": 0.2810039520263672, + "learning_rate": 8.673769917628272e-07, + "loss": 0.3698, + "step": 2986 + }, + { + "epoch": 2.4864039955604884, + "grad_norm": 0.2974560260772705, + "learning_rate": 8.646515969242065e-07, + "loss": 0.3925, + "step": 2987 + }, + { + "epoch": 2.4872364039955603, + "grad_norm": 0.3146367371082306, + "learning_rate": 8.619300852501427e-07, + "loss": 0.425, + "step": 2988 + }, + { + "epoch": 2.4880688124306327, + "grad_norm": 0.29236578941345215, + "learning_rate": 8.592124592961843e-07, + "loss": 0.3563, + "step": 2989 + }, + { + "epoch": 2.4889012208657046, + "grad_norm": 0.29718172550201416, + "learning_rate": 8.56498721614229e-07, + "loss": 0.3998, + "step": 2990 + }, + { + "epoch": 2.489733629300777, + "grad_norm": 0.313275009393692, + "learning_rate": 8.537888747525236e-07, + "loss": 0.4364, + "step": 2991 + }, + { + "epoch": 2.490566037735849, + "grad_norm": 0.30545952916145325, + "learning_rate": 8.51082921255662e-07, + "loss": 0.4099, + "step": 2992 + }, + { + "epoch": 2.4913984461709213, + "grad_norm": 0.33663320541381836, + "learning_rate": 8.483808636645824e-07, + "loss": 0.4288, + "step": 2993 + }, + { + "epoch": 2.4922308546059933, + "grad_norm": 0.3012736141681671, + "learning_rate": 8.456827045165638e-07, + "loss": 0.3524, + "step": 2994 + }, + { + "epoch": 2.4930632630410656, + "grad_norm": 0.31417447328567505, + "learning_rate": 8.429884463452248e-07, + "loss": 0.3816, + "step": 2995 + }, + { + "epoch": 2.4938956714761376, + "grad_norm": 0.31972286105155945, + "learning_rate": 8.402980916805215e-07, + "loss": 0.4074, + "step": 2996 + }, + { + "epoch": 2.49472807991121, + "grad_norm": 0.33134639263153076, + "learning_rate": 8.376116430487441e-07, + "loss": 0.4231, + "step": 2997 + }, + { + "epoch": 2.495560488346282, + "grad_norm": 0.28740981221199036, + "learning_rate": 8.349291029725126e-07, + "loss": 0.369, + "step": 2998 + }, + { + "epoch": 2.4963928967813542, + "grad_norm": 0.29424378275871277, + "learning_rate": 8.322504739707821e-07, + "loss": 0.3857, + "step": 2999 + }, + { + "epoch": 2.497225305216426, + "grad_norm": 0.275423526763916, + "learning_rate": 8.295757585588304e-07, + "loss": 0.3734, + "step": 3000 + }, + { + "epoch": 2.498057713651498, + "grad_norm": 0.3008045256137848, + "learning_rate": 8.269049592482648e-07, + "loss": 0.3864, + "step": 3001 + }, + { + "epoch": 2.4988901220865705, + "grad_norm": 0.2958872318267822, + "learning_rate": 8.242380785470088e-07, + "loss": 0.3839, + "step": 3002 + }, + { + "epoch": 2.499722530521643, + "grad_norm": 0.31030040979385376, + "learning_rate": 8.215751189593107e-07, + "loss": 0.4039, + "step": 3003 + }, + { + "epoch": 2.5005549389567148, + "grad_norm": 0.3011883497238159, + "learning_rate": 8.189160829857396e-07, + "loss": 0.3923, + "step": 3004 + }, + { + "epoch": 2.5013873473917867, + "grad_norm": 0.314978688955307, + "learning_rate": 8.16260973123173e-07, + "loss": 0.3885, + "step": 3005 + }, + { + "epoch": 2.502219755826859, + "grad_norm": 0.31006646156311035, + "learning_rate": 8.136097918648073e-07, + "loss": 0.3841, + "step": 3006 + }, + { + "epoch": 2.5030521642619314, + "grad_norm": 0.3044472336769104, + "learning_rate": 8.109625417001465e-07, + "loss": 0.4087, + "step": 3007 + }, + { + "epoch": 2.5038845726970034, + "grad_norm": 0.3212951421737671, + "learning_rate": 8.08319225115009e-07, + "loss": 0.3969, + "step": 3008 + }, + { + "epoch": 2.5047169811320753, + "grad_norm": 0.3018619120121002, + "learning_rate": 8.056798445915115e-07, + "loss": 0.3801, + "step": 3009 + }, + { + "epoch": 2.5055493895671477, + "grad_norm": 0.3130761384963989, + "learning_rate": 8.030444026080791e-07, + "loss": 0.3594, + "step": 3010 + }, + { + "epoch": 2.5063817980022196, + "grad_norm": 0.30970242619514465, + "learning_rate": 8.004129016394374e-07, + "loss": 0.4117, + "step": 3011 + }, + { + "epoch": 2.507214206437292, + "grad_norm": 0.2875807583332062, + "learning_rate": 7.977853441566152e-07, + "loss": 0.3824, + "step": 3012 + }, + { + "epoch": 2.508046614872364, + "grad_norm": 0.3063216805458069, + "learning_rate": 7.951617326269318e-07, + "loss": 0.3899, + "step": 3013 + }, + { + "epoch": 2.5088790233074363, + "grad_norm": 0.2871801257133484, + "learning_rate": 7.925420695140052e-07, + "loss": 0.3954, + "step": 3014 + }, + { + "epoch": 2.509711431742508, + "grad_norm": 0.30276262760162354, + "learning_rate": 7.899263572777454e-07, + "loss": 0.4196, + "step": 3015 + }, + { + "epoch": 2.5105438401775806, + "grad_norm": 0.3048497140407562, + "learning_rate": 7.873145983743513e-07, + "loss": 0.3925, + "step": 3016 + }, + { + "epoch": 2.5113762486126525, + "grad_norm": 0.3085763454437256, + "learning_rate": 7.847067952563103e-07, + "loss": 0.3928, + "step": 3017 + }, + { + "epoch": 2.512208657047725, + "grad_norm": 0.3082400858402252, + "learning_rate": 7.821029503723959e-07, + "loss": 0.3913, + "step": 3018 + }, + { + "epoch": 2.513041065482797, + "grad_norm": 0.2877948582172394, + "learning_rate": 7.795030661676633e-07, + "loss": 0.3754, + "step": 3019 + }, + { + "epoch": 2.513873473917869, + "grad_norm": 0.29405730962753296, + "learning_rate": 7.769071450834498e-07, + "loss": 0.3884, + "step": 3020 + }, + { + "epoch": 2.514705882352941, + "grad_norm": 0.31952765583992004, + "learning_rate": 7.743151895573703e-07, + "loss": 0.4085, + "step": 3021 + }, + { + "epoch": 2.5155382907880135, + "grad_norm": 0.3090550899505615, + "learning_rate": 7.717272020233169e-07, + "loss": 0.4127, + "step": 3022 + }, + { + "epoch": 2.5163706992230854, + "grad_norm": 0.27301451563835144, + "learning_rate": 7.691431849114561e-07, + "loss": 0.3753, + "step": 3023 + }, + { + "epoch": 2.5172031076581574, + "grad_norm": 0.31073275208473206, + "learning_rate": 7.665631406482216e-07, + "loss": 0.4172, + "step": 3024 + }, + { + "epoch": 2.5180355160932297, + "grad_norm": 0.32717081904411316, + "learning_rate": 7.639870716563236e-07, + "loss": 0.4001, + "step": 3025 + }, + { + "epoch": 2.518867924528302, + "grad_norm": 0.27241167426109314, + "learning_rate": 7.614149803547354e-07, + "loss": 0.3609, + "step": 3026 + }, + { + "epoch": 2.519700332963374, + "grad_norm": 0.31278035044670105, + "learning_rate": 7.588468691586964e-07, + "loss": 0.3819, + "step": 3027 + }, + { + "epoch": 2.520532741398446, + "grad_norm": 0.3114195764064789, + "learning_rate": 7.562827404797046e-07, + "loss": 0.3831, + "step": 3028 + }, + { + "epoch": 2.5213651498335183, + "grad_norm": 0.29425179958343506, + "learning_rate": 7.537225967255252e-07, + "loss": 0.3664, + "step": 3029 + }, + { + "epoch": 2.5221975582685907, + "grad_norm": 0.2959001958370209, + "learning_rate": 7.511664403001778e-07, + "loss": 0.3783, + "step": 3030 + }, + { + "epoch": 2.5230299667036626, + "grad_norm": 0.3024890720844269, + "learning_rate": 7.486142736039364e-07, + "loss": 0.3908, + "step": 3031 + }, + { + "epoch": 2.5238623751387346, + "grad_norm": 0.3172726333141327, + "learning_rate": 7.460660990333307e-07, + "loss": 0.422, + "step": 3032 + }, + { + "epoch": 2.524694783573807, + "grad_norm": 0.26689743995666504, + "learning_rate": 7.435219189811404e-07, + "loss": 0.381, + "step": 3033 + }, + { + "epoch": 2.525527192008879, + "grad_norm": 0.29678985476493835, + "learning_rate": 7.409817358363986e-07, + "loss": 0.3954, + "step": 3034 + }, + { + "epoch": 2.5263596004439512, + "grad_norm": 0.2859072983264923, + "learning_rate": 7.38445551984378e-07, + "loss": 0.3582, + "step": 3035 + }, + { + "epoch": 2.527192008879023, + "grad_norm": 0.29791417717933655, + "learning_rate": 7.359133698066012e-07, + "loss": 0.4127, + "step": 3036 + }, + { + "epoch": 2.5280244173140956, + "grad_norm": 0.27158403396606445, + "learning_rate": 7.333851916808298e-07, + "loss": 0.3757, + "step": 3037 + }, + { + "epoch": 2.5288568257491675, + "grad_norm": 0.2889906167984009, + "learning_rate": 7.308610199810717e-07, + "loss": 0.367, + "step": 3038 + }, + { + "epoch": 2.52968923418424, + "grad_norm": 0.31212955713272095, + "learning_rate": 7.28340857077564e-07, + "loss": 0.4116, + "step": 3039 + }, + { + "epoch": 2.530521642619312, + "grad_norm": 0.30906566977500916, + "learning_rate": 7.258247053367856e-07, + "loss": 0.4001, + "step": 3040 + }, + { + "epoch": 2.531354051054384, + "grad_norm": 0.27870288491249084, + "learning_rate": 7.233125671214469e-07, + "loss": 0.3846, + "step": 3041 + }, + { + "epoch": 2.532186459489456, + "grad_norm": 0.2938634157180786, + "learning_rate": 7.208044447904893e-07, + "loss": 0.4034, + "step": 3042 + }, + { + "epoch": 2.5330188679245285, + "grad_norm": 0.2890358865261078, + "learning_rate": 7.183003406990841e-07, + "loss": 0.3691, + "step": 3043 + }, + { + "epoch": 2.5338512763596004, + "grad_norm": 0.28360605239868164, + "learning_rate": 7.158002571986283e-07, + "loss": 0.3825, + "step": 3044 + }, + { + "epoch": 2.5346836847946728, + "grad_norm": 0.277585506439209, + "learning_rate": 7.133041966367443e-07, + "loss": 0.3718, + "step": 3045 + }, + { + "epoch": 2.5355160932297447, + "grad_norm": 0.2901941239833832, + "learning_rate": 7.108121613572771e-07, + "loss": 0.3666, + "step": 3046 + }, + { + "epoch": 2.5363485016648166, + "grad_norm": 0.2988186776638031, + "learning_rate": 7.083241537002905e-07, + "loss": 0.3939, + "step": 3047 + }, + { + "epoch": 2.537180910099889, + "grad_norm": 0.31158897280693054, + "learning_rate": 7.058401760020689e-07, + "loss": 0.4082, + "step": 3048 + }, + { + "epoch": 2.5380133185349614, + "grad_norm": 0.29564598202705383, + "learning_rate": 7.033602305951104e-07, + "loss": 0.3702, + "step": 3049 + }, + { + "epoch": 2.5388457269700333, + "grad_norm": 0.29291555285453796, + "learning_rate": 7.008843198081239e-07, + "loss": 0.4032, + "step": 3050 + }, + { + "epoch": 2.5396781354051052, + "grad_norm": 0.29521968960762024, + "learning_rate": 6.984124459660374e-07, + "loss": 0.3727, + "step": 3051 + }, + { + "epoch": 2.5405105438401776, + "grad_norm": 0.29539427161216736, + "learning_rate": 6.95944611389982e-07, + "loss": 0.4056, + "step": 3052 + }, + { + "epoch": 2.54134295227525, + "grad_norm": 0.3013669550418854, + "learning_rate": 6.934808183972986e-07, + "loss": 0.377, + "step": 3053 + }, + { + "epoch": 2.542175360710322, + "grad_norm": 0.3182709217071533, + "learning_rate": 6.910210693015285e-07, + "loss": 0.4025, + "step": 3054 + }, + { + "epoch": 2.543007769145394, + "grad_norm": 0.3235478401184082, + "learning_rate": 6.885653664124226e-07, + "loss": 0.4023, + "step": 3055 + }, + { + "epoch": 2.543840177580466, + "grad_norm": 0.28089457750320435, + "learning_rate": 6.861137120359296e-07, + "loss": 0.3754, + "step": 3056 + }, + { + "epoch": 2.544672586015538, + "grad_norm": 0.3240266740322113, + "learning_rate": 6.836661084741924e-07, + "loss": 0.4258, + "step": 3057 + }, + { + "epoch": 2.5455049944506105, + "grad_norm": 0.30531975626945496, + "learning_rate": 6.812225580255549e-07, + "loss": 0.4008, + "step": 3058 + }, + { + "epoch": 2.5463374028856824, + "grad_norm": 0.2742519676685333, + "learning_rate": 6.787830629845549e-07, + "loss": 0.3597, + "step": 3059 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 0.3020766079425812, + "learning_rate": 6.763476256419215e-07, + "loss": 0.3942, + "step": 3060 + }, + { + "epoch": 2.5480022197558267, + "grad_norm": 0.314591646194458, + "learning_rate": 6.739162482845707e-07, + "loss": 0.3769, + "step": 3061 + }, + { + "epoch": 2.548834628190899, + "grad_norm": 0.308002233505249, + "learning_rate": 6.714889331956087e-07, + "loss": 0.3942, + "step": 3062 + }, + { + "epoch": 2.549667036625971, + "grad_norm": 0.300611287355423, + "learning_rate": 6.690656826543285e-07, + "loss": 0.4121, + "step": 3063 + }, + { + "epoch": 2.5504994450610434, + "grad_norm": 0.27304670214653015, + "learning_rate": 6.666464989362054e-07, + "loss": 0.3678, + "step": 3064 + }, + { + "epoch": 2.5513318534961154, + "grad_norm": 0.2949070632457733, + "learning_rate": 6.642313843128922e-07, + "loss": 0.4077, + "step": 3065 + }, + { + "epoch": 2.5521642619311877, + "grad_norm": 0.2929401695728302, + "learning_rate": 6.618203410522262e-07, + "loss": 0.376, + "step": 3066 + }, + { + "epoch": 2.5529966703662597, + "grad_norm": 0.30417951941490173, + "learning_rate": 6.594133714182178e-07, + "loss": 0.4095, + "step": 3067 + }, + { + "epoch": 2.553829078801332, + "grad_norm": 0.32251882553100586, + "learning_rate": 6.570104776710551e-07, + "loss": 0.4429, + "step": 3068 + }, + { + "epoch": 2.554661487236404, + "grad_norm": 0.2788321077823639, + "learning_rate": 6.546116620670961e-07, + "loss": 0.3727, + "step": 3069 + }, + { + "epoch": 2.555493895671476, + "grad_norm": 0.3155818581581116, + "learning_rate": 6.522169268588713e-07, + "loss": 0.3847, + "step": 3070 + }, + { + "epoch": 2.5563263041065483, + "grad_norm": 0.3051152527332306, + "learning_rate": 6.49826274295079e-07, + "loss": 0.3391, + "step": 3071 + }, + { + "epoch": 2.5571587125416206, + "grad_norm": 0.2933105230331421, + "learning_rate": 6.474397066205834e-07, + "loss": 0.3977, + "step": 3072 + }, + { + "epoch": 2.5579911209766926, + "grad_norm": 0.2970825135707855, + "learning_rate": 6.450572260764137e-07, + "loss": 0.4147, + "step": 3073 + }, + { + "epoch": 2.5588235294117645, + "grad_norm": 0.31705281138420105, + "learning_rate": 6.42678834899761e-07, + "loss": 0.4048, + "step": 3074 + }, + { + "epoch": 2.559655937846837, + "grad_norm": 0.28795450925827026, + "learning_rate": 6.403045353239757e-07, + "loss": 0.4011, + "step": 3075 + }, + { + "epoch": 2.5604883462819092, + "grad_norm": 0.3047225773334503, + "learning_rate": 6.379343295785673e-07, + "loss": 0.3888, + "step": 3076 + }, + { + "epoch": 2.561320754716981, + "grad_norm": 0.29493415355682373, + "learning_rate": 6.355682198892005e-07, + "loss": 0.3831, + "step": 3077 + }, + { + "epoch": 2.562153163152053, + "grad_norm": 0.29540005326271057, + "learning_rate": 6.33206208477693e-07, + "loss": 0.4203, + "step": 3078 + }, + { + "epoch": 2.5629855715871255, + "grad_norm": 0.266559898853302, + "learning_rate": 6.308482975620161e-07, + "loss": 0.3354, + "step": 3079 + }, + { + "epoch": 2.5638179800221974, + "grad_norm": 0.29652097821235657, + "learning_rate": 6.284944893562872e-07, + "loss": 0.4215, + "step": 3080 + }, + { + "epoch": 2.56465038845727, + "grad_norm": 0.3205231726169586, + "learning_rate": 6.261447860707753e-07, + "loss": 0.4226, + "step": 3081 + }, + { + "epoch": 2.5654827968923417, + "grad_norm": 0.3019580841064453, + "learning_rate": 6.23799189911894e-07, + "loss": 0.3837, + "step": 3082 + }, + { + "epoch": 2.566315205327414, + "grad_norm": 0.26572704315185547, + "learning_rate": 6.214577030821967e-07, + "loss": 0.3495, + "step": 3083 + }, + { + "epoch": 2.567147613762486, + "grad_norm": 0.2902132272720337, + "learning_rate": 6.191203277803798e-07, + "loss": 0.4022, + "step": 3084 + }, + { + "epoch": 2.5679800221975584, + "grad_norm": 0.2867758572101593, + "learning_rate": 6.167870662012831e-07, + "loss": 0.3833, + "step": 3085 + }, + { + "epoch": 2.5688124306326303, + "grad_norm": 0.27622854709625244, + "learning_rate": 6.144579205358786e-07, + "loss": 0.3983, + "step": 3086 + }, + { + "epoch": 2.5696448390677027, + "grad_norm": 0.2973408102989197, + "learning_rate": 6.121328929712739e-07, + "loss": 0.3968, + "step": 3087 + }, + { + "epoch": 2.5704772475027746, + "grad_norm": 0.3048017919063568, + "learning_rate": 6.098119856907103e-07, + "loss": 0.431, + "step": 3088 + }, + { + "epoch": 2.571309655937847, + "grad_norm": 0.29575997591018677, + "learning_rate": 6.074952008735624e-07, + "loss": 0.3685, + "step": 3089 + }, + { + "epoch": 2.572142064372919, + "grad_norm": 0.3209153413772583, + "learning_rate": 6.051825406953316e-07, + "loss": 0.4429, + "step": 3090 + }, + { + "epoch": 2.5729744728079913, + "grad_norm": 0.30033037066459656, + "learning_rate": 6.02874007327644e-07, + "loss": 0.364, + "step": 3091 + }, + { + "epoch": 2.5738068812430632, + "grad_norm": 0.2988393306732178, + "learning_rate": 6.005696029382535e-07, + "loss": 0.3802, + "step": 3092 + }, + { + "epoch": 2.574639289678135, + "grad_norm": 0.3061862289905548, + "learning_rate": 5.982693296910386e-07, + "loss": 0.3916, + "step": 3093 + }, + { + "epoch": 2.5754716981132075, + "grad_norm": 0.31209561228752136, + "learning_rate": 5.959731897459936e-07, + "loss": 0.4099, + "step": 3094 + }, + { + "epoch": 2.57630410654828, + "grad_norm": 0.3136289417743683, + "learning_rate": 5.93681185259235e-07, + "loss": 0.3519, + "step": 3095 + }, + { + "epoch": 2.577136514983352, + "grad_norm": 0.29192236065864563, + "learning_rate": 5.91393318382995e-07, + "loss": 0.3679, + "step": 3096 + }, + { + "epoch": 2.5779689234184238, + "grad_norm": 0.3013075590133667, + "learning_rate": 5.891095912656208e-07, + "loss": 0.471, + "step": 3097 + }, + { + "epoch": 2.578801331853496, + "grad_norm": 0.2735782265663147, + "learning_rate": 5.86830006051572e-07, + "loss": 0.3486, + "step": 3098 + }, + { + "epoch": 2.5796337402885685, + "grad_norm": 0.28365886211395264, + "learning_rate": 5.845545648814188e-07, + "loss": 0.3943, + "step": 3099 + }, + { + "epoch": 2.5804661487236404, + "grad_norm": 0.262623131275177, + "learning_rate": 5.822832698918413e-07, + "loss": 0.3549, + "step": 3100 + }, + { + "epoch": 2.5812985571587124, + "grad_norm": 0.3032148778438568, + "learning_rate": 5.800161232156238e-07, + "loss": 0.3379, + "step": 3101 + }, + { + "epoch": 2.5821309655937847, + "grad_norm": 0.3096363842487335, + "learning_rate": 5.777531269816577e-07, + "loss": 0.4437, + "step": 3102 + }, + { + "epoch": 2.5829633740288567, + "grad_norm": 0.2988356649875641, + "learning_rate": 5.754942833149363e-07, + "loss": 0.3767, + "step": 3103 + }, + { + "epoch": 2.583795782463929, + "grad_norm": 0.29221031069755554, + "learning_rate": 5.732395943365526e-07, + "loss": 0.3716, + "step": 3104 + }, + { + "epoch": 2.584628190899001, + "grad_norm": 0.2928762137889862, + "learning_rate": 5.709890621636993e-07, + "loss": 0.3776, + "step": 3105 + }, + { + "epoch": 2.5854605993340734, + "grad_norm": 0.29212868213653564, + "learning_rate": 5.687426889096659e-07, + "loss": 0.3987, + "step": 3106 + }, + { + "epoch": 2.5862930077691453, + "grad_norm": 0.34992143511772156, + "learning_rate": 5.665004766838356e-07, + "loss": 0.4072, + "step": 3107 + }, + { + "epoch": 2.5871254162042177, + "grad_norm": 0.3027849495410919, + "learning_rate": 5.642624275916852e-07, + "loss": 0.3612, + "step": 3108 + }, + { + "epoch": 2.5879578246392896, + "grad_norm": 0.29607823491096497, + "learning_rate": 5.620285437347834e-07, + "loss": 0.4058, + "step": 3109 + }, + { + "epoch": 2.588790233074362, + "grad_norm": 0.28273841738700867, + "learning_rate": 5.597988272107824e-07, + "loss": 0.3771, + "step": 3110 + }, + { + "epoch": 2.589622641509434, + "grad_norm": 0.293728232383728, + "learning_rate": 5.575732801134287e-07, + "loss": 0.3977, + "step": 3111 + }, + { + "epoch": 2.5904550499445063, + "grad_norm": 0.28861936926841736, + "learning_rate": 5.553519045325501e-07, + "loss": 0.3839, + "step": 3112 + }, + { + "epoch": 2.591287458379578, + "grad_norm": 0.2736756503582001, + "learning_rate": 5.531347025540546e-07, + "loss": 0.374, + "step": 3113 + }, + { + "epoch": 2.5921198668146506, + "grad_norm": 0.2778788208961487, + "learning_rate": 5.509216762599339e-07, + "loss": 0.3654, + "step": 3114 + }, + { + "epoch": 2.5929522752497225, + "grad_norm": 0.29448696970939636, + "learning_rate": 5.487128277282605e-07, + "loss": 0.3952, + "step": 3115 + }, + { + "epoch": 2.5937846836847944, + "grad_norm": 0.31573858857154846, + "learning_rate": 5.465081590331817e-07, + "loss": 0.4203, + "step": 3116 + }, + { + "epoch": 2.594617092119867, + "grad_norm": 0.2849540114402771, + "learning_rate": 5.443076722449186e-07, + "loss": 0.3638, + "step": 3117 + }, + { + "epoch": 2.595449500554939, + "grad_norm": 0.2975523769855499, + "learning_rate": 5.421113694297664e-07, + "loss": 0.3668, + "step": 3118 + }, + { + "epoch": 2.596281908990011, + "grad_norm": 0.3016306459903717, + "learning_rate": 5.399192526500946e-07, + "loss": 0.4132, + "step": 3119 + }, + { + "epoch": 2.597114317425083, + "grad_norm": 0.2861490845680237, + "learning_rate": 5.377313239643367e-07, + "loss": 0.3821, + "step": 3120 + }, + { + "epoch": 2.5979467258601554, + "grad_norm": 0.2761737108230591, + "learning_rate": 5.355475854269964e-07, + "loss": 0.3899, + "step": 3121 + }, + { + "epoch": 2.5987791342952278, + "grad_norm": 0.2985495328903198, + "learning_rate": 5.333680390886426e-07, + "loss": 0.4112, + "step": 3122 + }, + { + "epoch": 2.5996115427302997, + "grad_norm": 0.2759629786014557, + "learning_rate": 5.311926869959094e-07, + "loss": 0.3573, + "step": 3123 + }, + { + "epoch": 2.6004439511653716, + "grad_norm": 0.343628466129303, + "learning_rate": 5.290215311914881e-07, + "loss": 0.4135, + "step": 3124 + }, + { + "epoch": 2.601276359600444, + "grad_norm": 0.2960171103477478, + "learning_rate": 5.268545737141323e-07, + "loss": 0.3888, + "step": 3125 + }, + { + "epoch": 2.602108768035516, + "grad_norm": 0.277442991733551, + "learning_rate": 5.246918165986537e-07, + "loss": 0.3336, + "step": 3126 + }, + { + "epoch": 2.6029411764705883, + "grad_norm": 0.31210342049598694, + "learning_rate": 5.225332618759193e-07, + "loss": 0.4181, + "step": 3127 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 0.27689164876937866, + "learning_rate": 5.203789115728486e-07, + "loss": 0.3818, + "step": 3128 + }, + { + "epoch": 2.6046059933407326, + "grad_norm": 0.28218722343444824, + "learning_rate": 5.182287677124159e-07, + "loss": 0.3801, + "step": 3129 + }, + { + "epoch": 2.6054384017758045, + "grad_norm": 0.29998162388801575, + "learning_rate": 5.160828323136424e-07, + "loss": 0.4012, + "step": 3130 + }, + { + "epoch": 2.606270810210877, + "grad_norm": 0.2883782684803009, + "learning_rate": 5.139411073916001e-07, + "loss": 0.3856, + "step": 3131 + }, + { + "epoch": 2.607103218645949, + "grad_norm": 0.3021693825721741, + "learning_rate": 5.118035949574057e-07, + "loss": 0.4013, + "step": 3132 + }, + { + "epoch": 2.6079356270810212, + "grad_norm": 0.26575303077697754, + "learning_rate": 5.096702970182204e-07, + "loss": 0.3247, + "step": 3133 + }, + { + "epoch": 2.608768035516093, + "grad_norm": 0.3065766394138336, + "learning_rate": 5.075412155772492e-07, + "loss": 0.382, + "step": 3134 + }, + { + "epoch": 2.6096004439511655, + "grad_norm": 0.30673807859420776, + "learning_rate": 5.054163526337364e-07, + "loss": 0.4343, + "step": 3135 + }, + { + "epoch": 2.6104328523862375, + "grad_norm": 0.3258250951766968, + "learning_rate": 5.032957101829661e-07, + "loss": 0.3875, + "step": 3136 + }, + { + "epoch": 2.61126526082131, + "grad_norm": 0.2834281623363495, + "learning_rate": 5.011792902162572e-07, + "loss": 0.379, + "step": 3137 + }, + { + "epoch": 2.6120976692563818, + "grad_norm": 0.3125614523887634, + "learning_rate": 4.990670947209675e-07, + "loss": 0.3715, + "step": 3138 + }, + { + "epoch": 2.6129300776914537, + "grad_norm": 0.29795554280281067, + "learning_rate": 4.969591256804824e-07, + "loss": 0.402, + "step": 3139 + }, + { + "epoch": 2.613762486126526, + "grad_norm": 0.30222252011299133, + "learning_rate": 4.948553850742238e-07, + "loss": 0.392, + "step": 3140 + }, + { + "epoch": 2.6145948945615984, + "grad_norm": 0.30855488777160645, + "learning_rate": 4.927558748776412e-07, + "loss": 0.4307, + "step": 3141 + }, + { + "epoch": 2.6154273029966704, + "grad_norm": 0.27991849184036255, + "learning_rate": 4.906605970622114e-07, + "loss": 0.3593, + "step": 3142 + }, + { + "epoch": 2.6162597114317423, + "grad_norm": 0.269772469997406, + "learning_rate": 4.885695535954361e-07, + "loss": 0.3599, + "step": 3143 + }, + { + "epoch": 2.6170921198668147, + "grad_norm": 0.2982839345932007, + "learning_rate": 4.8648274644084e-07, + "loss": 0.4084, + "step": 3144 + }, + { + "epoch": 2.617924528301887, + "grad_norm": 0.30452045798301697, + "learning_rate": 4.844001775579766e-07, + "loss": 0.4204, + "step": 3145 + }, + { + "epoch": 2.618756936736959, + "grad_norm": 0.28936830163002014, + "learning_rate": 4.8232184890241e-07, + "loss": 0.377, + "step": 3146 + }, + { + "epoch": 2.619589345172031, + "grad_norm": 0.29566147923469543, + "learning_rate": 4.802477624257285e-07, + "loss": 0.399, + "step": 3147 + }, + { + "epoch": 2.6204217536071033, + "grad_norm": 0.29598045349121094, + "learning_rate": 4.781779200755354e-07, + "loss": 0.3818, + "step": 3148 + }, + { + "epoch": 2.621254162042175, + "grad_norm": 0.2791507840156555, + "learning_rate": 4.7611232379545124e-07, + "loss": 0.3618, + "step": 3149 + }, + { + "epoch": 2.6220865704772476, + "grad_norm": 0.28765299916267395, + "learning_rate": 4.740509755251038e-07, + "loss": 0.3683, + "step": 3150 + }, + { + "epoch": 2.6229189789123195, + "grad_norm": 0.2902831435203552, + "learning_rate": 4.71993877200137e-07, + "loss": 0.3989, + "step": 3151 + }, + { + "epoch": 2.623751387347392, + "grad_norm": 0.29683712124824524, + "learning_rate": 4.6994103075220175e-07, + "loss": 0.4098, + "step": 3152 + }, + { + "epoch": 2.624583795782464, + "grad_norm": 0.28899383544921875, + "learning_rate": 4.678924381089567e-07, + "loss": 0.3437, + "step": 3153 + }, + { + "epoch": 2.625416204217536, + "grad_norm": 0.29837122559547424, + "learning_rate": 4.658481011940663e-07, + "loss": 0.4139, + "step": 3154 + }, + { + "epoch": 2.626248612652608, + "grad_norm": 0.31046080589294434, + "learning_rate": 4.63808021927199e-07, + "loss": 0.3861, + "step": 3155 + }, + { + "epoch": 2.6270810210876805, + "grad_norm": 0.3166535198688507, + "learning_rate": 4.617722022240245e-07, + "loss": 0.4264, + "step": 3156 + }, + { + "epoch": 2.6279134295227524, + "grad_norm": 0.3086240589618683, + "learning_rate": 4.597406439962138e-07, + "loss": 0.4188, + "step": 3157 + }, + { + "epoch": 2.628745837957825, + "grad_norm": 0.29972097277641296, + "learning_rate": 4.5771334915143516e-07, + "loss": 0.3938, + "step": 3158 + }, + { + "epoch": 2.6295782463928967, + "grad_norm": 0.3148014545440674, + "learning_rate": 4.5569031959335374e-07, + "loss": 0.402, + "step": 3159 + }, + { + "epoch": 2.630410654827969, + "grad_norm": 0.31971946358680725, + "learning_rate": 4.536715572216299e-07, + "loss": 0.3992, + "step": 3160 + }, + { + "epoch": 2.631243063263041, + "grad_norm": 0.3092584013938904, + "learning_rate": 4.5165706393191676e-07, + "loss": 0.4067, + "step": 3161 + }, + { + "epoch": 2.632075471698113, + "grad_norm": 0.306518018245697, + "learning_rate": 4.496468416158595e-07, + "loss": 0.3765, + "step": 3162 + }, + { + "epoch": 2.6329078801331853, + "grad_norm": 0.306640625, + "learning_rate": 4.4764089216109144e-07, + "loss": 0.4048, + "step": 3163 + }, + { + "epoch": 2.6337402885682577, + "grad_norm": 0.2979956269264221, + "learning_rate": 4.456392174512347e-07, + "loss": 0.3945, + "step": 3164 + }, + { + "epoch": 2.6345726970033296, + "grad_norm": 0.29669734835624695, + "learning_rate": 4.4364181936589536e-07, + "loss": 0.417, + "step": 3165 + }, + { + "epoch": 2.6354051054384016, + "grad_norm": 0.28595003485679626, + "learning_rate": 4.4164869978066684e-07, + "loss": 0.3818, + "step": 3166 + }, + { + "epoch": 2.636237513873474, + "grad_norm": 0.2954268455505371, + "learning_rate": 4.3965986056712316e-07, + "loss": 0.3785, + "step": 3167 + }, + { + "epoch": 2.6370699223085463, + "grad_norm": 0.3013925850391388, + "learning_rate": 4.376753035928194e-07, + "loss": 0.4055, + "step": 3168 + }, + { + "epoch": 2.6379023307436182, + "grad_norm": 0.2962302565574646, + "learning_rate": 4.3569503072128703e-07, + "loss": 0.4024, + "step": 3169 + }, + { + "epoch": 2.63873473917869, + "grad_norm": 0.2931113839149475, + "learning_rate": 4.3371904381203976e-07, + "loss": 0.3846, + "step": 3170 + }, + { + "epoch": 2.6395671476137625, + "grad_norm": 0.290157675743103, + "learning_rate": 4.3174734472056334e-07, + "loss": 0.3722, + "step": 3171 + }, + { + "epoch": 2.6403995560488345, + "grad_norm": 0.3102055490016937, + "learning_rate": 4.2977993529831675e-07, + "loss": 0.3862, + "step": 3172 + }, + { + "epoch": 2.641231964483907, + "grad_norm": 0.30981799960136414, + "learning_rate": 4.278168173927322e-07, + "loss": 0.3891, + "step": 3173 + }, + { + "epoch": 2.6420643729189788, + "grad_norm": 0.28204596042633057, + "learning_rate": 4.258579928472106e-07, + "loss": 0.36, + "step": 3174 + }, + { + "epoch": 2.642896781354051, + "grad_norm": 0.30792534351348877, + "learning_rate": 4.2390346350112634e-07, + "loss": 0.4047, + "step": 3175 + }, + { + "epoch": 2.643729189789123, + "grad_norm": 0.2940234839916229, + "learning_rate": 4.219532311898128e-07, + "loss": 0.4073, + "step": 3176 + }, + { + "epoch": 2.6445615982241955, + "grad_norm": 0.27562248706817627, + "learning_rate": 4.200072977445735e-07, + "loss": 0.4024, + "step": 3177 + }, + { + "epoch": 2.6453940066592674, + "grad_norm": 0.2981874942779541, + "learning_rate": 4.180656649926745e-07, + "loss": 0.413, + "step": 3178 + }, + { + "epoch": 2.6462264150943398, + "grad_norm": 0.31285783648490906, + "learning_rate": 4.161283347573425e-07, + "loss": 0.4159, + "step": 3179 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.2978651523590088, + "learning_rate": 4.141953088577644e-07, + "loss": 0.3846, + "step": 3180 + }, + { + "epoch": 2.647891231964484, + "grad_norm": 0.28265270590782166, + "learning_rate": 4.12266589109086e-07, + "loss": 0.3768, + "step": 3181 + }, + { + "epoch": 2.648723640399556, + "grad_norm": 0.2694186270236969, + "learning_rate": 4.103421773224081e-07, + "loss": 0.3781, + "step": 3182 + }, + { + "epoch": 2.6495560488346284, + "grad_norm": 0.2963635325431824, + "learning_rate": 4.0842207530478793e-07, + "loss": 0.4111, + "step": 3183 + }, + { + "epoch": 2.6503884572697003, + "grad_norm": 0.29516974091529846, + "learning_rate": 4.0650628485923385e-07, + "loss": 0.3699, + "step": 3184 + }, + { + "epoch": 2.6512208657047722, + "grad_norm": 0.3196733295917511, + "learning_rate": 4.0459480778470786e-07, + "loss": 0.4343, + "step": 3185 + }, + { + "epoch": 2.6520532741398446, + "grad_norm": 0.28815674781799316, + "learning_rate": 4.026876458761192e-07, + "loss": 0.3925, + "step": 3186 + }, + { + "epoch": 2.652885682574917, + "grad_norm": 0.30133241415023804, + "learning_rate": 4.0078480092432705e-07, + "loss": 0.3972, + "step": 3187 + }, + { + "epoch": 2.653718091009989, + "grad_norm": 0.3248240351676941, + "learning_rate": 3.9888627471613595e-07, + "loss": 0.405, + "step": 3188 + }, + { + "epoch": 2.654550499445061, + "grad_norm": 0.3046268820762634, + "learning_rate": 3.969920690342954e-07, + "loss": 0.3948, + "step": 3189 + }, + { + "epoch": 2.655382907880133, + "grad_norm": 0.29888278245925903, + "learning_rate": 3.9510218565749823e-07, + "loss": 0.3884, + "step": 3190 + }, + { + "epoch": 2.6562153163152056, + "grad_norm": 0.2877199351787567, + "learning_rate": 3.9321662636037537e-07, + "loss": 0.3705, + "step": 3191 + }, + { + "epoch": 2.6570477247502775, + "grad_norm": 0.2959778606891632, + "learning_rate": 3.913353929135033e-07, + "loss": 0.3945, + "step": 3192 + }, + { + "epoch": 2.6578801331853494, + "grad_norm": 0.27556687593460083, + "learning_rate": 3.8945848708339173e-07, + "loss": 0.369, + "step": 3193 + }, + { + "epoch": 2.658712541620422, + "grad_norm": 0.27974632382392883, + "learning_rate": 3.8758591063248864e-07, + "loss": 0.3902, + "step": 3194 + }, + { + "epoch": 2.6595449500554937, + "grad_norm": 0.2980170249938965, + "learning_rate": 3.8571766531917466e-07, + "loss": 0.4005, + "step": 3195 + }, + { + "epoch": 2.660377358490566, + "grad_norm": 0.26741084456443787, + "learning_rate": 3.838537528977659e-07, + "loss": 0.3446, + "step": 3196 + }, + { + "epoch": 2.661209766925638, + "grad_norm": 0.2738521993160248, + "learning_rate": 3.8199417511851023e-07, + "loss": 0.3572, + "step": 3197 + }, + { + "epoch": 2.6620421753607104, + "grad_norm": 0.30660688877105713, + "learning_rate": 3.8013893372758125e-07, + "loss": 0.398, + "step": 3198 + }, + { + "epoch": 2.6628745837957823, + "grad_norm": 0.31190434098243713, + "learning_rate": 3.782880304670833e-07, + "loss": 0.3662, + "step": 3199 + }, + { + "epoch": 2.6637069922308547, + "grad_norm": 0.27871236205101013, + "learning_rate": 3.7644146707504826e-07, + "loss": 0.3878, + "step": 3200 + }, + { + "epoch": 2.6645394006659266, + "grad_norm": 0.28115060925483704, + "learning_rate": 3.7459924528543247e-07, + "loss": 0.3932, + "step": 3201 + }, + { + "epoch": 2.665371809100999, + "grad_norm": 0.29334890842437744, + "learning_rate": 3.727613668281116e-07, + "loss": 0.4139, + "step": 3202 + }, + { + "epoch": 2.666204217536071, + "grad_norm": 0.30630093812942505, + "learning_rate": 3.709278334288874e-07, + "loss": 0.4072, + "step": 3203 + }, + { + "epoch": 2.6670366259711433, + "grad_norm": 0.2759256958961487, + "learning_rate": 3.6909864680947815e-07, + "loss": 0.396, + "step": 3204 + }, + { + "epoch": 2.6678690344062153, + "grad_norm": 0.289121150970459, + "learning_rate": 3.672738086875255e-07, + "loss": 0.393, + "step": 3205 + }, + { + "epoch": 2.6687014428412876, + "grad_norm": 0.3110848069190979, + "learning_rate": 3.6545332077658146e-07, + "loss": 0.4241, + "step": 3206 + }, + { + "epoch": 2.6695338512763596, + "grad_norm": 0.2876022160053253, + "learning_rate": 3.63637184786117e-07, + "loss": 0.3783, + "step": 3207 + }, + { + "epoch": 2.6703662597114315, + "grad_norm": 0.28069189190864563, + "learning_rate": 3.618254024215156e-07, + "loss": 0.4027, + "step": 3208 + }, + { + "epoch": 2.671198668146504, + "grad_norm": 0.2795030474662781, + "learning_rate": 3.6001797538407214e-07, + "loss": 0.3889, + "step": 3209 + }, + { + "epoch": 2.6720310765815762, + "grad_norm": 0.28693515062332153, + "learning_rate": 3.582149053709932e-07, + "loss": 0.3846, + "step": 3210 + }, + { + "epoch": 2.672863485016648, + "grad_norm": 0.30839934945106506, + "learning_rate": 3.564161940753924e-07, + "loss": 0.4299, + "step": 3211 + }, + { + "epoch": 2.67369589345172, + "grad_norm": 0.2880555987358093, + "learning_rate": 3.5462184318629134e-07, + "loss": 0.3687, + "step": 3212 + }, + { + "epoch": 2.6745283018867925, + "grad_norm": 0.3026648759841919, + "learning_rate": 3.528318543886172e-07, + "loss": 0.3653, + "step": 3213 + }, + { + "epoch": 2.675360710321865, + "grad_norm": 0.2983366847038269, + "learning_rate": 3.510462293632e-07, + "loss": 0.3878, + "step": 3214 + }, + { + "epoch": 2.6761931187569368, + "grad_norm": 0.2791326344013214, + "learning_rate": 3.4926496978677393e-07, + "loss": 0.3921, + "step": 3215 + }, + { + "epoch": 2.6770255271920087, + "grad_norm": 0.29134318232536316, + "learning_rate": 3.4748807733197223e-07, + "loss": 0.3876, + "step": 3216 + }, + { + "epoch": 2.677857935627081, + "grad_norm": 0.3067108392715454, + "learning_rate": 3.457155536673279e-07, + "loss": 0.3933, + "step": 3217 + }, + { + "epoch": 2.678690344062153, + "grad_norm": 0.31518760323524475, + "learning_rate": 3.439474004572724e-07, + "loss": 0.4012, + "step": 3218 + }, + { + "epoch": 2.6795227524972254, + "grad_norm": 0.27954304218292236, + "learning_rate": 3.4218361936213195e-07, + "loss": 0.3536, + "step": 3219 + }, + { + "epoch": 2.6803551609322973, + "grad_norm": 0.30983665585517883, + "learning_rate": 3.4042421203812904e-07, + "loss": 0.4255, + "step": 3220 + }, + { + "epoch": 2.6811875693673697, + "grad_norm": 0.28379014134407043, + "learning_rate": 3.386691801373754e-07, + "loss": 0.3885, + "step": 3221 + }, + { + "epoch": 2.6820199778024416, + "grad_norm": 0.2883853316307068, + "learning_rate": 3.369185253078794e-07, + "loss": 0.3971, + "step": 3222 + }, + { + "epoch": 2.682852386237514, + "grad_norm": 0.28160157799720764, + "learning_rate": 3.3517224919353555e-07, + "loss": 0.3796, + "step": 3223 + }, + { + "epoch": 2.683684794672586, + "grad_norm": 0.2900620102882385, + "learning_rate": 3.334303534341277e-07, + "loss": 0.4238, + "step": 3224 + }, + { + "epoch": 2.6845172031076583, + "grad_norm": 0.2966312766075134, + "learning_rate": 3.3169283966532517e-07, + "loss": 0.3779, + "step": 3225 + }, + { + "epoch": 2.68534961154273, + "grad_norm": 0.27945834398269653, + "learning_rate": 3.2995970951868574e-07, + "loss": 0.3731, + "step": 3226 + }, + { + "epoch": 2.6861820199778026, + "grad_norm": 0.31320464611053467, + "learning_rate": 3.2823096462164915e-07, + "loss": 0.3988, + "step": 3227 + }, + { + "epoch": 2.6870144284128745, + "grad_norm": 0.28876248002052307, + "learning_rate": 3.265066065975353e-07, + "loss": 0.3668, + "step": 3228 + }, + { + "epoch": 2.687846836847947, + "grad_norm": 0.2871847152709961, + "learning_rate": 3.2478663706554724e-07, + "loss": 0.4239, + "step": 3229 + }, + { + "epoch": 2.688679245283019, + "grad_norm": 0.31527018547058105, + "learning_rate": 3.2307105764076694e-07, + "loss": 0.3833, + "step": 3230 + }, + { + "epoch": 2.6895116537180908, + "grad_norm": 0.31052833795547485, + "learning_rate": 3.213598699341547e-07, + "loss": 0.3889, + "step": 3231 + }, + { + "epoch": 2.690344062153163, + "grad_norm": 0.28673428297042847, + "learning_rate": 3.1965307555254343e-07, + "loss": 0.4006, + "step": 3232 + }, + { + "epoch": 2.6911764705882355, + "grad_norm": 0.28148752450942993, + "learning_rate": 3.1795067609864395e-07, + "loss": 0.3643, + "step": 3233 + }, + { + "epoch": 2.6920088790233074, + "grad_norm": 0.28988078236579895, + "learning_rate": 3.162526731710386e-07, + "loss": 0.4032, + "step": 3234 + }, + { + "epoch": 2.6928412874583794, + "grad_norm": 0.2785470485687256, + "learning_rate": 3.14559068364183e-07, + "loss": 0.3915, + "step": 3235 + }, + { + "epoch": 2.6936736958934517, + "grad_norm": 0.27616074681282043, + "learning_rate": 3.1286986326840076e-07, + "loss": 0.3812, + "step": 3236 + }, + { + "epoch": 2.694506104328524, + "grad_norm": 0.28551867604255676, + "learning_rate": 3.1118505946988506e-07, + "loss": 0.3644, + "step": 3237 + }, + { + "epoch": 2.695338512763596, + "grad_norm": 0.31371888518333435, + "learning_rate": 3.095046585506967e-07, + "loss": 0.4134, + "step": 3238 + }, + { + "epoch": 2.696170921198668, + "grad_norm": 0.299771785736084, + "learning_rate": 3.0782866208876163e-07, + "loss": 0.3827, + "step": 3239 + }, + { + "epoch": 2.6970033296337403, + "grad_norm": 0.29559215903282166, + "learning_rate": 3.0615707165786937e-07, + "loss": 0.385, + "step": 3240 + }, + { + "epoch": 2.6978357380688123, + "grad_norm": 0.27681657671928406, + "learning_rate": 3.044898888276726e-07, + "loss": 0.3581, + "step": 3241 + }, + { + "epoch": 2.6986681465038846, + "grad_norm": 0.29248616099357605, + "learning_rate": 3.0282711516368524e-07, + "loss": 0.3735, + "step": 3242 + }, + { + "epoch": 2.6995005549389566, + "grad_norm": 0.3053610026836395, + "learning_rate": 3.011687522272816e-07, + "loss": 0.4158, + "step": 3243 + }, + { + "epoch": 2.700332963374029, + "grad_norm": 0.31310683488845825, + "learning_rate": 2.995148015756927e-07, + "loss": 0.3995, + "step": 3244 + }, + { + "epoch": 2.701165371809101, + "grad_norm": 0.30295878648757935, + "learning_rate": 2.978652647620073e-07, + "loss": 0.4164, + "step": 3245 + }, + { + "epoch": 2.7019977802441733, + "grad_norm": 0.2850441336631775, + "learning_rate": 2.962201433351697e-07, + "loss": 0.3935, + "step": 3246 + }, + { + "epoch": 2.702830188679245, + "grad_norm": 0.2884097397327423, + "learning_rate": 2.9457943883997696e-07, + "loss": 0.3741, + "step": 3247 + }, + { + "epoch": 2.7036625971143176, + "grad_norm": 0.2926424741744995, + "learning_rate": 2.929431528170801e-07, + "loss": 0.3759, + "step": 3248 + }, + { + "epoch": 2.7044950055493895, + "grad_norm": 0.29279258847236633, + "learning_rate": 2.91311286802981e-07, + "loss": 0.3754, + "step": 3249 + }, + { + "epoch": 2.705327413984462, + "grad_norm": 0.2717744708061218, + "learning_rate": 2.8968384233002855e-07, + "loss": 0.3732, + "step": 3250 + }, + { + "epoch": 2.706159822419534, + "grad_norm": 0.29794079065322876, + "learning_rate": 2.8806082092642186e-07, + "loss": 0.3902, + "step": 3251 + }, + { + "epoch": 2.706992230854606, + "grad_norm": 0.31436416506767273, + "learning_rate": 2.8644222411620793e-07, + "loss": 0.4225, + "step": 3252 + }, + { + "epoch": 2.707824639289678, + "grad_norm": 0.2860269546508789, + "learning_rate": 2.848280534192777e-07, + "loss": 0.3869, + "step": 3253 + }, + { + "epoch": 2.70865704772475, + "grad_norm": 0.2989305257797241, + "learning_rate": 2.832183103513636e-07, + "loss": 0.3976, + "step": 3254 + }, + { + "epoch": 2.7094894561598224, + "grad_norm": 0.30432286858558655, + "learning_rate": 2.816129964240433e-07, + "loss": 0.4112, + "step": 3255 + }, + { + "epoch": 2.7103218645948948, + "grad_norm": 0.3015022575855255, + "learning_rate": 2.800121131447353e-07, + "loss": 0.404, + "step": 3256 + }, + { + "epoch": 2.7111542730299667, + "grad_norm": 0.2788195013999939, + "learning_rate": 2.784156620166983e-07, + "loss": 0.3442, + "step": 3257 + }, + { + "epoch": 2.7119866814650386, + "grad_norm": 0.32228487730026245, + "learning_rate": 2.7682364453902487e-07, + "loss": 0.4665, + "step": 3258 + }, + { + "epoch": 2.712819089900111, + "grad_norm": 0.2844350039958954, + "learning_rate": 2.7523606220664854e-07, + "loss": 0.3602, + "step": 3259 + }, + { + "epoch": 2.7136514983351834, + "grad_norm": 0.270535409450531, + "learning_rate": 2.736529165103385e-07, + "loss": 0.3578, + "step": 3260 + }, + { + "epoch": 2.7144839067702553, + "grad_norm": 0.31310030817985535, + "learning_rate": 2.7207420893669455e-07, + "loss": 0.3897, + "step": 3261 + }, + { + "epoch": 2.7153163152053272, + "grad_norm": 0.3238847851753235, + "learning_rate": 2.704999409681508e-07, + "loss": 0.4193, + "step": 3262 + }, + { + "epoch": 2.7161487236403996, + "grad_norm": 0.2968219220638275, + "learning_rate": 2.6893011408297196e-07, + "loss": 0.3856, + "step": 3263 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 0.2701689600944519, + "learning_rate": 2.6736472975525564e-07, + "loss": 0.3865, + "step": 3264 + }, + { + "epoch": 2.717813540510544, + "grad_norm": 0.30333271622657776, + "learning_rate": 2.65803789454922e-07, + "loss": 0.4149, + "step": 3265 + }, + { + "epoch": 2.718645948945616, + "grad_norm": 0.2998022735118866, + "learning_rate": 2.6424729464772316e-07, + "loss": 0.3999, + "step": 3266 + }, + { + "epoch": 2.719478357380688, + "grad_norm": 0.2897208034992218, + "learning_rate": 2.626952467952343e-07, + "loss": 0.388, + "step": 3267 + }, + { + "epoch": 2.72031076581576, + "grad_norm": 0.27508842945098877, + "learning_rate": 2.611476473548552e-07, + "loss": 0.3711, + "step": 3268 + }, + { + "epoch": 2.7211431742508325, + "grad_norm": 0.30373603105545044, + "learning_rate": 2.596044977798101e-07, + "loss": 0.4133, + "step": 3269 + }, + { + "epoch": 2.7219755826859044, + "grad_norm": 0.30435818433761597, + "learning_rate": 2.5806579951914214e-07, + "loss": 0.411, + "step": 3270 + }, + { + "epoch": 2.722807991120977, + "grad_norm": 0.28073209524154663, + "learning_rate": 2.5653155401771655e-07, + "loss": 0.3691, + "step": 3271 + }, + { + "epoch": 2.7236403995560488, + "grad_norm": 0.330555260181427, + "learning_rate": 2.550017627162166e-07, + "loss": 0.4357, + "step": 3272 + }, + { + "epoch": 2.724472807991121, + "grad_norm": 0.25710800290107727, + "learning_rate": 2.534764270511431e-07, + "loss": 0.3366, + "step": 3273 + }, + { + "epoch": 2.725305216426193, + "grad_norm": 0.29701754450798035, + "learning_rate": 2.5195554845481306e-07, + "loss": 0.4171, + "step": 3274 + }, + { + "epoch": 2.7261376248612654, + "grad_norm": 0.2765384316444397, + "learning_rate": 2.5043912835535867e-07, + "loss": 0.3471, + "step": 3275 + }, + { + "epoch": 2.7269700332963374, + "grad_norm": 0.29196688532829285, + "learning_rate": 2.4892716817672304e-07, + "loss": 0.4051, + "step": 3276 + }, + { + "epoch": 2.7278024417314093, + "grad_norm": 0.2899174690246582, + "learning_rate": 2.474196693386649e-07, + "loss": 0.3756, + "step": 3277 + }, + { + "epoch": 2.7286348501664817, + "grad_norm": 0.2992348074913025, + "learning_rate": 2.45916633256752e-07, + "loss": 0.4266, + "step": 3278 + }, + { + "epoch": 2.729467258601554, + "grad_norm": 0.29235145449638367, + "learning_rate": 2.4441806134236137e-07, + "loss": 0.4102, + "step": 3279 + }, + { + "epoch": 2.730299667036626, + "grad_norm": 0.26745110750198364, + "learning_rate": 2.4292395500267796e-07, + "loss": 0.3314, + "step": 3280 + }, + { + "epoch": 2.731132075471698, + "grad_norm": 0.3117216229438782, + "learning_rate": 2.4143431564069344e-07, + "loss": 0.4081, + "step": 3281 + }, + { + "epoch": 2.7319644839067703, + "grad_norm": 0.2923487424850464, + "learning_rate": 2.39949144655206e-07, + "loss": 0.3971, + "step": 3282 + }, + { + "epoch": 2.7327968923418426, + "grad_norm": 0.2777581810951233, + "learning_rate": 2.38468443440818e-07, + "loss": 0.3751, + "step": 3283 + }, + { + "epoch": 2.7336293007769146, + "grad_norm": 0.30919012427330017, + "learning_rate": 2.3699221338793155e-07, + "loss": 0.4077, + "step": 3284 + }, + { + "epoch": 2.7344617092119865, + "grad_norm": 0.2907434105873108, + "learning_rate": 2.355204558827534e-07, + "loss": 0.3853, + "step": 3285 + }, + { + "epoch": 2.735294117647059, + "grad_norm": 0.2914058566093445, + "learning_rate": 2.340531723072914e-07, + "loss": 0.3711, + "step": 3286 + }, + { + "epoch": 2.736126526082131, + "grad_norm": 0.2743302583694458, + "learning_rate": 2.3259036403934843e-07, + "loss": 0.4138, + "step": 3287 + }, + { + "epoch": 2.736958934517203, + "grad_norm": 0.30281293392181396, + "learning_rate": 2.3113203245252734e-07, + "loss": 0.3994, + "step": 3288 + }, + { + "epoch": 2.737791342952275, + "grad_norm": 0.303541898727417, + "learning_rate": 2.2967817891622724e-07, + "loss": 0.3588, + "step": 3289 + }, + { + "epoch": 2.7386237513873475, + "grad_norm": 0.31531253457069397, + "learning_rate": 2.2822880479564325e-07, + "loss": 0.3762, + "step": 3290 + }, + { + "epoch": 2.7394561598224194, + "grad_norm": 0.29251205921173096, + "learning_rate": 2.2678391145176115e-07, + "loss": 0.3847, + "step": 3291 + }, + { + "epoch": 2.740288568257492, + "grad_norm": 0.2942062318325043, + "learning_rate": 2.2534350024136232e-07, + "loss": 0.3904, + "step": 3292 + }, + { + "epoch": 2.7411209766925637, + "grad_norm": 0.30196094512939453, + "learning_rate": 2.2390757251701756e-07, + "loss": 0.3586, + "step": 3293 + }, + { + "epoch": 2.741953385127636, + "grad_norm": 0.3154551386833191, + "learning_rate": 2.224761296270883e-07, + "loss": 0.4193, + "step": 3294 + }, + { + "epoch": 2.742785793562708, + "grad_norm": 0.2840886116027832, + "learning_rate": 2.2104917291572435e-07, + "loss": 0.3629, + "step": 3295 + }, + { + "epoch": 2.7436182019977804, + "grad_norm": 0.28531089425086975, + "learning_rate": 2.196267037228633e-07, + "loss": 0.389, + "step": 3296 + }, + { + "epoch": 2.7444506104328523, + "grad_norm": 0.2992643713951111, + "learning_rate": 2.1820872338422838e-07, + "loss": 0.3948, + "step": 3297 + }, + { + "epoch": 2.7452830188679247, + "grad_norm": 0.296223908662796, + "learning_rate": 2.1679523323132835e-07, + "loss": 0.3902, + "step": 3298 + }, + { + "epoch": 2.7461154273029966, + "grad_norm": 0.270108699798584, + "learning_rate": 2.153862345914548e-07, + "loss": 0.3622, + "step": 3299 + }, + { + "epoch": 2.7469478357380686, + "grad_norm": 0.2876308262348175, + "learning_rate": 2.139817287876822e-07, + "loss": 0.3826, + "step": 3300 + }, + { + "epoch": 2.747780244173141, + "grad_norm": 0.28309187293052673, + "learning_rate": 2.125817171388672e-07, + "loss": 0.3575, + "step": 3301 + }, + { + "epoch": 2.7486126526082133, + "grad_norm": 0.31576332449913025, + "learning_rate": 2.111862009596427e-07, + "loss": 0.425, + "step": 3302 + }, + { + "epoch": 2.7494450610432852, + "grad_norm": 0.3037078082561493, + "learning_rate": 2.097951815604249e-07, + "loss": 0.3768, + "step": 3303 + }, + { + "epoch": 2.750277469478357, + "grad_norm": 0.3079027831554413, + "learning_rate": 2.0840866024740502e-07, + "loss": 0.3602, + "step": 3304 + }, + { + "epoch": 2.7511098779134295, + "grad_norm": 0.3146269917488098, + "learning_rate": 2.070266383225511e-07, + "loss": 0.4562, + "step": 3305 + }, + { + "epoch": 2.751942286348502, + "grad_norm": 0.2855137288570404, + "learning_rate": 2.0564911708360447e-07, + "loss": 0.3737, + "step": 3306 + }, + { + "epoch": 2.752774694783574, + "grad_norm": 0.30360037088394165, + "learning_rate": 2.0427609782408265e-07, + "loss": 0.4307, + "step": 3307 + }, + { + "epoch": 2.7536071032186458, + "grad_norm": 0.25891706347465515, + "learning_rate": 2.029075818332754e-07, + "loss": 0.3776, + "step": 3308 + }, + { + "epoch": 2.754439511653718, + "grad_norm": 0.2823725938796997, + "learning_rate": 2.0154357039624317e-07, + "loss": 0.3631, + "step": 3309 + }, + { + "epoch": 2.75527192008879, + "grad_norm": 0.30981922149658203, + "learning_rate": 2.0018406479381525e-07, + "loss": 0.4291, + "step": 3310 + }, + { + "epoch": 2.7561043285238624, + "grad_norm": 0.29691869020462036, + "learning_rate": 1.9882906630259158e-07, + "loss": 0.3768, + "step": 3311 + }, + { + "epoch": 2.7569367369589344, + "grad_norm": 0.28396308422088623, + "learning_rate": 1.9747857619494105e-07, + "loss": 0.3652, + "step": 3312 + }, + { + "epoch": 2.7577691453940067, + "grad_norm": 0.2920913100242615, + "learning_rate": 1.961325957389959e-07, + "loss": 0.3836, + "step": 3313 + }, + { + "epoch": 2.7586015538290787, + "grad_norm": 0.2953694760799408, + "learning_rate": 1.9479112619865513e-07, + "loss": 0.3953, + "step": 3314 + }, + { + "epoch": 2.759433962264151, + "grad_norm": 0.3114762604236603, + "learning_rate": 1.934541688335828e-07, + "loss": 0.4324, + "step": 3315 + }, + { + "epoch": 2.760266370699223, + "grad_norm": 0.28087711334228516, + "learning_rate": 1.9212172489920632e-07, + "loss": 0.3553, + "step": 3316 + }, + { + "epoch": 2.7610987791342954, + "grad_norm": 0.2810649871826172, + "learning_rate": 1.9079379564671207e-07, + "loss": 0.3584, + "step": 3317 + }, + { + "epoch": 2.7619311875693673, + "grad_norm": 0.3019134998321533, + "learning_rate": 1.8947038232304981e-07, + "loss": 0.4394, + "step": 3318 + }, + { + "epoch": 2.7627635960044397, + "grad_norm": 0.2816515266895294, + "learning_rate": 1.8815148617092772e-07, + "loss": 0.3543, + "step": 3319 + }, + { + "epoch": 2.7635960044395116, + "grad_norm": 0.30382615327835083, + "learning_rate": 1.8683710842881174e-07, + "loss": 0.4034, + "step": 3320 + }, + { + "epoch": 2.764428412874584, + "grad_norm": 0.29469168186187744, + "learning_rate": 1.8552725033092635e-07, + "loss": 0.3681, + "step": 3321 + }, + { + "epoch": 2.765260821309656, + "grad_norm": 0.3065103590488434, + "learning_rate": 1.8422191310725147e-07, + "loss": 0.4048, + "step": 3322 + }, + { + "epoch": 2.766093229744728, + "grad_norm": 0.29562950134277344, + "learning_rate": 1.8292109798352054e-07, + "loss": 0.3898, + "step": 3323 + }, + { + "epoch": 2.7669256381798, + "grad_norm": 0.28922104835510254, + "learning_rate": 1.816248061812226e-07, + "loss": 0.3753, + "step": 3324 + }, + { + "epoch": 2.7677580466148726, + "grad_norm": 0.2866446375846863, + "learning_rate": 1.8033303891759835e-07, + "loss": 0.3665, + "step": 3325 + }, + { + "epoch": 2.7685904550499445, + "grad_norm": 0.3014658987522125, + "learning_rate": 1.7904579740563921e-07, + "loss": 0.3785, + "step": 3326 + }, + { + "epoch": 2.7694228634850164, + "grad_norm": 0.28989580273628235, + "learning_rate": 1.7776308285408826e-07, + "loss": 0.4271, + "step": 3327 + }, + { + "epoch": 2.770255271920089, + "grad_norm": 0.27226322889328003, + "learning_rate": 1.7648489646743648e-07, + "loss": 0.3668, + "step": 3328 + }, + { + "epoch": 2.771087680355161, + "grad_norm": 0.286642462015152, + "learning_rate": 1.752112394459232e-07, + "loss": 0.4169, + "step": 3329 + }, + { + "epoch": 2.771920088790233, + "grad_norm": 0.2984205186367035, + "learning_rate": 1.7394211298553508e-07, + "loss": 0.3907, + "step": 3330 + }, + { + "epoch": 2.772752497225305, + "grad_norm": 0.26750481128692627, + "learning_rate": 1.726775182780044e-07, + "loss": 0.3586, + "step": 3331 + }, + { + "epoch": 2.7735849056603774, + "grad_norm": 0.2971529960632324, + "learning_rate": 1.7141745651080565e-07, + "loss": 0.4357, + "step": 3332 + }, + { + "epoch": 2.7744173140954493, + "grad_norm": 0.2746163308620453, + "learning_rate": 1.7016192886716132e-07, + "loss": 0.3554, + "step": 3333 + }, + { + "epoch": 2.7752497225305217, + "grad_norm": 0.2701033651828766, + "learning_rate": 1.689109365260333e-07, + "loss": 0.392, + "step": 3334 + }, + { + "epoch": 2.7760821309655936, + "grad_norm": 0.3108593225479126, + "learning_rate": 1.676644806621247e-07, + "loss": 0.38, + "step": 3335 + }, + { + "epoch": 2.776914539400666, + "grad_norm": 0.28853678703308105, + "learning_rate": 1.664225624458793e-07, + "loss": 0.381, + "step": 3336 + }, + { + "epoch": 2.777746947835738, + "grad_norm": 0.30430543422698975, + "learning_rate": 1.651851830434803e-07, + "loss": 0.4189, + "step": 3337 + }, + { + "epoch": 2.7785793562708103, + "grad_norm": 0.2828444838523865, + "learning_rate": 1.6395234361684943e-07, + "loss": 0.3827, + "step": 3338 + }, + { + "epoch": 2.7794117647058822, + "grad_norm": 0.2710029184818268, + "learning_rate": 1.6272404532364337e-07, + "loss": 0.4074, + "step": 3339 + }, + { + "epoch": 2.7802441731409546, + "grad_norm": 0.2796766757965088, + "learning_rate": 1.615002893172557e-07, + "loss": 0.3915, + "step": 3340 + }, + { + "epoch": 2.7810765815760266, + "grad_norm": 0.31362029910087585, + "learning_rate": 1.6028107674681547e-07, + "loss": 0.395, + "step": 3341 + }, + { + "epoch": 2.781908990011099, + "grad_norm": 0.30837786197662354, + "learning_rate": 1.5906640875718525e-07, + "loss": 0.3694, + "step": 3342 + }, + { + "epoch": 2.782741398446171, + "grad_norm": 0.290594220161438, + "learning_rate": 1.5785628648895767e-07, + "loss": 0.3946, + "step": 3343 + }, + { + "epoch": 2.7835738068812432, + "grad_norm": 0.28468722105026245, + "learning_rate": 1.5665071107845987e-07, + "loss": 0.3844, + "step": 3344 + }, + { + "epoch": 2.784406215316315, + "grad_norm": 0.2934969365596771, + "learning_rate": 1.5544968365774792e-07, + "loss": 0.3824, + "step": 3345 + }, + { + "epoch": 2.785238623751387, + "grad_norm": 0.295932799577713, + "learning_rate": 1.542532053546081e-07, + "loss": 0.4102, + "step": 3346 + }, + { + "epoch": 2.7860710321864595, + "grad_norm": 0.292585551738739, + "learning_rate": 1.5306127729255382e-07, + "loss": 0.3556, + "step": 3347 + }, + { + "epoch": 2.786903440621532, + "grad_norm": 0.30441945791244507, + "learning_rate": 1.5187390059082706e-07, + "loss": 0.3754, + "step": 3348 + }, + { + "epoch": 2.7877358490566038, + "grad_norm": 0.2834872305393219, + "learning_rate": 1.5069107636439484e-07, + "loss": 0.4026, + "step": 3349 + }, + { + "epoch": 2.7885682574916757, + "grad_norm": 0.29688331484794617, + "learning_rate": 1.4951280572394977e-07, + "loss": 0.3684, + "step": 3350 + }, + { + "epoch": 2.789400665926748, + "grad_norm": 0.28374260663986206, + "learning_rate": 1.483390897759096e-07, + "loss": 0.4047, + "step": 3351 + }, + { + "epoch": 2.7902330743618204, + "grad_norm": 0.30708324909210205, + "learning_rate": 1.4716992962241272e-07, + "loss": 0.3917, + "step": 3352 + }, + { + "epoch": 2.7910654827968924, + "grad_norm": 0.2824838161468506, + "learning_rate": 1.4600532636132147e-07, + "loss": 0.3869, + "step": 3353 + }, + { + "epoch": 2.7918978912319643, + "grad_norm": 0.294177383184433, + "learning_rate": 1.4484528108621942e-07, + "loss": 0.4117, + "step": 3354 + }, + { + "epoch": 2.7927302996670367, + "grad_norm": 0.3030126988887787, + "learning_rate": 1.4368979488640855e-07, + "loss": 0.3446, + "step": 3355 + }, + { + "epoch": 2.7935627081021086, + "grad_norm": 0.2987135350704193, + "learning_rate": 1.4253886884691148e-07, + "loss": 0.4387, + "step": 3356 + }, + { + "epoch": 2.794395116537181, + "grad_norm": 0.28978827595710754, + "learning_rate": 1.4139250404846704e-07, + "loss": 0.384, + "step": 3357 + }, + { + "epoch": 2.795227524972253, + "grad_norm": 0.28679752349853516, + "learning_rate": 1.4025070156753196e-07, + "loss": 0.346, + "step": 3358 + }, + { + "epoch": 2.7960599334073253, + "grad_norm": 0.341983437538147, + "learning_rate": 1.391134624762791e-07, + "loss": 0.4132, + "step": 3359 + }, + { + "epoch": 2.796892341842397, + "grad_norm": 0.3002639412879944, + "learning_rate": 1.3798078784259594e-07, + "loss": 0.399, + "step": 3360 + }, + { + "epoch": 2.7977247502774696, + "grad_norm": 0.3027034103870392, + "learning_rate": 1.368526787300839e-07, + "loss": 0.4074, + "step": 3361 + }, + { + "epoch": 2.7985571587125415, + "grad_norm": 0.2931459844112396, + "learning_rate": 1.3572913619805616e-07, + "loss": 0.374, + "step": 3362 + }, + { + "epoch": 2.799389567147614, + "grad_norm": 0.2804553806781769, + "learning_rate": 1.3461016130153993e-07, + "loss": 0.3715, + "step": 3363 + }, + { + "epoch": 2.800221975582686, + "grad_norm": 0.2980172038078308, + "learning_rate": 1.3349575509127244e-07, + "loss": 0.371, + "step": 3364 + }, + { + "epoch": 2.801054384017758, + "grad_norm": 0.28683868050575256, + "learning_rate": 1.3238591861369943e-07, + "loss": 0.3872, + "step": 3365 + }, + { + "epoch": 2.80188679245283, + "grad_norm": 0.28891491889953613, + "learning_rate": 1.3128065291097724e-07, + "loss": 0.3925, + "step": 3366 + }, + { + "epoch": 2.8027192008879025, + "grad_norm": 0.3039287030696869, + "learning_rate": 1.3017995902097013e-07, + "loss": 0.4261, + "step": 3367 + }, + { + "epoch": 2.8035516093229744, + "grad_norm": 0.30227887630462646, + "learning_rate": 1.290838379772491e-07, + "loss": 0.3702, + "step": 3368 + }, + { + "epoch": 2.8043840177580464, + "grad_norm": 0.3222786784172058, + "learning_rate": 1.2799229080909026e-07, + "loss": 0.3801, + "step": 3369 + }, + { + "epoch": 2.8052164261931187, + "grad_norm": 0.2965812683105469, + "learning_rate": 1.2690531854147537e-07, + "loss": 0.3763, + "step": 3370 + }, + { + "epoch": 2.806048834628191, + "grad_norm": 0.2899739146232605, + "learning_rate": 1.2582292219509184e-07, + "loss": 0.3972, + "step": 3371 + }, + { + "epoch": 2.806881243063263, + "grad_norm": 0.3159641921520233, + "learning_rate": 1.2474510278632779e-07, + "loss": 0.4052, + "step": 3372 + }, + { + "epoch": 2.807713651498335, + "grad_norm": 0.30823588371276855, + "learning_rate": 1.2367186132727415e-07, + "loss": 0.3822, + "step": 3373 + }, + { + "epoch": 2.8085460599334073, + "grad_norm": 0.286836177110672, + "learning_rate": 1.2260319882572425e-07, + "loss": 0.3836, + "step": 3374 + }, + { + "epoch": 2.8093784683684797, + "grad_norm": 0.2870038151741028, + "learning_rate": 1.2153911628517036e-07, + "loss": 0.3896, + "step": 3375 + }, + { + "epoch": 2.8102108768035516, + "grad_norm": 0.3150666058063507, + "learning_rate": 1.2047961470480485e-07, + "loss": 0.4282, + "step": 3376 + }, + { + "epoch": 2.8110432852386236, + "grad_norm": 0.28661397099494934, + "learning_rate": 1.1942469507951803e-07, + "loss": 0.3712, + "step": 3377 + }, + { + "epoch": 2.811875693673696, + "grad_norm": 0.29480233788490295, + "learning_rate": 1.1837435839989808e-07, + "loss": 0.416, + "step": 3378 + }, + { + "epoch": 2.812708102108768, + "grad_norm": 0.2871578335762024, + "learning_rate": 1.1732860565222936e-07, + "loss": 0.3905, + "step": 3379 + }, + { + "epoch": 2.8135405105438402, + "grad_norm": 0.27761033177375793, + "learning_rate": 1.162874378184925e-07, + "loss": 0.3864, + "step": 3380 + }, + { + "epoch": 2.814372918978912, + "grad_norm": 0.29687756299972534, + "learning_rate": 1.1525085587636209e-07, + "loss": 0.3494, + "step": 3381 + }, + { + "epoch": 2.8152053274139845, + "grad_norm": 0.2945989668369293, + "learning_rate": 1.1421886079920619e-07, + "loss": 0.3968, + "step": 3382 + }, + { + "epoch": 2.8160377358490565, + "grad_norm": 0.2789284884929657, + "learning_rate": 1.1319145355608684e-07, + "loss": 0.381, + "step": 3383 + }, + { + "epoch": 2.816870144284129, + "grad_norm": 0.3032738268375397, + "learning_rate": 1.1216863511175736e-07, + "loss": 0.3858, + "step": 3384 + }, + { + "epoch": 2.817702552719201, + "grad_norm": 0.30183014273643494, + "learning_rate": 1.111504064266622e-07, + "loss": 0.398, + "step": 3385 + }, + { + "epoch": 2.818534961154273, + "grad_norm": 0.29048189520835876, + "learning_rate": 1.1013676845693544e-07, + "loss": 0.4148, + "step": 3386 + }, + { + "epoch": 2.819367369589345, + "grad_norm": 0.2871493101119995, + "learning_rate": 1.0912772215440182e-07, + "loss": 0.3881, + "step": 3387 + }, + { + "epoch": 2.8201997780244175, + "grad_norm": 0.30180609226226807, + "learning_rate": 1.0812326846657228e-07, + "loss": 0.3678, + "step": 3388 + }, + { + "epoch": 2.8210321864594894, + "grad_norm": 0.2673914134502411, + "learning_rate": 1.0712340833664737e-07, + "loss": 0.3572, + "step": 3389 + }, + { + "epoch": 2.8218645948945618, + "grad_norm": 0.28488829731941223, + "learning_rate": 1.0612814270351324e-07, + "loss": 0.4244, + "step": 3390 + }, + { + "epoch": 2.8226970033296337, + "grad_norm": 0.2882390320301056, + "learning_rate": 1.0513747250174123e-07, + "loss": 0.3718, + "step": 3391 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.30586186051368713, + "learning_rate": 1.0415139866158774e-07, + "loss": 0.3935, + "step": 3392 + }, + { + "epoch": 2.824361820199778, + "grad_norm": 0.296487033367157, + "learning_rate": 1.0316992210899435e-07, + "loss": 0.38, + "step": 3393 + }, + { + "epoch": 2.8251942286348504, + "grad_norm": 0.26386237144470215, + "learning_rate": 1.0219304376558492e-07, + "loss": 0.3801, + "step": 3394 + }, + { + "epoch": 2.8260266370699223, + "grad_norm": 0.26951250433921814, + "learning_rate": 1.0122076454866347e-07, + "loss": 0.3868, + "step": 3395 + }, + { + "epoch": 2.8268590455049942, + "grad_norm": 0.30076080560684204, + "learning_rate": 1.0025308537121859e-07, + "loss": 0.4074, + "step": 3396 + }, + { + "epoch": 2.8276914539400666, + "grad_norm": 0.3036608397960663, + "learning_rate": 9.929000714191838e-08, + "loss": 0.4016, + "step": 3397 + }, + { + "epoch": 2.828523862375139, + "grad_norm": 0.2856789231300354, + "learning_rate": 9.833153076510893e-08, + "loss": 0.3614, + "step": 3398 + }, + { + "epoch": 2.829356270810211, + "grad_norm": 0.3074534237384796, + "learning_rate": 9.737765714081748e-08, + "loss": 0.3782, + "step": 3399 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.2953753173351288, + "learning_rate": 9.642838716474645e-08, + "loss": 0.4025, + "step": 3400 + }, + { + "epoch": 2.831021087680355, + "grad_norm": 0.2885286808013916, + "learning_rate": 9.548372172827946e-08, + "loss": 0.3511, + "step": 3401 + }, + { + "epoch": 2.831853496115427, + "grad_norm": 0.2874261736869812, + "learning_rate": 9.454366171847196e-08, + "loss": 0.3967, + "step": 3402 + }, + { + "epoch": 2.8326859045504995, + "grad_norm": 0.29668954014778137, + "learning_rate": 9.360820801805726e-08, + "loss": 0.4091, + "step": 3403 + }, + { + "epoch": 2.8335183129855714, + "grad_norm": 0.28590044379234314, + "learning_rate": 9.267736150544271e-08, + "loss": 0.3797, + "step": 3404 + }, + { + "epoch": 2.834350721420644, + "grad_norm": 0.31228768825531006, + "learning_rate": 9.175112305470913e-08, + "loss": 0.4138, + "step": 3405 + }, + { + "epoch": 2.8351831298557157, + "grad_norm": 0.29430899024009705, + "learning_rate": 9.082949353561187e-08, + "loss": 0.3866, + "step": 3406 + }, + { + "epoch": 2.836015538290788, + "grad_norm": 0.30940887331962585, + "learning_rate": 8.991247381357593e-08, + "loss": 0.4006, + "step": 3407 + }, + { + "epoch": 2.83684794672586, + "grad_norm": 0.29085415601730347, + "learning_rate": 8.900006474969913e-08, + "loss": 0.3581, + "step": 3408 + }, + { + "epoch": 2.8376803551609324, + "grad_norm": 0.29941922426223755, + "learning_rate": 8.809226720075059e-08, + "loss": 0.4113, + "step": 3409 + }, + { + "epoch": 2.8385127635960044, + "grad_norm": 0.2955285906791687, + "learning_rate": 8.718908201916676e-08, + "loss": 0.3789, + "step": 3410 + }, + { + "epoch": 2.8393451720310767, + "grad_norm": 0.2934470772743225, + "learning_rate": 8.629051005305478e-08, + "loss": 0.3859, + "step": 3411 + }, + { + "epoch": 2.8401775804661487, + "grad_norm": 0.28304851055145264, + "learning_rate": 8.539655214618969e-08, + "loss": 0.3857, + "step": 3412 + }, + { + "epoch": 2.841009988901221, + "grad_norm": 0.2950180172920227, + "learning_rate": 8.450720913801336e-08, + "loss": 0.3564, + "step": 3413 + }, + { + "epoch": 2.841842397336293, + "grad_norm": 0.2903018593788147, + "learning_rate": 8.362248186363441e-08, + "loss": 0.41, + "step": 3414 + }, + { + "epoch": 2.842674805771365, + "grad_norm": 0.3056175708770752, + "learning_rate": 8.274237115382777e-08, + "loss": 0.381, + "step": 3415 + }, + { + "epoch": 2.8435072142064373, + "grad_norm": 0.2846352756023407, + "learning_rate": 8.186687783503289e-08, + "loss": 0.376, + "step": 3416 + }, + { + "epoch": 2.8443396226415096, + "grad_norm": 0.29959842562675476, + "learning_rate": 8.09960027293516e-08, + "loss": 0.395, + "step": 3417 + }, + { + "epoch": 2.8451720310765816, + "grad_norm": 0.30339470505714417, + "learning_rate": 8.012974665455308e-08, + "loss": 0.3928, + "step": 3418 + }, + { + "epoch": 2.8460044395116535, + "grad_norm": 0.3100747764110565, + "learning_rate": 7.926811042406557e-08, + "loss": 0.4121, + "step": 3419 + }, + { + "epoch": 2.846836847946726, + "grad_norm": 0.28325793147087097, + "learning_rate": 7.841109484698184e-08, + "loss": 0.4077, + "step": 3420 + }, + { + "epoch": 2.8476692563817982, + "grad_norm": 0.30448758602142334, + "learning_rate": 7.755870072805316e-08, + "loss": 0.3853, + "step": 3421 + }, + { + "epoch": 2.84850166481687, + "grad_norm": 0.3250058591365814, + "learning_rate": 7.67109288676926e-08, + "loss": 0.4308, + "step": 3422 + }, + { + "epoch": 2.849334073251942, + "grad_norm": 0.27683284878730774, + "learning_rate": 7.586778006197337e-08, + "loss": 0.3575, + "step": 3423 + }, + { + "epoch": 2.8501664816870145, + "grad_norm": 0.30187785625457764, + "learning_rate": 7.50292551026277e-08, + "loss": 0.4145, + "step": 3424 + }, + { + "epoch": 2.8509988901220864, + "grad_norm": 0.2923542261123657, + "learning_rate": 7.419535477704354e-08, + "loss": 0.3981, + "step": 3425 + }, + { + "epoch": 2.851831298557159, + "grad_norm": 0.3036320209503174, + "learning_rate": 7.336607986826839e-08, + "loss": 0.3879, + "step": 3426 + }, + { + "epoch": 2.8526637069922307, + "grad_norm": 0.29370933771133423, + "learning_rate": 7.254143115500711e-08, + "loss": 0.3971, + "step": 3427 + }, + { + "epoch": 2.853496115427303, + "grad_norm": 0.307032972574234, + "learning_rate": 7.17214094116181e-08, + "loss": 0.398, + "step": 3428 + }, + { + "epoch": 2.854328523862375, + "grad_norm": 0.3023212254047394, + "learning_rate": 7.090601540811648e-08, + "loss": 0.3883, + "step": 3429 + }, + { + "epoch": 2.8551609322974474, + "grad_norm": 0.2827087938785553, + "learning_rate": 7.009524991017091e-08, + "loss": 0.3788, + "step": 3430 + }, + { + "epoch": 2.8559933407325193, + "grad_norm": 0.28021904826164246, + "learning_rate": 6.928911367910573e-08, + "loss": 0.3843, + "step": 3431 + }, + { + "epoch": 2.8568257491675917, + "grad_norm": 0.2957780063152313, + "learning_rate": 6.848760747189598e-08, + "loss": 0.3939, + "step": 3432 + }, + { + "epoch": 2.8576581576026636, + "grad_norm": 0.307271808385849, + "learning_rate": 6.769073204117016e-08, + "loss": 0.4193, + "step": 3433 + }, + { + "epoch": 2.858490566037736, + "grad_norm": 0.3014625012874603, + "learning_rate": 6.689848813520805e-08, + "loss": 0.3848, + "step": 3434 + }, + { + "epoch": 2.859322974472808, + "grad_norm": 0.2933945059776306, + "learning_rate": 6.611087649794124e-08, + "loss": 0.3986, + "step": 3435 + }, + { + "epoch": 2.8601553829078803, + "grad_norm": 0.29639217257499695, + "learning_rate": 6.532789786895033e-08, + "loss": 0.3729, + "step": 3436 + }, + { + "epoch": 2.8609877913429522, + "grad_norm": 0.31975582242012024, + "learning_rate": 6.454955298346555e-08, + "loss": 0.4172, + "step": 3437 + }, + { + "epoch": 2.861820199778024, + "grad_norm": 0.2899722754955292, + "learning_rate": 6.377584257236724e-08, + "loss": 0.3822, + "step": 3438 + }, + { + "epoch": 2.8626526082130965, + "grad_norm": 0.27522486448287964, + "learning_rate": 6.300676736218258e-08, + "loss": 0.3855, + "step": 3439 + }, + { + "epoch": 2.863485016648169, + "grad_norm": 0.2691723704338074, + "learning_rate": 6.224232807508667e-08, + "loss": 0.3849, + "step": 3440 + }, + { + "epoch": 2.864317425083241, + "grad_norm": 0.2858082056045532, + "learning_rate": 6.148252542890198e-08, + "loss": 0.4041, + "step": 3441 + }, + { + "epoch": 2.8651498335183128, + "grad_norm": 0.27565300464630127, + "learning_rate": 6.072736013709557e-08, + "loss": 0.3919, + "step": 3442 + }, + { + "epoch": 2.865982241953385, + "grad_norm": 0.2915458679199219, + "learning_rate": 5.997683290878131e-08, + "loss": 0.4133, + "step": 3443 + }, + { + "epoch": 2.8668146503884575, + "grad_norm": 0.27250799536705017, + "learning_rate": 5.923094444871713e-08, + "loss": 0.3583, + "step": 3444 + }, + { + "epoch": 2.8676470588235294, + "grad_norm": 0.28215593099594116, + "learning_rate": 5.848969545730554e-08, + "loss": 0.4154, + "step": 3445 + }, + { + "epoch": 2.8684794672586014, + "grad_norm": 0.2830628752708435, + "learning_rate": 5.775308663059309e-08, + "loss": 0.385, + "step": 3446 + }, + { + "epoch": 2.8693118756936737, + "grad_norm": 0.2738587558269501, + "learning_rate": 5.702111866026705e-08, + "loss": 0.373, + "step": 3447 + }, + { + "epoch": 2.870144284128746, + "grad_norm": 0.28816235065460205, + "learning_rate": 5.629379223365872e-08, + "loss": 0.3963, + "step": 3448 + }, + { + "epoch": 2.870976692563818, + "grad_norm": 0.3096372187137604, + "learning_rate": 5.557110803374066e-08, + "loss": 0.4023, + "step": 3449 + }, + { + "epoch": 2.87180910099889, + "grad_norm": 0.29053497314453125, + "learning_rate": 5.485306673912616e-08, + "loss": 0.401, + "step": 3450 + }, + { + "epoch": 2.8726415094339623, + "grad_norm": 0.29939737915992737, + "learning_rate": 5.413966902406753e-08, + "loss": 0.3722, + "step": 3451 + }, + { + "epoch": 2.8734739178690343, + "grad_norm": 0.285515159368515, + "learning_rate": 5.343091555845781e-08, + "loss": 0.3878, + "step": 3452 + }, + { + "epoch": 2.8743063263041067, + "grad_norm": 0.28648898005485535, + "learning_rate": 5.272680700783073e-08, + "loss": 0.3773, + "step": 3453 + }, + { + "epoch": 2.8751387347391786, + "grad_norm": 0.275084525346756, + "learning_rate": 5.2027344033354077e-08, + "loss": 0.3646, + "step": 3454 + }, + { + "epoch": 2.875971143174251, + "grad_norm": 0.2844296395778656, + "learning_rate": 5.1332527291837465e-08, + "loss": 0.3902, + "step": 3455 + }, + { + "epoch": 2.876803551609323, + "grad_norm": 0.32090917229652405, + "learning_rate": 5.06423574357251e-08, + "loss": 0.3932, + "step": 3456 + }, + { + "epoch": 2.8776359600443953, + "grad_norm": 0.29726335406303406, + "learning_rate": 4.9956835113099676e-08, + "loss": 0.3991, + "step": 3457 + }, + { + "epoch": 2.878468368479467, + "grad_norm": 0.30311334133148193, + "learning_rate": 4.927596096767795e-08, + "loss": 0.385, + "step": 3458 + }, + { + "epoch": 2.8793007769145396, + "grad_norm": 0.3128688633441925, + "learning_rate": 4.8599735638812373e-08, + "loss": 0.4243, + "step": 3459 + }, + { + "epoch": 2.8801331853496115, + "grad_norm": 0.2947447896003723, + "learning_rate": 4.7928159761490566e-08, + "loss": 0.4102, + "step": 3460 + }, + { + "epoch": 2.8809655937846834, + "grad_norm": 0.2862451374530792, + "learning_rate": 4.7261233966334196e-08, + "loss": 0.3962, + "step": 3461 + }, + { + "epoch": 2.881798002219756, + "grad_norm": 0.2894607186317444, + "learning_rate": 4.659895887959787e-08, + "loss": 0.3894, + "step": 3462 + }, + { + "epoch": 2.882630410654828, + "grad_norm": 0.2788543701171875, + "learning_rate": 4.594133512317023e-08, + "loss": 0.3739, + "step": 3463 + }, + { + "epoch": 2.8834628190899, + "grad_norm": 0.289505273103714, + "learning_rate": 4.528836331457065e-08, + "loss": 0.4062, + "step": 3464 + }, + { + "epoch": 2.884295227524972, + "grad_norm": 0.26619216799736023, + "learning_rate": 4.4640044066951994e-08, + "loss": 0.3453, + "step": 3465 + }, + { + "epoch": 2.8851276359600444, + "grad_norm": 0.29299643635749817, + "learning_rate": 4.399637798909673e-08, + "loss": 0.4205, + "step": 3466 + }, + { + "epoch": 2.8859600443951168, + "grad_norm": 0.2827985882759094, + "learning_rate": 4.335736568541915e-08, + "loss": 0.4005, + "step": 3467 + }, + { + "epoch": 2.8867924528301887, + "grad_norm": 0.2879890203475952, + "learning_rate": 4.272300775596205e-08, + "loss": 0.3817, + "step": 3468 + }, + { + "epoch": 2.8876248612652606, + "grad_norm": 0.30056655406951904, + "learning_rate": 4.2093304796399504e-08, + "loss": 0.372, + "step": 3469 + }, + { + "epoch": 2.888457269700333, + "grad_norm": 0.285110205411911, + "learning_rate": 4.146825739803295e-08, + "loss": 0.3608, + "step": 3470 + }, + { + "epoch": 2.8892896781354054, + "grad_norm": 0.29550155997276306, + "learning_rate": 4.084786614779346e-08, + "loss": 0.4042, + "step": 3471 + }, + { + "epoch": 2.8901220865704773, + "grad_norm": 0.3029278516769409, + "learning_rate": 4.023213162823947e-08, + "loss": 0.4232, + "step": 3472 + }, + { + "epoch": 2.8909544950055492, + "grad_norm": 0.2764630913734436, + "learning_rate": 3.962105441755515e-08, + "loss": 0.3862, + "step": 3473 + }, + { + "epoch": 2.8917869034406216, + "grad_norm": 0.2890421748161316, + "learning_rate": 3.9014635089554274e-08, + "loss": 0.3944, + "step": 3474 + }, + { + "epoch": 2.8926193118756935, + "grad_norm": 0.3060329258441925, + "learning_rate": 3.841287421367412e-08, + "loss": 0.3911, + "step": 3475 + }, + { + "epoch": 2.893451720310766, + "grad_norm": 0.29384008049964905, + "learning_rate": 3.781577235497935e-08, + "loss": 0.3783, + "step": 3476 + }, + { + "epoch": 2.894284128745838, + "grad_norm": 0.2772958278656006, + "learning_rate": 3.7223330074158126e-08, + "loss": 0.3298, + "step": 3477 + }, + { + "epoch": 2.89511653718091, + "grad_norm": 0.32114148139953613, + "learning_rate": 3.663554792752544e-08, + "loss": 0.4557, + "step": 3478 + }, + { + "epoch": 2.895948945615982, + "grad_norm": 0.27607429027557373, + "learning_rate": 3.605242646701812e-08, + "loss": 0.3632, + "step": 3479 + }, + { + "epoch": 2.8967813540510545, + "grad_norm": 0.3078801929950714, + "learning_rate": 3.547396624019817e-08, + "loss": 0.4103, + "step": 3480 + }, + { + "epoch": 2.8976137624861265, + "grad_norm": 0.2985548675060272, + "learning_rate": 3.490016779024885e-08, + "loss": 0.3895, + "step": 3481 + }, + { + "epoch": 2.898446170921199, + "grad_norm": 0.3042149245738983, + "learning_rate": 3.4331031655976955e-08, + "loss": 0.3638, + "step": 3482 + }, + { + "epoch": 2.8992785793562708, + "grad_norm": 0.28391167521476746, + "learning_rate": 3.3766558371812754e-08, + "loss": 0.4081, + "step": 3483 + }, + { + "epoch": 2.9001109877913427, + "grad_norm": 0.29962408542633057, + "learning_rate": 3.320674846780503e-08, + "loss": 0.3843, + "step": 3484 + }, + { + "epoch": 2.900943396226415, + "grad_norm": 0.27982330322265625, + "learning_rate": 3.265160246962607e-08, + "loss": 0.3801, + "step": 3485 + }, + { + "epoch": 2.9017758046614874, + "grad_norm": 0.30797380208969116, + "learning_rate": 3.210112089856721e-08, + "loss": 0.4107, + "step": 3486 + }, + { + "epoch": 2.9026082130965594, + "grad_norm": 0.30911773443222046, + "learning_rate": 3.155530427153997e-08, + "loss": 0.4328, + "step": 3487 + }, + { + "epoch": 2.9034406215316313, + "grad_norm": 0.2736690044403076, + "learning_rate": 3.1014153101076026e-08, + "loss": 0.3458, + "step": 3488 + }, + { + "epoch": 2.9042730299667037, + "grad_norm": 0.28448286652565, + "learning_rate": 3.0477667895326133e-08, + "loss": 0.3888, + "step": 3489 + }, + { + "epoch": 2.905105438401776, + "grad_norm": 0.29954880475997925, + "learning_rate": 2.994584915805898e-08, + "loss": 0.3806, + "step": 3490 + }, + { + "epoch": 2.905937846836848, + "grad_norm": 0.27898576855659485, + "learning_rate": 2.9418697388661766e-08, + "loss": 0.3819, + "step": 3491 + }, + { + "epoch": 2.90677025527192, + "grad_norm": 0.305177241563797, + "learning_rate": 2.889621308213908e-08, + "loss": 0.3835, + "step": 3492 + }, + { + "epoch": 2.9076026637069923, + "grad_norm": 0.29762381315231323, + "learning_rate": 2.8378396729113466e-08, + "loss": 0.3718, + "step": 3493 + }, + { + "epoch": 2.9084350721420646, + "grad_norm": 0.29369136691093445, + "learning_rate": 2.7865248815822087e-08, + "loss": 0.3917, + "step": 3494 + }, + { + "epoch": 2.9092674805771366, + "grad_norm": 0.29049572348594666, + "learning_rate": 2.7356769824121166e-08, + "loss": 0.4114, + "step": 3495 + }, + { + "epoch": 2.9100998890122085, + "grad_norm": 0.27511221170425415, + "learning_rate": 2.6852960231480985e-08, + "loss": 0.3677, + "step": 3496 + }, + { + "epoch": 2.910932297447281, + "grad_norm": 0.28010135889053345, + "learning_rate": 2.635382051098756e-08, + "loss": 0.406, + "step": 3497 + }, + { + "epoch": 2.911764705882353, + "grad_norm": 0.28749653697013855, + "learning_rate": 2.585935113134208e-08, + "loss": 0.3838, + "step": 3498 + }, + { + "epoch": 2.912597114317425, + "grad_norm": 0.2730650305747986, + "learning_rate": 2.5369552556859243e-08, + "loss": 0.3455, + "step": 3499 + }, + { + "epoch": 2.913429522752497, + "grad_norm": 0.2961804270744324, + "learning_rate": 2.4884425247468924e-08, + "loss": 0.3869, + "step": 3500 + }, + { + "epoch": 2.9142619311875695, + "grad_norm": 0.29125046730041504, + "learning_rate": 2.44039696587145e-08, + "loss": 0.383, + "step": 3501 + }, + { + "epoch": 2.9150943396226414, + "grad_norm": 0.3115370273590088, + "learning_rate": 2.392818624175175e-08, + "loss": 0.3976, + "step": 3502 + }, + { + "epoch": 2.915926748057714, + "grad_norm": 0.3151339590549469, + "learning_rate": 2.345707544334941e-08, + "loss": 0.3907, + "step": 3503 + }, + { + "epoch": 2.9167591564927857, + "grad_norm": 0.29095959663391113, + "learning_rate": 2.2990637705889717e-08, + "loss": 0.3903, + "step": 3504 + }, + { + "epoch": 2.917591564927858, + "grad_norm": 0.30376264452934265, + "learning_rate": 2.2528873467365098e-08, + "loss": 0.4254, + "step": 3505 + }, + { + "epoch": 2.91842397336293, + "grad_norm": 0.28083696961402893, + "learning_rate": 2.2071783161379812e-08, + "loss": 0.3617, + "step": 3506 + }, + { + "epoch": 2.919256381798002, + "grad_norm": 0.3202035129070282, + "learning_rate": 2.1619367217150522e-08, + "loss": 0.4465, + "step": 3507 + }, + { + "epoch": 2.9200887902330743, + "grad_norm": 0.2700662612915039, + "learning_rate": 2.1171626059503514e-08, + "loss": 0.3814, + "step": 3508 + }, + { + "epoch": 2.9209211986681467, + "grad_norm": 0.2775675058364868, + "learning_rate": 2.0728560108875807e-08, + "loss": 0.3732, + "step": 3509 + }, + { + "epoch": 2.9217536071032186, + "grad_norm": 0.2907252311706543, + "learning_rate": 2.0290169781313483e-08, + "loss": 0.418, + "step": 3510 + }, + { + "epoch": 2.9225860155382906, + "grad_norm": 0.27951502799987793, + "learning_rate": 1.985645548847337e-08, + "loss": 0.3932, + "step": 3511 + }, + { + "epoch": 2.923418423973363, + "grad_norm": 0.2931711673736572, + "learning_rate": 1.9427417637619685e-08, + "loss": 0.3873, + "step": 3512 + }, + { + "epoch": 2.9242508324084353, + "grad_norm": 0.30858153104782104, + "learning_rate": 1.9003056631627935e-08, + "loss": 0.4157, + "step": 3513 + }, + { + "epoch": 2.9250832408435072, + "grad_norm": 0.2937333285808563, + "learning_rate": 1.8583372868979933e-08, + "loss": 0.3687, + "step": 3514 + }, + { + "epoch": 2.925915649278579, + "grad_norm": 0.2885035574436188, + "learning_rate": 1.8168366743765432e-08, + "loss": 0.3595, + "step": 3515 + }, + { + "epoch": 2.9267480577136515, + "grad_norm": 0.2732231914997101, + "learning_rate": 1.775803864568326e-08, + "loss": 0.3982, + "step": 3516 + }, + { + "epoch": 2.927580466148724, + "grad_norm": 0.29774099588394165, + "learning_rate": 1.7352388960038548e-08, + "loss": 0.4037, + "step": 3517 + }, + { + "epoch": 2.928412874583796, + "grad_norm": 0.304595410823822, + "learning_rate": 1.695141806774325e-08, + "loss": 0.3923, + "step": 3518 + }, + { + "epoch": 2.9292452830188678, + "grad_norm": 0.27520307898521423, + "learning_rate": 1.6555126345316197e-08, + "loss": 0.3416, + "step": 3519 + }, + { + "epoch": 2.93007769145394, + "grad_norm": 0.30991899967193604, + "learning_rate": 1.6163514164882486e-08, + "loss": 0.4287, + "step": 3520 + }, + { + "epoch": 2.930910099889012, + "grad_norm": 0.29716676473617554, + "learning_rate": 1.577658189417186e-08, + "loss": 0.3619, + "step": 3521 + }, + { + "epoch": 2.9317425083240845, + "grad_norm": 0.2918042540550232, + "learning_rate": 1.539432989652201e-08, + "loss": 0.3875, + "step": 3522 + }, + { + "epoch": 2.9325749167591564, + "grad_norm": 0.27701038122177124, + "learning_rate": 1.5016758530873033e-08, + "loss": 0.3748, + "step": 3523 + }, + { + "epoch": 2.9334073251942288, + "grad_norm": 0.31016066670417786, + "learning_rate": 1.4643868151771323e-08, + "loss": 0.394, + "step": 3524 + }, + { + "epoch": 2.9342397336293007, + "grad_norm": 0.29906758666038513, + "learning_rate": 1.4275659109367346e-08, + "loss": 0.4065, + "step": 3525 + }, + { + "epoch": 2.935072142064373, + "grad_norm": 0.31481271982192993, + "learning_rate": 1.3912131749416746e-08, + "loss": 0.3975, + "step": 3526 + }, + { + "epoch": 2.935904550499445, + "grad_norm": 0.2872684597969055, + "learning_rate": 1.3553286413277022e-08, + "loss": 0.3697, + "step": 3527 + }, + { + "epoch": 2.9367369589345174, + "grad_norm": 0.29366058111190796, + "learning_rate": 1.3199123437910855e-08, + "loss": 0.3647, + "step": 3528 + }, + { + "epoch": 2.9375693673695893, + "grad_norm": 0.29561111330986023, + "learning_rate": 1.2849643155882773e-08, + "loss": 0.3693, + "step": 3529 + }, + { + "epoch": 2.938401775804661, + "grad_norm": 0.2844444811344147, + "learning_rate": 1.2504845895361384e-08, + "loss": 0.3812, + "step": 3530 + }, + { + "epoch": 2.9392341842397336, + "grad_norm": 0.26833999156951904, + "learning_rate": 1.2164731980117694e-08, + "loss": 0.4002, + "step": 3531 + }, + { + "epoch": 2.940066592674806, + "grad_norm": 0.2657759487628937, + "learning_rate": 1.1829301729524567e-08, + "loss": 0.3429, + "step": 3532 + }, + { + "epoch": 2.940899001109878, + "grad_norm": 0.31585893034935, + "learning_rate": 1.1498555458555604e-08, + "loss": 0.3983, + "step": 3533 + }, + { + "epoch": 2.94173140954495, + "grad_norm": 0.3071114420890808, + "learning_rate": 1.1172493477789037e-08, + "loss": 0.4385, + "step": 3534 + }, + { + "epoch": 2.942563817980022, + "grad_norm": 0.2719644010066986, + "learning_rate": 1.085111609340217e-08, + "loss": 0.3399, + "step": 3535 + }, + { + "epoch": 2.9433962264150946, + "grad_norm": 0.2977891266345978, + "learning_rate": 1.0534423607173604e-08, + "loss": 0.38, + "step": 3536 + }, + { + "epoch": 2.9442286348501665, + "grad_norm": 0.2826799154281616, + "learning_rate": 1.022241631648324e-08, + "loss": 0.3787, + "step": 3537 + }, + { + "epoch": 2.9450610432852384, + "grad_norm": 0.2901372015476227, + "learning_rate": 9.915094514311719e-09, + "loss": 0.4123, + "step": 3538 + }, + { + "epoch": 2.945893451720311, + "grad_norm": 0.2899583578109741, + "learning_rate": 9.612458489239308e-09, + "loss": 0.4093, + "step": 3539 + }, + { + "epoch": 2.946725860155383, + "grad_norm": 0.27518314123153687, + "learning_rate": 9.314508525446464e-09, + "loss": 0.3605, + "step": 3540 + }, + { + "epoch": 2.947558268590455, + "grad_norm": 0.29123416543006897, + "learning_rate": 9.021244902713833e-09, + "loss": 0.3942, + "step": 3541 + }, + { + "epoch": 2.948390677025527, + "grad_norm": 0.2904079258441925, + "learning_rate": 8.732667896421131e-09, + "loss": 0.3999, + "step": 3542 + }, + { + "epoch": 2.9492230854605994, + "grad_norm": 0.29001057147979736, + "learning_rate": 8.448777777546601e-09, + "loss": 0.3751, + "step": 3543 + }, + { + "epoch": 2.9500554938956713, + "grad_norm": 0.2952643632888794, + "learning_rate": 8.169574812668668e-09, + "loss": 0.3758, + "step": 3544 + }, + { + "epoch": 2.9508879023307437, + "grad_norm": 0.3037968873977661, + "learning_rate": 7.895059263963168e-09, + "loss": 0.4062, + "step": 3545 + }, + { + "epoch": 2.9517203107658156, + "grad_norm": 0.29707083106040955, + "learning_rate": 7.625231389205567e-09, + "loss": 0.3736, + "step": 3546 + }, + { + "epoch": 2.952552719200888, + "grad_norm": 0.3003142178058624, + "learning_rate": 7.360091441768746e-09, + "loss": 0.3837, + "step": 3547 + }, + { + "epoch": 2.95338512763596, + "grad_norm": 0.2987581789493561, + "learning_rate": 7.099639670623548e-09, + "loss": 0.3834, + "step": 3548 + }, + { + "epoch": 2.9542175360710323, + "grad_norm": 0.28041496872901917, + "learning_rate": 6.8438763203393375e-09, + "loss": 0.366, + "step": 3549 + }, + { + "epoch": 2.9550499445061043, + "grad_norm": 0.28142842650413513, + "learning_rate": 6.59280163108178e-09, + "loss": 0.4162, + "step": 3550 + }, + { + "epoch": 2.9558823529411766, + "grad_norm": 0.2837742865085602, + "learning_rate": 6.346415838614506e-09, + "loss": 0.3371, + "step": 3551 + }, + { + "epoch": 2.9567147613762486, + "grad_norm": 0.29957345128059387, + "learning_rate": 6.104719174298557e-09, + "loss": 0.4113, + "step": 3552 + }, + { + "epoch": 2.9575471698113205, + "grad_norm": 0.2940880358219147, + "learning_rate": 5.867711865090719e-09, + "loss": 0.3893, + "step": 3553 + }, + { + "epoch": 2.958379578246393, + "grad_norm": 0.29609382152557373, + "learning_rate": 5.635394133545191e-09, + "loss": 0.3688, + "step": 3554 + }, + { + "epoch": 2.9592119866814652, + "grad_norm": 0.29784825444221497, + "learning_rate": 5.40776619781247e-09, + "loss": 0.4102, + "step": 3555 + }, + { + "epoch": 2.960044395116537, + "grad_norm": 0.2848165035247803, + "learning_rate": 5.184828271639353e-09, + "loss": 0.3718, + "step": 3556 + }, + { + "epoch": 2.960876803551609, + "grad_norm": 0.30922722816467285, + "learning_rate": 4.966580564368384e-09, + "loss": 0.3542, + "step": 3557 + }, + { + "epoch": 2.9617092119866815, + "grad_norm": 0.2894100546836853, + "learning_rate": 4.7530232809378515e-09, + "loss": 0.4281, + "step": 3558 + }, + { + "epoch": 2.962541620421754, + "grad_norm": 0.2721484899520874, + "learning_rate": 4.54415662188179e-09, + "loss": 0.392, + "step": 3559 + }, + { + "epoch": 2.9633740288568258, + "grad_norm": 0.30090397596359253, + "learning_rate": 4.339980783329423e-09, + "loss": 0.3977, + "step": 3560 + }, + { + "epoch": 2.9642064372918977, + "grad_norm": 0.29261451959609985, + "learning_rate": 4.140495957006274e-09, + "loss": 0.3847, + "step": 3561 + }, + { + "epoch": 2.96503884572697, + "grad_norm": 0.2850942015647888, + "learning_rate": 3.945702330230839e-09, + "loss": 0.4031, + "step": 3562 + }, + { + "epoch": 2.9658712541620424, + "grad_norm": 0.2756821811199188, + "learning_rate": 3.755600085918465e-09, + "loss": 0.364, + "step": 3563 + }, + { + "epoch": 2.9667036625971144, + "grad_norm": 0.31473076343536377, + "learning_rate": 3.5701894025791383e-09, + "loss": 0.446, + "step": 3564 + }, + { + "epoch": 2.9675360710321863, + "grad_norm": 0.29244500398635864, + "learning_rate": 3.3894704543152578e-09, + "loss": 0.4213, + "step": 3565 + }, + { + "epoch": 2.9683684794672587, + "grad_norm": 0.29834458231925964, + "learning_rate": 3.213443410826078e-09, + "loss": 0.3975, + "step": 3566 + }, + { + "epoch": 2.9692008879023306, + "grad_norm": 0.29561495780944824, + "learning_rate": 3.0421084374038234e-09, + "loss": 0.3765, + "step": 3567 + }, + { + "epoch": 2.970033296337403, + "grad_norm": 0.28357571363449097, + "learning_rate": 2.875465694935353e-09, + "loss": 0.4125, + "step": 3568 + }, + { + "epoch": 2.970865704772475, + "grad_norm": 0.3278956413269043, + "learning_rate": 2.7135153399004967e-09, + "loss": 0.4449, + "step": 3569 + }, + { + "epoch": 2.9716981132075473, + "grad_norm": 0.2880827784538269, + "learning_rate": 2.5562575243737176e-09, + "loss": 0.364, + "step": 3570 + }, + { + "epoch": 2.972530521642619, + "grad_norm": 0.30118292570114136, + "learning_rate": 2.4036923960230053e-09, + "loss": 0.3864, + "step": 3571 + }, + { + "epoch": 2.9733629300776916, + "grad_norm": 0.30025699734687805, + "learning_rate": 2.255820098109873e-09, + "loss": 0.3575, + "step": 3572 + }, + { + "epoch": 2.9741953385127635, + "grad_norm": 0.31271275877952576, + "learning_rate": 2.112640769488805e-09, + "loss": 0.424, + "step": 3573 + }, + { + "epoch": 2.975027746947836, + "grad_norm": 0.28177496790885925, + "learning_rate": 1.974154544607254e-09, + "loss": 0.3734, + "step": 3574 + }, + { + "epoch": 2.975860155382908, + "grad_norm": 0.2948905825614929, + "learning_rate": 1.8403615535067531e-09, + "loss": 0.4235, + "step": 3575 + }, + { + "epoch": 2.9766925638179798, + "grad_norm": 0.28703781962394714, + "learning_rate": 1.71126192182125e-09, + "loss": 0.3976, + "step": 3576 + }, + { + "epoch": 2.977524972253052, + "grad_norm": 0.3083464205265045, + "learning_rate": 1.586855770777107e-09, + "loss": 0.4025, + "step": 3577 + }, + { + "epoch": 2.9783573806881245, + "grad_norm": 0.30387553572654724, + "learning_rate": 1.4671432171947663e-09, + "loss": 0.4421, + "step": 3578 + }, + { + "epoch": 2.9791897891231964, + "grad_norm": 0.2780333161354065, + "learning_rate": 1.3521243734854195e-09, + "loss": 0.3695, + "step": 3579 + }, + { + "epoch": 2.9800221975582684, + "grad_norm": 0.31834548711776733, + "learning_rate": 1.2417993476543377e-09, + "loss": 0.4009, + "step": 3580 + }, + { + "epoch": 2.9808546059933407, + "grad_norm": 0.27020248770713806, + "learning_rate": 1.136168243298097e-09, + "loss": 0.3783, + "step": 3581 + }, + { + "epoch": 2.981687014428413, + "grad_norm": 0.29213130474090576, + "learning_rate": 1.0352311596067976e-09, + "loss": 0.398, + "step": 3582 + }, + { + "epoch": 2.982519422863485, + "grad_norm": 0.307743102312088, + "learning_rate": 9.389881913618448e-10, + "loss": 0.4089, + "step": 3583 + }, + { + "epoch": 2.983351831298557, + "grad_norm": 0.31198549270629883, + "learning_rate": 8.474394289376131e-10, + "loss": 0.4056, + "step": 3584 + }, + { + "epoch": 2.9841842397336293, + "grad_norm": 0.30002060532569885, + "learning_rate": 7.605849582986713e-10, + "loss": 0.3865, + "step": 3585 + }, + { + "epoch": 2.9850166481687017, + "grad_norm": 0.28173932433128357, + "learning_rate": 6.784248610042232e-10, + "loss": 0.3695, + "step": 3586 + }, + { + "epoch": 2.9858490566037736, + "grad_norm": 0.29308822751045227, + "learning_rate": 6.009592142036669e-10, + "loss": 0.3928, + "step": 3587 + }, + { + "epoch": 2.9866814650388456, + "grad_norm": 0.317353218793869, + "learning_rate": 5.281880906382597e-10, + "loss": 0.4158, + "step": 3588 + }, + { + "epoch": 2.987513873473918, + "grad_norm": 0.30219030380249023, + "learning_rate": 4.6011155864111865e-10, + "loss": 0.3933, + "step": 3589 + }, + { + "epoch": 2.98834628190899, + "grad_norm": 0.28345680236816406, + "learning_rate": 3.967296821383304e-10, + "loss": 0.3719, + "step": 3590 + }, + { + "epoch": 2.9891786903440623, + "grad_norm": 0.2939116358757019, + "learning_rate": 3.380425206461757e-10, + "loss": 0.3956, + "step": 3591 + }, + { + "epoch": 2.990011098779134, + "grad_norm": 0.30421409010887146, + "learning_rate": 2.8405012927223975e-10, + "loss": 0.384, + "step": 3592 + }, + { + "epoch": 2.9908435072142066, + "grad_norm": 0.2770386338233948, + "learning_rate": 2.3475255871707737e-10, + "loss": 0.3904, + "step": 3593 + }, + { + "epoch": 2.9916759156492785, + "grad_norm": 0.28260740637779236, + "learning_rate": 1.901498552714376e-10, + "loss": 0.3735, + "step": 3594 + }, + { + "epoch": 2.992508324084351, + "grad_norm": 0.29308584332466125, + "learning_rate": 1.5024206081848401e-10, + "loss": 0.4103, + "step": 3595 + }, + { + "epoch": 2.993340732519423, + "grad_norm": 0.3127232491970062, + "learning_rate": 1.1502921283212953e-10, + "loss": 0.442, + "step": 3596 + }, + { + "epoch": 2.994173140954495, + "grad_norm": 0.3048544228076935, + "learning_rate": 8.451134437814646e-11, + "loss": 0.3834, + "step": 3597 + }, + { + "epoch": 2.995005549389567, + "grad_norm": 0.2796577513217926, + "learning_rate": 5.86884841130564e-11, + "loss": 0.3338, + "step": 3598 + }, + { + "epoch": 2.995837957824639, + "grad_norm": 0.29299935698509216, + "learning_rate": 3.7560656284685305e-11, + "loss": 0.418, + "step": 3599 + }, + { + "epoch": 2.9966703662597114, + "grad_norm": 0.28302428126335144, + "learning_rate": 2.1127880733273764e-11, + "loss": 0.3776, + "step": 3600 + }, + { + "epoch": 2.9975027746947838, + "grad_norm": 0.2750588357448578, + "learning_rate": 9.390172888701366e-12, + "loss": 0.3898, + "step": 3601 + }, + { + "epoch": 2.9983351831298557, + "grad_norm": 0.30292966961860657, + "learning_rate": 2.347543773262295e-12, + "loss": 0.4144, + "step": 3602 + }, + { + "epoch": 2.9991675915649276, + "grad_norm": 0.29896464943885803, + "learning_rate": 0.0, + "loss": 0.3686, + "step": 3603 + }, + { + "epoch": 2.9991675915649276, + "step": 3603, + "total_flos": 4646503280312320.0, + "train_loss": 0.438159415813211, + "train_runtime": 72439.8933, + "train_samples_per_second": 4.776, + "train_steps_per_second": 0.05 + } + ], + "logging_steps": 1.0, + "max_steps": 3603, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4646503280312320.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}