| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.986425339366516, |
| "eval_steps": 500, |
| "global_step": 550, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00904977375565611, |
| "grad_norm": 6.353778491475079, |
| "learning_rate": 1.4545454545454546e-06, |
| "loss": 0.8197, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01809954751131222, |
| "grad_norm": 6.38646932272591, |
| "learning_rate": 2.9090909090909093e-06, |
| "loss": 0.8224, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.027149321266968326, |
| "grad_norm": 6.053910996051301, |
| "learning_rate": 4.363636363636364e-06, |
| "loss": 0.8034, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.03619909502262444, |
| "grad_norm": 4.366009246926635, |
| "learning_rate": 5.8181818181818185e-06, |
| "loss": 0.774, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.04524886877828054, |
| "grad_norm": 2.274884898169855, |
| "learning_rate": 7.272727272727273e-06, |
| "loss": 0.7292, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.05429864253393665, |
| "grad_norm": 1.9029591404903814, |
| "learning_rate": 8.727272727272728e-06, |
| "loss": 0.7138, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.06334841628959276, |
| "grad_norm": 4.535776001491079, |
| "learning_rate": 1.0181818181818182e-05, |
| "loss": 0.7317, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.07239819004524888, |
| "grad_norm": 5.273781078723551, |
| "learning_rate": 1.1636363636363637e-05, |
| "loss": 0.7227, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.08144796380090498, |
| "grad_norm": 6.613983727436227, |
| "learning_rate": 1.3090909090909092e-05, |
| "loss": 0.7225, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.09049773755656108, |
| "grad_norm": 5.322076805028839, |
| "learning_rate": 1.4545454545454546e-05, |
| "loss": 0.7032, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.09954751131221719, |
| "grad_norm": 2.6848510839250563, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.6662, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.1085972850678733, |
| "grad_norm": 2.501883063773779, |
| "learning_rate": 1.7454545454545456e-05, |
| "loss": 0.6361, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.11764705882352941, |
| "grad_norm": 2.4408014518675407, |
| "learning_rate": 1.8909090909090912e-05, |
| "loss": 0.6328, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.12669683257918551, |
| "grad_norm": 1.3581412540415263, |
| "learning_rate": 2.0363636363636365e-05, |
| "loss": 0.6038, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.13574660633484162, |
| "grad_norm": 1.3171074276840706, |
| "learning_rate": 2.1818181818181818e-05, |
| "loss": 0.5949, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.14479638009049775, |
| "grad_norm": 1.1346220985487154, |
| "learning_rate": 2.3272727272727274e-05, |
| "loss": 0.5812, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.15384615384615385, |
| "grad_norm": 0.9805713190517853, |
| "learning_rate": 2.4727272727272727e-05, |
| "loss": 0.5777, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.16289592760180996, |
| "grad_norm": 0.981975629498607, |
| "learning_rate": 2.6181818181818183e-05, |
| "loss": 0.5671, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.17194570135746606, |
| "grad_norm": 0.8633430106624941, |
| "learning_rate": 2.763636363636364e-05, |
| "loss": 0.5617, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.18099547511312217, |
| "grad_norm": 0.9927559005340078, |
| "learning_rate": 2.9090909090909093e-05, |
| "loss": 0.5547, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.19004524886877827, |
| "grad_norm": 0.7908715300138798, |
| "learning_rate": 3.054545454545455e-05, |
| "loss": 0.5392, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.19909502262443438, |
| "grad_norm": 1.077249549339691, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 0.5373, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.2081447963800905, |
| "grad_norm": 1.3602867857746606, |
| "learning_rate": 3.345454545454546e-05, |
| "loss": 0.534, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.2171945701357466, |
| "grad_norm": 0.687608913929421, |
| "learning_rate": 3.490909090909091e-05, |
| "loss": 0.5286, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.22624434389140272, |
| "grad_norm": 1.2223237721495865, |
| "learning_rate": 3.6363636363636364e-05, |
| "loss": 0.5301, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.23529411764705882, |
| "grad_norm": 0.8580084887459775, |
| "learning_rate": 3.7818181818181824e-05, |
| "loss": 0.5214, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.24434389140271492, |
| "grad_norm": 1.1071155887269823, |
| "learning_rate": 3.927272727272728e-05, |
| "loss": 0.5115, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.25339366515837103, |
| "grad_norm": 0.7195369247916135, |
| "learning_rate": 4.072727272727273e-05, |
| "loss": 0.5095, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.26244343891402716, |
| "grad_norm": 1.5961441486786672, |
| "learning_rate": 4.218181818181818e-05, |
| "loss": 0.5087, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.27149321266968324, |
| "grad_norm": 0.827814164011704, |
| "learning_rate": 4.3636363636363636e-05, |
| "loss": 0.5111, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.28054298642533937, |
| "grad_norm": 1.1252915824734908, |
| "learning_rate": 4.509090909090909e-05, |
| "loss": 0.5072, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.2895927601809955, |
| "grad_norm": 1.1219477837842433, |
| "learning_rate": 4.654545454545455e-05, |
| "loss": 0.5046, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.2986425339366516, |
| "grad_norm": 1.505289650732032, |
| "learning_rate": 4.8e-05, |
| "loss": 0.503, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 0.8491008498263081, |
| "learning_rate": 4.9454545454545454e-05, |
| "loss": 0.5011, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.3167420814479638, |
| "grad_norm": 1.4306190204877898, |
| "learning_rate": 5.0909090909090914e-05, |
| "loss": 0.4895, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.3257918552036199, |
| "grad_norm": 1.0851621930114523, |
| "learning_rate": 5.236363636363637e-05, |
| "loss": 0.4956, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.334841628959276, |
| "grad_norm": 1.206194741973144, |
| "learning_rate": 5.381818181818182e-05, |
| "loss": 0.4887, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.3438914027149321, |
| "grad_norm": 1.4311749390727002, |
| "learning_rate": 5.527272727272728e-05, |
| "loss": 0.487, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.35294117647058826, |
| "grad_norm": 0.738265754189619, |
| "learning_rate": 5.672727272727273e-05, |
| "loss": 0.4854, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.36199095022624433, |
| "grad_norm": 1.6935271715239177, |
| "learning_rate": 5.8181818181818185e-05, |
| "loss": 0.4943, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.37104072398190047, |
| "grad_norm": 0.8407873740347103, |
| "learning_rate": 5.9636363636363645e-05, |
| "loss": 0.4846, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.38009049773755654, |
| "grad_norm": 1.168749658810784, |
| "learning_rate": 6.10909090909091e-05, |
| "loss": 0.4765, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.3891402714932127, |
| "grad_norm": 0.9365669815926128, |
| "learning_rate": 6.254545454545456e-05, |
| "loss": 0.4865, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.39819004524886875, |
| "grad_norm": 1.481731917951899, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 0.4858, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.4072398190045249, |
| "grad_norm": 1.52181493703478, |
| "learning_rate": 6.545454545454546e-05, |
| "loss": 0.4949, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.416289592760181, |
| "grad_norm": 1.449133028804705, |
| "learning_rate": 6.690909090909092e-05, |
| "loss": 0.4799, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.4253393665158371, |
| "grad_norm": 1.4529455783010232, |
| "learning_rate": 6.836363636363637e-05, |
| "loss": 0.4779, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.4343891402714932, |
| "grad_norm": 1.3548957795298309, |
| "learning_rate": 6.981818181818182e-05, |
| "loss": 0.4776, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.4434389140271493, |
| "grad_norm": 1.3963891315975814, |
| "learning_rate": 7.127272727272728e-05, |
| "loss": 0.4731, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.45248868778280543, |
| "grad_norm": 1.0657728232300334, |
| "learning_rate": 7.272727272727273e-05, |
| "loss": 0.4702, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.46153846153846156, |
| "grad_norm": 1.2659845812441222, |
| "learning_rate": 7.418181818181818e-05, |
| "loss": 0.4853, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.47058823529411764, |
| "grad_norm": 1.0364505292451895, |
| "learning_rate": 7.563636363636365e-05, |
| "loss": 0.4663, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.4796380090497738, |
| "grad_norm": 1.188728427839461, |
| "learning_rate": 7.70909090909091e-05, |
| "loss": 0.4819, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.48868778280542985, |
| "grad_norm": 1.3147320829735498, |
| "learning_rate": 7.854545454545455e-05, |
| "loss": 0.4765, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.497737556561086, |
| "grad_norm": 1.1857487638939694, |
| "learning_rate": 8e-05, |
| "loss": 0.4764, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5067873303167421, |
| "grad_norm": 1.4561796125999034, |
| "learning_rate": 7.999919440291627e-05, |
| "loss": 0.4853, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.5158371040723982, |
| "grad_norm": 1.3437982531985815, |
| "learning_rate": 7.999677764411438e-05, |
| "loss": 0.4766, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.5248868778280543, |
| "grad_norm": 0.7865846274351967, |
| "learning_rate": 7.999274982094104e-05, |
| "loss": 0.4697, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.5339366515837104, |
| "grad_norm": 1.7177406402851725, |
| "learning_rate": 7.998711109563637e-05, |
| "loss": 0.4753, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.5429864253393665, |
| "grad_norm": 0.765099828673138, |
| "learning_rate": 7.997986169532741e-05, |
| "loss": 0.4646, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5520361990950227, |
| "grad_norm": 1.291319953299644, |
| "learning_rate": 7.997100191201896e-05, |
| "loss": 0.4719, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.5610859728506787, |
| "grad_norm": 1.0231054776331459, |
| "learning_rate": 7.996053210258176e-05, |
| "loss": 0.4597, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.5701357466063348, |
| "grad_norm": 1.020678257497354, |
| "learning_rate": 7.994845268873825e-05, |
| "loss": 0.4631, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.579185520361991, |
| "grad_norm": 1.0920667405954658, |
| "learning_rate": 7.993476415704543e-05, |
| "loss": 0.4557, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.5882352941176471, |
| "grad_norm": 1.1128002216774169, |
| "learning_rate": 7.991946705887539e-05, |
| "loss": 0.4601, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.5972850678733032, |
| "grad_norm": 1.0096600697154985, |
| "learning_rate": 7.990256201039297e-05, |
| "loss": 0.4616, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.6063348416289592, |
| "grad_norm": 1.0630268901035373, |
| "learning_rate": 7.98840496925311e-05, |
| "loss": 0.4536, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 0.6564221253831782, |
| "learning_rate": 7.986393085096324e-05, |
| "loss": 0.4483, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.6244343891402715, |
| "grad_norm": 0.6260742925219016, |
| "learning_rate": 7.984220629607336e-05, |
| "loss": 0.4508, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.6334841628959276, |
| "grad_norm": 0.8273325354500644, |
| "learning_rate": 7.981887690292339e-05, |
| "loss": 0.4494, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6425339366515838, |
| "grad_norm": 0.884333850023049, |
| "learning_rate": 7.979394361121789e-05, |
| "loss": 0.4519, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.6515837104072398, |
| "grad_norm": 1.0182608804991282, |
| "learning_rate": 7.97674074252662e-05, |
| "loss": 0.4555, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.6606334841628959, |
| "grad_norm": 0.8109090124549581, |
| "learning_rate": 7.9739269413942e-05, |
| "loss": 0.4521, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.669683257918552, |
| "grad_norm": 0.9807135848274302, |
| "learning_rate": 7.970953071064036e-05, |
| "loss": 0.4531, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.6787330316742082, |
| "grad_norm": 1.504171669224647, |
| "learning_rate": 7.967819251323182e-05, |
| "loss": 0.4705, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.6877828054298643, |
| "grad_norm": 0.5907918802273645, |
| "learning_rate": 7.964525608401445e-05, |
| "loss": 0.4488, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.6968325791855203, |
| "grad_norm": 1.198484065683406, |
| "learning_rate": 7.961072274966282e-05, |
| "loss": 0.4551, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.7058823529411765, |
| "grad_norm": 0.7873070678087877, |
| "learning_rate": 7.957459390117458e-05, |
| "loss": 0.4457, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.7149321266968326, |
| "grad_norm": 0.8039969398610661, |
| "learning_rate": 7.95368709938145e-05, |
| "loss": 0.454, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.7239819004524887, |
| "grad_norm": 0.7842737782633313, |
| "learning_rate": 7.949755554705577e-05, |
| "loss": 0.441, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.7330316742081447, |
| "grad_norm": 0.8912309653170191, |
| "learning_rate": 7.945664914451888e-05, |
| "loss": 0.4442, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.7420814479638009, |
| "grad_norm": 0.7795917187202375, |
| "learning_rate": 7.941415343390773e-05, |
| "loss": 0.4406, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.751131221719457, |
| "grad_norm": 0.6205090396508431, |
| "learning_rate": 7.937007012694335e-05, |
| "loss": 0.4507, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.7601809954751131, |
| "grad_norm": 0.8937106249538119, |
| "learning_rate": 7.932440099929493e-05, |
| "loss": 0.4532, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.6165097160255976, |
| "learning_rate": 7.927714789050826e-05, |
| "loss": 0.4454, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.7782805429864253, |
| "grad_norm": 0.6003524566440082, |
| "learning_rate": 7.92283127039317e-05, |
| "loss": 0.4367, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.7873303167420814, |
| "grad_norm": 0.6906315651004887, |
| "learning_rate": 7.917789740663941e-05, |
| "loss": 0.4332, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.7963800904977375, |
| "grad_norm": 0.515950260791789, |
| "learning_rate": 7.912590402935223e-05, |
| "loss": 0.4345, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.8054298642533937, |
| "grad_norm": 0.5111081573242718, |
| "learning_rate": 7.907233466635582e-05, |
| "loss": 0.4419, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.8144796380090498, |
| "grad_norm": 0.4275091010122135, |
| "learning_rate": 7.90171914754163e-05, |
| "loss": 0.4383, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.8235294117647058, |
| "grad_norm": 0.5029385491118522, |
| "learning_rate": 7.896047667769335e-05, |
| "loss": 0.4348, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.832579185520362, |
| "grad_norm": 0.4078317539372912, |
| "learning_rate": 7.890219255765077e-05, |
| "loss": 0.4293, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.8416289592760181, |
| "grad_norm": 0.4768996453352078, |
| "learning_rate": 7.884234146296442e-05, |
| "loss": 0.4375, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.8506787330316742, |
| "grad_norm": 0.5642712829841973, |
| "learning_rate": 7.878092580442766e-05, |
| "loss": 0.4301, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.8597285067873304, |
| "grad_norm": 0.7926376614480052, |
| "learning_rate": 7.871794805585427e-05, |
| "loss": 0.4377, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.8687782805429864, |
| "grad_norm": 0.8704449632643672, |
| "learning_rate": 7.865341075397874e-05, |
| "loss": 0.4303, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.8778280542986425, |
| "grad_norm": 0.8638439306395513, |
| "learning_rate": 7.858731649835424e-05, |
| "loss": 0.4321, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.8868778280542986, |
| "grad_norm": 1.1122142269322102, |
| "learning_rate": 7.85196679512477e-05, |
| "loss": 0.4369, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.8959276018099548, |
| "grad_norm": 0.8695186582366218, |
| "learning_rate": 7.845046783753276e-05, |
| "loss": 0.4229, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.9049773755656109, |
| "grad_norm": 0.6907557363110941, |
| "learning_rate": 7.837971894457991e-05, |
| "loss": 0.4294, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.9140271493212669, |
| "grad_norm": 0.5776183515910319, |
| "learning_rate": 7.830742412214422e-05, |
| "loss": 0.4276, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.9230769230769231, |
| "grad_norm": 0.45053560734923015, |
| "learning_rate": 7.82335862822506e-05, |
| "loss": 0.4284, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.9321266968325792, |
| "grad_norm": 0.643150601551687, |
| "learning_rate": 7.815820839907651e-05, |
| "loss": 0.4237, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.9411764705882353, |
| "grad_norm": 0.6867059068503916, |
| "learning_rate": 7.808129350883207e-05, |
| "loss": 0.4325, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.9502262443438914, |
| "grad_norm": 0.47092962014394424, |
| "learning_rate": 7.800284470963783e-05, |
| "loss": 0.4282, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.9592760180995475, |
| "grad_norm": 0.5794346362766767, |
| "learning_rate": 7.792286516139999e-05, |
| "loss": 0.426, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.9683257918552036, |
| "grad_norm": 0.5899947636404669, |
| "learning_rate": 7.784135808568308e-05, |
| "loss": 0.4241, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.9773755656108597, |
| "grad_norm": 0.5788528664751798, |
| "learning_rate": 7.775832676558028e-05, |
| "loss": 0.4356, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.9864253393665159, |
| "grad_norm": 0.8755965755346112, |
| "learning_rate": 7.7673774545581e-05, |
| "loss": 0.4366, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.995475113122172, |
| "grad_norm": 0.9177669118629138, |
| "learning_rate": 7.758770483143634e-05, |
| "loss": 0.4329, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.006787330316742, |
| "grad_norm": 0.6446624913715124, |
| "learning_rate": 7.750012109002185e-05, |
| "loss": 0.4137, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.0158371040723981, |
| "grad_norm": 0.5702180405824077, |
| "learning_rate": 7.741102684919787e-05, |
| "loss": 0.414, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.0248868778280542, |
| "grad_norm": 0.604164406050923, |
| "learning_rate": 7.732042569766741e-05, |
| "loss": 0.4059, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.0339366515837105, |
| "grad_norm": 0.578300789859933, |
| "learning_rate": 7.722832128483165e-05, |
| "loss": 0.4072, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.0429864253393666, |
| "grad_norm": 0.7009255889818755, |
| "learning_rate": 7.71347173206429e-05, |
| "loss": 0.4044, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.0520361990950227, |
| "grad_norm": 0.709442615518018, |
| "learning_rate": 7.703961757545522e-05, |
| "loss": 0.4171, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.0610859728506787, |
| "grad_norm": 0.6827354691854699, |
| "learning_rate": 7.694302587987245e-05, |
| "loss": 0.4079, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.0701357466063348, |
| "grad_norm": 0.6818158818500779, |
| "learning_rate": 7.6844946124594e-05, |
| "loss": 0.4126, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.079185520361991, |
| "grad_norm": 0.863785651181626, |
| "learning_rate": 7.674538226025815e-05, |
| "loss": 0.4109, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.088235294117647, |
| "grad_norm": 0.5508478361873977, |
| "learning_rate": 7.664433829728279e-05, |
| "loss": 0.4028, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.0972850678733033, |
| "grad_norm": 0.5954544705810189, |
| "learning_rate": 7.654181830570404e-05, |
| "loss": 0.3969, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.1063348416289593, |
| "grad_norm": 0.5872119406822112, |
| "learning_rate": 7.64378264150122e-05, |
| "loss": 0.3993, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.1153846153846154, |
| "grad_norm": 0.5228235141898725, |
| "learning_rate": 7.633236681398549e-05, |
| "loss": 0.4063, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.1244343891402715, |
| "grad_norm": 0.6996331404201509, |
| "learning_rate": 7.622544375052124e-05, |
| "loss": 0.4076, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.1334841628959276, |
| "grad_norm": 0.5428162110150009, |
| "learning_rate": 7.611706153146486e-05, |
| "loss": 0.4016, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.1425339366515836, |
| "grad_norm": 0.4476576669392731, |
| "learning_rate": 7.600722452243632e-05, |
| "loss": 0.4018, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.1515837104072397, |
| "grad_norm": 0.47501951208322907, |
| "learning_rate": 7.589593714765434e-05, |
| "loss": 0.4003, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.160633484162896, |
| "grad_norm": 0.39864387167073856, |
| "learning_rate": 7.578320388975816e-05, |
| "loss": 0.4043, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.169683257918552, |
| "grad_norm": 0.4415298124574496, |
| "learning_rate": 7.566902928962694e-05, |
| "loss": 0.3996, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.1787330316742082, |
| "grad_norm": 0.3522729758159122, |
| "learning_rate": 7.555341794619695e-05, |
| "loss": 0.402, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.1877828054298643, |
| "grad_norm": 0.37499820674043755, |
| "learning_rate": 7.543637451627623e-05, |
| "loss": 0.3963, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.1968325791855203, |
| "grad_norm": 0.3140691864451784, |
| "learning_rate": 7.531790371435709e-05, |
| "loss": 0.3961, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.2058823529411764, |
| "grad_norm": 0.3383990510694389, |
| "learning_rate": 7.519801031242613e-05, |
| "loss": 0.4026, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.2149321266968327, |
| "grad_norm": 0.3704940782389753, |
| "learning_rate": 7.507669913977213e-05, |
| "loss": 0.3964, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.2239819004524888, |
| "grad_norm": 0.3963648582905255, |
| "learning_rate": 7.49539750827914e-05, |
| "loss": 0.3989, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.2330316742081449, |
| "grad_norm": 0.43661160954834166, |
| "learning_rate": 7.482984308479109e-05, |
| "loss": 0.3992, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.242081447963801, |
| "grad_norm": 0.4760701506797231, |
| "learning_rate": 7.470430814578997e-05, |
| "loss": 0.4038, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.251131221719457, |
| "grad_norm": 0.6080541303280323, |
| "learning_rate": 7.457737532231708e-05, |
| "loss": 0.4017, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.260180995475113, |
| "grad_norm": 0.8992194165468265, |
| "learning_rate": 7.444904972720803e-05, |
| "loss": 0.4021, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.2692307692307692, |
| "grad_norm": 0.9388790657173226, |
| "learning_rate": 7.431933652939909e-05, |
| "loss": 0.4012, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.2782805429864252, |
| "grad_norm": 0.7115639008601973, |
| "learning_rate": 7.418824095371895e-05, |
| "loss": 0.3981, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.2873303167420813, |
| "grad_norm": 0.482309574495901, |
| "learning_rate": 7.405576828067829e-05, |
| "loss": 0.4016, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.2963800904977376, |
| "grad_norm": 0.4429825222163161, |
| "learning_rate": 7.392192384625704e-05, |
| "loss": 0.3992, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.3054298642533937, |
| "grad_norm": 0.46955938360600935, |
| "learning_rate": 7.378671304168955e-05, |
| "loss": 0.3963, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.3144796380090498, |
| "grad_norm": 0.41143211547851066, |
| "learning_rate": 7.365014131324725e-05, |
| "loss": 0.3989, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.3235294117647058, |
| "grad_norm": 0.3655361428229988, |
| "learning_rate": 7.35122141620195e-05, |
| "loss": 0.3981, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.332579185520362, |
| "grad_norm": 0.3675624921847117, |
| "learning_rate": 7.337293714369182e-05, |
| "loss": 0.3888, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.3416289592760182, |
| "grad_norm": 0.38410932117144336, |
| "learning_rate": 7.323231586832219e-05, |
| "loss": 0.3983, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.3506787330316743, |
| "grad_norm": 0.35565241834369704, |
| "learning_rate": 7.30903560001151e-05, |
| "loss": 0.3927, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.3597285067873304, |
| "grad_norm": 0.3699524089900378, |
| "learning_rate": 7.294706325719331e-05, |
| "loss": 0.4009, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.3687782805429864, |
| "grad_norm": 0.37565752487407766, |
| "learning_rate": 7.280244341136765e-05, |
| "loss": 0.3968, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.3778280542986425, |
| "grad_norm": 0.41101581863985015, |
| "learning_rate": 7.26565022879044e-05, |
| "loss": 0.396, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.3868778280542986, |
| "grad_norm": 0.4755720566371037, |
| "learning_rate": 7.250924576529072e-05, |
| "loss": 0.3941, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.3959276018099547, |
| "grad_norm": 0.5478106344364355, |
| "learning_rate": 7.236067977499791e-05, |
| "loss": 0.396, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.4049773755656108, |
| "grad_norm": 0.6612146719144061, |
| "learning_rate": 7.221081030124235e-05, |
| "loss": 0.3927, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.4140271493212668, |
| "grad_norm": 0.7051474533138959, |
| "learning_rate": 7.205964338074462e-05, |
| "loss": 0.3991, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.4230769230769231, |
| "grad_norm": 0.6630123949633135, |
| "learning_rate": 7.190718510248622e-05, |
| "loss": 0.3969, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.4321266968325792, |
| "grad_norm": 0.5841525031103902, |
| "learning_rate": 7.175344160746438e-05, |
| "loss": 0.4009, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.4411764705882353, |
| "grad_norm": 0.5791330941522758, |
| "learning_rate": 7.159841908844465e-05, |
| "loss": 0.3904, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.4502262443438914, |
| "grad_norm": 0.6974262225802738, |
| "learning_rate": 7.144212378971151e-05, |
| "loss": 0.3965, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.4592760180995474, |
| "grad_norm": 0.7970932314626533, |
| "learning_rate": 7.128456200681678e-05, |
| "loss": 0.3913, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.4683257918552037, |
| "grad_norm": 0.7048456834043483, |
| "learning_rate": 7.11257400863261e-05, |
| "loss": 0.3901, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.4773755656108598, |
| "grad_norm": 0.4096523489923981, |
| "learning_rate": 7.096566442556331e-05, |
| "loss": 0.3937, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.4864253393665159, |
| "grad_norm": 0.3554945327024847, |
| "learning_rate": 7.080434147235263e-05, |
| "loss": 0.3892, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.495475113122172, |
| "grad_norm": 0.5448243159096916, |
| "learning_rate": 7.064177772475912e-05, |
| "loss": 0.3985, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.504524886877828, |
| "grad_norm": 0.4087602063081316, |
| "learning_rate": 7.047797973082684e-05, |
| "loss": 0.3953, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.5135746606334841, |
| "grad_norm": 0.27977995475090656, |
| "learning_rate": 7.031295408831508e-05, |
| "loss": 0.3859, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.5226244343891402, |
| "grad_norm": 0.4128081941990591, |
| "learning_rate": 7.014670744443267e-05, |
| "loss": 0.3993, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.5316742081447963, |
| "grad_norm": 0.3728102691478845, |
| "learning_rate": 6.997924649557017e-05, |
| "loss": 0.3924, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.5407239819004523, |
| "grad_norm": 0.2814741945781907, |
| "learning_rate": 6.98105779870302e-05, |
| "loss": 0.396, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.5497737556561086, |
| "grad_norm": 0.3864339523003812, |
| "learning_rate": 6.964070871275567e-05, |
| "loss": 0.3958, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.5588235294117647, |
| "grad_norm": 0.32053481390391325, |
| "learning_rate": 6.94696455150562e-05, |
| "loss": 0.3927, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.5678733031674208, |
| "grad_norm": 0.25239085998303645, |
| "learning_rate": 6.929739528433244e-05, |
| "loss": 0.3931, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.5769230769230769, |
| "grad_norm": 0.3797864909903561, |
| "learning_rate": 6.912396495879857e-05, |
| "loss": 0.3947, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.5859728506787332, |
| "grad_norm": 0.3778896067077122, |
| "learning_rate": 6.89493615242028e-05, |
| "loss": 0.3958, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.5950226244343892, |
| "grad_norm": 0.34652871604736857, |
| "learning_rate": 6.877359201354606e-05, |
| "loss": 0.3875, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.6040723981900453, |
| "grad_norm": 0.42674803575084275, |
| "learning_rate": 6.859666350679854e-05, |
| "loss": 0.3909, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.6131221719457014, |
| "grad_norm": 0.29475704922050416, |
| "learning_rate": 6.841858313061477e-05, |
| "loss": 0.3861, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.6221719457013575, |
| "grad_norm": 0.3605164686190752, |
| "learning_rate": 6.823935805804626e-05, |
| "loss": 0.3946, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.6312217194570136, |
| "grad_norm": 0.3793403076875403, |
| "learning_rate": 6.805899550825285e-05, |
| "loss": 0.3849, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.6402714932126696, |
| "grad_norm": 0.38647041207728616, |
| "learning_rate": 6.787750274621175e-05, |
| "loss": 0.3841, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.6493212669683257, |
| "grad_norm": 0.41289104580852704, |
| "learning_rate": 6.769488708242492e-05, |
| "loss": 0.3901, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.6583710407239818, |
| "grad_norm": 0.4086406307183557, |
| "learning_rate": 6.751115587262469e-05, |
| "loss": 0.3886, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.6674208144796379, |
| "grad_norm": 0.35780257992696674, |
| "learning_rate": 6.732631651747739e-05, |
| "loss": 0.3832, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.6764705882352942, |
| "grad_norm": 0.30719607252305003, |
| "learning_rate": 6.714037646228529e-05, |
| "loss": 0.3904, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.6855203619909502, |
| "grad_norm": 0.38674289134885453, |
| "learning_rate": 6.695334319668672e-05, |
| "loss": 0.3886, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.6945701357466063, |
| "grad_norm": 0.4005595923099328, |
| "learning_rate": 6.676522425435433e-05, |
| "loss": 0.3919, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.7036199095022626, |
| "grad_norm": 0.2349267690302485, |
| "learning_rate": 6.65760272126917e-05, |
| "loss": 0.3883, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.7126696832579187, |
| "grad_norm": 0.3574524568496641, |
| "learning_rate": 6.638575969252806e-05, |
| "loss": 0.3865, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.7217194570135748, |
| "grad_norm": 0.42670317548076125, |
| "learning_rate": 6.619442935781141e-05, |
| "loss": 0.3914, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.7307692307692308, |
| "grad_norm": 0.30707518042373566, |
| "learning_rate": 6.600204391529971e-05, |
| "loss": 0.3865, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.739819004524887, |
| "grad_norm": 0.2622083300455114, |
| "learning_rate": 6.580861111425053e-05, |
| "loss": 0.391, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.748868778280543, |
| "grad_norm": 0.38093733410787994, |
| "learning_rate": 6.56141387461089e-05, |
| "loss": 0.3872, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.757918552036199, |
| "grad_norm": 0.43742318682713477, |
| "learning_rate": 6.541863464419346e-05, |
| "loss": 0.3938, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.7669683257918551, |
| "grad_norm": 0.4418280959306971, |
| "learning_rate": 6.52221066833809e-05, |
| "loss": 0.3848, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.7760180995475112, |
| "grad_norm": 0.41049856835634324, |
| "learning_rate": 6.502456277978887e-05, |
| "loss": 0.3852, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.7850678733031673, |
| "grad_norm": 0.4150478314670342, |
| "learning_rate": 6.482601089045696e-05, |
| "loss": 0.3844, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.7941176470588234, |
| "grad_norm": 0.43894520954893845, |
| "learning_rate": 6.462645901302633e-05, |
| "loss": 0.3814, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.8031674208144797, |
| "grad_norm": 0.41995958530789795, |
| "learning_rate": 6.442591518541753e-05, |
| "loss": 0.381, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.8122171945701357, |
| "grad_norm": 0.3201288359968715, |
| "learning_rate": 6.422438748550667e-05, |
| "loss": 0.3852, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.8212669683257918, |
| "grad_norm": 0.2931969021729461, |
| "learning_rate": 6.402188403080013e-05, |
| "loss": 0.3878, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.8303167420814481, |
| "grad_norm": 0.4066483749649814, |
| "learning_rate": 6.381841297810753e-05, |
| "loss": 0.3834, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.8393665158371042, |
| "grad_norm": 0.40957398047436605, |
| "learning_rate": 6.361398252321321e-05, |
| "loss": 0.3886, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.8484162895927603, |
| "grad_norm": 0.32160755061403157, |
| "learning_rate": 6.340860090054608e-05, |
| "loss": 0.3841, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.8574660633484164, |
| "grad_norm": 0.23090971242916025, |
| "learning_rate": 6.320227638284793e-05, |
| "loss": 0.3842, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.8665158371040724, |
| "grad_norm": 0.32946206927022925, |
| "learning_rate": 6.29950172808403e-05, |
| "loss": 0.3901, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.8755656108597285, |
| "grad_norm": 0.3530001057183439, |
| "learning_rate": 6.278683194288956e-05, |
| "loss": 0.3852, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.8846153846153846, |
| "grad_norm": 0.28218082313802967, |
| "learning_rate": 6.257772875467078e-05, |
| "loss": 0.387, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.8936651583710407, |
| "grad_norm": 0.3059885996215407, |
| "learning_rate": 6.236771613882987e-05, |
| "loss": 0.3883, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.9027149321266967, |
| "grad_norm": 0.34171931460456756, |
| "learning_rate": 6.215680255464442e-05, |
| "loss": 0.3824, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.9117647058823528, |
| "grad_norm": 0.29374439911760764, |
| "learning_rate": 6.194499649768281e-05, |
| "loss": 0.3864, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.9208144796380089, |
| "grad_norm": 0.29144021228496453, |
| "learning_rate": 6.173230649946213e-05, |
| "loss": 0.3836, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.9298642533936652, |
| "grad_norm": 0.29148303948179005, |
| "learning_rate": 6.15187411271045e-05, |
| "loss": 0.3796, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.9389140271493213, |
| "grad_norm": 0.2754963794966634, |
| "learning_rate": 6.130430898299199e-05, |
| "loss": 0.3848, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.9479638009049773, |
| "grad_norm": 0.26672266015023416, |
| "learning_rate": 6.10890187044201e-05, |
| "loss": 0.3836, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.9570135746606336, |
| "grad_norm": 0.3413044280319467, |
| "learning_rate": 6.087287896324984e-05, |
| "loss": 0.385, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.9660633484162897, |
| "grad_norm": 0.4653279793260335, |
| "learning_rate": 6.0655898465558484e-05, |
| "loss": 0.3871, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.9751131221719458, |
| "grad_norm": 0.4607547435763325, |
| "learning_rate": 6.043808595128883e-05, |
| "loss": 0.3827, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.9841628959276019, |
| "grad_norm": 0.3114177233194163, |
| "learning_rate": 6.021945019389719e-05, |
| "loss": 0.391, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.993212669683258, |
| "grad_norm": 0.263615834735127, |
| "learning_rate": 6.000000000000001e-05, |
| "loss": 0.381, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.004524886877828, |
| "grad_norm": 0.3080883305611625, |
| "learning_rate": 5.977974420901908e-05, |
| "loss": 0.3732, |
| "step": 221 |
| }, |
| { |
| "epoch": 2.013574660633484, |
| "grad_norm": 0.2983316325982498, |
| "learning_rate": 5.955869169282556e-05, |
| "loss": 0.3539, |
| "step": 222 |
| }, |
| { |
| "epoch": 2.02262443438914, |
| "grad_norm": 0.35442755044439017, |
| "learning_rate": 5.9336851355382557e-05, |
| "loss": 0.3625, |
| "step": 223 |
| }, |
| { |
| "epoch": 2.0316742081447963, |
| "grad_norm": 0.5038394603992613, |
| "learning_rate": 5.911423213238653e-05, |
| "loss": 0.3565, |
| "step": 224 |
| }, |
| { |
| "epoch": 2.0407239819004523, |
| "grad_norm": 0.6565318273183188, |
| "learning_rate": 5.889084299090732e-05, |
| "loss": 0.3562, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.0497737556561084, |
| "grad_norm": 0.8028771505743133, |
| "learning_rate": 5.866669292902695e-05, |
| "loss": 0.36, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.0588235294117645, |
| "grad_norm": 0.8257109533011188, |
| "learning_rate": 5.844179097547725e-05, |
| "loss": 0.3602, |
| "step": 227 |
| }, |
| { |
| "epoch": 2.067873303167421, |
| "grad_norm": 0.6350182381420444, |
| "learning_rate": 5.821614618927613e-05, |
| "loss": 0.3571, |
| "step": 228 |
| }, |
| { |
| "epoch": 2.076923076923077, |
| "grad_norm": 0.38135287559376224, |
| "learning_rate": 5.798976765936264e-05, |
| "loss": 0.3559, |
| "step": 229 |
| }, |
| { |
| "epoch": 2.085972850678733, |
| "grad_norm": 0.5026969066810225, |
| "learning_rate": 5.776266450423097e-05, |
| "loss": 0.355, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.0950226244343892, |
| "grad_norm": 0.6298538131496503, |
| "learning_rate": 5.75348458715631e-05, |
| "loss": 0.3595, |
| "step": 231 |
| }, |
| { |
| "epoch": 2.1040723981900453, |
| "grad_norm": 0.3867446080865823, |
| "learning_rate": 5.7306320937860336e-05, |
| "loss": 0.352, |
| "step": 232 |
| }, |
| { |
| "epoch": 2.1131221719457014, |
| "grad_norm": 0.3115968628085643, |
| "learning_rate": 5.7077098908073676e-05, |
| "loss": 0.3582, |
| "step": 233 |
| }, |
| { |
| "epoch": 2.1221719457013575, |
| "grad_norm": 0.3939923894024485, |
| "learning_rate": 5.684718901523307e-05, |
| "loss": 0.3533, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.1312217194570136, |
| "grad_norm": 0.25112142036368945, |
| "learning_rate": 5.661660052007547e-05, |
| "loss": 0.3564, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.1402714932126696, |
| "grad_norm": 0.3261992149324723, |
| "learning_rate": 5.6385342710671815e-05, |
| "loss": 0.3518, |
| "step": 236 |
| }, |
| { |
| "epoch": 2.1493212669683257, |
| "grad_norm": 0.32489012611444446, |
| "learning_rate": 5.6153424902053e-05, |
| "loss": 0.3535, |
| "step": 237 |
| }, |
| { |
| "epoch": 2.158371040723982, |
| "grad_norm": 0.24162335661441722, |
| "learning_rate": 5.59208564358345e-05, |
| "loss": 0.3548, |
| "step": 238 |
| }, |
| { |
| "epoch": 2.167420814479638, |
| "grad_norm": 0.3778776749277093, |
| "learning_rate": 5.568764667984022e-05, |
| "loss": 0.3605, |
| "step": 239 |
| }, |
| { |
| "epoch": 2.176470588235294, |
| "grad_norm": 0.33672093596701086, |
| "learning_rate": 5.5453805027725145e-05, |
| "loss": 0.3538, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.1855203619909505, |
| "grad_norm": 0.24527198542362685, |
| "learning_rate": 5.521934089859692e-05, |
| "loss": 0.3497, |
| "step": 241 |
| }, |
| { |
| "epoch": 2.1945701357466065, |
| "grad_norm": 0.3958849461770139, |
| "learning_rate": 5.4984263736636494e-05, |
| "loss": 0.355, |
| "step": 242 |
| }, |
| { |
| "epoch": 2.2036199095022626, |
| "grad_norm": 0.38916418294477323, |
| "learning_rate": 5.4748583010717636e-05, |
| "loss": 0.3557, |
| "step": 243 |
| }, |
| { |
| "epoch": 2.2126696832579187, |
| "grad_norm": 0.36390563994328023, |
| "learning_rate": 5.451230821402564e-05, |
| "loss": 0.362, |
| "step": 244 |
| }, |
| { |
| "epoch": 2.2217194570135748, |
| "grad_norm": 0.3301056539987755, |
| "learning_rate": 5.427544886367488e-05, |
| "loss": 0.3519, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.230769230769231, |
| "grad_norm": 0.215969613599219, |
| "learning_rate": 5.403801450032544e-05, |
| "loss": 0.3551, |
| "step": 246 |
| }, |
| { |
| "epoch": 2.239819004524887, |
| "grad_norm": 0.2378223811885189, |
| "learning_rate": 5.380001468779883e-05, |
| "loss": 0.359, |
| "step": 247 |
| }, |
| { |
| "epoch": 2.248868778280543, |
| "grad_norm": 0.3256287018733439, |
| "learning_rate": 5.356145901269282e-05, |
| "loss": 0.3569, |
| "step": 248 |
| }, |
| { |
| "epoch": 2.257918552036199, |
| "grad_norm": 0.2802481742472565, |
| "learning_rate": 5.3322357083995235e-05, |
| "loss": 0.3584, |
| "step": 249 |
| }, |
| { |
| "epoch": 2.266968325791855, |
| "grad_norm": 0.30050063645860137, |
| "learning_rate": 5.3082718532696874e-05, |
| "loss": 0.3527, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.276018099547511, |
| "grad_norm": 0.337462061586617, |
| "learning_rate": 5.284255301140364e-05, |
| "loss": 0.3508, |
| "step": 251 |
| }, |
| { |
| "epoch": 2.2850678733031673, |
| "grad_norm": 0.22314780810636892, |
| "learning_rate": 5.2601870193947716e-05, |
| "loss": 0.3486, |
| "step": 252 |
| }, |
| { |
| "epoch": 2.2941176470588234, |
| "grad_norm": 0.2251099676952424, |
| "learning_rate": 5.23606797749979e-05, |
| "loss": 0.3521, |
| "step": 253 |
| }, |
| { |
| "epoch": 2.3031674208144794, |
| "grad_norm": 0.2657181581176672, |
| "learning_rate": 5.21189914696691e-05, |
| "loss": 0.3506, |
| "step": 254 |
| }, |
| { |
| "epoch": 2.3122171945701355, |
| "grad_norm": 0.23800570673769336, |
| "learning_rate": 5.1876815013131e-05, |
| "loss": 0.3511, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.321266968325792, |
| "grad_norm": 0.2673484662572871, |
| "learning_rate": 5.163416016021597e-05, |
| "loss": 0.3537, |
| "step": 256 |
| }, |
| { |
| "epoch": 2.330316742081448, |
| "grad_norm": 0.25378700284589706, |
| "learning_rate": 5.1391036685026093e-05, |
| "loss": 0.3492, |
| "step": 257 |
| }, |
| { |
| "epoch": 2.339366515837104, |
| "grad_norm": 0.2198551274983534, |
| "learning_rate": 5.114745438053952e-05, |
| "loss": 0.3522, |
| "step": 258 |
| }, |
| { |
| "epoch": 2.3484162895927603, |
| "grad_norm": 0.22757974892064678, |
| "learning_rate": 5.0903423058215925e-05, |
| "loss": 0.3531, |
| "step": 259 |
| }, |
| { |
| "epoch": 2.3574660633484164, |
| "grad_norm": 0.2162305337190899, |
| "learning_rate": 5.06589525476014e-05, |
| "loss": 0.3509, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.3665158371040724, |
| "grad_norm": 0.22176152775329794, |
| "learning_rate": 5.0414052695932486e-05, |
| "loss": 0.3539, |
| "step": 261 |
| }, |
| { |
| "epoch": 2.3755656108597285, |
| "grad_norm": 0.20465996494402094, |
| "learning_rate": 5.016873336773949e-05, |
| "loss": 0.3513, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.3846153846153846, |
| "grad_norm": 0.19749781398286828, |
| "learning_rate": 4.992300444444916e-05, |
| "loss": 0.3569, |
| "step": 263 |
| }, |
| { |
| "epoch": 2.3936651583710407, |
| "grad_norm": 0.21862557545293346, |
| "learning_rate": 4.967687582398671e-05, |
| "loss": 0.3532, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.4027149321266967, |
| "grad_norm": 0.22129803132569947, |
| "learning_rate": 4.94303574203771e-05, |
| "loss": 0.3546, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.411764705882353, |
| "grad_norm": 0.22177138969097251, |
| "learning_rate": 4.9183459163345644e-05, |
| "loss": 0.3502, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.420814479638009, |
| "grad_norm": 0.19327131140015494, |
| "learning_rate": 4.893619099791817e-05, |
| "loss": 0.3522, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.4298642533936654, |
| "grad_norm": 0.20052109770619267, |
| "learning_rate": 4.868856288402032e-05, |
| "loss": 0.356, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.4389140271493215, |
| "grad_norm": 0.19644855379050266, |
| "learning_rate": 4.8440584796076395e-05, |
| "loss": 0.3534, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.4479638009049776, |
| "grad_norm": 0.20192604708935719, |
| "learning_rate": 4.819226672260763e-05, |
| "loss": 0.3592, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.4570135746606336, |
| "grad_norm": 0.1922532533153157, |
| "learning_rate": 4.794361866582982e-05, |
| "loss": 0.3552, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.4660633484162897, |
| "grad_norm": 0.22962467692658226, |
| "learning_rate": 4.7694650641250446e-05, |
| "loss": 0.3531, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.475113122171946, |
| "grad_norm": 0.21255494327094565, |
| "learning_rate": 4.7445372677265205e-05, |
| "loss": 0.3554, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.484162895927602, |
| "grad_norm": 0.19556932541533853, |
| "learning_rate": 4.719579481475416e-05, |
| "loss": 0.3558, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.493212669683258, |
| "grad_norm": 0.2127502030224185, |
| "learning_rate": 4.694592710667723e-05, |
| "loss": 0.3558, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.502262443438914, |
| "grad_norm": 0.2051192723904285, |
| "learning_rate": 4.6695779617669236e-05, |
| "loss": 0.355, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.51131221719457, |
| "grad_norm": 0.21459679896962008, |
| "learning_rate": 4.6445362423634574e-05, |
| "loss": 0.3547, |
| "step": 277 |
| }, |
| { |
| "epoch": 2.520361990950226, |
| "grad_norm": 0.21249030888822018, |
| "learning_rate": 4.61946856113413e-05, |
| "loss": 0.3524, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.5294117647058822, |
| "grad_norm": 0.18973130323510315, |
| "learning_rate": 4.594375927801487e-05, |
| "loss": 0.3518, |
| "step": 279 |
| }, |
| { |
| "epoch": 2.5384615384615383, |
| "grad_norm": 0.20099116045455737, |
| "learning_rate": 4.5692593530931416e-05, |
| "loss": 0.3551, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.5475113122171944, |
| "grad_norm": 0.2404974178503931, |
| "learning_rate": 4.5441198487010574e-05, |
| "loss": 0.3546, |
| "step": 281 |
| }, |
| { |
| "epoch": 2.5565610859728505, |
| "grad_norm": 0.21416667678544446, |
| "learning_rate": 4.5189584272408074e-05, |
| "loss": 0.3521, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.5656108597285066, |
| "grad_norm": 0.1821577098280573, |
| "learning_rate": 4.493776102210779e-05, |
| "loss": 0.351, |
| "step": 283 |
| }, |
| { |
| "epoch": 2.5746606334841626, |
| "grad_norm": 0.1940123893538777, |
| "learning_rate": 4.468573887951354e-05, |
| "loss": 0.352, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.583710407239819, |
| "grad_norm": 0.21908628404417796, |
| "learning_rate": 4.4433527996040443e-05, |
| "loss": 0.3555, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.5927601809954752, |
| "grad_norm": 0.16351576977199667, |
| "learning_rate": 4.418113853070614e-05, |
| "loss": 0.3534, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.6018099547511313, |
| "grad_norm": 0.18499158266871718, |
| "learning_rate": 4.392858064972149e-05, |
| "loss": 0.3555, |
| "step": 287 |
| }, |
| { |
| "epoch": 2.6108597285067874, |
| "grad_norm": 0.17222994187080978, |
| "learning_rate": 4.3675864526081106e-05, |
| "loss": 0.3522, |
| "step": 288 |
| }, |
| { |
| "epoch": 2.6199095022624435, |
| "grad_norm": 0.17596242704110496, |
| "learning_rate": 4.34230003391536e-05, |
| "loss": 0.3529, |
| "step": 289 |
| }, |
| { |
| "epoch": 2.6289592760180995, |
| "grad_norm": 0.1788336399501724, |
| "learning_rate": 4.316999827427154e-05, |
| "loss": 0.3501, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.6380090497737556, |
| "grad_norm": 0.15674058262172438, |
| "learning_rate": 4.2916868522321235e-05, |
| "loss": 0.3593, |
| "step": 291 |
| }, |
| { |
| "epoch": 2.6470588235294117, |
| "grad_norm": 0.19461132000856615, |
| "learning_rate": 4.266362127933216e-05, |
| "loss": 0.3542, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.6561085972850678, |
| "grad_norm": 0.18265415777078592, |
| "learning_rate": 4.2410266746066345e-05, |
| "loss": 0.3529, |
| "step": 293 |
| }, |
| { |
| "epoch": 2.665158371040724, |
| "grad_norm": 0.19366748651198787, |
| "learning_rate": 4.215681512760744e-05, |
| "loss": 0.3545, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.6742081447963804, |
| "grad_norm": 0.19701470985076372, |
| "learning_rate": 4.19032766329497e-05, |
| "loss": 0.3533, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.6832579185520364, |
| "grad_norm": 0.2051473001306171, |
| "learning_rate": 4.1649661474586694e-05, |
| "loss": 0.3543, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.6923076923076925, |
| "grad_norm": 0.18550660724506887, |
| "learning_rate": 4.139597986810005e-05, |
| "loss": 0.3569, |
| "step": 297 |
| }, |
| { |
| "epoch": 2.7013574660633486, |
| "grad_norm": 0.20456848012158177, |
| "learning_rate": 4.114224203174785e-05, |
| "loss": 0.3496, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.7104072398190047, |
| "grad_norm": 0.1780622844651236, |
| "learning_rate": 4.0888458186053184e-05, |
| "loss": 0.354, |
| "step": 299 |
| }, |
| { |
| "epoch": 2.7194570135746607, |
| "grad_norm": 0.1640758315975294, |
| "learning_rate": 4.063463855339232e-05, |
| "loss": 0.3496, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.728506787330317, |
| "grad_norm": 0.19228477606561048, |
| "learning_rate": 4.0380793357583076e-05, |
| "loss": 0.3491, |
| "step": 301 |
| }, |
| { |
| "epoch": 2.737556561085973, |
| "grad_norm": 0.1755323101374011, |
| "learning_rate": 4.012693282347289e-05, |
| "loss": 0.3564, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.746606334841629, |
| "grad_norm": 0.1809635624979587, |
| "learning_rate": 3.9873067176527114e-05, |
| "loss": 0.3544, |
| "step": 303 |
| }, |
| { |
| "epoch": 2.755656108597285, |
| "grad_norm": 0.18201509342269132, |
| "learning_rate": 3.961920664241694e-05, |
| "loss": 0.3546, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.764705882352941, |
| "grad_norm": 0.17873879422611214, |
| "learning_rate": 3.9365361446607684e-05, |
| "loss": 0.355, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.773755656108597, |
| "grad_norm": 0.21315929069191436, |
| "learning_rate": 3.911154181394682e-05, |
| "loss": 0.3564, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.7828054298642533, |
| "grad_norm": 0.19819053615429819, |
| "learning_rate": 3.885775796825216e-05, |
| "loss": 0.3548, |
| "step": 307 |
| }, |
| { |
| "epoch": 2.7918552036199094, |
| "grad_norm": 0.13490710454922517, |
| "learning_rate": 3.860402013189998e-05, |
| "loss": 0.3485, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.8009049773755654, |
| "grad_norm": 0.20244714474335393, |
| "learning_rate": 3.835033852541332e-05, |
| "loss": 0.3562, |
| "step": 309 |
| }, |
| { |
| "epoch": 2.8099547511312215, |
| "grad_norm": 0.1724672504964484, |
| "learning_rate": 3.809672336705031e-05, |
| "loss": 0.348, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.8190045248868776, |
| "grad_norm": 0.15081349665835933, |
| "learning_rate": 3.784318487239257e-05, |
| "loss": 0.3546, |
| "step": 311 |
| }, |
| { |
| "epoch": 2.8280542986425337, |
| "grad_norm": 0.1783285611622551, |
| "learning_rate": 3.758973325393367e-05, |
| "loss": 0.35, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.83710407239819, |
| "grad_norm": 0.16120590818140387, |
| "learning_rate": 3.7336378720667846e-05, |
| "loss": 0.3537, |
| "step": 313 |
| }, |
| { |
| "epoch": 2.8461538461538463, |
| "grad_norm": 0.20165982856627884, |
| "learning_rate": 3.708313147767878e-05, |
| "loss": 0.3543, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.8552036199095023, |
| "grad_norm": 0.21095988182947403, |
| "learning_rate": 3.683000172572846e-05, |
| "loss": 0.3544, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.8642533936651584, |
| "grad_norm": 0.18625234007151376, |
| "learning_rate": 3.657699966084642e-05, |
| "loss": 0.3531, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.8733031674208145, |
| "grad_norm": 0.22479750462747256, |
| "learning_rate": 3.632413547391891e-05, |
| "loss": 0.3556, |
| "step": 317 |
| }, |
| { |
| "epoch": 2.8823529411764706, |
| "grad_norm": 0.20909188100700477, |
| "learning_rate": 3.6071419350278515e-05, |
| "loss": 0.3471, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.8914027149321266, |
| "grad_norm": 0.18778607505398776, |
| "learning_rate": 3.581886146929387e-05, |
| "loss": 0.3451, |
| "step": 319 |
| }, |
| { |
| "epoch": 2.9004524886877827, |
| "grad_norm": 0.21561728314983256, |
| "learning_rate": 3.556647200395956e-05, |
| "loss": 0.352, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.909502262443439, |
| "grad_norm": 0.20718371330348473, |
| "learning_rate": 3.5314261120486474e-05, |
| "loss": 0.3493, |
| "step": 321 |
| }, |
| { |
| "epoch": 2.918552036199095, |
| "grad_norm": 0.16443205391902374, |
| "learning_rate": 3.5062238977892214e-05, |
| "loss": 0.3508, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.9276018099547514, |
| "grad_norm": 0.19396119036177084, |
| "learning_rate": 3.481041572759193e-05, |
| "loss": 0.355, |
| "step": 323 |
| }, |
| { |
| "epoch": 2.9366515837104075, |
| "grad_norm": 0.18132782101939904, |
| "learning_rate": 3.4558801512989446e-05, |
| "loss": 0.354, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.9457013574660635, |
| "grad_norm": 0.17332578387246808, |
| "learning_rate": 3.4307406469068604e-05, |
| "loss": 0.3542, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.9547511312217196, |
| "grad_norm": 0.15354450735940553, |
| "learning_rate": 3.405624072198514e-05, |
| "loss": 0.3553, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.9638009049773757, |
| "grad_norm": 0.17733336095930732, |
| "learning_rate": 3.3805314388658714e-05, |
| "loss": 0.3507, |
| "step": 327 |
| }, |
| { |
| "epoch": 2.9728506787330318, |
| "grad_norm": 0.16922858743324629, |
| "learning_rate": 3.3554637576365446e-05, |
| "loss": 0.3469, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.981900452488688, |
| "grad_norm": 0.1635466514191799, |
| "learning_rate": 3.330422038233078e-05, |
| "loss": 0.3499, |
| "step": 329 |
| }, |
| { |
| "epoch": 2.990950226244344, |
| "grad_norm": 0.181396866738228, |
| "learning_rate": 3.305407289332279e-05, |
| "loss": 0.3475, |
| "step": 330 |
| }, |
| { |
| "epoch": 3.002262443438914, |
| "grad_norm": 0.17232098683504665, |
| "learning_rate": 3.280420518524585e-05, |
| "loss": 0.3429, |
| "step": 331 |
| }, |
| { |
| "epoch": 3.01131221719457, |
| "grad_norm": 0.2085779039684738, |
| "learning_rate": 3.25546273227348e-05, |
| "loss": 0.3276, |
| "step": 332 |
| }, |
| { |
| "epoch": 3.020361990950226, |
| "grad_norm": 0.20454372196591433, |
| "learning_rate": 3.230534935874958e-05, |
| "loss": 0.322, |
| "step": 333 |
| }, |
| { |
| "epoch": 3.0294117647058822, |
| "grad_norm": 0.21400683996161396, |
| "learning_rate": 3.205638133417019e-05, |
| "loss": 0.3247, |
| "step": 334 |
| }, |
| { |
| "epoch": 3.0384615384615383, |
| "grad_norm": 0.20994330105610337, |
| "learning_rate": 3.180773327739238e-05, |
| "loss": 0.328, |
| "step": 335 |
| }, |
| { |
| "epoch": 3.0475113122171944, |
| "grad_norm": 0.2107968970213985, |
| "learning_rate": 3.155941520392362e-05, |
| "loss": 0.3235, |
| "step": 336 |
| }, |
| { |
| "epoch": 3.0565610859728505, |
| "grad_norm": 0.20164240350868537, |
| "learning_rate": 3.1311437115979696e-05, |
| "loss": 0.3203, |
| "step": 337 |
| }, |
| { |
| "epoch": 3.065610859728507, |
| "grad_norm": 0.20071440677649943, |
| "learning_rate": 3.1063809002081834e-05, |
| "loss": 0.3221, |
| "step": 338 |
| }, |
| { |
| "epoch": 3.074660633484163, |
| "grad_norm": 0.17663930655160845, |
| "learning_rate": 3.0816540836654356e-05, |
| "loss": 0.3226, |
| "step": 339 |
| }, |
| { |
| "epoch": 3.083710407239819, |
| "grad_norm": 0.18106709441769947, |
| "learning_rate": 3.0569642579622905e-05, |
| "loss": 0.3214, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.0927601809954752, |
| "grad_norm": 0.17780441082604515, |
| "learning_rate": 3.0323124176013297e-05, |
| "loss": 0.3225, |
| "step": 341 |
| }, |
| { |
| "epoch": 3.1018099547511313, |
| "grad_norm": 0.17074344590150428, |
| "learning_rate": 3.007699555555086e-05, |
| "loss": 0.3183, |
| "step": 342 |
| }, |
| { |
| "epoch": 3.1108597285067874, |
| "grad_norm": 0.22118329908847872, |
| "learning_rate": 2.9831266632260534e-05, |
| "loss": 0.3221, |
| "step": 343 |
| }, |
| { |
| "epoch": 3.1199095022624435, |
| "grad_norm": 0.1862795261167787, |
| "learning_rate": 2.958594730406752e-05, |
| "loss": 0.3233, |
| "step": 344 |
| }, |
| { |
| "epoch": 3.1289592760180995, |
| "grad_norm": 0.18624339502853474, |
| "learning_rate": 2.9341047452398607e-05, |
| "loss": 0.3232, |
| "step": 345 |
| }, |
| { |
| "epoch": 3.1380090497737556, |
| "grad_norm": 0.19906920288843233, |
| "learning_rate": 2.9096576941784095e-05, |
| "loss": 0.3217, |
| "step": 346 |
| }, |
| { |
| "epoch": 3.1470588235294117, |
| "grad_norm": 0.1759509265715728, |
| "learning_rate": 2.8852545619460495e-05, |
| "loss": 0.3294, |
| "step": 347 |
| }, |
| { |
| "epoch": 3.1561085972850678, |
| "grad_norm": 0.18215420840842134, |
| "learning_rate": 2.860896331497391e-05, |
| "loss": 0.3178, |
| "step": 348 |
| }, |
| { |
| "epoch": 3.165158371040724, |
| "grad_norm": 0.16928396196250736, |
| "learning_rate": 2.8365839839784038e-05, |
| "loss": 0.3208, |
| "step": 349 |
| }, |
| { |
| "epoch": 3.17420814479638, |
| "grad_norm": 0.18758285635894378, |
| "learning_rate": 2.8123184986869022e-05, |
| "loss": 0.3252, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.183257918552036, |
| "grad_norm": 0.1727196365880379, |
| "learning_rate": 2.7881008530330914e-05, |
| "loss": 0.3219, |
| "step": 351 |
| }, |
| { |
| "epoch": 3.1923076923076925, |
| "grad_norm": 0.19928771398379677, |
| "learning_rate": 2.7639320225002108e-05, |
| "loss": 0.3279, |
| "step": 352 |
| }, |
| { |
| "epoch": 3.2013574660633486, |
| "grad_norm": 0.15934892115019736, |
| "learning_rate": 2.7398129806052298e-05, |
| "loss": 0.3211, |
| "step": 353 |
| }, |
| { |
| "epoch": 3.2104072398190047, |
| "grad_norm": 0.1620987722032314, |
| "learning_rate": 2.715744698859637e-05, |
| "loss": 0.3264, |
| "step": 354 |
| }, |
| { |
| "epoch": 3.2194570135746607, |
| "grad_norm": 0.14580042456343553, |
| "learning_rate": 2.691728146730314e-05, |
| "loss": 0.3247, |
| "step": 355 |
| }, |
| { |
| "epoch": 3.228506787330317, |
| "grad_norm": 0.1669926797929679, |
| "learning_rate": 2.6677642916004772e-05, |
| "loss": 0.3219, |
| "step": 356 |
| }, |
| { |
| "epoch": 3.237556561085973, |
| "grad_norm": 0.13912969057523117, |
| "learning_rate": 2.6438540987307174e-05, |
| "loss": 0.3224, |
| "step": 357 |
| }, |
| { |
| "epoch": 3.246606334841629, |
| "grad_norm": 0.17393961594448798, |
| "learning_rate": 2.6199985312201185e-05, |
| "loss": 0.3287, |
| "step": 358 |
| }, |
| { |
| "epoch": 3.255656108597285, |
| "grad_norm": 0.13336325385823727, |
| "learning_rate": 2.5961985499674587e-05, |
| "loss": 0.3195, |
| "step": 359 |
| }, |
| { |
| "epoch": 3.264705882352941, |
| "grad_norm": 0.16625718933722608, |
| "learning_rate": 2.5724551136325132e-05, |
| "loss": 0.3155, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.273755656108597, |
| "grad_norm": 0.1313925578933839, |
| "learning_rate": 2.5487691785974366e-05, |
| "loss": 0.3245, |
| "step": 361 |
| }, |
| { |
| "epoch": 3.2828054298642533, |
| "grad_norm": 0.1580151751002634, |
| "learning_rate": 2.5251416989282377e-05, |
| "loss": 0.3244, |
| "step": 362 |
| }, |
| { |
| "epoch": 3.2918552036199094, |
| "grad_norm": 0.13919373737905993, |
| "learning_rate": 2.501573626336352e-05, |
| "loss": 0.3204, |
| "step": 363 |
| }, |
| { |
| "epoch": 3.3009049773755654, |
| "grad_norm": 0.1394478161996878, |
| "learning_rate": 2.478065910140308e-05, |
| "loss": 0.319, |
| "step": 364 |
| }, |
| { |
| "epoch": 3.3099547511312215, |
| "grad_norm": 0.1422678230130045, |
| "learning_rate": 2.4546194972274852e-05, |
| "loss": 0.3234, |
| "step": 365 |
| }, |
| { |
| "epoch": 3.3190045248868776, |
| "grad_norm": 0.13767311772002824, |
| "learning_rate": 2.431235332015978e-05, |
| "loss": 0.3222, |
| "step": 366 |
| }, |
| { |
| "epoch": 3.328054298642534, |
| "grad_norm": 0.15146521934073406, |
| "learning_rate": 2.4079143564165524e-05, |
| "loss": 0.3196, |
| "step": 367 |
| }, |
| { |
| "epoch": 3.33710407239819, |
| "grad_norm": 0.14554574710226792, |
| "learning_rate": 2.3846575097947015e-05, |
| "loss": 0.3238, |
| "step": 368 |
| }, |
| { |
| "epoch": 3.3461538461538463, |
| "grad_norm": 0.12428809362324093, |
| "learning_rate": 2.361465728932819e-05, |
| "loss": 0.3281, |
| "step": 369 |
| }, |
| { |
| "epoch": 3.3552036199095023, |
| "grad_norm": 0.1359067551518604, |
| "learning_rate": 2.338339947992455e-05, |
| "loss": 0.3197, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.3642533936651584, |
| "grad_norm": 0.12263338859613652, |
| "learning_rate": 2.315281098476694e-05, |
| "loss": 0.3217, |
| "step": 371 |
| }, |
| { |
| "epoch": 3.3733031674208145, |
| "grad_norm": 0.13047474507331128, |
| "learning_rate": 2.2922901091926334e-05, |
| "loss": 0.3244, |
| "step": 372 |
| }, |
| { |
| "epoch": 3.3823529411764706, |
| "grad_norm": 0.12872903204410682, |
| "learning_rate": 2.269367906213966e-05, |
| "loss": 0.3229, |
| "step": 373 |
| }, |
| { |
| "epoch": 3.3914027149321266, |
| "grad_norm": 0.11879478583728734, |
| "learning_rate": 2.24651541284369e-05, |
| "loss": 0.3246, |
| "step": 374 |
| }, |
| { |
| "epoch": 3.4004524886877827, |
| "grad_norm": 0.13996450916284633, |
| "learning_rate": 2.2237335495769035e-05, |
| "loss": 0.3255, |
| "step": 375 |
| }, |
| { |
| "epoch": 3.409502262443439, |
| "grad_norm": 0.12258279755344627, |
| "learning_rate": 2.2010232340637375e-05, |
| "loss": 0.3222, |
| "step": 376 |
| }, |
| { |
| "epoch": 3.418552036199095, |
| "grad_norm": 0.13396529243580008, |
| "learning_rate": 2.1783853810723895e-05, |
| "loss": 0.3188, |
| "step": 377 |
| }, |
| { |
| "epoch": 3.427601809954751, |
| "grad_norm": 0.12827246370155101, |
| "learning_rate": 2.155820902452276e-05, |
| "loss": 0.3233, |
| "step": 378 |
| }, |
| { |
| "epoch": 3.4366515837104075, |
| "grad_norm": 0.13361073513914423, |
| "learning_rate": 2.1333307070973054e-05, |
| "loss": 0.32, |
| "step": 379 |
| }, |
| { |
| "epoch": 3.4457013574660635, |
| "grad_norm": 0.1294537429532072, |
| "learning_rate": 2.11091570090927e-05, |
| "loss": 0.3234, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.4547511312217196, |
| "grad_norm": 0.12477628801726541, |
| "learning_rate": 2.0885767867613485e-05, |
| "loss": 0.3236, |
| "step": 381 |
| }, |
| { |
| "epoch": 3.4638009049773757, |
| "grad_norm": 0.12556744064400813, |
| "learning_rate": 2.0663148644617443e-05, |
| "loss": 0.3193, |
| "step": 382 |
| }, |
| { |
| "epoch": 3.4728506787330318, |
| "grad_norm": 0.12838442646777212, |
| "learning_rate": 2.044130830717445e-05, |
| "loss": 0.3197, |
| "step": 383 |
| }, |
| { |
| "epoch": 3.481900452488688, |
| "grad_norm": 0.11666989370139778, |
| "learning_rate": 2.0220255790980935e-05, |
| "loss": 0.3222, |
| "step": 384 |
| }, |
| { |
| "epoch": 3.490950226244344, |
| "grad_norm": 0.1201654171036639, |
| "learning_rate": 2.0000000000000012e-05, |
| "loss": 0.323, |
| "step": 385 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 0.12406834396457092, |
| "learning_rate": 1.9780549806102827e-05, |
| "loss": 0.3233, |
| "step": 386 |
| }, |
| { |
| "epoch": 3.509049773755656, |
| "grad_norm": 0.13211561659340326, |
| "learning_rate": 1.9561914048711182e-05, |
| "loss": 0.3328, |
| "step": 387 |
| }, |
| { |
| "epoch": 3.518099547511312, |
| "grad_norm": 0.11794332998469956, |
| "learning_rate": 1.934410153444153e-05, |
| "loss": 0.3206, |
| "step": 388 |
| }, |
| { |
| "epoch": 3.5271493212669682, |
| "grad_norm": 0.11751517121074768, |
| "learning_rate": 1.912712103675017e-05, |
| "loss": 0.3234, |
| "step": 389 |
| }, |
| { |
| "epoch": 3.5361990950226243, |
| "grad_norm": 0.11384664165774953, |
| "learning_rate": 1.8910981295579903e-05, |
| "loss": 0.326, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.5452488687782804, |
| "grad_norm": 0.11619434472524914, |
| "learning_rate": 1.8695691017008005e-05, |
| "loss": 0.328, |
| "step": 391 |
| }, |
| { |
| "epoch": 3.5542986425339365, |
| "grad_norm": 0.11461255200579494, |
| "learning_rate": 1.8481258872895503e-05, |
| "loss": 0.3249, |
| "step": 392 |
| }, |
| { |
| "epoch": 3.5633484162895925, |
| "grad_norm": 0.1269351876097148, |
| "learning_rate": 1.826769350053789e-05, |
| "loss": 0.325, |
| "step": 393 |
| }, |
| { |
| "epoch": 3.5723981900452486, |
| "grad_norm": 0.1262020845365235, |
| "learning_rate": 1.8055003502317213e-05, |
| "loss": 0.3245, |
| "step": 394 |
| }, |
| { |
| "epoch": 3.581447963800905, |
| "grad_norm": 0.1244584292444969, |
| "learning_rate": 1.7843197445355593e-05, |
| "loss": 0.3257, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.590497737556561, |
| "grad_norm": 0.121141018365639, |
| "learning_rate": 1.7632283861170135e-05, |
| "loss": 0.3267, |
| "step": 396 |
| }, |
| { |
| "epoch": 3.5995475113122173, |
| "grad_norm": 0.1199129930925027, |
| "learning_rate": 1.7422271245329244e-05, |
| "loss": 0.3257, |
| "step": 397 |
| }, |
| { |
| "epoch": 3.6085972850678734, |
| "grad_norm": 0.11856999524364402, |
| "learning_rate": 1.721316805711045e-05, |
| "loss": 0.3211, |
| "step": 398 |
| }, |
| { |
| "epoch": 3.6176470588235294, |
| "grad_norm": 0.11909188120229579, |
| "learning_rate": 1.7004982719159712e-05, |
| "loss": 0.3201, |
| "step": 399 |
| }, |
| { |
| "epoch": 3.6266968325791855, |
| "grad_norm": 0.11144474240232284, |
| "learning_rate": 1.679772361715208e-05, |
| "loss": 0.3223, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.6357466063348416, |
| "grad_norm": 0.12526137904233012, |
| "learning_rate": 1.6591399099453952e-05, |
| "loss": 0.3251, |
| "step": 401 |
| }, |
| { |
| "epoch": 3.6447963800904977, |
| "grad_norm": 0.1129996909861758, |
| "learning_rate": 1.6386017476786818e-05, |
| "loss": 0.3264, |
| "step": 402 |
| }, |
| { |
| "epoch": 3.6538461538461537, |
| "grad_norm": 0.11339940885061946, |
| "learning_rate": 1.6181587021892484e-05, |
| "loss": 0.3252, |
| "step": 403 |
| }, |
| { |
| "epoch": 3.66289592760181, |
| "grad_norm": 0.11557131743659978, |
| "learning_rate": 1.5978115969199882e-05, |
| "loss": 0.3241, |
| "step": 404 |
| }, |
| { |
| "epoch": 3.6719457013574663, |
| "grad_norm": 0.12094104342387753, |
| "learning_rate": 1.5775612514493343e-05, |
| "loss": 0.3224, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.6809954751131224, |
| "grad_norm": 0.12996591315042658, |
| "learning_rate": 1.557408481458247e-05, |
| "loss": 0.3214, |
| "step": 406 |
| }, |
| { |
| "epoch": 3.6900452488687785, |
| "grad_norm": 0.10932053351798611, |
| "learning_rate": 1.537354098697367e-05, |
| "loss": 0.3243, |
| "step": 407 |
| }, |
| { |
| "epoch": 3.6990950226244346, |
| "grad_norm": 0.119643634861931, |
| "learning_rate": 1.5173989109543055e-05, |
| "loss": 0.3262, |
| "step": 408 |
| }, |
| { |
| "epoch": 3.7081447963800906, |
| "grad_norm": 0.11650012137085076, |
| "learning_rate": 1.497543722021114e-05, |
| "loss": 0.3248, |
| "step": 409 |
| }, |
| { |
| "epoch": 3.7171945701357467, |
| "grad_norm": 0.11291091689614972, |
| "learning_rate": 1.4777893316619114e-05, |
| "loss": 0.3203, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.726244343891403, |
| "grad_norm": 0.1162086489522257, |
| "learning_rate": 1.4581365355806555e-05, |
| "loss": 0.3243, |
| "step": 411 |
| }, |
| { |
| "epoch": 3.735294117647059, |
| "grad_norm": 0.10927589657003309, |
| "learning_rate": 1.4385861253891111e-05, |
| "loss": 0.327, |
| "step": 412 |
| }, |
| { |
| "epoch": 3.744343891402715, |
| "grad_norm": 0.10801027792504748, |
| "learning_rate": 1.4191388885749487e-05, |
| "loss": 0.3237, |
| "step": 413 |
| }, |
| { |
| "epoch": 3.753393665158371, |
| "grad_norm": 0.11625407101384182, |
| "learning_rate": 1.3997956084700301e-05, |
| "loss": 0.3223, |
| "step": 414 |
| }, |
| { |
| "epoch": 3.762443438914027, |
| "grad_norm": 0.10784892069818107, |
| "learning_rate": 1.3805570642188602e-05, |
| "loss": 0.3258, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.771493212669683, |
| "grad_norm": 0.11300531775753117, |
| "learning_rate": 1.3614240307471942e-05, |
| "loss": 0.3248, |
| "step": 416 |
| }, |
| { |
| "epoch": 3.7805429864253393, |
| "grad_norm": 0.10835333011179236, |
| "learning_rate": 1.34239727873083e-05, |
| "loss": 0.3223, |
| "step": 417 |
| }, |
| { |
| "epoch": 3.7895927601809953, |
| "grad_norm": 0.11188577965419376, |
| "learning_rate": 1.3234775745645684e-05, |
| "loss": 0.321, |
| "step": 418 |
| }, |
| { |
| "epoch": 3.7986425339366514, |
| "grad_norm": 0.12447909695863361, |
| "learning_rate": 1.3046656803313287e-05, |
| "loss": 0.324, |
| "step": 419 |
| }, |
| { |
| "epoch": 3.8076923076923075, |
| "grad_norm": 0.10209002045981093, |
| "learning_rate": 1.2859623537714719e-05, |
| "loss": 0.3227, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.8167420814479636, |
| "grad_norm": 0.11319252778270614, |
| "learning_rate": 1.2673683482522629e-05, |
| "loss": 0.3289, |
| "step": 421 |
| }, |
| { |
| "epoch": 3.8257918552036196, |
| "grad_norm": 0.10456931028449813, |
| "learning_rate": 1.248884412737532e-05, |
| "loss": 0.3236, |
| "step": 422 |
| }, |
| { |
| "epoch": 3.834841628959276, |
| "grad_norm": 0.10976135305835821, |
| "learning_rate": 1.2305112917575092e-05, |
| "loss": 0.3273, |
| "step": 423 |
| }, |
| { |
| "epoch": 3.8438914027149322, |
| "grad_norm": 0.10710488304052018, |
| "learning_rate": 1.2122497253788267e-05, |
| "loss": 0.3224, |
| "step": 424 |
| }, |
| { |
| "epoch": 3.8529411764705883, |
| "grad_norm": 0.10332457574753726, |
| "learning_rate": 1.1941004491747145e-05, |
| "loss": 0.3174, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.8619909502262444, |
| "grad_norm": 0.10300034100616205, |
| "learning_rate": 1.1760641941953744e-05, |
| "loss": 0.3252, |
| "step": 426 |
| }, |
| { |
| "epoch": 3.8710407239819005, |
| "grad_norm": 0.09955598289499694, |
| "learning_rate": 1.1581416869385253e-05, |
| "loss": 0.3223, |
| "step": 427 |
| }, |
| { |
| "epoch": 3.8800904977375565, |
| "grad_norm": 0.10841097410633975, |
| "learning_rate": 1.1403336493201462e-05, |
| "loss": 0.3215, |
| "step": 428 |
| }, |
| { |
| "epoch": 3.8891402714932126, |
| "grad_norm": 0.10174185620822972, |
| "learning_rate": 1.1226407986453963e-05, |
| "loss": 0.3238, |
| "step": 429 |
| }, |
| { |
| "epoch": 3.8981900452488687, |
| "grad_norm": 0.10861737022921748, |
| "learning_rate": 1.1050638475797193e-05, |
| "loss": 0.3256, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.9072398190045248, |
| "grad_norm": 0.10297039570713754, |
| "learning_rate": 1.0876035041201436e-05, |
| "loss": 0.3221, |
| "step": 431 |
| }, |
| { |
| "epoch": 3.916289592760181, |
| "grad_norm": 0.1076446648237558, |
| "learning_rate": 1.070260471566757e-05, |
| "loss": 0.3252, |
| "step": 432 |
| }, |
| { |
| "epoch": 3.9253393665158374, |
| "grad_norm": 0.10925197562208173, |
| "learning_rate": 1.0530354484943798e-05, |
| "loss": 0.3185, |
| "step": 433 |
| }, |
| { |
| "epoch": 3.9343891402714934, |
| "grad_norm": 0.09823521379039317, |
| "learning_rate": 1.0359291287244334e-05, |
| "loss": 0.3241, |
| "step": 434 |
| }, |
| { |
| "epoch": 3.9434389140271495, |
| "grad_norm": 0.11042331281486452, |
| "learning_rate": 1.0189422012969814e-05, |
| "loss": 0.3264, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.9524886877828056, |
| "grad_norm": 0.0998774378258076, |
| "learning_rate": 1.0020753504429845e-05, |
| "loss": 0.3184, |
| "step": 436 |
| }, |
| { |
| "epoch": 3.9615384615384617, |
| "grad_norm": 0.10587358010063602, |
| "learning_rate": 9.85329255556735e-06, |
| "loss": 0.3259, |
| "step": 437 |
| }, |
| { |
| "epoch": 3.9705882352941178, |
| "grad_norm": 0.10052895526823628, |
| "learning_rate": 9.687045911684928e-06, |
| "loss": 0.3239, |
| "step": 438 |
| }, |
| { |
| "epoch": 3.979638009049774, |
| "grad_norm": 0.09541866120111013, |
| "learning_rate": 9.522020269173172e-06, |
| "loss": 0.323, |
| "step": 439 |
| }, |
| { |
| "epoch": 3.98868778280543, |
| "grad_norm": 0.09557886438970163, |
| "learning_rate": 9.358222275240884e-06, |
| "loss": 0.324, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.997737556561086, |
| "grad_norm": 0.10151624518760664, |
| "learning_rate": 9.195658527647371e-06, |
| "loss": 0.3264, |
| "step": 441 |
| }, |
| { |
| "epoch": 4.009049773755656, |
| "grad_norm": 0.16524757813062563, |
| "learning_rate": 9.034335574436701e-06, |
| "loss": 0.3062, |
| "step": 442 |
| }, |
| { |
| "epoch": 4.018099547511312, |
| "grad_norm": 0.12618495103542215, |
| "learning_rate": 8.8742599136739e-06, |
| "loss": 0.3037, |
| "step": 443 |
| }, |
| { |
| "epoch": 4.027149321266968, |
| "grad_norm": 0.12794305399302355, |
| "learning_rate": 8.715437993183235e-06, |
| "loss": 0.3071, |
| "step": 444 |
| }, |
| { |
| "epoch": 4.036199095022624, |
| "grad_norm": 0.1752311726452474, |
| "learning_rate": 8.55787621028851e-06, |
| "loss": 0.3049, |
| "step": 445 |
| }, |
| { |
| "epoch": 4.04524886877828, |
| "grad_norm": 0.13744941353125903, |
| "learning_rate": 8.401580911555353e-06, |
| "loss": 0.3022, |
| "step": 446 |
| }, |
| { |
| "epoch": 4.0542986425339365, |
| "grad_norm": 0.1442214999179444, |
| "learning_rate": 8.24655839253563e-06, |
| "loss": 0.304, |
| "step": 447 |
| }, |
| { |
| "epoch": 4.0633484162895925, |
| "grad_norm": 0.14721326705532867, |
| "learning_rate": 8.09281489751379e-06, |
| "loss": 0.3014, |
| "step": 448 |
| }, |
| { |
| "epoch": 4.072398190045249, |
| "grad_norm": 0.13164956472111614, |
| "learning_rate": 7.940356619255385e-06, |
| "loss": 0.3, |
| "step": 449 |
| }, |
| { |
| "epoch": 4.081447963800905, |
| "grad_norm": 0.14645232900110186, |
| "learning_rate": 7.789189698757656e-06, |
| "loss": 0.3051, |
| "step": 450 |
| }, |
| { |
| "epoch": 4.090497737556561, |
| "grad_norm": 0.13594041957864836, |
| "learning_rate": 7.639320225002106e-06, |
| "loss": 0.3008, |
| "step": 451 |
| }, |
| { |
| "epoch": 4.099547511312217, |
| "grad_norm": 0.12959173947863253, |
| "learning_rate": 7.490754234709285e-06, |
| "loss": 0.3029, |
| "step": 452 |
| }, |
| { |
| "epoch": 4.108597285067873, |
| "grad_norm": 0.1312102043451062, |
| "learning_rate": 7.343497712095619e-06, |
| "loss": 0.3018, |
| "step": 453 |
| }, |
| { |
| "epoch": 4.117647058823529, |
| "grad_norm": 0.12721628165894824, |
| "learning_rate": 7.1975565886323575e-06, |
| "loss": 0.3021, |
| "step": 454 |
| }, |
| { |
| "epoch": 4.126696832579185, |
| "grad_norm": 0.12135804195023794, |
| "learning_rate": 7.052936742806693e-06, |
| "loss": 0.3029, |
| "step": 455 |
| }, |
| { |
| "epoch": 4.135746606334842, |
| "grad_norm": 0.11021717009086096, |
| "learning_rate": 6.909643999884918e-06, |
| "loss": 0.3036, |
| "step": 456 |
| }, |
| { |
| "epoch": 4.144796380090498, |
| "grad_norm": 0.1171461454928136, |
| "learning_rate": 6.767684131677814e-06, |
| "loss": 0.3019, |
| "step": 457 |
| }, |
| { |
| "epoch": 4.153846153846154, |
| "grad_norm": 0.12398145068200955, |
| "learning_rate": 6.627062856308191e-06, |
| "loss": 0.3057, |
| "step": 458 |
| }, |
| { |
| "epoch": 4.16289592760181, |
| "grad_norm": 0.10813946756475247, |
| "learning_rate": 6.487785837980509e-06, |
| "loss": 0.3055, |
| "step": 459 |
| }, |
| { |
| "epoch": 4.171945701357466, |
| "grad_norm": 0.10512317676255085, |
| "learning_rate": 6.349858686752748e-06, |
| "loss": 0.3028, |
| "step": 460 |
| }, |
| { |
| "epoch": 4.180995475113122, |
| "grad_norm": 0.11457252381271353, |
| "learning_rate": 6.213286958310476e-06, |
| "loss": 0.3029, |
| "step": 461 |
| }, |
| { |
| "epoch": 4.1900452488687785, |
| "grad_norm": 0.11257398302051942, |
| "learning_rate": 6.078076153742962e-06, |
| "loss": 0.3042, |
| "step": 462 |
| }, |
| { |
| "epoch": 4.199095022624435, |
| "grad_norm": 0.1068866308205338, |
| "learning_rate": 5.9442317193217245e-06, |
| "loss": 0.3051, |
| "step": 463 |
| }, |
| { |
| "epoch": 4.208144796380091, |
| "grad_norm": 0.09665417595653125, |
| "learning_rate": 5.811759046281062e-06, |
| "loss": 0.3044, |
| "step": 464 |
| }, |
| { |
| "epoch": 4.217194570135747, |
| "grad_norm": 0.10132756213863214, |
| "learning_rate": 5.680663470600918e-06, |
| "loss": 0.3047, |
| "step": 465 |
| }, |
| { |
| "epoch": 4.226244343891403, |
| "grad_norm": 0.10197900247318212, |
| "learning_rate": 5.550950272791977e-06, |
| "loss": 0.3036, |
| "step": 466 |
| }, |
| { |
| "epoch": 4.235294117647059, |
| "grad_norm": 0.10011788827590404, |
| "learning_rate": 5.422624677682935e-06, |
| "loss": 0.3042, |
| "step": 467 |
| }, |
| { |
| "epoch": 4.244343891402715, |
| "grad_norm": 0.09704681985529298, |
| "learning_rate": 5.29569185421003e-06, |
| "loss": 0.2997, |
| "step": 468 |
| }, |
| { |
| "epoch": 4.253393665158371, |
| "grad_norm": 0.0889335395278227, |
| "learning_rate": 5.1701569152089196e-06, |
| "loss": 0.3033, |
| "step": 469 |
| }, |
| { |
| "epoch": 4.262443438914027, |
| "grad_norm": 0.09701179356263233, |
| "learning_rate": 5.046024917208603e-06, |
| "loss": 0.3074, |
| "step": 470 |
| }, |
| { |
| "epoch": 4.271493212669683, |
| "grad_norm": 0.09991090929407241, |
| "learning_rate": 4.923300860227884e-06, |
| "loss": 0.3074, |
| "step": 471 |
| }, |
| { |
| "epoch": 4.280542986425339, |
| "grad_norm": 0.09090875982688397, |
| "learning_rate": 4.801989687573878e-06, |
| "loss": 0.3049, |
| "step": 472 |
| }, |
| { |
| "epoch": 4.289592760180995, |
| "grad_norm": 0.08947832790676682, |
| "learning_rate": 4.6820962856429205e-06, |
| "loss": 0.3039, |
| "step": 473 |
| }, |
| { |
| "epoch": 4.298642533936651, |
| "grad_norm": 0.08534002305718665, |
| "learning_rate": 4.563625483723777e-06, |
| "loss": 0.3079, |
| "step": 474 |
| }, |
| { |
| "epoch": 4.3076923076923075, |
| "grad_norm": 0.08940524333139004, |
| "learning_rate": 4.446582053803066e-06, |
| "loss": 0.2974, |
| "step": 475 |
| }, |
| { |
| "epoch": 4.316742081447964, |
| "grad_norm": 0.09171172616230228, |
| "learning_rate": 4.330970710373063e-06, |
| "loss": 0.3076, |
| "step": 476 |
| }, |
| { |
| "epoch": 4.32579185520362, |
| "grad_norm": 0.08806621206621702, |
| "learning_rate": 4.216796110241852e-06, |
| "loss": 0.3048, |
| "step": 477 |
| }, |
| { |
| "epoch": 4.334841628959276, |
| "grad_norm": 0.08409826711122455, |
| "learning_rate": 4.104062852345671e-06, |
| "loss": 0.3013, |
| "step": 478 |
| }, |
| { |
| "epoch": 4.343891402714932, |
| "grad_norm": 0.0893222760374292, |
| "learning_rate": 3.9927754775636886e-06, |
| "loss": 0.3052, |
| "step": 479 |
| }, |
| { |
| "epoch": 4.352941176470588, |
| "grad_norm": 0.08399072448662723, |
| "learning_rate": 3.882938468535158e-06, |
| "loss": 0.3024, |
| "step": 480 |
| }, |
| { |
| "epoch": 4.361990950226244, |
| "grad_norm": 0.08499472493876709, |
| "learning_rate": 3.7745562494787645e-06, |
| "loss": 0.3022, |
| "step": 481 |
| }, |
| { |
| "epoch": 4.371040723981901, |
| "grad_norm": 0.08145638475145033, |
| "learning_rate": 3.667633186014512e-06, |
| "loss": 0.3054, |
| "step": 482 |
| }, |
| { |
| "epoch": 4.380090497737557, |
| "grad_norm": 0.08565742656060206, |
| "learning_rate": 3.5621735849877957e-06, |
| "loss": 0.3026, |
| "step": 483 |
| }, |
| { |
| "epoch": 4.389140271493213, |
| "grad_norm": 0.08492763740421684, |
| "learning_rate": 3.458181694295961e-06, |
| "loss": 0.3055, |
| "step": 484 |
| }, |
| { |
| "epoch": 4.398190045248869, |
| "grad_norm": 0.08463274662713963, |
| "learning_rate": 3.3556617027172168e-06, |
| "loss": 0.3024, |
| "step": 485 |
| }, |
| { |
| "epoch": 4.407239819004525, |
| "grad_norm": 0.08462399748218868, |
| "learning_rate": 3.2546177397418677e-06, |
| "loss": 0.3066, |
| "step": 486 |
| }, |
| { |
| "epoch": 4.416289592760181, |
| "grad_norm": 0.08376453881287438, |
| "learning_rate": 3.155053875406e-06, |
| "loss": 0.3042, |
| "step": 487 |
| }, |
| { |
| "epoch": 4.425339366515837, |
| "grad_norm": 0.08830270926768068, |
| "learning_rate": 3.0569741201275626e-06, |
| "loss": 0.2987, |
| "step": 488 |
| }, |
| { |
| "epoch": 4.4343891402714934, |
| "grad_norm": 0.08240870254024944, |
| "learning_rate": 2.96038242454479e-06, |
| "loss": 0.3003, |
| "step": 489 |
| }, |
| { |
| "epoch": 4.4434389140271495, |
| "grad_norm": 0.09052455529057644, |
| "learning_rate": 2.8652826793570975e-06, |
| "loss": 0.3028, |
| "step": 490 |
| }, |
| { |
| "epoch": 4.452488687782806, |
| "grad_norm": 0.08455563785960757, |
| "learning_rate": 2.77167871516836e-06, |
| "loss": 0.3073, |
| "step": 491 |
| }, |
| { |
| "epoch": 4.461538461538462, |
| "grad_norm": 0.08127731938352123, |
| "learning_rate": 2.679574302332597e-06, |
| "loss": 0.2999, |
| "step": 492 |
| }, |
| { |
| "epoch": 4.470588235294118, |
| "grad_norm": 0.08359245964188998, |
| "learning_rate": 2.5889731508021363e-06, |
| "loss": 0.3049, |
| "step": 493 |
| }, |
| { |
| "epoch": 4.479638009049774, |
| "grad_norm": 0.08004277394855593, |
| "learning_rate": 2.4998789099781507e-06, |
| "loss": 0.2995, |
| "step": 494 |
| }, |
| { |
| "epoch": 4.48868778280543, |
| "grad_norm": 0.09293016471118876, |
| "learning_rate": 2.4122951685636674e-06, |
| "loss": 0.3084, |
| "step": 495 |
| }, |
| { |
| "epoch": 4.497737556561086, |
| "grad_norm": 0.08857129082558779, |
| "learning_rate": 2.3262254544190154e-06, |
| "loss": 0.3028, |
| "step": 496 |
| }, |
| { |
| "epoch": 4.506787330316742, |
| "grad_norm": 0.0851690989816207, |
| "learning_rate": 2.2416732344197368e-06, |
| "loss": 0.3052, |
| "step": 497 |
| }, |
| { |
| "epoch": 4.515837104072398, |
| "grad_norm": 0.08380605677642579, |
| "learning_rate": 2.1586419143169125e-06, |
| "loss": 0.3071, |
| "step": 498 |
| }, |
| { |
| "epoch": 4.524886877828054, |
| "grad_norm": 0.07847574450950273, |
| "learning_rate": 2.0771348386000233e-06, |
| "loss": 0.3034, |
| "step": 499 |
| }, |
| { |
| "epoch": 4.53393665158371, |
| "grad_norm": 0.08166491411892216, |
| "learning_rate": 1.997155290362187e-06, |
| "loss": 0.3065, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.542986425339366, |
| "grad_norm": 0.07778971565605453, |
| "learning_rate": 1.9187064911679432e-06, |
| "loss": 0.3024, |
| "step": 501 |
| }, |
| { |
| "epoch": 4.552036199095022, |
| "grad_norm": 0.07933626982513363, |
| "learning_rate": 1.841791600923495e-06, |
| "loss": 0.3023, |
| "step": 502 |
| }, |
| { |
| "epoch": 4.5610859728506785, |
| "grad_norm": 0.08136425063541126, |
| "learning_rate": 1.766413717749389e-06, |
| "loss": 0.304, |
| "step": 503 |
| }, |
| { |
| "epoch": 4.570135746606335, |
| "grad_norm": 0.08138859986883137, |
| "learning_rate": 1.6925758778557933e-06, |
| "loss": 0.3055, |
| "step": 504 |
| }, |
| { |
| "epoch": 4.579185520361991, |
| "grad_norm": 0.0787210528579053, |
| "learning_rate": 1.6202810554201099e-06, |
| "loss": 0.3049, |
| "step": 505 |
| }, |
| { |
| "epoch": 4.588235294117647, |
| "grad_norm": 0.08051236562441969, |
| "learning_rate": 1.5495321624672443e-06, |
| "loss": 0.3044, |
| "step": 506 |
| }, |
| { |
| "epoch": 4.597285067873303, |
| "grad_norm": 0.0788996281741959, |
| "learning_rate": 1.4803320487523087e-06, |
| "loss": 0.3057, |
| "step": 507 |
| }, |
| { |
| "epoch": 4.606334841628959, |
| "grad_norm": 0.07764118519403024, |
| "learning_rate": 1.4126835016457752e-06, |
| "loss": 0.3038, |
| "step": 508 |
| }, |
| { |
| "epoch": 4.615384615384615, |
| "grad_norm": 0.08001544469825624, |
| "learning_rate": 1.3465892460212594e-06, |
| "loss": 0.3064, |
| "step": 509 |
| }, |
| { |
| "epoch": 4.624434389140271, |
| "grad_norm": 0.09063120326499433, |
| "learning_rate": 1.2820519441457502e-06, |
| "loss": 0.3047, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.633484162895927, |
| "grad_norm": 0.07794623861050551, |
| "learning_rate": 1.2190741955723495e-06, |
| "loss": 0.3045, |
| "step": 511 |
| }, |
| { |
| "epoch": 4.642533936651584, |
| "grad_norm": 0.07595212127483163, |
| "learning_rate": 1.1576585370355908e-06, |
| "loss": 0.3065, |
| "step": 512 |
| }, |
| { |
| "epoch": 4.65158371040724, |
| "grad_norm": 0.07739705488047151, |
| "learning_rate": 1.0978074423492368e-06, |
| "loss": 0.3014, |
| "step": 513 |
| }, |
| { |
| "epoch": 4.660633484162896, |
| "grad_norm": 0.07685164772052698, |
| "learning_rate": 1.0395233223066614e-06, |
| "loss": 0.2996, |
| "step": 514 |
| }, |
| { |
| "epoch": 4.669683257918552, |
| "grad_norm": 0.08015550295148605, |
| "learning_rate": 9.828085245837183e-07, |
| "loss": 0.307, |
| "step": 515 |
| }, |
| { |
| "epoch": 4.678733031674208, |
| "grad_norm": 0.08113161848829428, |
| "learning_rate": 9.276653336441943e-07, |
| "loss": 0.3091, |
| "step": 516 |
| }, |
| { |
| "epoch": 4.6877828054298645, |
| "grad_norm": 0.08234301018528185, |
| "learning_rate": 8.740959706477725e-07, |
| "loss": 0.3022, |
| "step": 517 |
| }, |
| { |
| "epoch": 4.6968325791855206, |
| "grad_norm": 0.07811510319887344, |
| "learning_rate": 8.221025933605919e-07, |
| "loss": 0.3027, |
| "step": 518 |
| }, |
| { |
| "epoch": 4.705882352941177, |
| "grad_norm": 0.07966253839252949, |
| "learning_rate": 7.716872960683041e-07, |
| "loss": 0.304, |
| "step": 519 |
| }, |
| { |
| "epoch": 4.714932126696833, |
| "grad_norm": 0.07556300199073022, |
| "learning_rate": 7.228521094917318e-07, |
| "loss": 0.303, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.723981900452489, |
| "grad_norm": 0.07949886614317854, |
| "learning_rate": 6.755990007050761e-07, |
| "loss": 0.3061, |
| "step": 521 |
| }, |
| { |
| "epoch": 4.733031674208145, |
| "grad_norm": 0.07841434742669152, |
| "learning_rate": 6.299298730566516e-07, |
| "loss": 0.3068, |
| "step": 522 |
| }, |
| { |
| "epoch": 4.742081447963801, |
| "grad_norm": 0.07541915667043782, |
| "learning_rate": 5.858465660922808e-07, |
| "loss": 0.3066, |
| "step": 523 |
| }, |
| { |
| "epoch": 4.751131221719457, |
| "grad_norm": 0.07733742513960733, |
| "learning_rate": 5.433508554811307e-07, |
| "loss": 0.3057, |
| "step": 524 |
| }, |
| { |
| "epoch": 4.760180995475113, |
| "grad_norm": 0.07836214413259006, |
| "learning_rate": 5.024444529442285e-07, |
| "loss": 0.3031, |
| "step": 525 |
| }, |
| { |
| "epoch": 4.769230769230769, |
| "grad_norm": 0.07953356517050592, |
| "learning_rate": 4.6312900618550317e-07, |
| "loss": 0.3016, |
| "step": 526 |
| }, |
| { |
| "epoch": 4.778280542986425, |
| "grad_norm": 0.07617521054682652, |
| "learning_rate": 4.254060988254205e-07, |
| "loss": 0.2999, |
| "step": 527 |
| }, |
| { |
| "epoch": 4.787330316742081, |
| "grad_norm": 0.07846308080577853, |
| "learning_rate": 3.8927725033718553e-07, |
| "loss": 0.3051, |
| "step": 528 |
| }, |
| { |
| "epoch": 4.796380090497737, |
| "grad_norm": 0.07896885874732473, |
| "learning_rate": 3.547439159855559e-07, |
| "loss": 0.3063, |
| "step": 529 |
| }, |
| { |
| "epoch": 4.8054298642533935, |
| "grad_norm": 0.07878929850813644, |
| "learning_rate": 3.218074867681864e-07, |
| "loss": 0.302, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.8144796380090495, |
| "grad_norm": 0.07613359977307133, |
| "learning_rate": 2.9046928935966056e-07, |
| "loss": 0.2972, |
| "step": 531 |
| }, |
| { |
| "epoch": 4.823529411764706, |
| "grad_norm": 0.07688492198921053, |
| "learning_rate": 2.607305860579912e-07, |
| "loss": 0.3067, |
| "step": 532 |
| }, |
| { |
| "epoch": 4.832579185520362, |
| "grad_norm": 0.07476518062895497, |
| "learning_rate": 2.3259257473381647e-07, |
| "loss": 0.2991, |
| "step": 533 |
| }, |
| { |
| "epoch": 4.841628959276018, |
| "grad_norm": 0.07552831124187401, |
| "learning_rate": 2.060563887821232e-07, |
| "loss": 0.3005, |
| "step": 534 |
| }, |
| { |
| "epoch": 4.850678733031674, |
| "grad_norm": 0.07631228481012646, |
| "learning_rate": 1.8112309707661647e-07, |
| "loss": 0.3035, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.859728506787331, |
| "grad_norm": 0.07467074977475399, |
| "learning_rate": 1.577937039266475e-07, |
| "loss": 0.3056, |
| "step": 536 |
| }, |
| { |
| "epoch": 4.868778280542987, |
| "grad_norm": 0.07431003159382249, |
| "learning_rate": 1.3606914903677492e-07, |
| "loss": 0.2996, |
| "step": 537 |
| }, |
| { |
| "epoch": 4.877828054298643, |
| "grad_norm": 0.07468768960052453, |
| "learning_rate": 1.1595030746890168e-07, |
| "loss": 0.3077, |
| "step": 538 |
| }, |
| { |
| "epoch": 4.886877828054299, |
| "grad_norm": 0.07691379852821048, |
| "learning_rate": 9.74379896070321e-08, |
| "loss": 0.303, |
| "step": 539 |
| }, |
| { |
| "epoch": 4.895927601809955, |
| "grad_norm": 0.07405617978413666, |
| "learning_rate": 8.053294112462696e-08, |
| "loss": 0.3029, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.904977375565611, |
| "grad_norm": 0.07237727103301038, |
| "learning_rate": 6.523584295457408e-08, |
| "loss": 0.3045, |
| "step": 541 |
| }, |
| { |
| "epoch": 4.914027149321267, |
| "grad_norm": 0.07736995275320246, |
| "learning_rate": 5.154731126176149e-08, |
| "loss": 0.3072, |
| "step": 542 |
| }, |
| { |
| "epoch": 4.923076923076923, |
| "grad_norm": 0.07450947266190626, |
| "learning_rate": 3.946789741824386e-08, |
| "loss": 0.2999, |
| "step": 543 |
| }, |
| { |
| "epoch": 4.932126696832579, |
| "grad_norm": 0.07656920155960474, |
| "learning_rate": 2.8998087981055854e-08, |
| "loss": 0.3001, |
| "step": 544 |
| }, |
| { |
| "epoch": 4.9411764705882355, |
| "grad_norm": 0.07427882247951784, |
| "learning_rate": 2.01383046725967e-08, |
| "loss": 0.3011, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.950226244343892, |
| "grad_norm": 0.07493198557289557, |
| "learning_rate": 1.288890436363488e-08, |
| "loss": 0.3073, |
| "step": 546 |
| }, |
| { |
| "epoch": 4.959276018099548, |
| "grad_norm": 0.07595703866942234, |
| "learning_rate": 7.250179058968521e-09, |
| "loss": 0.302, |
| "step": 547 |
| }, |
| { |
| "epoch": 4.968325791855204, |
| "grad_norm": 0.07667669947739308, |
| "learning_rate": 3.222355885625916e-09, |
| "loss": 0.3058, |
| "step": 548 |
| }, |
| { |
| "epoch": 4.97737556561086, |
| "grad_norm": 0.0778877092524117, |
| "learning_rate": 8.055970837395066e-10, |
| "loss": 0.3071, |
| "step": 549 |
| }, |
| { |
| "epoch": 4.986425339366516, |
| "grad_norm": 0.07598122336675614, |
| "learning_rate": 0.0, |
| "loss": 0.3021, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.986425339366516, |
| "step": 550, |
| "total_flos": 1.3210208775492862e+19, |
| "train_loss": 0.37584147680889474, |
| "train_runtime": 27424.4359, |
| "train_samples_per_second": 10.307, |
| "train_steps_per_second": 0.02 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 550, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3210208775492862e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
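
The JSON above is the complete trainer state written at the end of the run. As a minimal sketch of how to consume it (the local file name `trainer_state.json` and the printed output are assumptions, not part of the original log), the snippet below loads the state, pulls the per-step losses out of `log_history`, and cross-checks the reported throughput against `global_step` and `train_runtime`:

```python
# Minimal sketch, not part of the original trainer output.
# Assumes the JSON above has been saved locally as "trainer_state.json".
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Per-step entries carry a "loss" field; the final entry holds run-level stats
# (train_loss, train_runtime, throughput) and is excluded by this filter.
step_entries = [e for e in history if "loss" in e]
print(f"logged steps: {len(step_entries)}")
print(f"first/last loss: {step_entries[0]['loss']:.4f} / {step_entries[-1]['loss']:.4f}")

# Cross-check the reported throughput: global_step / train_runtime should
# reproduce train_steps_per_second (550 / 27424.4 ≈ 0.02 for this run).
summary = history[-1]
derived_steps_per_s = state["global_step"] / summary["train_runtime"]
print(f"reported steps/s: {summary['train_steps_per_second']}")
print(f"derived  steps/s: {derived_steps_per_s:.3f}")
```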