{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 389,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002570694087403599,
      "grad_norm": 0.0,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.7749,
      "step": 1
    },
    {
      "epoch": 0.005141388174807198,
      "grad_norm": 0.0,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.7549,
      "step": 2
    },
    {
      "epoch": 0.007712082262210797,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.7659,
      "step": 3
    },
    {
      "epoch": 0.010282776349614395,
      "grad_norm": 0.0,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.7842,
      "step": 4
    },
    {
      "epoch": 0.012853470437017995,
      "grad_norm": 0.0,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.8015,
      "step": 5
    },
    {
      "epoch": 0.015424164524421594,
      "grad_norm": 0.0,
      "learning_rate": 1e-05,
      "loss": 1.7534,
      "step": 6
    },
    {
      "epoch": 0.017994858611825194,
      "grad_norm": 0.0,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 1.7747,
      "step": 7
    },
    {
      "epoch": 0.02056555269922879,
      "grad_norm": 0.0,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.7783,
      "step": 8
    },
    {
      "epoch": 0.02313624678663239,
      "grad_norm": 0.0,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.803,
      "step": 9
    },
    {
      "epoch": 0.02570694087403599,
      "grad_norm": 0.0,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.7859,
      "step": 10
    },
    {
      "epoch": 0.028277634961439587,
      "grad_norm": 0.0,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 1.7856,
      "step": 11
    },
    {
      "epoch": 0.030848329048843187,
      "grad_norm": 0.0,
      "learning_rate": 2e-05,
      "loss": 1.7825,
      "step": 12
    },
    {
      "epoch": 0.033419023136246784,
      "grad_norm": 0.0,
      "learning_rate": 1.9999652796146877e-05,
      "loss": 1.8127,
      "step": 13
    },
    {
      "epoch": 0.03598971722365039,
      "grad_norm": 0.0,
      "learning_rate": 1.9998611208697607e-05,
      "loss": 1.8115,
      "step": 14
    },
    {
      "epoch": 0.038560411311053984,
      "grad_norm": 0.0,
      "learning_rate": 1.9996875309980824e-05,
      "loss": 1.7544,
      "step": 15
    },
    {
      "epoch": 0.04113110539845758,
      "grad_norm": 0.0,
      "learning_rate": 1.9994445220538678e-05,
      "loss": 1.7886,
      "step": 16
    },
    {
      "epoch": 0.043701799485861184,
      "grad_norm": 0.0,
      "learning_rate": 1.999132110911845e-05,
      "loss": 1.7866,
      "step": 17
    },
    {
      "epoch": 0.04627249357326478,
      "grad_norm": 0.0,
      "learning_rate": 1.9987503192660842e-05,
      "loss": 1.7612,
      "step": 18
    },
    {
      "epoch": 0.04884318766066838,
      "grad_norm": 0.0,
      "learning_rate": 1.9982991736284914e-05,
      "loss": 1.7944,
      "step": 19
    },
    {
      "epoch": 0.05141388174807198,
      "grad_norm": 0.0,
      "learning_rate": 1.997778705326968e-05,
      "loss": 1.7656,
      "step": 20
    },
    {
      "epoch": 0.05398457583547558,
      "grad_norm": 0.0,
      "learning_rate": 1.9971889505032337e-05,
      "loss": 1.7554,
      "step": 21
    },
    {
      "epoch": 0.056555269922879174,
      "grad_norm": 0.0,
      "learning_rate": 1.9965299501103178e-05,
      "loss": 1.7637,
      "step": 22
    },
    {
      "epoch": 0.05912596401028278,
      "grad_norm": 0.0,
      "learning_rate": 1.995801749909715e-05,
      "loss": 1.7803,
      "step": 23
    },
    {
      "epoch": 0.061696658097686374,
      "grad_norm": 0.0,
      "learning_rate": 1.995004400468209e-05,
      "loss": 1.7402,
      "step": 24
    },
    {
      "epoch": 0.06426735218508997,
      "grad_norm": 0.0,
      "learning_rate": 1.9941379571543597e-05,
      "loss": 1.7017,
      "step": 25
    },
    {
      "epoch": 0.06683804627249357,
      "grad_norm": 0.0,
      "learning_rate": 1.9932024801346583e-05,
      "loss": 1.7671,
      "step": 26
    },
    {
      "epoch": 0.06940874035989718,
      "grad_norm": 0.0,
      "learning_rate": 1.992198034369349e-05,
      "loss": 1.7014,
      "step": 27
    },
    {
      "epoch": 0.07197943444730077,
      "grad_norm": 0.0,
      "learning_rate": 1.991124689607921e-05,
      "loss": 1.7532,
      "step": 28
    },
    {
      "epoch": 0.07455012853470437,
      "grad_norm": 0.0,
      "learning_rate": 1.9899825203842613e-05,
      "loss": 1.7129,
      "step": 29
    },
    {
      "epoch": 0.07712082262210797,
      "grad_norm": 0.0,
      "learning_rate": 1.988771606011481e-05,
      "loss": 1.7126,
      "step": 30
    },
    {
      "epoch": 0.07969151670951156,
      "grad_norm": 0.0,
      "learning_rate": 1.987492030576407e-05,
      "loss": 1.7393,
      "step": 31
    },
    {
      "epoch": 0.08226221079691516,
      "grad_norm": 0.0,
      "learning_rate": 1.986143882933744e-05,
      "loss": 1.7742,
      "step": 32
    },
    {
      "epoch": 0.08483290488431877,
      "grad_norm": 0.0,
      "learning_rate": 1.9847272566999026e-05,
      "loss": 1.7483,
      "step": 33
    },
    {
      "epoch": 0.08740359897172237,
      "grad_norm": 0.0,
      "learning_rate": 1.9832422502465013e-05,
      "loss": 1.707,
      "step": 34
    },
    {
      "epoch": 0.08997429305912596,
      "grad_norm": 0.0,
      "learning_rate": 1.9816889666935318e-05,
      "loss": 1.7507,
      "step": 35
    },
    {
      "epoch": 0.09254498714652956,
      "grad_norm": 0.0,
      "learning_rate": 1.9800675139022006e-05,
      "loss": 1.7339,
      "step": 36
    },
    {
      "epoch": 0.09511568123393316,
      "grad_norm": 0.0,
      "learning_rate": 1.9783780044674402e-05,
      "loss": 1.748,
      "step": 37
    },
    {
      "epoch": 0.09768637532133675,
      "grad_norm": 0.0,
      "learning_rate": 1.976620555710087e-05,
      "loss": 1.686,
      "step": 38
    },
    {
      "epoch": 0.10025706940874037,
      "grad_norm": 0.0,
      "learning_rate": 1.974795289668737e-05,
      "loss": 1.7043,
      "step": 39
    },
    {
      "epoch": 0.10282776349614396,
      "grad_norm": 0.0,
      "learning_rate": 1.972902333091271e-05,
      "loss": 1.7646,
      "step": 40
    },
    {
      "epoch": 0.10539845758354756,
      "grad_norm": 0.0,
      "learning_rate": 1.9709418174260523e-05,
      "loss": 1.6802,
      "step": 41
    },
    {
      "epoch": 0.10796915167095116,
      "grad_norm": 0.0,
      "learning_rate": 1.9689138788127994e-05,
      "loss": 1.6775,
      "step": 42
    },
    {
      "epoch": 0.11053984575835475,
      "grad_norm": 0.0,
      "learning_rate": 1.966818658073133e-05,
      "loss": 1.6633,
      "step": 43
    },
    {
      "epoch": 0.11311053984575835,
      "grad_norm": 0.0,
      "learning_rate": 1.9646563007007952e-05,
      "loss": 1.7637,
      "step": 44
    },
    {
      "epoch": 0.11568123393316196,
      "grad_norm": 0.0,
      "learning_rate": 1.9624269568515486e-05,
      "loss": 1.7087,
      "step": 45
    },
    {
      "epoch": 0.11825192802056556,
      "grad_norm": 0.0,
      "learning_rate": 1.960130781332748e-05,
      "loss": 1.6562,
      "step": 46
    },
    {
      "epoch": 0.12082262210796915,
      "grad_norm": 0.0,
      "learning_rate": 1.957767933592591e-05,
      "loss": 1.698,
      "step": 47
    },
    {
      "epoch": 0.12339331619537275,
      "grad_norm": 0.0,
      "learning_rate": 1.955338577709046e-05,
      "loss": 1.7444,
      "step": 48
    },
    {
      "epoch": 0.12596401028277635,
      "grad_norm": 0.0,
      "learning_rate": 1.9528428823784567e-05,
      "loss": 1.6743,
      "step": 49
    },
    {
      "epoch": 0.12853470437017994,
      "grad_norm": 0.0,
      "learning_rate": 1.9502810209038302e-05,
      "loss": 1.6741,
      "step": 50
    },
    {
      "epoch": 0.13110539845758354,
      "grad_norm": 0.0,
      "learning_rate": 1.9476531711828027e-05,
      "loss": 1.708,
      "step": 51
    },
    {
      "epoch": 0.13367609254498714,
      "grad_norm": 0.0,
      "learning_rate": 1.9449595156952827e-05,
      "loss": 1.6587,
      "step": 52
    },
    {
      "epoch": 0.13624678663239073,
      "grad_norm": 0.0,
      "learning_rate": 1.9422002414907837e-05,
      "loss": 1.6887,
      "step": 53
    },
    {
      "epoch": 0.13881748071979436,
      "grad_norm": 0.0,
      "learning_rate": 1.9393755401754324e-05,
      "loss": 1.6714,
      "step": 54
    },
    {
      "epoch": 0.14138817480719795,
      "grad_norm": 0.0,
      "learning_rate": 1.936485607898665e-05,
      "loss": 1.7432,
      "step": 55
    },
    {
      "epoch": 0.14395886889460155,
      "grad_norm": 0.0,
      "learning_rate": 1.9335306453396066e-05,
      "loss": 1.6675,
      "step": 56
    },
    {
      "epoch": 0.14652956298200515,
      "grad_norm": 0.0,
      "learning_rate": 1.9305108576931336e-05,
      "loss": 1.6436,
      "step": 57
    },
    {
      "epoch": 0.14910025706940874,
      "grad_norm": 0.0,
      "learning_rate": 1.927426454655627e-05,
      "loss": 1.6853,
      "step": 58
    },
    {
      "epoch": 0.15167095115681234,
      "grad_norm": 0.0,
      "learning_rate": 1.924277650410412e-05,
      "loss": 1.6641,
      "step": 59
    },
    {
      "epoch": 0.15424164524421594,
      "grad_norm": 0.0,
      "learning_rate": 1.9210646636128805e-05,
      "loss": 1.7385,
      "step": 60
    },
    {
      "epoch": 0.15681233933161953,
      "grad_norm": 0.0,
      "learning_rate": 1.9177877173753127e-05,
      "loss": 1.7178,
      "step": 61
    },
    {
      "epoch": 0.15938303341902313,
      "grad_norm": 0.0,
      "learning_rate": 1.91444703925138e-05,
      "loss": 1.6785,
      "step": 62
    },
    {
      "epoch": 0.16195372750642673,
      "grad_norm": 0.0,
      "learning_rate": 1.9110428612203463e-05,
      "loss": 1.6799,
      "step": 63
    },
    {
      "epoch": 0.16452442159383032,
      "grad_norm": 0.0,
      "learning_rate": 1.9075754196709574e-05,
      "loss": 1.7075,
      "step": 64
    },
    {
      "epoch": 0.16709511568123395,
      "grad_norm": 0.0,
      "learning_rate": 1.904044955385026e-05,
      "loss": 1.6621,
      "step": 65
    },
    {
      "epoch": 0.16966580976863754,
      "grad_norm": 0.0,
      "learning_rate": 1.9004517135207127e-05,
      "loss": 1.6492,
      "step": 66
    },
    {
      "epoch": 0.17223650385604114,
      "grad_norm": 0.0,
      "learning_rate": 1.8967959435955027e-05,
      "loss": 1.7297,
      "step": 67
    },
    {
      "epoch": 0.17480719794344474,
      "grad_norm": 0.0,
      "learning_rate": 1.893077899468876e-05,
      "loss": 1.6882,
      "step": 68
    },
    {
      "epoch": 0.17737789203084833,
      "grad_norm": 0.0,
      "learning_rate": 1.889297839324682e-05,
      "loss": 1.6714,
      "step": 69
    },
    {
      "epoch": 0.17994858611825193,
      "grad_norm": 0.0,
      "learning_rate": 1.8854560256532098e-05,
      "loss": 1.6489,
      "step": 70
    },
    {
      "epoch": 0.18251928020565553,
      "grad_norm": 0.0,
      "learning_rate": 1.8815527252329624e-05,
      "loss": 1.6721,
      "step": 71
    },
    {
      "epoch": 0.18508997429305912,
      "grad_norm": 0.0,
      "learning_rate": 1.8775882091121282e-05,
      "loss": 1.6533,
      "step": 72
    },
    {
      "epoch": 0.18766066838046272,
      "grad_norm": 0.0,
      "learning_rate": 1.8735627525897618e-05,
      "loss": 1.6443,
      "step": 73
    },
    {
      "epoch": 0.19023136246786632,
      "grad_norm": 0.0,
      "learning_rate": 1.8694766351966665e-05,
      "loss": 1.6631,
      "step": 74
    },
    {
      "epoch": 0.1928020565552699,
      "grad_norm": 0.0,
      "learning_rate": 1.8653301406759827e-05,
      "loss": 1.6873,
      "step": 75
    },
    {
      "epoch": 0.1953727506426735,
      "grad_norm": 0.0,
      "learning_rate": 1.8611235569634852e-05,
      "loss": 1.7046,
      "step": 76
    },
    {
      "epoch": 0.19794344473007713,
      "grad_norm": 0.0,
      "learning_rate": 1.8568571761675893e-05,
      "loss": 1.7002,
      "step": 77
    },
    {
      "epoch": 0.20051413881748073,
      "grad_norm": 0.0,
      "learning_rate": 1.8525312945490647e-05,
      "loss": 1.698,
      "step": 78
    },
    {
      "epoch": 0.20308483290488433,
      "grad_norm": 0.0,
      "learning_rate": 1.8481462125004647e-05,
      "loss": 1.6765,
      "step": 79
    },
    {
      "epoch": 0.20565552699228792,
      "grad_norm": 0.0,
      "learning_rate": 1.8437022345252666e-05,
      "loss": 1.7185,
      "step": 80
    },
    {
      "epoch": 0.20822622107969152,
      "grad_norm": 0.0,
      "learning_rate": 1.8391996692167242e-05,
      "loss": 1.6653,
      "step": 81
    },
    {
      "epoch": 0.21079691516709512,
      "grad_norm": 0.0,
      "learning_rate": 1.8346388292364438e-05,
      "loss": 1.7129,
      "step": 82
    },
    {
      "epoch": 0.2133676092544987,
      "grad_norm": 0.0,
      "learning_rate": 1.8300200312926674e-05,
      "loss": 1.6709,
      "step": 83
    },
    {
      "epoch": 0.2159383033419023,
      "grad_norm": 0.0,
      "learning_rate": 1.8253435961182844e-05,
      "loss": 1.6597,
      "step": 84
    },
    {
      "epoch": 0.2185089974293059,
      "grad_norm": 0.0,
      "learning_rate": 1.8206098484485563e-05,
      "loss": 1.6812,
      "step": 85
    },
    {
      "epoch": 0.2210796915167095,
      "grad_norm": 0.0,
      "learning_rate": 1.8158191169985696e-05,
      "loss": 1.6792,
      "step": 86
    },
    {
      "epoch": 0.2236503856041131,
      "grad_norm": 0.0,
      "learning_rate": 1.810971734440408e-05,
      "loss": 1.6404,
      "step": 87
    },
    {
      "epoch": 0.2262210796915167,
      "grad_norm": 0.0,
      "learning_rate": 1.806068037380052e-05,
      "loss": 1.6528,
      "step": 88
    },
    {
      "epoch": 0.22879177377892032,
      "grad_norm": 0.0,
      "learning_rate": 1.801108366334004e-05,
      "loss": 1.6775,
      "step": 89
    },
    {
      "epoch": 0.23136246786632392,
      "grad_norm": 0.0,
      "learning_rate": 1.796093065705644e-05,
      "loss": 1.679,
      "step": 90
    },
    {
      "epoch": 0.23393316195372751,
      "grad_norm": 0.0,
      "learning_rate": 1.791022483761312e-05,
      "loss": 1.658,
      "step": 91
    },
    {
      "epoch": 0.2365038560411311,
      "grad_norm": 0.0,
      "learning_rate": 1.7858969726061262e-05,
      "loss": 1.6277,
      "step": 92
    },
    {
      "epoch": 0.2390745501285347,
      "grad_norm": 0.0,
      "learning_rate": 1.7807168881595304e-05,
      "loss": 1.6602,
      "step": 93
    },
    {
      "epoch": 0.2416452442159383,
      "grad_norm": 0.0,
      "learning_rate": 1.7754825901305814e-05,
      "loss": 1.6758,
      "step": 94
    },
    {
      "epoch": 0.2442159383033419,
      "grad_norm": 0.0,
      "learning_rate": 1.7701944419929673e-05,
      "loss": 1.6353,
      "step": 95
    },
    {
      "epoch": 0.2467866323907455,
      "grad_norm": 0.0,
      "learning_rate": 1.7648528109597704e-05,
      "loss": 1.6602,
      "step": 96
    },
    {
      "epoch": 0.2493573264781491,
      "grad_norm": 0.0,
      "learning_rate": 1.7594580679579654e-05,
      "loss": 1.6404,
      "step": 97
    },
    {
      "epoch": 0.2519280205655527,
      "grad_norm": 0.0,
      "learning_rate": 1.7540105876026647e-05,
      "loss": 1.6365,
      "step": 98
    },
    {
      "epoch": 0.2544987146529563,
      "grad_norm": 0.0,
      "learning_rate": 1.7485107481711014e-05,
      "loss": 1.6353,
      "step": 99
    },
    {
      "epoch": 0.2570694087403599,
      "grad_norm": 0.0,
      "learning_rate": 1.7429589315763637e-05,
      "loss": 1.6541,
      "step": 100
    },
    {
      "epoch": 0.2596401028277635,
      "grad_norm": 0.0,
      "learning_rate": 1.737355523340875e-05,
      "loss": 1.6133,
      "step": 101
    },
    {
      "epoch": 0.2622107969151671,
      "grad_norm": 0.0,
      "learning_rate": 1.7317009125696208e-05,
      "loss": 1.6687,
      "step": 102
    },
    {
      "epoch": 0.2647814910025707,
      "grad_norm": 0.0,
      "learning_rate": 1.725995491923131e-05,
      "loss": 1.636,
      "step": 103
    },
    {
      "epoch": 0.26735218508997427,
      "grad_norm": 0.0,
      "learning_rate": 1.7202396575902118e-05,
      "loss": 1.6497,
      "step": 104
    },
    {
      "epoch": 0.2699228791773779,
      "grad_norm": 0.0,
      "learning_rate": 1.714433809260435e-05,
      "loss": 1.6458,
      "step": 105
    },
    {
      "epoch": 0.27249357326478146,
      "grad_norm": 0.0,
      "learning_rate": 1.7085783500963825e-05,
      "loss": 1.624,
      "step": 106
    },
    {
      "epoch": 0.2750642673521851,
      "grad_norm": 0.0,
      "learning_rate": 1.702673686705651e-05,
      "loss": 1.6353,
      "step": 107
    },
    {
      "epoch": 0.2776349614395887,
      "grad_norm": 0.0,
      "learning_rate": 1.6967202291126174e-05,
      "loss": 1.6406,
      "step": 108
    },
    {
      "epoch": 0.2802056555269923,
      "grad_norm": 0.0,
      "learning_rate": 1.690718390729964e-05,
      "loss": 1.6323,
      "step": 109
    },
    {
      "epoch": 0.2827763496143959,
      "grad_norm": 0.0,
      "learning_rate": 1.684668588329973e-05,
      "loss": 1.665,
      "step": 110
    },
    {
      "epoch": 0.2853470437017995,
      "grad_norm": 0.0,
      "learning_rate": 1.6785712420155864e-05,
      "loss": 1.635,
      "step": 111
    },
    {
      "epoch": 0.2879177377892031,
      "grad_norm": 0.0,
      "learning_rate": 1.67242677519123e-05,
      "loss": 1.6335,
      "step": 112
    },
    {
      "epoch": 0.29048843187660667,
      "grad_norm": 0.0,
      "learning_rate": 1.6662356145334158e-05,
      "loss": 1.6846,
      "step": 113
    },
    {
      "epoch": 0.2930591259640103,
      "grad_norm": 0.0,
      "learning_rate": 1.6599981899611103e-05,
      "loss": 1.6353,
      "step": 114
    },
    {
      "epoch": 0.29562982005141386,
      "grad_norm": 0.0,
      "learning_rate": 1.653714934605883e-05,
      "loss": 1.6189,
      "step": 115
    },
    {
      "epoch": 0.2982005141388175,
      "grad_norm": 0.0,
      "learning_rate": 1.647386284781828e-05,
      "loss": 1.7021,
      "step": 116
    },
    {
      "epoch": 0.30077120822622105,
      "grad_norm": 0.0,
      "learning_rate": 1.6410126799552653e-05,
      "loss": 1.6777,
      "step": 117
    },
    {
      "epoch": 0.3033419023136247,
      "grad_norm": 0.0,
      "learning_rate": 1.6345945627142264e-05,
      "loss": 1.6377,
      "step": 118
    },
    {
      "epoch": 0.3059125964010283,
      "grad_norm": 0.0,
      "learning_rate": 1.628132378737718e-05,
      "loss": 1.6616,
      "step": 119
    },
    {
      "epoch": 0.30848329048843187,
      "grad_norm": 0.0,
      "learning_rate": 1.6216265767647756e-05,
      "loss": 1.616,
      "step": 120
    },
    {
      "epoch": 0.3110539845758355,
      "grad_norm": 0.0,
      "learning_rate": 1.615077608563302e-05,
      "loss": 1.6816,
      "step": 121
    },
    {
      "epoch": 0.31362467866323906,
      "grad_norm": 0.0,
      "learning_rate": 1.6084859288986957e-05,
      "loss": 1.6099,
      "step": 122
    },
    {
      "epoch": 0.3161953727506427,
      "grad_norm": 0.0,
      "learning_rate": 1.601851995502272e-05,
      "loss": 1.6274,
      "step": 123
    },
    {
      "epoch": 0.31876606683804626,
      "grad_norm": 0.0,
      "learning_rate": 1.5951762690394788e-05,
      "loss": 1.6663,
      "step": 124
    },
    {
      "epoch": 0.3213367609254499,
      "grad_norm": 0.0,
      "learning_rate": 1.5884592130779056e-05,
      "loss": 1.6494,
      "step": 125
    },
    {
      "epoch": 0.32390745501285345,
      "grad_norm": 0.0,
      "learning_rate": 1.581701294055095e-05,
      "loss": 1.614,
      "step": 126
    },
    {
      "epoch": 0.3264781491002571,
      "grad_norm": 0.0,
      "learning_rate": 1.5749029812461515e-05,
      "loss": 1.6265,
      "step": 127
    },
    {
      "epoch": 0.32904884318766064,
      "grad_norm": 0.0,
      "learning_rate": 1.568064746731156e-05,
      "loss": 1.5913,
      "step": 128
    },
    {
      "epoch": 0.33161953727506427,
      "grad_norm": 0.0,
      "learning_rate": 1.5611870653623826e-05,
      "loss": 1.5984,
      "step": 129
    },
    {
      "epoch": 0.3341902313624679,
      "grad_norm": 0.0,
      "learning_rate": 1.5542704147313257e-05,
      "loss": 1.6343,
      "step": 130
    },
    {
      "epoch": 0.33676092544987146,
      "grad_norm": 0.0,
      "learning_rate": 1.5473152751355353e-05,
      "loss": 1.6355,
      "step": 131
    },
    {
      "epoch": 0.3393316195372751,
      "grad_norm": 0.0,
      "learning_rate": 1.5403221295452647e-05,
      "loss": 1.647,
      "step": 132
    },
    {
      "epoch": 0.34190231362467866,
      "grad_norm": 0.0,
      "learning_rate": 1.5332914635699327e-05,
      "loss": 1.6191,
      "step": 133
    },
    {
      "epoch": 0.3444730077120823,
      "grad_norm": 0.0,
      "learning_rate": 1.5262237654244026e-05,
      "loss": 1.624,
      "step": 134
    },
    {
      "epoch": 0.34704370179948585,
      "grad_norm": 0.0,
      "learning_rate": 1.5191195258950804e-05,
      "loss": 1.6055,
      "step": 135
    },
    {
      "epoch": 0.3496143958868895,
      "grad_norm": 0.0,
      "learning_rate": 1.5119792383058338e-05,
      "loss": 1.6492,
      "step": 136
    },
    {
      "epoch": 0.35218508997429304,
      "grad_norm": 0.0,
      "learning_rate": 1.5048033984837352e-05,
      "loss": 1.6155,
      "step": 137
    },
    {
      "epoch": 0.35475578406169667,
      "grad_norm": 0.0,
      "learning_rate": 1.4975925047246319e-05,
      "loss": 1.6042,
      "step": 138
    },
    {
      "epoch": 0.35732647814910024,
      "grad_norm": 0.0,
      "learning_rate": 1.4903470577585433e-05,
      "loss": 1.6367,
      "step": 139
    },
    {
      "epoch": 0.35989717223650386,
      "grad_norm": 0.0,
      "learning_rate": 1.4830675607148899e-05,
      "loss": 1.5928,
      "step": 140
    },
    {
      "epoch": 0.36246786632390743,
      "grad_norm": 0.0,
      "learning_rate": 1.475754519087557e-05,
      "loss": 1.6526,
      "step": 141
    },
    {
      "epoch": 0.36503856041131105,
      "grad_norm": 0.0,
      "learning_rate": 1.4684084406997903e-05,
      "loss": 1.6362,
      "step": 142
    },
    {
      "epoch": 0.3676092544987147,
      "grad_norm": 0.0,
      "learning_rate": 1.4610298356689341e-05,
      "loss": 1.6201,
      "step": 143
    },
    {
      "epoch": 0.37017994858611825,
      "grad_norm": 0.0,
      "learning_rate": 1.453619216371008e-05,
      "loss": 1.6162,
      "step": 144
    },
    {
      "epoch": 0.37275064267352187,
      "grad_norm": 0.0,
      "learning_rate": 1.446177097405127e-05,
      "loss": 1.6172,
      "step": 145
    },
    {
      "epoch": 0.37532133676092544,
      "grad_norm": 0.0,
      "learning_rate": 1.4387039955577668e-05,
      "loss": 1.6301,
      "step": 146
    },
    {
      "epoch": 0.37789203084832906,
      "grad_norm": 0.0,
      "learning_rate": 1.4312004297668791e-05,
      "loss": 1.6096,
      "step": 147
    },
    {
      "epoch": 0.38046272493573263,
      "grad_norm": 0.0,
      "learning_rate": 1.4236669210858544e-05,
      "loss": 1.6152,
      "step": 148
    },
    {
      "epoch": 0.38303341902313626,
      "grad_norm": 0.0,
      "learning_rate": 1.4161039926473412e-05,
      "loss": 1.6321,
      "step": 149
    },
    {
      "epoch": 0.3856041131105398,
      "grad_norm": 0.0,
      "learning_rate": 1.4085121696269185e-05,
      "loss": 1.5957,
      "step": 150
    },
    {
      "epoch": 0.38817480719794345,
      "grad_norm": 0.0,
      "learning_rate": 1.4008919792066273e-05,
      "loss": 1.6421,
      "step": 151
    },
    {
      "epoch": 0.390745501285347,
      "grad_norm": 0.0,
      "learning_rate": 1.3932439505383628e-05,
      "loss": 1.6189,
      "step": 152
    },
    {
      "epoch": 0.39331619537275064,
      "grad_norm": 0.0,
      "learning_rate": 1.385568614707129e-05,
      "loss": 1.6106,
      "step": 153
    },
    {
      "epoch": 0.39588688946015427,
      "grad_norm": 0.0,
      "learning_rate": 1.3778665046941616e-05,
      "loss": 1.6321,
      "step": 154
    },
    {
      "epoch": 0.39845758354755784,
      "grad_norm": 0.0,
      "learning_rate": 1.3701381553399147e-05,
      "loss": 1.5796,
      "step": 155
    },
    {
      "epoch": 0.40102827763496146,
      "grad_norm": 0.0,
      "learning_rate": 1.3623841033069232e-05,
      "loss": 1.6555,
      "step": 156
    },
    {
      "epoch": 0.40359897172236503,
      "grad_norm": 0.0,
      "learning_rate": 1.3546048870425356e-05,
      "loss": 1.6028,
      "step": 157
    },
    {
      "epoch": 0.40616966580976865,
      "grad_norm": 0.0,
      "learning_rate": 1.3468010467415248e-05,
      "loss": 1.5969,
      "step": 158
    },
    {
      "epoch": 0.4087403598971722,
      "grad_norm": 0.0,
      "learning_rate": 1.3389731243085747e-05,
      "loss": 1.6077,
      "step": 159
    },
    {
      "epoch": 0.41131105398457585,
      "grad_norm": 0.0,
      "learning_rate": 1.3311216633206514e-05,
      "loss": 1.5762,
      "step": 160
    },
    {
      "epoch": 0.4138817480719794,
      "grad_norm": 0.0,
      "learning_rate": 1.3232472089892567e-05,
      "loss": 1.6079,
      "step": 161
    },
    {
      "epoch": 0.41645244215938304,
      "grad_norm": 0.0,
      "learning_rate": 1.315350308122567e-05,
      "loss": 1.5994,
      "step": 162
    },
    {
      "epoch": 0.4190231362467866,
      "grad_norm": 0.0,
      "learning_rate": 1.3074315090874652e-05,
      "loss": 1.5732,
      "step": 163
    },
    {
      "epoch": 0.42159383033419023,
      "grad_norm": 0.0,
      "learning_rate": 1.2994913617714573e-05,
      "loss": 1.5901,
      "step": 164
    },
    {
      "epoch": 0.4241645244215938,
      "grad_norm": 0.0,
      "learning_rate": 1.2915304175444929e-05,
      "loss": 1.6138,
      "step": 165
    },
    {
      "epoch": 0.4267352185089974,
      "grad_norm": 0.0,
      "learning_rate": 1.2835492292206735e-05,
      "loss": 1.5945,
      "step": 166
    },
    {
      "epoch": 0.42930591259640105,
      "grad_norm": 0.0,
      "learning_rate": 1.2755483510198668e-05,
      "loss": 1.6067,
      "step": 167
    },
    {
      "epoch": 0.4318766066838046,
      "grad_norm": 0.0,
      "learning_rate": 1.2675283385292212e-05,
      "loss": 1.5957,
      "step": 168
    },
    {
      "epoch": 0.43444730077120824,
      "grad_norm": 0.0,
      "learning_rate": 1.2594897486645836e-05,
      "loss": 1.6089,
      "step": 169
    },
    {
      "epoch": 0.4370179948586118,
      "grad_norm": 0.0,
      "learning_rate": 1.2514331396318298e-05,
      "loss": 1.6335,
      "step": 170
    },
    {
      "epoch": 0.43958868894601544,
      "grad_norm": 0.0,
      "learning_rate": 1.2433590708880991e-05,
      "loss": 1.6406,
      "step": 171
    },
    {
      "epoch": 0.442159383033419,
      "grad_norm": 0.0,
      "learning_rate": 1.2352681031029476e-05,
      "loss": 1.5759,
      "step": 172
    },
    {
      "epoch": 0.44473007712082263,
      "grad_norm": 0.0,
      "learning_rate": 1.2271607981194132e-05,
      "loss": 1.5955,
      "step": 173
    },
    {
      "epoch": 0.4473007712082262,
      "grad_norm": 0.0,
      "learning_rate": 1.2190377189150016e-05,
      "loss": 1.6069,
      "step": 174
    },
    {
      "epoch": 0.4498714652956298,
      "grad_norm": 0.0,
      "learning_rate": 1.2108994295625924e-05,
      "loss": 1.5796,
      "step": 175
    },
    {
      "epoch": 0.4524421593830334,
      "grad_norm": 0.0,
      "learning_rate": 1.2027464951912703e-05,
      "loss": 1.5952,
      "step": 176
    },
    {
      "epoch": 0.455012853470437,
      "grad_norm": 0.0,
      "learning_rate": 1.1945794819470805e-05,
      "loss": 1.6213,
      "step": 177
    },
    {
      "epoch": 0.45758354755784064,
      "grad_norm": 0.0,
      "learning_rate": 1.1863989569537165e-05,
      "loss": 1.5974,
      "step": 178
    },
    {
      "epoch": 0.4601542416452442,
      "grad_norm": 0.0,
      "learning_rate": 1.1782054882731377e-05,
      "loss": 1.5188,
      "step": 179
    },
    {
      "epoch": 0.46272493573264784,
      "grad_norm": 0.0,
      "learning_rate": 1.1699996448661242e-05,
      "loss": 1.5964,
      "step": 180
    },
    {
      "epoch": 0.4652956298200514,
      "grad_norm": 0.0,
      "learning_rate": 1.161781996552765e-05,
      "loss": 1.5681,
      "step": 181
    },
    {
      "epoch": 0.46786632390745503,
      "grad_norm": 0.0,
      "learning_rate": 1.1535531139728918e-05,
      "loss": 1.5938,
      "step": 182
    },
    {
      "epoch": 0.4704370179948586,
      "grad_norm": 0.0,
      "learning_rate": 1.1453135685464524e-05,
      "loss": 1.574,
      "step": 183
    },
    {
      "epoch": 0.4730077120822622,
      "grad_norm": 0.0,
      "learning_rate": 1.1370639324338313e-05,
      "loss": 1.5872,
      "step": 184
    },
    {
      "epoch": 0.4755784061696658,
      "grad_norm": 0.0,
      "learning_rate": 1.1288047784961166e-05,
      "loss": 1.5806,
      "step": 185
    },
    {
      "epoch": 0.4781491002570694,
      "grad_norm": 0.0,
      "learning_rate": 1.1205366802553231e-05,
      "loss": 1.5542,
      "step": 186
    },
    {
      "epoch": 0.480719794344473,
      "grad_norm": 0.0,
      "learning_rate": 1.1122602118545642e-05,
      "loss": 1.5723,
      "step": 187
    },
    {
      "epoch": 0.4832904884318766,
      "grad_norm": 0.0,
      "learning_rate": 1.1039759480181836e-05,
      "loss": 1.5645,
      "step": 188
    },
    {
      "epoch": 0.48586118251928023,
      "grad_norm": 0.0,
      "learning_rate": 1.0956844640118462e-05,
      "loss": 1.5884,
      "step": 189
    },
    {
      "epoch": 0.4884318766066838,
      "grad_norm": 0.0,
      "learning_rate": 1.0873863356025911e-05,
      "loss": 1.5559,
      "step": 190
    },
    {
      "epoch": 0.4910025706940874,
      "grad_norm": 0.0,
      "learning_rate": 1.0790821390188493e-05,
      "loss": 1.5623,
      "step": 191
    },
    {
      "epoch": 0.493573264781491,
      "grad_norm": 0.0,
      "learning_rate": 1.0707724509104318e-05,
      "loss": 1.5916,
      "step": 192
    },
    {
      "epoch": 0.4961439588688946,
      "grad_norm": 0.0,
      "learning_rate": 1.062457848308484e-05,
      "loss": 1.5696,
      "step": 193
    },
    {
      "epoch": 0.4987146529562982,
      "grad_norm": 0.0,
      "learning_rate": 1.0541389085854177e-05,
      "loss": 1.5913,
      "step": 194
    },
    {
      "epoch": 0.5012853470437018,
      "grad_norm": 0.0,
      "learning_rate": 1.0458162094148185e-05,
      "loss": 1.5439,
      "step": 195
    },
    {
      "epoch": 0.5038560411311054,
      "grad_norm": 0.0,
      "learning_rate": 1.0374903287313307e-05,
      "loss": 1.6013,
      "step": 196
    },
    {
      "epoch": 0.506426735218509,
      "grad_norm": 0.0,
      "learning_rate": 1.029161844690525e-05,
      "loss": 1.5813,
      "step": 197
    },
    {
      "epoch": 0.5089974293059126,
      "grad_norm": 0.0,
      "learning_rate": 1.0208313356287505e-05,
      "loss": 1.5757,
      "step": 198
    },
    {
      "epoch": 0.5115681233933161,
      "grad_norm": 0.0,
      "learning_rate": 1.0124993800229774e-05,
      "loss": 1.5508,
      "step": 199
    },
    {
      "epoch": 0.5141388174807198,
      "grad_norm": 0.0,
      "learning_rate": 1.004166556450623e-05,
      "loss": 1.5774,
      "step": 200
    },
    {
      "epoch": 0.5167095115681234,
      "grad_norm": 0.0,
      "learning_rate": 9.958334435493776e-06,
      "loss": 1.594,
      "step": 201
    },
    {
      "epoch": 0.519280205655527,
      "grad_norm": 0.0,
      "learning_rate": 9.87500619977023e-06,
      "loss": 1.5977,
      "step": 202
    },
    {
      "epoch": 0.5218508997429306,
      "grad_norm": 0.0,
      "learning_rate": 9.791686643712498e-06,
      "loss": 1.5938,
      "step": 203
    },
    {
      "epoch": 0.5244215938303342,
      "grad_norm": 0.0,
      "learning_rate": 9.708381553094754e-06,
      "loss": 1.5371,
      "step": 204
    },
    {
      "epoch": 0.5269922879177378,
      "grad_norm": 0.0,
      "learning_rate": 9.625096712686694e-06,
      "loss": 1.5315,
      "step": 205
    },
    {
      "epoch": 0.5295629820051414,
      "grad_norm": 0.0,
      "learning_rate": 9.541837905851817e-06,
      "loss": 1.5708,
      "step": 206
    },
    {
      "epoch": 0.532133676092545,
      "grad_norm": 0.0,
      "learning_rate": 9.458610914145826e-06,
      "loss": 1.5691,
      "step": 207
    },
    {
      "epoch": 0.5347043701799485,
      "grad_norm": 0.0,
      "learning_rate": 9.375421516915165e-06,
      "loss": 1.5881,
      "step": 208
    },
    {
      "epoch": 0.5372750642673522,
      "grad_norm": 0.0,
      "learning_rate": 9.292275490895685e-06,
      "loss": 1.5732,
      "step": 209
    },
    {
      "epoch": 0.5398457583547558,
      "grad_norm": 0.0,
      "learning_rate": 9.209178609811509e-06,
      "loss": 1.5562,
      "step": 210
    },
    {
      "epoch": 0.5424164524421594,
      "grad_norm": 0.0,
      "learning_rate": 9.126136643974094e-06,
      "loss": 1.5603,
      "step": 211
    },
    {
      "epoch": 0.5449871465295629,
      "grad_norm": 0.0,
      "learning_rate": 9.043155359881538e-06,
      "loss": 1.5352,
      "step": 212
    },
    {
      "epoch": 0.5475578406169666,
      "grad_norm": 0.0,
      "learning_rate": 8.960240519818167e-06,
      "loss": 1.5647,
      "step": 213
    },
    {
      "epoch": 0.5501285347043702,
      "grad_norm": 0.0,
      "learning_rate": 8.877397881454358e-06,
      "loss": 1.5747,
      "step": 214
    },
    {
      "epoch": 0.5526992287917738,
      "grad_norm": 0.0,
      "learning_rate": 8.79463319744677e-06,
      "loss": 1.5586,
      "step": 215
    },
    {
      "epoch": 0.5552699228791774,
      "grad_norm": 0.0,
      "learning_rate": 8.711952215038837e-06,
      "loss": 1.5527,
      "step": 216
    },
    {
      "epoch": 0.5578406169665809,
      "grad_norm": 0.0,
      "learning_rate": 8.629360675661693e-06,
      "loss": 1.5374,
      "step": 217
    },
    {
      "epoch": 0.5604113110539846,
      "grad_norm": 0.0,
      "learning_rate": 8.546864314535478e-06,
      "loss": 1.5647,
      "step": 218
    },
    {
      "epoch": 0.5629820051413882,
      "grad_norm": 0.0,
      "learning_rate": 8.464468860271084e-06,
      "loss": 1.5356,
      "step": 219
    },
    {
      "epoch": 0.5655526992287918,
      "grad_norm": 0.0,
      "learning_rate": 8.382180034472353e-06,
      "loss": 1.5483,
      "step": 220
    },
    {
      "epoch": 0.5681233933161953,
      "grad_norm": 0.0,
      "learning_rate": 8.30000355133876e-06,
      "loss": 1.5386,
      "step": 221
    },
    {
      "epoch": 0.570694087403599,
      "grad_norm": 0.0,
      "learning_rate": 8.217945117268624e-06,
      "loss": 1.5552,
      "step": 222
    },
    {
      "epoch": 0.5732647814910026,
      "grad_norm": 0.0,
      "learning_rate": 8.136010430462837e-06,
      "loss": 1.5635,
      "step": 223
    },
    {
      "epoch": 0.5758354755784062,
      "grad_norm": 0.0,
      "learning_rate": 8.0542051805292e-06,
      "loss": 1.5657,
      "step": 224
    },
    {
      "epoch": 0.5784061696658098,
      "grad_norm": 0.0,
      "learning_rate": 7.9725350480873e-06,
      "loss": 1.5386,
      "step": 225
    },
    {
      "epoch": 0.5809768637532133,
      "grad_norm": 0.0,
      "learning_rate": 7.89100570437408e-06,
      "loss": 1.6018,
      "step": 226
    },
    {
      "epoch": 0.583547557840617,
      "grad_norm": 0.0,
      "learning_rate": 7.809622810849986e-06,
      "loss": 1.5396,
      "step": 227
    },
    {
      "epoch": 0.5861182519280206,
      "grad_norm": 0.0,
      "learning_rate": 7.72839201880587e-06,
      "loss": 1.5474,
      "step": 228
    },
    {
      "epoch": 0.5886889460154242,
      "grad_norm": 0.0,
      "learning_rate": 7.647318968970528e-06,
      "loss": 1.5654,
      "step": 229
    },
    {
      "epoch": 0.5912596401028277,
      "grad_norm": 0.0,
      "learning_rate": 7.566409291119008e-06,
      "loss": 1.5732,
      "step": 230
    },
    {
      "epoch": 0.5938303341902313,
      "grad_norm": 0.0,
      "learning_rate": 7.485668603681706e-06,
      "loss": 1.5779,
      "step": 231
    },
    {
      "epoch": 0.596401028277635,
      "grad_norm": 0.0,
      "learning_rate": 7.405102513354166e-06,
      "loss": 1.5449,
      "step": 232
    },
    {
      "epoch": 0.5989717223650386,
      "grad_norm": 0.0,
      "learning_rate": 7.324716614707794e-06,
      "loss": 1.5408,
      "step": 233
    },
    {
      "epoch": 0.6015424164524421,
      "grad_norm": 0.0,
      "learning_rate": 7.2445164898013345e-06,
      "loss": 1.5403,
      "step": 234
    },
    {
      "epoch": 0.6041131105398457,
      "grad_norm": 0.0,
      "learning_rate": 7.1645077077932666e-06,
      "loss": 1.5159,
      "step": 235
    },
    {
      "epoch": 0.6066838046272494,
      "grad_norm": 0.0,
      "learning_rate": 7.084695824555074e-06,
      "loss": 1.5557,
      "step": 236
    },
    {
      "epoch": 0.609254498714653,
      "grad_norm": 0.0,
      "learning_rate": 7.005086382285426e-06,
      "loss": 1.5625,
      "step": 237
    },
    {
      "epoch": 0.6118251928020566,
      "grad_norm": 0.0,
      "learning_rate": 6.925684909125354e-06,
      "loss": 1.552,
      "step": 238
    },
    {
      "epoch": 0.6143958868894601,
      "grad_norm": 0.0,
      "learning_rate": 6.84649691877433e-06,
      "loss": 1.5488,
      "step": 239
    },
    {
      "epoch": 0.6169665809768637,
      "grad_norm": 0.0,
      "learning_rate": 6.767527910107437e-06,
      "loss": 1.5181,
      "step": 240
    },
    {
      "epoch": 0.6195372750642674,
      "grad_norm": 0.0,
      "learning_rate": 6.688783366793488e-06,
      "loss": 1.5403,
      "step": 241
    },
    {
      "epoch": 0.622107969151671,
      "grad_norm": 0.0,
      "learning_rate": 6.610268756914254e-06,
      "loss": 1.5662,
      "step": 242
    },
    {
      "epoch": 0.6246786632390745,
      "grad_norm": 0.0,
      "learning_rate": 6.5319895325847535e-06,
      "loss": 1.5222,
      "step": 243
    },
    {
      "epoch": 0.6272493573264781,
      "grad_norm": 0.0,
      "learning_rate": 6.453951129574644e-06,
      "loss": 1.5439,
      "step": 244
    },
    {
      "epoch": 0.6298200514138818,
      "grad_norm": 0.0,
      "learning_rate": 6.3761589669307745e-06,
      "loss": 1.5312,
      "step": 245
    },
    {
      "epoch": 0.6323907455012854,
      "grad_norm": 0.0,
      "learning_rate": 6.298618446600856e-06,
      "loss": 1.5383,
      "step": 246
    },
    {
      "epoch": 0.6349614395886889,
      "grad_norm": 0.0,
      "learning_rate": 6.221334953058389e-06,
      "loss": 1.5393,
      "step": 247
    },
    {
      "epoch": 0.6375321336760925,
      "grad_norm": 0.0,
      "learning_rate": 6.144313852928712e-06,
      "loss": 1.5247,
      "step": 248
    },
    {
      "epoch": 0.6401028277634961,
      "grad_norm": 0.0,
      "learning_rate": 6.067560494616374e-06,
      "loss": 1.5454,
      "step": 249
    },
    {
      "epoch": 0.6426735218508998,
      "grad_norm": 0.0,
      "learning_rate": 5.9910802079337285e-06,
      "loss": 1.5215,
      "step": 250
    },
    {
      "epoch": 0.6452442159383034,
      "grad_norm": 0.0,
      "learning_rate": 5.9148783037308154e-06,
      "loss": 1.5427,
      "step": 251
    },
    {
      "epoch": 0.6478149100257069,
      "grad_norm": 0.0,
      "learning_rate": 5.838960073526589e-06,
      "loss": 1.5427,
      "step": 252
    },
    {
      "epoch": 0.6503856041131105,
      "grad_norm": 0.0,
      "learning_rate": 5.763330789141457e-06,
      "loss": 1.5552,
      "step": 253
    },
    {
      "epoch": 0.6529562982005142,
      "grad_norm": 0.0,
      "learning_rate": 5.687995702331211e-06,
      "loss": 1.5388,
      "step": 254
    },
    {
      "epoch": 0.6555269922879178,
      "grad_norm": 0.0,
      "learning_rate": 5.612960044422335e-06,
      "loss": 1.5854,
      "step": 255
    },
    {
      "epoch": 0.6580976863753213,
      "grad_norm": 0.0,
      "learning_rate": 5.538229025948729e-06,
      "loss": 1.5588,
      "step": 256
    },
    {
      "epoch": 0.6606683804627249,
      "grad_norm": 0.0,
      "learning_rate": 5.463807836289921e-06,
      "loss": 1.5217,
      "step": 257
    },
    {
      "epoch": 0.6632390745501285,
      "grad_norm": 0.0,
      "learning_rate": 5.389701643310661e-06,
      "loss": 1.5066,
      "step": 258
    },
    {
      "epoch": 0.6658097686375322,
      "grad_norm": 0.0,
      "learning_rate": 5.3159155930021e-06,
      "loss": 1.5327,
      "step": 259
    },
    {
      "epoch": 0.6683804627249358,
      "grad_norm": 0.0,
      "learning_rate": 5.2424548091244334e-06,
      "loss": 1.5522,
      "step": 260
    },
    {
      "epoch": 0.6709511568123393,
      "grad_norm": 0.0,
      "learning_rate": 5.169324392851105e-06,
      "loss": 1.543,
      "step": 261
    },
    {
      "epoch": 0.6735218508997429,
      "grad_norm": 0.0,
      "learning_rate": 5.096529422414571e-06,
      "loss": 1.5483,
      "step": 262
    },
    {
      "epoch": 0.6760925449871465,
      "grad_norm": 0.0,
      "learning_rate": 5.0240749527536845e-06,
      "loss": 1.5234,
      "step": 263
    },
    {
      "epoch": 0.6786632390745502,
      "grad_norm": 0.0,
      "learning_rate": 4.951966015162652e-06,
      "loss": 1.5315,
      "step": 264
    },
    {
      "epoch": 0.6812339331619537,
      "grad_norm": 0.0,
      "learning_rate": 4.880207616941663e-06,
      "loss": 1.5193,
      "step": 265
    },
    {
      "epoch": 0.6838046272493573,
      "grad_norm": 0.0,
      "learning_rate": 4.8088047410492e-06,
      "loss": 1.5586,
      "step": 266
    },
    {
      "epoch": 0.6863753213367609,
      "grad_norm": 0.0,
      "learning_rate": 4.737762345755975e-06,
      "loss": 1.481,
      "step": 267
    },
    {
      "epoch": 0.6889460154241646,
      "grad_norm": 0.0,
      "learning_rate": 4.667085364300678e-06,
      "loss": 1.5869,
      "step": 268
    },
    {
      "epoch": 0.6915167095115681,
      "grad_norm": 0.0,
      "learning_rate": 4.596778704547359e-06,
      "loss": 1.5366,
      "step": 269
    },
    {
      "epoch": 0.6940874035989717,
      "grad_norm": 0.0,
      "learning_rate": 4.526847248644652e-06,
      "loss": 1.5007,
      "step": 270
    },
    {
      "epoch": 0.6966580976863753,
      "grad_norm": 0.0,
      "learning_rate": 4.457295852686746e-06,
      "loss": 1.5352,
      "step": 271
    },
    {
      "epoch": 0.699228791773779,
      "grad_norm": 0.0,
      "learning_rate": 4.388129346376177e-06,
      "loss": 1.5447,
      "step": 272
    },
    {
      "epoch": 0.7017994858611826,
      "grad_norm": 0.0,
      "learning_rate": 4.319352532688444e-06,
      "loss": 1.5701,
      "step": 273
    },
    {
      "epoch": 0.7043701799485861,
      "grad_norm": 0.0,
      "learning_rate": 4.250970187538484e-06,
      "loss": 1.5,
      "step": 274
    },
    {
      "epoch": 0.7069408740359897,
      "grad_norm": 0.0,
      "learning_rate": 4.182987059449056e-06,
      "loss": 1.5513,
      "step": 275
    },
    {
      "epoch": 0.7095115681233933,
      "grad_norm": 0.0,
      "learning_rate": 4.115407869220948e-06,
      "loss": 1.5007,
      "step": 276
    },
    {
      "epoch": 0.712082262210797,
      "grad_norm": 0.0,
      "learning_rate": 4.048237309605216e-06,
      "loss": 1.5398,
      "step": 277
    },
    {
      "epoch": 0.7146529562982005,
      "grad_norm": 0.0,
      "learning_rate": 3.981480044977284e-06,
      "loss": 1.5476,
      "step": 278
    },
    {
      "epoch": 0.7172236503856041,
      "grad_norm": 0.0,
      "learning_rate": 3.915140711013044e-06,
      "loss": 1.5015,
      "step": 279
    },
    {
      "epoch": 0.7197943444730077,
      "grad_norm": 0.0,
      "learning_rate": 3.849223914366981e-06,
      "loss": 1.5405,
      "step": 280
    },
    {
      "epoch": 0.7223650385604113,
      "grad_norm": 0.0,
      "learning_rate": 3.7837342323522454e-06,
      "loss": 1.5413,
      "step": 281
    },
    {
      "epoch": 0.7249357326478149,
      "grad_norm": 0.0,
      "learning_rate": 3.7186762126228227e-06,
      "loss": 1.5874,
      "step": 282
    },
    {
      "epoch": 0.7275064267352185,
      "grad_norm": 0.0,
      "learning_rate": 3.654054372857738e-06,
      "loss": 1.5122,
      "step": 283
    },
    {
      "epoch": 0.7300771208226221,
      "grad_norm": 0.0,
      "learning_rate": 3.5898732004473523e-06,
      "loss": 1.55,
      "step": 284
    },
    {
      "epoch": 0.7326478149100257,
      "grad_norm": 0.0,
      "learning_rate": 3.5261371521817247e-06,
      "loss": 1.5337,
      "step": 285
    },
    {
      "epoch": 0.7352185089974294,
      "grad_norm": 0.0,
      "learning_rate": 3.462850653941171e-06,
      "loss": 1.5159,
      "step": 286
    },
    {
      "epoch": 0.7377892030848329,
      "grad_norm": 0.0,
      "learning_rate": 3.4000181003889e-06,
      "loss": 1.5139,
      "step": 287
    },
    {
      "epoch": 0.7403598971722365,
      "grad_norm": 0.0,
      "learning_rate": 3.337643854665843e-06,
      "loss": 1.499,
      "step": 288
    },
    {
      "epoch": 0.7429305912596401,
      "grad_norm": 0.0,
      "learning_rate": 3.2757322480876996e-06,
      "loss": 1.5149,
      "step": 289
    },
    {
      "epoch": 0.7455012853470437,
      "grad_norm": 0.0,
      "learning_rate": 3.2142875798441376e-06,
      "loss": 1.5098,
      "step": 290
    },
    {
      "epoch": 0.7480719794344473,
      "grad_norm": 0.0,
      "learning_rate": 3.15331411670027e-06,
      "loss": 1.5217,
      "step": 291
    },
    {
      "epoch": 0.7506426735218509,
      "grad_norm": 0.0,
      "learning_rate": 3.092816092700366e-06,
      "loss": 1.5017,
      "step": 292
    },
    {
      "epoch": 0.7532133676092545,
      "grad_norm": 0.0,
      "learning_rate": 3.032797708873828e-06,
      "loss": 1.5398,
      "step": 293
    },
    {
      "epoch": 0.7557840616966581,
      "grad_norm": 0.0,
      "learning_rate": 2.97326313294349e-06,
      "loss": 1.4983,
      "step": 294
    },
    {
      "epoch": 0.7583547557840618,
      "grad_norm": 0.0,
      "learning_rate": 2.914216499036178e-06,
      "loss": 1.5271,
      "step": 295
    },
    {
      "epoch": 0.7609254498714653,
      "grad_norm": 0.0,
      "learning_rate": 2.855661907395655e-06,
      "loss": 1.5286,
      "step": 296
    },
    {
      "epoch": 0.7634961439588689,
      "grad_norm": 0.0,
      "learning_rate": 2.7976034240978834e-06,
      "loss": 1.4954,
      "step": 297
    },
    {
      "epoch": 0.7660668380462725,
      "grad_norm": 0.0,
      "learning_rate": 2.740045080768694e-06,
      "loss": 1.4653,
      "step": 298
    },
    {
      "epoch": 0.7686375321336761,
      "grad_norm": 0.0,
      "learning_rate": 2.6829908743037936e-06,
      "loss": 1.5271,
      "step": 299
    },
    {
      "epoch": 0.7712082262210797,
      "grad_norm": 0.0,
      "learning_rate": 2.626444766591253e-06,
      "loss": 1.48,
      "step": 300
    },
    {
      "epoch": 0.7737789203084833,
      "grad_norm": 0.0,
      "learning_rate": 2.570410684236365e-06,
      "loss": 1.5093,
      "step": 301
    },
    {
      "epoch": 0.7763496143958869,
      "grad_norm": 0.0,
      "learning_rate": 2.514892518288988e-06,
      "loss": 1.531,
      "step": 302
    },
    {
      "epoch": 0.7789203084832905,
      "grad_norm": 0.0,
      "learning_rate": 2.4598941239733555e-06,
      "loss": 1.4795,
      "step": 303
    },
    {
      "epoch": 0.781491002570694,
      "grad_norm": 0.0,
      "learning_rate": 2.4054193204203457e-06,
      "loss": 1.5056,
      "step": 304
    },
    {
      "epoch": 0.7840616966580977,
      "grad_norm": 0.0,
      "learning_rate": 2.3514718904022993e-06,
      "loss": 1.4841,
      "step": 305
    },
    {
      "epoch": 0.7866323907455013,
      "grad_norm": 0.0,
      "learning_rate": 2.2980555800703273e-06,
      "loss": 1.5337,
      "step": 306
    },
    {
      "epoch": 0.7892030848329049,
      "grad_norm": 0.0,
      "learning_rate": 2.2451740986941905e-06,
      "loss": 1.5212,
      "step": 307
    },
    {
      "epoch": 0.7917737789203085,
      "grad_norm": 0.0,
      "learning_rate": 2.1928311184046967e-06,
      "loss": 1.5308,
      "step": 308
    },
    {
      "epoch": 0.794344473007712,
      "grad_norm": 0.0,
      "learning_rate": 2.1410302739387424e-06,
      "loss": 1.5159,
      "step": 309
    },
    {
      "epoch": 0.7969151670951157,
      "grad_norm": 0.0,
      "learning_rate": 2.0897751623868833e-06,
      "loss": 1.5349,
      "step": 310
    },
    {
      "epoch": 0.7994858611825193,
      "grad_norm": 0.0,
      "learning_rate": 2.0390693429435626e-06,
      "loss": 1.5029,
      "step": 311
    },
    {
      "epoch": 0.8020565552699229,
      "grad_norm": 0.0,
      "learning_rate": 1.9889163366599607e-06,
      "loss": 1.519,
      "step": 312
    },
    {
      "epoch": 0.8046272493573264,
      "grad_norm": 0.0,
      "learning_rate": 1.939319626199483e-06,
      "loss": 1.5054,
      "step": 313
    },
    {
      "epoch": 0.8071979434447301,
      "grad_norm": 0.0,
      "learning_rate": 1.890282655595922e-06,
      "loss": 1.4736,
      "step": 314
    },
    {
      "epoch": 0.8097686375321337,
      "grad_norm": 0.0,
      "learning_rate": 1.8418088300143044e-06,
      "loss": 1.5242,
      "step": 315
    },
    {
      "epoch": 0.8123393316195373,
      "grad_norm": 0.0,
      "learning_rate": 1.7939015155144378e-06,
      "loss": 1.5208,
      "step": 316
    },
    {
      "epoch": 0.8149100257069408,
      "grad_norm": 0.0,
      "learning_rate": 1.7465640388171589e-06,
      "loss": 1.5332,
      "step": 317
    },
    {
      "epoch": 0.8174807197943444,
      "grad_norm": 0.0,
      "learning_rate": 1.6997996870733268e-06,
      "loss": 1.4978,
      "step": 318
    },
    {
      "epoch": 0.8200514138817481,
      "grad_norm": 0.0,
      "learning_rate": 1.6536117076355652e-06,
      "loss": 1.4961,
      "step": 319
    },
    {
      "epoch": 0.8226221079691517,
      "grad_norm": 0.0,
      "learning_rate": 1.6080033078327585e-06,
      "loss": 1.5559,
      "step": 320
    },
    {
      "epoch": 0.8251928020565553,
      "grad_norm": 0.0,
      "learning_rate": 1.5629776547473397e-06,
      "loss": 1.5435,
      "step": 321
    },
    {
      "epoch": 0.8277634961439588,
      "grad_norm": 0.0,
      "learning_rate": 1.5185378749953538e-06,
      "loss": 1.4744,
      "step": 322
    },
    {
      "epoch": 0.8303341902313625,
      "grad_norm": 0.0,
      "learning_rate": 1.4746870545093528e-06,
      "loss": 1.4885,
      "step": 323
    },
    {
      "epoch": 0.8329048843187661,
      "grad_norm": 0.0,
      "learning_rate": 1.4314282383241097e-06,
      "loss": 1.5088,
      "step": 324
    },
    {
      "epoch": 0.8354755784061697,
      "grad_norm": 0.0,
      "learning_rate": 1.388764430365147e-06,
      "loss": 1.4971,
      "step": 325
    },
    {
      "epoch": 0.8380462724935732,
      "grad_norm": 0.0,
      "learning_rate": 1.3466985932401743e-06,
      "loss": 1.5269,
      "step": 326
    },
    {
      "epoch": 0.8406169665809768,
      "grad_norm": 0.0,
      "learning_rate": 1.3052336480333372e-06,
      "loss": 1.5088,
      "step": 327
    },
    {
      "epoch": 0.8431876606683805,
      "grad_norm": 0.0,
      "learning_rate": 1.2643724741023845e-06,
      "loss": 1.5046,
      "step": 328
    },
    {
      "epoch": 0.8457583547557841,
      "grad_norm": 0.0,
      "learning_rate": 1.2241179088787192e-06,
      "loss": 1.5217,
      "step": 329
    },
    {
      "epoch": 0.8483290488431876,
      "grad_norm": 0.0,
      "learning_rate": 1.1844727476703776e-06,
      "loss": 1.4951,
      "step": 330
    },
    {
      "epoch": 0.8508997429305912,
      "grad_norm": 0.0,
      "learning_rate": 1.1454397434679022e-06,
      "loss": 1.4941,
      "step": 331
    },
    {
      "epoch": 0.8534704370179949,
      "grad_norm": 0.0,
      "learning_rate": 1.1070216067531825e-06,
      "loss": 1.5122,
      "step": 332
    },
    {
      "epoch": 0.8560411311053985,
      "grad_norm": 0.0,
      "learning_rate": 1.0692210053112451e-06,
      "loss": 1.5427,
      "step": 333
    },
    {
      "epoch": 0.8586118251928021,
      "grad_norm": 0.0,
      "learning_rate": 1.032040564044975e-06,
      "loss": 1.5278,
      "step": 334
    },
    {
      "epoch": 0.8611825192802056,
      "grad_norm": 0.0,
      "learning_rate": 9.954828647928727e-07,
      "loss": 1.4768,
      "step": 335
    },
    {
      "epoch": 0.8637532133676092,
      "grad_norm": 0.0,
      "learning_rate": 9.595504461497441e-07,
      "loss": 1.5066,
      "step": 336
    },
    {
      "epoch": 0.8663239074550129,
      "grad_norm": 0.0,
      "learning_rate": 9.242458032904311e-07,
      "loss": 1.4871,
      "step": 337
    },
    {
      "epoch": 0.8688946015424165,
      "grad_norm": 0.0,
      "learning_rate": 8.895713877965373e-07,
      "loss": 1.5212,
      "step": 338
    },
    {
      "epoch": 0.87146529562982,
      "grad_norm": 0.0,
      "learning_rate": 8.555296074861996e-07,
      "loss": 1.4919,
      "step": 339
    },
    {
      "epoch": 0.8740359897172236,
      "grad_norm": 0.0,
      "learning_rate": 8.22122826246875e-07,
      "loss": 1.5476,
      "step": 340
    },
    {
      "epoch": 0.8766066838046273,
      "grad_norm": 0.0,
      "learning_rate": 7.89353363871197e-07,
      "loss": 1.5142,
      "step": 341
    },
    {
      "epoch": 0.8791773778920309,
      "grad_norm": 0.0,
      "learning_rate": 7.572234958958846e-07,
      "loss": 1.5332,
      "step": 342
    },
    {
      "epoch": 0.8817480719794345,
      "grad_norm": 0.0,
      "learning_rate": 7.2573545344373e-07,
      "loss": 1.4924,
      "step": 343
    },
    {
      "epoch": 0.884318766066838,
      "grad_norm": 0.0,
      "learning_rate": 6.948914230686688e-07,
      "loss": 1.5181,
      "step": 344
    },
    {
      "epoch": 0.8868894601542416,
      "grad_norm": 0.0,
      "learning_rate": 6.646935466039373e-07,
      "loss": 1.5137,
      "step": 345
    },
    {
      "epoch": 0.8894601542416453,
      "grad_norm": 0.0,
      "learning_rate": 6.351439210133492e-07,
      "loss": 1.5056,
      "step": 346
    },
    {
      "epoch": 0.8920308483290489,
      "grad_norm": 0.0,
      "learning_rate": 6.062445982456777e-07,
      "loss": 1.4688,
      "step": 347
    },
    {
      "epoch": 0.8946015424164524,
      "grad_norm": 0.0,
      "learning_rate": 5.77997585092166e-07,
      "loss": 1.5146,
      "step": 348
    },
    {
      "epoch": 0.897172236503856,
      "grad_norm": 0.0,
      "learning_rate": 5.504048430471753e-07,
      "loss": 1.4695,
      "step": 349
    },
    {
      "epoch": 0.8997429305912596,
      "grad_norm": 0.0,
      "learning_rate": 5.234682881719766e-07,
      "loss": 1.5129,
      "step": 350
    },
    {
      "epoch": 0.9023136246786633,
      "grad_norm": 0.0,
      "learning_rate": 4.971897909616985e-07,
      "loss": 1.5061,
      "step": 351
    },
    {
      "epoch": 0.9048843187660668,
      "grad_norm": 0.0,
      "learning_rate": 4.715711762154362e-07,
      "loss": 1.4722,
      "step": 352
    },
    {
      "epoch": 0.9074550128534704,
      "grad_norm": 0.0,
      "learning_rate": 4.4661422290954495e-07,
      "loss": 1.5056,
      "step": 353
    },
    {
      "epoch": 0.910025706940874,
      "grad_norm": 0.0,
      "learning_rate": 4.2232066407409067e-07,
      "loss": 1.5017,
      "step": 354
    },
    {
      "epoch": 0.9125964010282777,
      "grad_norm": 0.0,
      "learning_rate": 3.986921866725202e-07,
      "loss": 1.5393,
      "step": 355
    },
    {
      "epoch": 0.9151670951156813,
      "grad_norm": 0.0,
      "learning_rate": 3.7573043148451673e-07,
      "loss": 1.5034,
      "step": 356
    },
    {
      "epoch": 0.9177377892030848,
      "grad_norm": 0.0,
      "learning_rate": 3.5343699299205003e-07,
      "loss": 1.5139,
      "step": 357
    },
    {
      "epoch": 0.9203084832904884,
      "grad_norm": 0.0,
      "learning_rate": 3.3181341926867283e-07,
      "loss": 1.4788,
      "step": 358
    },
    {
      "epoch": 0.922879177377892,
      "grad_norm": 0.0,
      "learning_rate": 3.1086121187200667e-07,
      "loss": 1.4746,
      "step": 359
    },
    {
      "epoch": 0.9254498714652957,
      "grad_norm": 0.0,
      "learning_rate": 2.905818257394799e-07,
      "loss": 1.5112,
      "step": 360
    },
    {
      "epoch": 0.9280205655526992,
      "grad_norm": 0.0,
      "learning_rate": 2.7097666908729283e-07,
      "loss": 1.5071,
      "step": 361
    },
    {
      "epoch": 0.9305912596401028,
      "grad_norm": 0.0,
      "learning_rate": 2.520471033126326e-07,
      "loss": 1.4773,
      "step": 362
    },
    {
      "epoch": 0.9331619537275064,
      "grad_norm": 0.0,
      "learning_rate": 2.3379444289913344e-07,
      "loss": 1.5146,
      "step": 363
    },
    {
      "epoch": 0.9357326478149101,
      "grad_norm": 0.0,
      "learning_rate": 2.1621995532559947e-07,
      "loss": 1.4978,
      "step": 364
    },
    {
      "epoch": 0.9383033419023136,
      "grad_norm": 0.0,
      "learning_rate": 1.9932486097799408e-07,
      "loss": 1.5183,
      "step": 365
    },
    {
      "epoch": 0.9408740359897172,
      "grad_norm": 0.0,
      "learning_rate": 1.8311033306468552e-07,
      "loss": 1.4761,
      "step": 366
    },
    {
      "epoch": 0.9434447300771208,
      "grad_norm": 0.0,
      "learning_rate": 1.6757749753498865e-07,
      "loss": 1.509,
      "step": 367
    },
    {
      "epoch": 0.9460154241645244,
      "grad_norm": 0.0,
      "learning_rate": 1.5272743300097316e-07,
      "loss": 1.5095,
      "step": 368
    },
    {
      "epoch": 0.9485861182519281,
      "grad_norm": 0.0,
      "learning_rate": 1.3856117066256225e-07,
      "loss": 1.5361,
      "step": 369
    },
    {
      "epoch": 0.9511568123393316,
      "grad_norm": 0.0,
      "learning_rate": 1.2507969423593225e-07,
      "loss": 1.5051,
      "step": 370
    },
    {
      "epoch": 0.9537275064267352,
      "grad_norm": 0.0,
      "learning_rate": 1.1228393988519381e-07,
      "loss": 1.5532,
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.9562982005141388, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.0017479615738957e-07, | |
| "loss": 1.553, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.9588688946015425, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.875310392079118e-08, | |
| "loss": 1.5125, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.961439588688946, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.801965630651165e-08, | |
| "loss": 1.4321, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.9640102827763496, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.797519865342161e-08, | |
| "loss": 1.5005, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.9665809768637532, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.862042845640403e-08, | |
| "loss": 1.4973, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.9691516709511568, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.9955995317908514e-08, | |
| "loss": 1.5449, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.9717223650385605, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.198250090284961e-08, | |
| "loss": 1.4795, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.974293059125964, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.47004988968247e-08, | |
| "loss": 1.5508, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.9768637532133676, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.8110494967664713e-08, | |
| "loss": 1.5095, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9794344473007712, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.221294673032004e-08, | |
| "loss": 1.5146, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.9820051413881749, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.7008263715085904e-08, | |
| "loss": 1.5112, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.9845758354755784, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.24968073391607e-08, | |
| "loss": 1.5144, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.987146529562982, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.678890881552715e-09, | |
| "loss": 1.5459, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.9897172236503856, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.554779461323101e-09, | |
| "loss": 1.4885, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.9922879177377892, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.1246900191761463e-09, | |
| "loss": 1.4919, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.9948586118251928, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.3887913023946652e-09, | |
| "loss": 1.5034, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.9974293059125964, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.4720385312492223e-10, | |
| "loss": 1.4812, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 1.3696, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 389, | |
| "total_flos": 1.4102482311698186e+18, | |
| "train_loss": 1.5950692380302056, | |
| "train_runtime": 5789.3639, | |
| "train_samples_per_second": 17.167, | |
| "train_steps_per_second": 0.067 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 389, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 3000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4102482311698186e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |