| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 37.127628326416016, | |
| "learning_rate": 1e-05, | |
| "loss": 15.0452, | |
| "mean_token_accuracy": 0.41070467978715897, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 31.609861373901367, | |
| "learning_rate": 2e-05, | |
| "loss": 14.4012, | |
| "mean_token_accuracy": 0.4362717792391777, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 28.683244705200195, | |
| "learning_rate": 3e-05, | |
| "loss": 13.963, | |
| "mean_token_accuracy": 0.4400983825325966, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 22.16831398010254, | |
| "learning_rate": 4e-05, | |
| "loss": 13.3495, | |
| "mean_token_accuracy": 0.4725849777460098, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 19.798992156982422, | |
| "learning_rate": 5e-05, | |
| "loss": 11.6453, | |
| "mean_token_accuracy": 0.5295183658599854, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 17.516647338867188, | |
| "learning_rate": 4.98989898989899e-05, | |
| "loss": 11.3724, | |
| "mean_token_accuracy": 0.5440531671047211, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 18.01726531982422, | |
| "learning_rate": 4.97979797979798e-05, | |
| "loss": 10.463, | |
| "mean_token_accuracy": 0.5659109503030777, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 16.347856521606445, | |
| "learning_rate": 4.9696969696969694e-05, | |
| "loss": 9.83, | |
| "mean_token_accuracy": 0.5818765759468079, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 17.102680206298828, | |
| "learning_rate": 4.9595959595959594e-05, | |
| "loss": 9.21, | |
| "mean_token_accuracy": 0.6048124134540558, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 13.707213401794434, | |
| "learning_rate": 4.94949494949495e-05, | |
| "loss": 9.0634, | |
| "mean_token_accuracy": 0.6290689557790756, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 12.317888259887695, | |
| "learning_rate": 4.93939393939394e-05, | |
| "loss": 8.7963, | |
| "mean_token_accuracy": 0.6293386816978455, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 12.71199893951416, | |
| "learning_rate": 4.92929292929293e-05, | |
| "loss": 7.9924, | |
| "mean_token_accuracy": 0.6720790416002274, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 11.422212600708008, | |
| "learning_rate": 4.919191919191919e-05, | |
| "loss": 8.0788, | |
| "mean_token_accuracy": 0.6582284867763519, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 10.79257869720459, | |
| "learning_rate": 4.909090909090909e-05, | |
| "loss": 7.5785, | |
| "mean_token_accuracy": 0.673900306224823, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 10.639641761779785, | |
| "learning_rate": 4.898989898989899e-05, | |
| "loss": 7.3491, | |
| "mean_token_accuracy": 0.6823591589927673, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 11.140637397766113, | |
| "learning_rate": 4.888888888888889e-05, | |
| "loss": 7.6078, | |
| "mean_token_accuracy": 0.6652624905109406, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 10.391518592834473, | |
| "learning_rate": 4.878787878787879e-05, | |
| "loss": 7.1951, | |
| "mean_token_accuracy": 0.66854228079319, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 10.393790245056152, | |
| "learning_rate": 4.868686868686869e-05, | |
| "loss": 7.1549, | |
| "mean_token_accuracy": 0.6921204030513763, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 9.204154014587402, | |
| "learning_rate": 4.858585858585859e-05, | |
| "loss": 7.3606, | |
| "mean_token_accuracy": 0.6926577091217041, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 10.39026927947998, | |
| "learning_rate": 4.848484848484849e-05, | |
| "loss": 7.0433, | |
| "mean_token_accuracy": 0.6972606927156448, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 10.690828323364258, | |
| "learning_rate": 4.838383838383839e-05, | |
| "loss": 7.0197, | |
| "mean_token_accuracy": 0.6896309554576874, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 10.476662635803223, | |
| "learning_rate": 4.828282828282829e-05, | |
| "loss": 6.2416, | |
| "mean_token_accuracy": 0.7330080419778824, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 9.495307922363281, | |
| "learning_rate": 4.8181818181818186e-05, | |
| "loss": 6.7721, | |
| "mean_token_accuracy": 0.6969246119260788, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 9.304316520690918, | |
| "learning_rate": 4.808080808080808e-05, | |
| "loss": 6.4576, | |
| "mean_token_accuracy": 0.7197146117687225, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 9.24569034576416, | |
| "learning_rate": 4.797979797979798e-05, | |
| "loss": 5.9051, | |
| "mean_token_accuracy": 0.7319369614124298, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 8.99447250366211, | |
| "learning_rate": 4.787878787878788e-05, | |
| "loss": 6.113, | |
| "mean_token_accuracy": 0.7273330986499786, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 9.245104789733887, | |
| "learning_rate": 4.7777777777777784e-05, | |
| "loss": 6.5231, | |
| "mean_token_accuracy": 0.7232353389263153, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 9.193451881408691, | |
| "learning_rate": 4.7676767676767684e-05, | |
| "loss": 6.0893, | |
| "mean_token_accuracy": 0.7589968591928482, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 9.738916397094727, | |
| "learning_rate": 4.7575757575757576e-05, | |
| "loss": 6.2512, | |
| "mean_token_accuracy": 0.7394937723875046, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 9.301070213317871, | |
| "learning_rate": 4.7474747474747476e-05, | |
| "loss": 6.4148, | |
| "mean_token_accuracy": 0.7273970693349838, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 9.732291221618652, | |
| "learning_rate": 4.7373737373737375e-05, | |
| "loss": 6.4581, | |
| "mean_token_accuracy": 0.7421613037586212, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 9.543816566467285, | |
| "learning_rate": 4.7272727272727275e-05, | |
| "loss": 5.7812, | |
| "mean_token_accuracy": 0.7518225610256195, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 8.442834854125977, | |
| "learning_rate": 4.7171717171717174e-05, | |
| "loss": 5.7968, | |
| "mean_token_accuracy": 0.7630135715007782, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 7.911731243133545, | |
| "learning_rate": 4.7070707070707074e-05, | |
| "loss": 5.9594, | |
| "mean_token_accuracy": 0.7520367801189423, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 8.111591339111328, | |
| "learning_rate": 4.696969696969697e-05, | |
| "loss": 6.1152, | |
| "mean_token_accuracy": 0.7495731711387634, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 7.632225036621094, | |
| "learning_rate": 4.686868686868687e-05, | |
| "loss": 5.978, | |
| "mean_token_accuracy": 0.7462944090366364, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 7.140631675720215, | |
| "learning_rate": 4.676767676767677e-05, | |
| "loss": 5.4284, | |
| "mean_token_accuracy": 0.7744311541318893, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 7.504866600036621, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 5.4771, | |
| "mean_token_accuracy": 0.7578010857105255, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 6.958098411560059, | |
| "learning_rate": 4.656565656565657e-05, | |
| "loss": 6.0761, | |
| "mean_token_accuracy": 0.7286622673273087, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.4256696701049805, | |
| "learning_rate": 4.6464646464646464e-05, | |
| "loss": 5.9964, | |
| "mean_token_accuracy": 0.7376932203769684, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 6.8756608963012695, | |
| "learning_rate": 4.636363636363636e-05, | |
| "loss": 5.2819, | |
| "mean_token_accuracy": 0.7755442261695862, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 6.877252101898193, | |
| "learning_rate": 4.626262626262626e-05, | |
| "loss": 5.6559, | |
| "mean_token_accuracy": 0.7578635513782501, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 7.236316680908203, | |
| "learning_rate": 4.616161616161616e-05, | |
| "loss": 5.3482, | |
| "mean_token_accuracy": 0.758240357041359, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 7.695366859436035, | |
| "learning_rate": 4.606060606060607e-05, | |
| "loss": 5.3286, | |
| "mean_token_accuracy": 0.769702136516571, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 7.2449164390563965, | |
| "learning_rate": 4.595959595959596e-05, | |
| "loss": 5.3446, | |
| "mean_token_accuracy": 0.750788614153862, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 6.902708530426025, | |
| "learning_rate": 4.585858585858586e-05, | |
| "loss": 4.5337, | |
| "mean_token_accuracy": 0.7888868898153305, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 7.465998649597168, | |
| "learning_rate": 4.575757575757576e-05, | |
| "loss": 5.2185, | |
| "mean_token_accuracy": 0.7631554305553436, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 6.581245422363281, | |
| "learning_rate": 4.565656565656566e-05, | |
| "loss": 5.6984, | |
| "mean_token_accuracy": 0.7439078986644745, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 6.623785495758057, | |
| "learning_rate": 4.555555555555556e-05, | |
| "loss": 5.5339, | |
| "mean_token_accuracy": 0.7492197006940842, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 7.161900997161865, | |
| "learning_rate": 4.545454545454546e-05, | |
| "loss": 6.0623, | |
| "mean_token_accuracy": 0.7385092377662659, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 7.114354133605957, | |
| "learning_rate": 4.535353535353535e-05, | |
| "loss": 5.0984, | |
| "mean_token_accuracy": 0.7639837712049484, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 6.865662097930908, | |
| "learning_rate": 4.525252525252526e-05, | |
| "loss": 4.5515, | |
| "mean_token_accuracy": 0.7843715995550156, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 6.6706109046936035, | |
| "learning_rate": 4.515151515151516e-05, | |
| "loss": 5.7281, | |
| "mean_token_accuracy": 0.7444053590297699, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 7.425290107727051, | |
| "learning_rate": 4.5050505050505056e-05, | |
| "loss": 5.5055, | |
| "mean_token_accuracy": 0.7495678812265396, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 7.8452277183532715, | |
| "learning_rate": 4.494949494949495e-05, | |
| "loss": 5.5757, | |
| "mean_token_accuracy": 0.7629143446683884, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 7.112468242645264, | |
| "learning_rate": 4.484848484848485e-05, | |
| "loss": 4.6575, | |
| "mean_token_accuracy": 0.7812573164701462, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 6.542463302612305, | |
| "learning_rate": 4.474747474747475e-05, | |
| "loss": 5.373, | |
| "mean_token_accuracy": 0.7609668523073196, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 6.580011367797852, | |
| "learning_rate": 4.464646464646465e-05, | |
| "loss": 4.5268, | |
| "mean_token_accuracy": 0.7902613431215286, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 6.447749614715576, | |
| "learning_rate": 4.454545454545455e-05, | |
| "loss": 4.948, | |
| "mean_token_accuracy": 0.7766893953084946, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 6.1748199462890625, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 4.8793, | |
| "mean_token_accuracy": 0.78531713783741, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 7.2349114418029785, | |
| "learning_rate": 4.4343434343434346e-05, | |
| "loss": 5.4005, | |
| "mean_token_accuracy": 0.7713855803012848, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 7.137668132781982, | |
| "learning_rate": 4.4242424242424246e-05, | |
| "loss": 4.7612, | |
| "mean_token_accuracy": 0.7853883504867554, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 6.94537353515625, | |
| "learning_rate": 4.4141414141414145e-05, | |
| "loss": 5.0952, | |
| "mean_token_accuracy": 0.7686220556497574, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 7.088626861572266, | |
| "learning_rate": 4.4040404040404044e-05, | |
| "loss": 5.107, | |
| "mean_token_accuracy": 0.7601823508739471, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 6.948323726654053, | |
| "learning_rate": 4.3939393939393944e-05, | |
| "loss": 4.9925, | |
| "mean_token_accuracy": 0.7808158993721008, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 6.151853084564209, | |
| "learning_rate": 4.383838383838384e-05, | |
| "loss": 4.7631, | |
| "mean_token_accuracy": 0.7758619040250778, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 6.796617031097412, | |
| "learning_rate": 4.3737373737373736e-05, | |
| "loss": 5.1669, | |
| "mean_token_accuracy": 0.7829089462757111, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 6.727445602416992, | |
| "learning_rate": 4.3636363636363636e-05, | |
| "loss": 5.9112, | |
| "mean_token_accuracy": 0.7398245632648468, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 6.673349380493164, | |
| "learning_rate": 4.3535353535353535e-05, | |
| "loss": 5.5631, | |
| "mean_token_accuracy": 0.7459984719753265, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 6.960887432098389, | |
| "learning_rate": 4.343434343434344e-05, | |
| "loss": 4.8842, | |
| "mean_token_accuracy": 0.7681605517864227, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 6.884949684143066, | |
| "learning_rate": 4.3333333333333334e-05, | |
| "loss": 4.9392, | |
| "mean_token_accuracy": 0.7818175554275513, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 6.305865287780762, | |
| "learning_rate": 4.3232323232323234e-05, | |
| "loss": 4.3928, | |
| "mean_token_accuracy": 0.7899681925773621, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 6.652983665466309, | |
| "learning_rate": 4.313131313131313e-05, | |
| "loss": 5.6515, | |
| "mean_token_accuracy": 0.7437300831079483, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 5.960973262786865, | |
| "learning_rate": 4.303030303030303e-05, | |
| "loss": 4.2172, | |
| "mean_token_accuracy": 0.8126555383205414, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 6.401238918304443, | |
| "learning_rate": 4.292929292929293e-05, | |
| "loss": 4.6772, | |
| "mean_token_accuracy": 0.7884032130241394, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 6.02972412109375, | |
| "learning_rate": 4.282828282828283e-05, | |
| "loss": 4.7759, | |
| "mean_token_accuracy": 0.7719396352767944, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 5.91736364364624, | |
| "learning_rate": 4.2727272727272724e-05, | |
| "loss": 4.5561, | |
| "mean_token_accuracy": 0.8052177727222443, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 6.276677131652832, | |
| "learning_rate": 4.262626262626263e-05, | |
| "loss": 4.5458, | |
| "mean_token_accuracy": 0.7864794135093689, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 6.683462142944336, | |
| "learning_rate": 4.252525252525253e-05, | |
| "loss": 4.7156, | |
| "mean_token_accuracy": 0.7799786478281021, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 6.73935079574585, | |
| "learning_rate": 4.242424242424243e-05, | |
| "loss": 4.6576, | |
| "mean_token_accuracy": 0.7870494276285172, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 6.009264945983887, | |
| "learning_rate": 4.232323232323233e-05, | |
| "loss": 4.3781, | |
| "mean_token_accuracy": 0.8029303699731827, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 5.8821797370910645, | |
| "learning_rate": 4.222222222222222e-05, | |
| "loss": 4.6497, | |
| "mean_token_accuracy": 0.7830232828855515, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 6.2794904708862305, | |
| "learning_rate": 4.212121212121212e-05, | |
| "loss": 4.563, | |
| "mean_token_accuracy": 0.7883824855089188, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 6.4131364822387695, | |
| "learning_rate": 4.202020202020202e-05, | |
| "loss": 4.86, | |
| "mean_token_accuracy": 0.7777279317378998, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 6.024682998657227, | |
| "learning_rate": 4.191919191919192e-05, | |
| "loss": 4.5301, | |
| "mean_token_accuracy": 0.8029981702566147, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 6.400689125061035, | |
| "learning_rate": 4.181818181818182e-05, | |
| "loss": 4.5095, | |
| "mean_token_accuracy": 0.7834673821926117, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 6.309573173522949, | |
| "learning_rate": 4.171717171717172e-05, | |
| "loss": 4.9511, | |
| "mean_token_accuracy": 0.7793049812316895, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 6.098147869110107, | |
| "learning_rate": 4.161616161616162e-05, | |
| "loss": 4.8193, | |
| "mean_token_accuracy": 0.7891133576631546, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 6.305475234985352, | |
| "learning_rate": 4.151515151515152e-05, | |
| "loss": 5.2867, | |
| "mean_token_accuracy": 0.763544350862503, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 5.927306175231934, | |
| "learning_rate": 4.141414141414142e-05, | |
| "loss": 4.6925, | |
| "mean_token_accuracy": 0.7907081097364426, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 6.21611213684082, | |
| "learning_rate": 4.131313131313132e-05, | |
| "loss": 4.1949, | |
| "mean_token_accuracy": 0.7960730195045471, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 5.893104076385498, | |
| "learning_rate": 4.1212121212121216e-05, | |
| "loss": 4.3789, | |
| "mean_token_accuracy": 0.8057213127613068, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 6.116573333740234, | |
| "learning_rate": 4.111111111111111e-05, | |
| "loss": 4.7077, | |
| "mean_token_accuracy": 0.7898276895284653, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 5.917790412902832, | |
| "learning_rate": 4.101010101010101e-05, | |
| "loss": 4.3604, | |
| "mean_token_accuracy": 0.7916617542505264, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 6.027205944061279, | |
| "learning_rate": 4.0909090909090915e-05, | |
| "loss": 4.3654, | |
| "mean_token_accuracy": 0.7899818271398544, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 6.685177326202393, | |
| "learning_rate": 4.0808080808080814e-05, | |
| "loss": 4.3956, | |
| "mean_token_accuracy": 0.7992375791072845, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 6.8115081787109375, | |
| "learning_rate": 4.070707070707071e-05, | |
| "loss": 4.4709, | |
| "mean_token_accuracy": 0.7787137180566788, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 6.106410980224609, | |
| "learning_rate": 4.0606060606060606e-05, | |
| "loss": 4.2777, | |
| "mean_token_accuracy": 0.8080956637859344, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 6.435486793518066, | |
| "learning_rate": 4.0505050505050506e-05, | |
| "loss": 4.7062, | |
| "mean_token_accuracy": 0.7773082256317139, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 6.885725498199463, | |
| "learning_rate": 4.0404040404040405e-05, | |
| "loss": 4.7163, | |
| "mean_token_accuracy": 0.7818236798048019, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 5.755829811096191, | |
| "learning_rate": 4.0303030303030305e-05, | |
| "loss": 4.4307, | |
| "mean_token_accuracy": 0.7950419485569, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 6.709305763244629, | |
| "learning_rate": 4.0202020202020204e-05, | |
| "loss": 4.2539, | |
| "mean_token_accuracy": 0.8147921562194824, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 6.947343826293945, | |
| "learning_rate": 4.01010101010101e-05, | |
| "loss": 4.3526, | |
| "mean_token_accuracy": 0.7921736389398575, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 6.356295108795166, | |
| "learning_rate": 4e-05, | |
| "loss": 4.8007, | |
| "mean_token_accuracy": 0.780563622713089, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 5.941018104553223, | |
| "learning_rate": 3.98989898989899e-05, | |
| "loss": 4.3688, | |
| "mean_token_accuracy": 0.7866577059030533, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 6.366528034210205, | |
| "learning_rate": 3.97979797979798e-05, | |
| "loss": 4.252, | |
| "mean_token_accuracy": 0.799225702881813, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 6.697314739227295, | |
| "learning_rate": 3.96969696969697e-05, | |
| "loss": 4.8091, | |
| "mean_token_accuracy": 0.7797738015651703, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 6.910213947296143, | |
| "learning_rate": 3.9595959595959594e-05, | |
| "loss": 4.8069, | |
| "mean_token_accuracy": 0.7732720673084259, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 5.771172523498535, | |
| "learning_rate": 3.9494949494949494e-05, | |
| "loss": 4.2787, | |
| "mean_token_accuracy": 0.7975706905126572, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 5.44094181060791, | |
| "learning_rate": 3.939393939393939e-05, | |
| "loss": 4.0384, | |
| "mean_token_accuracy": 0.8148421049118042, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 6.703949451446533, | |
| "learning_rate": 3.929292929292929e-05, | |
| "loss": 4.7667, | |
| "mean_token_accuracy": 0.7871679961681366, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 6.386756896972656, | |
| "learning_rate": 3.91919191919192e-05, | |
| "loss": 4.2787, | |
| "mean_token_accuracy": 0.7989258021116257, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 5.399852275848389, | |
| "learning_rate": 3.909090909090909e-05, | |
| "loss": 3.9899, | |
| "mean_token_accuracy": 0.8192960321903229, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 5.751123905181885, | |
| "learning_rate": 3.898989898989899e-05, | |
| "loss": 4.292, | |
| "mean_token_accuracy": 0.8131812363862991, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 5.929274559020996, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 4.5909, | |
| "mean_token_accuracy": 0.7861783355474472, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 6.452223777770996, | |
| "learning_rate": 3.878787878787879e-05, | |
| "loss": 4.3544, | |
| "mean_token_accuracy": 0.7967888861894608, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 5.868208408355713, | |
| "learning_rate": 3.868686868686869e-05, | |
| "loss": 4.9455, | |
| "mean_token_accuracy": 0.7758602350950241, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 6.712337493896484, | |
| "learning_rate": 3.858585858585859e-05, | |
| "loss": 4.275, | |
| "mean_token_accuracy": 0.8086762726306915, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 5.891403675079346, | |
| "learning_rate": 3.848484848484848e-05, | |
| "loss": 3.9505, | |
| "mean_token_accuracy": 0.8316792696714401, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 6.26602840423584, | |
| "learning_rate": 3.838383838383838e-05, | |
| "loss": 4.4454, | |
| "mean_token_accuracy": 0.7933251559734344, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 5.930867671966553, | |
| "learning_rate": 3.828282828282829e-05, | |
| "loss": 4.346, | |
| "mean_token_accuracy": 0.7956322878599167, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 6.317086219787598, | |
| "learning_rate": 3.818181818181819e-05, | |
| "loss": 3.9196, | |
| "mean_token_accuracy": 0.8177185207605362, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 6.2951178550720215, | |
| "learning_rate": 3.8080808080808087e-05, | |
| "loss": 4.3412, | |
| "mean_token_accuracy": 0.8002329915761948, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 5.812499523162842, | |
| "learning_rate": 3.797979797979798e-05, | |
| "loss": 4.2135, | |
| "mean_token_accuracy": 0.8055464327335358, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 6.186211109161377, | |
| "learning_rate": 3.787878787878788e-05, | |
| "loss": 4.2378, | |
| "mean_token_accuracy": 0.7995122522115707, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 5.81443452835083, | |
| "learning_rate": 3.777777777777778e-05, | |
| "loss": 4.1608, | |
| "mean_token_accuracy": 0.80469611287117, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.016, | |
| "grad_norm": 5.983344078063965, | |
| "learning_rate": 3.767676767676768e-05, | |
| "loss": 3.996, | |
| "mean_token_accuracy": 0.7908979803323746, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 5.305781841278076, | |
| "learning_rate": 3.757575757575758e-05, | |
| "loss": 4.4283, | |
| "mean_token_accuracy": 0.7888950854539871, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": 5.880563259124756, | |
| "learning_rate": 3.747474747474748e-05, | |
| "loss": 4.2988, | |
| "mean_token_accuracy": 0.7900789678096771, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 5.588067054748535, | |
| "learning_rate": 3.7373737373737376e-05, | |
| "loss": 3.5971, | |
| "mean_token_accuracy": 0.829292356967926, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.048, | |
| "grad_norm": 5.570093631744385, | |
| "learning_rate": 3.7272727272727276e-05, | |
| "loss": 3.9778, | |
| "mean_token_accuracy": 0.814909428358078, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 5.66859245300293, | |
| "learning_rate": 3.7171717171717175e-05, | |
| "loss": 3.8962, | |
| "mean_token_accuracy": 0.8082331418991089, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.064, | |
| "grad_norm": 5.667624473571777, | |
| "learning_rate": 3.7070707070707075e-05, | |
| "loss": 3.3282, | |
| "mean_token_accuracy": 0.8463505655527115, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 5.888784408569336, | |
| "learning_rate": 3.6969696969696974e-05, | |
| "loss": 4.236, | |
| "mean_token_accuracy": 0.7955542504787445, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 6.280521392822266, | |
| "learning_rate": 3.686868686868687e-05, | |
| "loss": 3.8402, | |
| "mean_token_accuracy": 0.8112962692975998, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 5.922068119049072, | |
| "learning_rate": 3.6767676767676766e-05, | |
| "loss": 3.8136, | |
| "mean_token_accuracy": 0.8134618252515793, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.096, | |
| "grad_norm": 5.7377824783325195, | |
| "learning_rate": 3.6666666666666666e-05, | |
| "loss": 4.2849, | |
| "mean_token_accuracy": 0.7926695197820663, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 5.895854949951172, | |
| "learning_rate": 3.656565656565657e-05, | |
| "loss": 3.9685, | |
| "mean_token_accuracy": 0.8166492581367493, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.112, | |
| "grad_norm": 6.496169567108154, | |
| "learning_rate": 3.6464646464646465e-05, | |
| "loss": 4.2935, | |
| "mean_token_accuracy": 0.798405259847641, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 5.925403594970703, | |
| "learning_rate": 3.6363636363636364e-05, | |
| "loss": 3.5502, | |
| "mean_token_accuracy": 0.8250329345464706, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 6.757926940917969, | |
| "learning_rate": 3.6262626262626264e-05, | |
| "loss": 4.0215, | |
| "mean_token_accuracy": 0.8052553087472916, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 5.971244812011719, | |
| "learning_rate": 3.616161616161616e-05, | |
| "loss": 4.1611, | |
| "mean_token_accuracy": 0.7966379076242447, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.144, | |
| "grad_norm": 6.020002841949463, | |
| "learning_rate": 3.606060606060606e-05, | |
| "loss": 3.7227, | |
| "mean_token_accuracy": 0.8071051388978958, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 5.251856327056885, | |
| "learning_rate": 3.595959595959596e-05, | |
| "loss": 3.8428, | |
| "mean_token_accuracy": 0.8186961710453033, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 5.363802909851074, | |
| "learning_rate": 3.5858585858585855e-05, | |
| "loss": 3.1516, | |
| "mean_token_accuracy": 0.8421822488307953, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 5.687267780303955, | |
| "learning_rate": 3.575757575757576e-05, | |
| "loss": 3.8707, | |
| "mean_token_accuracy": 0.8026652336120605, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.176, | |
| "grad_norm": 6.096588134765625, | |
| "learning_rate": 3.565656565656566e-05, | |
| "loss": 3.6084, | |
| "mean_token_accuracy": 0.8240565657615662, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 7.285717487335205, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 4.0248, | |
| "mean_token_accuracy": 0.8036085069179535, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.192, | |
| "grad_norm": 5.360130310058594, | |
| "learning_rate": 3.545454545454546e-05, | |
| "loss": 3.7427, | |
| "mean_token_accuracy": 0.8152587413787842, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 5.485192775726318, | |
| "learning_rate": 3.535353535353535e-05, | |
| "loss": 3.4779, | |
| "mean_token_accuracy": 0.822798877954483, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.208, | |
| "grad_norm": 6.339715003967285, | |
| "learning_rate": 3.525252525252525e-05, | |
| "loss": 4.64, | |
| "mean_token_accuracy": 0.7758079469203949, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 5.6540422439575195, | |
| "learning_rate": 3.515151515151515e-05, | |
| "loss": 3.5393, | |
| "mean_token_accuracy": 0.8202964663505554, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.224, | |
| "grad_norm": 6.171947479248047, | |
| "learning_rate": 3.505050505050505e-05, | |
| "loss": 3.9077, | |
| "mean_token_accuracy": 0.8147079646587372, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 5.978511810302734, | |
| "learning_rate": 3.494949494949495e-05, | |
| "loss": 3.9126, | |
| "mean_token_accuracy": 0.8115392625331879, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 6.21269416809082, | |
| "learning_rate": 3.484848484848485e-05, | |
| "loss": 3.5828, | |
| "mean_token_accuracy": 0.8218794912099838, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 5.694769859313965, | |
| "learning_rate": 3.474747474747475e-05, | |
| "loss": 3.4313, | |
| "mean_token_accuracy": 0.8210793286561966, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.256, | |
| "grad_norm": 5.795802116394043, | |
| "learning_rate": 3.464646464646465e-05, | |
| "loss": 3.601, | |
| "mean_token_accuracy": 0.8306063264608383, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 6.824512004852295, | |
| "learning_rate": 3.454545454545455e-05, | |
| "loss": 4.2305, | |
| "mean_token_accuracy": 0.8084036558866501, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.272, | |
| "grad_norm": 5.593602180480957, | |
| "learning_rate": 3.444444444444445e-05, | |
| "loss": 4.0785, | |
| "mean_token_accuracy": 0.8063912093639374, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 6.345674991607666, | |
| "learning_rate": 3.434343434343435e-05, | |
| "loss": 3.7281, | |
| "mean_token_accuracy": 0.8293210566043854, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.288, | |
| "grad_norm": 5.772035598754883, | |
| "learning_rate": 3.424242424242424e-05, | |
| "loss": 3.9053, | |
| "mean_token_accuracy": 0.8084607124328613, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 5.2538042068481445, | |
| "learning_rate": 3.414141414141414e-05, | |
| "loss": 3.5784, | |
| "mean_token_accuracy": 0.8259472250938416, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.304, | |
| "grad_norm": 6.4647016525268555, | |
| "learning_rate": 3.4040404040404045e-05, | |
| "loss": 3.6876, | |
| "mean_token_accuracy": 0.8131130933761597, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 5.747474670410156, | |
| "learning_rate": 3.3939393939393945e-05, | |
| "loss": 3.4403, | |
| "mean_token_accuracy": 0.8292675763368607, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 5.909630298614502, | |
| "learning_rate": 3.3838383838383844e-05, | |
| "loss": 4.1141, | |
| "mean_token_accuracy": 0.7911320775747299, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 5.7181572914123535, | |
| "learning_rate": 3.373737373737374e-05, | |
| "loss": 3.5109, | |
| "mean_token_accuracy": 0.8262276649475098, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.336, | |
| "grad_norm": 5.75642728805542, | |
| "learning_rate": 3.3636363636363636e-05, | |
| "loss": 3.8356, | |
| "mean_token_accuracy": 0.8160548806190491, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 5.626729965209961, | |
| "learning_rate": 3.3535353535353536e-05, | |
| "loss": 3.7526, | |
| "mean_token_accuracy": 0.8073613941669464, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 5.47194766998291, | |
| "learning_rate": 3.3434343434343435e-05, | |
| "loss": 3.3927, | |
| "mean_token_accuracy": 0.8345044106245041, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 5.935661792755127, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 3.8877, | |
| "mean_token_accuracy": 0.8028298169374466, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 6.1031107902526855, | |
| "learning_rate": 3.3232323232323234e-05, | |
| "loss": 3.159, | |
| "mean_token_accuracy": 0.8418579548597336, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 5.29210090637207, | |
| "learning_rate": 3.3131313131313134e-05, | |
| "loss": 3.4754, | |
| "mean_token_accuracy": 0.8293817788362503, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.384, | |
| "grad_norm": 5.36245059967041, | |
| "learning_rate": 3.303030303030303e-05, | |
| "loss": 3.5582, | |
| "mean_token_accuracy": 0.8308709412813187, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 5.807570457458496, | |
| "learning_rate": 3.292929292929293e-05, | |
| "loss": 3.6899, | |
| "mean_token_accuracy": 0.809835895895958, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 6.17417573928833, | |
| "learning_rate": 3.282828282828283e-05, | |
| "loss": 4.438, | |
| "mean_token_accuracy": 0.7785096615552902, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 5.818863868713379, | |
| "learning_rate": 3.272727272727273e-05, | |
| "loss": 4.1754, | |
| "mean_token_accuracy": 0.7955872565507889, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.416, | |
| "grad_norm": 7.145393371582031, | |
| "learning_rate": 3.2626262626262624e-05, | |
| "loss": 3.9973, | |
| "mean_token_accuracy": 0.7989909946918488, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 5.308239459991455, | |
| "learning_rate": 3.2525252525252524e-05, | |
| "loss": 3.8291, | |
| "mean_token_accuracy": 0.8216778337955475, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.432, | |
| "grad_norm": 7.332181930541992, | |
| "learning_rate": 3.2424242424242423e-05, | |
| "loss": 3.9109, | |
| "mean_token_accuracy": 0.8071533888578415, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 5.38962984085083, | |
| "learning_rate": 3.232323232323233e-05, | |
| "loss": 3.9099, | |
| "mean_token_accuracy": 0.8167336881160736, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.448, | |
| "grad_norm": 6.646029949188232, | |
| "learning_rate": 3.222222222222223e-05, | |
| "loss": 4.041, | |
| "mean_token_accuracy": 0.8054783195257187, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 5.449068069458008, | |
| "learning_rate": 3.212121212121212e-05, | |
| "loss": 3.8969, | |
| "mean_token_accuracy": 0.8104897290468216, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.464, | |
| "grad_norm": 6.279653072357178, | |
| "learning_rate": 3.202020202020202e-05, | |
| "loss": 3.8059, | |
| "mean_token_accuracy": 0.8105663061141968, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 6.5596537590026855, | |
| "learning_rate": 3.191919191919192e-05, | |
| "loss": 4.1534, | |
| "mean_token_accuracy": 0.7909954190254211, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 5.6107354164123535, | |
| "learning_rate": 3.181818181818182e-05, | |
| "loss": 3.7084, | |
| "mean_token_accuracy": 0.8198148310184479, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 5.935013294219971, | |
| "learning_rate": 3.171717171717172e-05, | |
| "loss": 3.6008, | |
| "mean_token_accuracy": 0.8179962188005447, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.496, | |
| "grad_norm": 6.108509540557861, | |
| "learning_rate": 3.161616161616161e-05, | |
| "loss": 4.2591, | |
| "mean_token_accuracy": 0.802385538816452, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 5.757572650909424, | |
| "learning_rate": 3.151515151515151e-05, | |
| "loss": 3.1575, | |
| "mean_token_accuracy": 0.8446398079395294, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "grad_norm": 5.682815074920654, | |
| "learning_rate": 3.141414141414142e-05, | |
| "loss": 3.8183, | |
| "mean_token_accuracy": 0.8149935752153397, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 6.077356338500977, | |
| "learning_rate": 3.131313131313132e-05, | |
| "loss": 3.6911, | |
| "mean_token_accuracy": 0.8199481070041656, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.528, | |
| "grad_norm": 5.448940277099609, | |
| "learning_rate": 3.121212121212122e-05, | |
| "loss": 3.5303, | |
| "mean_token_accuracy": 0.8298965841531754, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 5.414105415344238, | |
| "learning_rate": 3.111111111111111e-05, | |
| "loss": 3.8164, | |
| "mean_token_accuracy": 0.8245872855186462, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.544, | |
| "grad_norm": 5.119867324829102, | |
| "learning_rate": 3.101010101010101e-05, | |
| "loss": 3.3991, | |
| "mean_token_accuracy": 0.8331074118614197, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 5.911476135253906, | |
| "learning_rate": 3.090909090909091e-05, | |
| "loss": 3.6991, | |
| "mean_token_accuracy": 0.8288981467485428, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 6.588876247406006, | |
| "learning_rate": 3.080808080808081e-05, | |
| "loss": 3.9679, | |
| "mean_token_accuracy": 0.8007670193910599, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 6.243876934051514, | |
| "learning_rate": 3.070707070707071e-05, | |
| "loss": 3.2798, | |
| "mean_token_accuracy": 0.8313741534948349, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.576, | |
| "grad_norm": 5.5765180587768555, | |
| "learning_rate": 3.060606060606061e-05, | |
| "loss": 3.5246, | |
| "mean_token_accuracy": 0.8275876641273499, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 5.727989673614502, | |
| "learning_rate": 3.050505050505051e-05, | |
| "loss": 3.577, | |
| "mean_token_accuracy": 0.8309095203876495, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.592, | |
| "grad_norm": 6.035944938659668, | |
| "learning_rate": 3.0404040404040406e-05, | |
| "loss": 3.9191, | |
| "mean_token_accuracy": 0.8098112493753433, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 6.832411289215088, | |
| "learning_rate": 3.0303030303030306e-05, | |
| "loss": 4.0711, | |
| "mean_token_accuracy": 0.7989842146635056, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.608, | |
| "grad_norm": 6.298995494842529, | |
| "learning_rate": 3.0202020202020205e-05, | |
| "loss": 3.8327, | |
| "mean_token_accuracy": 0.8034229874610901, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 6.169890880584717, | |
| "learning_rate": 3.01010101010101e-05, | |
| "loss": 3.4308, | |
| "mean_token_accuracy": 0.8316022306680679, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.624, | |
| "grad_norm": 6.228781700134277, | |
| "learning_rate": 3e-05, | |
| "loss": 4.0449, | |
| "mean_token_accuracy": 0.8119410276412964, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 5.267385959625244, | |
| "learning_rate": 2.98989898989899e-05, | |
| "loss": 3.3364, | |
| "mean_token_accuracy": 0.8189917951822281, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 6.143648624420166, | |
| "learning_rate": 2.9797979797979796e-05, | |
| "loss": 3.6751, | |
| "mean_token_accuracy": 0.8097221404314041, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 5.887429714202881, | |
| "learning_rate": 2.96969696969697e-05, | |
| "loss": 3.2366, | |
| "mean_token_accuracy": 0.8330790549516678, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 6.628138065338135, | |
| "learning_rate": 2.95959595959596e-05, | |
| "loss": 3.4484, | |
| "mean_token_accuracy": 0.825881227850914, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 6.397181034088135, | |
| "learning_rate": 2.9494949494949498e-05, | |
| "loss": 4.1198, | |
| "mean_token_accuracy": 0.801177367568016, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 6.744088649749756, | |
| "learning_rate": 2.9393939393939394e-05, | |
| "loss": 3.7321, | |
| "mean_token_accuracy": 0.8189053982496262, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 6.219143390655518, | |
| "learning_rate": 2.9292929292929294e-05, | |
| "loss": 3.8731, | |
| "mean_token_accuracy": 0.807484045624733, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.688, | |
| "grad_norm": 5.3475751876831055, | |
| "learning_rate": 2.9191919191919193e-05, | |
| "loss": 3.3026, | |
| "mean_token_accuracy": 0.8284660577774048, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 6.080757141113281, | |
| "learning_rate": 2.909090909090909e-05, | |
| "loss": 3.9731, | |
| "mean_token_accuracy": 0.8116171061992645, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.704, | |
| "grad_norm": 5.819918155670166, | |
| "learning_rate": 2.898989898989899e-05, | |
| "loss": 3.146, | |
| "mean_token_accuracy": 0.83076611161232, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 6.442020416259766, | |
| "learning_rate": 2.8888888888888888e-05, | |
| "loss": 3.1441, | |
| "mean_token_accuracy": 0.834155797958374, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 6.770529270172119, | |
| "learning_rate": 2.878787878787879e-05, | |
| "loss": 3.701, | |
| "mean_token_accuracy": 0.8147666454315186, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 6.345605373382568, | |
| "learning_rate": 2.868686868686869e-05, | |
| "loss": 3.4011, | |
| "mean_token_accuracy": 0.8231780230998993, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.736, | |
| "grad_norm": 6.269248008728027, | |
| "learning_rate": 2.8585858585858587e-05, | |
| "loss": 3.3734, | |
| "mean_token_accuracy": 0.8338766843080521, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 6.861104965209961, | |
| "learning_rate": 2.8484848484848486e-05, | |
| "loss": 3.6933, | |
| "mean_token_accuracy": 0.8223045021295547, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.752, | |
| "grad_norm": 5.622055530548096, | |
| "learning_rate": 2.8383838383838386e-05, | |
| "loss": 3.6327, | |
| "mean_token_accuracy": 0.8273986279964447, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 6.05256462097168, | |
| "learning_rate": 2.8282828282828282e-05, | |
| "loss": 3.9795, | |
| "mean_token_accuracy": 0.8149670958518982, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.768, | |
| "grad_norm": 6.203214645385742, | |
| "learning_rate": 2.818181818181818e-05, | |
| "loss": 3.8234, | |
| "mean_token_accuracy": 0.8010188341140747, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 5.9735798835754395, | |
| "learning_rate": 2.808080808080808e-05, | |
| "loss": 3.6441, | |
| "mean_token_accuracy": 0.8297218382358551, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.784, | |
| "grad_norm": 5.534307956695557, | |
| "learning_rate": 2.7979797979797984e-05, | |
| "loss": 3.9049, | |
| "mean_token_accuracy": 0.808393806219101, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 5.976230144500732, | |
| "learning_rate": 2.7878787878787883e-05, | |
| "loss": 3.1354, | |
| "mean_token_accuracy": 0.8418057858943939, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 6.141560077667236, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 3.9247, | |
| "mean_token_accuracy": 0.8154790848493576, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 6.0011773109436035, | |
| "learning_rate": 2.767676767676768e-05, | |
| "loss": 3.9032, | |
| "mean_token_accuracy": 0.8127383142709732, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 6.12231969833374, | |
| "learning_rate": 2.7575757575757578e-05, | |
| "loss": 3.1854, | |
| "mean_token_accuracy": 0.8354455977678299, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 5.492841720581055, | |
| "learning_rate": 2.7474747474747474e-05, | |
| "loss": 3.2232, | |
| "mean_token_accuracy": 0.8330144584178925, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 5.566941738128662, | |
| "learning_rate": 2.7373737373737374e-05, | |
| "loss": 3.4351, | |
| "mean_token_accuracy": 0.8365740329027176, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 6.055516719818115, | |
| "learning_rate": 2.7272727272727273e-05, | |
| "loss": 3.5741, | |
| "mean_token_accuracy": 0.819349929690361, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 5.578907489776611, | |
| "learning_rate": 2.717171717171717e-05, | |
| "loss": 3.4948, | |
| "mean_token_accuracy": 0.8307986855506897, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 6.451387882232666, | |
| "learning_rate": 2.7070707070707075e-05, | |
| "loss": 3.7951, | |
| "mean_token_accuracy": 0.8153550177812576, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 6.489166736602783, | |
| "learning_rate": 2.696969696969697e-05, | |
| "loss": 4.1551, | |
| "mean_token_accuracy": 0.8079245835542679, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 6.350606441497803, | |
| "learning_rate": 2.686868686868687e-05, | |
| "loss": 4.2089, | |
| "mean_token_accuracy": 0.7974926680326462, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 6.2156877517700195, | |
| "learning_rate": 2.676767676767677e-05, | |
| "loss": 3.4177, | |
| "mean_token_accuracy": 0.8269297033548355, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 6.369142532348633, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 3.7993, | |
| "mean_token_accuracy": 0.8191967755556107, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.896, | |
| "grad_norm": 5.774569988250732, | |
| "learning_rate": 2.6565656565656566e-05, | |
| "loss": 3.7034, | |
| "mean_token_accuracy": 0.8201487958431244, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 7.003572940826416, | |
| "learning_rate": 2.6464646464646466e-05, | |
| "loss": 3.7289, | |
| "mean_token_accuracy": 0.8157116621732712, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.912, | |
| "grad_norm": 5.5132036209106445, | |
| "learning_rate": 2.636363636363636e-05, | |
| "loss": 3.0822, | |
| "mean_token_accuracy": 0.8483001440763474, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 5.702081680297852, | |
| "learning_rate": 2.6262626262626268e-05, | |
| "loss": 3.3978, | |
| "mean_token_accuracy": 0.8383439779281616, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.928, | |
| "grad_norm": 5.952939987182617, | |
| "learning_rate": 2.6161616161616164e-05, | |
| "loss": 3.7917, | |
| "mean_token_accuracy": 0.8186378180980682, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 5.806432247161865, | |
| "learning_rate": 2.6060606060606063e-05, | |
| "loss": 3.3405, | |
| "mean_token_accuracy": 0.8411876261234283, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.944, | |
| "grad_norm": 5.565011501312256, | |
| "learning_rate": 2.5959595959595963e-05, | |
| "loss": 3.7156, | |
| "mean_token_accuracy": 0.8047986626625061, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 6.6874494552612305, | |
| "learning_rate": 2.585858585858586e-05, | |
| "loss": 4.1427, | |
| "mean_token_accuracy": 0.7969915717840195, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 6.274991989135742, | |
| "learning_rate": 2.575757575757576e-05, | |
| "loss": 3.5603, | |
| "mean_token_accuracy": 0.8269449025392532, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 5.758999824523926, | |
| "learning_rate": 2.5656565656565658e-05, | |
| "loss": 2.9123, | |
| "mean_token_accuracy": 0.8435051888227463, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.976, | |
| "grad_norm": 6.024221420288086, | |
| "learning_rate": 2.5555555555555554e-05, | |
| "loss": 3.9897, | |
| "mean_token_accuracy": 0.8267959505319595, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 5.789820671081543, | |
| "learning_rate": 2.5454545454545454e-05, | |
| "loss": 3.3965, | |
| "mean_token_accuracy": 0.8389037996530533, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "grad_norm": 5.86129903793335, | |
| "learning_rate": 2.5353535353535356e-05, | |
| "loss": 3.9343, | |
| "mean_token_accuracy": 0.8126579076051712, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 6.172614097595215, | |
| "learning_rate": 2.5252525252525256e-05, | |
| "loss": 3.2032, | |
| "mean_token_accuracy": 0.8359029293060303, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.008, | |
| "grad_norm": 6.1252899169921875, | |
| "learning_rate": 2.5151515151515155e-05, | |
| "loss": 3.006, | |
| "mean_token_accuracy": 0.8488074690103531, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 5.982184410095215, | |
| "learning_rate": 2.505050505050505e-05, | |
| "loss": 3.2548, | |
| "mean_token_accuracy": 0.8324020653963089, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.024, | |
| "grad_norm": 5.628816604614258, | |
| "learning_rate": 2.494949494949495e-05, | |
| "loss": 3.321, | |
| "mean_token_accuracy": 0.8334167748689651, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 5.734603404998779, | |
| "learning_rate": 2.4848484848484847e-05, | |
| "loss": 2.9638, | |
| "mean_token_accuracy": 0.8483208119869232, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 5.817469120025635, | |
| "learning_rate": 2.474747474747475e-05, | |
| "loss": 3.0251, | |
| "mean_token_accuracy": 0.837944746017456, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 5.197110652923584, | |
| "learning_rate": 2.464646464646465e-05, | |
| "loss": 3.1505, | |
| "mean_token_accuracy": 0.8443950265645981, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.056, | |
| "grad_norm": 5.90143346786499, | |
| "learning_rate": 2.4545454545454545e-05, | |
| "loss": 3.456, | |
| "mean_token_accuracy": 0.8270305395126343, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 6.32747745513916, | |
| "learning_rate": 2.4444444444444445e-05, | |
| "loss": 3.0387, | |
| "mean_token_accuracy": 0.8455974459648132, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.072, | |
| "grad_norm": 5.210789203643799, | |
| "learning_rate": 2.4343434343434344e-05, | |
| "loss": 3.1112, | |
| "mean_token_accuracy": 0.847496286034584, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 5.2880167961120605, | |
| "learning_rate": 2.4242424242424244e-05, | |
| "loss": 2.7432, | |
| "mean_token_accuracy": 0.8593413680791855, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.088, | |
| "grad_norm": 5.868934154510498, | |
| "learning_rate": 2.4141414141414143e-05, | |
| "loss": 3.1459, | |
| "mean_token_accuracy": 0.8433663100004196, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 5.824329376220703, | |
| "learning_rate": 2.404040404040404e-05, | |
| "loss": 3.5007, | |
| "mean_token_accuracy": 0.8235083371400833, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.104, | |
| "grad_norm": 5.528572082519531, | |
| "learning_rate": 2.393939393939394e-05, | |
| "loss": 2.854, | |
| "mean_token_accuracy": 0.8565276563167572, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 6.147289752960205, | |
| "learning_rate": 2.3838383838383842e-05, | |
| "loss": 2.9618, | |
| "mean_token_accuracy": 0.8526737242937088, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.65183687210083, | |
| "learning_rate": 2.3737373737373738e-05, | |
| "loss": 3.0466, | |
| "mean_token_accuracy": 0.8418086171150208, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 6.315326690673828, | |
| "learning_rate": 2.3636363636363637e-05, | |
| "loss": 3.4835, | |
| "mean_token_accuracy": 0.8310810178518295, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.136, | |
| "grad_norm": 5.648726463317871, | |
| "learning_rate": 2.3535353535353537e-05, | |
| "loss": 3.6856, | |
| "mean_token_accuracy": 0.8161235004663467, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 5.972370624542236, | |
| "learning_rate": 2.3434343434343436e-05, | |
| "loss": 3.0239, | |
| "mean_token_accuracy": 0.8378438502550125, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.152, | |
| "grad_norm": 5.596478462219238, | |
| "learning_rate": 2.3333333333333336e-05, | |
| "loss": 3.0652, | |
| "mean_token_accuracy": 0.8487441837787628, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 6.5623908042907715, | |
| "learning_rate": 2.3232323232323232e-05, | |
| "loss": 3.1893, | |
| "mean_token_accuracy": 0.834345743060112, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.168, | |
| "grad_norm": 6.067134380340576, | |
| "learning_rate": 2.313131313131313e-05, | |
| "loss": 3.0515, | |
| "mean_token_accuracy": 0.8387996703386307, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 6.862771987915039, | |
| "learning_rate": 2.3030303030303034e-05, | |
| "loss": 3.2861, | |
| "mean_token_accuracy": 0.8361453711986542, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.184, | |
| "grad_norm": 6.166702747344971, | |
| "learning_rate": 2.292929292929293e-05, | |
| "loss": 3.6596, | |
| "mean_token_accuracy": 0.813729852437973, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 5.386834621429443, | |
| "learning_rate": 2.282828282828283e-05, | |
| "loss": 3.4533, | |
| "mean_token_accuracy": 0.8274287581443787, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 5.261857986450195, | |
| "learning_rate": 2.272727272727273e-05, | |
| "loss": 2.7643, | |
| "mean_token_accuracy": 0.8519167453050613, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 5.241461753845215, | |
| "learning_rate": 2.262626262626263e-05, | |
| "loss": 3.051, | |
| "mean_token_accuracy": 0.8460838198661804, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.216, | |
| "grad_norm": 5.825733184814453, | |
| "learning_rate": 2.2525252525252528e-05, | |
| "loss": 3.2424, | |
| "mean_token_accuracy": 0.8403096050024033, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 5.554264545440674, | |
| "learning_rate": 2.2424242424242424e-05, | |
| "loss": 3.1676, | |
| "mean_token_accuracy": 0.8293324261903763, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.232, | |
| "grad_norm": 5.81028413772583, | |
| "learning_rate": 2.2323232323232324e-05, | |
| "loss": 3.184, | |
| "mean_token_accuracy": 0.8394870609045029, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 6.055649280548096, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 3.0117, | |
| "mean_token_accuracy": 0.850115180015564, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.248, | |
| "grad_norm": 5.307283401489258, | |
| "learning_rate": 2.2121212121212123e-05, | |
| "loss": 2.9076, | |
| "mean_token_accuracy": 0.8414607346057892, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 5.909352779388428, | |
| "learning_rate": 2.2020202020202022e-05, | |
| "loss": 3.4122, | |
| "mean_token_accuracy": 0.8330782055854797, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.2640000000000002, | |
| "grad_norm": 5.730319499969482, | |
| "learning_rate": 2.191919191919192e-05, | |
| "loss": 3.2416, | |
| "mean_token_accuracy": 0.83427894115448, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 6.638869762420654, | |
| "learning_rate": 2.1818181818181818e-05, | |
| "loss": 2.8864, | |
| "mean_token_accuracy": 0.8467890173196793, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 6.180927753448486, | |
| "learning_rate": 2.171717171717172e-05, | |
| "loss": 3.3418, | |
| "mean_token_accuracy": 0.8295275717973709, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 5.50770902633667, | |
| "learning_rate": 2.1616161616161617e-05, | |
| "loss": 3.3557, | |
| "mean_token_accuracy": 0.8279571235179901, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.296, | |
| "grad_norm": 6.085552215576172, | |
| "learning_rate": 2.1515151515151516e-05, | |
| "loss": 2.7873, | |
| "mean_token_accuracy": 0.8584895879030228, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 5.311261177062988, | |
| "learning_rate": 2.1414141414141416e-05, | |
| "loss": 2.786, | |
| "mean_token_accuracy": 0.8529232293367386, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.312, | |
| "grad_norm": 6.482555389404297, | |
| "learning_rate": 2.1313131313131315e-05, | |
| "loss": 3.3625, | |
| "mean_token_accuracy": 0.8343013226985931, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 5.101843357086182, | |
| "learning_rate": 2.1212121212121215e-05, | |
| "loss": 3.0018, | |
| "mean_token_accuracy": 0.8565692156553268, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.328, | |
| "grad_norm": 6.496617317199707, | |
| "learning_rate": 2.111111111111111e-05, | |
| "loss": 3.5081, | |
| "mean_token_accuracy": 0.8281321227550507, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 6.459492206573486, | |
| "learning_rate": 2.101010101010101e-05, | |
| "loss": 3.2684, | |
| "mean_token_accuracy": 0.8464506566524506, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.344, | |
| "grad_norm": 6.296543121337891, | |
| "learning_rate": 2.090909090909091e-05, | |
| "loss": 2.9787, | |
| "mean_token_accuracy": 0.8391922265291214, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 5.939156532287598, | |
| "learning_rate": 2.080808080808081e-05, | |
| "loss": 3.2943, | |
| "mean_token_accuracy": 0.8347084373235703, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 5.988732814788818, | |
| "learning_rate": 2.070707070707071e-05, | |
| "loss": 3.1281, | |
| "mean_token_accuracy": 0.8483212292194366, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 7.110536098480225, | |
| "learning_rate": 2.0606060606060608e-05, | |
| "loss": 3.1449, | |
| "mean_token_accuracy": 0.8350488543510437, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.376, | |
| "grad_norm": 6.519949436187744, | |
| "learning_rate": 2.0505050505050504e-05, | |
| "loss": 2.6449, | |
| "mean_token_accuracy": 0.8593619167804718, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 5.815298557281494, | |
| "learning_rate": 2.0404040404040407e-05, | |
| "loss": 3.0911, | |
| "mean_token_accuracy": 0.837956115603447, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.392, | |
| "grad_norm": 5.741540908813477, | |
| "learning_rate": 2.0303030303030303e-05, | |
| "loss": 3.1501, | |
| "mean_token_accuracy": 0.8409707248210907, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 5.991777420043945, | |
| "learning_rate": 2.0202020202020203e-05, | |
| "loss": 2.9707, | |
| "mean_token_accuracy": 0.858299732208252, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.408, | |
| "grad_norm": 6.3398847579956055, | |
| "learning_rate": 2.0101010101010102e-05, | |
| "loss": 3.4141, | |
| "mean_token_accuracy": 0.830435261130333, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 6.837295055389404, | |
| "learning_rate": 2e-05, | |
| "loss": 3.3923, | |
| "mean_token_accuracy": 0.8256559520959854, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.424, | |
| "grad_norm": 5.505060195922852, | |
| "learning_rate": 1.98989898989899e-05, | |
| "loss": 3.0673, | |
| "mean_token_accuracy": 0.8478615581989288, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 6.913103103637695, | |
| "learning_rate": 1.9797979797979797e-05, | |
| "loss": 3.5381, | |
| "mean_token_accuracy": 0.8223972916603088, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 6.902682304382324, | |
| "learning_rate": 1.9696969696969697e-05, | |
| "loss": 3.598, | |
| "mean_token_accuracy": 0.8203122019767761, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 5.36390495300293, | |
| "learning_rate": 1.95959595959596e-05, | |
| "loss": 3.1101, | |
| "mean_token_accuracy": 0.8461343050003052, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.456, | |
| "grad_norm": 6.57292366027832, | |
| "learning_rate": 1.9494949494949496e-05, | |
| "loss": 3.6492, | |
| "mean_token_accuracy": 0.8245173096656799, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 5.893022537231445, | |
| "learning_rate": 1.9393939393939395e-05, | |
| "loss": 2.7866, | |
| "mean_token_accuracy": 0.8550339192152023, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.472, | |
| "grad_norm": 5.711099624633789, | |
| "learning_rate": 1.9292929292929295e-05, | |
| "loss": 3.0009, | |
| "mean_token_accuracy": 0.8481545150279999, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 6.395712375640869, | |
| "learning_rate": 1.919191919191919e-05, | |
| "loss": 2.8936, | |
| "mean_token_accuracy": 0.8416745364665985, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.488, | |
| "grad_norm": 5.5052289962768555, | |
| "learning_rate": 1.9090909090909094e-05, | |
| "loss": 2.9872, | |
| "mean_token_accuracy": 0.8463147729635239, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 6.273165702819824, | |
| "learning_rate": 1.898989898989899e-05, | |
| "loss": 2.7584, | |
| "mean_token_accuracy": 0.8467618376016617, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.504, | |
| "grad_norm": 5.292394161224365, | |
| "learning_rate": 1.888888888888889e-05, | |
| "loss": 2.9572, | |
| "mean_token_accuracy": 0.8431327790021896, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 5.991566181182861, | |
| "learning_rate": 1.878787878787879e-05, | |
| "loss": 2.9398, | |
| "mean_token_accuracy": 0.8492739796638489, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 5.635786533355713, | |
| "learning_rate": 1.8686868686868688e-05, | |
| "loss": 3.1451, | |
| "mean_token_accuracy": 0.854804664850235, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 6.714171886444092, | |
| "learning_rate": 1.8585858585858588e-05, | |
| "loss": 3.0965, | |
| "mean_token_accuracy": 0.8423555940389633, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.536, | |
| "grad_norm": 5.751944065093994, | |
| "learning_rate": 1.8484848484848487e-05, | |
| "loss": 3.0769, | |
| "mean_token_accuracy": 0.8551096171140671, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 6.406241416931152, | |
| "learning_rate": 1.8383838383838383e-05, | |
| "loss": 2.8228, | |
| "mean_token_accuracy": 0.861114576458931, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.552, | |
| "grad_norm": 6.585404396057129, | |
| "learning_rate": 1.8282828282828286e-05, | |
| "loss": 3.2246, | |
| "mean_token_accuracy": 0.8409701138734818, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 6.6673264503479, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 3.0054, | |
| "mean_token_accuracy": 0.8525452762842178, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.568, | |
| "grad_norm": 6.316901206970215, | |
| "learning_rate": 1.808080808080808e-05, | |
| "loss": 3.4642, | |
| "mean_token_accuracy": 0.8289849609136581, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 7.191961288452148, | |
| "learning_rate": 1.797979797979798e-05, | |
| "loss": 3.276, | |
| "mean_token_accuracy": 0.8262393325567245, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.584, | |
| "grad_norm": 5.743154525756836, | |
| "learning_rate": 1.787878787878788e-05, | |
| "loss": 2.6852, | |
| "mean_token_accuracy": 0.8636340796947479, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 5.567448616027832, | |
| "learning_rate": 1.777777777777778e-05, | |
| "loss": 3.0029, | |
| "mean_token_accuracy": 0.8480563163757324, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 6.558039665222168, | |
| "learning_rate": 1.7676767676767676e-05, | |
| "loss": 2.8954, | |
| "mean_token_accuracy": 0.8490820229053497, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": 6.237626075744629, | |
| "learning_rate": 1.7575757575757576e-05, | |
| "loss": 3.2673, | |
| "mean_token_accuracy": 0.8418715596199036, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.616, | |
| "grad_norm": 6.388242721557617, | |
| "learning_rate": 1.7474747474747475e-05, | |
| "loss": 3.5492, | |
| "mean_token_accuracy": 0.8205874115228653, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 6.261191368103027, | |
| "learning_rate": 1.7373737373737375e-05, | |
| "loss": 3.784, | |
| "mean_token_accuracy": 0.8178833723068237, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.632, | |
| "grad_norm": 5.890114784240723, | |
| "learning_rate": 1.7272727272727274e-05, | |
| "loss": 2.7545, | |
| "mean_token_accuracy": 0.8488318920135498, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 6.187424182891846, | |
| "learning_rate": 1.7171717171717173e-05, | |
| "loss": 3.5939, | |
| "mean_token_accuracy": 0.827822282910347, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.648, | |
| "grad_norm": 6.662416458129883, | |
| "learning_rate": 1.707070707070707e-05, | |
| "loss": 3.2883, | |
| "mean_token_accuracy": 0.8301158398389816, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 6.358303546905518, | |
| "learning_rate": 1.6969696969696972e-05, | |
| "loss": 3.1986, | |
| "mean_token_accuracy": 0.8457056134939194, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.664, | |
| "grad_norm": 5.967710971832275, | |
| "learning_rate": 1.686868686868687e-05, | |
| "loss": 2.9175, | |
| "mean_token_accuracy": 0.8557797521352768, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.672, | |
| "grad_norm": 5.508559703826904, | |
| "learning_rate": 1.6767676767676768e-05, | |
| "loss": 3.1204, | |
| "mean_token_accuracy": 0.8435854762792587, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 6.057641506195068, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 3.5826, | |
| "mean_token_accuracy": 0.8250070810317993, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 5.788311004638672, | |
| "learning_rate": 1.6565656565656567e-05, | |
| "loss": 2.9172, | |
| "mean_token_accuracy": 0.8496552407741547, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.6959999999999997, | |
| "grad_norm": 7.819416522979736, | |
| "learning_rate": 1.6464646464646466e-05, | |
| "loss": 3.1982, | |
| "mean_token_accuracy": 0.8477791100740433, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 6.290533065795898, | |
| "learning_rate": 1.6363636363636366e-05, | |
| "loss": 3.289, | |
| "mean_token_accuracy": 0.8374428898096085, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.7119999999999997, | |
| "grad_norm": 6.164359092712402, | |
| "learning_rate": 1.6262626262626262e-05, | |
| "loss": 3.0955, | |
| "mean_token_accuracy": 0.8332884609699249, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 6.6753339767456055, | |
| "learning_rate": 1.6161616161616165e-05, | |
| "loss": 2.9626, | |
| "mean_token_accuracy": 0.8437196165323257, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.7279999999999998, | |
| "grad_norm": 5.542717933654785, | |
| "learning_rate": 1.606060606060606e-05, | |
| "loss": 3.2322, | |
| "mean_token_accuracy": 0.8421852588653564, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 5.837277889251709, | |
| "learning_rate": 1.595959595959596e-05, | |
| "loss": 3.1454, | |
| "mean_token_accuracy": 0.8469538539648056, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.7439999999999998, | |
| "grad_norm": 5.96569299697876, | |
| "learning_rate": 1.585858585858586e-05, | |
| "loss": 2.8061, | |
| "mean_token_accuracy": 0.8568727970123291, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 6.756287574768066, | |
| "learning_rate": 1.5757575757575756e-05, | |
| "loss": 2.792, | |
| "mean_token_accuracy": 0.8376488238573074, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 6.800328254699707, | |
| "learning_rate": 1.565656565656566e-05, | |
| "loss": 3.3131, | |
| "mean_token_accuracy": 0.8389277309179306, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": 5.755626201629639, | |
| "learning_rate": 1.5555555555555555e-05, | |
| "loss": 2.9014, | |
| "mean_token_accuracy": 0.8483488708734512, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.776, | |
| "grad_norm": 5.480167865753174, | |
| "learning_rate": 1.5454545454545454e-05, | |
| "loss": 2.8085, | |
| "mean_token_accuracy": 0.8566093593835831, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 6.205705642700195, | |
| "learning_rate": 1.5353535353535354e-05, | |
| "loss": 3.0061, | |
| "mean_token_accuracy": 0.8438935428857803, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.792, | |
| "grad_norm": 6.319244384765625, | |
| "learning_rate": 1.5252525252525255e-05, | |
| "loss": 2.9846, | |
| "mean_token_accuracy": 0.8455973863601685, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 5.499710559844971, | |
| "learning_rate": 1.5151515151515153e-05, | |
| "loss": 3.0732, | |
| "mean_token_accuracy": 0.8468042612075806, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.808, | |
| "grad_norm": 5.381649494171143, | |
| "learning_rate": 1.505050505050505e-05, | |
| "loss": 3.4026, | |
| "mean_token_accuracy": 0.840883806347847, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 5.975292205810547, | |
| "learning_rate": 1.494949494949495e-05, | |
| "loss": 3.3919, | |
| "mean_token_accuracy": 0.8322191834449768, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.824, | |
| "grad_norm": 6.1554646492004395, | |
| "learning_rate": 1.484848484848485e-05, | |
| "loss": 2.9314, | |
| "mean_token_accuracy": 0.8548919409513474, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 5.832248210906982, | |
| "learning_rate": 1.4747474747474749e-05, | |
| "loss": 2.3767, | |
| "mean_token_accuracy": 0.8693199008703232, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 5.911506175994873, | |
| "learning_rate": 1.4646464646464647e-05, | |
| "loss": 2.7502, | |
| "mean_token_accuracy": 0.8517532050609589, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 6.765252113342285, | |
| "learning_rate": 1.4545454545454545e-05, | |
| "loss": 3.6221, | |
| "mean_token_accuracy": 0.8170515298843384, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.856, | |
| "grad_norm": 6.607264518737793, | |
| "learning_rate": 1.4444444444444444e-05, | |
| "loss": 3.1797, | |
| "mean_token_accuracy": 0.8341539800167084, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 6.446462154388428, | |
| "learning_rate": 1.4343434343434345e-05, | |
| "loss": 2.9366, | |
| "mean_token_accuracy": 0.8581055998802185, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.872, | |
| "grad_norm": 6.300421714782715, | |
| "learning_rate": 1.4242424242424243e-05, | |
| "loss": 3.3633, | |
| "mean_token_accuracy": 0.8499312251806259, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 5.600719451904297, | |
| "learning_rate": 1.4141414141414141e-05, | |
| "loss": 3.088, | |
| "mean_token_accuracy": 0.8488198220729828, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.888, | |
| "grad_norm": 5.980778217315674, | |
| "learning_rate": 1.404040404040404e-05, | |
| "loss": 2.9933, | |
| "mean_token_accuracy": 0.8443039804697037, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": 6.331740856170654, | |
| "learning_rate": 1.3939393939393942e-05, | |
| "loss": 2.9548, | |
| "mean_token_accuracy": 0.8464246839284897, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.904, | |
| "grad_norm": 6.095213890075684, | |
| "learning_rate": 1.383838383838384e-05, | |
| "loss": 3.4251, | |
| "mean_token_accuracy": 0.8375514298677444, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 6.7452239990234375, | |
| "learning_rate": 1.3737373737373737e-05, | |
| "loss": 3.3106, | |
| "mean_token_accuracy": 0.8348345905542374, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 5.934120178222656, | |
| "learning_rate": 1.3636363636363637e-05, | |
| "loss": 3.1045, | |
| "mean_token_accuracy": 0.8491770774126053, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.928, | |
| "grad_norm": 5.6627912521362305, | |
| "learning_rate": 1.3535353535353538e-05, | |
| "loss": 2.9255, | |
| "mean_token_accuracy": 0.8593737334012985, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.936, | |
| "grad_norm": 6.51497745513916, | |
| "learning_rate": 1.3434343434343436e-05, | |
| "loss": 3.2813, | |
| "mean_token_accuracy": 0.8357561677694321, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 6.286316394805908, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 2.8042, | |
| "mean_token_accuracy": 0.8635920882225037, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.952, | |
| "grad_norm": 6.30144739151001, | |
| "learning_rate": 1.3232323232323233e-05, | |
| "loss": 2.7913, | |
| "mean_token_accuracy": 0.8532185405492783, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 5.6404314041137695, | |
| "learning_rate": 1.3131313131313134e-05, | |
| "loss": 2.8048, | |
| "mean_token_accuracy": 0.8557418137788773, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.968, | |
| "grad_norm": 6.2682342529296875, | |
| "learning_rate": 1.3030303030303032e-05, | |
| "loss": 3.3507, | |
| "mean_token_accuracy": 0.8299184590578079, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 5.7995219230651855, | |
| "learning_rate": 1.292929292929293e-05, | |
| "loss": 2.9806, | |
| "mean_token_accuracy": 0.8536931127309799, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.984, | |
| "grad_norm": 6.288918972015381, | |
| "learning_rate": 1.2828282828282829e-05, | |
| "loss": 3.1404, | |
| "mean_token_accuracy": 0.8438131213188171, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.992, | |
| "grad_norm": 6.12652587890625, | |
| "learning_rate": 1.2727272727272727e-05, | |
| "loss": 3.1072, | |
| "mean_token_accuracy": 0.8395776003599167, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 6.006952285766602, | |
| "learning_rate": 1.2626262626262628e-05, | |
| "loss": 3.0308, | |
| "mean_token_accuracy": 0.8461359292268753, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.008, | |
| "grad_norm": 5.877007961273193, | |
| "learning_rate": 1.2525252525252526e-05, | |
| "loss": 2.7955, | |
| "mean_token_accuracy": 0.8688687086105347, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.016, | |
| "grad_norm": 5.722151756286621, | |
| "learning_rate": 1.2424242424242424e-05, | |
| "loss": 2.4909, | |
| "mean_token_accuracy": 0.8692611008882523, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.024, | |
| "grad_norm": 5.512819290161133, | |
| "learning_rate": 1.2323232323232325e-05, | |
| "loss": 2.5139, | |
| "mean_token_accuracy": 0.8634871691465378, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.032, | |
| "grad_norm": 5.516348838806152, | |
| "learning_rate": 1.2222222222222222e-05, | |
| "loss": 2.8559, | |
| "mean_token_accuracy": 0.8596174418926239, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 5.508405685424805, | |
| "learning_rate": 1.2121212121212122e-05, | |
| "loss": 2.5545, | |
| "mean_token_accuracy": 0.8677777200937271, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.048, | |
| "grad_norm": 6.354642868041992, | |
| "learning_rate": 1.202020202020202e-05, | |
| "loss": 3.0228, | |
| "mean_token_accuracy": 0.8556893318891525, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.056, | |
| "grad_norm": 6.423473834991455, | |
| "learning_rate": 1.1919191919191921e-05, | |
| "loss": 3.072, | |
| "mean_token_accuracy": 0.8429125100374222, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.064, | |
| "grad_norm": 5.950673580169678, | |
| "learning_rate": 1.1818181818181819e-05, | |
| "loss": 3.4449, | |
| "mean_token_accuracy": 0.8237967044115067, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.072, | |
| "grad_norm": 6.789544582366943, | |
| "learning_rate": 1.1717171717171718e-05, | |
| "loss": 2.8032, | |
| "mean_token_accuracy": 0.8490213006734848, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 5.8454413414001465, | |
| "learning_rate": 1.1616161616161616e-05, | |
| "loss": 2.7207, | |
| "mean_token_accuracy": 0.8605297058820724, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.088, | |
| "grad_norm": 6.446214199066162, | |
| "learning_rate": 1.1515151515151517e-05, | |
| "loss": 3.2414, | |
| "mean_token_accuracy": 0.8328104317188263, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.096, | |
| "grad_norm": 5.578056335449219, | |
| "learning_rate": 1.1414141414141415e-05, | |
| "loss": 2.8079, | |
| "mean_token_accuracy": 0.8580958545207977, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.104, | |
| "grad_norm": 5.806094169616699, | |
| "learning_rate": 1.1313131313131314e-05, | |
| "loss": 2.6591, | |
| "mean_token_accuracy": 0.8651483207941055, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.112, | |
| "grad_norm": 4.987120151519775, | |
| "learning_rate": 1.1212121212121212e-05, | |
| "loss": 2.2249, | |
| "mean_token_accuracy": 0.8830093741416931, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 6.069543361663818, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 2.8345, | |
| "mean_token_accuracy": 0.8573679178953171, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.128, | |
| "grad_norm": 6.578948974609375, | |
| "learning_rate": 1.1010101010101011e-05, | |
| "loss": 3.4663, | |
| "mean_token_accuracy": 0.8265634626150131, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.136, | |
| "grad_norm": 6.3034772872924805, | |
| "learning_rate": 1.0909090909090909e-05, | |
| "loss": 2.5849, | |
| "mean_token_accuracy": 0.8636100441217422, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.144, | |
| "grad_norm": 5.5531206130981445, | |
| "learning_rate": 1.0808080808080808e-05, | |
| "loss": 2.8643, | |
| "mean_token_accuracy": 0.8591476529836655, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.152, | |
| "grad_norm": 5.342994689941406, | |
| "learning_rate": 1.0707070707070708e-05, | |
| "loss": 2.4832, | |
| "mean_token_accuracy": 0.868215799331665, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 6.020049095153809, | |
| "learning_rate": 1.0606060606060607e-05, | |
| "loss": 2.7882, | |
| "mean_token_accuracy": 0.860662654042244, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.168, | |
| "grad_norm": 6.296903610229492, | |
| "learning_rate": 1.0505050505050505e-05, | |
| "loss": 2.6621, | |
| "mean_token_accuracy": 0.8558385968208313, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.176, | |
| "grad_norm": 5.6435980796813965, | |
| "learning_rate": 1.0404040404040405e-05, | |
| "loss": 2.5996, | |
| "mean_token_accuracy": 0.8638162761926651, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.184, | |
| "grad_norm": 5.6095123291015625, | |
| "learning_rate": 1.0303030303030304e-05, | |
| "loss": 2.7824, | |
| "mean_token_accuracy": 0.8573920726776123, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.192, | |
| "grad_norm": 5.77184534072876, | |
| "learning_rate": 1.0202020202020204e-05, | |
| "loss": 3.2982, | |
| "mean_token_accuracy": 0.8455280959606171, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 6.490416049957275, | |
| "learning_rate": 1.0101010101010101e-05, | |
| "loss": 2.8005, | |
| "mean_token_accuracy": 0.8535754829645157, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.208, | |
| "grad_norm": 5.768113613128662, | |
| "learning_rate": 1e-05, | |
| "loss": 2.5534, | |
| "mean_token_accuracy": 0.8649332523345947, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.216, | |
| "grad_norm": 6.118169784545898, | |
| "learning_rate": 9.898989898989899e-06, | |
| "loss": 2.8782, | |
| "mean_token_accuracy": 0.8546314835548401, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.224, | |
| "grad_norm": 5.9188456535339355, | |
| "learning_rate": 9.7979797979798e-06, | |
| "loss": 2.8495, | |
| "mean_token_accuracy": 0.8487619459629059, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.232, | |
| "grad_norm": 6.8953986167907715, | |
| "learning_rate": 9.696969696969698e-06, | |
| "loss": 3.2653, | |
| "mean_token_accuracy": 0.842998206615448, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 6.311713218688965, | |
| "learning_rate": 9.595959595959595e-06, | |
| "loss": 3.0002, | |
| "mean_token_accuracy": 0.8414562940597534, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.248, | |
| "grad_norm": 6.360631465911865, | |
| "learning_rate": 9.494949494949495e-06, | |
| "loss": 2.5226, | |
| "mean_token_accuracy": 0.8662195056676865, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.2560000000000002, | |
| "grad_norm": 6.032021999359131, | |
| "learning_rate": 9.393939393939394e-06, | |
| "loss": 2.4289, | |
| "mean_token_accuracy": 0.8762623816728592, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.2640000000000002, | |
| "grad_norm": 6.2928948402404785, | |
| "learning_rate": 9.292929292929294e-06, | |
| "loss": 2.4797, | |
| "mean_token_accuracy": 0.8561984449625015, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.2720000000000002, | |
| "grad_norm": 5.483267784118652, | |
| "learning_rate": 9.191919191919192e-06, | |
| "loss": 2.8567, | |
| "mean_token_accuracy": 0.8590980023145676, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "grad_norm": 6.180997848510742, | |
| "learning_rate": 9.090909090909091e-06, | |
| "loss": 2.6063, | |
| "mean_token_accuracy": 0.8648645430803299, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.288, | |
| "grad_norm": 6.12001371383667, | |
| "learning_rate": 8.98989898989899e-06, | |
| "loss": 2.6823, | |
| "mean_token_accuracy": 0.8513016700744629, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.296, | |
| "grad_norm": 5.99603796005249, | |
| "learning_rate": 8.88888888888889e-06, | |
| "loss": 3.0047, | |
| "mean_token_accuracy": 0.8510095775127411, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.304, | |
| "grad_norm": 5.749976634979248, | |
| "learning_rate": 8.787878787878788e-06, | |
| "loss": 2.9216, | |
| "mean_token_accuracy": 0.8540640473365784, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.312, | |
| "grad_norm": 6.270049095153809, | |
| "learning_rate": 8.686868686868687e-06, | |
| "loss": 3.0965, | |
| "mean_token_accuracy": 0.8414539396762848, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 5.532293319702148, | |
| "learning_rate": 8.585858585858587e-06, | |
| "loss": 3.0462, | |
| "mean_token_accuracy": 0.8508045822381973, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.328, | |
| "grad_norm": 6.463992118835449, | |
| "learning_rate": 8.484848484848486e-06, | |
| "loss": 2.7394, | |
| "mean_token_accuracy": 0.85701984167099, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.336, | |
| "grad_norm": 5.6121134757995605, | |
| "learning_rate": 8.383838383838384e-06, | |
| "loss": 2.7231, | |
| "mean_token_accuracy": 0.8531108349561691, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.344, | |
| "grad_norm": 6.132601737976074, | |
| "learning_rate": 8.282828282828283e-06, | |
| "loss": 3.1036, | |
| "mean_token_accuracy": 0.8383132815361023, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.352, | |
| "grad_norm": 6.347646713256836, | |
| "learning_rate": 8.181818181818183e-06, | |
| "loss": 2.5049, | |
| "mean_token_accuracy": 0.8660729080438614, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 5.851092338562012, | |
| "learning_rate": 8.080808080808082e-06, | |
| "loss": 2.5479, | |
| "mean_token_accuracy": 0.8598389625549316, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.368, | |
| "grad_norm": 5.76607084274292, | |
| "learning_rate": 7.97979797979798e-06, | |
| "loss": 3.1152, | |
| "mean_token_accuracy": 0.8485762178897858, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.376, | |
| "grad_norm": 6.1592607498168945, | |
| "learning_rate": 7.878787878787878e-06, | |
| "loss": 2.514, | |
| "mean_token_accuracy": 0.8623416870832443, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.384, | |
| "grad_norm": 6.193600654602051, | |
| "learning_rate": 7.777777777777777e-06, | |
| "loss": 2.9589, | |
| "mean_token_accuracy": 0.8406593501567841, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.392, | |
| "grad_norm": 5.8620100021362305, | |
| "learning_rate": 7.676767676767677e-06, | |
| "loss": 2.6917, | |
| "mean_token_accuracy": 0.8631896525621414, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 5.631361961364746, | |
| "learning_rate": 7.5757575757575764e-06, | |
| "loss": 2.6014, | |
| "mean_token_accuracy": 0.8725499212741852, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.408, | |
| "grad_norm": 6.087384223937988, | |
| "learning_rate": 7.474747474747475e-06, | |
| "loss": 2.5906, | |
| "mean_token_accuracy": 0.8593233972787857, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.416, | |
| "grad_norm": 6.2712016105651855, | |
| "learning_rate": 7.3737373737373745e-06, | |
| "loss": 2.7093, | |
| "mean_token_accuracy": 0.8598934262990952, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.424, | |
| "grad_norm": 5.651885986328125, | |
| "learning_rate": 7.272727272727272e-06, | |
| "loss": 2.6062, | |
| "mean_token_accuracy": 0.8617094457149506, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.432, | |
| "grad_norm": 5.506243705749512, | |
| "learning_rate": 7.171717171717173e-06, | |
| "loss": 2.5589, | |
| "mean_token_accuracy": 0.8667820692062378, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 6.425995349884033, | |
| "learning_rate": 7.0707070707070704e-06, | |
| "loss": 3.1032, | |
| "mean_token_accuracy": 0.8365529775619507, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.448, | |
| "grad_norm": 5.543388366699219, | |
| "learning_rate": 6.969696969696971e-06, | |
| "loss": 2.6756, | |
| "mean_token_accuracy": 0.8701014816761017, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.456, | |
| "grad_norm": 6.339065074920654, | |
| "learning_rate": 6.8686868686868685e-06, | |
| "loss": 2.5654, | |
| "mean_token_accuracy": 0.8716783672571182, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.464, | |
| "grad_norm": 6.019214153289795, | |
| "learning_rate": 6.767676767676769e-06, | |
| "loss": 3.3469, | |
| "mean_token_accuracy": 0.8329573720693588, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 3.472, | |
| "grad_norm": 6.126465797424316, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 2.5695, | |
| "mean_token_accuracy": 0.8654420524835587, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 5.879609107971191, | |
| "learning_rate": 6.565656565656567e-06, | |
| "loss": 2.5542, | |
| "mean_token_accuracy": 0.861924409866333, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.488, | |
| "grad_norm": 6.507324695587158, | |
| "learning_rate": 6.464646464646465e-06, | |
| "loss": 2.8441, | |
| "mean_token_accuracy": 0.84620201587677, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 3.496, | |
| "grad_norm": 6.121249675750732, | |
| "learning_rate": 6.363636363636363e-06, | |
| "loss": 2.2379, | |
| "mean_token_accuracy": 0.8786091357469559, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 3.504, | |
| "grad_norm": 5.940850734710693, | |
| "learning_rate": 6.262626262626263e-06, | |
| "loss": 2.5193, | |
| "mean_token_accuracy": 0.8697729557752609, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 3.512, | |
| "grad_norm": 6.618006229400635, | |
| "learning_rate": 6.161616161616162e-06, | |
| "loss": 3.0884, | |
| "mean_token_accuracy": 0.8440293669700623, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 6.300545692443848, | |
| "learning_rate": 6.060606060606061e-06, | |
| "loss": 2.9795, | |
| "mean_token_accuracy": 0.8421255350112915, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.528, | |
| "grad_norm": 6.084777355194092, | |
| "learning_rate": 5.9595959595959605e-06, | |
| "loss": 2.8443, | |
| "mean_token_accuracy": 0.8482642769813538, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.536, | |
| "grad_norm": 6.6309685707092285, | |
| "learning_rate": 5.858585858585859e-06, | |
| "loss": 2.7261, | |
| "mean_token_accuracy": 0.8591820150613785, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 3.544, | |
| "grad_norm": 5.60848331451416, | |
| "learning_rate": 5.7575757575757586e-06, | |
| "loss": 2.4302, | |
| "mean_token_accuracy": 0.8700147867202759, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 3.552, | |
| "grad_norm": 6.067654132843018, | |
| "learning_rate": 5.656565656565657e-06, | |
| "loss": 2.5163, | |
| "mean_token_accuracy": 0.8643026798963547, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 5.975160121917725, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 2.8837, | |
| "mean_token_accuracy": 0.8568368703126907, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.568, | |
| "grad_norm": 6.040992736816406, | |
| "learning_rate": 5.4545454545454545e-06, | |
| "loss": 2.3984, | |
| "mean_token_accuracy": 0.867580771446228, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 3.576, | |
| "grad_norm": 5.610777378082275, | |
| "learning_rate": 5.353535353535354e-06, | |
| "loss": 2.5171, | |
| "mean_token_accuracy": 0.8687434643507004, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 3.584, | |
| "grad_norm": 6.74979829788208, | |
| "learning_rate": 5.2525252525252526e-06, | |
| "loss": 2.5815, | |
| "mean_token_accuracy": 0.866498276591301, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 3.592, | |
| "grad_norm": 5.904205322265625, | |
| "learning_rate": 5.151515151515152e-06, | |
| "loss": 2.7649, | |
| "mean_token_accuracy": 0.8657421469688416, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 5.820155620574951, | |
| "learning_rate": 5.050505050505051e-06, | |
| "loss": 2.7525, | |
| "mean_token_accuracy": 0.8619928807020187, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.608, | |
| "grad_norm": 6.589330196380615, | |
| "learning_rate": 4.949494949494949e-06, | |
| "loss": 2.287, | |
| "mean_token_accuracy": 0.8719237148761749, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 3.616, | |
| "grad_norm": 6.643290996551514, | |
| "learning_rate": 4.848484848484849e-06, | |
| "loss": 3.4465, | |
| "mean_token_accuracy": 0.841964989900589, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 3.624, | |
| "grad_norm": 6.711119174957275, | |
| "learning_rate": 4.747474747474747e-06, | |
| "loss": 2.8413, | |
| "mean_token_accuracy": 0.856516107916832, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 3.632, | |
| "grad_norm": 5.4684343338012695, | |
| "learning_rate": 4.646464646464647e-06, | |
| "loss": 2.4144, | |
| "mean_token_accuracy": 0.8840565979480743, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 5.740659236907959, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "loss": 2.6323, | |
| "mean_token_accuracy": 0.8655981123447418, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.648, | |
| "grad_norm": 6.271495819091797, | |
| "learning_rate": 4.444444444444445e-06, | |
| "loss": 3.0839, | |
| "mean_token_accuracy": 0.8419709354639053, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 3.656, | |
| "grad_norm": 5.493051052093506, | |
| "learning_rate": 4.343434343434344e-06, | |
| "loss": 2.1636, | |
| "mean_token_accuracy": 0.8905516117811203, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 3.664, | |
| "grad_norm": 6.861065864562988, | |
| "learning_rate": 4.242424242424243e-06, | |
| "loss": 2.9378, | |
| "mean_token_accuracy": 0.8521191477775574, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 3.672, | |
| "grad_norm": 6.059913158416748, | |
| "learning_rate": 4.141414141414142e-06, | |
| "loss": 2.7574, | |
| "mean_token_accuracy": 0.8573954999446869, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 6.36575174331665, | |
| "learning_rate": 4.040404040404041e-06, | |
| "loss": 2.8077, | |
| "mean_token_accuracy": 0.8544747680425644, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.6879999999999997, | |
| "grad_norm": 6.40785026550293, | |
| "learning_rate": 3.939393939393939e-06, | |
| "loss": 2.9, | |
| "mean_token_accuracy": 0.8473234623670578, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 3.6959999999999997, | |
| "grad_norm": 6.139939785003662, | |
| "learning_rate": 3.8383838383838385e-06, | |
| "loss": 2.4719, | |
| "mean_token_accuracy": 0.8682427853345871, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 3.7039999999999997, | |
| "grad_norm": 6.098149299621582, | |
| "learning_rate": 3.7373737373737375e-06, | |
| "loss": 2.7453, | |
| "mean_token_accuracy": 0.8645330965518951, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 3.7119999999999997, | |
| "grad_norm": 6.3011884689331055, | |
| "learning_rate": 3.636363636363636e-06, | |
| "loss": 2.7872, | |
| "mean_token_accuracy": 0.8514862060546875, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.7199999999999998, | |
| "grad_norm": 6.186530590057373, | |
| "learning_rate": 3.5353535353535352e-06, | |
| "loss": 3.1428, | |
| "mean_token_accuracy": 0.8448829352855682, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.7279999999999998, | |
| "grad_norm": 7.000403881072998, | |
| "learning_rate": 3.4343434343434343e-06, | |
| "loss": 3.5395, | |
| "mean_token_accuracy": 0.8234449177980423, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.7359999999999998, | |
| "grad_norm": 6.087681770324707, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 3.0334, | |
| "mean_token_accuracy": 0.8479331731796265, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.7439999999999998, | |
| "grad_norm": 5.989120960235596, | |
| "learning_rate": 3.2323232323232324e-06, | |
| "loss": 2.7067, | |
| "mean_token_accuracy": 0.866983637213707, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.752, | |
| "grad_norm": 6.254819869995117, | |
| "learning_rate": 3.1313131313131314e-06, | |
| "loss": 2.3856, | |
| "mean_token_accuracy": 0.8707826137542725, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 6.099485874176025, | |
| "learning_rate": 3.0303030303030305e-06, | |
| "loss": 2.8972, | |
| "mean_token_accuracy": 0.8589896708726883, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.768, | |
| "grad_norm": 6.424018383026123, | |
| "learning_rate": 2.9292929292929295e-06, | |
| "loss": 2.7964, | |
| "mean_token_accuracy": 0.8504993915557861, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.776, | |
| "grad_norm": 5.645679950714111, | |
| "learning_rate": 2.8282828282828286e-06, | |
| "loss": 2.4383, | |
| "mean_token_accuracy": 0.8685364574193954, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.784, | |
| "grad_norm": 6.7141523361206055, | |
| "learning_rate": 2.7272727272727272e-06, | |
| "loss": 2.8149, | |
| "mean_token_accuracy": 0.8539319187402725, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.792, | |
| "grad_norm": 6.887962818145752, | |
| "learning_rate": 2.6262626262626263e-06, | |
| "loss": 2.6289, | |
| "mean_token_accuracy": 0.8702027946710587, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 6.338537216186523, | |
| "learning_rate": 2.5252525252525253e-06, | |
| "loss": 3.0126, | |
| "mean_token_accuracy": 0.8451626300811768, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.808, | |
| "grad_norm": 6.215012550354004, | |
| "learning_rate": 2.4242424242424244e-06, | |
| "loss": 3.0838, | |
| "mean_token_accuracy": 0.853987067937851, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.816, | |
| "grad_norm": 6.2042999267578125, | |
| "learning_rate": 2.3232323232323234e-06, | |
| "loss": 2.962, | |
| "mean_token_accuracy": 0.8471638411283493, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.824, | |
| "grad_norm": 5.406651020050049, | |
| "learning_rate": 2.2222222222222225e-06, | |
| "loss": 2.4559, | |
| "mean_token_accuracy": 0.872399777173996, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.832, | |
| "grad_norm": 6.384584426879883, | |
| "learning_rate": 2.1212121212121216e-06, | |
| "loss": 2.7063, | |
| "mean_token_accuracy": 0.8628278374671936, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 6.349820137023926, | |
| "learning_rate": 2.0202020202020206e-06, | |
| "loss": 2.3293, | |
| "mean_token_accuracy": 0.8819480836391449, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.848, | |
| "grad_norm": 5.694690227508545, | |
| "learning_rate": 1.9191919191919192e-06, | |
| "loss": 2.554, | |
| "mean_token_accuracy": 0.8645013719797134, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.856, | |
| "grad_norm": 6.480876445770264, | |
| "learning_rate": 1.818181818181818e-06, | |
| "loss": 3.0091, | |
| "mean_token_accuracy": 0.8551051169633865, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.864, | |
| "grad_norm": 6.149241924285889, | |
| "learning_rate": 1.7171717171717171e-06, | |
| "loss": 3.0215, | |
| "mean_token_accuracy": 0.8434359133243561, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.872, | |
| "grad_norm": 5.84627103805542, | |
| "learning_rate": 1.6161616161616162e-06, | |
| "loss": 2.7796, | |
| "mean_token_accuracy": 0.8535165041685104, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 6.468245506286621, | |
| "learning_rate": 1.5151515151515152e-06, | |
| "loss": 2.8329, | |
| "mean_token_accuracy": 0.8529188930988312, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.888, | |
| "grad_norm": 6.755955219268799, | |
| "learning_rate": 1.4141414141414143e-06, | |
| "loss": 3.3661, | |
| "mean_token_accuracy": 0.831203356385231, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.896, | |
| "grad_norm": 6.618368148803711, | |
| "learning_rate": 1.3131313131313131e-06, | |
| "loss": 2.9949, | |
| "mean_token_accuracy": 0.8433407545089722, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.904, | |
| "grad_norm": 6.448158264160156, | |
| "learning_rate": 1.2121212121212122e-06, | |
| "loss": 2.6009, | |
| "mean_token_accuracy": 0.8493270874023438, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.912, | |
| "grad_norm": 6.494684219360352, | |
| "learning_rate": 1.1111111111111112e-06, | |
| "loss": 2.8263, | |
| "mean_token_accuracy": 0.847339928150177, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 6.943580150604248, | |
| "learning_rate": 1.0101010101010103e-06, | |
| "loss": 3.0355, | |
| "mean_token_accuracy": 0.8504604995250702, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.928, | |
| "grad_norm": 5.839914798736572, | |
| "learning_rate": 9.09090909090909e-07, | |
| "loss": 2.6543, | |
| "mean_token_accuracy": 0.8537915647029877, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 3.936, | |
| "grad_norm": 5.800342082977295, | |
| "learning_rate": 8.080808080808081e-07, | |
| "loss": 2.4919, | |
| "mean_token_accuracy": 0.8684473484754562, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 3.944, | |
| "grad_norm": 6.721318244934082, | |
| "learning_rate": 7.070707070707071e-07, | |
| "loss": 2.7316, | |
| "mean_token_accuracy": 0.8631936460733414, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 3.952, | |
| "grad_norm": 5.557168960571289, | |
| "learning_rate": 6.060606060606061e-07, | |
| "loss": 2.7026, | |
| "mean_token_accuracy": 0.8652904033660889, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 6.060300350189209, | |
| "learning_rate": 5.050505050505052e-07, | |
| "loss": 2.8709, | |
| "mean_token_accuracy": 0.854452446103096, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 3.968, | |
| "grad_norm": 5.997809886932373, | |
| "learning_rate": 4.0404040404040405e-07, | |
| "loss": 2.5419, | |
| "mean_token_accuracy": 0.8659024238586426, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 3.976, | |
| "grad_norm": 5.681960105895996, | |
| "learning_rate": 3.0303030303030305e-07, | |
| "loss": 2.7244, | |
| "mean_token_accuracy": 0.8663401901721954, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 3.984, | |
| "grad_norm": 8.17302417755127, | |
| "learning_rate": 2.0202020202020202e-07, | |
| "loss": 3.1581, | |
| "mean_token_accuracy": 0.8333937376737595, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 3.992, | |
| "grad_norm": 6.531320095062256, | |
| "learning_rate": 1.0101010101010101e-07, | |
| "loss": 3.0756, | |
| "mean_token_accuracy": 0.8531923592090607, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 6.187903881072998, | |
| "learning_rate": 0.0, | |
| "loss": 2.5067, | |
| "mean_token_accuracy": 0.8733052164316177, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2610056134656000.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |