| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 34.44804382324219, | |
| "learning_rate": 1e-05, | |
| "loss": 13.0101, | |
| "mean_token_accuracy": 0.4696590006351471, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 30.779788970947266, | |
| "learning_rate": 2e-05, | |
| "loss": 12.3851, | |
| "mean_token_accuracy": 0.47303473204374313, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 29.67559242248535, | |
| "learning_rate": 3e-05, | |
| "loss": 12.3488, | |
| "mean_token_accuracy": 0.49709559231996536, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 26.862010955810547, | |
| "learning_rate": 4e-05, | |
| "loss": 11.6596, | |
| "mean_token_accuracy": 0.5584611147642136, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 22.10072135925293, | |
| "learning_rate": 5e-05, | |
| "loss": 10.1384, | |
| "mean_token_accuracy": 0.5924926251173019, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 20.171361923217773, | |
| "learning_rate": 4.98989898989899e-05, | |
| "loss": 9.5421, | |
| "mean_token_accuracy": 0.5888276249170303, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 16.452842712402344, | |
| "learning_rate": 4.97979797979798e-05, | |
| "loss": 8.4344, | |
| "mean_token_accuracy": 0.632336363196373, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 12.62137222290039, | |
| "learning_rate": 4.9696969696969694e-05, | |
| "loss": 7.7758, | |
| "mean_token_accuracy": 0.6633019298315048, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 11.533007621765137, | |
| "learning_rate": 4.9595959595959594e-05, | |
| "loss": 7.5523, | |
| "mean_token_accuracy": 0.6721473336219788, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 9.251978874206543, | |
| "learning_rate": 4.94949494949495e-05, | |
| "loss": 7.5504, | |
| "mean_token_accuracy": 0.6924590468406677, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 9.099457740783691, | |
| "learning_rate": 4.93939393939394e-05, | |
| "loss": 7.1883, | |
| "mean_token_accuracy": 0.694612979888916, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 9.096614837646484, | |
| "learning_rate": 4.92929292929293e-05, | |
| "loss": 6.7714, | |
| "mean_token_accuracy": 0.7158520817756653, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 8.9071044921875, | |
| "learning_rate": 4.919191919191919e-05, | |
| "loss": 6.7582, | |
| "mean_token_accuracy": 0.7046481072902679, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 8.336892127990723, | |
| "learning_rate": 4.909090909090909e-05, | |
| "loss": 6.639, | |
| "mean_token_accuracy": 0.6894638538360596, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 7.760809421539307, | |
| "learning_rate": 4.898989898989899e-05, | |
| "loss": 6.1283, | |
| "mean_token_accuracy": 0.7323453426361084, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 10.2608003616333, | |
| "learning_rate": 4.888888888888889e-05, | |
| "loss": 6.3731, | |
| "mean_token_accuracy": 0.7066609710454941, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 9.326302528381348, | |
| "learning_rate": 4.878787878787879e-05, | |
| "loss": 6.0298, | |
| "mean_token_accuracy": 0.7226346284151077, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 9.35575008392334, | |
| "learning_rate": 4.868686868686869e-05, | |
| "loss": 6.1363, | |
| "mean_token_accuracy": 0.7228655517101288, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 7.2700018882751465, | |
| "learning_rate": 4.858585858585859e-05, | |
| "loss": 6.3529, | |
| "mean_token_accuracy": 0.7191445082426071, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.965806484222412, | |
| "learning_rate": 4.848484848484849e-05, | |
| "loss": 6.1908, | |
| "mean_token_accuracy": 0.7225745767354965, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 9.064255714416504, | |
| "learning_rate": 4.838383838383839e-05, | |
| "loss": 6.0033, | |
| "mean_token_accuracy": 0.7359140962362289, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 6.869060039520264, | |
| "learning_rate": 4.828282828282829e-05, | |
| "loss": 5.1424, | |
| "mean_token_accuracy": 0.7650310397148132, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 7.401363372802734, | |
| "learning_rate": 4.8181818181818186e-05, | |
| "loss": 6.2128, | |
| "mean_token_accuracy": 0.7163093239068985, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 7.535747051239014, | |
| "learning_rate": 4.808080808080808e-05, | |
| "loss": 5.561, | |
| "mean_token_accuracy": 0.7451221346855164, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 8.211135864257812, | |
| "learning_rate": 4.797979797979798e-05, | |
| "loss": 5.265, | |
| "mean_token_accuracy": 0.7609298378229141, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 6.720511436462402, | |
| "learning_rate": 4.787878787878788e-05, | |
| "loss": 5.3244, | |
| "mean_token_accuracy": 0.7525999993085861, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 9.114042282104492, | |
| "learning_rate": 4.7777777777777784e-05, | |
| "loss": 5.6173, | |
| "mean_token_accuracy": 0.7520534992218018, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 7.770754337310791, | |
| "learning_rate": 4.7676767676767684e-05, | |
| "loss": 5.5093, | |
| "mean_token_accuracy": 0.7558625787496567, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 7.749612808227539, | |
| "learning_rate": 4.7575757575757576e-05, | |
| "loss": 5.5386, | |
| "mean_token_accuracy": 0.7553819268941879, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 7.486739635467529, | |
| "learning_rate": 4.7474747474747476e-05, | |
| "loss": 5.73, | |
| "mean_token_accuracy": 0.732807844877243, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 7.474062919616699, | |
| "learning_rate": 4.7373737373737375e-05, | |
| "loss": 6.0216, | |
| "mean_token_accuracy": 0.742279127240181, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 7.018927574157715, | |
| "learning_rate": 4.7272727272727275e-05, | |
| "loss": 5.0907, | |
| "mean_token_accuracy": 0.7746435850858688, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 6.719446659088135, | |
| "learning_rate": 4.7171717171717174e-05, | |
| "loss": 4.9967, | |
| "mean_token_accuracy": 0.7717019468545914, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 6.464803695678711, | |
| "learning_rate": 4.7070707070707074e-05, | |
| "loss": 5.2966, | |
| "mean_token_accuracy": 0.7656355202198029, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.2371320724487305, | |
| "learning_rate": 4.696969696969697e-05, | |
| "loss": 5.6333, | |
| "mean_token_accuracy": 0.7543937265872955, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 6.18785285949707, | |
| "learning_rate": 4.686868686868687e-05, | |
| "loss": 5.3846, | |
| "mean_token_accuracy": 0.7626298367977142, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 5.636417865753174, | |
| "learning_rate": 4.676767676767677e-05, | |
| "loss": 4.7779, | |
| "mean_token_accuracy": 0.796937882900238, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 6.951849460601807, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 5.3062, | |
| "mean_token_accuracy": 0.7514433115720749, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 7.596467018127441, | |
| "learning_rate": 4.656565656565657e-05, | |
| "loss": 5.4916, | |
| "mean_token_accuracy": 0.738706648349762, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.036402225494385, | |
| "learning_rate": 4.6464646464646464e-05, | |
| "loss": 5.4346, | |
| "mean_token_accuracy": 0.7636198997497559, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 7.065268039703369, | |
| "learning_rate": 4.636363636363636e-05, | |
| "loss": 4.9636, | |
| "mean_token_accuracy": 0.76950503885746, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 5.93984842300415, | |
| "learning_rate": 4.626262626262626e-05, | |
| "loss": 4.9883, | |
| "mean_token_accuracy": 0.7728704810142517, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 6.929603576660156, | |
| "learning_rate": 4.616161616161616e-05, | |
| "loss": 4.6927, | |
| "mean_token_accuracy": 0.7756397873163223, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 7.2813496589660645, | |
| "learning_rate": 4.606060606060607e-05, | |
| "loss": 4.761, | |
| "mean_token_accuracy": 0.784713864326477, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.584592342376709, | |
| "learning_rate": 4.595959595959596e-05, | |
| "loss": 4.7331, | |
| "mean_token_accuracy": 0.787657305598259, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 7.5701751708984375, | |
| "learning_rate": 4.585858585858586e-05, | |
| "loss": 4.1914, | |
| "mean_token_accuracy": 0.796615332365036, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 6.217738628387451, | |
| "learning_rate": 4.575757575757576e-05, | |
| "loss": 4.6948, | |
| "mean_token_accuracy": 0.7924428284168243, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 6.909704685211182, | |
| "learning_rate": 4.565656565656566e-05, | |
| "loss": 5.0747, | |
| "mean_token_accuracy": 0.7648352980613708, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 7.039934158325195, | |
| "learning_rate": 4.555555555555556e-05, | |
| "loss": 5.168, | |
| "mean_token_accuracy": 0.7628057301044464, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 7.007484436035156, | |
| "learning_rate": 4.545454545454546e-05, | |
| "loss": 5.5139, | |
| "mean_token_accuracy": 0.7507320195436478, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 7.163206100463867, | |
| "learning_rate": 4.535353535353535e-05, | |
| "loss": 4.6769, | |
| "mean_token_accuracy": 0.789847657084465, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 6.482788562774658, | |
| "learning_rate": 4.525252525252526e-05, | |
| "loss": 4.0523, | |
| "mean_token_accuracy": 0.8091136366128922, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 6.3435564041137695, | |
| "learning_rate": 4.515151515151516e-05, | |
| "loss": 5.0074, | |
| "mean_token_accuracy": 0.7676153928041458, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 6.715310096740723, | |
| "learning_rate": 4.5050505050505056e-05, | |
| "loss": 4.7588, | |
| "mean_token_accuracy": 0.7704059928655624, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 6.287456512451172, | |
| "learning_rate": 4.494949494949495e-05, | |
| "loss": 5.0595, | |
| "mean_token_accuracy": 0.7789322882890701, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 6.842898845672607, | |
| "learning_rate": 4.484848484848485e-05, | |
| "loss": 4.3176, | |
| "mean_token_accuracy": 0.7993331402540207, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 5.992519378662109, | |
| "learning_rate": 4.474747474747475e-05, | |
| "loss": 5.0297, | |
| "mean_token_accuracy": 0.779092013835907, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 6.67594051361084, | |
| "learning_rate": 4.464646464646465e-05, | |
| "loss": 4.1185, | |
| "mean_token_accuracy": 0.7971874326467514, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 6.712916374206543, | |
| "learning_rate": 4.454545454545455e-05, | |
| "loss": 4.6345, | |
| "mean_token_accuracy": 0.7811515778303146, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 6.204624652862549, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 4.5869, | |
| "mean_token_accuracy": 0.8028187602758408, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 7.798375129699707, | |
| "learning_rate": 4.4343434343434346e-05, | |
| "loss": 5.2535, | |
| "mean_token_accuracy": 0.7713681906461716, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 6.647665500640869, | |
| "learning_rate": 4.4242424242424246e-05, | |
| "loss": 4.1662, | |
| "mean_token_accuracy": 0.809452161192894, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 7.158292770385742, | |
| "learning_rate": 4.4141414141414145e-05, | |
| "loss": 4.5907, | |
| "mean_token_accuracy": 0.7843180149793625, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 6.763492107391357, | |
| "learning_rate": 4.4040404040404044e-05, | |
| "loss": 4.6913, | |
| "mean_token_accuracy": 0.7733302861452103, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 6.745001792907715, | |
| "learning_rate": 4.3939393939393944e-05, | |
| "loss": 4.7438, | |
| "mean_token_accuracy": 0.7817106246948242, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 5.92717981338501, | |
| "learning_rate": 4.383838383838384e-05, | |
| "loss": 4.3055, | |
| "mean_token_accuracy": 0.7931149005889893, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 7.291048049926758, | |
| "learning_rate": 4.3737373737373736e-05, | |
| "loss": 4.7281, | |
| "mean_token_accuracy": 0.7924042791128159, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 6.6944899559021, | |
| "learning_rate": 4.3636363636363636e-05, | |
| "loss": 5.4566, | |
| "mean_token_accuracy": 0.7465656846761703, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 6.43382453918457, | |
| "learning_rate": 4.3535353535353535e-05, | |
| "loss": 5.1924, | |
| "mean_token_accuracy": 0.765620231628418, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 7.896528244018555, | |
| "learning_rate": 4.343434343434344e-05, | |
| "loss": 4.535, | |
| "mean_token_accuracy": 0.780782625079155, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 7.0929059982299805, | |
| "learning_rate": 4.3333333333333334e-05, | |
| "loss": 4.7798, | |
| "mean_token_accuracy": 0.7812370210886002, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 6.4265031814575195, | |
| "learning_rate": 4.3232323232323234e-05, | |
| "loss": 4.0221, | |
| "mean_token_accuracy": 0.8055804222822189, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 6.464663982391357, | |
| "learning_rate": 4.313131313131313e-05, | |
| "loss": 5.3328, | |
| "mean_token_accuracy": 0.7507916688919067, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 6.163589000701904, | |
| "learning_rate": 4.303030303030303e-05, | |
| "loss": 3.8647, | |
| "mean_token_accuracy": 0.8181672990322113, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 6.442709445953369, | |
| "learning_rate": 4.292929292929293e-05, | |
| "loss": 4.1589, | |
| "mean_token_accuracy": 0.7894054055213928, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 6.7371673583984375, | |
| "learning_rate": 4.282828282828283e-05, | |
| "loss": 4.4978, | |
| "mean_token_accuracy": 0.7749454975128174, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 6.681241035461426, | |
| "learning_rate": 4.2727272727272724e-05, | |
| "loss": 4.3404, | |
| "mean_token_accuracy": 0.8082751482725143, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 6.194783687591553, | |
| "learning_rate": 4.262626262626263e-05, | |
| "loss": 4.1043, | |
| "mean_token_accuracy": 0.7980503439903259, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 6.758218288421631, | |
| "learning_rate": 4.252525252525253e-05, | |
| "loss": 4.4278, | |
| "mean_token_accuracy": 0.7885774075984955, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 7.5866546630859375, | |
| "learning_rate": 4.242424242424243e-05, | |
| "loss": 4.227, | |
| "mean_token_accuracy": 0.7932698577642441, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 7.192435264587402, | |
| "learning_rate": 4.232323232323233e-05, | |
| "loss": 4.0345, | |
| "mean_token_accuracy": 0.8199726790189743, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 5.501450061798096, | |
| "learning_rate": 4.222222222222222e-05, | |
| "loss": 4.2629, | |
| "mean_token_accuracy": 0.7981265485286713, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 6.11761474609375, | |
| "learning_rate": 4.212121212121212e-05, | |
| "loss": 4.2, | |
| "mean_token_accuracy": 0.8049240857362747, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 6.632277011871338, | |
| "learning_rate": 4.202020202020202e-05, | |
| "loss": 4.5717, | |
| "mean_token_accuracy": 0.784144937992096, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 5.618972301483154, | |
| "learning_rate": 4.191919191919192e-05, | |
| "loss": 4.1424, | |
| "mean_token_accuracy": 0.8174345791339874, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 6.348287105560303, | |
| "learning_rate": 4.181818181818182e-05, | |
| "loss": 3.9289, | |
| "mean_token_accuracy": 0.8103939890861511, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 6.326577186584473, | |
| "learning_rate": 4.171717171717172e-05, | |
| "loss": 4.4158, | |
| "mean_token_accuracy": 0.7934867739677429, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 5.938681602478027, | |
| "learning_rate": 4.161616161616162e-05, | |
| "loss": 4.3607, | |
| "mean_token_accuracy": 0.8023031949996948, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 6.043774604797363, | |
| "learning_rate": 4.151515151515152e-05, | |
| "loss": 4.8988, | |
| "mean_token_accuracy": 0.7769377380609512, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 6.513429641723633, | |
| "learning_rate": 4.141414141414142e-05, | |
| "loss": 4.469, | |
| "mean_token_accuracy": 0.7987037748098373, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 6.306874752044678, | |
| "learning_rate": 4.131313131313132e-05, | |
| "loss": 3.8623, | |
| "mean_token_accuracy": 0.8089967370033264, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 6.43092155456543, | |
| "learning_rate": 4.1212121212121216e-05, | |
| "loss": 4.1141, | |
| "mean_token_accuracy": 0.8077200204133987, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 5.928807735443115, | |
| "learning_rate": 4.111111111111111e-05, | |
| "loss": 4.379, | |
| "mean_token_accuracy": 0.7994053959846497, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 6.193979740142822, | |
| "learning_rate": 4.101010101010101e-05, | |
| "loss": 4.0719, | |
| "mean_token_accuracy": 0.7951067984104156, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 5.613356113433838, | |
| "learning_rate": 4.0909090909090915e-05, | |
| "loss": 3.9982, | |
| "mean_token_accuracy": 0.8044578582048416, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 5.828372478485107, | |
| "learning_rate": 4.0808080808080814e-05, | |
| "loss": 3.9207, | |
| "mean_token_accuracy": 0.804252415895462, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 6.8737993240356445, | |
| "learning_rate": 4.070707070707071e-05, | |
| "loss": 4.018, | |
| "mean_token_accuracy": 0.7971819192171097, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 6.014230251312256, | |
| "learning_rate": 4.0606060606060606e-05, | |
| "loss": 3.9449, | |
| "mean_token_accuracy": 0.8023800402879715, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 6.424180030822754, | |
| "learning_rate": 4.0505050505050506e-05, | |
| "loss": 4.133, | |
| "mean_token_accuracy": 0.7898548692464828, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 5.941647529602051, | |
| "learning_rate": 4.0404040404040405e-05, | |
| "loss": 4.4152, | |
| "mean_token_accuracy": 0.7944744974374771, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 5.532174110412598, | |
| "learning_rate": 4.0303030303030305e-05, | |
| "loss": 3.9543, | |
| "mean_token_accuracy": 0.8119575679302216, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 6.766019344329834, | |
| "learning_rate": 4.0202020202020204e-05, | |
| "loss": 3.8357, | |
| "mean_token_accuracy": 0.8240242004394531, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 5.829742908477783, | |
| "learning_rate": 4.01010101010101e-05, | |
| "loss": 4.1291, | |
| "mean_token_accuracy": 0.798798069357872, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 6.3326416015625, | |
| "learning_rate": 4e-05, | |
| "loss": 4.5605, | |
| "mean_token_accuracy": 0.7854688614606857, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 5.9824042320251465, | |
| "learning_rate": 3.98989898989899e-05, | |
| "loss": 4.0588, | |
| "mean_token_accuracy": 0.8019276112318039, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 6.245088577270508, | |
| "learning_rate": 3.97979797979798e-05, | |
| "loss": 3.9216, | |
| "mean_token_accuracy": 0.8176022469997406, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 6.343690395355225, | |
| "learning_rate": 3.96969696969697e-05, | |
| "loss": 4.5661, | |
| "mean_token_accuracy": 0.7900134176015854, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 6.176941394805908, | |
| "learning_rate": 3.9595959595959594e-05, | |
| "loss": 4.3509, | |
| "mean_token_accuracy": 0.7881277054548264, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 5.415067672729492, | |
| "learning_rate": 3.9494949494949494e-05, | |
| "loss": 3.9209, | |
| "mean_token_accuracy": 0.8084693402051926, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 6.951507568359375, | |
| "learning_rate": 3.939393939393939e-05, | |
| "loss": 3.8214, | |
| "mean_token_accuracy": 0.8184851258993149, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 6.3500213623046875, | |
| "learning_rate": 3.929292929292929e-05, | |
| "loss": 4.445, | |
| "mean_token_accuracy": 0.7920354455709457, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 6.108852386474609, | |
| "learning_rate": 3.91919191919192e-05, | |
| "loss": 3.9563, | |
| "mean_token_accuracy": 0.8090476542711258, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 5.749330043792725, | |
| "learning_rate": 3.909090909090909e-05, | |
| "loss": 3.7184, | |
| "mean_token_accuracy": 0.8214946985244751, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 6.137975692749023, | |
| "learning_rate": 3.898989898989899e-05, | |
| "loss": 3.8174, | |
| "mean_token_accuracy": 0.8351726979017258, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 5.827854156494141, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 4.1516, | |
| "mean_token_accuracy": 0.8008674532175064, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 6.231594562530518, | |
| "learning_rate": 3.878787878787879e-05, | |
| "loss": 3.9401, | |
| "mean_token_accuracy": 0.8067153543233871, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 6.3449578285217285, | |
| "learning_rate": 3.868686868686869e-05, | |
| "loss": 4.5777, | |
| "mean_token_accuracy": 0.7864043414592743, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 5.758909702301025, | |
| "learning_rate": 3.858585858585859e-05, | |
| "loss": 3.9313, | |
| "mean_token_accuracy": 0.8202812224626541, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 5.801164627075195, | |
| "learning_rate": 3.848484848484848e-05, | |
| "loss": 3.4612, | |
| "mean_token_accuracy": 0.8463838249444962, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 6.331445217132568, | |
| "learning_rate": 3.838383838383838e-05, | |
| "loss": 4.1784, | |
| "mean_token_accuracy": 0.8084011971950531, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 5.821787357330322, | |
| "learning_rate": 3.828282828282829e-05, | |
| "loss": 4.0625, | |
| "mean_token_accuracy": 0.8006434291601181, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 6.507475852966309, | |
| "learning_rate": 3.818181818181819e-05, | |
| "loss": 3.7078, | |
| "mean_token_accuracy": 0.8223373293876648, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 5.8519511222839355, | |
| "learning_rate": 3.8080808080808087e-05, | |
| "loss": 3.8093, | |
| "mean_token_accuracy": 0.8161063343286514, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 6.515493869781494, | |
| "learning_rate": 3.797979797979798e-05, | |
| "loss": 3.8881, | |
| "mean_token_accuracy": 0.8130738884210587, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 5.974099159240723, | |
| "learning_rate": 3.787878787878788e-05, | |
| "loss": 3.8128, | |
| "mean_token_accuracy": 0.817224845290184, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 5.158608436584473, | |
| "learning_rate": 3.777777777777778e-05, | |
| "loss": 3.5407, | |
| "mean_token_accuracy": 0.8298159688711166, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.016, | |
| "grad_norm": 5.91457462310791, | |
| "learning_rate": 3.767676767676768e-05, | |
| "loss": 3.6346, | |
| "mean_token_accuracy": 0.8155378848314285, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 5.550519943237305, | |
| "learning_rate": 3.757575757575758e-05, | |
| "loss": 3.8566, | |
| "mean_token_accuracy": 0.8141728788614273, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": 5.610849380493164, | |
| "learning_rate": 3.747474747474748e-05, | |
| "loss": 3.785, | |
| "mean_token_accuracy": 0.811878427863121, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 6.2908854484558105, | |
| "learning_rate": 3.7373737373737376e-05, | |
| "loss": 3.0106, | |
| "mean_token_accuracy": 0.8458054810762405, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.048, | |
| "grad_norm": 5.267597675323486, | |
| "learning_rate": 3.7272727272727276e-05, | |
| "loss": 3.4967, | |
| "mean_token_accuracy": 0.8339269608259201, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 5.37116003036499, | |
| "learning_rate": 3.7171717171717175e-05, | |
| "loss": 3.4903, | |
| "mean_token_accuracy": 0.8220857381820679, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.064, | |
| "grad_norm": 5.198507785797119, | |
| "learning_rate": 3.7070707070707075e-05, | |
| "loss": 2.9143, | |
| "mean_token_accuracy": 0.8564625233411789, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 5.638608932495117, | |
| "learning_rate": 3.6969696969696974e-05, | |
| "loss": 3.6283, | |
| "mean_token_accuracy": 0.8230338096618652, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 5.533100605010986, | |
| "learning_rate": 3.686868686868687e-05, | |
| "loss": 3.3163, | |
| "mean_token_accuracy": 0.8364241421222687, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 5.794349670410156, | |
| "learning_rate": 3.6767676767676766e-05, | |
| "loss": 3.4776, | |
| "mean_token_accuracy": 0.822014644742012, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.096, | |
| "grad_norm": 6.230714797973633, | |
| "learning_rate": 3.6666666666666666e-05, | |
| "loss": 3.9834, | |
| "mean_token_accuracy": 0.808947280049324, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 5.761725902557373, | |
| "learning_rate": 3.656565656565657e-05, | |
| "loss": 3.5205, | |
| "mean_token_accuracy": 0.8236398249864578, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.112, | |
| "grad_norm": 5.620357990264893, | |
| "learning_rate": 3.6464646464646465e-05, | |
| "loss": 3.7957, | |
| "mean_token_accuracy": 0.8184810876846313, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 5.732111930847168, | |
| "learning_rate": 3.6363636363636364e-05, | |
| "loss": 3.3031, | |
| "mean_token_accuracy": 0.8268382251262665, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 5.512340545654297, | |
| "learning_rate": 3.6262626262626264e-05, | |
| "loss": 3.4623, | |
| "mean_token_accuracy": 0.8173917531967163, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 5.278458118438721, | |
| "learning_rate": 3.616161616161616e-05, | |
| "loss": 3.6187, | |
| "mean_token_accuracy": 0.8212647438049316, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.144, | |
| "grad_norm": 6.000738620758057, | |
| "learning_rate": 3.606060606060606e-05, | |
| "loss": 3.4098, | |
| "mean_token_accuracy": 0.8118827193975449, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 5.230849266052246, | |
| "learning_rate": 3.595959595959596e-05, | |
| "loss": 3.3312, | |
| "mean_token_accuracy": 0.8291827887296677, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 6.520547389984131, | |
| "learning_rate": 3.5858585858585855e-05, | |
| "loss": 2.9547, | |
| "mean_token_accuracy": 0.8410928547382355, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 6.905513286590576, | |
| "learning_rate": 3.575757575757576e-05, | |
| "loss": 3.3843, | |
| "mean_token_accuracy": 0.8292149305343628, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.176, | |
| "grad_norm": 5.25789213180542, | |
| "learning_rate": 3.565656565656566e-05, | |
| "loss": 3.0306, | |
| "mean_token_accuracy": 0.8420234769582748, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 6.301482677459717, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 3.563, | |
| "mean_token_accuracy": 0.816611647605896, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.192, | |
| "grad_norm": 5.18954610824585, | |
| "learning_rate": 3.545454545454546e-05, | |
| "loss": 3.3712, | |
| "mean_token_accuracy": 0.8429348319768906, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 5.924179553985596, | |
| "learning_rate": 3.535353535353535e-05, | |
| "loss": 3.0173, | |
| "mean_token_accuracy": 0.8484609872102737, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.208, | |
| "grad_norm": 6.125277996063232, | |
| "learning_rate": 3.525252525252525e-05, | |
| "loss": 4.017, | |
| "mean_token_accuracy": 0.8024349808692932, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 6.013985633850098, | |
| "learning_rate": 3.515151515151515e-05, | |
| "loss": 3.1488, | |
| "mean_token_accuracy": 0.8332614004611969, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.224, | |
| "grad_norm": 5.863841533660889, | |
| "learning_rate": 3.505050505050505e-05, | |
| "loss": 3.4748, | |
| "mean_token_accuracy": 0.8309570550918579, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 5.565553665161133, | |
| "learning_rate": 3.494949494949495e-05, | |
| "loss": 3.4478, | |
| "mean_token_accuracy": 0.8285814672708511, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 5.7123565673828125, | |
| "learning_rate": 3.484848484848485e-05, | |
| "loss": 3.1585, | |
| "mean_token_accuracy": 0.8400322198867798, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 5.499763488769531, | |
| "learning_rate": 3.474747474747475e-05, | |
| "loss": 2.9475, | |
| "mean_token_accuracy": 0.8378583043813705, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.256, | |
| "grad_norm": 5.909451961517334, | |
| "learning_rate": 3.464646464646465e-05, | |
| "loss": 3.0639, | |
| "mean_token_accuracy": 0.8469367325305939, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 6.213925838470459, | |
| "learning_rate": 3.454545454545455e-05, | |
| "loss": 3.7884, | |
| "mean_token_accuracy": 0.8231126517057419, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.272, | |
| "grad_norm": 5.21215295791626, | |
| "learning_rate": 3.444444444444445e-05, | |
| "loss": 3.4636, | |
| "mean_token_accuracy": 0.8168640285730362, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 5.759207725524902, | |
| "learning_rate": 3.434343434343435e-05, | |
| "loss": 3.2428, | |
| "mean_token_accuracy": 0.8419955372810364, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.288, | |
| "grad_norm": 5.337326526641846, | |
| "learning_rate": 3.424242424242424e-05, | |
| "loss": 3.5007, | |
| "mean_token_accuracy": 0.826006218791008, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 5.429465293884277, | |
| "learning_rate": 3.414141414141414e-05, | |
| "loss": 3.1902, | |
| "mean_token_accuracy": 0.8436289280653, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.304, | |
| "grad_norm": 5.5671234130859375, | |
| "learning_rate": 3.4040404040404045e-05, | |
| "loss": 3.1885, | |
| "mean_token_accuracy": 0.837678074836731, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 5.217177391052246, | |
| "learning_rate": 3.3939393939393945e-05, | |
| "loss": 3.0344, | |
| "mean_token_accuracy": 0.8463016897439957, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 6.01118803024292, | |
| "learning_rate": 3.3838383838383844e-05, | |
| "loss": 3.6496, | |
| "mean_token_accuracy": 0.8142215311527252, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 5.723564147949219, | |
| "learning_rate": 3.373737373737374e-05, | |
| "loss": 3.1859, | |
| "mean_token_accuracy": 0.8320183008909225, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.336, | |
| "grad_norm": 5.838883399963379, | |
| "learning_rate": 3.3636363636363636e-05, | |
| "loss": 3.4536, | |
| "mean_token_accuracy": 0.8264990895986557, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 5.59492301940918, | |
| "learning_rate": 3.3535353535353536e-05, | |
| "loss": 3.1706, | |
| "mean_token_accuracy": 0.8360193520784378, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 5.386695861816406, | |
| "learning_rate": 3.3434343434343435e-05, | |
| "loss": 2.8613, | |
| "mean_token_accuracy": 0.8595142513513565, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 5.8718791007995605, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 3.405, | |
| "mean_token_accuracy": 0.8177822679281235, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 5.533784866333008, | |
| "learning_rate": 3.3232323232323234e-05, | |
| "loss": 2.7535, | |
| "mean_token_accuracy": 0.847559779882431, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 6.221394062042236, | |
| "learning_rate": 3.3131313131313134e-05, | |
| "loss": 3.2758, | |
| "mean_token_accuracy": 0.8370038121938705, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.384, | |
| "grad_norm": 5.768967628479004, | |
| "learning_rate": 3.303030303030303e-05, | |
| "loss": 3.144, | |
| "mean_token_accuracy": 0.8520988523960114, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 5.794501781463623, | |
| "learning_rate": 3.292929292929293e-05, | |
| "loss": 3.294, | |
| "mean_token_accuracy": 0.8296584039926529, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 6.306114196777344, | |
| "learning_rate": 3.282828282828283e-05, | |
| "loss": 3.8122, | |
| "mean_token_accuracy": 0.8015681505203247, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 5.309399127960205, | |
| "learning_rate": 3.272727272727273e-05, | |
| "loss": 3.5656, | |
| "mean_token_accuracy": 0.8148599863052368, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.416, | |
| "grad_norm": 6.384012699127197, | |
| "learning_rate": 3.2626262626262624e-05, | |
| "loss": 3.3465, | |
| "mean_token_accuracy": 0.8153296113014221, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 5.76847505569458, | |
| "learning_rate": 3.2525252525252524e-05, | |
| "loss": 3.5117, | |
| "mean_token_accuracy": 0.8243328183889389, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.432, | |
| "grad_norm": 5.956698417663574, | |
| "learning_rate": 3.2424242424242423e-05, | |
| "loss": 3.3246, | |
| "mean_token_accuracy": 0.8309469819068909, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 5.436334133148193, | |
| "learning_rate": 3.232323232323233e-05, | |
| "loss": 3.3948, | |
| "mean_token_accuracy": 0.8304746299982071, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.448, | |
| "grad_norm": 6.713822364807129, | |
| "learning_rate": 3.222222222222223e-05, | |
| "loss": 3.7249, | |
| "mean_token_accuracy": 0.8197664022445679, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 5.69707727432251, | |
| "learning_rate": 3.212121212121212e-05, | |
| "loss": 3.4366, | |
| "mean_token_accuracy": 0.8290866911411285, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.464, | |
| "grad_norm": 5.674050807952881, | |
| "learning_rate": 3.202020202020202e-05, | |
| "loss": 3.4487, | |
| "mean_token_accuracy": 0.8316894471645355, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 6.27432918548584, | |
| "learning_rate": 3.191919191919192e-05, | |
| "loss": 3.7579, | |
| "mean_token_accuracy": 0.8203246891498566, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 6.963736534118652, | |
| "learning_rate": 3.181818181818182e-05, | |
| "loss": 3.3907, | |
| "mean_token_accuracy": 0.8244215101003647, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 6.205438137054443, | |
| "learning_rate": 3.171717171717172e-05, | |
| "loss": 3.2918, | |
| "mean_token_accuracy": 0.8321795910596848, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.496, | |
| "grad_norm": 6.026015281677246, | |
| "learning_rate": 3.161616161616161e-05, | |
| "loss": 3.7184, | |
| "mean_token_accuracy": 0.8272889703512192, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 5.637466907501221, | |
| "learning_rate": 3.151515151515151e-05, | |
| "loss": 2.5835, | |
| "mean_token_accuracy": 0.8698715269565582, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "grad_norm": 6.474348545074463, | |
| "learning_rate": 3.141414141414142e-05, | |
| "loss": 3.2364, | |
| "mean_token_accuracy": 0.8249330222606659, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 6.4876885414123535, | |
| "learning_rate": 3.131313131313132e-05, | |
| "loss": 3.3135, | |
| "mean_token_accuracy": 0.8382684588432312, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.528, | |
| "grad_norm": 5.597368240356445, | |
| "learning_rate": 3.121212121212122e-05, | |
| "loss": 3.2368, | |
| "mean_token_accuracy": 0.8341715931892395, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 6.087445259094238, | |
| "learning_rate": 3.111111111111111e-05, | |
| "loss": 3.4584, | |
| "mean_token_accuracy": 0.8352274149656296, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.544, | |
| "grad_norm": 5.523479461669922, | |
| "learning_rate": 3.101010101010101e-05, | |
| "loss": 3.0476, | |
| "mean_token_accuracy": 0.8508585840463638, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 5.6211442947387695, | |
| "learning_rate": 3.090909090909091e-05, | |
| "loss": 3.2642, | |
| "mean_token_accuracy": 0.8426928371191025, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 5.739730358123779, | |
| "learning_rate": 3.080808080808081e-05, | |
| "loss": 3.6835, | |
| "mean_token_accuracy": 0.8226543515920639, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 6.00140380859375, | |
| "learning_rate": 3.070707070707071e-05, | |
| "loss": 2.9485, | |
| "mean_token_accuracy": 0.8533317595720291, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.576, | |
| "grad_norm": 5.541549205780029, | |
| "learning_rate": 3.060606060606061e-05, | |
| "loss": 3.0892, | |
| "mean_token_accuracy": 0.8474195152521133, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 5.923431873321533, | |
| "learning_rate": 3.050505050505051e-05, | |
| "loss": 3.018, | |
| "mean_token_accuracy": 0.84869784116745, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.592, | |
| "grad_norm": 6.204696178436279, | |
| "learning_rate": 3.0404040404040406e-05, | |
| "loss": 3.5126, | |
| "mean_token_accuracy": 0.8261184245347977, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 6.3025712966918945, | |
| "learning_rate": 3.0303030303030306e-05, | |
| "loss": 3.4185, | |
| "mean_token_accuracy": 0.8210860043764114, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.608, | |
| "grad_norm": 5.689709663391113, | |
| "learning_rate": 3.0202020202020205e-05, | |
| "loss": 3.5322, | |
| "mean_token_accuracy": 0.8191230297088623, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 5.707929611206055, | |
| "learning_rate": 3.01010101010101e-05, | |
| "loss": 3.024, | |
| "mean_token_accuracy": 0.8488472253084183, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.624, | |
| "grad_norm": 5.80771017074585, | |
| "learning_rate": 3e-05, | |
| "loss": 3.5556, | |
| "mean_token_accuracy": 0.8266987651586533, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 6.03700590133667, | |
| "learning_rate": 2.98989898989899e-05, | |
| "loss": 3.1062, | |
| "mean_token_accuracy": 0.8308413475751877, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 6.733646392822266, | |
| "learning_rate": 2.9797979797979796e-05, | |
| "loss": 3.1995, | |
| "mean_token_accuracy": 0.8294665813446045, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 5.883334636688232, | |
| "learning_rate": 2.96969696969697e-05, | |
| "loss": 2.8124, | |
| "mean_token_accuracy": 0.8439156115055084, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 5.941483497619629, | |
| "learning_rate": 2.95959595959596e-05, | |
| "loss": 3.0179, | |
| "mean_token_accuracy": 0.8416165858507156, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 6.391957759857178, | |
| "learning_rate": 2.9494949494949498e-05, | |
| "loss": 3.7068, | |
| "mean_token_accuracy": 0.8122821152210236, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 6.237335205078125, | |
| "learning_rate": 2.9393939393939394e-05, | |
| "loss": 3.32, | |
| "mean_token_accuracy": 0.8313788622617722, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 6.5731987953186035, | |
| "learning_rate": 2.9292929292929294e-05, | |
| "loss": 3.5619, | |
| "mean_token_accuracy": 0.816206768155098, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.688, | |
| "grad_norm": 5.061030387878418, | |
| "learning_rate": 2.9191919191919193e-05, | |
| "loss": 2.8566, | |
| "mean_token_accuracy": 0.8475048393011093, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 5.921192646026611, | |
| "learning_rate": 2.909090909090909e-05, | |
| "loss": 3.3349, | |
| "mean_token_accuracy": 0.827596977353096, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.704, | |
| "grad_norm": 5.721929550170898, | |
| "learning_rate": 2.898989898989899e-05, | |
| "loss": 2.7363, | |
| "mean_token_accuracy": 0.8517429679632187, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 6.763914585113525, | |
| "learning_rate": 2.8888888888888888e-05, | |
| "loss": 2.8292, | |
| "mean_token_accuracy": 0.8498516529798508, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 6.817018032073975, | |
| "learning_rate": 2.878787878787879e-05, | |
| "loss": 3.2958, | |
| "mean_token_accuracy": 0.8348537087440491, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 5.659510612487793, | |
| "learning_rate": 2.868686868686869e-05, | |
| "loss": 2.8683, | |
| "mean_token_accuracy": 0.8435689210891724, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.736, | |
| "grad_norm": 6.099102020263672, | |
| "learning_rate": 2.8585858585858587e-05, | |
| "loss": 2.9825, | |
| "mean_token_accuracy": 0.8394478559494019, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 6.094569206237793, | |
| "learning_rate": 2.8484848484848486e-05, | |
| "loss": 3.2325, | |
| "mean_token_accuracy": 0.8246908932924271, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.752, | |
| "grad_norm": 6.866254806518555, | |
| "learning_rate": 2.8383838383838386e-05, | |
| "loss": 3.1284, | |
| "mean_token_accuracy": 0.8342044651508331, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 6.149699687957764, | |
| "learning_rate": 2.8282828282828282e-05, | |
| "loss": 3.6781, | |
| "mean_token_accuracy": 0.8211702555418015, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.768, | |
| "grad_norm": 5.862130641937256, | |
| "learning_rate": 2.818181818181818e-05, | |
| "loss": 3.3624, | |
| "mean_token_accuracy": 0.8324427157640457, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 5.395320415496826, | |
| "learning_rate": 2.808080808080808e-05, | |
| "loss": 3.3338, | |
| "mean_token_accuracy": 0.8399553894996643, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.784, | |
| "grad_norm": 5.550381183624268, | |
| "learning_rate": 2.7979797979797984e-05, | |
| "loss": 3.4036, | |
| "mean_token_accuracy": 0.8223441988229752, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 5.406087875366211, | |
| "learning_rate": 2.7878787878787883e-05, | |
| "loss": 2.6777, | |
| "mean_token_accuracy": 0.8600380420684814, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 5.7635698318481445, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 3.5319, | |
| "mean_token_accuracy": 0.8255642205476761, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 5.934004783630371, | |
| "learning_rate": 2.767676767676768e-05, | |
| "loss": 3.2659, | |
| "mean_token_accuracy": 0.8305630534887314, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 6.8721184730529785, | |
| "learning_rate": 2.7575757575757578e-05, | |
| "loss": 3.1587, | |
| "mean_token_accuracy": 0.8299548774957657, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 5.556332588195801, | |
| "learning_rate": 2.7474747474747474e-05, | |
| "loss": 2.8912, | |
| "mean_token_accuracy": 0.8541744500398636, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 5.591052532196045, | |
| "learning_rate": 2.7373737373737374e-05, | |
| "loss": 3.1768, | |
| "mean_token_accuracy": 0.840098574757576, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 7.923620700836182, | |
| "learning_rate": 2.7272727272727273e-05, | |
| "loss": 3.1165, | |
| "mean_token_accuracy": 0.8416318744421005, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 6.180293083190918, | |
| "learning_rate": 2.717171717171717e-05, | |
| "loss": 3.1878, | |
| "mean_token_accuracy": 0.8414299786090851, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 6.479142189025879, | |
| "learning_rate": 2.7070707070707075e-05, | |
| "loss": 3.519, | |
| "mean_token_accuracy": 0.8237949758768082, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 6.211218357086182, | |
| "learning_rate": 2.696969696969697e-05, | |
| "loss": 3.6602, | |
| "mean_token_accuracy": 0.8112812489271164, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 6.262876033782959, | |
| "learning_rate": 2.686868686868687e-05, | |
| "loss": 3.8658, | |
| "mean_token_accuracy": 0.8071164190769196, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 5.474344730377197, | |
| "learning_rate": 2.676767676767677e-05, | |
| "loss": 2.971, | |
| "mean_token_accuracy": 0.8377246409654617, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 6.193055629730225, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 3.478, | |
| "mean_token_accuracy": 0.823166161775589, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.896, | |
| "grad_norm": 5.627188682556152, | |
| "learning_rate": 2.6565656565656566e-05, | |
| "loss": 3.1995, | |
| "mean_token_accuracy": 0.8456520289182663, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 6.110049247741699, | |
| "learning_rate": 2.6464646464646466e-05, | |
| "loss": 3.0544, | |
| "mean_token_accuracy": 0.8367758989334106, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.912, | |
| "grad_norm": 5.3701982498168945, | |
| "learning_rate": 2.636363636363636e-05, | |
| "loss": 2.6659, | |
| "mean_token_accuracy": 0.8609540313482285, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 5.352710247039795, | |
| "learning_rate": 2.6262626262626268e-05, | |
| "loss": 3.082, | |
| "mean_token_accuracy": 0.842663049697876, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.928, | |
| "grad_norm": 6.08484411239624, | |
| "learning_rate": 2.6161616161616164e-05, | |
| "loss": 3.3737, | |
| "mean_token_accuracy": 0.8245035409927368, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 5.956201076507568, | |
| "learning_rate": 2.6060606060606063e-05, | |
| "loss": 2.9072, | |
| "mean_token_accuracy": 0.8548275828361511, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.944, | |
| "grad_norm": 5.499819278717041, | |
| "learning_rate": 2.5959595959595963e-05, | |
| "loss": 3.355, | |
| "mean_token_accuracy": 0.820908397436142, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 6.338292598724365, | |
| "learning_rate": 2.585858585858586e-05, | |
| "loss": 3.7158, | |
| "mean_token_accuracy": 0.8252478241920471, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 6.105760097503662, | |
| "learning_rate": 2.575757575757576e-05, | |
| "loss": 3.1973, | |
| "mean_token_accuracy": 0.8429799377918243, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 5.636971950531006, | |
| "learning_rate": 2.5656565656565658e-05, | |
| "loss": 2.6206, | |
| "mean_token_accuracy": 0.8573070019483566, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.976, | |
| "grad_norm": 6.419624328613281, | |
| "learning_rate": 2.5555555555555554e-05, | |
| "loss": 3.5778, | |
| "mean_token_accuracy": 0.8265507072210312, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 5.554307460784912, | |
| "learning_rate": 2.5454545454545454e-05, | |
| "loss": 2.9344, | |
| "mean_token_accuracy": 0.8515568971633911, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "grad_norm": 6.059617519378662, | |
| "learning_rate": 2.5353535353535356e-05, | |
| "loss": 3.4458, | |
| "mean_token_accuracy": 0.8348643332719803, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 5.995134353637695, | |
| "learning_rate": 2.5252525252525256e-05, | |
| "loss": 2.9078, | |
| "mean_token_accuracy": 0.8390886038541794, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.008, | |
| "grad_norm": 5.29079532623291, | |
| "learning_rate": 2.5151515151515155e-05, | |
| "loss": 2.5449, | |
| "mean_token_accuracy": 0.8623387068510056, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 6.307847499847412, | |
| "learning_rate": 2.505050505050505e-05, | |
| "loss": 2.6746, | |
| "mean_token_accuracy": 0.8547452688217163, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.024, | |
| "grad_norm": 5.258662700653076, | |
| "learning_rate": 2.494949494949495e-05, | |
| "loss": 2.8106, | |
| "mean_token_accuracy": 0.8465935438871384, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 5.160598278045654, | |
| "learning_rate": 2.4848484848484847e-05, | |
| "loss": 2.5327, | |
| "mean_token_accuracy": 0.8596736937761307, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 5.5139055252075195, | |
| "learning_rate": 2.474747474747475e-05, | |
| "loss": 2.5406, | |
| "mean_token_accuracy": 0.8599353432655334, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 5.596591949462891, | |
| "learning_rate": 2.464646464646465e-05, | |
| "loss": 2.5955, | |
| "mean_token_accuracy": 0.8636805862188339, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.056, | |
| "grad_norm": 5.260463237762451, | |
| "learning_rate": 2.4545454545454545e-05, | |
| "loss": 2.8003, | |
| "mean_token_accuracy": 0.8472322225570679, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 5.485147476196289, | |
| "learning_rate": 2.4444444444444445e-05, | |
| "loss": 2.4907, | |
| "mean_token_accuracy": 0.8619499355554581, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.072, | |
| "grad_norm": 5.589687824249268, | |
| "learning_rate": 2.4343434343434344e-05, | |
| "loss": 2.563, | |
| "mean_token_accuracy": 0.8651385009288788, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 4.876934051513672, | |
| "learning_rate": 2.4242424242424244e-05, | |
| "loss": 2.0974, | |
| "mean_token_accuracy": 0.8819858431816101, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.088, | |
| "grad_norm": 6.266694068908691, | |
| "learning_rate": 2.4141414141414143e-05, | |
| "loss": 2.6164, | |
| "mean_token_accuracy": 0.8617752641439438, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 5.54473352432251, | |
| "learning_rate": 2.404040404040404e-05, | |
| "loss": 2.9005, | |
| "mean_token_accuracy": 0.8471231460571289, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.104, | |
| "grad_norm": 5.299022197723389, | |
| "learning_rate": 2.393939393939394e-05, | |
| "loss": 2.3437, | |
| "mean_token_accuracy": 0.872315376996994, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 5.8907856941223145, | |
| "learning_rate": 2.3838383838383842e-05, | |
| "loss": 2.5687, | |
| "mean_token_accuracy": 0.8683241903781891, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.84760046005249, | |
| "learning_rate": 2.3737373737373738e-05, | |
| "loss": 2.6167, | |
| "mean_token_accuracy": 0.8569298684597015, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 5.831202507019043, | |
| "learning_rate": 2.3636363636363637e-05, | |
| "loss": 3.0151, | |
| "mean_token_accuracy": 0.8395980596542358, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.136, | |
| "grad_norm": 5.698826313018799, | |
| "learning_rate": 2.3535353535353537e-05, | |
| "loss": 3.1071, | |
| "mean_token_accuracy": 0.850052535533905, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 6.112237930297852, | |
| "learning_rate": 2.3434343434343436e-05, | |
| "loss": 2.624, | |
| "mean_token_accuracy": 0.8661712259054184, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.152, | |
| "grad_norm": 6.229471683502197, | |
| "learning_rate": 2.3333333333333336e-05, | |
| "loss": 2.5656, | |
| "mean_token_accuracy": 0.8691338896751404, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 6.126317501068115, | |
| "learning_rate": 2.3232323232323232e-05, | |
| "loss": 2.6405, | |
| "mean_token_accuracy": 0.8534891903400421, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.168, | |
| "grad_norm": 6.28452205657959, | |
| "learning_rate": 2.313131313131313e-05, | |
| "loss": 2.6796, | |
| "mean_token_accuracy": 0.8531413078308105, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 5.863641262054443, | |
| "learning_rate": 2.3030303030303034e-05, | |
| "loss": 2.7015, | |
| "mean_token_accuracy": 0.8519039303064346, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.184, | |
| "grad_norm": 6.535420894622803, | |
| "learning_rate": 2.292929292929293e-05, | |
| "loss": 3.0103, | |
| "mean_token_accuracy": 0.8415365815162659, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 5.875096321105957, | |
| "learning_rate": 2.282828282828283e-05, | |
| "loss": 2.8179, | |
| "mean_token_accuracy": 0.8494950979948044, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 5.35692834854126, | |
| "learning_rate": 2.272727272727273e-05, | |
| "loss": 2.1563, | |
| "mean_token_accuracy": 0.8810548335313797, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 6.05448579788208, | |
| "learning_rate": 2.262626262626263e-05, | |
| "loss": 2.4902, | |
| "mean_token_accuracy": 0.8656753301620483, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.216, | |
| "grad_norm": 5.888792037963867, | |
| "learning_rate": 2.2525252525252528e-05, | |
| "loss": 2.5769, | |
| "mean_token_accuracy": 0.8682566732168198, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 5.912112712860107, | |
| "learning_rate": 2.2424242424242424e-05, | |
| "loss": 2.5715, | |
| "mean_token_accuracy": 0.8543179333209991, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.232, | |
| "grad_norm": 6.485673427581787, | |
| "learning_rate": 2.2323232323232324e-05, | |
| "loss": 2.6801, | |
| "mean_token_accuracy": 0.8597021549940109, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 5.757596969604492, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 2.5495, | |
| "mean_token_accuracy": 0.8788398951292038, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.248, | |
| "grad_norm": 5.577345848083496, | |
| "learning_rate": 2.2121212121212123e-05, | |
| "loss": 2.4548, | |
| "mean_token_accuracy": 0.8728667497634888, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 8.356124877929688, | |
| "learning_rate": 2.2020202020202022e-05, | |
| "loss": 2.9846, | |
| "mean_token_accuracy": 0.8483117371797562, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.2640000000000002, | |
| "grad_norm": 5.887564659118652, | |
| "learning_rate": 2.191919191919192e-05, | |
| "loss": 2.7833, | |
| "mean_token_accuracy": 0.8488295823335648, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 6.558447360992432, | |
| "learning_rate": 2.1818181818181818e-05, | |
| "loss": 2.3944, | |
| "mean_token_accuracy": 0.8682046681642532, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 6.828135013580322, | |
| "learning_rate": 2.171717171717172e-05, | |
| "loss": 2.8403, | |
| "mean_token_accuracy": 0.8527731895446777, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 5.772212505340576, | |
| "learning_rate": 2.1616161616161617e-05, | |
| "loss": 2.8966, | |
| "mean_token_accuracy": 0.8490441888570786, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.296, | |
| "grad_norm": 5.538363933563232, | |
| "learning_rate": 2.1515151515151516e-05, | |
| "loss": 2.4595, | |
| "mean_token_accuracy": 0.8743099421262741, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 6.172807216644287, | |
| "learning_rate": 2.1414141414141416e-05, | |
| "loss": 2.3328, | |
| "mean_token_accuracy": 0.8704792261123657, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.312, | |
| "grad_norm": 6.232668399810791, | |
| "learning_rate": 2.1313131313131315e-05, | |
| "loss": 2.9026, | |
| "mean_token_accuracy": 0.8546594232320786, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 6.070257663726807, | |
| "learning_rate": 2.1212121212121215e-05, | |
| "loss": 2.4686, | |
| "mean_token_accuracy": 0.8643316328525543, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.328, | |
| "grad_norm": 5.737977504730225, | |
| "learning_rate": 2.111111111111111e-05, | |
| "loss": 2.9917, | |
| "mean_token_accuracy": 0.8514521420001984, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 6.409512519836426, | |
| "learning_rate": 2.101010101010101e-05, | |
| "loss": 2.6572, | |
| "mean_token_accuracy": 0.8566225320100784, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.344, | |
| "grad_norm": 6.57427978515625, | |
| "learning_rate": 2.090909090909091e-05, | |
| "loss": 2.466, | |
| "mean_token_accuracy": 0.853950634598732, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 6.017867088317871, | |
| "learning_rate": 2.080808080808081e-05, | |
| "loss": 2.8763, | |
| "mean_token_accuracy": 0.8532497733831406, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 5.640237808227539, | |
| "learning_rate": 2.070707070707071e-05, | |
| "loss": 2.6013, | |
| "mean_token_accuracy": 0.8739284723997116, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 6.341038703918457, | |
| "learning_rate": 2.0606060606060608e-05, | |
| "loss": 2.6485, | |
| "mean_token_accuracy": 0.8521928191184998, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.376, | |
| "grad_norm": 6.142149925231934, | |
| "learning_rate": 2.0505050505050504e-05, | |
| "loss": 2.3397, | |
| "mean_token_accuracy": 0.8850863575935364, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 6.308354377746582, | |
| "learning_rate": 2.0404040404040407e-05, | |
| "loss": 2.5977, | |
| "mean_token_accuracy": 0.8648134768009186, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.392, | |
| "grad_norm": 5.566152572631836, | |
| "learning_rate": 2.0303030303030303e-05, | |
| "loss": 2.7123, | |
| "mean_token_accuracy": 0.8633100241422653, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 5.725775241851807, | |
| "learning_rate": 2.0202020202020203e-05, | |
| "loss": 2.6108, | |
| "mean_token_accuracy": 0.8738928139209747, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.408, | |
| "grad_norm": 6.435664176940918, | |
| "learning_rate": 2.0101010101010102e-05, | |
| "loss": 2.7976, | |
| "mean_token_accuracy": 0.8510608673095703, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 7.4542622566223145, | |
| "learning_rate": 2e-05, | |
| "loss": 2.8097, | |
| "mean_token_accuracy": 0.8516113609075546, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.424, | |
| "grad_norm": 5.655482769012451, | |
| "learning_rate": 1.98989898989899e-05, | |
| "loss": 2.3856, | |
| "mean_token_accuracy": 0.8649388700723648, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 6.412247180938721, | |
| "learning_rate": 1.9797979797979797e-05, | |
| "loss": 2.8675, | |
| "mean_token_accuracy": 0.8470883667469025, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 7.186456680297852, | |
| "learning_rate": 1.9696969696969697e-05, | |
| "loss": 3.1156, | |
| "mean_token_accuracy": 0.8328486531972885, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 5.615805625915527, | |
| "learning_rate": 1.95959595959596e-05, | |
| "loss": 2.6565, | |
| "mean_token_accuracy": 0.8666907250881195, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.456, | |
| "grad_norm": 6.449917793273926, | |
| "learning_rate": 1.9494949494949496e-05, | |
| "loss": 2.9544, | |
| "mean_token_accuracy": 0.8483339697122574, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 5.747985363006592, | |
| "learning_rate": 1.9393939393939395e-05, | |
| "loss": 2.1422, | |
| "mean_token_accuracy": 0.8723134845495224, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.472, | |
| "grad_norm": 6.027318000793457, | |
| "learning_rate": 1.9292929292929295e-05, | |
| "loss": 2.5305, | |
| "mean_token_accuracy": 0.8660032451152802, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 7.454213619232178, | |
| "learning_rate": 1.919191919191919e-05, | |
| "loss": 2.5679, | |
| "mean_token_accuracy": 0.8533085733652115, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.488, | |
| "grad_norm": 5.4801716804504395, | |
| "learning_rate": 1.9090909090909094e-05, | |
| "loss": 2.6006, | |
| "mean_token_accuracy": 0.8550811111927032, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 6.020359039306641, | |
| "learning_rate": 1.898989898989899e-05, | |
| "loss": 2.2521, | |
| "mean_token_accuracy": 0.8731967061758041, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.504, | |
| "grad_norm": 5.78825569152832, | |
| "learning_rate": 1.888888888888889e-05, | |
| "loss": 2.4461, | |
| "mean_token_accuracy": 0.8596260696649551, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 5.771088123321533, | |
| "learning_rate": 1.878787878787879e-05, | |
| "loss": 2.3915, | |
| "mean_token_accuracy": 0.8663473874330521, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 5.939653396606445, | |
| "learning_rate": 1.8686868686868688e-05, | |
| "loss": 2.7159, | |
| "mean_token_accuracy": 0.8722144514322281, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 6.489267349243164, | |
| "learning_rate": 1.8585858585858588e-05, | |
| "loss": 2.6137, | |
| "mean_token_accuracy": 0.8580323159694672, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.536, | |
| "grad_norm": 5.97398042678833, | |
| "learning_rate": 1.8484848484848487e-05, | |
| "loss": 2.7352, | |
| "mean_token_accuracy": 0.8655538409948349, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 5.858009338378906, | |
| "learning_rate": 1.8383838383838383e-05, | |
| "loss": 2.2991, | |
| "mean_token_accuracy": 0.8808925747871399, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.552, | |
| "grad_norm": 5.720334053039551, | |
| "learning_rate": 1.8282828282828286e-05, | |
| "loss": 2.6282, | |
| "mean_token_accuracy": 0.8669092357158661, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 6.569767475128174, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 2.5663, | |
| "mean_token_accuracy": 0.8692184388637543, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.568, | |
| "grad_norm": 6.509884357452393, | |
| "learning_rate": 1.808080808080808e-05, | |
| "loss": 2.8698, | |
| "mean_token_accuracy": 0.8457562029361725, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 5.976521968841553, | |
| "learning_rate": 1.797979797979798e-05, | |
| "loss": 2.7501, | |
| "mean_token_accuracy": 0.846638560295105, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.584, | |
| "grad_norm": 5.878550052642822, | |
| "learning_rate": 1.787878787878788e-05, | |
| "loss": 2.2021, | |
| "mean_token_accuracy": 0.8834634870290756, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 5.905157566070557, | |
| "learning_rate": 1.777777777777778e-05, | |
| "loss": 2.5808, | |
| "mean_token_accuracy": 0.8616163432598114, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 5.510688304901123, | |
| "learning_rate": 1.7676767676767676e-05, | |
| "loss": 2.265, | |
| "mean_token_accuracy": 0.8738918304443359, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": 5.952402591705322, | |
| "learning_rate": 1.7575757575757576e-05, | |
| "loss": 2.7743, | |
| "mean_token_accuracy": 0.85427226126194, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.616, | |
| "grad_norm": 6.647154808044434, | |
| "learning_rate": 1.7474747474747475e-05, | |
| "loss": 2.9181, | |
| "mean_token_accuracy": 0.8349238932132721, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 7.24924898147583, | |
| "learning_rate": 1.7373737373737375e-05, | |
| "loss": 3.1174, | |
| "mean_token_accuracy": 0.8411069959402084, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.632, | |
| "grad_norm": 5.238758563995361, | |
| "learning_rate": 1.7272727272727274e-05, | |
| "loss": 2.4068, | |
| "mean_token_accuracy": 0.8669510632753372, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 5.997193336486816, | |
| "learning_rate": 1.7171717171717173e-05, | |
| "loss": 3.0963, | |
| "mean_token_accuracy": 0.8448814451694489, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.648, | |
| "grad_norm": 6.348751544952393, | |
| "learning_rate": 1.707070707070707e-05, | |
| "loss": 2.8593, | |
| "mean_token_accuracy": 0.8491098284721375, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 5.824342250823975, | |
| "learning_rate": 1.6969696969696972e-05, | |
| "loss": 2.5448, | |
| "mean_token_accuracy": 0.8616123348474503, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.664, | |
| "grad_norm": 7.513331890106201, | |
| "learning_rate": 1.686868686868687e-05, | |
| "loss": 2.5108, | |
| "mean_token_accuracy": 0.8665321916341782, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.672, | |
| "grad_norm": 5.547114372253418, | |
| "learning_rate": 1.6767676767676768e-05, | |
| "loss": 2.674, | |
| "mean_token_accuracy": 0.8699924349784851, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 6.60705041885376, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 2.9442, | |
| "mean_token_accuracy": 0.8435302972793579, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 6.116333961486816, | |
| "learning_rate": 1.6565656565656567e-05, | |
| "loss": 2.3428, | |
| "mean_token_accuracy": 0.8697264492511749, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.6959999999999997, | |
| "grad_norm": 6.642714500427246, | |
| "learning_rate": 1.6464646464646466e-05, | |
| "loss": 2.7629, | |
| "mean_token_accuracy": 0.863296702504158, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 6.764316558837891, | |
| "learning_rate": 1.6363636363636366e-05, | |
| "loss": 3.0041, | |
| "mean_token_accuracy": 0.8556036204099655, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.7119999999999997, | |
| "grad_norm": 5.682474613189697, | |
| "learning_rate": 1.6262626262626262e-05, | |
| "loss": 2.4677, | |
| "mean_token_accuracy": 0.8652613461017609, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 6.347187519073486, | |
| "learning_rate": 1.6161616161616165e-05, | |
| "loss": 2.4674, | |
| "mean_token_accuracy": 0.8690385818481445, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.7279999999999998, | |
| "grad_norm": 5.507145404815674, | |
| "learning_rate": 1.606060606060606e-05, | |
| "loss": 2.692, | |
| "mean_token_accuracy": 0.8610135316848755, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 6.108325481414795, | |
| "learning_rate": 1.595959595959596e-05, | |
| "loss": 2.4909, | |
| "mean_token_accuracy": 0.8648791164159775, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.7439999999999998, | |
| "grad_norm": 6.017334938049316, | |
| "learning_rate": 1.585858585858586e-05, | |
| "loss": 2.2442, | |
| "mean_token_accuracy": 0.8753398060798645, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 6.193384170532227, | |
| "learning_rate": 1.5757575757575756e-05, | |
| "loss": 2.3729, | |
| "mean_token_accuracy": 0.8614612221717834, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 6.495743751525879, | |
| "learning_rate": 1.565656565656566e-05, | |
| "loss": 2.8584, | |
| "mean_token_accuracy": 0.8486870527267456, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": 5.537295818328857, | |
| "learning_rate": 1.5555555555555555e-05, | |
| "loss": 2.3311, | |
| "mean_token_accuracy": 0.8707073032855988, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.776, | |
| "grad_norm": 5.175790309906006, | |
| "learning_rate": 1.5454545454545454e-05, | |
| "loss": 2.4548, | |
| "mean_token_accuracy": 0.8665929436683655, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 5.646080017089844, | |
| "learning_rate": 1.5353535353535354e-05, | |
| "loss": 2.4042, | |
| "mean_token_accuracy": 0.8688449859619141, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.792, | |
| "grad_norm": 5.514770984649658, | |
| "learning_rate": 1.5252525252525255e-05, | |
| "loss": 2.3592, | |
| "mean_token_accuracy": 0.868006706237793, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 5.443539619445801, | |
| "learning_rate": 1.5151515151515153e-05, | |
| "loss": 2.7749, | |
| "mean_token_accuracy": 0.8517529368400574, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.808, | |
| "grad_norm": 6.177525043487549, | |
| "learning_rate": 1.505050505050505e-05, | |
| "loss": 2.7422, | |
| "mean_token_accuracy": 0.853731244802475, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 5.842599868774414, | |
| "learning_rate": 1.494949494949495e-05, | |
| "loss": 2.7828, | |
| "mean_token_accuracy": 0.8474886268377304, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.824, | |
| "grad_norm": 6.956360816955566, | |
| "learning_rate": 1.484848484848485e-05, | |
| "loss": 2.6584, | |
| "mean_token_accuracy": 0.8608374446630478, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 6.075902938842773, | |
| "learning_rate": 1.4747474747474749e-05, | |
| "loss": 1.7885, | |
| "mean_token_accuracy": 0.8908968865871429, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 5.676952838897705, | |
| "learning_rate": 1.4646464646464647e-05, | |
| "loss": 2.2233, | |
| "mean_token_accuracy": 0.8666234165430069, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 6.072289943695068, | |
| "learning_rate": 1.4545454545454545e-05, | |
| "loss": 2.9788, | |
| "mean_token_accuracy": 0.8446744084358215, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.856, | |
| "grad_norm": 6.316305637359619, | |
| "learning_rate": 1.4444444444444444e-05, | |
| "loss": 2.6826, | |
| "mean_token_accuracy": 0.8572750687599182, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 6.896461486816406, | |
| "learning_rate": 1.4343434343434345e-05, | |
| "loss": 2.3163, | |
| "mean_token_accuracy": 0.874391183257103, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.872, | |
| "grad_norm": 6.410128116607666, | |
| "learning_rate": 1.4242424242424243e-05, | |
| "loss": 2.66, | |
| "mean_token_accuracy": 0.8778804391622543, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 5.7101664543151855, | |
| "learning_rate": 1.4141414141414141e-05, | |
| "loss": 2.5463, | |
| "mean_token_accuracy": 0.8713642507791519, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.888, | |
| "grad_norm": 5.941382884979248, | |
| "learning_rate": 1.404040404040404e-05, | |
| "loss": 2.4611, | |
| "mean_token_accuracy": 0.8689820915460587, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": 5.766918182373047, | |
| "learning_rate": 1.3939393939393942e-05, | |
| "loss": 2.3588, | |
| "mean_token_accuracy": 0.8715604543685913, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.904, | |
| "grad_norm": 6.412060260772705, | |
| "learning_rate": 1.383838383838384e-05, | |
| "loss": 2.9418, | |
| "mean_token_accuracy": 0.8422341346740723, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 6.339536666870117, | |
| "learning_rate": 1.3737373737373737e-05, | |
| "loss": 2.8499, | |
| "mean_token_accuracy": 0.8519201278686523, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 5.80408239364624, | |
| "learning_rate": 1.3636363636363637e-05, | |
| "loss": 2.6526, | |
| "mean_token_accuracy": 0.8615650832653046, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.928, | |
| "grad_norm": 5.643406391143799, | |
| "learning_rate": 1.3535353535353538e-05, | |
| "loss": 2.4131, | |
| "mean_token_accuracy": 0.8788148909807205, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.936, | |
| "grad_norm": 6.2641167640686035, | |
| "learning_rate": 1.3434343434343436e-05, | |
| "loss": 2.7667, | |
| "mean_token_accuracy": 0.8569764792919159, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 6.191479682922363, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 2.3207, | |
| "mean_token_accuracy": 0.8694847077131271, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.952, | |
| "grad_norm": 5.668253421783447, | |
| "learning_rate": 1.3232323232323233e-05, | |
| "loss": 2.4538, | |
| "mean_token_accuracy": 0.8613507598638535, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 5.68297004699707, | |
| "learning_rate": 1.3131313131313134e-05, | |
| "loss": 2.4336, | |
| "mean_token_accuracy": 0.8691912293434143, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.968, | |
| "grad_norm": 6.308213710784912, | |
| "learning_rate": 1.3030303030303032e-05, | |
| "loss": 2.9029, | |
| "mean_token_accuracy": 0.8455093801021576, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 5.907562255859375, | |
| "learning_rate": 1.292929292929293e-05, | |
| "loss": 2.4898, | |
| "mean_token_accuracy": 0.8742183148860931, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.984, | |
| "grad_norm": 6.077236652374268, | |
| "learning_rate": 1.2828282828282829e-05, | |
| "loss": 2.5998, | |
| "mean_token_accuracy": 0.8597747683525085, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.992, | |
| "grad_norm": 5.781510829925537, | |
| "learning_rate": 1.2727272727272727e-05, | |
| "loss": 2.5208, | |
| "mean_token_accuracy": 0.8634484708309174, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 6.007801532745361, | |
| "learning_rate": 1.2626262626262628e-05, | |
| "loss": 2.5073, | |
| "mean_token_accuracy": 0.8571873605251312, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.008, | |
| "grad_norm": 5.24554443359375, | |
| "learning_rate": 1.2525252525252526e-05, | |
| "loss": 2.189, | |
| "mean_token_accuracy": 0.8963351398706436, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.016, | |
| "grad_norm": 5.7106614112854, | |
| "learning_rate": 1.2424242424242424e-05, | |
| "loss": 2.1298, | |
| "mean_token_accuracy": 0.8783598244190216, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.024, | |
| "grad_norm": 5.2301025390625, | |
| "learning_rate": 1.2323232323232325e-05, | |
| "loss": 1.9574, | |
| "mean_token_accuracy": 0.8882095962762833, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.032, | |
| "grad_norm": 5.160372257232666, | |
| "learning_rate": 1.2222222222222222e-05, | |
| "loss": 2.315, | |
| "mean_token_accuracy": 0.8809339702129364, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 5.574481010437012, | |
| "learning_rate": 1.2121212121212122e-05, | |
| "loss": 2.1501, | |
| "mean_token_accuracy": 0.8889680802822113, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.048, | |
| "grad_norm": 6.913398265838623, | |
| "learning_rate": 1.202020202020202e-05, | |
| "loss": 2.3129, | |
| "mean_token_accuracy": 0.8849718272686005, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.056, | |
| "grad_norm": 6.229403972625732, | |
| "learning_rate": 1.1919191919191921e-05, | |
| "loss": 2.3757, | |
| "mean_token_accuracy": 0.8662046045064926, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.064, | |
| "grad_norm": 5.60806131362915, | |
| "learning_rate": 1.1818181818181819e-05, | |
| "loss": 2.6174, | |
| "mean_token_accuracy": 0.8517654091119766, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.072, | |
| "grad_norm": 5.600075721740723, | |
| "learning_rate": 1.1717171717171718e-05, | |
| "loss": 2.3305, | |
| "mean_token_accuracy": 0.8705442994832993, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 6.062666416168213, | |
| "learning_rate": 1.1616161616161616e-05, | |
| "loss": 2.2109, | |
| "mean_token_accuracy": 0.8755911886692047, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.088, | |
| "grad_norm": 7.154458999633789, | |
| "learning_rate": 1.1515151515151517e-05, | |
| "loss": 2.6082, | |
| "mean_token_accuracy": 0.860401377081871, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.096, | |
| "grad_norm": 6.247750759124756, | |
| "learning_rate": 1.1414141414141415e-05, | |
| "loss": 2.2877, | |
| "mean_token_accuracy": 0.880204901099205, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.104, | |
| "grad_norm": 5.955714702606201, | |
| "learning_rate": 1.1313131313131314e-05, | |
| "loss": 1.8738, | |
| "mean_token_accuracy": 0.9012208431959152, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.112, | |
| "grad_norm": 5.796663284301758, | |
| "learning_rate": 1.1212121212121212e-05, | |
| "loss": 1.6587, | |
| "mean_token_accuracy": 0.9029347151517868, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 5.392340660095215, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 2.3056, | |
| "mean_token_accuracy": 0.8753447830677032, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.128, | |
| "grad_norm": 7.139640808105469, | |
| "learning_rate": 1.1010101010101011e-05, | |
| "loss": 2.8439, | |
| "mean_token_accuracy": 0.8585825264453888, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.136, | |
| "grad_norm": 5.9656829833984375, | |
| "learning_rate": 1.0909090909090909e-05, | |
| "loss": 2.1093, | |
| "mean_token_accuracy": 0.8788597285747528, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.144, | |
| "grad_norm": 5.839116096496582, | |
| "learning_rate": 1.0808080808080808e-05, | |
| "loss": 2.2813, | |
| "mean_token_accuracy": 0.885636180639267, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.152, | |
| "grad_norm": 5.032593727111816, | |
| "learning_rate": 1.0707070707070708e-05, | |
| "loss": 1.9325, | |
| "mean_token_accuracy": 0.8939605355262756, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 6.447021961212158, | |
| "learning_rate": 1.0606060606060607e-05, | |
| "loss": 2.0899, | |
| "mean_token_accuracy": 0.8907236605882645, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.168, | |
| "grad_norm": 6.839473724365234, | |
| "learning_rate": 1.0505050505050505e-05, | |
| "loss": 1.8443, | |
| "mean_token_accuracy": 0.8959701657295227, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.176, | |
| "grad_norm": 6.076663017272949, | |
| "learning_rate": 1.0404040404040405e-05, | |
| "loss": 2.0503, | |
| "mean_token_accuracy": 0.8750788569450378, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.184, | |
| "grad_norm": 5.917912483215332, | |
| "learning_rate": 1.0303030303030304e-05, | |
| "loss": 2.1561, | |
| "mean_token_accuracy": 0.8795887529850006, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.192, | |
| "grad_norm": 6.047354698181152, | |
| "learning_rate": 1.0202020202020204e-05, | |
| "loss": 2.4048, | |
| "mean_token_accuracy": 0.8771449476480484, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 6.2761125564575195, | |
| "learning_rate": 1.0101010101010101e-05, | |
| "loss": 2.0975, | |
| "mean_token_accuracy": 0.8842770159244537, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.208, | |
| "grad_norm": 6.07623291015625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8668, | |
| "mean_token_accuracy": 0.8863209933042526, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.216, | |
| "grad_norm": 6.246818542480469, | |
| "learning_rate": 9.898989898989899e-06, | |
| "loss": 2.245, | |
| "mean_token_accuracy": 0.8780923783779144, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.224, | |
| "grad_norm": 6.323997974395752, | |
| "learning_rate": 9.7979797979798e-06, | |
| "loss": 2.1369, | |
| "mean_token_accuracy": 0.8779752850532532, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.232, | |
| "grad_norm": 7.1778411865234375, | |
| "learning_rate": 9.696969696969698e-06, | |
| "loss": 2.5985, | |
| "mean_token_accuracy": 0.8683747202157974, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 6.361847877502441, | |
| "learning_rate": 9.595959595959595e-06, | |
| "loss": 2.3375, | |
| "mean_token_accuracy": 0.8720128089189529, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.248, | |
| "grad_norm": 6.138617992401123, | |
| "learning_rate": 9.494949494949495e-06, | |
| "loss": 2.0752, | |
| "mean_token_accuracy": 0.8752965927124023, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.2560000000000002, | |
| "grad_norm": 5.688358783721924, | |
| "learning_rate": 9.393939393939394e-06, | |
| "loss": 1.9569, | |
| "mean_token_accuracy": 0.8975227028131485, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.2640000000000002, | |
| "grad_norm": 5.894192218780518, | |
| "learning_rate": 9.292929292929294e-06, | |
| "loss": 2.0029, | |
| "mean_token_accuracy": 0.886854737997055, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.2720000000000002, | |
| "grad_norm": 6.036264419555664, | |
| "learning_rate": 9.191919191919192e-06, | |
| "loss": 2.2071, | |
| "mean_token_accuracy": 0.8780567795038223, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "grad_norm": 5.998513221740723, | |
| "learning_rate": 9.090909090909091e-06, | |
| "loss": 1.9301, | |
| "mean_token_accuracy": 0.8917412608861923, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.288, | |
| "grad_norm": 6.909607887268066, | |
| "learning_rate": 8.98989898989899e-06, | |
| "loss": 2.1684, | |
| "mean_token_accuracy": 0.871549054980278, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.296, | |
| "grad_norm": 6.868885040283203, | |
| "learning_rate": 8.88888888888889e-06, | |
| "loss": 2.3904, | |
| "mean_token_accuracy": 0.8834272921085358, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.304, | |
| "grad_norm": 6.619902610778809, | |
| "learning_rate": 8.787878787878788e-06, | |
| "loss": 2.3145, | |
| "mean_token_accuracy": 0.8664613366127014, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.312, | |
| "grad_norm": 7.675636291503906, | |
| "learning_rate": 8.686868686868687e-06, | |
| "loss": 2.4144, | |
| "mean_token_accuracy": 0.8673148602247238, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 6.839119911193848, | |
| "learning_rate": 8.585858585858587e-06, | |
| "loss": 2.445, | |
| "mean_token_accuracy": 0.8668957501649857, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.328, | |
| "grad_norm": 6.37800407409668, | |
| "learning_rate": 8.484848484848486e-06, | |
| "loss": 2.0239, | |
| "mean_token_accuracy": 0.8818509876728058, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.336, | |
| "grad_norm": 6.840362071990967, | |
| "learning_rate": 8.383838383838384e-06, | |
| "loss": 2.2702, | |
| "mean_token_accuracy": 0.8701023757457733, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.344, | |
| "grad_norm": 6.7885003089904785, | |
| "learning_rate": 8.282828282828283e-06, | |
| "loss": 2.4609, | |
| "mean_token_accuracy": 0.8644936680793762, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.352, | |
| "grad_norm": 7.857728481292725, | |
| "learning_rate": 8.181818181818183e-06, | |
| "loss": 1.8772, | |
| "mean_token_accuracy": 0.8961075842380524, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 6.570309162139893, | |
| "learning_rate": 8.080808080808082e-06, | |
| "loss": 1.8877, | |
| "mean_token_accuracy": 0.8875200748443604, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.368, | |
| "grad_norm": 6.203190803527832, | |
| "learning_rate": 7.97979797979798e-06, | |
| "loss": 2.529, | |
| "mean_token_accuracy": 0.8654111176729202, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.376, | |
| "grad_norm": 5.97314977645874, | |
| "learning_rate": 7.878787878787878e-06, | |
| "loss": 1.9645, | |
| "mean_token_accuracy": 0.8833965063095093, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.384, | |
| "grad_norm": 6.25310754776001, | |
| "learning_rate": 7.777777777777777e-06, | |
| "loss": 2.2233, | |
| "mean_token_accuracy": 0.8773595839738846, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.392, | |
| "grad_norm": 7.28585958480835, | |
| "learning_rate": 7.676767676767677e-06, | |
| "loss": 2.0955, | |
| "mean_token_accuracy": 0.8743046373128891, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 5.4623260498046875, | |
| "learning_rate": 7.5757575757575764e-06, | |
| "loss": 2.0769, | |
| "mean_token_accuracy": 0.886017695069313, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.408, | |
| "grad_norm": 6.525033950805664, | |
| "learning_rate": 7.474747474747475e-06, | |
| "loss": 1.8359, | |
| "mean_token_accuracy": 0.8860061317682266, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.416, | |
| "grad_norm": 6.582717418670654, | |
| "learning_rate": 7.3737373737373745e-06, | |
| "loss": 1.9729, | |
| "mean_token_accuracy": 0.8987778276205063, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.424, | |
| "grad_norm": 6.132490634918213, | |
| "learning_rate": 7.272727272727272e-06, | |
| "loss": 1.9637, | |
| "mean_token_accuracy": 0.883312463760376, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.432, | |
| "grad_norm": 5.695188045501709, | |
| "learning_rate": 7.171717171717173e-06, | |
| "loss": 2.0213, | |
| "mean_token_accuracy": 0.8891171365976334, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 8.365019798278809, | |
| "learning_rate": 7.0707070707070704e-06, | |
| "loss": 2.4396, | |
| "mean_token_accuracy": 0.8708128333091736, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.448, | |
| "grad_norm": 5.691287040710449, | |
| "learning_rate": 6.969696969696971e-06, | |
| "loss": 2.0038, | |
| "mean_token_accuracy": 0.9007010161876678, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.456, | |
| "grad_norm": 6.807380676269531, | |
| "learning_rate": 6.8686868686868685e-06, | |
| "loss": 1.9772, | |
| "mean_token_accuracy": 0.8897037208080292, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.464, | |
| "grad_norm": 6.571257591247559, | |
| "learning_rate": 6.767676767676769e-06, | |
| "loss": 2.5581, | |
| "mean_token_accuracy": 0.8635726571083069, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 3.472, | |
| "grad_norm": 5.491265296936035, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 1.9038, | |
| "mean_token_accuracy": 0.9004794210195541, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 5.837332725524902, | |
| "learning_rate": 6.565656565656567e-06, | |
| "loss": 1.8962, | |
| "mean_token_accuracy": 0.890899047255516, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.488, | |
| "grad_norm": 6.126434803009033, | |
| "learning_rate": 6.464646464646465e-06, | |
| "loss": 2.0859, | |
| "mean_token_accuracy": 0.8814673125743866, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 3.496, | |
| "grad_norm": 5.65269660949707, | |
| "learning_rate": 6.363636363636363e-06, | |
| "loss": 1.7333, | |
| "mean_token_accuracy": 0.900140568614006, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 3.504, | |
| "grad_norm": 6.0961079597473145, | |
| "learning_rate": 6.262626262626263e-06, | |
| "loss": 1.9624, | |
| "mean_token_accuracy": 0.8910833448171616, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 3.512, | |
| "grad_norm": 6.404435157775879, | |
| "learning_rate": 6.161616161616162e-06, | |
| "loss": 2.3729, | |
| "mean_token_accuracy": 0.8687343299388885, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 6.7053093910217285, | |
| "learning_rate": 6.060606060606061e-06, | |
| "loss": 2.2333, | |
| "mean_token_accuracy": 0.8874952048063278, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.528, | |
| "grad_norm": 6.296670436859131, | |
| "learning_rate": 5.9595959595959605e-06, | |
| "loss": 2.1395, | |
| "mean_token_accuracy": 0.8732293993234634, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.536, | |
| "grad_norm": 6.392675399780273, | |
| "learning_rate": 5.858585858585859e-06, | |
| "loss": 2.0524, | |
| "mean_token_accuracy": 0.8836774080991745, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 3.544, | |
| "grad_norm": 6.301563262939453, | |
| "learning_rate": 5.7575757575757586e-06, | |
| "loss": 1.8739, | |
| "mean_token_accuracy": 0.8968744426965714, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 3.552, | |
| "grad_norm": 6.225803852081299, | |
| "learning_rate": 5.656565656565657e-06, | |
| "loss": 2.0003, | |
| "mean_token_accuracy": 0.8827953040599823, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 6.4414801597595215, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 2.1538, | |
| "mean_token_accuracy": 0.8831271827220917, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.568, | |
| "grad_norm": 8.136292457580566, | |
| "learning_rate": 5.4545454545454545e-06, | |
| "loss": 1.7717, | |
| "mean_token_accuracy": 0.8966427743434906, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 3.576, | |
| "grad_norm": 6.468795299530029, | |
| "learning_rate": 5.353535353535354e-06, | |
| "loss": 1.9399, | |
| "mean_token_accuracy": 0.8909775465726852, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 3.584, | |
| "grad_norm": 6.0381083488464355, | |
| "learning_rate": 5.2525252525252526e-06, | |
| "loss": 2.0466, | |
| "mean_token_accuracy": 0.8934204578399658, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 3.592, | |
| "grad_norm": 6.083271026611328, | |
| "learning_rate": 5.151515151515152e-06, | |
| "loss": 2.1554, | |
| "mean_token_accuracy": 0.88754041492939, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 6.717824935913086, | |
| "learning_rate": 5.050505050505051e-06, | |
| "loss": 2.1326, | |
| "mean_token_accuracy": 0.8911218792200089, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.608, | |
| "grad_norm": 6.14173698425293, | |
| "learning_rate": 4.949494949494949e-06, | |
| "loss": 1.6302, | |
| "mean_token_accuracy": 0.9065367430448532, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 3.616, | |
| "grad_norm": 6.943065643310547, | |
| "learning_rate": 4.848484848484849e-06, | |
| "loss": 2.6283, | |
| "mean_token_accuracy": 0.8691755533218384, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 3.624, | |
| "grad_norm": 6.845998287200928, | |
| "learning_rate": 4.747474747474747e-06, | |
| "loss": 2.1922, | |
| "mean_token_accuracy": 0.8834270387887955, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 3.632, | |
| "grad_norm": 6.045809268951416, | |
| "learning_rate": 4.646464646464647e-06, | |
| "loss": 1.8713, | |
| "mean_token_accuracy": 0.9000080078840256, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 7.274755477905273, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "loss": 2.0781, | |
| "mean_token_accuracy": 0.8945260941982269, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.648, | |
| "grad_norm": 6.384909152984619, | |
| "learning_rate": 4.444444444444445e-06, | |
| "loss": 2.1529, | |
| "mean_token_accuracy": 0.8765928000211716, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 3.656, | |
| "grad_norm": 5.835945129394531, | |
| "learning_rate": 4.343434343434344e-06, | |
| "loss": 1.626, | |
| "mean_token_accuracy": 0.9043529778718948, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 3.664, | |
| "grad_norm": 6.979379177093506, | |
| "learning_rate": 4.242424242424243e-06, | |
| "loss": 2.3105, | |
| "mean_token_accuracy": 0.8783639371395111, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 3.672, | |
| "grad_norm": 7.117347240447998, | |
| "learning_rate": 4.141414141414142e-06, | |
| "loss": 2.1868, | |
| "mean_token_accuracy": 0.884697675704956, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 6.831372261047363, | |
| "learning_rate": 4.040404040404041e-06, | |
| "loss": 2.1852, | |
| "mean_token_accuracy": 0.8815398663282394, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.6879999999999997, | |
| "grad_norm": 6.203273773193359, | |
| "learning_rate": 3.939393939393939e-06, | |
| "loss": 2.1329, | |
| "mean_token_accuracy": 0.8797174096107483, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 3.6959999999999997, | |
| "grad_norm": 5.709746837615967, | |
| "learning_rate": 3.8383838383838385e-06, | |
| "loss": 2.0121, | |
| "mean_token_accuracy": 0.8906229883432388, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 3.7039999999999997, | |
| "grad_norm": 6.700322151184082, | |
| "learning_rate": 3.7373737373737375e-06, | |
| "loss": 2.2537, | |
| "mean_token_accuracy": 0.8837107121944427, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 3.7119999999999997, | |
| "grad_norm": 5.748446464538574, | |
| "learning_rate": 3.636363636363636e-06, | |
| "loss": 2.199, | |
| "mean_token_accuracy": 0.8685683310031891, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.7199999999999998, | |
| "grad_norm": 6.6395697593688965, | |
| "learning_rate": 3.5353535353535352e-06, | |
| "loss": 2.4631, | |
| "mean_token_accuracy": 0.8770331889390945, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.7279999999999998, | |
| "grad_norm": 6.521454334259033, | |
| "learning_rate": 3.4343434343434343e-06, | |
| "loss": 2.7614, | |
| "mean_token_accuracy": 0.8615328371524811, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.7359999999999998, | |
| "grad_norm": 5.956634998321533, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 2.2834, | |
| "mean_token_accuracy": 0.8763253539800644, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.7439999999999998, | |
| "grad_norm": 7.035250186920166, | |
| "learning_rate": 3.2323232323232324e-06, | |
| "loss": 2.033, | |
| "mean_token_accuracy": 0.8936846852302551, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.752, | |
| "grad_norm": 6.001895904541016, | |
| "learning_rate": 3.1313131313131314e-06, | |
| "loss": 2.005, | |
| "mean_token_accuracy": 0.8896369785070419, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 6.189558506011963, | |
| "learning_rate": 3.0303030303030305e-06, | |
| "loss": 2.1698, | |
| "mean_token_accuracy": 0.8766046166419983, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.768, | |
| "grad_norm": 6.994729995727539, | |
| "learning_rate": 2.9292929292929295e-06, | |
| "loss": 2.108, | |
| "mean_token_accuracy": 0.8722548186779022, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.776, | |
| "grad_norm": 5.766332626342773, | |
| "learning_rate": 2.8282828282828286e-06, | |
| "loss": 1.7626, | |
| "mean_token_accuracy": 0.8968187123537064, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.784, | |
| "grad_norm": 6.6420392990112305, | |
| "learning_rate": 2.7272727272727272e-06, | |
| "loss": 2.1263, | |
| "mean_token_accuracy": 0.8799799233675003, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.792, | |
| "grad_norm": 6.921932220458984, | |
| "learning_rate": 2.6262626262626263e-06, | |
| "loss": 1.8222, | |
| "mean_token_accuracy": 0.905672699213028, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 6.345583915710449, | |
| "learning_rate": 2.5252525252525253e-06, | |
| "loss": 2.3984, | |
| "mean_token_accuracy": 0.8654076457023621, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.808, | |
| "grad_norm": 6.0304484367370605, | |
| "learning_rate": 2.4242424242424244e-06, | |
| "loss": 2.5167, | |
| "mean_token_accuracy": 0.8724007159471512, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.816, | |
| "grad_norm": 6.487928867340088, | |
| "learning_rate": 2.3232323232323234e-06, | |
| "loss": 2.5901, | |
| "mean_token_accuracy": 0.8569000661373138, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.824, | |
| "grad_norm": 6.210375785827637, | |
| "learning_rate": 2.2222222222222225e-06, | |
| "loss": 1.7313, | |
| "mean_token_accuracy": 0.8982816338539124, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.832, | |
| "grad_norm": 6.7894182205200195, | |
| "learning_rate": 2.1212121212121216e-06, | |
| "loss": 2.121, | |
| "mean_token_accuracy": 0.8915885388851166, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 6.732741355895996, | |
| "learning_rate": 2.0202020202020206e-06, | |
| "loss": 2.0609, | |
| "mean_token_accuracy": 0.8849688917398453, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.848, | |
| "grad_norm": 6.280490875244141, | |
| "learning_rate": 1.9191919191919192e-06, | |
| "loss": 2.0463, | |
| "mean_token_accuracy": 0.8867028504610062, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.856, | |
| "grad_norm": 6.721777439117432, | |
| "learning_rate": 1.818181818181818e-06, | |
| "loss": 2.2023, | |
| "mean_token_accuracy": 0.8840513229370117, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.864, | |
| "grad_norm": 6.34207010269165, | |
| "learning_rate": 1.7171717171717171e-06, | |
| "loss": 2.2238, | |
| "mean_token_accuracy": 0.8773442953824997, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.872, | |
| "grad_norm": 6.753549098968506, | |
| "learning_rate": 1.6161616161616162e-06, | |
| "loss": 2.1247, | |
| "mean_token_accuracy": 0.8759674429893494, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 6.267600059509277, | |
| "learning_rate": 1.5151515151515152e-06, | |
| "loss": 2.1506, | |
| "mean_token_accuracy": 0.8819513469934464, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.888, | |
| "grad_norm": 6.509019374847412, | |
| "learning_rate": 1.4141414141414143e-06, | |
| "loss": 2.6317, | |
| "mean_token_accuracy": 0.8672285228967667, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.896, | |
| "grad_norm": 6.893660068511963, | |
| "learning_rate": 1.3131313131313131e-06, | |
| "loss": 2.1853, | |
| "mean_token_accuracy": 0.8704394996166229, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.904, | |
| "grad_norm": 6.314718723297119, | |
| "learning_rate": 1.2121212121212122e-06, | |
| "loss": 1.7886, | |
| "mean_token_accuracy": 0.8912533521652222, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.912, | |
| "grad_norm": 7.181535243988037, | |
| "learning_rate": 1.1111111111111112e-06, | |
| "loss": 2.2651, | |
| "mean_token_accuracy": 0.8664899617433548, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 6.317063331604004, | |
| "learning_rate": 1.0101010101010103e-06, | |
| "loss": 2.3394, | |
| "mean_token_accuracy": 0.8771510571241379, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.928, | |
| "grad_norm": 6.195638656616211, | |
| "learning_rate": 9.09090909090909e-07, | |
| "loss": 2.0807, | |
| "mean_token_accuracy": 0.8755189925432205, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 3.936, | |
| "grad_norm": 6.2105817794799805, | |
| "learning_rate": 8.080808080808081e-07, | |
| "loss": 1.7974, | |
| "mean_token_accuracy": 0.9031495600938797, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 3.944, | |
| "grad_norm": 6.697377681732178, | |
| "learning_rate": 7.070707070707071e-07, | |
| "loss": 2.0917, | |
| "mean_token_accuracy": 0.8885557353496552, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 3.952, | |
| "grad_norm": 6.320727825164795, | |
| "learning_rate": 6.060606060606061e-07, | |
| "loss": 2.0905, | |
| "mean_token_accuracy": 0.8876091539859772, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 5.805496692657471, | |
| "learning_rate": 5.050505050505052e-07, | |
| "loss": 2.1719, | |
| "mean_token_accuracy": 0.8817218542098999, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 3.968, | |
| "grad_norm": 6.263917446136475, | |
| "learning_rate": 4.0404040404040405e-07, | |
| "loss": 1.7863, | |
| "mean_token_accuracy": 0.8985198885202408, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 3.976, | |
| "grad_norm": 5.801756858825684, | |
| "learning_rate": 3.0303030303030305e-07, | |
| "loss": 2.0907, | |
| "mean_token_accuracy": 0.8869260847568512, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 3.984, | |
| "grad_norm": 7.1110310554504395, | |
| "learning_rate": 2.0202020202020202e-07, | |
| "loss": 2.3715, | |
| "mean_token_accuracy": 0.8689504116773605, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 3.992, | |
| "grad_norm": 7.1602911949157715, | |
| "learning_rate": 1.0101010101010101e-07, | |
| "loss": 2.4322, | |
| "mean_token_accuracy": 0.8692635595798492, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 6.362170696258545, | |
| "learning_rate": 0.0, | |
| "loss": 1.9344, | |
| "mean_token_accuracy": 0.8930138498544693, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7732474675200000.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |