{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.1278437465320166,
  "eval_steps": 500,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0035512151814449007,
      "grad_norm": 3.59375,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 0.4053,
      "mean_token_accuracy": 0.03334063550209976,
      "step": 1
    },
    {
      "epoch": 0.007102430362889801,
      "grad_norm": 5.0625,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 0.4467,
      "mean_token_accuracy": 0.035705664253327996,
      "step": 2
    },
    {
      "epoch": 0.010653645544334702,
      "grad_norm": 3.84375,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 0.412,
      "mean_token_accuracy": 0.03214258457956021,
      "step": 3
    },
    {
      "epoch": 0.014204860725779603,
      "grad_norm": 5.4375,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.4373,
      "mean_token_accuracy": 0.031811027434741845,
      "step": 4
    },
    {
      "epoch": 0.017756075907224503,
      "grad_norm": 3.671875,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 0.4044,
      "mean_token_accuracy": 0.030799477463006042,
      "step": 5
    },
    {
      "epoch": 0.021307291088669404,
      "grad_norm": 3.40625,
      "learning_rate": 8.571428571428571e-06,
      "loss": 0.4079,
      "mean_token_accuracy": 0.029206037786934758,
      "step": 6
    },
    {
      "epoch": 0.024858506270114305,
      "grad_norm": 3.15625,
      "learning_rate": 1e-05,
      "loss": 0.3746,
      "mean_token_accuracy": 0.02718283714784775,
      "step": 7
    },
    {
      "epoch": 0.028409721451559206,
      "grad_norm": 2.59375,
      "learning_rate": 9.999980365120307e-06,
      "loss": 0.3386,
      "mean_token_accuracy": 0.029688632937904913,
      "step": 8
    },
    {
      "epoch": 0.0319609366330041,
      "grad_norm": 3.296875,
      "learning_rate": 9.999921460635436e-06,
      "loss": 0.3371,
      "mean_token_accuracy": 0.028260885846975725,
      "step": 9
    },
    {
      "epoch": 0.03551215181444901,
      "grad_norm": 3.0,
      "learning_rate": 9.999823287008022e-06,
      "loss": 0.3134,
      "mean_token_accuracy": 0.028826194098655833,
      "step": 10
    },
    {
      "epoch": 0.039063366995893904,
      "grad_norm": 2.640625,
      "learning_rate": 9.999685845009114e-06,
      "loss": 0.3141,
      "mean_token_accuracy": 0.03309909115705523,
      "step": 11
    },
    {
      "epoch": 0.04261458217733881,
      "grad_norm": 2.3125,
      "learning_rate": 9.999509135718176e-06,
      "loss": 0.3091,
      "mean_token_accuracy": 0.033378408043063246,
      "step": 12
    },
    {
      "epoch": 0.046165797358783706,
      "grad_norm": 2.0625,
      "learning_rate": 9.999293160523074e-06,
      "loss": 0.3064,
      "mean_token_accuracy": 0.031830415049626026,
      "step": 13
    },
    {
      "epoch": 0.04971701254022861,
      "grad_norm": 2.046875,
      "learning_rate": 9.999037921120068e-06,
      "loss": 0.2926,
      "mean_token_accuracy": 0.0313498438699753,
      "step": 14
    },
    {
      "epoch": 0.05326822772167351,
      "grad_norm": 2.453125,
      "learning_rate": 9.998743419513795e-06,
      "loss": 0.2893,
      "mean_token_accuracy": 0.0330949738236086,
      "step": 15
    },
    {
      "epoch": 0.05681944290311841,
      "grad_norm": 2.3125,
      "learning_rate": 9.998409658017256e-06,
      "loss": 0.2984,
      "mean_token_accuracy": 0.027138729998114286,
      "step": 16
    },
    {
      "epoch": 0.06037065808456331,
      "grad_norm": 2.84375,
      "learning_rate": 9.998036639251798e-06,
      "loss": 0.2999,
      "mean_token_accuracy": 0.03013355429357034,
      "step": 17
    },
    {
      "epoch": 0.0639218732660082,
      "grad_norm": 2.515625,
      "learning_rate": 9.997624366147094e-06,
      "loss": 0.2867,
      "mean_token_accuracy": 0.031100922809855547,
      "step": 18
    },
    {
      "epoch": 0.06747308844745312,
      "grad_norm": 1.5703125,
      "learning_rate": 9.997172841941114e-06,
      "loss": 0.279,
      "mean_token_accuracy": 0.03025696396798594,
      "step": 19
    },
    {
      "epoch": 0.07102430362889801,
      "grad_norm": 1.6953125,
      "learning_rate": 9.99668207018011e-06,
      "loss": 0.2848,
      "mean_token_accuracy": 0.032818032945215236,
      "step": 20
    },
    {
      "epoch": 0.07457551881034291,
      "grad_norm": 1.3671875,
      "learning_rate": 9.996152054718579e-06,
      "loss": 0.2815,
      "mean_token_accuracy": 0.03186455541072064,
      "step": 21
    },
    {
      "epoch": 0.07812673399178781,
      "grad_norm": 0.98046875,
      "learning_rate": 9.995582799719237e-06,
      "loss": 0.2878,
      "mean_token_accuracy": 0.0334690267482074,
      "step": 22
    },
    {
      "epoch": 0.08167794917323272,
      "grad_norm": 1.234375,
      "learning_rate": 9.994974309652984e-06,
      "loss": 0.2826,
      "mean_token_accuracy": 0.03061204146433738,
      "step": 23
    },
    {
      "epoch": 0.08522916435467762,
      "grad_norm": 1.0859375,
      "learning_rate": 9.994326589298875e-06,
      "loss": 0.2673,
      "mean_token_accuracy": 0.03804276299706544,
      "step": 24
    },
    {
      "epoch": 0.08878037953612251,
      "grad_norm": 1.46875,
      "learning_rate": 9.993639643744071e-06,
      "loss": 0.2909,
      "mean_token_accuracy": 0.033314747761323815,
      "step": 25
    },
    {
      "epoch": 0.09233159471756741,
      "grad_norm": 1.6171875,
      "learning_rate": 9.99291347838381e-06,
      "loss": 0.2774,
      "mean_token_accuracy": 0.030899848927219864,
      "step": 26
    },
    {
      "epoch": 0.09588280989901232,
      "grad_norm": 1.015625,
      "learning_rate": 9.992148098921361e-06,
      "loss": 0.2609,
      "mean_token_accuracy": 0.033753670039004646,
      "step": 27
    },
    {
      "epoch": 0.09943402508045722,
      "grad_norm": 1.046875,
      "learning_rate": 9.99134351136798e-06,
      "loss": 0.2721,
      "mean_token_accuracy": 0.03772788319474785,
      "step": 28
    },
    {
      "epoch": 0.10298524026190212,
      "grad_norm": 1.1171875,
      "learning_rate": 9.990499722042852e-06,
      "loss": 0.2751,
      "mean_token_accuracy": 0.032295682049152674,
      "step": 29
    },
    {
      "epoch": 0.10653645544334701,
      "grad_norm": 1.4453125,
      "learning_rate": 9.989616737573064e-06,
      "loss": 0.2781,
      "mean_token_accuracy": 0.033647085841948865,
      "step": 30
    },
    {
      "epoch": 0.11008767062479193,
      "grad_norm": 1.2890625,
      "learning_rate": 9.98869456489353e-06,
      "loss": 0.2779,
      "mean_token_accuracy": 0.028794426627428038,
      "step": 31
    },
    {
      "epoch": 0.11363888580623682,
      "grad_norm": 0.89453125,
      "learning_rate": 9.987733211246952e-06,
      "loss": 0.2607,
      "mean_token_accuracy": 0.032419106319139246,
      "step": 32
    },
    {
      "epoch": 0.11719010098768172,
      "grad_norm": 1.1640625,
      "learning_rate": 9.986732684183753e-06,
      "loss": 0.2605,
      "mean_token_accuracy": 0.037521557434956776,
      "step": 33
    },
    {
      "epoch": 0.12074131616912662,
      "grad_norm": 1.6328125,
      "learning_rate": 9.985692991562026e-06,
      "loss": 0.2634,
      "mean_token_accuracy": 0.040194204961153446,
      "step": 34
    },
    {
      "epoch": 0.12429253135057153,
      "grad_norm": 1.28125,
      "learning_rate": 9.984614141547468e-06,
      "loss": 0.2557,
      "mean_token_accuracy": 0.039635664583329344,
      "step": 35
    },
    {
      "epoch": 0.1278437465320164,
      "grad_norm": 1.546875,
      "learning_rate": 9.983496142613314e-06,
      "loss": 0.2685,
      "mean_token_accuracy": 0.029371898828685516,
      "step": 36
    },
    {
      "epoch": 0.13139496171346132,
      "grad_norm": 0.921875,
      "learning_rate": 9.982339003540272e-06,
      "loss": 0.2693,
      "mean_token_accuracy": 0.03898763568940922,
      "step": 37
    },
    {
      "epoch": 0.13494617689490623,
      "grad_norm": 1.1015625,
      "learning_rate": 9.981142733416457e-06,
      "loss": 0.2679,
      "mean_token_accuracy": 0.03345580608220189,
      "step": 38
    },
    {
      "epoch": 0.13849739207635112,
      "grad_norm": 0.96875,
      "learning_rate": 9.97990734163732e-06,
      "loss": 0.2602,
      "mean_token_accuracy": 0.03686568816374347,
      "step": 39
    },
    {
      "epoch": 0.14204860725779603,
      "grad_norm": 0.84375,
      "learning_rate": 9.978632837905566e-06,
      "loss": 0.262,
      "mean_token_accuracy": 0.03514496624120511,
      "step": 40
    },
    {
      "epoch": 0.14559982243924094,
      "grad_norm": 1.2265625,
      "learning_rate": 9.977319232231088e-06,
      "loss": 0.264,
      "mean_token_accuracy": 0.03551164961390896,
      "step": 41
    },
    {
      "epoch": 0.14915103762068582,
      "grad_norm": 1.2578125,
      "learning_rate": 9.975966534930879e-06,
      "loss": 0.2641,
      "mean_token_accuracy": 0.03131604355075979,
      "step": 42
    },
    {
      "epoch": 0.15270225280213073,
      "grad_norm": 1.0703125,
      "learning_rate": 9.974574756628961e-06,
      "loss": 0.2663,
      "mean_token_accuracy": 0.035479316422424745,
      "step": 43
    },
    {
      "epoch": 0.15625346798357562,
      "grad_norm": 1.15625,
      "learning_rate": 9.973143908256291e-06,
      "loss": 0.2637,
      "mean_token_accuracy": 0.03412504114021431,
      "step": 44
    },
    {
      "epoch": 0.15980468316502053,
      "grad_norm": 1.5546875,
      "learning_rate": 9.971674001050687e-06,
      "loss": 0.2602,
      "mean_token_accuracy": 0.038116581108624814,
      "step": 45
    },
    {
      "epoch": 0.16335589834646544,
      "grad_norm": 1.125,
      "learning_rate": 9.970165046556726e-06,
      "loss": 0.2529,
      "mean_token_accuracy": 0.03723000747777405,
      "step": 46
    },
    {
      "epoch": 0.16690711352791032,
      "grad_norm": 1.0234375,
      "learning_rate": 9.968617056625665e-06,
      "loss": 0.2597,
      "mean_token_accuracy": 0.031160971375356894,
      "step": 47
    },
    {
      "epoch": 0.17045832870935523,
      "grad_norm": 0.98828125,
      "learning_rate": 9.967030043415345e-06,
      "loss": 0.2654,
      "mean_token_accuracy": 0.03424461206304841,
      "step": 48
    },
    {
      "epoch": 0.17400954389080014,
      "grad_norm": 1.25,
      "learning_rate": 9.965404019390087e-06,
      "loss": 0.2778,
      "mean_token_accuracy": 0.030317287382786162,
      "step": 49
    },
    {
      "epoch": 0.17756075907224503,
      "grad_norm": 1.1484375,
      "learning_rate": 9.963738997320609e-06,
      "loss": 0.2656,
      "mean_token_accuracy": 0.028600392921362072,
      "step": 50
    },
    {
      "epoch": 0.18111197425368994,
      "grad_norm": 1.15625,
      "learning_rate": 9.962034990283912e-06,
      "loss": 0.28,
      "mean_token_accuracy": 0.03231685267383,
      "step": 51
    },
    {
      "epoch": 0.18466318943513482,
      "grad_norm": 1.0,
      "learning_rate": 9.960292011663186e-06,
      "loss": 0.2533,
      "mean_token_accuracy": 0.03403068816260202,
      "step": 52
    },
    {
      "epoch": 0.18821440461657973,
      "grad_norm": 1.25,
      "learning_rate": 9.958510075147703e-06,
      "loss": 0.259,
      "mean_token_accuracy": 0.032833737386681605,
      "step": 53
    },
    {
      "epoch": 0.19176561979802464,
      "grad_norm": 1.34375,
      "learning_rate": 9.956689194732702e-06,
      "loss": 0.2656,
      "mean_token_accuracy": 0.035690422784682596,
      "step": 54
    },
    {
      "epoch": 0.19531683497946953,
      "grad_norm": 0.99609375,
      "learning_rate": 9.954829384719296e-06,
      "loss": 0.2532,
      "mean_token_accuracy": 0.036441230346099474,
      "step": 55
    },
    {
      "epoch": 0.19886805016091444,
      "grad_norm": 1.0546875,
      "learning_rate": 9.95293065971434e-06,
      "loss": 0.2651,
      "mean_token_accuracy": 0.034422322398313554,
      "step": 56
    },
    {
      "epoch": 0.20241926534235935,
      "grad_norm": 1.140625,
      "learning_rate": 9.950993034630328e-06,
      "loss": 0.2657,
      "mean_token_accuracy": 0.03684557715678238,
      "step": 57
    },
    {
      "epoch": 0.20597048052380423,
      "grad_norm": 1.2890625,
      "learning_rate": 9.949016524685277e-06,
      "loss": 0.2558,
      "mean_token_accuracy": 0.03600433477549814,
      "step": 58
    },
    {
      "epoch": 0.20952169570524914,
      "grad_norm": 0.9921875,
      "learning_rate": 9.947001145402598e-06,
      "loss": 0.2528,
      "mean_token_accuracy": 0.03322864700021455,
      "step": 59
    },
    {
      "epoch": 0.21307291088669403,
      "grad_norm": 0.90625,
      "learning_rate": 9.944946912610986e-06,
      "loss": 0.2468,
      "mean_token_accuracy": 0.03672743546849233,
      "step": 60
    },
    {
      "epoch": 0.21662412606813894,
      "grad_norm": 0.87890625,
      "learning_rate": 9.942853842444283e-06,
      "loss": 0.27,
      "mean_token_accuracy": 0.032799736989545636,
      "step": 61
    },
    {
      "epoch": 0.22017534124958385,
      "grad_norm": 1.59375,
      "learning_rate": 9.940721951341365e-06,
      "loss": 0.2626,
      "mean_token_accuracy": 0.03218006519819028,
      "step": 62
    },
    {
      "epoch": 0.22372655643102873,
      "grad_norm": 1.21875,
      "learning_rate": 9.938551256046e-06,
      "loss": 0.2527,
      "mean_token_accuracy": 0.03779199095151853,
      "step": 63
    },
    {
      "epoch": 0.22727777161247364,
      "grad_norm": 1.140625,
      "learning_rate": 9.936341773606723e-06,
      "loss": 0.2505,
      "mean_token_accuracy": 0.03585068928805413,
      "step": 64
    },
    {
      "epoch": 0.23082898679391856,
      "grad_norm": 0.91015625,
      "learning_rate": 9.934093521376707e-06,
      "loss": 0.2453,
      "mean_token_accuracy": 0.03667068052163813,
      "step": 65
    },
    {
      "epoch": 0.23438020197536344,
      "grad_norm": 1.125,
      "learning_rate": 9.931806517013612e-06,
      "loss": 0.2549,
      "mean_token_accuracy": 0.03179566226390307,
      "step": 66
    },
    {
      "epoch": 0.23793141715680835,
      "grad_norm": 1.5546875,
      "learning_rate": 9.929480778479465e-06,
      "loss": 0.2578,
      "mean_token_accuracy": 0.03751813623966882,
      "step": 67
    },
    {
      "epoch": 0.24148263233825323,
      "grad_norm": 1.828125,
      "learning_rate": 9.9271163240405e-06,
      "loss": 0.2516,
      "mean_token_accuracy": 0.038412457124650246,
      "step": 68
    },
    {
      "epoch": 0.24503384751969814,
      "grad_norm": 1.015625,
      "learning_rate": 9.92471317226703e-06,
      "loss": 0.2507,
      "mean_token_accuracy": 0.034873580965722795,
      "step": 69
    },
    {
      "epoch": 0.24858506270114306,
      "grad_norm": 1.7109375,
      "learning_rate": 9.922271342033295e-06,
      "loss": 0.239,
      "mean_token_accuracy": 0.03720474839792587,
      "step": 70
    },
    {
      "epoch": 0.25213627788258797,
      "grad_norm": 1.1171875,
      "learning_rate": 9.919790852517313e-06,
      "loss": 0.2463,
      "mean_token_accuracy": 0.0361522939929273,
      "step": 71
    },
    {
      "epoch": 0.2556874930640328,
      "grad_norm": 1.0390625,
      "learning_rate": 9.917271723200725e-06,
      "loss": 0.2585,
      "mean_token_accuracy": 0.033959298038098495,
      "step": 72
    },
    {
      "epoch": 0.25923870824547773,
      "grad_norm": 1.03125,
      "learning_rate": 9.914713973868654e-06,
      "loss": 0.2585,
      "mean_token_accuracy": 0.035365104355150834,
      "step": 73
    },
    {
      "epoch": 0.26278992342692264,
      "grad_norm": 1.078125,
      "learning_rate": 9.91211762460954e-06,
      "loss": 0.2674,
      "mean_token_accuracy": 0.03293743663743953,
      "step": 74
    },
    {
      "epoch": 0.26634113860836756,
      "grad_norm": 0.9609375,
      "learning_rate": 9.909482695814986e-06,
      "loss": 0.2446,
      "mean_token_accuracy": 0.0343262117858103,
      "step": 75
    },
    {
      "epoch": 0.26989235378981247,
      "grad_norm": 1.1640625,
      "learning_rate": 9.906809208179593e-06,
      "loss": 0.2521,
      "mean_token_accuracy": 0.03826202965865377,
      "step": 76
    },
    {
      "epoch": 0.2734435689712574,
      "grad_norm": 1.515625,
      "learning_rate": 9.904097182700806e-06,
      "loss": 0.2589,
      "mean_token_accuracy": 0.03630730328222853,
      "step": 77
    },
    {
      "epoch": 0.27699478415270223,
      "grad_norm": 1.1328125,
      "learning_rate": 9.901346640678744e-06,
      "loss": 0.2413,
      "mean_token_accuracy": 0.03989503123011673,
      "step": 78
    },
    {
      "epoch": 0.28054599933414714,
      "grad_norm": 1.484375,
      "learning_rate": 9.898557603716031e-06,
      "loss": 0.2658,
      "mean_token_accuracy": 0.03677298163893283,
      "step": 79
    },
    {
      "epoch": 0.28409721451559206,
      "grad_norm": 1.09375,
      "learning_rate": 9.895730093717629e-06,
      "loss": 0.252,
      "mean_token_accuracy": 0.03291640393581474,
      "step": 80
    },
    {
      "epoch": 0.28764842969703697,
      "grad_norm": 0.98046875,
      "learning_rate": 9.892864132890663e-06,
      "loss": 0.2562,
      "mean_token_accuracy": 0.03596020668192068,
      "step": 81
    },
    {
      "epoch": 0.2911996448784819,
      "grad_norm": 1.2734375,
      "learning_rate": 9.889959743744253e-06,
      "loss": 0.2475,
      "mean_token_accuracy": 0.03590105600960669,
      "step": 82
    },
    {
      "epoch": 0.29475086005992673,
      "grad_norm": 1.3203125,
      "learning_rate": 9.887016949089334e-06,
      "loss": 0.2616,
      "mean_token_accuracy": 0.03479768028228136,
      "step": 83
    },
    {
      "epoch": 0.29830207524137164,
      "grad_norm": 1.0546875,
      "learning_rate": 9.884035772038471e-06,
      "loss": 0.2488,
      "mean_token_accuracy": 0.042390721162519185,
      "step": 84
    },
    {
      "epoch": 0.30185329042281656,
      "grad_norm": 1.203125,
      "learning_rate": 9.881016236005686e-06,
      "loss": 0.2452,
      "mean_token_accuracy": 0.03633292374070152,
      "step": 85
    },
    {
      "epoch": 0.30540450560426147,
      "grad_norm": 0.828125,
      "learning_rate": 9.877958364706269e-06,
      "loss": 0.2486,
      "mean_token_accuracy": 0.03563892778765876,
      "step": 86
    },
    {
      "epoch": 0.3089557207857064,
      "grad_norm": 1.0546875,
      "learning_rate": 9.874862182156596e-06,
      "loss": 0.2606,
      "mean_token_accuracy": 0.031386454902531113,
      "step": 87
    },
    {
      "epoch": 0.31250693596715123,
      "grad_norm": 1.078125,
      "learning_rate": 9.871727712673931e-06,
      "loss": 0.2539,
      "mean_token_accuracy": 0.03635412478615763,
      "step": 88
    },
    {
      "epoch": 0.31605815114859614,
      "grad_norm": 1.0390625,
      "learning_rate": 9.868554980876253e-06,
      "loss": 0.2501,
      "mean_token_accuracy": 0.03346586779662175,
      "step": 89
    },
    {
      "epoch": 0.31960936633004106,
      "grad_norm": 1.7734375,
      "learning_rate": 9.865344011682038e-06,
      "loss": 0.261,
      "mean_token_accuracy": 0.03928510249170358,
      "step": 90
    },
    {
      "epoch": 0.32316058151148597,
      "grad_norm": 1.234375,
      "learning_rate": 9.86209483031009e-06,
      "loss": 0.2564,
      "mean_token_accuracy": 0.03208838186401408,
      "step": 91
    },
    {
      "epoch": 0.3267117966929309,
      "grad_norm": 1.75,
      "learning_rate": 9.858807462279319e-06,
      "loss": 0.2611,
      "mean_token_accuracy": 0.034783691704433295,
      "step": 92
    },
    {
      "epoch": 0.3302630118743758,
      "grad_norm": 1.3828125,
      "learning_rate": 9.855481933408557e-06,
      "loss": 0.2563,
      "mean_token_accuracy": 0.035772691921010846,
      "step": 93
    },
    {
      "epoch": 0.33381422705582064,
      "grad_norm": 2.015625,
      "learning_rate": 9.852118269816348e-06,
      "loss": 0.2562,
      "mean_token_accuracy": 0.03401207756360236,
      "step": 94
    },
    {
      "epoch": 0.33736544223726556,
      "grad_norm": 1.328125,
      "learning_rate": 9.848716497920742e-06,
      "loss": 0.25,
      "mean_token_accuracy": 0.033619258623730275,
      "step": 95
    },
    {
      "epoch": 0.34091665741871047,
      "grad_norm": 1.7265625,
      "learning_rate": 9.845276644439093e-06,
      "loss": 0.2491,
      "mean_token_accuracy": 0.03523858997687057,
      "step": 96
    },
    {
      "epoch": 0.3444678726001554,
      "grad_norm": 1.2578125,
      "learning_rate": 9.841798736387846e-06,
      "loss": 0.2603,
      "mean_token_accuracy": 0.036500092544883955,
      "step": 97
    },
    {
      "epoch": 0.3480190877816003,
      "grad_norm": 1.1484375,
      "learning_rate": 9.838282801082322e-06,
      "loss": 0.2508,
      "mean_token_accuracy": 0.03253259204575443,
      "step": 98
    },
    {
      "epoch": 0.35157030296304514,
      "grad_norm": 1.0703125,
      "learning_rate": 9.834728866136506e-06,
      "loss": 0.2622,
      "mean_token_accuracy": 0.0348993868137768,
      "step": 99
    },
    {
      "epoch": 0.35512151814449006,
      "grad_norm": 1.015625,
      "learning_rate": 9.831136959462835e-06,
      "loss": 0.2535,
      "mean_token_accuracy": 0.03625574364923523,
      "step": 100
    },
    {
      "epoch": 0.35867273332593497,
      "grad_norm": 2.1875,
      "learning_rate": 9.82750710927197e-06,
      "loss": 0.2343,
      "mean_token_accuracy": 0.038484412667457946,
      "step": 101
    },
    {
      "epoch": 0.3622239485073799,
      "grad_norm": 1.109375,
      "learning_rate": 9.823839344072582e-06,
      "loss": 0.2504,
      "mean_token_accuracy": 0.04149091762883472,
      "step": 102
    },
    {
      "epoch": 0.3657751636888248,
      "grad_norm": 1.0234375,
      "learning_rate": 9.820133692671116e-06,
      "loss": 0.248,
      "mean_token_accuracy": 0.03215024942983291,
      "step": 103
    },
    {
      "epoch": 0.36932637887026964,
      "grad_norm": 1.078125,
      "learning_rate": 9.816390184171587e-06,
      "loss": 0.2525,
      "mean_token_accuracy": 0.032526576575037325,
      "step": 104
    },
    {
      "epoch": 0.37287759405171456,
      "grad_norm": 1.3125,
      "learning_rate": 9.812608847975327e-06,
      "loss": 0.2373,
      "mean_token_accuracy": 0.03747400016436586,
      "step": 105
    },
    {
      "epoch": 0.37642880923315947,
      "grad_norm": 1.0234375,
      "learning_rate": 9.808789713780768e-06,
      "loss": 0.2594,
      "mean_token_accuracy": 0.03837547679722775,
      "step": 106
    },
    {
      "epoch": 0.3799800244146044,
      "grad_norm": 1.6015625,
      "learning_rate": 9.804932811583208e-06,
      "loss": 0.2468,
      "mean_token_accuracy": 0.035400711036345456,
      "step": 107
    },
    {
      "epoch": 0.3835312395960493,
      "grad_norm": 1.25,
      "learning_rate": 9.801038171674571e-06,
      "loss": 0.2549,
      "mean_token_accuracy": 0.036432786924706306,
      "step": 108
    },
    {
      "epoch": 0.3870824547774942,
      "grad_norm": 1.171875,
      "learning_rate": 9.797105824643171e-06,
      "loss": 0.264,
      "mean_token_accuracy": 0.02840231164736906,
      "step": 109
    },
    {
      "epoch": 0.39063366995893906,
      "grad_norm": 1.0859375,
      "learning_rate": 9.793135801373472e-06,
      "loss": 0.2612,
      "mean_token_accuracy": 0.031225431059283437,
      "step": 110
    },
    {
      "epoch": 0.39418488514038397,
      "grad_norm": 1.5546875,
      "learning_rate": 9.789128133045846e-06,
      "loss": 0.2458,
      "mean_token_accuracy": 0.04181941410206491,
      "step": 111
    },
    {
      "epoch": 0.3977361003218289,
      "grad_norm": 1.1953125,
      "learning_rate": 9.785082851136327e-06,
      "loss": 0.2472,
      "mean_token_accuracy": 0.036744315055329935,
      "step": 112
    },
    {
      "epoch": 0.4012873155032738,
      "grad_norm": 1.234375,
      "learning_rate": 9.780999987416363e-06,
      "loss": 0.2613,
      "mean_token_accuracy": 0.03466283130183001,
      "step": 113
    },
    {
      "epoch": 0.4048385306847187,
      "grad_norm": 1.015625,
      "learning_rate": 9.776879573952573e-06,
      "loss": 0.2608,
      "mean_token_accuracy": 0.03970765234407736,
      "step": 114
    },
    {
      "epoch": 0.40838974586616356,
      "grad_norm": 1.5234375,
      "learning_rate": 9.772721643106483e-06,
      "loss": 0.2509,
      "mean_token_accuracy": 0.034389628966891905,
      "step": 115
    },
    {
      "epoch": 0.41194096104760847,
      "grad_norm": 1.1796875,
      "learning_rate": 9.768526227534286e-06,
      "loss": 0.2492,
      "mean_token_accuracy": 0.036198436089762254,
      "step": 116
    },
    {
      "epoch": 0.4154921762290534,
      "grad_norm": 1.25,
      "learning_rate": 9.764293360186568e-06,
      "loss": 0.2477,
      "mean_token_accuracy": 0.041034682388271904,
      "step": 117
    },
    {
      "epoch": 0.4190433914104983,
      "grad_norm": 1.2734375,
      "learning_rate": 9.760023074308067e-06,
      "loss": 0.2589,
      "mean_token_accuracy": 0.036038057656696765,
      "step": 118
    },
    {
      "epoch": 0.4225946065919432,
      "grad_norm": 1.2734375,
      "learning_rate": 9.755715403437405e-06,
      "loss": 0.2536,
      "mean_token_accuracy": 0.02993579488611431,
      "step": 119
    },
    {
      "epoch": 0.42614582177338806,
      "grad_norm": 1.0390625,
      "learning_rate": 9.75137038140682e-06,
      "loss": 0.2483,
      "mean_token_accuracy": 0.03487629652772739,
      "step": 120
    },
    {
      "epoch": 0.42969703695483297,
      "grad_norm": 1.390625,
      "learning_rate": 9.746988042341907e-06,
      "loss": 0.2428,
      "mean_token_accuracy": 0.03707023007882526,
      "step": 121
    },
    {
      "epoch": 0.4332482521362779,
      "grad_norm": 1.3359375,
      "learning_rate": 9.742568420661347e-06,
      "loss": 0.2385,
      "mean_token_accuracy": 0.038770213703173795,
      "step": 122
    },
    {
      "epoch": 0.4367994673177228,
      "grad_norm": 1.109375,
      "learning_rate": 9.738111551076633e-06,
      "loss": 0.2562,
      "mean_token_accuracy": 0.03519319925180753,
      "step": 123
    },
    {
      "epoch": 0.4403506824991677,
      "grad_norm": 0.98046875,
      "learning_rate": 9.733617468591806e-06,
      "loss": 0.2484,
      "mean_token_accuracy": 0.030770529268920654,
      "step": 124
    },
    {
      "epoch": 0.4439018976806126,
      "grad_norm": 1.203125,
      "learning_rate": 9.729086208503174e-06,
      "loss": 0.2516,
      "mean_token_accuracy": 0.036209895533829695,
      "step": 125
    },
    {
      "epoch": 0.44745311286205747,
      "grad_norm": 1.1953125,
      "learning_rate": 9.724517806399035e-06,
      "loss": 0.2423,
      "mean_token_accuracy": 0.03420353115507169,
      "step": 126
    },
    {
      "epoch": 0.4510043280435024,
      "grad_norm": 2.0625,
      "learning_rate": 9.7199122981594e-06,
      "loss": 0.2584,
      "mean_token_accuracy": 0.03600568573529017,
      "step": 127
    },
    {
      "epoch": 0.4545555432249473,
      "grad_norm": 1.21875,
      "learning_rate": 9.715269719955708e-06,
      "loss": 0.2667,
      "mean_token_accuracy": 0.03318146305173286,
      "step": 128
    },
    {
      "epoch": 0.4581067584063922,
      "grad_norm": 1.4296875,
      "learning_rate": 9.710590108250546e-06,
      "loss": 0.2584,
      "mean_token_accuracy": 0.03327207771144458,
      "step": 129
    },
    {
      "epoch": 0.4616579735878371,
      "grad_norm": 1.28125,
      "learning_rate": 9.705873499797358e-06,
      "loss": 0.2487,
      "mean_token_accuracy": 0.034562985882075736,
      "step": 130
    },
    {
      "epoch": 0.46520918876928197,
      "grad_norm": 1.421875,
      "learning_rate": 9.701119931640161e-06,
      "loss": 0.2529,
      "mean_token_accuracy": 0.03666910440369975,
      "step": 131
    },
    {
      "epoch": 0.4687604039507269,
      "grad_norm": 1.984375,
      "learning_rate": 9.69632944111325e-06,
      "loss": 0.2563,
      "mean_token_accuracy": 0.03438336936960695,
      "step": 132
    },
    {
      "epoch": 0.4723116191321718,
      "grad_norm": 2.875,
      "learning_rate": 9.691502065840905e-06,
      "loss": 0.2461,
      "mean_token_accuracy": 0.03849026275565848,
      "step": 133
    },
    {
      "epoch": 0.4758628343136167,
      "grad_norm": 1.4921875,
      "learning_rate": 9.686637843737104e-06,
      "loss": 0.2565,
      "mean_token_accuracy": 0.03409404997728416,
      "step": 134
    },
    {
      "epoch": 0.4794140494950616,
      "grad_norm": 1.1484375,
      "learning_rate": 9.681736813005207e-06,
      "loss": 0.2568,
      "mean_token_accuracy": 0.03154798768446199,
      "step": 135
    },
    {
      "epoch": 0.48296526467650647,
      "grad_norm": 1.15625,
      "learning_rate": 9.676799012137678e-06,
      "loss": 0.2441,
      "mean_token_accuracy": 0.03744548839313211,
      "step": 136
    },
    {
      "epoch": 0.4865164798579514,
      "grad_norm": 1.3828125,
      "learning_rate": 9.671824479915768e-06,
      "loss": 0.2354,
      "mean_token_accuracy": 0.04052274335481343,
      "step": 137
    },
    {
      "epoch": 0.4900676950393963,
      "grad_norm": 1.5390625,
      "learning_rate": 9.666813255409212e-06,
      "loss": 0.2568,
      "mean_token_accuracy": 0.0329462843874353,
      "step": 138
    },
    {
      "epoch": 0.4936189102208412,
      "grad_norm": 1.390625,
      "learning_rate": 9.661765377975924e-06,
      "loss": 0.2613,
      "mean_token_accuracy": 0.03440686877729604,
      "step": 139
    },
    {
      "epoch": 0.4971701254022861,
      "grad_norm": 1.125,
      "learning_rate": 9.656680887261693e-06,
      "loss": 0.233,
      "mean_token_accuracy": 0.03317031992082775,
      "step": 140
    },
    {
      "epoch": 0.500721340583731,
      "grad_norm": 1.1171875,
      "learning_rate": 9.651559823199865e-06,
      "loss": 0.2564,
      "mean_token_accuracy": 0.0330515707100858,
      "step": 141
    },
    {
      "epoch": 0.5042725557651759,
      "grad_norm": 1.3046875,
      "learning_rate": 9.646402226011028e-06,
      "loss": 0.256,
      "mean_token_accuracy": 0.03438013891536684,
      "step": 142
    },
    {
      "epoch": 0.5078237709466208,
      "grad_norm": 1.171875,
      "learning_rate": 9.641208136202705e-06,
      "loss": 0.2457,
      "mean_token_accuracy": 0.037212962459307164,
      "step": 143
    },
    {
      "epoch": 0.5113749861280656,
      "grad_norm": 12.75,
      "learning_rate": 9.635977594569025e-06,
      "loss": 0.2567,
      "mean_token_accuracy": 0.03464826566778356,
      "step": 144
    },
    {
      "epoch": 0.5149262013095106,
      "grad_norm": 1.0234375,
      "learning_rate": 9.630710642190412e-06,
      "loss": 0.2432,
      "mean_token_accuracy": 0.03332022005270119,
      "step": 145
    },
    {
      "epoch": 0.5184774164909555,
      "grad_norm": 1.3984375,
      "learning_rate": 9.625407320433257e-06,
      "loss": 0.2496,
      "mean_token_accuracy": 0.04005811481329147,
      "step": 146
    },
    {
      "epoch": 0.5220286316724004,
      "grad_norm": 2.125,
      "learning_rate": 9.620067670949593e-06,
      "loss": 0.2492,
      "mean_token_accuracy": 0.03430858852516394,
      "step": 147
    },
    {
      "epoch": 0.5255798468538453,
      "grad_norm": 1.75,
      "learning_rate": 9.614691735676768e-06,
      "loss": 0.2512,
      "mean_token_accuracy": 0.031662787096138345,
      "step": 148
    },
    {
      "epoch": 0.5291310620352903,
      "grad_norm": 1.0625,
      "learning_rate": 9.609279556837122e-06,
      "loss": 0.2487,
      "mean_token_accuracy": 0.03723479399923235,
      "step": 149
    },
    {
      "epoch": 0.5326822772167351,
      "grad_norm": 1.09375,
      "learning_rate": 9.603831176937645e-06,
      "loss": 0.2581,
      "mean_token_accuracy": 0.03453483120392775,
      "step": 150
    },
    {
      "epoch": 0.53623349239818,
      "grad_norm": 0.91796875,
      "learning_rate": 9.598346638769653e-06,
      "loss": 0.2648,
      "mean_token_accuracy": 0.03409193667539512,
      "step": 151
    },
    {
      "epoch": 0.5397847075796249,
      "grad_norm": 1.25,
      "learning_rate": 9.592825985408443e-06,
      "loss": 0.2482,
      "mean_token_accuracy": 0.037982173162163235,
      "step": 152
    },
    {
      "epoch": 0.5433359227610698,
      "grad_norm": 1.0,
      "learning_rate": 9.58726926021296e-06,
      "loss": 0.2446,
      "mean_token_accuracy": 0.032505186056368984,
      "step": 153
    },
    {
      "epoch": 0.5468871379425148,
      "grad_norm": 1.25,
      "learning_rate": 9.581676506825458e-06,
      "loss": 0.2392,
      "mean_token_accuracy": 0.03936906753733638,
      "step": 154
    },
    {
      "epoch": 0.5504383531239596,
      "grad_norm": 5.5,
      "learning_rate": 9.576047769171154e-06,
      "loss": 0.253,
      "mean_token_accuracy": 0.037195249842625344,
      "step": 155
    },
    {
      "epoch": 0.5539895683054045,
      "grad_norm": 1.078125,
      "learning_rate": 9.57038309145788e-06,
      "loss": 0.2476,
      "mean_token_accuracy": 0.033277792263106676,
      "step": 156
    },
    {
      "epoch": 0.5575407834868494,
      "grad_norm": 1.140625,
      "learning_rate": 9.564682518175745e-06,
      "loss": 0.244,
      "mean_token_accuracy": 0.03346487059025094,
      "step": 157
    },
    {
      "epoch": 0.5610919986682943,
      "grad_norm": 1.0546875,
      "learning_rate": 9.558946094096773e-06,
      "loss": 0.26,
      "mean_token_accuracy": 0.0327607999824977,
      "step": 158
    },
    {
      "epoch": 0.5646432138497393,
      "grad_norm": 1.2109375,
      "learning_rate": 9.553173864274567e-06,
      "loss": 0.2459,
      "mean_token_accuracy": 0.03798125943285413,
      "step": 159
    },
    {
      "epoch": 0.5681944290311841,
      "grad_norm": 1.3984375,
      "learning_rate": 9.547365874043939e-06,
      "loss": 0.2391,
      "mean_token_accuracy": 0.03454116692591924,
      "step": 160
    },
    {
      "epoch": 0.571745644212629,
      "grad_norm": 1.625,
      "learning_rate": 9.541522169020568e-06,
      "loss": 0.2422,
      "mean_token_accuracy": 0.03469104680698365,
      "step": 161
    },
    {
      "epoch": 0.5752968593940739,
      "grad_norm": 1.09375,
      "learning_rate": 9.535642795100628e-06,
      "loss": 0.2408,
      "mean_token_accuracy": 0.03519366278487723,
      "step": 162
    },
    {
      "epoch": 0.5788480745755188,
      "grad_norm": 1.0390625,
      "learning_rate": 9.529727798460443e-06,
      "loss": 0.244,
      "mean_token_accuracy": 0.033296961744781584,
      "step": 163
    },
    {
      "epoch": 0.5823992897569638,
      "grad_norm": 1.2578125,
      "learning_rate": 9.52377722555611e-06,
      "loss": 0.2477,
      "mean_token_accuracy": 0.03645497989964497,
      "step": 164
    },
    {
      "epoch": 0.5859505049384086,
      "grad_norm": 1.328125,
      "learning_rate": 9.517791123123141e-06,
      "loss": 0.2481,
      "mean_token_accuracy": 0.0361881392163923,
      "step": 165
    },
    {
      "epoch": 0.5895017201198535,
      "grad_norm": 1.3828125,
      "learning_rate": 9.5117695381761e-06,
      "loss": 0.2482,
      "mean_token_accuracy": 0.03308462850691285,
      "step": 166
    },
    {
      "epoch": 0.5930529353012984,
      "grad_norm": 1.53125,
      "learning_rate": 9.50571251800822e-06,
      "loss": 0.2495,
      "mean_token_accuracy": 0.0373438170299778,
      "step": 167
    },
    {
      "epoch": 0.5966041504827433,
      "grad_norm": 1.2890625,
      "learning_rate": 9.49962011019105e-06,
      "loss": 0.2569,
      "mean_token_accuracy": 0.030896698255673982,
      "step": 168
    },
    {
      "epoch": 0.6001553656641883,
      "grad_norm": 1.7265625,
      "learning_rate": 9.493492362574069e-06,
      "loss": 0.2317,
      "mean_token_accuracy": 0.03890984639656381,
      "step": 169
    },
    {
      "epoch": 0.6037065808456331,
      "grad_norm": 1.671875,
      "learning_rate": 9.487329323284306e-06,
      "loss": 0.2605,
      "mean_token_accuracy": 0.03156319166737376,
      "step": 170
    },
    {
      "epoch": 0.607257796027078,
      "grad_norm": 2.09375,
      "learning_rate": 9.481131040725982e-06,
      "loss": 0.2543,
      "mean_token_accuracy": 0.03919281046910328,
      "step": 171
    },
    {
      "epoch": 0.6108090112085229,
      "grad_norm": 1.078125,
      "learning_rate": 9.474897563580105e-06,
      "loss": 0.2299,
      "mean_token_accuracy": 0.04073215187599999,
      "step": 172
    },
    {
      "epoch": 0.6143602263899678,
      "grad_norm": 1.140625,
      "learning_rate": 9.468628940804109e-06,
      "loss": 0.2372,
      "mean_token_accuracy": 0.03550426434594556,
      "step": 173
    },
    {
      "epoch": 0.6179114415714128,
      "grad_norm": 1.7109375,
      "learning_rate": 9.46232522163145e-06,
      "loss": 0.2577,
      "mean_token_accuracy": 0.03367227574563003,
      "step": 174
    },
    {
      "epoch": 0.6214626567528576,
      "grad_norm": 1.5078125,
      "learning_rate": 9.45598645557124e-06,
      "loss": 0.2492,
      "mean_token_accuracy": 0.032757934073742945,
      "step": 175
    },
    {
      "epoch": 0.6250138719343025,
      "grad_norm": 1.5859375,
      "learning_rate": 9.44961269240784e-06,
      "loss": 0.2462,
      "mean_token_accuracy": 0.03420929316052934,
      "step": 176
    },
    {
      "epoch": 0.6285650871157474,
      "grad_norm": 1.1640625,
      "learning_rate": 9.443203982200479e-06,
      "loss": 0.2592,
      "mean_token_accuracy": 0.033935571223992156,
      "step": 177
    },
    {
      "epoch": 0.6321163022971923,
      "grad_norm": 1.7421875,
      "learning_rate": 9.436760375282858e-06,
      "loss": 0.2395,
      "mean_token_accuracy": 0.03958856422104873,
      "step": 178
    },
    {
      "epoch": 0.6356675174786373,
      "grad_norm": 1.2734375,
      "learning_rate": 9.430281922262758e-06,
      "loss": 0.2387,
      "mean_token_accuracy": 0.04119610415727948,
      "step": 179
    },
    {
      "epoch": 0.6392187326600821,
      "grad_norm": 1.21875,
      "learning_rate": 9.423768674021638e-06,
      "loss": 0.2539,
      "mean_token_accuracy": 0.03369366040897148,
      "step": 180
    },
    {
      "epoch": 0.642769947841527,
      "grad_norm": 1.0703125,
      "learning_rate": 9.417220681714232e-06,
      "loss": 0.2444,
      "mean_token_accuracy": 0.03464862687542336,
      "step": 181
    },
    {
      "epoch": 0.6463211630229719,
      "grad_norm": 1.890625,
      "learning_rate": 9.410637996768161e-06,
      "loss": 0.2387,
      "mean_token_accuracy": 0.035173144740838325,
      "step": 182
    },
    {
      "epoch": 0.6498723782044168,
      "grad_norm": 1.3828125,
      "learning_rate": 9.404020670883511e-06,
      "loss": 0.2466,
      "mean_token_accuracy": 0.0379961929138517,
      "step": 183
    },
    {
      "epoch": 0.6534235933858618,
      "grad_norm": 1.0390625,
      "learning_rate": 9.397368756032445e-06,
      "loss": 0.2415,
      "mean_token_accuracy": 0.03510807668862981,
      "step": 184
    },
    {
      "epoch": 0.6569748085673066,
      "grad_norm": 1.375,
      "learning_rate": 9.390682304458782e-06,
      "loss": 0.235,
      "mean_token_accuracy": 0.03696706932532834,
      "step": 185
    },
    {
      "epoch": 0.6605260237487516,
      "grad_norm": 1.6328125,
      "learning_rate": 9.38396136867759e-06,
      "loss": 0.2437,
      "mean_token_accuracy": 0.03457535789129906,
      "step": 186
    },
    {
      "epoch": 0.6640772389301964,
      "grad_norm": 1.9140625,
      "learning_rate": 9.377206001474773e-06,
      "loss": 0.2503,
      "mean_token_accuracy": 0.03181831787514966,
      "step": 187
    },
    {
      "epoch": 0.6676284541116413,
      "grad_norm": 1.3359375,
      "learning_rate": 9.370416255906663e-06,
      "loss": 0.2533,
      "mean_token_accuracy": 0.03447212269020383,
      "step": 188
    },
    {
      "epoch": 0.6711796692930863,
      "grad_norm": 1.4765625,
      "learning_rate": 9.363592185299593e-06,
      "loss": 0.2508,
      "mean_token_accuracy": 0.03599807388127374,
      "step": 189
    },
    {
      "epoch": 0.6747308844745311,
      "grad_norm": 1.1484375,
      "learning_rate": 9.356733843249487e-06,
      "loss": 0.2452,
      "mean_token_accuracy": 0.030227956107410137,
      "step": 190
    },
    {
      "epoch": 0.6782820996559761,
      "grad_norm": 1.171875,
      "learning_rate": 9.349841283621432e-06,
      "loss": 0.2523,
      "mean_token_accuracy": 0.030009737140062498,
      "step": 191
    },
    {
      "epoch": 0.6818333148374209,
      "grad_norm": 1.2734375,
      "learning_rate": 9.34291456054926e-06,
      "loss": 0.2391,
      "mean_token_accuracy": 0.036572398468706524,
      "step": 192
    },
    {
      "epoch": 0.6853845300188658,
      "grad_norm": 1.796875,
      "learning_rate": 9.33595372843512e-06,
      "loss": 0.237,
      "mean_token_accuracy": 0.034027565063297516,
      "step": 193
    },
    {
      "epoch": 0.6889357452003108,
      "grad_norm": 1.4609375,
      "learning_rate": 9.328958841949056e-06,
      "loss": 0.2549,
      "mean_token_accuracy": 0.032021424787672004,
      "step": 194
    },
    {
      "epoch": 0.6924869603817556,
      "grad_norm": 1.3203125,
      "learning_rate": 9.321929956028565e-06,
      "loss": 0.2503,
      "mean_token_accuracy": 0.037128334486624226,
      "step": 195
    },
    {
      "epoch": 0.6960381755632006,
      "grad_norm": 1.234375,
      "learning_rate": 9.31486712587818e-06,
      "loss": 0.2519,
      "mean_token_accuracy": 0.0347771145261504,
      "step": 196
    },
    {
      "epoch": 0.6995893907446454,
      "grad_norm": 1.296875,
      "learning_rate": 9.307770406969032e-06,
      "loss": 0.2512,
      "mean_token_accuracy": 0.03815152426250279,
      "step": 197
    },
    {
      "epoch": 0.7031406059260903,
      "grad_norm": 1.234375,
      "learning_rate": 9.300639855038405e-06,
      "loss": 0.2442,
      "mean_token_accuracy": 0.03452787006244762,
      "step": 198
    },
    {
      "epoch": 0.7066918211075353,
      "grad_norm": 1.1953125,
      "learning_rate": 9.293475526089316e-06,
      "loss": 0.2431,
      "mean_token_accuracy": 0.03716798722598469,
      "step": 199
    },
    {
      "epoch": 0.7102430362889801,
      "grad_norm": 1.2578125,
      "learning_rate": 9.286277476390056e-06,
      "loss": 0.2421,
      "mean_token_accuracy": 0.03371156241337303,
      "step": 200
    },
    {
      "epoch": 0.7137942514704251,
      "grad_norm": 1.1328125,
      "learning_rate": 9.279045762473764e-06,
      "loss": 0.25,
      "mean_token_accuracy": 0.0350483679867466,
      "step": 201
    },
    {
      "epoch": 0.7173454666518699,
      "grad_norm": 1.1953125,
      "learning_rate": 9.27178044113797e-06,
      "loss": 0.2437,
      "mean_token_accuracy": 0.03318321451479278,
      "step": 202
    },
    {
      "epoch": 0.7208966818333148,
      "grad_norm": 1.1875,
      "learning_rate": 9.264481569444157e-06,
      "loss": 0.2437,
      "mean_token_accuracy": 0.03552469089845545,
      "step": 203
    },
    {
      "epoch": 0.7244478970147598,
      "grad_norm": 1.046875,
      "learning_rate": 9.257149204717317e-06,
      "loss": 0.2507,
      "mean_token_accuracy": 0.03375845828668389,
      "step": 204
    },
    {
      "epoch": 0.7279991121962046,
      "grad_norm": 2.375,
      "learning_rate": 9.249783404545488e-06,
      "loss": 0.2443,
      "mean_token_accuracy": 0.03998926315762219,
      "step": 205
    },
    {
      "epoch": 0.7315503273776496,
      "grad_norm": 1.1328125,
      "learning_rate": 9.242384226779308e-06,
      "loss": 0.2457,
      "mean_token_accuracy": 0.035812200483633205,
      "step": 206
    },
    {
      "epoch": 0.7351015425590944,
      "grad_norm": 1.140625,
      "learning_rate": 9.234951729531564e-06,
      "loss": 0.2365,
      "mean_token_accuracy": 0.03771704700193368,
      "step": 207
    },
    {
      "epoch": 0.7386527577405393,
      "grad_norm": 1.234375,
      "learning_rate": 9.227485971176734e-06,
      "loss": 0.2432,
      "mean_token_accuracy": 0.04016880454582861,
      "step": 208
    },
    {
      "epoch": 0.7422039729219843,
      "grad_norm": 1.2578125,
      "learning_rate": 9.219987010350522e-06,
      "loss": 0.2356,
      "mean_token_accuracy": 0.038527078770130174,
      "step": 209
    },
    {
      "epoch": 0.7457551881034291,
      "grad_norm": 1.0703125,
      "learning_rate": 9.212454905949406e-06,
      "loss": 0.2366,
      "mean_token_accuracy": 0.03399550302310672,
      "step": 210
    },
    {
      "epoch": 0.7493064032848741,
      "grad_norm": 1.2578125,
      "learning_rate": 9.204889717130172e-06,
      "loss": 0.2525,
      "mean_token_accuracy": 0.035145990557793994,
      "step": 211
    },
    {
      "epoch": 0.7528576184663189,
      "grad_norm": 1.046875,
      "learning_rate": 9.197291503309448e-06,
      "loss": 0.2378,
      "mean_token_accuracy": 0.038536792555532884,
      "step": 212
    },
    {
      "epoch": 0.7564088336477638,
      "grad_norm": 1.3125,
      "learning_rate": 9.189660324163243e-06,
      "loss": 0.2474,
      "mean_token_accuracy": 0.037968925902532646,
      "step": 213
    },
    {
      "epoch": 0.7599600488292088,
      "grad_norm": 1.3125,
      "learning_rate": 9.181996239626468e-06,
      "loss": 0.2373,
      "mean_token_accuracy": 0.03542459754316951,
      "step": 214
    },
    {
      "epoch": 0.7635112640106536,
      "grad_norm": 1.015625,
      "learning_rate": 9.174299309892474e-06,
      "loss": 0.2371,
      "mean_token_accuracy": 0.03846056985275936,
      "step": 215
    },
    {
      "epoch": 0.7670624791920986,
      "grad_norm": 1.46875,
      "learning_rate": 9.166569595412576e-06,
      "loss": 0.2308,
      "mean_token_accuracy": 0.0358878808474401,
      "step": 216
    },
    {
      "epoch": 0.7706136943735434,
      "grad_norm": 1.390625,
      "learning_rate": 9.158807156895581e-06,
      "loss": 0.24,
      "mean_token_accuracy": 0.03474290976009797,
      "step": 217
    },
    {
      "epoch": 0.7741649095549884,
      "grad_norm": 1.109375,
      "learning_rate": 9.151012055307308e-06,
      "loss": 0.2446,
      "mean_token_accuracy": 0.03521274150625686,
      "step": 218
    },
    {
      "epoch": 0.7777161247364333,
      "grad_norm": 1.1640625,
      "learning_rate": 9.14318435187011e-06,
      "loss": 0.242,
      "mean_token_accuracy": 0.03299982330281637,
      "step": 219
    },
    {
      "epoch": 0.7812673399178781,
      "grad_norm": 2.5,
      "learning_rate": 9.135324108062391e-06,
      "loss": 0.2262,
      "mean_token_accuracy": 0.04036388936947333,
      "step": 220
    },
    {
      "epoch": 0.7848185550993231,
      "grad_norm": 1.1875,
      "learning_rate": 9.127431385618129e-06,
      "loss": 0.2368,
      "mean_token_accuracy": 0.04137472144429921,
      "step": 221
    },
    {
      "epoch": 0.7883697702807679,
      "grad_norm": 1.59375,
      "learning_rate": 9.119506246526386e-06,
      "loss": 0.2386,
      "mean_token_accuracy": 0.036027361149535864,
      "step": 222
    },
    {
      "epoch": 0.7919209854622129,
      "grad_norm": 1.546875,
      "learning_rate": 9.111548753030824e-06,
      "loss": 0.2461,
      "mean_token_accuracy": 0.034415613237797515,
      "step": 223
    },
    {
      "epoch": 0.7954722006436578,
      "grad_norm": 1.03125,
      "learning_rate": 9.103558967629211e-06,
      "loss": 0.2404,
      "mean_token_accuracy": 0.03685819863312645,
      "step": 224
    },
    {
      "epoch": 0.7990234158251026,
      "grad_norm": 1.1875,
      "learning_rate": 9.09553695307294e-06,
      "loss": 0.248,
      "mean_token_accuracy": 0.028797292045055656,
      "step": 225
    },
    {
      "epoch": 0.8025746310065476,
      "grad_norm": 1.453125,
      "learning_rate": 9.087482772366529e-06,
      "loss": 0.2471,
      "mean_token_accuracy": 0.03305003515924909,
      "step": 226
    },
    {
      "epoch": 0.8061258461879924,
      "grad_norm": 1.2421875,
      "learning_rate": 9.07939648876712e-06,
      "loss": 0.2403,
      "mean_token_accuracy": 0.03300378163476125,
      "step": 227
    },
    {
      "epoch": 0.8096770613694374,
      "grad_norm": 1.34375,
      "learning_rate": 9.071278165784001e-06,
      "loss": 0.2469,
      "mean_token_accuracy": 0.03578633150755195,
      "step": 228
    },
    {
      "epoch": 0.8132282765508823,
      "grad_norm": 1.28125,
      "learning_rate": 9.063127867178085e-06,
      "loss": 0.2369,
      "mean_token_accuracy": 0.036125565729889786,
      "step": 229
    },
    {
      "epoch": 0.8167794917323271,
      "grad_norm": 1.3828125,
      "learning_rate": 9.054945656961429e-06,
      "loss": 0.2361,
      "mean_token_accuracy": 0.03855715526515269,
      "step": 230
    },
    {
      "epoch": 0.8203307069137721,
      "grad_norm": 1.1640625,
      "learning_rate": 9.046731599396716e-06,
      "loss": 0.2513,
      "mean_token_accuracy": 0.034193019520898815,
      "step": 231
    },
    {
      "epoch": 0.8238819220952169,
      "grad_norm": 1.9921875,
      "learning_rate": 9.03848575899676e-06,
      "loss": 0.2429,
      "mean_token_accuracy": 0.04017591796582565,
      "step": 232
    },
    {
      "epoch": 0.8274331372766619,
      "grad_norm": 1.2421875,
      "learning_rate": 9.030208200523994e-06,
      "loss": 0.2382,
      "mean_token_accuracy": 0.03354914677765919,
      "step": 233
    },
    {
      "epoch": 0.8309843524581068,
      "grad_norm": 1.234375,
      "learning_rate": 9.021898988989966e-06,
      "loss": 0.2425,
      "mean_token_accuracy": 0.031292581817979226,
      "step": 234
    },
    {
      "epoch": 0.8345355676395516,
      "grad_norm": 1.1484375,
      "learning_rate": 9.013558189654819e-06,
      "loss": 0.2354,
      "mean_token_accuracy": 0.037720333613833645,
      "step": 235
    },
    {
      "epoch": 0.8380867828209966,
      "grad_norm": 1.3515625,
      "learning_rate": 9.005185868026793e-06,
      "loss": 0.2349,
      "mean_token_accuracy": 0.03620595469328691,
      "step": 236
    },
    {
      "epoch": 0.8416379980024414,
      "grad_norm": 1.1171875,
      "learning_rate": 8.996782089861699e-06,
      "loss": 0.2573,
      "mean_token_accuracy": 0.03309179725329159,
      "step": 237
    },
    {
      "epoch": 0.8451892131838864,
      "grad_norm": 1.6640625,
      "learning_rate": 8.988346921162407e-06,
      "loss": 0.2476,
      "mean_token_accuracy": 0.03174106139340438,
      "step": 238
    },
    {
      "epoch": 0.8487404283653313,
      "grad_norm": 1.4453125,
      "learning_rate": 8.979880428178323e-06,
      "loss": 0.24,
      "mean_token_accuracy": 0.03508038215659326,
      "step": 239
    },
    {
      "epoch": 0.8522916435467761,
      "grad_norm": 1.4296875,
      "learning_rate": 8.971382677404878e-06,
      "loss": 0.2536,
      "mean_token_accuracy": 0.03149323346224264,
      "step": 240
    },
    {
      "epoch": 0.8558428587282211,
      "grad_norm": 1.4296875,
      "learning_rate": 8.962853735582996e-06,
      "loss": 0.2386,
      "mean_token_accuracy": 0.037929220459773205,
      "step": 241
    },
    {
      "epoch": 0.8593940739096659,
      "grad_norm": 0.8828125,
      "learning_rate": 8.95429366969858e-06,
      "loss": 0.2478,
      "mean_token_accuracy": 0.03517991015723965,
      "step": 242
    },
    {
      "epoch": 0.8629452890911109,
      "grad_norm": 1.1015625,
      "learning_rate": 8.94570254698197e-06,
      "loss": 0.2465,
      "mean_token_accuracy": 0.040985416919284035,
      "step": 243
    },
    {
      "epoch": 0.8664965042725558,
      "grad_norm": 1.7578125,
      "learning_rate": 8.93708043490743e-06,
      "loss": 0.249,
      "mean_token_accuracy": 0.037392959638964385,
      "step": 244
    },
    {
      "epoch": 0.8700477194540006,
      "grad_norm": 1.2421875,
      "learning_rate": 8.928427401192618e-06,
      "loss": 0.2442,
      "mean_token_accuracy": 0.03466749745348352,
      "step": 245
    },
    {
      "epoch": 0.8735989346354456,
      "grad_norm": 1.3828125,
      "learning_rate": 8.919743513798044e-06,
      "loss": 0.2417,
      "mean_token_accuracy": 0.03530099252020591,
      "step": 246
    },
    {
      "epoch": 0.8771501498168904,
      "grad_norm": 2.375,
      "learning_rate": 8.911028840926537e-06,
      "loss": 0.2465,
      "mean_token_accuracy": 0.03762459074278013,
      "step": 247
    },
    {
      "epoch": 0.8807013649983354,
      "grad_norm": 1.7265625,
      "learning_rate": 8.902283451022725e-06,
      "loss": 0.2322,
      "mean_token_accuracy": 0.03646585380920442,
      "step": 248
    },
    {
      "epoch": 0.8842525801797803,
      "grad_norm": 1.578125,
      "learning_rate": 8.89350741277247e-06,
      "loss": 0.2554,
      "mean_token_accuracy": 0.03502001665037824,
      "step": 249
    },
    {
      "epoch": 0.8878037953612252,
      "grad_norm": 1.265625,
      "learning_rate": 8.884700795102365e-06,
      "loss": 0.2463,
      "mean_token_accuracy": 0.03513136104447767,
      "step": 250
    },
    {
      "epoch": 0.8913550105426701,
      "grad_norm": 1.3515625,
      "learning_rate": 8.875863667179155e-06,
      "loss": 0.2436,
      "mean_token_accuracy": 0.0331545490953431,
      "step": 251
    },
    {
      "epoch": 0.8949062257241149,
      "grad_norm": 1.53125,
      "learning_rate": 8.866996098409217e-06,
      "loss": 0.2436,
      "mean_token_accuracy": 0.03208865020133089,
      "step": 252
    },
    {
      "epoch": 0.8984574409055599,
      "grad_norm": 1.5625,
      "learning_rate": 8.858098158438013e-06,
      "loss": 0.2451,
      "mean_token_accuracy": 0.03713261121083633,
      "step": 253
    },
    {
      "epoch": 0.9020086560870048,
      "grad_norm": 1.109375,
      "learning_rate": 8.849169917149532e-06,
      "loss": 0.2367,
      "mean_token_accuracy": 0.03933425937066204,
      "step": 254
    },
    {
      "epoch": 0.9055598712684497,
      "grad_norm": 1.1875,
      "learning_rate": 8.840211444665754e-06,
      "loss": 0.2475,
      "mean_token_accuracy": 0.04025235302106012,
      "step": 255
    },
    {
      "epoch": 0.9091110864498946,
      "grad_norm": 1.5546875,
      "learning_rate": 8.831222811346088e-06,
      "loss": 0.2392,
      "mean_token_accuracy": 0.03604988591541769,
      "step": 256
    },
    {
      "epoch": 0.9126623016313394,
      "grad_norm": 1.40625,
      "learning_rate": 8.822204087786831e-06,
      "loss": 0.2451,
      "mean_token_accuracy": 0.03125222894232138,
      "step": 257
    },
    {
      "epoch": 0.9162135168127844,
      "grad_norm": 1.53125,
      "learning_rate": 8.813155344820602e-06,
      "loss": 0.2326,
      "mean_token_accuracy": 0.0366273350919073,
      "step": 258
    },
    {
      "epoch": 0.9197647319942293,
      "grad_norm": 1.1875,
      "learning_rate": 8.804076653515792e-06,
      "loss": 0.2392,
      "mean_token_accuracy": 0.03683656148496084,
      "step": 259
    },
    {
      "epoch": 0.9233159471756742,
      "grad_norm": 1.0078125,
      "learning_rate": 8.794968085176006e-06,
      "loss": 0.2438,
      "mean_token_accuracy": 0.03510568725687335,
      "step": 260
    },
    {
      "epoch": 0.9268671623571191,
      "grad_norm": 1.3828125,
      "learning_rate": 8.785829711339502e-06,
      "loss": 0.238,
      "mean_token_accuracy": 0.03469807377041434,
      "step": 261
    },
    {
      "epoch": 0.9304183775385639,
      "grad_norm": 1.1640625,
      "learning_rate": 8.776661603778629e-06,
      "loss": 0.2419,
      "mean_token_accuracy": 0.03890717753165518,
      "step": 262
    },
    {
      "epoch": 0.9339695927200089,
      "grad_norm": 1.1875,
      "learning_rate": 8.767463834499261e-06,
      "loss": 0.2481,
      "mean_token_accuracy": 0.032666244380379794,
      "step": 263
    },
    {
      "epoch": 0.9375208079014538,
      "grad_norm": 1.296875,
      "learning_rate": 8.758236475740236e-06,
      "loss": 0.2329,
      "mean_token_accuracy": 0.037105117426108336,
      "step": 264
    },
    {
      "epoch": 0.9410720230828987,
      "grad_norm": 2.0625,
      "learning_rate": 8.748979599972787e-06,
      "loss": 0.2443,
      "mean_token_accuracy": 0.03263842041269527,
      "step": 265
    },
    {
      "epoch": 0.9446232382643436,
      "grad_norm": 1.4375,
      "learning_rate": 8.739693279899969e-06,
      "loss": 0.2349,
      "mean_token_accuracy": 0.034980818134499714,
      "step": 266
    },
    {
      "epoch": 0.9481744534457884,
      "grad_norm": 1.1328125,
      "learning_rate": 8.730377588456092e-06,
      "loss": 0.2357,
      "mean_token_accuracy": 0.036848886411462445,
      "step": 267
    },
    {
      "epoch": 0.9517256686272334,
      "grad_norm": 1.2734375,
      "learning_rate": 8.72103259880615e-06,
      "loss": 0.2417,
      "mean_token_accuracy": 0.037047853220428806,
      "step": 268
    },
    {
      "epoch": 0.9552768838086783,
      "grad_norm": 1.8203125,
      "learning_rate": 8.711658384345244e-06,
      "loss": 0.2464,
      "mean_token_accuracy": 0.03403061416611308,
      "step": 269
    },
    {
      "epoch": 0.9588280989901232,
      "grad_norm": 1.5078125,
      "learning_rate": 8.702255018698e-06,
      "loss": 0.2276,
      "mean_token_accuracy": 0.035586197598604485,
      "step": 270
    },
    {
      "epoch": 0.9623793141715681,
      "grad_norm": 1.1015625,
      "learning_rate": 8.692822575718e-06,
      "loss": 0.2336,
      "mean_token_accuracy": 0.03349671917021624,
      "step": 271
    },
    {
      "epoch": 0.9659305293530129,
      "grad_norm": 1.3046875,
      "learning_rate": 8.683361129487198e-06,
      "loss": 0.236,
      "mean_token_accuracy": 0.0383108137830277,
      "step": 272
    },
    {
      "epoch": 0.9694817445344579,
      "grad_norm": 1.1484375,
      "learning_rate": 8.673870754315336e-06,
      "loss": 0.2401,
      "mean_token_accuracy": 0.04072669624656555,
      "step": 273
    },
    {
      "epoch": 0.9730329597159028,
      "grad_norm": 1.3515625,
      "learning_rate": 8.664351524739368e-06,
      "loss": 0.2529,
      "mean_token_accuracy": 0.036624281186959706,
      "step": 274
    },
    {
      "epoch": 0.9765841748973477,
      "grad_norm": 1.0390625,
      "learning_rate": 8.65480351552286e-06,
      "loss": 0.2461,
      "mean_token_accuracy": 0.03385563577285211,
      "step": 275
    },
    {
      "epoch": 0.9801353900787926,
      "grad_norm": 2.078125,
      "learning_rate": 8.645226801655418e-06,
      "loss": 0.2356,
      "mean_token_accuracy": 0.03583552375130239,
      "step": 276
    },
    {
      "epoch": 0.9836866052602374,
      "grad_norm": 2.015625,
      "learning_rate": 8.635621458352094e-06,
      "loss": 0.2505,
      "mean_token_accuracy": 0.03417470180647797,
      "step": 277
    },
    {
      "epoch": 0.9872378204416824,
      "grad_norm": 1.171875,
      "learning_rate": 8.625987561052789e-06,
      "loss": 0.2445,
      "mean_token_accuracy": 0.034865413577790605,
      "step": 278
    },
    {
      "epoch": 0.9907890356231273,
      "grad_norm": 1.4609375,
      "learning_rate": 8.616325185421673e-06,
      "loss": 0.2543,
      "mean_token_accuracy": 0.03070924551502685,
      "step": 279
    },
    {
      "epoch": 0.9943402508045722,
      "grad_norm": 1.28125,
      "learning_rate": 8.606634407346575e-06,
      "loss": 0.2412,
      "mean_token_accuracy": 0.03835893170617055,
      "step": 280
    },
    {
      "epoch": 0.9978914659860171,
      "grad_norm": 1.421875,
      "learning_rate": 8.596915302938403e-06,
      "loss": 0.2416,
      "mean_token_accuracy": 0.03124076440144563,
      "step": 281
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.78125,
      "learning_rate": 8.587167948530533e-06,
      "loss": 0.1443,
      "mean_token_accuracy": 0.03148232989540128,
      "step": 282
    },
    {
      "epoch": 1.003551215181445,
      "grad_norm": 1.390625,
      "learning_rate": 8.577392420678217e-06,
      "loss": 0.2333,
      "mean_token_accuracy": 0.039055653414834524,
      "step": 283
    },
    {
      "epoch": 1.0071024303628897,
      "grad_norm": 1.1875,
      "learning_rate": 8.567588796157983e-06,
      "loss": 0.248,
      "mean_token_accuracy": 0.03278845629756688,
      "step": 284
    },
    {
      "epoch": 1.0106536455443347,
      "grad_norm": 1.78125,
      "learning_rate": 8.557757151967025e-06,
      "loss": 0.2505,
      "mean_token_accuracy": 0.03427708127492224,
      "step": 285
    },
    {
      "epoch": 1.0142048607257796,
      "grad_norm": 0.96484375,
      "learning_rate": 8.547897565322601e-06,
      "loss": 0.2414,
      "mean_token_accuracy": 0.03930300644788076,
      "step": 286
    },
    {
      "epoch": 1.0177560759072246,
      "grad_norm": 1.1484375,
      "learning_rate": 8.538010113661434e-06,
      "loss": 0.2325,
      "mean_token_accuracy": 0.03667710912122857,
      "step": 287
    },
    {
      "epoch": 1.0213072910886694,
      "grad_norm": 1.546875,
      "learning_rate": 8.528094874639092e-06,
      "loss": 0.2494,
      "mean_token_accuracy": 0.03501396441060933,
      "step": 288
    },
    {
      "epoch": 1.0248585062701143,
      "grad_norm": 1.203125,
      "learning_rate": 8.518151926129384e-06,
      "loss": 0.2265,
      "mean_token_accuracy": 0.04366327696516237,
      "step": 289
    },
    {
      "epoch": 1.0284097214515593,
      "grad_norm": 1.046875,
      "learning_rate": 8.508181346223749e-06,
      "loss": 0.247,
      "mean_token_accuracy": 0.03554195544529648,
      "step": 290
    },
    {
      "epoch": 1.031960936633004,
      "grad_norm": 1.1796875,
      "learning_rate": 8.498183213230646e-06,
      "loss": 0.2376,
      "mean_token_accuracy": 0.03931035470304778,
      "step": 291
    },
    {
      "epoch": 1.035512151814449,
      "grad_norm": 1.171875,
      "learning_rate": 8.488157605674924e-06,
      "loss": 0.2407,
      "mean_token_accuracy": 0.03499374365674157,
      "step": 292
    },
    {
      "epoch": 1.039063366995894,
      "grad_norm": 1.1953125,
      "learning_rate": 8.478104602297226e-06,
      "loss": 0.2348,
      "mean_token_accuracy": 0.03476892200342263,
      "step": 293
    },
    {
      "epoch": 1.0426145821773387,
      "grad_norm": 1.1484375,
      "learning_rate": 8.468024282053357e-06,
      "loss": 0.2398,
      "mean_token_accuracy": 0.035643573090055725,
      "step": 294
    },
    {
      "epoch": 1.0461657973587837,
      "grad_norm": 1.4765625,
      "learning_rate": 8.457916724113667e-06,
      "loss": 0.2437,
      "mean_token_accuracy": 0.03672185743562295,
      "step": 295
    },
    {
      "epoch": 1.0497170125402286,
      "grad_norm": 1.2265625,
      "learning_rate": 8.447782007862427e-06,
      "loss": 0.2341,
      "mean_token_accuracy": 0.039112065329391044,
      "step": 296
    },
    {
      "epoch": 1.0532682277216736,
      "grad_norm": 1.28125,
      "learning_rate": 8.437620212897213e-06,
      "loss": 0.2379,
      "mean_token_accuracy": 0.03804202094033826,
      "step": 297
    },
    {
      "epoch": 1.0568194429031184,
      "grad_norm": 1.3671875,
      "learning_rate": 8.427431419028273e-06,
      "loss": 0.2438,
      "mean_token_accuracy": 0.03412439540261403,
      "step": 298
    },
    {
      "epoch": 1.0603706580845633,
      "grad_norm": 1.1875,
      "learning_rate": 8.417215706277905e-06,
      "loss": 0.2335,
      "mean_token_accuracy": 0.03501426196999091,
      "step": 299
    },
    {
      "epoch": 1.0639218732660083,
      "grad_norm": 1.578125,
      "learning_rate": 8.406973154879826e-06,
      "loss": 0.2435,
| "mean_token_accuracy": 0.03277427892317064, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.067473088447453, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.396703845278537e-06, |
| "loss": 0.2401, |
| "mean_token_accuracy": 0.035917978868383216, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.071024303628898, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.386407858128707e-06, |
| "loss": 0.2385, |
| "mean_token_accuracy": 0.03660639774170704, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.074575518810343, |
| "grad_norm": 1.015625, |
| "learning_rate": 8.376085274294518e-06, |
| "loss": 0.2433, |
| "mean_token_accuracy": 0.033684822363284184, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.0781267339917877, |
| "grad_norm": 1.4765625, |
| "learning_rate": 8.365736174849053e-06, |
| "loss": 0.2377, |
| "mean_token_accuracy": 0.034216828673379496, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.0816779491732327, |
| "grad_norm": 1.2578125, |
| "learning_rate": 8.355360641073637e-06, |
| "loss": 0.2432, |
| "mean_token_accuracy": 0.03404876044623961, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.0852291643546776, |
| "grad_norm": 1.6171875, |
| "learning_rate": 8.344958754457214e-06, |
| "loss": 0.2401, |
| "mean_token_accuracy": 0.03517156572706881, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.0887803795361226, |
| "grad_norm": 1.375, |
| "learning_rate": 8.3345305966957e-06, |
| "loss": 0.2451, |
| "mean_token_accuracy": 0.033717118327331264, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.0923315947175674, |
| "grad_norm": 1.5078125, |
| "learning_rate": 8.324076249691347e-06, |
| "loss": 0.2415, |
| "mean_token_accuracy": 0.03640562181681162, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.0958828098990123, |
| "grad_norm": 3.5, |
| "learning_rate": 8.31359579555209e-06, |
| "loss": 0.232, |
| "mean_token_accuracy": 0.039631859472137876, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.0994340250804573, |
| "grad_norm": 1.5625, |
| "learning_rate": 8.30308931659091e-06, |
| "loss": 0.2337, |
| "mean_token_accuracy": 0.03287116249339306, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.102985240261902, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.292556895325195e-06, |
| "loss": 0.246, |
| "mean_token_accuracy": 0.037367168624768965, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.106536455443347, |
| "grad_norm": 1.2265625, |
| "learning_rate": 8.281998614476066e-06, |
| "loss": 0.2463, |
| "mean_token_accuracy": 0.03246536147344159, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.110087670624792, |
| "grad_norm": 1.453125, |
| "learning_rate": 8.271414556967758e-06, |
| "loss": 0.2441, |
| "mean_token_accuracy": 0.033534199588757474, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.1136388858062367, |
| "grad_norm": 1.3125, |
| "learning_rate": 8.260804805926948e-06, |
| "loss": 0.2406, |
| "mean_token_accuracy": 0.034469886544684414, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.1171901009876817, |
| "grad_norm": 1.125, |
| "learning_rate": 8.250169444682109e-06, |
| "loss": 0.2364, |
| "mean_token_accuracy": 0.03434639961778885, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.1207413161691266, |
| "grad_norm": 3.078125, |
| "learning_rate": 8.239508556762857e-06, |
| "loss": 0.244, |
| "mean_token_accuracy": 0.035720634419703856, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.1242925313505716, |
| "grad_norm": 1.2265625, |
| "learning_rate": 8.228822225899294e-06, |
| "loss": 0.2398, |
| "mean_token_accuracy": 0.03444605955883162, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.1278437465320164, |
| "grad_norm": 1.15625, |
| "learning_rate": 8.218110536021347e-06, |
| "loss": 0.2409, |
| "mean_token_accuracy": 0.04050879927308415, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.1313949617134613, |
| "grad_norm": 1.5859375, |
| "learning_rate": 8.207373571258113e-06, |
| "loss": 0.2353, |
| "mean_token_accuracy": 0.03475078833071166, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.1349461768949063, |
| "grad_norm": 6.1875, |
| "learning_rate": 8.196611415937196e-06, |
| "loss": 0.2444, |
| "mean_token_accuracy": 0.032825655180204194, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.138497392076351, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.18582415458405e-06, |
| "loss": 0.2377, |
| "mean_token_accuracy": 0.036314319742814405, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.142048607257796, |
| "grad_norm": 1.5703125, |
| "learning_rate": 8.1750118719213e-06, |
| "loss": 0.2484, |
| "mean_token_accuracy": 0.03831162358619622, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.145599822439241, |
| "grad_norm": 1.109375, |
| "learning_rate": 8.164174652868097e-06, |
| "loss": 0.2499, |
| "mean_token_accuracy": 0.033986524165811716, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.149151037620686, |
| "grad_norm": 1.109375, |
| "learning_rate": 8.153312582539438e-06, |
| "loss": 0.246, |
| "mean_token_accuracy": 0.03575117406217032, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.1527022528021307, |
| "grad_norm": 1.6015625, |
| "learning_rate": 8.142425746245503e-06, |
| "loss": 0.2345, |
| "mean_token_accuracy": 0.037550230876149726, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.1562534679835756, |
| "grad_norm": 1.015625, |
| "learning_rate": 8.131514229490975e-06, |
| "loss": 0.2277, |
| "mean_token_accuracy": 0.03635891172598349, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.1598046831650206, |
| "grad_norm": 1.375, |
| "learning_rate": 8.120578117974388e-06, |
| "loss": 0.2428, |
| "mean_token_accuracy": 0.0431224472959002, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.1633558983464654, |
| "grad_norm": 1.4296875, |
| "learning_rate": 8.109617497587429e-06, |
| "loss": 0.2272, |
| "mean_token_accuracy": 0.036638571738876635, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.1669071135279103, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.098632454414286e-06, |
| "loss": 0.2368, |
| "mean_token_accuracy": 0.03733357025339501, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.1704583287093553, |
| "grad_norm": 1.359375, |
| "learning_rate": 8.08762307473096e-06, |
| "loss": 0.2413, |
| "mean_token_accuracy": 0.03179363884191844, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1740095438908003, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.07658944500459e-06, |
| "loss": 0.2333, |
| "mean_token_accuracy": 0.035859118954249425, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.177560759072245, |
| "grad_norm": 1.2734375, |
| "learning_rate": 8.065531651892771e-06, |
| "loss": 0.2468, |
| "mean_token_accuracy": 0.036549534866935574, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.18111197425369, |
| "grad_norm": 1.5625, |
| "learning_rate": 8.054449782242876e-06, |
| "loss": 0.2491, |
| "mean_token_accuracy": 0.029785508802888216, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.1846631894351347, |
| "grad_norm": 1.484375, |
| "learning_rate": 8.043343923091382e-06, |
| "loss": 0.2329, |
| "mean_token_accuracy": 0.037331359108065953, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.1882144046165797, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.03221416166317e-06, |
| "loss": 0.2433, |
| "mean_token_accuracy": 0.03565247428196017, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.1917656197980246, |
| "grad_norm": 1.96875, |
| "learning_rate": 8.021060585370845e-06, |
| "loss": 0.2376, |
| "mean_token_accuracy": 0.0353173619696463, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.1953168349794696, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.009883281814066e-06, |
| "loss": 0.2388, |
| "mean_token_accuracy": 0.03581908489650232, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.1988680501609144, |
| "grad_norm": 1.25, |
| "learning_rate": 7.998682338778834e-06, |
| "loss": 0.2296, |
| "mean_token_accuracy": 0.041877999477947014, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.2024192653423593, |
| "grad_norm": 1.2265625, |
| "learning_rate": 7.987457844236817e-06, |
| "loss": 0.2356, |
| "mean_token_accuracy": 0.0347092972297105, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.2059704805238043, |
| "grad_norm": 1.2734375, |
| "learning_rate": 7.976209886344654e-06, |
| "loss": 0.2387, |
| "mean_token_accuracy": 0.03283250893218792, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.209521695705249, |
| "grad_norm": 1.40625, |
| "learning_rate": 7.964938553443267e-06, |
| "loss": 0.2292, |
| "mean_token_accuracy": 0.03533021227485733, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.213072910886694, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.953643934057162e-06, |
| "loss": 0.2424, |
| "mean_token_accuracy": 0.03509292240414652, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.216624126068139, |
| "grad_norm": 1.015625, |
| "learning_rate": 7.942326116893733e-06, |
| "loss": 0.2383, |
| "mean_token_accuracy": 0.03585474247302045, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.220175341249584, |
| "grad_norm": 1.234375, |
| "learning_rate": 7.930985190842576e-06, |
| "loss": 0.2338, |
| "mean_token_accuracy": 0.04019446158054052, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.2237265564310287, |
| "grad_norm": 1.390625, |
| "learning_rate": 7.919621244974773e-06, |
| "loss": 0.2458, |
| "mean_token_accuracy": 0.03536258387248381, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.2272777716124736, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.908234368542214e-06, |
| "loss": 0.2292, |
| "mean_token_accuracy": 0.03392878967861179, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.2308289867939186, |
| "grad_norm": 1.671875, |
| "learning_rate": 7.896824650976873e-06, |
| "loss": 0.2201, |
| "mean_token_accuracy": 0.037319375238439534, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.2343802019753634, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.885392181890126e-06, |
| "loss": 0.2346, |
| "mean_token_accuracy": 0.04020543451770209, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.2379314171568083, |
| "grad_norm": 1.4921875, |
| "learning_rate": 7.873937051072037e-06, |
| "loss": 0.2415, |
| "mean_token_accuracy": 0.03590130691372906, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.2414826323382533, |
| "grad_norm": 1.3828125, |
| "learning_rate": 7.862459348490645e-06, |
| "loss": 0.2419, |
| "mean_token_accuracy": 0.038676922078593634, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.2450338475196983, |
| "grad_norm": 1.875, |
| "learning_rate": 7.85095916429128e-06, |
| "loss": 0.2302, |
| "mean_token_accuracy": 0.03200000841388828, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.248585062701143, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.839436588795834e-06, |
| "loss": 0.2371, |
| "mean_token_accuracy": 0.03455797864080523, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.252136277882588, |
| "grad_norm": 1.375, |
| "learning_rate": 7.82789171250206e-06, |
| "loss": 0.2276, |
| "mean_token_accuracy": 0.03702645877638133, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.2556874930640327, |
| "grad_norm": 1.453125, |
| "learning_rate": 7.816324626082864e-06, |
| "loss": 0.245, |
| "mean_token_accuracy": 0.035418079471128294, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.2592387082454777, |
| "grad_norm": 1.2109375, |
| "learning_rate": 7.804735420385578e-06, |
| "loss": 0.2496, |
| "mean_token_accuracy": 0.04052014215631061, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.2627899234269226, |
| "grad_norm": 1.328125, |
| "learning_rate": 7.793124186431271e-06, |
| "loss": 0.2253, |
| "mean_token_accuracy": 0.03850708331447095, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.2663411386083676, |
| "grad_norm": 1.125, |
| "learning_rate": 7.781491015414018e-06, |
| "loss": 0.2423, |
| "mean_token_accuracy": 0.03744576991448412, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.2698923537898126, |
| "grad_norm": 1.078125, |
| "learning_rate": 7.769835998700182e-06, |
| "loss": 0.2287, |
| "mean_token_accuracy": 0.04161575323450961, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.2734435689712573, |
| "grad_norm": 1.3046875, |
| "learning_rate": 7.758159227827701e-06, |
| "loss": 0.2354, |
| "mean_token_accuracy": 0.03756350834009936, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.2769947841527023, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.746460794505375e-06, |
| "loss": 0.2447, |
| "mean_token_accuracy": 0.032854993114597164, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.280545999334147, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.734740790612137e-06, |
| "loss": 0.2426, |
| "mean_token_accuracy": 0.037274191450705985, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.284097214515592, |
| "grad_norm": 1.2734375, |
| "learning_rate": 7.722999308196329e-06, |
| "loss": 0.2315, |
| "mean_token_accuracy": 0.03713756873185048, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.287648429697037, |
| "grad_norm": 1.296875, |
| "learning_rate": 7.711236439474991e-06, |
| "loss": 0.2365, |
| "mean_token_accuracy": 0.036791833699680865, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.291199644878482, |
| "grad_norm": 1.2421875, |
| "learning_rate": 7.69945227683313e-06, |
| "loss": 0.2352, |
| "mean_token_accuracy": 0.0381344523448206, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.2947508600599267, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.68764691282299e-06, |
| "loss": 0.2415, |
| "mean_token_accuracy": 0.04074793899053475, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.2983020752413716, |
| "grad_norm": 1.296875, |
| "learning_rate": 7.675820440163334e-06, |
| "loss": 0.2369, |
| "mean_token_accuracy": 0.03905295207732706, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.3018532904228166, |
| "grad_norm": 1.078125, |
| "learning_rate": 7.663972951738708e-06, |
| "loss": 0.2301, |
| "mean_token_accuracy": 0.035785467212917865, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.3054045056042614, |
| "grad_norm": 1.0546875, |
| "learning_rate": 7.652104540598712e-06, |
| "loss": 0.232, |
| "mean_token_accuracy": 0.038330111019604374, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.3089557207857063, |
| "grad_norm": 1.0234375, |
| "learning_rate": 7.640215299957283e-06, |
| "loss": 0.2441, |
| "mean_token_accuracy": 0.029918354761321098, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.3125069359671513, |
| "grad_norm": 1.390625, |
| "learning_rate": 7.628305323191942e-06, |
| "loss": 0.2349, |
| "mean_token_accuracy": 0.036909411177475704, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3160581511485963, |
| "grad_norm": 1.5390625, |
| "learning_rate": 7.616374703843071e-06, |
| "loss": 0.2493, |
| "mean_token_accuracy": 0.033028718306013616, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.319609366330041, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.604423535613183e-06, |
| "loss": 0.2347, |
| "mean_token_accuracy": 0.034142533426347654, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.323160581511486, |
| "grad_norm": 1.125, |
| "learning_rate": 7.592451912366176e-06, |
| "loss": 0.2303, |
| "mean_token_accuracy": 0.03645164545378066, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.326711796692931, |
| "grad_norm": 1.359375, |
| "learning_rate": 7.580459928126607e-06, |
| "loss": 0.2433, |
| "mean_token_accuracy": 0.031131474817811977, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.3302630118743757, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.568447677078937e-06, |
| "loss": 0.2337, |
| "mean_token_accuracy": 0.033356625943270046, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.3338142270558206, |
| "grad_norm": 1.3359375, |
| "learning_rate": 7.556415253566814e-06, |
| "loss": 0.2391, |
| "mean_token_accuracy": 0.03734725382673787, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.3373654422372656, |
| "grad_norm": 1.4921875, |
| "learning_rate": 7.544362752092309e-06, |
| "loss": 0.2344, |
| "mean_token_accuracy": 0.04189994388252671, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.3409166574187106, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.532290267315189e-06, |
| "loss": 0.2373, |
| "mean_token_accuracy": 0.034419633655488724, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.3444678726001553, |
| "grad_norm": 1.28125, |
| "learning_rate": 7.52019789405217e-06, |
| "loss": 0.2396, |
| "mean_token_accuracy": 0.03570301646141161, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.3480190877816003, |
| "grad_norm": 1.609375, |
| "learning_rate": 7.508085727276169e-06, |
| "loss": 0.2516, |
| "mean_token_accuracy": 0.03470832618040731, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.351570302963045, |
| "grad_norm": 1.609375, |
| "learning_rate": 7.495953862115561e-06, |
| "loss": 0.2352, |
| "mean_token_accuracy": 0.030847028268908616, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.35512151814449, |
| "grad_norm": 1.234375, |
| "learning_rate": 7.483802393853431e-06, |
| "loss": 0.2212, |
| "mean_token_accuracy": 0.03855792362446664, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.358672733325935, |
| "grad_norm": 1.4375, |
| "learning_rate": 7.471631417926826e-06, |
| "loss": 0.2462, |
| "mean_token_accuracy": 0.03374627606171998, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.36222394850738, |
| "grad_norm": 1.2578125, |
| "learning_rate": 7.459441029926006e-06, |
| "loss": 0.2379, |
| "mean_token_accuracy": 0.039975615829462186, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.365775163688825, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.447231325593689e-06, |
| "loss": 0.2409, |
| "mean_token_accuracy": 0.03599957966071088, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.3693263788702696, |
| "grad_norm": 1.0546875, |
| "learning_rate": 7.435002400824309e-06, |
| "loss": 0.2424, |
| "mean_token_accuracy": 0.04002546479568991, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.3728775940517146, |
| "grad_norm": 1.40625, |
| "learning_rate": 7.422754351663252e-06, |
| "loss": 0.2276, |
| "mean_token_accuracy": 0.03706626188068185, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.3764288092331594, |
| "grad_norm": 1.2890625, |
| "learning_rate": 7.410487274306104e-06, |
| "loss": 0.2335, |
| "mean_token_accuracy": 0.03697552310222818, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.3799800244146043, |
| "grad_norm": 1.3125, |
| "learning_rate": 7.398201265097902e-06, |
| "loss": 0.2342, |
| "mean_token_accuracy": 0.038530107736733044, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.3835312395960493, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.385896420532372e-06, |
| "loss": 0.2302, |
| "mean_token_accuracy": 0.03841527171971393, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.3870824547774943, |
| "grad_norm": 1.2421875, |
| "learning_rate": 7.37357283725117e-06, |
| "loss": 0.2331, |
| "mean_token_accuracy": 0.0332794542555348, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.390633669958939, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.361230612043125e-06, |
| "loss": 0.2351, |
| "mean_token_accuracy": 0.03882583613449242, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.394184885140384, |
| "grad_norm": 1.625, |
| "learning_rate": 7.3488698418434824e-06, |
| "loss": 0.236, |
| "mean_token_accuracy": 0.034570411104141385, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.397736100321829, |
| "grad_norm": 1.4375, |
| "learning_rate": 7.3364906237331345e-06, |
| "loss": 0.2342, |
| "mean_token_accuracy": 0.03536622403044021, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.4012873155032737, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.324093054937864e-06, |
| "loss": 0.2403, |
| "mean_token_accuracy": 0.03640507650561631, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.4048385306847186, |
| "grad_norm": 1.375, |
| "learning_rate": 7.311677232827583e-06, |
| "loss": 0.2459, |
| "mean_token_accuracy": 0.034755626278638374, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.4083897458661636, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.299243254915558e-06, |
| "loss": 0.2456, |
| "mean_token_accuracy": 0.03809268196710036, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.4119409610476086, |
| "grad_norm": 1.1328125, |
| "learning_rate": 7.286791218857654e-06, |
| "loss": 0.2353, |
| "mean_token_accuracy": 0.03845041626118473, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.4154921762290533, |
| "grad_norm": 2.625, |
| "learning_rate": 7.274321222451561e-06, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.0348738961838535, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.4190433914104983, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.261833363636036e-06, |
| "loss": 0.2518, |
| "mean_token_accuracy": 0.03649535452859709, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4225946065919433, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.249327740490114e-06, |
| "loss": 0.2357, |
| "mean_token_accuracy": 0.03557684525367222, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.426145821773388, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.236804451232364e-06, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.037769879712868715, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.429697036954833, |
| "grad_norm": 1.4453125, |
| "learning_rate": 7.224263594220093e-06, |
| "loss": 0.2443, |
| "mean_token_accuracy": 0.035131411799739, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.433248252136278, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.211705267948592e-06, |
| "loss": 0.2288, |
| "mean_token_accuracy": 0.038788905261753825, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.436799467317723, |
| "grad_norm": 1.015625, |
| "learning_rate": 7.199129571050345e-06, |
| "loss": 0.2534, |
| "mean_token_accuracy": 0.03436974439318874, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.4403506824991676, |
| "grad_norm": 0.98828125, |
| "learning_rate": 7.186536602294278e-06, |
| "loss": 0.22, |
| "mean_token_accuracy": 0.03988837570796022, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.4439018976806126, |
| "grad_norm": 1.3984375, |
| "learning_rate": 7.173926460584956e-06, |
| "loss": 0.24, |
| "mean_token_accuracy": 0.0312625703154481, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.4474531128620574, |
| "grad_norm": 1.53125, |
| "learning_rate": 7.161299244961828e-06, |
| "loss": 0.2339, |
| "mean_token_accuracy": 0.04129007174924482, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.4510043280435023, |
| "grad_norm": 1.3203125, |
| "learning_rate": 7.148655054598436e-06, |
| "loss": 0.2311, |
| "mean_token_accuracy": 0.036289898944232846, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.4545555432249473, |
| "grad_norm": 1.421875, |
| "learning_rate": 7.135993988801644e-06, |
| "loss": 0.2335, |
| "mean_token_accuracy": 0.034655624454899225, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.4581067584063923, |
| "grad_norm": 1.125, |
| "learning_rate": 7.1233161470108525e-06, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.037607920974551234, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.4616579735878372, |
| "grad_norm": 1.3125, |
| "learning_rate": 7.110621628797222e-06, |
| "loss": 0.2433, |
| "mean_token_accuracy": 0.03281604757466994, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.465209188769282, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.097910533862886e-06, |
| "loss": 0.2352, |
| "mean_token_accuracy": 0.034316202265472384, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.468760403950727, |
| "grad_norm": 1.1875, |
| "learning_rate": 7.085182962040173e-06, |
| "loss": 0.249, |
| "mean_token_accuracy": 0.032907980152231175, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.4723116191321717, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.072439013290824e-06, |
| "loss": 0.238, |
| "mean_token_accuracy": 0.03209046665506321, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.4758628343136166, |
| "grad_norm": 1.359375, |
| "learning_rate": 7.059678787705191e-06, |
| "loss": 0.2456, |
| "mean_token_accuracy": 0.03140619180339854, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.4794140494950616, |
| "grad_norm": 1.125, |
| "learning_rate": 7.046902385501477e-06, |
| "loss": 0.2361, |
| "mean_token_accuracy": 0.03729598738209461, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.4829652646765066, |
| "grad_norm": 1.3125, |
| "learning_rate": 7.03410990702493e-06, |
| "loss": 0.2285, |
| "mean_token_accuracy": 0.03452415266292519, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.4865164798579513, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.02130145274706e-06, |
| "loss": 0.2397, |
| "mean_token_accuracy": 0.03802148198155919, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.4900676950393963, |
| "grad_norm": 1.125, |
| "learning_rate": 7.008477123264849e-06, |
| "loss": 0.2368, |
| "mean_token_accuracy": 0.038115493043733295, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.4936189102208413, |
| "grad_norm": 1.3203125, |
| "learning_rate": 6.995637019299963e-06, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.03685344383484335, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.497170125402286, |
| "grad_norm": 1.484375, |
| "learning_rate": 6.982781241697963e-06, |
| "loss": 0.2389, |
| "mean_token_accuracy": 0.03893219211749965, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.500721340583731, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.969909891427509e-06, |
| "loss": 0.2317, |
| "mean_token_accuracy": 0.0351432016796025, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.504272555765176, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.957023069579561e-06, |
| "loss": 0.2241, |
| "mean_token_accuracy": 0.03948885342106223, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.507823770946621, |
| "grad_norm": 1.328125, |
| "learning_rate": 6.944120877366605e-06, |
| "loss": 0.2438, |
| "mean_token_accuracy": 0.03480056252737995, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.5113749861280656, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.931203416121831e-06, |
| "loss": 0.2336, |
| "mean_token_accuracy": 0.031314958760049194, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.5149262013095106, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.918270787298361e-06, |
| "loss": 0.2466, |
| "mean_token_accuracy": 0.03130312504072208, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.5184774164909554, |
| "grad_norm": 1.5703125, |
| "learning_rate": 6.90532309246844e-06, |
| "loss": 0.249, |
| "mean_token_accuracy": 0.03627348578811507, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.5220286316724003, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.89236043332264e-06, |
| "loss": 0.2229, |
| "mean_token_accuracy": 0.04060299464981654, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.5255798468538453, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.87938291166906e-06, |
| "loss": 0.2348, |
| "mean_token_accuracy": 0.03792013142447104, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.5291310620352903, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.866390629432533e-06, |
| "loss": 0.228, |
| "mean_token_accuracy": 0.04107047704019351, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.5326822772167352, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.8533836886538175e-06, |
| "loss": 0.2524, |
| "mean_token_accuracy": 0.03478304122472764, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.53623349239818, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.840362191488801e-06, |
| "loss": 0.2379, |
| "mean_token_accuracy": 0.036657528744399315, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.539784707579625, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.8273262402076935e-06, |
| "loss": 0.2321, |
| "mean_token_accuracy": 0.03788031428484828, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.5433359227610697, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.814275937194233e-06, |
| "loss": 0.2263, |
| "mean_token_accuracy": 0.0349160894365923, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.5468871379425146, |
| "grad_norm": 1.3515625, |
| "learning_rate": 6.801211384944867e-06, |
| "loss": 0.2405, |
| "mean_token_accuracy": 0.03533229600725463, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.5504383531239596, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.788132686067963e-06, |
| "loss": 0.2356, |
| "mean_token_accuracy": 0.04051366758358199, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.5539895683054046, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.77503994328299e-06, |
| "loss": 0.235, |
| "mean_token_accuracy": 0.03521855714279809, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.5575407834868495, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.761933259419725e-06, |
| "loss": 0.2383, |
| "mean_token_accuracy": 0.03321351679187501, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.5610919986682943, |
| "grad_norm": 3.125, |
| "learning_rate": 6.748812737417428e-06, |
| "loss": 0.2343, |
| "mean_token_accuracy": 0.03764536406015395, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.5646432138497393, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.7356784803240464e-06, |
| "loss": 0.2445, |
| "mean_token_accuracy": 0.0358848099149327, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.568194429031184, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.722530591295406e-06, |
| "loss": 0.2342, |
| "mean_token_accuracy": 0.035192674804420676, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.571745644212629, |
| "grad_norm": 1.0859375, |
| "learning_rate": 6.709369173594396e-06, |
| "loss": 0.2384, |
| "mean_token_accuracy": 0.037970394078001846, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.575296859394074, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.6961943305901515e-06, |
| "loss": 0.2388, |
| "mean_token_accuracy": 0.0376486132190621, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.578848074575519, |
| "grad_norm": 1.296875, |
| "learning_rate": 6.683006165757262e-06, |
| "loss": 0.2249, |
| "mean_token_accuracy": 0.03808089874291909, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.5823992897569639, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.669804782674937e-06, |
| "loss": 0.2401, |
| "mean_token_accuracy": 0.03246638694690773, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.5859505049384086, |
| "grad_norm": 1.421875, |
| "learning_rate": 6.656590285026203e-06, |
| "loss": 0.2311, |
| "mean_token_accuracy": 0.037845788236154476, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.5895017201198534, |
| "grad_norm": 1.2734375, |
| "learning_rate": 6.643362776597089e-06, |
| "loss": 0.2588, |
| "mean_token_accuracy": 0.034229919638164574, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.5930529353012983, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.630122361275811e-06, |
| "loss": 0.2437, |
| "mean_token_accuracy": 0.034587128982821014, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.5966041504827433, |
| "grad_norm": 1.5, |
| "learning_rate": 6.6168691430519524e-06, |
| "loss": 0.25, |
| "mean_token_accuracy": 0.030672385284560733, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.6001553656641883, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.6036032260156526e-06, |
| "loss": 0.2306, |
| "mean_token_accuracy": 0.04033701937441947, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.6037065808456332, |
| "grad_norm": 1.625, |
| "learning_rate": 6.590324714356784e-06, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.034981218981556594, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.607257796027078, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.5770337123641405e-06, |
| "loss": 0.2212, |
| "mean_token_accuracy": 0.0414577160445333, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.610809011208523, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.563730324424609e-06, |
| "loss": 0.2406, |
| "mean_token_accuracy": 0.0381101651910285, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.6143602263899677, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.55041465502236e-06, |
| "loss": 0.2272, |
| "mean_token_accuracy": 0.04102694254106609, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.6179114415714126, |
| "grad_norm": 1.2734375, |
| "learning_rate": 6.53708680873802e-06, |
| "loss": 0.2494, |
| "mean_token_accuracy": 0.03622519101918442, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.6214626567528576, |
| "grad_norm": 1.46875, |
| "learning_rate": 6.523746890247853e-06, |
| "loss": 0.2255, |
| "mean_token_accuracy": 0.03894408120322623, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.6250138719343026, |
| "grad_norm": 1.03125, |
| "learning_rate": 6.510395004322937e-06, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.03699832962593064, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.6285650871157475, |
| "grad_norm": 1.546875, |
| "learning_rate": 6.49703125582834e-06, |
| "loss": 0.2341, |
| "mean_token_accuracy": 0.03479354368937493, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.6321163022971923, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.4836557497222995e-06, |
| "loss": 0.2397, |
| "mean_token_accuracy": 0.04207373945610016, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.6356675174786373, |
| "grad_norm": 1.8671875, |
| "learning_rate": 6.470268591055398e-06, |
| "loss": 0.2337, |
| "mean_token_accuracy": 0.03901815911376616, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.639218732660082, |
| "grad_norm": 1.0703125, |
| "learning_rate": 6.456869884969738e-06, |
| "loss": 0.2213, |
| "mean_token_accuracy": 0.03514585681114113, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.642769947841527, |
| "grad_norm": 1.3828125, |
| "learning_rate": 6.443459736698106e-06, |
| "loss": 0.2382, |
| "mean_token_accuracy": 0.03269572283170419, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.646321163022972, |
| "grad_norm": 1.28125, |
| "learning_rate": 6.430038251563166e-06, |
| "loss": 0.2268, |
| "mean_token_accuracy": 0.03920681574527407, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.649872378204417, |
| "grad_norm": 6.90625, |
| "learning_rate": 6.416605534976614e-06, |
| "loss": 0.2331, |
| "mean_token_accuracy": 0.035078568678727606, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.6534235933858619, |
| "grad_norm": 1.4609375, |
| "learning_rate": 6.403161692438364e-06, |
| "loss": 0.2333, |
| "mean_token_accuracy": 0.03780714001550223, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.6569748085673066, |
| "grad_norm": 1.953125, |
| "learning_rate": 6.3897068295357e-06, |
| "loss": 0.2311, |
| "mean_token_accuracy": 0.033330076843412826, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.6605260237487516, |
| "grad_norm": 1.3984375, |
| "learning_rate": 6.376241051942477e-06, |
| "loss": 0.2281, |
| "mean_token_accuracy": 0.038188748992979527, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.6640772389301963, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.362764465418258e-06, |
| "loss": 0.2361, |
| "mean_token_accuracy": 0.040052126856608083, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.6676284541116413, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.349277175807506e-06, |
| "loss": 0.2286, |
| "mean_token_accuracy": 0.0381559070374351, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.6711796692930863, |
| "grad_norm": 1.421875, |
| "learning_rate": 6.3357792890387485e-06, |
| "loss": 0.2314, |
| "mean_token_accuracy": 0.032606342934741406, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.6747308844745312, |
| "grad_norm": 1.421875, |
| "learning_rate": 6.322270911123734e-06, |
| "loss": 0.2369, |
| "mean_token_accuracy": 0.034345271826168755, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.6782820996559762, |
| "grad_norm": 1.5625, |
| "learning_rate": 6.308752148156614e-06, |
| "loss": 0.2256, |
| "mean_token_accuracy": 0.03452475197263993, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.681833314837421, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.295223106313104e-06, |
| "loss": 0.2374, |
| "mean_token_accuracy": 0.03493335935854702, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.6853845300188657, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.281683891849645e-06, |
| "loss": 0.2407, |
| "mean_token_accuracy": 0.03363906976665021, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.6889357452003106, |
| "grad_norm": 1.2734375, |
| "learning_rate": 6.268134611102578e-06, |
| "loss": 0.2352, |
| "mean_token_accuracy": 0.03696349471283611, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.6924869603817556, |
| "grad_norm": 1.5, |
| "learning_rate": 6.254575370487299e-06, |
| "loss": 0.2299, |
| "mean_token_accuracy": 0.03581606224179268, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.6960381755632006, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.2410062764974366e-06, |
| "loss": 0.2385, |
| "mean_token_accuracy": 0.03706932210479863, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.6995893907446455, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.227427435703997e-06, |
| "loss": 0.2432, |
| "mean_token_accuracy": 0.03006288245160249, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.7031406059260903, |
| "grad_norm": 1.25, |
| "learning_rate": 6.213838954754543e-06, |
| "loss": 0.2429, |
| "mean_token_accuracy": 0.044262315965170274, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.7066918211075353, |
| "grad_norm": 1.5546875, |
| "learning_rate": 6.2002409403723525e-06, |
| "loss": 0.2393, |
| "mean_token_accuracy": 0.03908459947706433, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.71024303628898, |
| "grad_norm": 3.46875, |
| "learning_rate": 6.186633499355576e-06, |
| "loss": 0.245, |
| "mean_token_accuracy": 0.036892217398417415, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.713794251470425, |
| "grad_norm": 1.3984375, |
| "learning_rate": 6.173016738576396e-06, |
| "loss": 0.2362, |
| "mean_token_accuracy": 0.03681937377768918, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.71734546665187, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.159390764980202e-06, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.03445882866799366, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.720896681833315, |
| "grad_norm": 1.25, |
| "learning_rate": 6.145755685584731e-06, |
| "loss": 0.2322, |
| "mean_token_accuracy": 0.039366260476526804, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.7244478970147599, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.132111607479243e-06, |
| "loss": 0.2364, |
| "mean_token_accuracy": 0.03817261819494888, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.7279991121962046, |
| "grad_norm": 1.375, |
| "learning_rate": 6.118458637823669e-06, |
| "loss": 0.2247, |
| "mean_token_accuracy": 0.03410290294414153, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.7315503273776496, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.104796883847777e-06, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.036821881629293784, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.7351015425590943, |
| "grad_norm": 1.53125, |
| "learning_rate": 6.091126452850324e-06, |
| "loss": 0.2207, |
| "mean_token_accuracy": 0.04196582403710636, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.7386527577405393, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.077447452198219e-06, |
| "loss": 0.245, |
| "mean_token_accuracy": 0.030956470383898704, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.7422039729219843, |
| "grad_norm": 2.265625, |
| "learning_rate": 6.063759989325673e-06, |
| "loss": 0.2277, |
| "mean_token_accuracy": 0.04123112729212153, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.7457551881034292, |
| "grad_norm": 1.125, |
| "learning_rate": 6.050064171733362e-06, |
| "loss": 0.2345, |
| "mean_token_accuracy": 0.03712779658235377, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.7493064032848742, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.0363601069875755e-06, |
| "loss": 0.234, |
| "mean_token_accuracy": 0.04036831553457887, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.752857618466319, |
| "grad_norm": 1.4375, |
| "learning_rate": 6.022647902719384e-06, |
| "loss": 0.252, |
| "mean_token_accuracy": 0.03416740952343389, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.7564088336477637, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.008927666623775e-06, |
| "loss": 0.2324, |
| "mean_token_accuracy": 0.03657872789517569, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.7599600488292086, |
| "grad_norm": 1.484375, |
| "learning_rate": 5.9951995064588245e-06, |
| "loss": 0.2284, |
| "mean_token_accuracy": 0.03967171028489247, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.7635112640106536, |
| "grad_norm": 1.4296875, |
| "learning_rate": 5.981463530044841e-06, |
| "loss": 0.2325, |
| "mean_token_accuracy": 0.03946096594881965, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.7670624791920986, |
| "grad_norm": 0.984375, |
| "learning_rate": 5.967719845263524e-06, |
| "loss": 0.2344, |
| "mean_token_accuracy": 0.03674850361494464, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.7706136943735435, |
| "grad_norm": 1.4453125, |
| "learning_rate": 5.953968560057112e-06, |
| "loss": 0.2397, |
| "mean_token_accuracy": 0.03331250947303488, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.7741649095549885, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.940209782427535e-06, |
| "loss": 0.2466, |
| "mean_token_accuracy": 0.035536976964067435, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7777161247364333, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.926443620435572e-06, |
| "loss": 0.2372, |
| "mean_token_accuracy": 0.034023440719465725, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.781267339917878, |
| "grad_norm": 1.4375, |
| "learning_rate": 5.912670182199998e-06, |
| "loss": 0.2423, |
| "mean_token_accuracy": 0.0336604088297463, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.784818555099323, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.898889575896731e-06, |
| "loss": 0.2378, |
| "mean_token_accuracy": 0.034576399257275625, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.788369770280768, |
| "grad_norm": 1.7109375, |
| "learning_rate": 5.8851019097579935e-06, |
| "loss": 0.2299, |
| "mean_token_accuracy": 0.04091495179090998, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.791920985462213, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.871307292071449e-06, |
| "loss": 0.2423, |
| "mean_token_accuracy": 0.03323508650282747, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.7954722006436579, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.857505831179361e-06, |
| "loss": 0.2452, |
| "mean_token_accuracy": 0.039159927426226204, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.7990234158251026, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.843697635477742e-06, |
| "loss": 0.2474, |
| "mean_token_accuracy": 0.032113108623889275, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.8025746310065476, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.8298828134154935e-06, |
| "loss": 0.2273, |
| "mean_token_accuracy": 0.03509632355417125, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.8061258461879923, |
| "grad_norm": 1.8984375, |
| "learning_rate": 5.816061473493565e-06, |
| "loss": 0.2457, |
| "mean_token_accuracy": 0.03150586118135834, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.8096770613694373, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.802233724264094e-06, |
| "loss": 0.2423, |
| "mean_token_accuracy": 0.03352252102922648, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.8132282765508823, |
| "grad_norm": 1.484375, |
| "learning_rate": 5.788399674329559e-06, |
| "loss": 0.2355, |
| "mean_token_accuracy": 0.03722988534354954, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.8167794917323272, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.774559432341918e-06, |
| "loss": 0.2347, |
| "mean_token_accuracy": 0.03498390710228705, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.8203307069137722, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.760713107001773e-06, |
| "loss": 0.239, |
| "mean_token_accuracy": 0.034972945799381705, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.823881922095217, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.746860807057491e-06, |
| "loss": 0.2354, |
| "mean_token_accuracy": 0.03691285277818679, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.827433137276662, |
| "grad_norm": 1.9375, |
| "learning_rate": 5.7330026413043726e-06, |
| "loss": 0.2316, |
| "mean_token_accuracy": 0.03540732314104389, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.8309843524581066, |
| "grad_norm": 1.390625, |
| "learning_rate": 5.719138718583781e-06, |
| "loss": 0.2379, |
| "mean_token_accuracy": 0.03562885835162888, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.8345355676395516, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.705269147782303e-06, |
| "loss": 0.2338, |
| "mean_token_accuracy": 0.033621748105360894, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.8380867828209966, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.6913940378308755e-06, |
| "loss": 0.234, |
| "mean_token_accuracy": 0.04016883431177121, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.8416379980024415, |
| "grad_norm": 1.6015625, |
| "learning_rate": 5.677513497703947e-06, |
| "loss": 0.2357, |
| "mean_token_accuracy": 0.04463333530293312, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.8451892131838865, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.663627636418611e-06, |
| "loss": 0.232, |
| "mean_token_accuracy": 0.03623782243448659, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.8487404283653313, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.649736563033754e-06, |
| "loss": 0.2433, |
| "mean_token_accuracy": 0.03492143240146106, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.852291643546776, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.635840386649197e-06, |
| "loss": 0.2308, |
| "mean_token_accuracy": 0.03824477005036897, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.855842858728221, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.621939216404842e-06, |
| "loss": 0.2439, |
| "mean_token_accuracy": 0.034680439697694965, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.859394073909666, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.608033161479811e-06, |
| "loss": 0.2264, |
| "mean_token_accuracy": 0.039163380019090255, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.862945289091111, |
| "grad_norm": 2.015625, |
| "learning_rate": 5.594122331091591e-06, |
| "loss": 0.2239, |
| "mean_token_accuracy": 0.036949697489035316, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.8664965042725559, |
| "grad_norm": 1.453125, |
| "learning_rate": 5.580206834495169e-06, |
| "loss": 0.2358, |
| "mean_token_accuracy": 0.03440156889701029, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.8700477194540006, |
| "grad_norm": 1.25, |
| "learning_rate": 5.566286780982193e-06, |
| "loss": 0.2363, |
| "mean_token_accuracy": 0.03501361182497931, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.8735989346354456, |
| "grad_norm": 1.5078125, |
| "learning_rate": 5.552362279880091e-06, |
| "loss": 0.2435, |
| "mean_token_accuracy": 0.03163332929398166, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.8771501498168903, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.538433440551221e-06, |
| "loss": 0.2309, |
| "mean_token_accuracy": 0.040504791504645254, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.8807013649983353, |
| "grad_norm": 1.3203125, |
| "learning_rate": 5.524500372392021e-06, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.03375892240728717, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.8842525801797803, |
| "grad_norm": 1.0546875, |
| "learning_rate": 5.5105631848321375e-06, |
| "loss": 0.2396, |
| "mean_token_accuracy": 0.040157406461730716, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.8878037953612252, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.496621987333567e-06, |
| "loss": 0.2455, |
| "mean_token_accuracy": 0.034050565504003316, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.8913550105426702, |
| "grad_norm": 1.3828125, |
| "learning_rate": 5.482676889389808e-06, |
| "loss": 0.2376, |
| "mean_token_accuracy": 0.03857235643590684, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.894906225724115, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.468728000524987e-06, |
| "loss": 0.2264, |
| "mean_token_accuracy": 0.040069550173939206, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.89845744090556, |
| "grad_norm": 1.7734375, |
| "learning_rate": 5.454775430293008e-06, |
| "loss": 0.2318, |
| "mean_token_accuracy": 0.03648939702179632, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.9020086560870046, |
| "grad_norm": 0.8984375, |
| "learning_rate": 5.440819288276683e-06, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.038116528681712225, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.9055598712684496, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.426859684086881e-06, |
| "loss": 0.2456, |
| "mean_token_accuracy": 0.034208514596684836, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.9091110864498946, |
| "grad_norm": 2.0, |
| "learning_rate": 5.412896727361663e-06, |
| "loss": 0.2265, |
| "mean_token_accuracy": 0.040587535226222826, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.9126623016313395, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.398930527765416e-06, |
| "loss": 0.2385, |
| "mean_token_accuracy": 0.03604849764087703, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.9162135168127845, |
| "grad_norm": 1.421875, |
| "learning_rate": 5.384961194988002e-06, |
| "loss": 0.2318, |
| "mean_token_accuracy": 0.03515218906613882, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.9197647319942293, |
| "grad_norm": 1.6015625, |
| "learning_rate": 5.370988838743889e-06, |
| "loss": 0.236, |
| "mean_token_accuracy": 0.03593129891669378, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.9233159471756742, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.357013568771288e-06, |
| "loss": 0.2232, |
| "mean_token_accuracy": 0.03748265598733269, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.926867162357119, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.343035494831298e-06, |
| "loss": 0.2288, |
| "mean_token_accuracy": 0.03536356821859954, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.930418377538564, |
| "grad_norm": 2.171875, |
| "learning_rate": 5.32905472670704e-06, |
| "loss": 0.2246, |
| "mean_token_accuracy": 0.03547816792342928, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.933969592720009, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.315071374202792e-06, |
| "loss": 0.2534, |
| "mean_token_accuracy": 0.03712530972188688, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.9375208079014539, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.301085547143135e-06, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.03489181573604583, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.9410720230828988, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.287097355372079e-06, |
| "loss": 0.2396, |
| "mean_token_accuracy": 0.030296869514131686, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.9446232382643436, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.273106908752211e-06, |
| "loss": 0.2213, |
| "mean_token_accuracy": 0.03895932896557497, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.9481744534457883, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.259114317163822e-06, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.03215844640726573, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.9517256686272333, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.245119690504056e-06, |
| "loss": 0.2353, |
| "mean_token_accuracy": 0.03430458562797867, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.9552768838086783, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.231123138686036e-06, |
| "loss": 0.2239, |
| "mean_token_accuracy": 0.042256227920006495, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.9588280989901232, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.217124771638008e-06, |
| "loss": 0.2488, |
| "mean_token_accuracy": 0.033073368220357224, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.9623793141715682, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.2031246993024705e-06, |
| "loss": 0.2335, |
| "mean_token_accuracy": 0.03448270119224617, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.965930529353013, |
| "grad_norm": 1.03125, |
| "learning_rate": 5.1891230316353215e-06, |
| "loss": 0.2315, |
| "mean_token_accuracy": 0.038373887560737785, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.969481744534458, |
| "grad_norm": 1.25, |
| "learning_rate": 5.1751198786049815e-06, |
| "loss": 0.2251, |
| "mean_token_accuracy": 0.035272060780698666, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.9730329597159026, |
| "grad_norm": 1.6015625, |
| "learning_rate": 5.161115350191543e-06, |
| "loss": 0.2358, |
| "mean_token_accuracy": 0.037838590164028574, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.9765841748973476, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.147109556385898e-06, |
| "loss": 0.225, |
| "mean_token_accuracy": 0.035712803082788014, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.9801353900787926, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.133102607188875e-06, |
| "loss": 0.236, |
| "mean_token_accuracy": 0.037219416080915835, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.9836866052602375, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.119094612610381e-06, |
| "loss": 0.2321, |
| "mean_token_accuracy": 0.03555746503116097, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.9872378204416825, |
| "grad_norm": 1.734375, |
| "learning_rate": 5.10508568266853e-06, |
| "loss": 0.2416, |
| "mean_token_accuracy": 0.03534104762002244, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.9907890356231273, |
| "grad_norm": 1.484375, |
| "learning_rate": 5.091075927388785e-06, |
| "loss": 0.2424, |
| "mean_token_accuracy": 0.034718068922302336, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.9943402508045722, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.077065456803089e-06, |
| "loss": 0.2403, |
| "mean_token_accuracy": 0.036199005553498864, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.997891465986017, |
| "grad_norm": 0.9140625, |
| "learning_rate": 5.063054380949003e-06, |
| "loss": 0.2283, |
| "mean_token_accuracy": 0.0378040480427444, |
| "step": 563 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.0078125, |
| "learning_rate": 5.049042809868845e-06, |
| "loss": 0.1326, |
| "mean_token_accuracy": 0.03690299116026022, |
| "step": 564 |
| }, |
| { |
| "epoch": 2.003551215181445, |
| "grad_norm": 1.5546875, |
| "learning_rate": 5.035030853608817e-06, |
| "loss": 0.2468, |
| "mean_token_accuracy": 0.03732422069515451, |
| "step": 565 |
| }, |
| { |
| "epoch": 2.00710243036289, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.0210186222181515e-06, |
| "loss": 0.2215, |
| "mean_token_accuracy": 0.0390621348105924, |
| "step": 566 |
| }, |
| { |
| "epoch": 2.010653645544335, |
| "grad_norm": 1.578125, |
| "learning_rate": 5.007006225748238e-06, |
| "loss": 0.2289, |
| "mean_token_accuracy": 0.03769652777918964, |
| "step": 567 |
| }, |
| { |
| "epoch": 2.0142048607257794, |
| "grad_norm": 1.328125, |
| "learning_rate": 4.992993774251764e-06, |
| "loss": 0.2419, |
| "mean_token_accuracy": 0.03473068901075749, |
| "step": 568 |
| }, |
| { |
| "epoch": 2.0177560759072244, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.97898137778185e-06, |
| "loss": 0.2346, |
| "mean_token_accuracy": 0.03907662617348251, |
| "step": 569 |
| }, |
| { |
| "epoch": 2.0213072910886694, |
| "grad_norm": 1.125, |
| "learning_rate": 4.964969146391184e-06, |
| "loss": 0.2347, |
| "mean_token_accuracy": 0.03834367445597309, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.0248585062701143, |
| "grad_norm": 1.328125, |
| "learning_rate": 4.950957190131157e-06, |
| "loss": 0.2287, |
| "mean_token_accuracy": 0.041130597212031716, |
| "step": 571 |
| }, |
| { |
| "epoch": 2.0284097214515593, |
| "grad_norm": 3.09375, |
| "learning_rate": 4.936945619050998e-06, |
| "loss": 0.242, |
| "mean_token_accuracy": 0.03198737775164773, |
| "step": 572 |
| }, |
| { |
| "epoch": 2.0319609366330043, |
| "grad_norm": 1.125, |
| "learning_rate": 4.922934543196912e-06, |
| "loss": 0.2301, |
| "mean_token_accuracy": 0.03535122560788295, |
| "step": 573 |
| }, |
| { |
| "epoch": 2.0355121518144492, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.908924072611218e-06, |
| "loss": 0.2334, |
| "mean_token_accuracy": 0.0340686052768433, |
| "step": 574 |
| }, |
| { |
| "epoch": 2.0390633669958937, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.894914317331471e-06, |
| "loss": 0.2274, |
| "mean_token_accuracy": 0.038621567571681226, |
| "step": 575 |
| }, |
| { |
| "epoch": 2.0426145821773387, |
| "grad_norm": 1.25, |
| "learning_rate": 4.88090538738962e-06, |
| "loss": 0.239, |
| "mean_token_accuracy": 0.032639294491673354, |
| "step": 576 |
| }, |
| { |
| "epoch": 2.0461657973587837, |
| "grad_norm": 1.453125, |
| "learning_rate": 4.866897392811127e-06, |
| "loss": 0.2341, |
| "mean_token_accuracy": 0.03567746441694908, |
| "step": 577 |
| }, |
| { |
| "epoch": 2.0497170125402286, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.852890443614105e-06, |
| "loss": 0.2354, |
| "mean_token_accuracy": 0.03639252596258302, |
| "step": 578 |
| }, |
| { |
| "epoch": 2.0532682277216736, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.838884649808458e-06, |
| "loss": 0.2366, |
| "mean_token_accuracy": 0.04135367330673034, |
| "step": 579 |
| }, |
| { |
| "epoch": 2.0568194429031186, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.82488012139502e-06, |
| "loss": 0.2342, |
| "mean_token_accuracy": 0.040144713548215805, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.060370658084563, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.810876968364679e-06, |
| "loss": 0.2303, |
| "mean_token_accuracy": 0.039767392045177985, |
| "step": 581 |
| }, |
| { |
| "epoch": 2.063921873266008, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.796875300697532e-06, |
| "loss": 0.2284, |
| "mean_token_accuracy": 0.03720286238967674, |
| "step": 582 |
| }, |
| { |
| "epoch": 2.067473088447453, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.782875228361994e-06, |
| "loss": 0.2258, |
| "mean_token_accuracy": 0.0378811931987002, |
| "step": 583 |
| }, |
| { |
| "epoch": 2.071024303628898, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.7688768613139655e-06, |
| "loss": 0.2382, |
| "mean_token_accuracy": 0.03471687817364, |
| "step": 584 |
| }, |
| { |
| "epoch": 2.074575518810343, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.754880309495946e-06, |
| "loss": 0.2217, |
| "mean_token_accuracy": 0.03663280401087832, |
| "step": 585 |
| }, |
| { |
| "epoch": 2.078126733991788, |
| "grad_norm": 2.53125, |
| "learning_rate": 4.74088568283618e-06, |
| "loss": 0.2388, |
| "mean_token_accuracy": 0.038270618082606234, |
| "step": 586 |
| }, |
| { |
| "epoch": 2.081677949173233, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.726893091247792e-06, |
| "loss": 0.2363, |
| "mean_token_accuracy": 0.03145957482593076, |
| "step": 587 |
| }, |
| { |
| "epoch": 2.0852291643546774, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.712902644627923e-06, |
| "loss": 0.2259, |
| "mean_token_accuracy": 0.03767043779953383, |
| "step": 588 |
| }, |
| { |
| "epoch": 2.0887803795361224, |
| "grad_norm": 1.375, |
| "learning_rate": 4.698914452856866e-06, |
| "loss": 0.2192, |
| "mean_token_accuracy": 0.03968908742535859, |
| "step": 589 |
| }, |
| { |
| "epoch": 2.0923315947175674, |
| "grad_norm": 2.1875, |
| "learning_rate": 4.684928625797208e-06, |
| "loss": 0.2311, |
| "mean_token_accuracy": 0.03802780578553211, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.0958828098990123, |
| "grad_norm": 1.4453125, |
| "learning_rate": 4.6709452732929614e-06, |
| "loss": 0.2412, |
| "mean_token_accuracy": 0.03319291988009354, |
| "step": 591 |
| }, |
| { |
| "epoch": 2.0994340250804573, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.656964505168703e-06, |
| "loss": 0.2434, |
| "mean_token_accuracy": 0.03837345182910212, |
| "step": 592 |
| }, |
| { |
| "epoch": 2.1029852402619023, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.642986431228713e-06, |
| "loss": 0.228, |
| "mean_token_accuracy": 0.03522277422234765, |
| "step": 593 |
| }, |
| { |
| "epoch": 2.1065364554433472, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.629011161256114e-06, |
| "loss": 0.2266, |
| "mean_token_accuracy": 0.03698047510260949, |
| "step": 594 |
| }, |
| { |
| "epoch": 2.1100876706247917, |
| "grad_norm": 1.5390625, |
| "learning_rate": 4.615038805011999e-06, |
| "loss": 0.2394, |
| "mean_token_accuracy": 0.034128702951420564, |
| "step": 595 |
| }, |
| { |
| "epoch": 2.1136388858062367, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.601069472234584e-06, |
| "loss": 0.235, |
| "mean_token_accuracy": 0.03288589773364947, |
| "step": 596 |
| }, |
| { |
| "epoch": 2.1171901009876817, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.587103272638339e-06, |
| "loss": 0.2261, |
| "mean_token_accuracy": 0.0330345298134489, |
| "step": 597 |
| }, |
| { |
| "epoch": 2.1207413161691266, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.57314031591312e-06, |
| "loss": 0.2354, |
| "mean_token_accuracy": 0.03702639001130592, |
| "step": 598 |
| }, |
| { |
| "epoch": 2.1242925313505716, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.559180711723318e-06, |
| "loss": 0.2499, |
| "mean_token_accuracy": 0.03753973091443186, |
| "step": 599 |
| }, |
| { |
| "epoch": 2.1278437465320166, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.545224569706994e-06, |
| "loss": 0.2327, |
| "mean_token_accuracy": 0.035053080224315636, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1124, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 300, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.283809149801565e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |