diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5272 @@ +{ + "best_global_step": 744, + "best_metric": 0.9021978021978022, + "best_model_checkpoint": "./albert_multilabel_base\\checkpoint-744", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 744, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004032258064516129, + "grad_norm": 4.463273525238037, + "learning_rate": 1.9973118279569895e-05, + "loss": 0.6697, + "step": 1 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 4.188074111938477, + "learning_rate": 1.9946236559139788e-05, + "loss": 0.618, + "step": 2 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 4.5477166175842285, + "learning_rate": 1.9919354838709678e-05, + "loss": 0.5686, + "step": 3 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 3.3298499584198, + "learning_rate": 1.989247311827957e-05, + "loss": 0.5691, + "step": 4 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 3.3022494316101074, + "learning_rate": 1.9865591397849465e-05, + "loss": 0.5157, + "step": 5 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 3.095808744430542, + "learning_rate": 1.9838709677419358e-05, + "loss": 0.4709, + "step": 6 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 2.8690195083618164, + "learning_rate": 1.9811827956989248e-05, + "loss": 0.4679, + "step": 7 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.376504898071289, + "learning_rate": 1.978494623655914e-05, + "loss": 0.4801, + "step": 8 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 2.947565793991089, + "learning_rate": 1.9758064516129035e-05, + "loss": 0.4196, + "step": 9 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 2.624013662338257, + "learning_rate": 1.9731182795698928e-05, + "loss": 0.3948, + "step": 10 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 2.218142032623291, + "learning_rate": 1.9704301075268818e-05, + "loss": 0.3588, + "step": 11 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 2.0879335403442383, + "learning_rate": 1.967741935483871e-05, + "loss": 0.3732, + "step": 12 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 1.8436014652252197, + "learning_rate": 1.9650537634408605e-05, + "loss": 0.3549, + "step": 13 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 2.7105393409729004, + "learning_rate": 1.9623655913978498e-05, + "loss": 0.3588, + "step": 14 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 2.038672685623169, + "learning_rate": 1.9596774193548388e-05, + "loss": 0.3431, + "step": 15 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.9189316034317017, + "learning_rate": 1.956989247311828e-05, + "loss": 0.3334, + "step": 16 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 1.5137898921966553, + "learning_rate": 1.9543010752688174e-05, + "loss": 0.3142, + "step": 17 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 1.6630574464797974, + "learning_rate": 1.9516129032258068e-05, + "loss": 0.3702, + "step": 18 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 1.7377407550811768, + "learning_rate": 1.9489247311827958e-05, + "loss": 0.3059, + "step": 19 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 1.9414572715759277, + "learning_rate": 1.946236559139785e-05, + "loss": 0.3146, + "step": 20 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 1.5869086980819702, + "learning_rate": 1.9435483870967744e-05, + "loss": 0.3053, + "step": 21 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 1.5989874601364136, + "learning_rate": 1.9408602150537638e-05, + "loss": 0.3022, + "step": 22 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 2.2120656967163086, + "learning_rate": 1.9381720430107528e-05, + "loss": 0.3372, + "step": 23 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 1.8420207500457764, + "learning_rate": 1.935483870967742e-05, + "loss": 0.2947, + "step": 24 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 1.324240803718567, + "learning_rate": 1.9327956989247314e-05, + "loss": 0.3117, + "step": 25 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 1.3763035535812378, + "learning_rate": 1.9301075268817207e-05, + "loss": 0.3097, + "step": 26 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 1.13789963722229, + "learning_rate": 1.9274193548387097e-05, + "loss": 0.3073, + "step": 27 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 1.8375061750411987, + "learning_rate": 1.924731182795699e-05, + "loss": 0.2729, + "step": 28 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 1.4714664220809937, + "learning_rate": 1.9220430107526884e-05, + "loss": 0.2884, + "step": 29 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 1.7859275341033936, + "learning_rate": 1.9193548387096777e-05, + "loss": 0.2921, + "step": 30 + }, + { + "epoch": 0.125, + "grad_norm": 1.1975992918014526, + "learning_rate": 1.916666666666667e-05, + "loss": 0.2999, + "step": 31 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 1.1491625308990479, + "learning_rate": 1.913978494623656e-05, + "loss": 0.3037, + "step": 32 + }, + { + "epoch": 0.13306451612903225, + "grad_norm": 1.7719403505325317, + "learning_rate": 1.9112903225806454e-05, + "loss": 0.3097, + "step": 33 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 2.003831148147583, + "learning_rate": 1.9086021505376347e-05, + "loss": 0.3419, + "step": 34 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 1.2528795003890991, + "learning_rate": 1.905913978494624e-05, + "loss": 0.3104, + "step": 35 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 1.9549521207809448, + "learning_rate": 1.903225806451613e-05, + "loss": 0.2948, + "step": 36 + }, + { + "epoch": 0.14919354838709678, + "grad_norm": 3.2783548831939697, + "learning_rate": 1.9005376344086024e-05, + "loss": 0.2442, + "step": 37 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 2.185920476913452, + "learning_rate": 1.8978494623655917e-05, + "loss": 0.2714, + "step": 38 + }, + { + "epoch": 0.15725806451612903, + "grad_norm": 1.0794386863708496, + "learning_rate": 1.895161290322581e-05, + "loss": 0.27, + "step": 39 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 1.1620965003967285, + "learning_rate": 1.89247311827957e-05, + "loss": 0.2259, + "step": 40 + }, + { + "epoch": 0.16532258064516128, + "grad_norm": 1.599996566772461, + "learning_rate": 1.8897849462365594e-05, + "loss": 0.2035, + "step": 41 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 1.2780784368515015, + "learning_rate": 1.8870967741935487e-05, + "loss": 0.22, + "step": 42 + }, + { + "epoch": 0.17338709677419356, + "grad_norm": 1.4363526105880737, + "learning_rate": 1.884408602150538e-05, + "loss": 0.2368, + "step": 43 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 2.520078420639038, + "learning_rate": 1.881720430107527e-05, + "loss": 0.1892, + "step": 44 + }, + { + "epoch": 0.1814516129032258, + "grad_norm": 1.2548242807388306, + "learning_rate": 1.8790322580645163e-05, + "loss": 0.2666, + "step": 45 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 3.150402545928955, + "learning_rate": 1.8763440860215057e-05, + "loss": 0.2119, + "step": 46 + }, + { + "epoch": 0.18951612903225806, + "grad_norm": 2.226469039916992, + "learning_rate": 1.8736559139784947e-05, + "loss": 0.2952, + "step": 47 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 2.2046310901641846, + "learning_rate": 1.870967741935484e-05, + "loss": 0.2089, + "step": 48 + }, + { + "epoch": 0.1975806451612903, + "grad_norm": 2.4914636611938477, + "learning_rate": 1.8682795698924733e-05, + "loss": 0.2622, + "step": 49 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 1.6519418954849243, + "learning_rate": 1.8655913978494623e-05, + "loss": 0.2268, + "step": 50 + }, + { + "epoch": 0.2056451612903226, + "grad_norm": 2.803372383117676, + "learning_rate": 1.8629032258064517e-05, + "loss": 0.2628, + "step": 51 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 1.6514664888381958, + "learning_rate": 1.860215053763441e-05, + "loss": 0.1742, + "step": 52 + }, + { + "epoch": 0.21370967741935484, + "grad_norm": 4.286014556884766, + "learning_rate": 1.8575268817204303e-05, + "loss": 0.3353, + "step": 53 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.8548387096774193e-05, + "loss": 0.193, + "step": 54 + }, + { + "epoch": 0.2217741935483871, + "grad_norm": 3.4638161659240723, + "learning_rate": 1.8521505376344086e-05, + "loss": 0.243, + "step": 55 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 2.002242088317871, + "learning_rate": 1.849462365591398e-05, + "loss": 0.2098, + "step": 56 + }, + { + "epoch": 0.22983870967741934, + "grad_norm": 2.865968942642212, + "learning_rate": 1.8467741935483873e-05, + "loss": 0.2875, + "step": 57 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 4.366675853729248, + "learning_rate": 1.8440860215053763e-05, + "loss": 0.1991, + "step": 58 + }, + { + "epoch": 0.23790322580645162, + "grad_norm": 3.3366456031799316, + "learning_rate": 1.8413978494623656e-05, + "loss": 0.2024, + "step": 59 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 3.0101261138916016, + "learning_rate": 1.838709677419355e-05, + "loss": 0.1529, + "step": 60 + }, + { + "epoch": 0.24596774193548387, + "grad_norm": 1.7896872758865356, + "learning_rate": 1.8360215053763443e-05, + "loss": 0.1969, + "step": 61 + }, + { + "epoch": 0.25, + "grad_norm": 3.260681390762329, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.2426, + "step": 62 + }, + { + "epoch": 0.2540322580645161, + "grad_norm": 2.9987573623657227, + "learning_rate": 1.8306451612903226e-05, + "loss": 0.2763, + "step": 63 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 2.370321273803711, + "learning_rate": 1.827956989247312e-05, + "loss": 0.1721, + "step": 64 + }, + { + "epoch": 0.2620967741935484, + "grad_norm": 2.941296339035034, + "learning_rate": 1.8252688172043013e-05, + "loss": 0.2495, + "step": 65 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 2.7063724994659424, + "learning_rate": 1.8225806451612903e-05, + "loss": 0.1995, + "step": 66 + }, + { + "epoch": 0.2701612903225806, + "grad_norm": 2.6740007400512695, + "learning_rate": 1.8198924731182796e-05, + "loss": 0.1484, + "step": 67 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 4.054372310638428, + "learning_rate": 1.817204301075269e-05, + "loss": 0.1958, + "step": 68 + }, + { + "epoch": 0.2782258064516129, + "grad_norm": 3.995595932006836, + "learning_rate": 1.8145161290322583e-05, + "loss": 0.1581, + "step": 69 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 4.279732704162598, + "learning_rate": 1.8118279569892473e-05, + "loss": 0.2051, + "step": 70 + }, + { + "epoch": 0.2862903225806452, + "grad_norm": 2.194711446762085, + "learning_rate": 1.8091397849462366e-05, + "loss": 0.2699, + "step": 71 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 2.1309289932250977, + "learning_rate": 1.806451612903226e-05, + "loss": 0.202, + "step": 72 + }, + { + "epoch": 0.29435483870967744, + "grad_norm": 4.026139259338379, + "learning_rate": 1.8037634408602153e-05, + "loss": 0.2346, + "step": 73 + }, + { + "epoch": 0.29838709677419356, + "grad_norm": 2.3642139434814453, + "learning_rate": 1.8010752688172042e-05, + "loss": 0.2095, + "step": 74 + }, + { + "epoch": 0.3024193548387097, + "grad_norm": 1.9105987548828125, + "learning_rate": 1.7983870967741936e-05, + "loss": 0.1839, + "step": 75 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 1.926434874534607, + "learning_rate": 1.795698924731183e-05, + "loss": 0.1984, + "step": 76 + }, + { + "epoch": 0.31048387096774194, + "grad_norm": 1.3975627422332764, + "learning_rate": 1.7930107526881722e-05, + "loss": 0.2008, + "step": 77 + }, + { + "epoch": 0.31451612903225806, + "grad_norm": 2.820605993270874, + "learning_rate": 1.7903225806451612e-05, + "loss": 0.1451, + "step": 78 + }, + { + "epoch": 0.3185483870967742, + "grad_norm": 2.23899245262146, + "learning_rate": 1.7876344086021506e-05, + "loss": 0.1839, + "step": 79 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 1.4833577871322632, + "learning_rate": 1.78494623655914e-05, + "loss": 0.1874, + "step": 80 + }, + { + "epoch": 0.32661290322580644, + "grad_norm": 3.754470109939575, + "learning_rate": 1.7822580645161292e-05, + "loss": 0.2299, + "step": 81 + }, + { + "epoch": 0.33064516129032256, + "grad_norm": 3.65791654586792, + "learning_rate": 1.7795698924731186e-05, + "loss": 0.2075, + "step": 82 + }, + { + "epoch": 0.3346774193548387, + "grad_norm": 2.704558849334717, + "learning_rate": 1.7768817204301075e-05, + "loss": 0.1688, + "step": 83 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 1.7909516096115112, + "learning_rate": 1.774193548387097e-05, + "loss": 0.2472, + "step": 84 + }, + { + "epoch": 0.34274193548387094, + "grad_norm": 1.412300705909729, + "learning_rate": 1.7715053763440862e-05, + "loss": 0.1858, + "step": 85 + }, + { + "epoch": 0.3467741935483871, + "grad_norm": 2.0086007118225098, + "learning_rate": 1.7688172043010755e-05, + "loss": 0.2088, + "step": 86 + }, + { + "epoch": 0.35080645161290325, + "grad_norm": 1.4397780895233154, + "learning_rate": 1.7661290322580645e-05, + "loss": 0.1816, + "step": 87 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 2.947721242904663, + "learning_rate": 1.763440860215054e-05, + "loss": 0.1629, + "step": 88 + }, + { + "epoch": 0.3588709677419355, + "grad_norm": 1.9372519254684448, + "learning_rate": 1.7607526881720432e-05, + "loss": 0.1636, + "step": 89 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 1.4998315572738647, + "learning_rate": 1.7580645161290325e-05, + "loss": 0.2162, + "step": 90 + }, + { + "epoch": 0.36693548387096775, + "grad_norm": 3.8654611110687256, + "learning_rate": 1.7553763440860215e-05, + "loss": 0.2311, + "step": 91 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 2.416931390762329, + "learning_rate": 1.752688172043011e-05, + "loss": 0.161, + "step": 92 + }, + { + "epoch": 0.375, + "grad_norm": 1.4065924882888794, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.1919, + "step": 93 + }, + { + "epoch": 0.3790322580645161, + "grad_norm": 3.5286269187927246, + "learning_rate": 1.7473118279569895e-05, + "loss": 0.2338, + "step": 94 + }, + { + "epoch": 0.38306451612903225, + "grad_norm": 3.0253779888153076, + "learning_rate": 1.7446236559139785e-05, + "loss": 0.1824, + "step": 95 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 1.31352961063385, + "learning_rate": 1.741935483870968e-05, + "loss": 0.1203, + "step": 96 + }, + { + "epoch": 0.3911290322580645, + "grad_norm": 2.3616368770599365, + "learning_rate": 1.7392473118279572e-05, + "loss": 0.2555, + "step": 97 + }, + { + "epoch": 0.3951612903225806, + "grad_norm": 1.9040699005126953, + "learning_rate": 1.7365591397849465e-05, + "loss": 0.1857, + "step": 98 + }, + { + "epoch": 0.39919354838709675, + "grad_norm": 1.4389543533325195, + "learning_rate": 1.7338709677419355e-05, + "loss": 0.1379, + "step": 99 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 1.4597176313400269, + "learning_rate": 1.7311827956989248e-05, + "loss": 0.1258, + "step": 100 + }, + { + "epoch": 0.40725806451612906, + "grad_norm": 2.730079174041748, + "learning_rate": 1.728494623655914e-05, + "loss": 0.1936, + "step": 101 + }, + { + "epoch": 0.4112903225806452, + "grad_norm": 1.6900081634521484, + "learning_rate": 1.7258064516129035e-05, + "loss": 0.1782, + "step": 102 + }, + { + "epoch": 0.4153225806451613, + "grad_norm": 3.9283804893493652, + "learning_rate": 1.7231182795698925e-05, + "loss": 0.1447, + "step": 103 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 2.625030755996704, + "learning_rate": 1.7204301075268818e-05, + "loss": 0.1361, + "step": 104 + }, + { + "epoch": 0.42338709677419356, + "grad_norm": 6.293011665344238, + "learning_rate": 1.717741935483871e-05, + "loss": 0.2563, + "step": 105 + }, + { + "epoch": 0.4274193548387097, + "grad_norm": 3.8414859771728516, + "learning_rate": 1.7150537634408605e-05, + "loss": 0.2118, + "step": 106 + }, + { + "epoch": 0.4314516129032258, + "grad_norm": 1.7090542316436768, + "learning_rate": 1.7123655913978495e-05, + "loss": 0.2919, + "step": 107 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 1.4093631505966187, + "learning_rate": 1.7096774193548388e-05, + "loss": 0.2151, + "step": 108 + }, + { + "epoch": 0.43951612903225806, + "grad_norm": 1.6816186904907227, + "learning_rate": 1.706989247311828e-05, + "loss": 0.2001, + "step": 109 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 2.5388357639312744, + "learning_rate": 1.7043010752688175e-05, + "loss": 0.1839, + "step": 110 + }, + { + "epoch": 0.4475806451612903, + "grad_norm": 4.355051040649414, + "learning_rate": 1.7016129032258068e-05, + "loss": 0.1517, + "step": 111 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 5.043978691101074, + "learning_rate": 1.6989247311827958e-05, + "loss": 0.1148, + "step": 112 + }, + { + "epoch": 0.45564516129032256, + "grad_norm": 4.623153209686279, + "learning_rate": 1.696236559139785e-05, + "loss": 0.2277, + "step": 113 + }, + { + "epoch": 0.4596774193548387, + "grad_norm": 2.165748119354248, + "learning_rate": 1.6935483870967744e-05, + "loss": 0.175, + "step": 114 + }, + { + "epoch": 0.4637096774193548, + "grad_norm": 2.074892520904541, + "learning_rate": 1.6908602150537638e-05, + "loss": 0.107, + "step": 115 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 4.597195625305176, + "learning_rate": 1.6881720430107528e-05, + "loss": 0.201, + "step": 116 + }, + { + "epoch": 0.4717741935483871, + "grad_norm": 1.589660406112671, + "learning_rate": 1.685483870967742e-05, + "loss": 0.1904, + "step": 117 + }, + { + "epoch": 0.47580645161290325, + "grad_norm": 3.892699718475342, + "learning_rate": 1.6827956989247314e-05, + "loss": 0.1536, + "step": 118 + }, + { + "epoch": 0.4798387096774194, + "grad_norm": 1.4085686206817627, + "learning_rate": 1.6801075268817208e-05, + "loss": 0.1762, + "step": 119 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 2.0159590244293213, + "learning_rate": 1.6774193548387098e-05, + "loss": 0.1433, + "step": 120 + }, + { + "epoch": 0.4879032258064516, + "grad_norm": 3.211463451385498, + "learning_rate": 1.674731182795699e-05, + "loss": 0.2131, + "step": 121 + }, + { + "epoch": 0.49193548387096775, + "grad_norm": 1.9942578077316284, + "learning_rate": 1.6720430107526884e-05, + "loss": 0.1962, + "step": 122 + }, + { + "epoch": 0.4959677419354839, + "grad_norm": 2.7773678302764893, + "learning_rate": 1.6693548387096778e-05, + "loss": 0.1516, + "step": 123 + }, + { + "epoch": 0.5, + "grad_norm": 4.387947082519531, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.2402, + "step": 124 + }, + { + "epoch": 0.5040322580645161, + "grad_norm": 3.206733465194702, + "learning_rate": 1.663978494623656e-05, + "loss": 0.1615, + "step": 125 + }, + { + "epoch": 0.5080645161290323, + "grad_norm": 4.810270309448242, + "learning_rate": 1.6612903225806454e-05, + "loss": 0.1587, + "step": 126 + }, + { + "epoch": 0.5120967741935484, + "grad_norm": 3.2066237926483154, + "learning_rate": 1.6586021505376347e-05, + "loss": 0.1747, + "step": 127 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 1.2533971071243286, + "learning_rate": 1.6559139784946237e-05, + "loss": 0.1662, + "step": 128 + }, + { + "epoch": 0.5201612903225806, + "grad_norm": 4.853026390075684, + "learning_rate": 1.653225806451613e-05, + "loss": 0.1895, + "step": 129 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 1.5984219312667847, + "learning_rate": 1.6505376344086024e-05, + "loss": 0.1867, + "step": 130 + }, + { + "epoch": 0.5282258064516129, + "grad_norm": 1.9208190441131592, + "learning_rate": 1.6478494623655917e-05, + "loss": 0.1661, + "step": 131 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 4.714879035949707, + "learning_rate": 1.6451612903225807e-05, + "loss": 0.1978, + "step": 132 + }, + { + "epoch": 0.5362903225806451, + "grad_norm": 1.9932862520217896, + "learning_rate": 1.64247311827957e-05, + "loss": 0.1894, + "step": 133 + }, + { + "epoch": 0.5403225806451613, + "grad_norm": 1.8594914674758911, + "learning_rate": 1.6397849462365594e-05, + "loss": 0.1819, + "step": 134 + }, + { + "epoch": 0.5443548387096774, + "grad_norm": 3.666376829147339, + "learning_rate": 1.6370967741935487e-05, + "loss": 0.1594, + "step": 135 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 5.887596607208252, + "learning_rate": 1.6344086021505377e-05, + "loss": 0.1807, + "step": 136 + }, + { + "epoch": 0.5524193548387096, + "grad_norm": 1.3112871646881104, + "learning_rate": 1.631720430107527e-05, + "loss": 0.1411, + "step": 137 + }, + { + "epoch": 0.5564516129032258, + "grad_norm": 2.456446409225464, + "learning_rate": 1.6290322580645164e-05, + "loss": 0.1413, + "step": 138 + }, + { + "epoch": 0.5604838709677419, + "grad_norm": 4.192088603973389, + "learning_rate": 1.6263440860215057e-05, + "loss": 0.1422, + "step": 139 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 2.659003734588623, + "learning_rate": 1.6236559139784947e-05, + "loss": 0.1812, + "step": 140 + }, + { + "epoch": 0.5685483870967742, + "grad_norm": 1.699617624282837, + "learning_rate": 1.620967741935484e-05, + "loss": 0.1771, + "step": 141 + }, + { + "epoch": 0.5725806451612904, + "grad_norm": 2.221937656402588, + "learning_rate": 1.618279569892473e-05, + "loss": 0.1815, + "step": 142 + }, + { + "epoch": 0.5766129032258065, + "grad_norm": 1.5366272926330566, + "learning_rate": 1.6155913978494623e-05, + "loss": 0.184, + "step": 143 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 2.0756564140319824, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.145, + "step": 144 + }, + { + "epoch": 0.5846774193548387, + "grad_norm": 1.1063413619995117, + "learning_rate": 1.610215053763441e-05, + "loss": 0.0964, + "step": 145 + }, + { + "epoch": 0.5887096774193549, + "grad_norm": 1.9002684354782104, + "learning_rate": 1.60752688172043e-05, + "loss": 0.2313, + "step": 146 + }, + { + "epoch": 0.592741935483871, + "grad_norm": 3.5199642181396484, + "learning_rate": 1.6048387096774193e-05, + "loss": 0.1803, + "step": 147 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 2.263772487640381, + "learning_rate": 1.6021505376344087e-05, + "loss": 0.1661, + "step": 148 + }, + { + "epoch": 0.6008064516129032, + "grad_norm": 3.434790849685669, + "learning_rate": 1.599462365591398e-05, + "loss": 0.1729, + "step": 149 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 3.1659963130950928, + "learning_rate": 1.596774193548387e-05, + "loss": 0.2271, + "step": 150 + }, + { + "epoch": 0.6088709677419355, + "grad_norm": 3.9298930168151855, + "learning_rate": 1.5940860215053763e-05, + "loss": 0.1787, + "step": 151 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 5.221301078796387, + "learning_rate": 1.5913978494623657e-05, + "loss": 0.1432, + "step": 152 + }, + { + "epoch": 0.6169354838709677, + "grad_norm": 0.9517404437065125, + "learning_rate": 1.588709677419355e-05, + "loss": 0.143, + "step": 153 + }, + { + "epoch": 0.6209677419354839, + "grad_norm": 1.685404658317566, + "learning_rate": 1.586021505376344e-05, + "loss": 0.1446, + "step": 154 + }, + { + "epoch": 0.625, + "grad_norm": 4.428375720977783, + "learning_rate": 1.5833333333333333e-05, + "loss": 0.2276, + "step": 155 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 1.976226568222046, + "learning_rate": 1.5806451612903226e-05, + "loss": 0.1822, + "step": 156 + }, + { + "epoch": 0.6330645161290323, + "grad_norm": 2.600104570388794, + "learning_rate": 1.577956989247312e-05, + "loss": 0.2352, + "step": 157 + }, + { + "epoch": 0.6370967741935484, + "grad_norm": 2.413137197494507, + "learning_rate": 1.5752688172043013e-05, + "loss": 0.258, + "step": 158 + }, + { + "epoch": 0.6411290322580645, + "grad_norm": 2.801959753036499, + "learning_rate": 1.5725806451612903e-05, + "loss": 0.1796, + "step": 159 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 2.2109487056732178, + "learning_rate": 1.5698924731182796e-05, + "loss": 0.1749, + "step": 160 + }, + { + "epoch": 0.6491935483870968, + "grad_norm": 3.4560952186584473, + "learning_rate": 1.567204301075269e-05, + "loss": 0.1315, + "step": 161 + }, + { + "epoch": 0.6532258064516129, + "grad_norm": 3.611140251159668, + "learning_rate": 1.5645161290322583e-05, + "loss": 0.2518, + "step": 162 + }, + { + "epoch": 0.657258064516129, + "grad_norm": 3.5987987518310547, + "learning_rate": 1.5618279569892473e-05, + "loss": 0.2134, + "step": 163 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 3.6609513759613037, + "learning_rate": 1.5591397849462366e-05, + "loss": 0.1134, + "step": 164 + }, + { + "epoch": 0.6653225806451613, + "grad_norm": 4.708600044250488, + "learning_rate": 1.556451612903226e-05, + "loss": 0.1581, + "step": 165 + }, + { + "epoch": 0.6693548387096774, + "grad_norm": 2.0652990341186523, + "learning_rate": 1.5537634408602153e-05, + "loss": 0.114, + "step": 166 + }, + { + "epoch": 0.6733870967741935, + "grad_norm": 2.3634023666381836, + "learning_rate": 1.5510752688172043e-05, + "loss": 0.1383, + "step": 167 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 1.8686503171920776, + "learning_rate": 1.5483870967741936e-05, + "loss": 0.1071, + "step": 168 + }, + { + "epoch": 0.6814516129032258, + "grad_norm": 2.7749221324920654, + "learning_rate": 1.545698924731183e-05, + "loss": 0.1394, + "step": 169 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 5.210952281951904, + "learning_rate": 1.5430107526881723e-05, + "loss": 0.1698, + "step": 170 + }, + { + "epoch": 0.6895161290322581, + "grad_norm": 1.9079982042312622, + "learning_rate": 1.5403225806451613e-05, + "loss": 0.149, + "step": 171 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 3.1015419960021973, + "learning_rate": 1.5376344086021506e-05, + "loss": 0.1821, + "step": 172 + }, + { + "epoch": 0.6975806451612904, + "grad_norm": 2.9040095806121826, + "learning_rate": 1.53494623655914e-05, + "loss": 0.1624, + "step": 173 + }, + { + "epoch": 0.7016129032258065, + "grad_norm": 3.451782703399658, + "learning_rate": 1.5322580645161292e-05, + "loss": 0.2349, + "step": 174 + }, + { + "epoch": 0.7056451612903226, + "grad_norm": 2.0777103900909424, + "learning_rate": 1.5295698924731182e-05, + "loss": 0.1575, + "step": 175 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 2.8455822467803955, + "learning_rate": 1.5268817204301076e-05, + "loss": 0.1402, + "step": 176 + }, + { + "epoch": 0.7137096774193549, + "grad_norm": 2.1864373683929443, + "learning_rate": 1.5241935483870969e-05, + "loss": 0.2501, + "step": 177 + }, + { + "epoch": 0.717741935483871, + "grad_norm": 2.945974349975586, + "learning_rate": 1.521505376344086e-05, + "loss": 0.1758, + "step": 178 + }, + { + "epoch": 0.7217741935483871, + "grad_norm": 2.071998357772827, + "learning_rate": 1.5188172043010754e-05, + "loss": 0.1185, + "step": 179 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 2.1226487159729004, + "learning_rate": 1.5161290322580646e-05, + "loss": 0.1809, + "step": 180 + }, + { + "epoch": 0.7298387096774194, + "grad_norm": 3.2159790992736816, + "learning_rate": 1.5134408602150539e-05, + "loss": 0.1583, + "step": 181 + }, + { + "epoch": 0.7338709677419355, + "grad_norm": 1.7255215644836426, + "learning_rate": 1.510752688172043e-05, + "loss": 0.1706, + "step": 182 + }, + { + "epoch": 0.7379032258064516, + "grad_norm": 1.9457521438598633, + "learning_rate": 1.5080645161290324e-05, + "loss": 0.1658, + "step": 183 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 7.032689571380615, + "learning_rate": 1.5053763440860215e-05, + "loss": 0.173, + "step": 184 + }, + { + "epoch": 0.7459677419354839, + "grad_norm": 2.1762492656707764, + "learning_rate": 1.5026881720430109e-05, + "loss": 0.144, + "step": 185 + }, + { + "epoch": 0.75, + "grad_norm": 3.8494467735290527, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.1933, + "step": 186 + }, + { + "epoch": 0.7540322580645161, + "grad_norm": 1.369672179222107, + "learning_rate": 1.4973118279569894e-05, + "loss": 0.1082, + "step": 187 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 2.0053024291992188, + "learning_rate": 1.4946236559139787e-05, + "loss": 0.1564, + "step": 188 + }, + { + "epoch": 0.7620967741935484, + "grad_norm": 3.1830568313598633, + "learning_rate": 1.4919354838709679e-05, + "loss": 0.1956, + "step": 189 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 2.129666328430176, + "learning_rate": 1.4892473118279572e-05, + "loss": 0.2594, + "step": 190 + }, + { + "epoch": 0.7701612903225806, + "grad_norm": 2.121384859085083, + "learning_rate": 1.4865591397849464e-05, + "loss": 0.1363, + "step": 191 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 2.5438694953918457, + "learning_rate": 1.4838709677419357e-05, + "loss": 0.15, + "step": 192 + }, + { + "epoch": 0.7782258064516129, + "grad_norm": 1.4790626764297485, + "learning_rate": 1.4811827956989248e-05, + "loss": 0.1677, + "step": 193 + }, + { + "epoch": 0.782258064516129, + "grad_norm": 5.348865985870361, + "learning_rate": 1.4784946236559142e-05, + "loss": 0.2535, + "step": 194 + }, + { + "epoch": 0.7862903225806451, + "grad_norm": 7.304387092590332, + "learning_rate": 1.4758064516129033e-05, + "loss": 0.2327, + "step": 195 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 1.6960372924804688, + "learning_rate": 1.4731182795698927e-05, + "loss": 0.1453, + "step": 196 + }, + { + "epoch": 0.7943548387096774, + "grad_norm": 5.402785301208496, + "learning_rate": 1.4704301075268818e-05, + "loss": 0.2091, + "step": 197 + }, + { + "epoch": 0.7983870967741935, + "grad_norm": 6.772708415985107, + "learning_rate": 1.4677419354838712e-05, + "loss": 0.1622, + "step": 198 + }, + { + "epoch": 0.8024193548387096, + "grad_norm": 2.564171075820923, + "learning_rate": 1.4650537634408603e-05, + "loss": 0.2299, + "step": 199 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 2.6038239002227783, + "learning_rate": 1.4623655913978497e-05, + "loss": 0.1725, + "step": 200 + }, + { + "epoch": 0.8104838709677419, + "grad_norm": 2.527942180633545, + "learning_rate": 1.4596774193548388e-05, + "loss": 0.1359, + "step": 201 + }, + { + "epoch": 0.8145161290322581, + "grad_norm": 3.5944533348083496, + "learning_rate": 1.4569892473118282e-05, + "loss": 0.1268, + "step": 202 + }, + { + "epoch": 0.8185483870967742, + "grad_norm": 3.4926414489746094, + "learning_rate": 1.4543010752688173e-05, + "loss": 0.1473, + "step": 203 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 5.071582794189453, + "learning_rate": 1.4516129032258066e-05, + "loss": 0.196, + "step": 204 + }, + { + "epoch": 0.8266129032258065, + "grad_norm": 4.80251932144165, + "learning_rate": 1.4489247311827958e-05, + "loss": 0.19, + "step": 205 + }, + { + "epoch": 0.8306451612903226, + "grad_norm": 3.378063678741455, + "learning_rate": 1.4462365591397851e-05, + "loss": 0.1293, + "step": 206 + }, + { + "epoch": 0.8346774193548387, + "grad_norm": 3.138362407684326, + "learning_rate": 1.4435483870967743e-05, + "loss": 0.1919, + "step": 207 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 3.114121675491333, + "learning_rate": 1.4408602150537636e-05, + "loss": 0.3021, + "step": 208 + }, + { + "epoch": 0.842741935483871, + "grad_norm": 2.903273582458496, + "learning_rate": 1.4381720430107528e-05, + "loss": 0.1613, + "step": 209 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 2.530947685241699, + "learning_rate": 1.4354838709677421e-05, + "loss": 0.2119, + "step": 210 + }, + { + "epoch": 0.8508064516129032, + "grad_norm": 1.9773578643798828, + "learning_rate": 1.4327956989247313e-05, + "loss": 0.1994, + "step": 211 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 3.0400474071502686, + "learning_rate": 1.4301075268817206e-05, + "loss": 0.1553, + "step": 212 + }, + { + "epoch": 0.8588709677419355, + "grad_norm": 2.43621826171875, + "learning_rate": 1.4274193548387098e-05, + "loss": 0.1592, + "step": 213 + }, + { + "epoch": 0.8629032258064516, + "grad_norm": 3.8775973320007324, + "learning_rate": 1.4247311827956991e-05, + "loss": 0.161, + "step": 214 + }, + { + "epoch": 0.8669354838709677, + "grad_norm": 2.805100202560425, + "learning_rate": 1.4220430107526883e-05, + "loss": 0.1363, + "step": 215 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 5.203761577606201, + "learning_rate": 1.4193548387096776e-05, + "loss": 0.1403, + "step": 216 + }, + { + "epoch": 0.875, + "grad_norm": 2.3190488815307617, + "learning_rate": 1.416666666666667e-05, + "loss": 0.2115, + "step": 217 + }, + { + "epoch": 0.8790322580645161, + "grad_norm": 2.513376474380493, + "learning_rate": 1.4139784946236561e-05, + "loss": 0.227, + "step": 218 + }, + { + "epoch": 0.8830645161290323, + "grad_norm": 3.1920270919799805, + "learning_rate": 1.4112903225806454e-05, + "loss": 0.1732, + "step": 219 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 2.252589225769043, + "learning_rate": 1.4086021505376346e-05, + "loss": 0.1329, + "step": 220 + }, + { + "epoch": 0.8911290322580645, + "grad_norm": 3.943742275238037, + "learning_rate": 1.405913978494624e-05, + "loss": 0.1473, + "step": 221 + }, + { + "epoch": 0.8951612903225806, + "grad_norm": 4.05203104019165, + "learning_rate": 1.4032258064516131e-05, + "loss": 0.1647, + "step": 222 + }, + { + "epoch": 0.8991935483870968, + "grad_norm": 4.86761474609375, + "learning_rate": 1.4005376344086024e-05, + "loss": 0.118, + "step": 223 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 2.4546005725860596, + "learning_rate": 1.3978494623655916e-05, + "loss": 0.2356, + "step": 224 + }, + { + "epoch": 0.907258064516129, + "grad_norm": 1.0434294939041138, + "learning_rate": 1.3951612903225809e-05, + "loss": 0.1536, + "step": 225 + }, + { + "epoch": 0.9112903225806451, + "grad_norm": 1.4482147693634033, + "learning_rate": 1.39247311827957e-05, + "loss": 0.1479, + "step": 226 + }, + { + "epoch": 0.9153225806451613, + "grad_norm": 2.1162939071655273, + "learning_rate": 1.3897849462365594e-05, + "loss": 0.2139, + "step": 227 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 1.401563048362732, + "learning_rate": 1.3870967741935486e-05, + "loss": 0.2074, + "step": 228 + }, + { + "epoch": 0.9233870967741935, + "grad_norm": 2.659918785095215, + "learning_rate": 1.3844086021505379e-05, + "loss": 0.1711, + "step": 229 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 2.3634390830993652, + "learning_rate": 1.381720430107527e-05, + "loss": 0.1347, + "step": 230 + }, + { + "epoch": 0.9314516129032258, + "grad_norm": 3.2036492824554443, + "learning_rate": 1.3790322580645164e-05, + "loss": 0.1348, + "step": 231 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 2.362253427505493, + "learning_rate": 1.3763440860215056e-05, + "loss": 0.1623, + "step": 232 + }, + { + "epoch": 0.9395161290322581, + "grad_norm": 4.0349860191345215, + "learning_rate": 1.3736559139784945e-05, + "loss": 0.1421, + "step": 233 + }, + { + "epoch": 0.9435483870967742, + "grad_norm": 2.3942513465881348, + "learning_rate": 1.3709677419354839e-05, + "loss": 0.1678, + "step": 234 + }, + { + "epoch": 0.9475806451612904, + "grad_norm": 2.3485310077667236, + "learning_rate": 1.3682795698924732e-05, + "loss": 0.1327, + "step": 235 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 1.8485794067382812, + "learning_rate": 1.3655913978494624e-05, + "loss": 0.0943, + "step": 236 + }, + { + "epoch": 0.9556451612903226, + "grad_norm": 2.820059299468994, + "learning_rate": 1.3629032258064517e-05, + "loss": 0.1664, + "step": 237 + }, + { + "epoch": 0.9596774193548387, + "grad_norm": 3.143728256225586, + "learning_rate": 1.3602150537634409e-05, + "loss": 0.2003, + "step": 238 + }, + { + "epoch": 0.9637096774193549, + "grad_norm": 3.496140241622925, + "learning_rate": 1.3575268817204302e-05, + "loss": 0.1833, + "step": 239 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 2.736649751663208, + "learning_rate": 1.3548387096774194e-05, + "loss": 0.1505, + "step": 240 + }, + { + "epoch": 0.9717741935483871, + "grad_norm": 2.9205009937286377, + "learning_rate": 1.3521505376344087e-05, + "loss": 0.22, + "step": 241 + }, + { + "epoch": 0.9758064516129032, + "grad_norm": 2.8891749382019043, + "learning_rate": 1.3494623655913978e-05, + "loss": 0.1415, + "step": 242 + }, + { + "epoch": 0.9798387096774194, + "grad_norm": 2.7234678268432617, + "learning_rate": 1.3467741935483872e-05, + "loss": 0.1112, + "step": 243 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 2.395036220550537, + "learning_rate": 1.3440860215053763e-05, + "loss": 0.1288, + "step": 244 + }, + { + "epoch": 0.9879032258064516, + "grad_norm": 3.214719772338867, + "learning_rate": 1.3413978494623657e-05, + "loss": 0.1637, + "step": 245 + }, + { + "epoch": 0.9919354838709677, + "grad_norm": 3.2017691135406494, + "learning_rate": 1.3387096774193548e-05, + "loss": 0.1614, + "step": 246 + }, + { + "epoch": 0.9959677419354839, + "grad_norm": 2.6992430686950684, + "learning_rate": 1.3360215053763442e-05, + "loss": 0.125, + "step": 247 + }, + { + "epoch": 1.0, + "grad_norm": 3.5830893516540527, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.1237, + "step": 248 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.7306818181818182, + "eval_f1": 0.8891346419684821, + "eval_loss": 0.1686243861913681, + "eval_runtime": 2.375, + "eval_samples_per_second": 370.523, + "eval_steps_per_second": 11.789, + "step": 248 + }, + { + "epoch": 1.0040322580645162, + "grad_norm": 3.1516823768615723, + "learning_rate": 1.3306451612903227e-05, + "loss": 0.2192, + "step": 249 + }, + { + "epoch": 1.0080645161290323, + "grad_norm": 3.530272960662842, + "learning_rate": 1.3279569892473118e-05, + "loss": 0.2153, + "step": 250 + }, + { + "epoch": 1.0120967741935485, + "grad_norm": 1.239799976348877, + "learning_rate": 1.3252688172043012e-05, + "loss": 0.1496, + "step": 251 + }, + { + "epoch": 1.0161290322580645, + "grad_norm": 1.6457215547561646, + "learning_rate": 1.3225806451612903e-05, + "loss": 0.1542, + "step": 252 + }, + { + "epoch": 1.0201612903225807, + "grad_norm": 3.6155178546905518, + "learning_rate": 1.3198924731182796e-05, + "loss": 0.2078, + "step": 253 + }, + { + "epoch": 1.0241935483870968, + "grad_norm": 1.6368227005004883, + "learning_rate": 1.3172043010752688e-05, + "loss": 0.1271, + "step": 254 + }, + { + "epoch": 1.028225806451613, + "grad_norm": 3.2075986862182617, + "learning_rate": 1.3145161290322581e-05, + "loss": 0.2012, + "step": 255 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 2.500032901763916, + "learning_rate": 1.3118279569892473e-05, + "loss": 0.1093, + "step": 256 + }, + { + "epoch": 1.0362903225806452, + "grad_norm": 2.440420150756836, + "learning_rate": 1.3091397849462366e-05, + "loss": 0.1072, + "step": 257 + }, + { + "epoch": 1.0403225806451613, + "grad_norm": 3.545008659362793, + "learning_rate": 1.3064516129032258e-05, + "loss": 0.1596, + "step": 258 + }, + { + "epoch": 1.0443548387096775, + "grad_norm": 3.0299694538116455, + "learning_rate": 1.3037634408602151e-05, + "loss": 0.1784, + "step": 259 + }, + { + "epoch": 1.0483870967741935, + "grad_norm": 1.5560317039489746, + "learning_rate": 1.3010752688172043e-05, + "loss": 0.1254, + "step": 260 + }, + { + "epoch": 1.0524193548387097, + "grad_norm": 3.661555528640747, + "learning_rate": 1.2983870967741936e-05, + "loss": 0.1665, + "step": 261 + }, + { + "epoch": 1.0564516129032258, + "grad_norm": 2.275700092315674, + "learning_rate": 1.2956989247311828e-05, + "loss": 0.113, + "step": 262 + }, + { + "epoch": 1.060483870967742, + "grad_norm": 0.8315228223800659, + "learning_rate": 1.2930107526881721e-05, + "loss": 0.1173, + "step": 263 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 1.0392459630966187, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.0891, + "step": 264 + }, + { + "epoch": 1.0685483870967742, + "grad_norm": 1.7290191650390625, + "learning_rate": 1.2876344086021506e-05, + "loss": 0.1432, + "step": 265 + }, + { + "epoch": 1.0725806451612903, + "grad_norm": 4.252922058105469, + "learning_rate": 1.28494623655914e-05, + "loss": 0.1772, + "step": 266 + }, + { + "epoch": 1.0766129032258065, + "grad_norm": 3.656017303466797, + "learning_rate": 1.2822580645161291e-05, + "loss": 0.1832, + "step": 267 + }, + { + "epoch": 1.0806451612903225, + "grad_norm": 8.33147144317627, + "learning_rate": 1.2795698924731184e-05, + "loss": 0.219, + "step": 268 + }, + { + "epoch": 1.0846774193548387, + "grad_norm": 2.9899277687072754, + "learning_rate": 1.2768817204301076e-05, + "loss": 0.1716, + "step": 269 + }, + { + "epoch": 1.0887096774193548, + "grad_norm": 1.3610584735870361, + "learning_rate": 1.274193548387097e-05, + "loss": 0.145, + "step": 270 + }, + { + "epoch": 1.092741935483871, + "grad_norm": 3.155823230743408, + "learning_rate": 1.2715053763440861e-05, + "loss": 0.1671, + "step": 271 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 2.408665657043457, + "learning_rate": 1.2688172043010754e-05, + "loss": 0.1151, + "step": 272 + }, + { + "epoch": 1.1008064516129032, + "grad_norm": 1.192688226699829, + "learning_rate": 1.2661290322580646e-05, + "loss": 0.1489, + "step": 273 + }, + { + "epoch": 1.1048387096774193, + "grad_norm": 2.6764822006225586, + "learning_rate": 1.2634408602150539e-05, + "loss": 0.1593, + "step": 274 + }, + { + "epoch": 1.1088709677419355, + "grad_norm": 4.13783073425293, + "learning_rate": 1.260752688172043e-05, + "loss": 0.1911, + "step": 275 + }, + { + "epoch": 1.1129032258064515, + "grad_norm": 1.4944156408309937, + "learning_rate": 1.2580645161290324e-05, + "loss": 0.2192, + "step": 276 + }, + { + "epoch": 1.1169354838709677, + "grad_norm": 1.162097692489624, + "learning_rate": 1.2553763440860216e-05, + "loss": 0.1277, + "step": 277 + }, + { + "epoch": 1.120967741935484, + "grad_norm": 2.7152187824249268, + "learning_rate": 1.2526881720430109e-05, + "loss": 0.2191, + "step": 278 + }, + { + "epoch": 1.125, + "grad_norm": 2.895779609680176, + "learning_rate": 1.25e-05, + "loss": 0.1696, + "step": 279 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 2.1827454566955566, + "learning_rate": 1.2473118279569894e-05, + "loss": 0.0868, + "step": 280 + }, + { + "epoch": 1.1330645161290323, + "grad_norm": 2.273141860961914, + "learning_rate": 1.2446236559139786e-05, + "loss": 0.214, + "step": 281 + }, + { + "epoch": 1.1370967741935485, + "grad_norm": 1.9680695533752441, + "learning_rate": 1.2419354838709679e-05, + "loss": 0.1716, + "step": 282 + }, + { + "epoch": 1.1411290322580645, + "grad_norm": 1.9214951992034912, + "learning_rate": 1.239247311827957e-05, + "loss": 0.1965, + "step": 283 + }, + { + "epoch": 1.1451612903225807, + "grad_norm": 5.198123931884766, + "learning_rate": 1.2365591397849464e-05, + "loss": 0.1683, + "step": 284 + }, + { + "epoch": 1.1491935483870968, + "grad_norm": 1.6851317882537842, + "learning_rate": 1.2338709677419355e-05, + "loss": 0.1698, + "step": 285 + }, + { + "epoch": 1.153225806451613, + "grad_norm": 2.037721633911133, + "learning_rate": 1.2311827956989249e-05, + "loss": 0.1484, + "step": 286 + }, + { + "epoch": 1.157258064516129, + "grad_norm": 1.9201178550720215, + "learning_rate": 1.228494623655914e-05, + "loss": 0.1128, + "step": 287 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 2.517728090286255, + "learning_rate": 1.2258064516129034e-05, + "loss": 0.1865, + "step": 288 + }, + { + "epoch": 1.1653225806451613, + "grad_norm": 2.0093541145324707, + "learning_rate": 1.2231182795698925e-05, + "loss": 0.1765, + "step": 289 + }, + { + "epoch": 1.1693548387096775, + "grad_norm": 2.9235808849334717, + "learning_rate": 1.2204301075268819e-05, + "loss": 0.2289, + "step": 290 + }, + { + "epoch": 1.1733870967741935, + "grad_norm": 2.3604586124420166, + "learning_rate": 1.217741935483871e-05, + "loss": 0.1056, + "step": 291 + }, + { + "epoch": 1.1774193548387097, + "grad_norm": 2.574960947036743, + "learning_rate": 1.2150537634408604e-05, + "loss": 0.1599, + "step": 292 + }, + { + "epoch": 1.1814516129032258, + "grad_norm": 3.23299241065979, + "learning_rate": 1.2123655913978495e-05, + "loss": 0.0828, + "step": 293 + }, + { + "epoch": 1.185483870967742, + "grad_norm": 3.568268299102783, + "learning_rate": 1.2096774193548388e-05, + "loss": 0.168, + "step": 294 + }, + { + "epoch": 1.189516129032258, + "grad_norm": 1.8116379976272583, + "learning_rate": 1.2069892473118282e-05, + "loss": 0.1233, + "step": 295 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 2.839566469192505, + "learning_rate": 1.2043010752688173e-05, + "loss": 0.1286, + "step": 296 + }, + { + "epoch": 1.1975806451612903, + "grad_norm": 1.3624751567840576, + "learning_rate": 1.2016129032258067e-05, + "loss": 0.1815, + "step": 297 + }, + { + "epoch": 1.2016129032258065, + "grad_norm": 1.082029938697815, + "learning_rate": 1.1989247311827958e-05, + "loss": 0.1486, + "step": 298 + }, + { + "epoch": 1.2056451612903225, + "grad_norm": 1.6093990802764893, + "learning_rate": 1.1962365591397852e-05, + "loss": 0.176, + "step": 299 + }, + { + "epoch": 1.2096774193548387, + "grad_norm": 1.5934983491897583, + "learning_rate": 1.1935483870967743e-05, + "loss": 0.1167, + "step": 300 + }, + { + "epoch": 1.2137096774193548, + "grad_norm": 1.5063905715942383, + "learning_rate": 1.1908602150537637e-05, + "loss": 0.1091, + "step": 301 + }, + { + "epoch": 1.217741935483871, + "grad_norm": 2.482288122177124, + "learning_rate": 1.1881720430107528e-05, + "loss": 0.171, + "step": 302 + }, + { + "epoch": 1.221774193548387, + "grad_norm": 2.164362907409668, + "learning_rate": 1.1854838709677421e-05, + "loss": 0.1996, + "step": 303 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 1.5960315465927124, + "learning_rate": 1.1827956989247313e-05, + "loss": 0.1211, + "step": 304 + }, + { + "epoch": 1.2298387096774193, + "grad_norm": 3.244338274002075, + "learning_rate": 1.1801075268817206e-05, + "loss": 0.1378, + "step": 305 + }, + { + "epoch": 1.2338709677419355, + "grad_norm": 1.7096904516220093, + "learning_rate": 1.1774193548387098e-05, + "loss": 0.1706, + "step": 306 + }, + { + "epoch": 1.2379032258064515, + "grad_norm": 1.980499505996704, + "learning_rate": 1.1747311827956991e-05, + "loss": 0.1233, + "step": 307 + }, + { + "epoch": 1.2419354838709677, + "grad_norm": 1.3346567153930664, + "learning_rate": 1.1720430107526883e-05, + "loss": 0.1496, + "step": 308 + }, + { + "epoch": 1.245967741935484, + "grad_norm": 1.437386393547058, + "learning_rate": 1.1693548387096776e-05, + "loss": 0.1306, + "step": 309 + }, + { + "epoch": 1.25, + "grad_norm": 1.8189259767532349, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.1579, + "step": 310 + }, + { + "epoch": 1.254032258064516, + "grad_norm": 2.060695171356201, + "learning_rate": 1.1639784946236561e-05, + "loss": 0.1289, + "step": 311 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 2.4348952770233154, + "learning_rate": 1.1612903225806453e-05, + "loss": 0.1002, + "step": 312 + }, + { + "epoch": 1.2620967741935485, + "grad_norm": 1.843311071395874, + "learning_rate": 1.1586021505376346e-05, + "loss": 0.1457, + "step": 313 + }, + { + "epoch": 1.2661290322580645, + "grad_norm": 2.4911584854125977, + "learning_rate": 1.1559139784946238e-05, + "loss": 0.1832, + "step": 314 + }, + { + "epoch": 1.2701612903225805, + "grad_norm": 4.787015438079834, + "learning_rate": 1.1532258064516131e-05, + "loss": 0.1762, + "step": 315 + }, + { + "epoch": 1.2741935483870968, + "grad_norm": 2.0192394256591797, + "learning_rate": 1.1505376344086023e-05, + "loss": 0.1286, + "step": 316 + }, + { + "epoch": 1.278225806451613, + "grad_norm": 0.9655114412307739, + "learning_rate": 1.1478494623655916e-05, + "loss": 0.0727, + "step": 317 + }, + { + "epoch": 1.282258064516129, + "grad_norm": 1.3966825008392334, + "learning_rate": 1.1451612903225808e-05, + "loss": 0.1365, + "step": 318 + }, + { + "epoch": 1.2862903225806452, + "grad_norm": 1.7151025533676147, + "learning_rate": 1.1424731182795701e-05, + "loss": 0.1583, + "step": 319 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 2.77651309967041, + "learning_rate": 1.1397849462365593e-05, + "loss": 0.0976, + "step": 320 + }, + { + "epoch": 1.2943548387096775, + "grad_norm": 1.6842812299728394, + "learning_rate": 1.1370967741935486e-05, + "loss": 0.0929, + "step": 321 + }, + { + "epoch": 1.2983870967741935, + "grad_norm": 1.3794842958450317, + "learning_rate": 1.1344086021505377e-05, + "loss": 0.2264, + "step": 322 + }, + { + "epoch": 1.3024193548387097, + "grad_norm": 1.75082266330719, + "learning_rate": 1.131720430107527e-05, + "loss": 0.1209, + "step": 323 + }, + { + "epoch": 1.3064516129032258, + "grad_norm": 3.0138330459594727, + "learning_rate": 1.1290322580645164e-05, + "loss": 0.1394, + "step": 324 + }, + { + "epoch": 1.310483870967742, + "grad_norm": 1.3630236387252808, + "learning_rate": 1.1263440860215056e-05, + "loss": 0.0678, + "step": 325 + }, + { + "epoch": 1.314516129032258, + "grad_norm": 3.041550636291504, + "learning_rate": 1.1236559139784946e-05, + "loss": 0.2057, + "step": 326 + }, + { + "epoch": 1.3185483870967742, + "grad_norm": 1.457026720046997, + "learning_rate": 1.1209677419354839e-05, + "loss": 0.1104, + "step": 327 + }, + { + "epoch": 1.3225806451612903, + "grad_norm": 2.0328333377838135, + "learning_rate": 1.118279569892473e-05, + "loss": 0.13, + "step": 328 + }, + { + "epoch": 1.3266129032258065, + "grad_norm": 1.102391242980957, + "learning_rate": 1.1155913978494624e-05, + "loss": 0.1015, + "step": 329 + }, + { + "epoch": 1.3306451612903225, + "grad_norm": 1.9321506023406982, + "learning_rate": 1.1129032258064516e-05, + "loss": 0.085, + "step": 330 + }, + { + "epoch": 1.3346774193548387, + "grad_norm": 1.9749724864959717, + "learning_rate": 1.1102150537634409e-05, + "loss": 0.1343, + "step": 331 + }, + { + "epoch": 1.3387096774193548, + "grad_norm": 2.8448033332824707, + "learning_rate": 1.10752688172043e-05, + "loss": 0.1639, + "step": 332 + }, + { + "epoch": 1.342741935483871, + "grad_norm": 1.1829770803451538, + "learning_rate": 1.1048387096774194e-05, + "loss": 0.0986, + "step": 333 + }, + { + "epoch": 1.346774193548387, + "grad_norm": 3.9288711547851562, + "learning_rate": 1.1021505376344085e-05, + "loss": 0.1465, + "step": 334 + }, + { + "epoch": 1.3508064516129032, + "grad_norm": 2.407360553741455, + "learning_rate": 1.0994623655913979e-05, + "loss": 0.1261, + "step": 335 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 3.4650471210479736, + "learning_rate": 1.096774193548387e-05, + "loss": 0.1828, + "step": 336 + }, + { + "epoch": 1.3588709677419355, + "grad_norm": 2.9928531646728516, + "learning_rate": 1.0940860215053764e-05, + "loss": 0.1139, + "step": 337 + }, + { + "epoch": 1.3629032258064515, + "grad_norm": 4.049187183380127, + "learning_rate": 1.0913978494623655e-05, + "loss": 0.1622, + "step": 338 + }, + { + "epoch": 1.3669354838709677, + "grad_norm": 3.758409023284912, + "learning_rate": 1.0887096774193549e-05, + "loss": 0.1058, + "step": 339 + }, + { + "epoch": 1.370967741935484, + "grad_norm": 3.3022146224975586, + "learning_rate": 1.086021505376344e-05, + "loss": 0.1414, + "step": 340 + }, + { + "epoch": 1.375, + "grad_norm": 6.569581985473633, + "learning_rate": 1.0833333333333334e-05, + "loss": 0.2535, + "step": 341 + }, + { + "epoch": 1.379032258064516, + "grad_norm": 1.4087936878204346, + "learning_rate": 1.0806451612903225e-05, + "loss": 0.1063, + "step": 342 + }, + { + "epoch": 1.3830645161290323, + "grad_norm": 1.7777235507965088, + "learning_rate": 1.0779569892473118e-05, + "loss": 0.1408, + "step": 343 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 1.5745316743850708, + "learning_rate": 1.0752688172043012e-05, + "loss": 0.0839, + "step": 344 + }, + { + "epoch": 1.3911290322580645, + "grad_norm": 3.317399024963379, + "learning_rate": 1.0725806451612903e-05, + "loss": 0.1597, + "step": 345 + }, + { + "epoch": 1.3951612903225805, + "grad_norm": 2.1819188594818115, + "learning_rate": 1.0698924731182797e-05, + "loss": 0.0801, + "step": 346 + }, + { + "epoch": 1.3991935483870968, + "grad_norm": 3.0169897079467773, + "learning_rate": 1.0672043010752688e-05, + "loss": 0.1419, + "step": 347 + }, + { + "epoch": 1.403225806451613, + "grad_norm": 2.023202419281006, + "learning_rate": 1.0645161290322582e-05, + "loss": 0.1038, + "step": 348 + }, + { + "epoch": 1.407258064516129, + "grad_norm": 1.4076378345489502, + "learning_rate": 1.0618279569892473e-05, + "loss": 0.1048, + "step": 349 + }, + { + "epoch": 1.4112903225806452, + "grad_norm": 2.562121629714966, + "learning_rate": 1.0591397849462367e-05, + "loss": 0.2181, + "step": 350 + }, + { + "epoch": 1.4153225806451613, + "grad_norm": 2.308161973953247, + "learning_rate": 1.0564516129032258e-05, + "loss": 0.1669, + "step": 351 + }, + { + "epoch": 1.4193548387096775, + "grad_norm": 1.25361168384552, + "learning_rate": 1.0537634408602151e-05, + "loss": 0.0966, + "step": 352 + }, + { + "epoch": 1.4233870967741935, + "grad_norm": 4.17478609085083, + "learning_rate": 1.0510752688172043e-05, + "loss": 0.169, + "step": 353 + }, + { + "epoch": 1.4274193548387097, + "grad_norm": 3.2306907176971436, + "learning_rate": 1.0483870967741936e-05, + "loss": 0.1613, + "step": 354 + }, + { + "epoch": 1.4314516129032258, + "grad_norm": 4.039083957672119, + "learning_rate": 1.0456989247311828e-05, + "loss": 0.334, + "step": 355 + }, + { + "epoch": 1.435483870967742, + "grad_norm": 3.141918182373047, + "learning_rate": 1.0430107526881721e-05, + "loss": 0.1727, + "step": 356 + }, + { + "epoch": 1.439516129032258, + "grad_norm": 4.222794532775879, + "learning_rate": 1.0403225806451613e-05, + "loss": 0.1147, + "step": 357 + }, + { + "epoch": 1.4435483870967742, + "grad_norm": 1.3107784986495972, + "learning_rate": 1.0376344086021506e-05, + "loss": 0.0728, + "step": 358 + }, + { + "epoch": 1.4475806451612903, + "grad_norm": 5.24909782409668, + "learning_rate": 1.0349462365591398e-05, + "loss": 0.2182, + "step": 359 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 4.10903787612915, + "learning_rate": 1.0322580645161291e-05, + "loss": 0.1699, + "step": 360 + }, + { + "epoch": 1.4556451612903225, + "grad_norm": 2.6097729206085205, + "learning_rate": 1.0295698924731183e-05, + "loss": 0.1172, + "step": 361 + }, + { + "epoch": 1.4596774193548387, + "grad_norm": 2.7892961502075195, + "learning_rate": 1.0268817204301076e-05, + "loss": 0.1382, + "step": 362 + }, + { + "epoch": 1.4637096774193548, + "grad_norm": 4.481143474578857, + "learning_rate": 1.0241935483870968e-05, + "loss": 0.171, + "step": 363 + }, + { + "epoch": 1.467741935483871, + "grad_norm": 2.8739893436431885, + "learning_rate": 1.0215053763440861e-05, + "loss": 0.0895, + "step": 364 + }, + { + "epoch": 1.471774193548387, + "grad_norm": 2.5068511962890625, + "learning_rate": 1.0188172043010753e-05, + "loss": 0.1144, + "step": 365 + }, + { + "epoch": 1.4758064516129032, + "grad_norm": 2.39939284324646, + "learning_rate": 1.0161290322580646e-05, + "loss": 0.1195, + "step": 366 + }, + { + "epoch": 1.4798387096774195, + "grad_norm": 2.4179749488830566, + "learning_rate": 1.0134408602150538e-05, + "loss": 0.1791, + "step": 367 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 1.944332480430603, + "learning_rate": 1.0107526881720431e-05, + "loss": 0.1656, + "step": 368 + }, + { + "epoch": 1.4879032258064515, + "grad_norm": 4.383450031280518, + "learning_rate": 1.0080645161290323e-05, + "loss": 0.1545, + "step": 369 + }, + { + "epoch": 1.4919354838709677, + "grad_norm": 1.622109293937683, + "learning_rate": 1.0053763440860216e-05, + "loss": 0.0967, + "step": 370 + }, + { + "epoch": 1.495967741935484, + "grad_norm": 1.5308200120925903, + "learning_rate": 1.0026881720430108e-05, + "loss": 0.0721, + "step": 371 + }, + { + "epoch": 1.5, + "grad_norm": 2.9791924953460693, + "learning_rate": 1e-05, + "loss": 0.1107, + "step": 372 + }, + { + "epoch": 1.504032258064516, + "grad_norm": 2.284703254699707, + "learning_rate": 9.973118279569894e-06, + "loss": 0.1647, + "step": 373 + }, + { + "epoch": 1.5080645161290323, + "grad_norm": 3.273378849029541, + "learning_rate": 9.946236559139786e-06, + "loss": 0.1238, + "step": 374 + }, + { + "epoch": 1.5120967741935485, + "grad_norm": 1.6005984544754028, + "learning_rate": 9.919354838709679e-06, + "loss": 0.1222, + "step": 375 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 3.7144405841827393, + "learning_rate": 9.89247311827957e-06, + "loss": 0.1352, + "step": 376 + }, + { + "epoch": 1.5201612903225805, + "grad_norm": 2.2433135509490967, + "learning_rate": 9.865591397849464e-06, + "loss": 0.1353, + "step": 377 + }, + { + "epoch": 1.5241935483870968, + "grad_norm": 1.9060373306274414, + "learning_rate": 9.838709677419356e-06, + "loss": 0.0938, + "step": 378 + }, + { + "epoch": 1.528225806451613, + "grad_norm": 2.5664989948272705, + "learning_rate": 9.811827956989249e-06, + "loss": 0.1925, + "step": 379 + }, + { + "epoch": 1.532258064516129, + "grad_norm": 3.4527804851531982, + "learning_rate": 9.78494623655914e-06, + "loss": 0.17, + "step": 380 + }, + { + "epoch": 1.536290322580645, + "grad_norm": 1.0636487007141113, + "learning_rate": 9.758064516129034e-06, + "loss": 0.0896, + "step": 381 + }, + { + "epoch": 1.5403225806451613, + "grad_norm": 2.5372161865234375, + "learning_rate": 9.731182795698925e-06, + "loss": 0.2065, + "step": 382 + }, + { + "epoch": 1.5443548387096775, + "grad_norm": 3.2576494216918945, + "learning_rate": 9.704301075268819e-06, + "loss": 0.1855, + "step": 383 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 3.3559024333953857, + "learning_rate": 9.67741935483871e-06, + "loss": 0.2192, + "step": 384 + }, + { + "epoch": 1.5524193548387095, + "grad_norm": 3.2494945526123047, + "learning_rate": 9.650537634408604e-06, + "loss": 0.1617, + "step": 385 + }, + { + "epoch": 1.5564516129032258, + "grad_norm": 2.113792896270752, + "learning_rate": 9.623655913978495e-06, + "loss": 0.187, + "step": 386 + }, + { + "epoch": 1.560483870967742, + "grad_norm": 1.530720829963684, + "learning_rate": 9.596774193548389e-06, + "loss": 0.0767, + "step": 387 + }, + { + "epoch": 1.564516129032258, + "grad_norm": 1.9626264572143555, + "learning_rate": 9.56989247311828e-06, + "loss": 0.1725, + "step": 388 + }, + { + "epoch": 1.5685483870967742, + "grad_norm": 1.6587375402450562, + "learning_rate": 9.543010752688174e-06, + "loss": 0.0998, + "step": 389 + }, + { + "epoch": 1.5725806451612905, + "grad_norm": 1.3230797052383423, + "learning_rate": 9.516129032258065e-06, + "loss": 0.1522, + "step": 390 + }, + { + "epoch": 1.5766129032258065, + "grad_norm": 2.594186305999756, + "learning_rate": 9.489247311827959e-06, + "loss": 0.2221, + "step": 391 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 3.0997118949890137, + "learning_rate": 9.46236559139785e-06, + "loss": 0.1009, + "step": 392 + }, + { + "epoch": 1.5846774193548387, + "grad_norm": 4.0840325355529785, + "learning_rate": 9.435483870967743e-06, + "loss": 0.1231, + "step": 393 + }, + { + "epoch": 1.588709677419355, + "grad_norm": 3.59895658493042, + "learning_rate": 9.408602150537635e-06, + "loss": 0.1296, + "step": 394 + }, + { + "epoch": 1.592741935483871, + "grad_norm": 1.21365487575531, + "learning_rate": 9.381720430107528e-06, + "loss": 0.1062, + "step": 395 + }, + { + "epoch": 1.596774193548387, + "grad_norm": 2.8088643550872803, + "learning_rate": 9.35483870967742e-06, + "loss": 0.1877, + "step": 396 + }, + { + "epoch": 1.6008064516129032, + "grad_norm": 2.218878746032715, + "learning_rate": 9.327956989247312e-06, + "loss": 0.1258, + "step": 397 + }, + { + "epoch": 1.6048387096774195, + "grad_norm": 1.3823351860046387, + "learning_rate": 9.301075268817205e-06, + "loss": 0.1048, + "step": 398 + }, + { + "epoch": 1.6088709677419355, + "grad_norm": 2.248091220855713, + "learning_rate": 9.274193548387097e-06, + "loss": 0.2004, + "step": 399 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 2.475921392440796, + "learning_rate": 9.24731182795699e-06, + "loss": 0.1216, + "step": 400 + }, + { + "epoch": 1.6169354838709677, + "grad_norm": 1.0884982347488403, + "learning_rate": 9.220430107526881e-06, + "loss": 0.099, + "step": 401 + }, + { + "epoch": 1.620967741935484, + "grad_norm": 2.633883237838745, + "learning_rate": 9.193548387096775e-06, + "loss": 0.1015, + "step": 402 + }, + { + "epoch": 1.625, + "grad_norm": 1.0618445873260498, + "learning_rate": 9.166666666666666e-06, + "loss": 0.1017, + "step": 403 + }, + { + "epoch": 1.629032258064516, + "grad_norm": 6.406484127044678, + "learning_rate": 9.13978494623656e-06, + "loss": 0.159, + "step": 404 + }, + { + "epoch": 1.6330645161290323, + "grad_norm": 1.9261949062347412, + "learning_rate": 9.112903225806451e-06, + "loss": 0.1234, + "step": 405 + }, + { + "epoch": 1.6370967741935485, + "grad_norm": 4.344938278198242, + "learning_rate": 9.086021505376345e-06, + "loss": 0.1519, + "step": 406 + }, + { + "epoch": 1.6411290322580645, + "grad_norm": 1.627624273300171, + "learning_rate": 9.059139784946236e-06, + "loss": 0.1154, + "step": 407 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 1.8751394748687744, + "learning_rate": 9.03225806451613e-06, + "loss": 0.1693, + "step": 408 + }, + { + "epoch": 1.6491935483870968, + "grad_norm": 2.318906784057617, + "learning_rate": 9.005376344086021e-06, + "loss": 0.1491, + "step": 409 + }, + { + "epoch": 1.653225806451613, + "grad_norm": 5.5316853523254395, + "learning_rate": 8.978494623655915e-06, + "loss": 0.2173, + "step": 410 + }, + { + "epoch": 1.657258064516129, + "grad_norm": 2.1360862255096436, + "learning_rate": 8.951612903225806e-06, + "loss": 0.1482, + "step": 411 + }, + { + "epoch": 1.661290322580645, + "grad_norm": 1.136660099029541, + "learning_rate": 8.9247311827957e-06, + "loss": 0.1142, + "step": 412 + }, + { + "epoch": 1.6653225806451613, + "grad_norm": 5.797788143157959, + "learning_rate": 8.897849462365593e-06, + "loss": 0.1382, + "step": 413 + }, + { + "epoch": 1.6693548387096775, + "grad_norm": 4.6974287033081055, + "learning_rate": 8.870967741935484e-06, + "loss": 0.1369, + "step": 414 + }, + { + "epoch": 1.6733870967741935, + "grad_norm": 7.289107799530029, + "learning_rate": 8.844086021505378e-06, + "loss": 0.2441, + "step": 415 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 4.214983940124512, + "learning_rate": 8.81720430107527e-06, + "loss": 0.213, + "step": 416 + }, + { + "epoch": 1.6814516129032258, + "grad_norm": 3.4454896450042725, + "learning_rate": 8.790322580645163e-06, + "loss": 0.0803, + "step": 417 + }, + { + "epoch": 1.685483870967742, + "grad_norm": 2.8119094371795654, + "learning_rate": 8.763440860215054e-06, + "loss": 0.1307, + "step": 418 + }, + { + "epoch": 1.689516129032258, + "grad_norm": 3.920039653778076, + "learning_rate": 8.736559139784948e-06, + "loss": 0.1729, + "step": 419 + }, + { + "epoch": 1.6935483870967742, + "grad_norm": 4.052835941314697, + "learning_rate": 8.70967741935484e-06, + "loss": 0.0988, + "step": 420 + }, + { + "epoch": 1.6975806451612905, + "grad_norm": 2.297703742980957, + "learning_rate": 8.682795698924733e-06, + "loss": 0.1176, + "step": 421 + }, + { + "epoch": 1.7016129032258065, + "grad_norm": 2.6884663105010986, + "learning_rate": 8.655913978494624e-06, + "loss": 0.1113, + "step": 422 + }, + { + "epoch": 1.7056451612903225, + "grad_norm": 2.510374069213867, + "learning_rate": 8.629032258064517e-06, + "loss": 0.1428, + "step": 423 + }, + { + "epoch": 1.7096774193548387, + "grad_norm": 1.734438419342041, + "learning_rate": 8.602150537634409e-06, + "loss": 0.0651, + "step": 424 + }, + { + "epoch": 1.713709677419355, + "grad_norm": 2.6852872371673584, + "learning_rate": 8.575268817204302e-06, + "loss": 0.1801, + "step": 425 + }, + { + "epoch": 1.717741935483871, + "grad_norm": 1.1802809238433838, + "learning_rate": 8.548387096774194e-06, + "loss": 0.1295, + "step": 426 + }, + { + "epoch": 1.721774193548387, + "grad_norm": 3.5888192653656006, + "learning_rate": 8.521505376344087e-06, + "loss": 0.1373, + "step": 427 + }, + { + "epoch": 1.7258064516129032, + "grad_norm": 3.463402271270752, + "learning_rate": 8.494623655913979e-06, + "loss": 0.104, + "step": 428 + }, + { + "epoch": 1.7298387096774195, + "grad_norm": 2.838108777999878, + "learning_rate": 8.467741935483872e-06, + "loss": 0.1541, + "step": 429 + }, + { + "epoch": 1.7338709677419355, + "grad_norm": 3.127835512161255, + "learning_rate": 8.440860215053764e-06, + "loss": 0.1103, + "step": 430 + }, + { + "epoch": 1.7379032258064515, + "grad_norm": 2.3887135982513428, + "learning_rate": 8.413978494623657e-06, + "loss": 0.1686, + "step": 431 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 2.5920352935791016, + "learning_rate": 8.387096774193549e-06, + "loss": 0.1187, + "step": 432 + }, + { + "epoch": 1.745967741935484, + "grad_norm": 4.120633125305176, + "learning_rate": 8.360215053763442e-06, + "loss": 0.1966, + "step": 433 + }, + { + "epoch": 1.75, + "grad_norm": 1.4372769594192505, + "learning_rate": 8.333333333333334e-06, + "loss": 0.1419, + "step": 434 + }, + { + "epoch": 1.754032258064516, + "grad_norm": 2.421602964401245, + "learning_rate": 8.306451612903227e-06, + "loss": 0.145, + "step": 435 + }, + { + "epoch": 1.7580645161290323, + "grad_norm": 2.807343006134033, + "learning_rate": 8.279569892473119e-06, + "loss": 0.0802, + "step": 436 + }, + { + "epoch": 1.7620967741935485, + "grad_norm": 2.304760217666626, + "learning_rate": 8.252688172043012e-06, + "loss": 0.1192, + "step": 437 + }, + { + "epoch": 1.7661290322580645, + "grad_norm": 2.381269693374634, + "learning_rate": 8.225806451612904e-06, + "loss": 0.1027, + "step": 438 + }, + { + "epoch": 1.7701612903225805, + "grad_norm": 2.1986842155456543, + "learning_rate": 8.198924731182797e-06, + "loss": 0.1012, + "step": 439 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 2.638580560684204, + "learning_rate": 8.172043010752689e-06, + "loss": 0.1746, + "step": 440 + }, + { + "epoch": 1.778225806451613, + "grad_norm": 4.079723358154297, + "learning_rate": 8.145161290322582e-06, + "loss": 0.1684, + "step": 441 + }, + { + "epoch": 1.782258064516129, + "grad_norm": 1.7967092990875244, + "learning_rate": 8.118279569892473e-06, + "loss": 0.2538, + "step": 442 + }, + { + "epoch": 1.786290322580645, + "grad_norm": 2.0538618564605713, + "learning_rate": 8.091397849462365e-06, + "loss": 0.1666, + "step": 443 + }, + { + "epoch": 1.7903225806451613, + "grad_norm": 3.8617451190948486, + "learning_rate": 8.064516129032258e-06, + "loss": 0.1725, + "step": 444 + }, + { + "epoch": 1.7943548387096775, + "grad_norm": 3.2343311309814453, + "learning_rate": 8.03763440860215e-06, + "loss": 0.1595, + "step": 445 + }, + { + "epoch": 1.7983870967741935, + "grad_norm": 2.797788619995117, + "learning_rate": 8.010752688172043e-06, + "loss": 0.0903, + "step": 446 + }, + { + "epoch": 1.8024193548387095, + "grad_norm": 3.4088759422302246, + "learning_rate": 7.983870967741935e-06, + "loss": 0.1113, + "step": 447 + }, + { + "epoch": 1.8064516129032258, + "grad_norm": 3.5541694164276123, + "learning_rate": 7.956989247311828e-06, + "loss": 0.1433, + "step": 448 + }, + { + "epoch": 1.810483870967742, + "grad_norm": 1.529510736465454, + "learning_rate": 7.93010752688172e-06, + "loss": 0.0728, + "step": 449 + }, + { + "epoch": 1.814516129032258, + "grad_norm": 1.9386626482009888, + "learning_rate": 7.903225806451613e-06, + "loss": 0.1112, + "step": 450 + }, + { + "epoch": 1.8185483870967742, + "grad_norm": 2.3068759441375732, + "learning_rate": 7.876344086021507e-06, + "loss": 0.0967, + "step": 451 + }, + { + "epoch": 1.8225806451612905, + "grad_norm": 2.4969186782836914, + "learning_rate": 7.849462365591398e-06, + "loss": 0.0896, + "step": 452 + }, + { + "epoch": 1.8266129032258065, + "grad_norm": 2.4641716480255127, + "learning_rate": 7.822580645161291e-06, + "loss": 0.1365, + "step": 453 + }, + { + "epoch": 1.8306451612903225, + "grad_norm": 4.361322402954102, + "learning_rate": 7.795698924731183e-06, + "loss": 0.1544, + "step": 454 + }, + { + "epoch": 1.8346774193548387, + "grad_norm": 1.1930088996887207, + "learning_rate": 7.768817204301076e-06, + "loss": 0.0941, + "step": 455 + }, + { + "epoch": 1.838709677419355, + "grad_norm": 1.9687577486038208, + "learning_rate": 7.741935483870968e-06, + "loss": 0.114, + "step": 456 + }, + { + "epoch": 1.842741935483871, + "grad_norm": 4.404200077056885, + "learning_rate": 7.715053763440861e-06, + "loss": 0.153, + "step": 457 + }, + { + "epoch": 1.846774193548387, + "grad_norm": 0.9312377572059631, + "learning_rate": 7.688172043010753e-06, + "loss": 0.1008, + "step": 458 + }, + { + "epoch": 1.8508064516129032, + "grad_norm": 2.732656717300415, + "learning_rate": 7.661290322580646e-06, + "loss": 0.0629, + "step": 459 + }, + { + "epoch": 1.8548387096774195, + "grad_norm": 1.351778268814087, + "learning_rate": 7.634408602150538e-06, + "loss": 0.0842, + "step": 460 + }, + { + "epoch": 1.8588709677419355, + "grad_norm": 1.4621678590774536, + "learning_rate": 7.60752688172043e-06, + "loss": 0.0929, + "step": 461 + }, + { + "epoch": 1.8629032258064515, + "grad_norm": 2.530639171600342, + "learning_rate": 7.580645161290323e-06, + "loss": 0.1128, + "step": 462 + }, + { + "epoch": 1.8669354838709677, + "grad_norm": 1.704332709312439, + "learning_rate": 7.553763440860215e-06, + "loss": 0.1268, + "step": 463 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 1.6047230958938599, + "learning_rate": 7.526881720430108e-06, + "loss": 0.1598, + "step": 464 + }, + { + "epoch": 1.875, + "grad_norm": 1.620526909828186, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0815, + "step": 465 + }, + { + "epoch": 1.879032258064516, + "grad_norm": 3.030583620071411, + "learning_rate": 7.4731182795698935e-06, + "loss": 0.1197, + "step": 466 + }, + { + "epoch": 1.8830645161290323, + "grad_norm": 2.4432520866394043, + "learning_rate": 7.446236559139786e-06, + "loss": 0.2071, + "step": 467 + }, + { + "epoch": 1.8870967741935485, + "grad_norm": 1.968050241470337, + "learning_rate": 7.4193548387096784e-06, + "loss": 0.0968, + "step": 468 + }, + { + "epoch": 1.8911290322580645, + "grad_norm": 1.4884843826293945, + "learning_rate": 7.392473118279571e-06, + "loss": 0.0825, + "step": 469 + }, + { + "epoch": 1.8951612903225805, + "grad_norm": 1.9214873313903809, + "learning_rate": 7.365591397849463e-06, + "loss": 0.1782, + "step": 470 + }, + { + "epoch": 1.8991935483870968, + "grad_norm": 2.7788476943969727, + "learning_rate": 7.338709677419356e-06, + "loss": 0.1283, + "step": 471 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 5.547091484069824, + "learning_rate": 7.311827956989248e-06, + "loss": 0.1202, + "step": 472 + }, + { + "epoch": 1.907258064516129, + "grad_norm": 1.0326168537139893, + "learning_rate": 7.284946236559141e-06, + "loss": 0.1248, + "step": 473 + }, + { + "epoch": 1.911290322580645, + "grad_norm": 2.0643792152404785, + "learning_rate": 7.258064516129033e-06, + "loss": 0.1846, + "step": 474 + }, + { + "epoch": 1.9153225806451613, + "grad_norm": 3.634772300720215, + "learning_rate": 7.231182795698926e-06, + "loss": 0.1574, + "step": 475 + }, + { + "epoch": 1.9193548387096775, + "grad_norm": 2.878852605819702, + "learning_rate": 7.204301075268818e-06, + "loss": 0.1066, + "step": 476 + }, + { + "epoch": 1.9233870967741935, + "grad_norm": 2.5109782218933105, + "learning_rate": 7.177419354838711e-06, + "loss": 0.1653, + "step": 477 + }, + { + "epoch": 1.9274193548387095, + "grad_norm": 3.136270761489868, + "learning_rate": 7.150537634408603e-06, + "loss": 0.1387, + "step": 478 + }, + { + "epoch": 1.9314516129032258, + "grad_norm": 1.636473536491394, + "learning_rate": 7.1236559139784956e-06, + "loss": 0.2813, + "step": 479 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 1.7131333351135254, + "learning_rate": 7.096774193548388e-06, + "loss": 0.0882, + "step": 480 + }, + { + "epoch": 1.939516129032258, + "grad_norm": 1.6337999105453491, + "learning_rate": 7.0698924731182805e-06, + "loss": 0.182, + "step": 481 + }, + { + "epoch": 1.9435483870967742, + "grad_norm": 3.1437745094299316, + "learning_rate": 7.043010752688173e-06, + "loss": 0.1109, + "step": 482 + }, + { + "epoch": 1.9475806451612905, + "grad_norm": 2.556929588317871, + "learning_rate": 7.0161290322580654e-06, + "loss": 0.1183, + "step": 483 + }, + { + "epoch": 1.9516129032258065, + "grad_norm": 1.5759491920471191, + "learning_rate": 6.989247311827958e-06, + "loss": 0.1137, + "step": 484 + }, + { + "epoch": 1.9556451612903225, + "grad_norm": 3.2312610149383545, + "learning_rate": 6.96236559139785e-06, + "loss": 0.1508, + "step": 485 + }, + { + "epoch": 1.9596774193548387, + "grad_norm": 1.5898686647415161, + "learning_rate": 6.935483870967743e-06, + "loss": 0.1276, + "step": 486 + }, + { + "epoch": 1.963709677419355, + "grad_norm": 2.8799052238464355, + "learning_rate": 6.908602150537635e-06, + "loss": 0.118, + "step": 487 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 3.8616013526916504, + "learning_rate": 6.881720430107528e-06, + "loss": 0.1884, + "step": 488 + }, + { + "epoch": 1.971774193548387, + "grad_norm": 1.7562378644943237, + "learning_rate": 6.854838709677419e-06, + "loss": 0.1048, + "step": 489 + }, + { + "epoch": 1.9758064516129032, + "grad_norm": 1.8255165815353394, + "learning_rate": 6.827956989247312e-06, + "loss": 0.1632, + "step": 490 + }, + { + "epoch": 1.9798387096774195, + "grad_norm": 1.126791000366211, + "learning_rate": 6.801075268817204e-06, + "loss": 0.1529, + "step": 491 + }, + { + "epoch": 1.9838709677419355, + "grad_norm": 3.804797649383545, + "learning_rate": 6.774193548387097e-06, + "loss": 0.1882, + "step": 492 + }, + { + "epoch": 1.9879032258064515, + "grad_norm": 2.6595122814178467, + "learning_rate": 6.747311827956989e-06, + "loss": 0.1564, + "step": 493 + }, + { + "epoch": 1.9919354838709677, + "grad_norm": 1.5479234457015991, + "learning_rate": 6.720430107526882e-06, + "loss": 0.1166, + "step": 494 + }, + { + "epoch": 1.995967741935484, + "grad_norm": 2.059999704360962, + "learning_rate": 6.693548387096774e-06, + "loss": 0.1269, + "step": 495 + }, + { + "epoch": 2.0, + "grad_norm": 3.3951058387756348, + "learning_rate": 6.666666666666667e-06, + "loss": 0.153, + "step": 496 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.7477272727272727, + "eval_f1": 0.8979367262723521, + "eval_loss": 0.1496548354625702, + "eval_runtime": 2.3658, + "eval_samples_per_second": 371.972, + "eval_steps_per_second": 11.835, + "step": 496 + }, + { + "epoch": 2.004032258064516, + "grad_norm": 1.5380765199661255, + "learning_rate": 6.639784946236559e-06, + "loss": 0.09, + "step": 497 + }, + { + "epoch": 2.0080645161290325, + "grad_norm": 2.340376615524292, + "learning_rate": 6.612903225806452e-06, + "loss": 0.1141, + "step": 498 + }, + { + "epoch": 2.0120967741935485, + "grad_norm": 1.2891623973846436, + "learning_rate": 6.586021505376344e-06, + "loss": 0.1402, + "step": 499 + }, + { + "epoch": 2.0161290322580645, + "grad_norm": 2.0554511547088623, + "learning_rate": 6.5591397849462365e-06, + "loss": 0.1812, + "step": 500 + }, + { + "epoch": 2.0201612903225805, + "grad_norm": 3.6273601055145264, + "learning_rate": 6.532258064516129e-06, + "loss": 0.1321, + "step": 501 + }, + { + "epoch": 2.024193548387097, + "grad_norm": 2.3132970333099365, + "learning_rate": 6.5053763440860214e-06, + "loss": 0.1039, + "step": 502 + }, + { + "epoch": 2.028225806451613, + "grad_norm": 1.4552321434020996, + "learning_rate": 6.478494623655914e-06, + "loss": 0.1449, + "step": 503 + }, + { + "epoch": 2.032258064516129, + "grad_norm": 2.3734288215637207, + "learning_rate": 6.451612903225806e-06, + "loss": 0.1196, + "step": 504 + }, + { + "epoch": 2.036290322580645, + "grad_norm": 3.308845043182373, + "learning_rate": 6.4247311827957e-06, + "loss": 0.1678, + "step": 505 + }, + { + "epoch": 2.0403225806451615, + "grad_norm": 2.2413101196289062, + "learning_rate": 6.397849462365592e-06, + "loss": 0.1195, + "step": 506 + }, + { + "epoch": 2.0443548387096775, + "grad_norm": 2.4938807487487793, + "learning_rate": 6.370967741935485e-06, + "loss": 0.1344, + "step": 507 + }, + { + "epoch": 2.0483870967741935, + "grad_norm": 1.7536572217941284, + "learning_rate": 6.344086021505377e-06, + "loss": 0.1037, + "step": 508 + }, + { + "epoch": 2.0524193548387095, + "grad_norm": 5.4296345710754395, + "learning_rate": 6.3172043010752696e-06, + "loss": 0.1651, + "step": 509 + }, + { + "epoch": 2.056451612903226, + "grad_norm": 1.4947242736816406, + "learning_rate": 6.290322580645162e-06, + "loss": 0.1779, + "step": 510 + }, + { + "epoch": 2.060483870967742, + "grad_norm": 2.460106372833252, + "learning_rate": 6.2634408602150545e-06, + "loss": 0.1335, + "step": 511 + }, + { + "epoch": 2.064516129032258, + "grad_norm": 2.0023834705352783, + "learning_rate": 6.236559139784947e-06, + "loss": 0.1134, + "step": 512 + }, + { + "epoch": 2.068548387096774, + "grad_norm": 9.138616561889648, + "learning_rate": 6.209677419354839e-06, + "loss": 0.1426, + "step": 513 + }, + { + "epoch": 2.0725806451612905, + "grad_norm": 2.8430464267730713, + "learning_rate": 6.182795698924732e-06, + "loss": 0.1099, + "step": 514 + }, + { + "epoch": 2.0766129032258065, + "grad_norm": 2.494035482406616, + "learning_rate": 6.155913978494624e-06, + "loss": 0.1609, + "step": 515 + }, + { + "epoch": 2.0806451612903225, + "grad_norm": 0.7258223295211792, + "learning_rate": 6.129032258064517e-06, + "loss": 0.1089, + "step": 516 + }, + { + "epoch": 2.0846774193548385, + "grad_norm": 2.798238754272461, + "learning_rate": 6.102150537634409e-06, + "loss": 0.1433, + "step": 517 + }, + { + "epoch": 2.088709677419355, + "grad_norm": 1.2602635622024536, + "learning_rate": 6.075268817204302e-06, + "loss": 0.0947, + "step": 518 + }, + { + "epoch": 2.092741935483871, + "grad_norm": 2.4262948036193848, + "learning_rate": 6.048387096774194e-06, + "loss": 0.1297, + "step": 519 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 1.0531463623046875, + "learning_rate": 6.021505376344087e-06, + "loss": 0.0798, + "step": 520 + }, + { + "epoch": 2.100806451612903, + "grad_norm": 2.968100070953369, + "learning_rate": 5.994623655913979e-06, + "loss": 0.1638, + "step": 521 + }, + { + "epoch": 2.1048387096774195, + "grad_norm": 1.3490394353866577, + "learning_rate": 5.967741935483872e-06, + "loss": 0.1011, + "step": 522 + }, + { + "epoch": 2.1088709677419355, + "grad_norm": 1.9310318231582642, + "learning_rate": 5.940860215053764e-06, + "loss": 0.1442, + "step": 523 + }, + { + "epoch": 2.1129032258064515, + "grad_norm": 5.254608631134033, + "learning_rate": 5.9139784946236566e-06, + "loss": 0.1855, + "step": 524 + }, + { + "epoch": 2.1169354838709675, + "grad_norm": 1.3291866779327393, + "learning_rate": 5.887096774193549e-06, + "loss": 0.1166, + "step": 525 + }, + { + "epoch": 2.120967741935484, + "grad_norm": 1.8239635229110718, + "learning_rate": 5.8602150537634415e-06, + "loss": 0.1574, + "step": 526 + }, + { + "epoch": 2.125, + "grad_norm": 1.7408251762390137, + "learning_rate": 5.833333333333334e-06, + "loss": 0.073, + "step": 527 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 1.5561959743499756, + "learning_rate": 5.806451612903226e-06, + "loss": 0.0942, + "step": 528 + }, + { + "epoch": 2.133064516129032, + "grad_norm": 1.163049340248108, + "learning_rate": 5.779569892473119e-06, + "loss": 0.1472, + "step": 529 + }, + { + "epoch": 2.1370967741935485, + "grad_norm": 2.868626356124878, + "learning_rate": 5.752688172043011e-06, + "loss": 0.1565, + "step": 530 + }, + { + "epoch": 2.1411290322580645, + "grad_norm": 4.834112644195557, + "learning_rate": 5.725806451612904e-06, + "loss": 0.1909, + "step": 531 + }, + { + "epoch": 2.1451612903225805, + "grad_norm": 3.0024447441101074, + "learning_rate": 5.698924731182796e-06, + "loss": 0.1108, + "step": 532 + }, + { + "epoch": 2.149193548387097, + "grad_norm": 5.311757564544678, + "learning_rate": 5.672043010752689e-06, + "loss": 0.1366, + "step": 533 + }, + { + "epoch": 2.153225806451613, + "grad_norm": 3.452758312225342, + "learning_rate": 5.645161290322582e-06, + "loss": 0.2077, + "step": 534 + }, + { + "epoch": 2.157258064516129, + "grad_norm": 1.520835280418396, + "learning_rate": 5.618279569892473e-06, + "loss": 0.0959, + "step": 535 + }, + { + "epoch": 2.161290322580645, + "grad_norm": 3.092453718185425, + "learning_rate": 5.591397849462365e-06, + "loss": 0.1078, + "step": 536 + }, + { + "epoch": 2.1653225806451615, + "grad_norm": 3.7799909114837646, + "learning_rate": 5.564516129032258e-06, + "loss": 0.0856, + "step": 537 + }, + { + "epoch": 2.1693548387096775, + "grad_norm": 1.4819005727767944, + "learning_rate": 5.53763440860215e-06, + "loss": 0.1015, + "step": 538 + }, + { + "epoch": 2.1733870967741935, + "grad_norm": 2.0473012924194336, + "learning_rate": 5.510752688172043e-06, + "loss": 0.1372, + "step": 539 + }, + { + "epoch": 2.1774193548387095, + "grad_norm": 3.29624080657959, + "learning_rate": 5.483870967741935e-06, + "loss": 0.1395, + "step": 540 + }, + { + "epoch": 2.181451612903226, + "grad_norm": 1.8019927740097046, + "learning_rate": 5.456989247311828e-06, + "loss": 0.1426, + "step": 541 + }, + { + "epoch": 2.185483870967742, + "grad_norm": 1.848576307296753, + "learning_rate": 5.43010752688172e-06, + "loss": 0.1854, + "step": 542 + }, + { + "epoch": 2.189516129032258, + "grad_norm": 1.5595825910568237, + "learning_rate": 5.4032258064516126e-06, + "loss": 0.0969, + "step": 543 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 1.914633870124817, + "learning_rate": 5.376344086021506e-06, + "loss": 0.1466, + "step": 544 + }, + { + "epoch": 2.1975806451612905, + "grad_norm": 1.185180902481079, + "learning_rate": 5.349462365591398e-06, + "loss": 0.1457, + "step": 545 + }, + { + "epoch": 2.2016129032258065, + "grad_norm": 4.04098653793335, + "learning_rate": 5.322580645161291e-06, + "loss": 0.0852, + "step": 546 + }, + { + "epoch": 2.2056451612903225, + "grad_norm": 1.7851275205612183, + "learning_rate": 5.295698924731183e-06, + "loss": 0.1771, + "step": 547 + }, + { + "epoch": 2.2096774193548385, + "grad_norm": 2.551523208618164, + "learning_rate": 5.268817204301076e-06, + "loss": 0.1229, + "step": 548 + }, + { + "epoch": 2.213709677419355, + "grad_norm": 1.8646845817565918, + "learning_rate": 5.241935483870968e-06, + "loss": 0.0997, + "step": 549 + }, + { + "epoch": 2.217741935483871, + "grad_norm": 3.173009157180786, + "learning_rate": 5.215053763440861e-06, + "loss": 0.1498, + "step": 550 + }, + { + "epoch": 2.221774193548387, + "grad_norm": 2.938751220703125, + "learning_rate": 5.188172043010753e-06, + "loss": 0.1432, + "step": 551 + }, + { + "epoch": 2.225806451612903, + "grad_norm": 1.863869071006775, + "learning_rate": 5.161290322580646e-06, + "loss": 0.1246, + "step": 552 + }, + { + "epoch": 2.2298387096774195, + "grad_norm": 2.414386034011841, + "learning_rate": 5.134408602150538e-06, + "loss": 0.137, + "step": 553 + }, + { + "epoch": 2.2338709677419355, + "grad_norm": 1.476468563079834, + "learning_rate": 5.1075268817204305e-06, + "loss": 0.1444, + "step": 554 + }, + { + "epoch": 2.2379032258064515, + "grad_norm": 1.2723708152770996, + "learning_rate": 5.080645161290323e-06, + "loss": 0.0898, + "step": 555 + }, + { + "epoch": 2.241935483870968, + "grad_norm": 2.391453742980957, + "learning_rate": 5.0537634408602155e-06, + "loss": 0.0788, + "step": 556 + }, + { + "epoch": 2.245967741935484, + "grad_norm": 1.704126000404358, + "learning_rate": 5.026881720430108e-06, + "loss": 0.1598, + "step": 557 + }, + { + "epoch": 2.25, + "grad_norm": 2.0079946517944336, + "learning_rate": 5e-06, + "loss": 0.115, + "step": 558 + }, + { + "epoch": 2.254032258064516, + "grad_norm": 2.3702168464660645, + "learning_rate": 4.973118279569893e-06, + "loss": 0.1229, + "step": 559 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 4.38910436630249, + "learning_rate": 4.946236559139785e-06, + "loss": 0.1102, + "step": 560 + }, + { + "epoch": 2.2620967741935485, + "grad_norm": 4.0716776847839355, + "learning_rate": 4.919354838709678e-06, + "loss": 0.2517, + "step": 561 + }, + { + "epoch": 2.2661290322580645, + "grad_norm": 0.9846423268318176, + "learning_rate": 4.89247311827957e-06, + "loss": 0.1007, + "step": 562 + }, + { + "epoch": 2.2701612903225805, + "grad_norm": 4.429399490356445, + "learning_rate": 4.865591397849463e-06, + "loss": 0.0691, + "step": 563 + }, + { + "epoch": 2.274193548387097, + "grad_norm": 2.5109846591949463, + "learning_rate": 4.838709677419355e-06, + "loss": 0.1717, + "step": 564 + }, + { + "epoch": 2.278225806451613, + "grad_norm": 2.3060779571533203, + "learning_rate": 4.811827956989248e-06, + "loss": 0.0871, + "step": 565 + }, + { + "epoch": 2.282258064516129, + "grad_norm": 2.165689706802368, + "learning_rate": 4.78494623655914e-06, + "loss": 0.1648, + "step": 566 + }, + { + "epoch": 2.286290322580645, + "grad_norm": 3.6668314933776855, + "learning_rate": 4.758064516129033e-06, + "loss": 0.093, + "step": 567 + }, + { + "epoch": 2.2903225806451615, + "grad_norm": 3.3539183139801025, + "learning_rate": 4.731182795698925e-06, + "loss": 0.1632, + "step": 568 + }, + { + "epoch": 2.2943548387096775, + "grad_norm": 2.386240243911743, + "learning_rate": 4.7043010752688175e-06, + "loss": 0.2024, + "step": 569 + }, + { + "epoch": 2.2983870967741935, + "grad_norm": 1.5170916318893433, + "learning_rate": 4.67741935483871e-06, + "loss": 0.1433, + "step": 570 + }, + { + "epoch": 2.3024193548387095, + "grad_norm": 1.2362958192825317, + "learning_rate": 4.6505376344086025e-06, + "loss": 0.0842, + "step": 571 + }, + { + "epoch": 2.306451612903226, + "grad_norm": 2.1873104572296143, + "learning_rate": 4.623655913978495e-06, + "loss": 0.1048, + "step": 572 + }, + { + "epoch": 2.310483870967742, + "grad_norm": 1.4580453634262085, + "learning_rate": 4.596774193548387e-06, + "loss": 0.0954, + "step": 573 + }, + { + "epoch": 2.314516129032258, + "grad_norm": 1.951701283454895, + "learning_rate": 4.56989247311828e-06, + "loss": 0.2442, + "step": 574 + }, + { + "epoch": 2.318548387096774, + "grad_norm": 1.1734662055969238, + "learning_rate": 4.543010752688172e-06, + "loss": 0.1161, + "step": 575 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 2.285795211791992, + "learning_rate": 4.516129032258065e-06, + "loss": 0.1039, + "step": 576 + }, + { + "epoch": 2.3266129032258065, + "grad_norm": 1.4747077226638794, + "learning_rate": 4.489247311827957e-06, + "loss": 0.1783, + "step": 577 + }, + { + "epoch": 2.3306451612903225, + "grad_norm": 1.9864970445632935, + "learning_rate": 4.46236559139785e-06, + "loss": 0.1308, + "step": 578 + }, + { + "epoch": 2.3346774193548385, + "grad_norm": 1.8854461908340454, + "learning_rate": 4.435483870967742e-06, + "loss": 0.1359, + "step": 579 + }, + { + "epoch": 2.338709677419355, + "grad_norm": 1.6464263200759888, + "learning_rate": 4.408602150537635e-06, + "loss": 0.1447, + "step": 580 + }, + { + "epoch": 2.342741935483871, + "grad_norm": 1.9216904640197754, + "learning_rate": 4.381720430107527e-06, + "loss": 0.0523, + "step": 581 + }, + { + "epoch": 2.346774193548387, + "grad_norm": 1.3401274681091309, + "learning_rate": 4.35483870967742e-06, + "loss": 0.0718, + "step": 582 + }, + { + "epoch": 2.350806451612903, + "grad_norm": 1.0903416872024536, + "learning_rate": 4.327956989247312e-06, + "loss": 0.1176, + "step": 583 + }, + { + "epoch": 2.3548387096774195, + "grad_norm": 2.7714807987213135, + "learning_rate": 4.3010752688172045e-06, + "loss": 0.1075, + "step": 584 + }, + { + "epoch": 2.3588709677419355, + "grad_norm": 0.9703463315963745, + "learning_rate": 4.274193548387097e-06, + "loss": 0.0649, + "step": 585 + }, + { + "epoch": 2.3629032258064515, + "grad_norm": 5.25622034072876, + "learning_rate": 4.2473118279569895e-06, + "loss": 0.1146, + "step": 586 + }, + { + "epoch": 2.366935483870968, + "grad_norm": 1.6004530191421509, + "learning_rate": 4.220430107526882e-06, + "loss": 0.1173, + "step": 587 + }, + { + "epoch": 2.370967741935484, + "grad_norm": 1.7906506061553955, + "learning_rate": 4.193548387096774e-06, + "loss": 0.1305, + "step": 588 + }, + { + "epoch": 2.375, + "grad_norm": 5.008229732513428, + "learning_rate": 4.166666666666667e-06, + "loss": 0.1907, + "step": 589 + }, + { + "epoch": 2.379032258064516, + "grad_norm": 2.5902585983276367, + "learning_rate": 4.139784946236559e-06, + "loss": 0.1836, + "step": 590 + }, + { + "epoch": 2.383064516129032, + "grad_norm": 2.13139009475708, + "learning_rate": 4.112903225806452e-06, + "loss": 0.1012, + "step": 591 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 1.3434728384017944, + "learning_rate": 4.086021505376344e-06, + "loss": 0.1011, + "step": 592 + }, + { + "epoch": 2.3911290322580645, + "grad_norm": 1.74350106716156, + "learning_rate": 4.059139784946237e-06, + "loss": 0.1008, + "step": 593 + }, + { + "epoch": 2.3951612903225805, + "grad_norm": 1.2974146604537964, + "learning_rate": 4.032258064516129e-06, + "loss": 0.0692, + "step": 594 + }, + { + "epoch": 2.399193548387097, + "grad_norm": 2.330141544342041, + "learning_rate": 4.005376344086022e-06, + "loss": 0.1496, + "step": 595 + }, + { + "epoch": 2.403225806451613, + "grad_norm": 4.154720306396484, + "learning_rate": 3.978494623655914e-06, + "loss": 0.1431, + "step": 596 + }, + { + "epoch": 2.407258064516129, + "grad_norm": 1.0211693048477173, + "learning_rate": 3.951612903225807e-06, + "loss": 0.1015, + "step": 597 + }, + { + "epoch": 2.411290322580645, + "grad_norm": 4.42000675201416, + "learning_rate": 3.924731182795699e-06, + "loss": 0.1123, + "step": 598 + }, + { + "epoch": 2.4153225806451615, + "grad_norm": 4.67465353012085, + "learning_rate": 3.8978494623655915e-06, + "loss": 0.2117, + "step": 599 + }, + { + "epoch": 2.4193548387096775, + "grad_norm": 2.3731729984283447, + "learning_rate": 3.870967741935484e-06, + "loss": 0.1847, + "step": 600 + }, + { + "epoch": 2.4233870967741935, + "grad_norm": 2.7813737392425537, + "learning_rate": 3.8440860215053765e-06, + "loss": 0.1696, + "step": 601 + }, + { + "epoch": 2.4274193548387095, + "grad_norm": 1.7436891794204712, + "learning_rate": 3.817204301075269e-06, + "loss": 0.1275, + "step": 602 + }, + { + "epoch": 2.431451612903226, + "grad_norm": 2.3506593704223633, + "learning_rate": 3.7903225806451614e-06, + "loss": 0.1749, + "step": 603 + }, + { + "epoch": 2.435483870967742, + "grad_norm": 3.1209230422973633, + "learning_rate": 3.763440860215054e-06, + "loss": 0.1307, + "step": 604 + }, + { + "epoch": 2.439516129032258, + "grad_norm": 2.593312978744507, + "learning_rate": 3.7365591397849468e-06, + "loss": 0.1168, + "step": 605 + }, + { + "epoch": 2.443548387096774, + "grad_norm": 2.2633056640625, + "learning_rate": 3.7096774193548392e-06, + "loss": 0.1355, + "step": 606 + }, + { + "epoch": 2.4475806451612905, + "grad_norm": 2.898655891418457, + "learning_rate": 3.6827956989247317e-06, + "loss": 0.1298, + "step": 607 + }, + { + "epoch": 2.4516129032258065, + "grad_norm": 1.7566426992416382, + "learning_rate": 3.655913978494624e-06, + "loss": 0.1159, + "step": 608 + }, + { + "epoch": 2.4556451612903225, + "grad_norm": 6.452301025390625, + "learning_rate": 3.6290322580645166e-06, + "loss": 0.1349, + "step": 609 + }, + { + "epoch": 2.4596774193548385, + "grad_norm": 2.342876434326172, + "learning_rate": 3.602150537634409e-06, + "loss": 0.0856, + "step": 610 + }, + { + "epoch": 2.463709677419355, + "grad_norm": 2.236654281616211, + "learning_rate": 3.5752688172043015e-06, + "loss": 0.1418, + "step": 611 + }, + { + "epoch": 2.467741935483871, + "grad_norm": 3.464710235595703, + "learning_rate": 3.548387096774194e-06, + "loss": 0.097, + "step": 612 + }, + { + "epoch": 2.471774193548387, + "grad_norm": 1.936618447303772, + "learning_rate": 3.5215053763440865e-06, + "loss": 0.0876, + "step": 613 + }, + { + "epoch": 2.475806451612903, + "grad_norm": 3.4211089611053467, + "learning_rate": 3.494623655913979e-06, + "loss": 0.1502, + "step": 614 + }, + { + "epoch": 2.4798387096774195, + "grad_norm": 2.9869165420532227, + "learning_rate": 3.4677419354838714e-06, + "loss": 0.2398, + "step": 615 + }, + { + "epoch": 2.4838709677419355, + "grad_norm": 3.0065438747406006, + "learning_rate": 3.440860215053764e-06, + "loss": 0.1115, + "step": 616 + }, + { + "epoch": 2.4879032258064515, + "grad_norm": 1.830977439880371, + "learning_rate": 3.413978494623656e-06, + "loss": 0.1161, + "step": 617 + }, + { + "epoch": 2.491935483870968, + "grad_norm": 1.9489115476608276, + "learning_rate": 3.3870967741935484e-06, + "loss": 0.1224, + "step": 618 + }, + { + "epoch": 2.495967741935484, + "grad_norm": 2.2884771823883057, + "learning_rate": 3.360215053763441e-06, + "loss": 0.1092, + "step": 619 + }, + { + "epoch": 2.5, + "grad_norm": 2.671285629272461, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1711, + "step": 620 + }, + { + "epoch": 2.504032258064516, + "grad_norm": 1.9497143030166626, + "learning_rate": 3.306451612903226e-06, + "loss": 0.1145, + "step": 621 + }, + { + "epoch": 2.508064516129032, + "grad_norm": 1.3452483415603638, + "learning_rate": 3.2795698924731183e-06, + "loss": 0.1217, + "step": 622 + }, + { + "epoch": 2.5120967741935485, + "grad_norm": 1.2507896423339844, + "learning_rate": 3.2526881720430107e-06, + "loss": 0.0931, + "step": 623 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 1.3425334692001343, + "learning_rate": 3.225806451612903e-06, + "loss": 0.1145, + "step": 624 + }, + { + "epoch": 2.5201612903225805, + "grad_norm": 3.7666163444519043, + "learning_rate": 3.198924731182796e-06, + "loss": 0.1044, + "step": 625 + }, + { + "epoch": 2.524193548387097, + "grad_norm": 2.4426252841949463, + "learning_rate": 3.1720430107526885e-06, + "loss": 0.0856, + "step": 626 + }, + { + "epoch": 2.528225806451613, + "grad_norm": 3.1804256439208984, + "learning_rate": 3.145161290322581e-06, + "loss": 0.0867, + "step": 627 + }, + { + "epoch": 2.532258064516129, + "grad_norm": 2.037343740463257, + "learning_rate": 3.1182795698924735e-06, + "loss": 0.1085, + "step": 628 + }, + { + "epoch": 2.536290322580645, + "grad_norm": 4.272365093231201, + "learning_rate": 3.091397849462366e-06, + "loss": 0.174, + "step": 629 + }, + { + "epoch": 2.540322580645161, + "grad_norm": 2.1623380184173584, + "learning_rate": 3.0645161290322584e-06, + "loss": 0.0866, + "step": 630 + }, + { + "epoch": 2.5443548387096775, + "grad_norm": 3.3426926136016846, + "learning_rate": 3.037634408602151e-06, + "loss": 0.1142, + "step": 631 + }, + { + "epoch": 2.5483870967741935, + "grad_norm": 2.2241578102111816, + "learning_rate": 3.0107526881720433e-06, + "loss": 0.0995, + "step": 632 + }, + { + "epoch": 2.5524193548387095, + "grad_norm": 3.2953665256500244, + "learning_rate": 2.983870967741936e-06, + "loss": 0.1487, + "step": 633 + }, + { + "epoch": 2.556451612903226, + "grad_norm": 1.5125126838684082, + "learning_rate": 2.9569892473118283e-06, + "loss": 0.0567, + "step": 634 + }, + { + "epoch": 2.560483870967742, + "grad_norm": 2.612718343734741, + "learning_rate": 2.9301075268817207e-06, + "loss": 0.1033, + "step": 635 + }, + { + "epoch": 2.564516129032258, + "grad_norm": 1.220692753791809, + "learning_rate": 2.903225806451613e-06, + "loss": 0.0999, + "step": 636 + }, + { + "epoch": 2.568548387096774, + "grad_norm": 1.8477866649627686, + "learning_rate": 2.8763440860215057e-06, + "loss": 0.2083, + "step": 637 + }, + { + "epoch": 2.5725806451612905, + "grad_norm": 4.778720855712891, + "learning_rate": 2.849462365591398e-06, + "loss": 0.1335, + "step": 638 + }, + { + "epoch": 2.5766129032258065, + "grad_norm": 1.4977765083312988, + "learning_rate": 2.822580645161291e-06, + "loss": 0.147, + "step": 639 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 1.5701426267623901, + "learning_rate": 2.7956989247311827e-06, + "loss": 0.1333, + "step": 640 + }, + { + "epoch": 2.584677419354839, + "grad_norm": 1.051927924156189, + "learning_rate": 2.768817204301075e-06, + "loss": 0.0888, + "step": 641 + }, + { + "epoch": 2.588709677419355, + "grad_norm": 4.721953868865967, + "learning_rate": 2.7419354838709676e-06, + "loss": 0.131, + "step": 642 + }, + { + "epoch": 2.592741935483871, + "grad_norm": 2.7012953758239746, + "learning_rate": 2.71505376344086e-06, + "loss": 0.0714, + "step": 643 + }, + { + "epoch": 2.596774193548387, + "grad_norm": 5.694790363311768, + "learning_rate": 2.688172043010753e-06, + "loss": 0.1403, + "step": 644 + }, + { + "epoch": 2.600806451612903, + "grad_norm": 4.28272008895874, + "learning_rate": 2.6612903225806454e-06, + "loss": 0.1169, + "step": 645 + }, + { + "epoch": 2.6048387096774195, + "grad_norm": 1.0393054485321045, + "learning_rate": 2.634408602150538e-06, + "loss": 0.0731, + "step": 646 + }, + { + "epoch": 2.6088709677419355, + "grad_norm": 0.9973479509353638, + "learning_rate": 2.6075268817204303e-06, + "loss": 0.1072, + "step": 647 + }, + { + "epoch": 2.6129032258064515, + "grad_norm": 1.4460006952285767, + "learning_rate": 2.580645161290323e-06, + "loss": 0.1372, + "step": 648 + }, + { + "epoch": 2.616935483870968, + "grad_norm": 2.2750110626220703, + "learning_rate": 2.5537634408602153e-06, + "loss": 0.1131, + "step": 649 + }, + { + "epoch": 2.620967741935484, + "grad_norm": 2.0815539360046387, + "learning_rate": 2.5268817204301077e-06, + "loss": 0.1101, + "step": 650 + }, + { + "epoch": 2.625, + "grad_norm": 1.0647339820861816, + "learning_rate": 2.5e-06, + "loss": 0.1106, + "step": 651 + }, + { + "epoch": 2.629032258064516, + "grad_norm": 1.694512963294983, + "learning_rate": 2.4731182795698927e-06, + "loss": 0.1207, + "step": 652 + }, + { + "epoch": 2.633064516129032, + "grad_norm": 1.1872451305389404, + "learning_rate": 2.446236559139785e-06, + "loss": 0.0823, + "step": 653 + }, + { + "epoch": 2.6370967741935485, + "grad_norm": 1.3260234594345093, + "learning_rate": 2.4193548387096776e-06, + "loss": 0.1082, + "step": 654 + }, + { + "epoch": 2.6411290322580645, + "grad_norm": 2.025832414627075, + "learning_rate": 2.39247311827957e-06, + "loss": 0.1217, + "step": 655 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 2.340317487716675, + "learning_rate": 2.3655913978494625e-06, + "loss": 0.1507, + "step": 656 + }, + { + "epoch": 2.649193548387097, + "grad_norm": 2.595576524734497, + "learning_rate": 2.338709677419355e-06, + "loss": 0.1588, + "step": 657 + }, + { + "epoch": 2.653225806451613, + "grad_norm": 2.0549724102020264, + "learning_rate": 2.3118279569892475e-06, + "loss": 0.1111, + "step": 658 + }, + { + "epoch": 2.657258064516129, + "grad_norm": 0.7785517573356628, + "learning_rate": 2.28494623655914e-06, + "loss": 0.0675, + "step": 659 + }, + { + "epoch": 2.661290322580645, + "grad_norm": 1.5122755765914917, + "learning_rate": 2.2580645161290324e-06, + "loss": 0.0898, + "step": 660 + }, + { + "epoch": 2.665322580645161, + "grad_norm": 1.361331582069397, + "learning_rate": 2.231182795698925e-06, + "loss": 0.12, + "step": 661 + }, + { + "epoch": 2.6693548387096775, + "grad_norm": 2.3188157081604004, + "learning_rate": 2.2043010752688173e-06, + "loss": 0.1406, + "step": 662 + }, + { + "epoch": 2.6733870967741935, + "grad_norm": 1.4102435111999512, + "learning_rate": 2.17741935483871e-06, + "loss": 0.0961, + "step": 663 + }, + { + "epoch": 2.6774193548387095, + "grad_norm": 2.144468069076538, + "learning_rate": 2.1505376344086023e-06, + "loss": 0.0633, + "step": 664 + }, + { + "epoch": 2.681451612903226, + "grad_norm": 1.7837616205215454, + "learning_rate": 2.1236559139784947e-06, + "loss": 0.1136, + "step": 665 + }, + { + "epoch": 2.685483870967742, + "grad_norm": 2.097506284713745, + "learning_rate": 2.096774193548387e-06, + "loss": 0.1284, + "step": 666 + }, + { + "epoch": 2.689516129032258, + "grad_norm": 1.8578377962112427, + "learning_rate": 2.0698924731182797e-06, + "loss": 0.0985, + "step": 667 + }, + { + "epoch": 2.693548387096774, + "grad_norm": 1.454514741897583, + "learning_rate": 2.043010752688172e-06, + "loss": 0.1499, + "step": 668 + }, + { + "epoch": 2.6975806451612905, + "grad_norm": 6.206280708312988, + "learning_rate": 2.0161290322580646e-06, + "loss": 0.1131, + "step": 669 + }, + { + "epoch": 2.7016129032258065, + "grad_norm": 1.9479646682739258, + "learning_rate": 1.989247311827957e-06, + "loss": 0.0599, + "step": 670 + }, + { + "epoch": 2.7056451612903225, + "grad_norm": 1.7119982242584229, + "learning_rate": 1.9623655913978495e-06, + "loss": 0.1553, + "step": 671 + }, + { + "epoch": 2.709677419354839, + "grad_norm": 2.411572217941284, + "learning_rate": 1.935483870967742e-06, + "loss": 0.1384, + "step": 672 + }, + { + "epoch": 2.713709677419355, + "grad_norm": 2.9151971340179443, + "learning_rate": 1.9086021505376345e-06, + "loss": 0.1513, + "step": 673 + }, + { + "epoch": 2.717741935483871, + "grad_norm": 1.6464580297470093, + "learning_rate": 1.881720430107527e-06, + "loss": 0.0747, + "step": 674 + }, + { + "epoch": 2.721774193548387, + "grad_norm": 2.2614505290985107, + "learning_rate": 1.8548387096774196e-06, + "loss": 0.1306, + "step": 675 + }, + { + "epoch": 2.725806451612903, + "grad_norm": 3.9104158878326416, + "learning_rate": 1.827956989247312e-06, + "loss": 0.1036, + "step": 676 + }, + { + "epoch": 2.7298387096774195, + "grad_norm": 2.6638762950897217, + "learning_rate": 1.8010752688172045e-06, + "loss": 0.1882, + "step": 677 + }, + { + "epoch": 2.7338709677419355, + "grad_norm": 2.4250168800354004, + "learning_rate": 1.774193548387097e-06, + "loss": 0.1356, + "step": 678 + }, + { + "epoch": 2.7379032258064515, + "grad_norm": 1.579751968383789, + "learning_rate": 1.7473118279569895e-06, + "loss": 0.0973, + "step": 679 + }, + { + "epoch": 2.741935483870968, + "grad_norm": 1.9126899242401123, + "learning_rate": 1.720430107526882e-06, + "loss": 0.1051, + "step": 680 + }, + { + "epoch": 2.745967741935484, + "grad_norm": 2.4986772537231445, + "learning_rate": 1.6935483870967742e-06, + "loss": 0.097, + "step": 681 + }, + { + "epoch": 2.75, + "grad_norm": 1.9752743244171143, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.1827, + "step": 682 + }, + { + "epoch": 2.754032258064516, + "grad_norm": 5.470479488372803, + "learning_rate": 1.6397849462365591e-06, + "loss": 0.156, + "step": 683 + }, + { + "epoch": 2.758064516129032, + "grad_norm": 0.6137222647666931, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.0982, + "step": 684 + }, + { + "epoch": 2.7620967741935485, + "grad_norm": 1.8977144956588745, + "learning_rate": 1.5860215053763443e-06, + "loss": 0.1484, + "step": 685 + }, + { + "epoch": 2.7661290322580645, + "grad_norm": 1.982347846031189, + "learning_rate": 1.5591397849462367e-06, + "loss": 0.0805, + "step": 686 + }, + { + "epoch": 2.7701612903225805, + "grad_norm": 1.7538466453552246, + "learning_rate": 1.5322580645161292e-06, + "loss": 0.0831, + "step": 687 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 1.4044026136398315, + "learning_rate": 1.5053763440860217e-06, + "loss": 0.0967, + "step": 688 + }, + { + "epoch": 2.778225806451613, + "grad_norm": 3.4833972454071045, + "learning_rate": 1.4784946236559141e-06, + "loss": 0.1445, + "step": 689 + }, + { + "epoch": 2.782258064516129, + "grad_norm": 3.1198818683624268, + "learning_rate": 1.4516129032258066e-06, + "loss": 0.1556, + "step": 690 + }, + { + "epoch": 2.786290322580645, + "grad_norm": 1.4616000652313232, + "learning_rate": 1.424731182795699e-06, + "loss": 0.1769, + "step": 691 + }, + { + "epoch": 2.790322580645161, + "grad_norm": 0.99885493516922, + "learning_rate": 1.3978494623655913e-06, + "loss": 0.0787, + "step": 692 + }, + { + "epoch": 2.7943548387096775, + "grad_norm": 1.395920991897583, + "learning_rate": 1.3709677419354838e-06, + "loss": 0.0763, + "step": 693 + }, + { + "epoch": 2.7983870967741935, + "grad_norm": 2.1815457344055176, + "learning_rate": 1.3440860215053765e-06, + "loss": 0.1605, + "step": 694 + }, + { + "epoch": 2.8024193548387095, + "grad_norm": 1.7244387865066528, + "learning_rate": 1.317204301075269e-06, + "loss": 0.0735, + "step": 695 + }, + { + "epoch": 2.806451612903226, + "grad_norm": 2.2874746322631836, + "learning_rate": 1.2903225806451614e-06, + "loss": 0.1302, + "step": 696 + }, + { + "epoch": 2.810483870967742, + "grad_norm": 0.7056113481521606, + "learning_rate": 1.2634408602150539e-06, + "loss": 0.097, + "step": 697 + }, + { + "epoch": 2.814516129032258, + "grad_norm": 2.458315134048462, + "learning_rate": 1.2365591397849463e-06, + "loss": 0.1269, + "step": 698 + }, + { + "epoch": 2.818548387096774, + "grad_norm": 1.1753970384597778, + "learning_rate": 1.2096774193548388e-06, + "loss": 0.0745, + "step": 699 + }, + { + "epoch": 2.8225806451612905, + "grad_norm": 1.3138192892074585, + "learning_rate": 1.1827956989247313e-06, + "loss": 0.1289, + "step": 700 + }, + { + "epoch": 2.8266129032258065, + "grad_norm": 1.3947908878326416, + "learning_rate": 1.1559139784946237e-06, + "loss": 0.1225, + "step": 701 + }, + { + "epoch": 2.8306451612903225, + "grad_norm": 1.4919567108154297, + "learning_rate": 1.1290322580645162e-06, + "loss": 0.119, + "step": 702 + }, + { + "epoch": 2.834677419354839, + "grad_norm": 1.0990135669708252, + "learning_rate": 1.1021505376344087e-06, + "loss": 0.0763, + "step": 703 + }, + { + "epoch": 2.838709677419355, + "grad_norm": 5.159749507904053, + "learning_rate": 1.0752688172043011e-06, + "loss": 0.1223, + "step": 704 + }, + { + "epoch": 2.842741935483871, + "grad_norm": 1.0806690454483032, + "learning_rate": 1.0483870967741936e-06, + "loss": 0.1383, + "step": 705 + }, + { + "epoch": 2.846774193548387, + "grad_norm": 3.3874659538269043, + "learning_rate": 1.021505376344086e-06, + "loss": 0.1, + "step": 706 + }, + { + "epoch": 2.850806451612903, + "grad_norm": 2.235921859741211, + "learning_rate": 9.946236559139785e-07, + "loss": 0.2047, + "step": 707 + }, + { + "epoch": 2.8548387096774195, + "grad_norm": 1.7132419347763062, + "learning_rate": 9.67741935483871e-07, + "loss": 0.1235, + "step": 708 + }, + { + "epoch": 2.8588709677419355, + "grad_norm": 5.385939598083496, + "learning_rate": 9.408602150537635e-07, + "loss": 0.0994, + "step": 709 + }, + { + "epoch": 2.8629032258064515, + "grad_norm": 1.3089847564697266, + "learning_rate": 9.13978494623656e-07, + "loss": 0.0878, + "step": 710 + }, + { + "epoch": 2.866935483870968, + "grad_norm": 1.2990398406982422, + "learning_rate": 8.870967741935485e-07, + "loss": 0.1384, + "step": 711 + }, + { + "epoch": 2.870967741935484, + "grad_norm": 1.1270217895507812, + "learning_rate": 8.60215053763441e-07, + "loss": 0.0901, + "step": 712 + }, + { + "epoch": 2.875, + "grad_norm": 1.7017827033996582, + "learning_rate": 8.333333333333333e-07, + "loss": 0.1437, + "step": 713 + }, + { + "epoch": 2.879032258064516, + "grad_norm": 0.8682994246482849, + "learning_rate": 8.064516129032258e-07, + "loss": 0.0515, + "step": 714 + }, + { + "epoch": 2.883064516129032, + "grad_norm": 3.2466022968292236, + "learning_rate": 7.795698924731184e-07, + "loss": 0.1803, + "step": 715 + }, + { + "epoch": 2.8870967741935485, + "grad_norm": 2.0208983421325684, + "learning_rate": 7.526881720430108e-07, + "loss": 0.1035, + "step": 716 + }, + { + "epoch": 2.8911290322580645, + "grad_norm": 1.170736312866211, + "learning_rate": 7.258064516129033e-07, + "loss": 0.13, + "step": 717 + }, + { + "epoch": 2.8951612903225805, + "grad_norm": 1.7991589307785034, + "learning_rate": 6.989247311827957e-07, + "loss": 0.0715, + "step": 718 + }, + { + "epoch": 2.899193548387097, + "grad_norm": 1.9782204627990723, + "learning_rate": 6.720430107526882e-07, + "loss": 0.0969, + "step": 719 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 2.702655792236328, + "learning_rate": 6.451612903225807e-07, + "loss": 0.0927, + "step": 720 + }, + { + "epoch": 2.907258064516129, + "grad_norm": 2.2504546642303467, + "learning_rate": 6.182795698924732e-07, + "loss": 0.146, + "step": 721 + }, + { + "epoch": 2.911290322580645, + "grad_norm": 2.546994209289551, + "learning_rate": 5.913978494623656e-07, + "loss": 0.0981, + "step": 722 + }, + { + "epoch": 2.915322580645161, + "grad_norm": 3.177995443344116, + "learning_rate": 5.645161290322581e-07, + "loss": 0.1493, + "step": 723 + }, + { + "epoch": 2.9193548387096775, + "grad_norm": 1.8446640968322754, + "learning_rate": 5.376344086021506e-07, + "loss": 0.1168, + "step": 724 + }, + { + "epoch": 2.9233870967741935, + "grad_norm": 1.462225079536438, + "learning_rate": 5.10752688172043e-07, + "loss": 0.0658, + "step": 725 + }, + { + "epoch": 2.9274193548387095, + "grad_norm": 2.062228202819824, + "learning_rate": 4.838709677419355e-07, + "loss": 0.0861, + "step": 726 + }, + { + "epoch": 2.931451612903226, + "grad_norm": 1.066767930984497, + "learning_rate": 4.56989247311828e-07, + "loss": 0.1016, + "step": 727 + }, + { + "epoch": 2.935483870967742, + "grad_norm": 1.9916908740997314, + "learning_rate": 4.301075268817205e-07, + "loss": 0.0999, + "step": 728 + }, + { + "epoch": 2.939516129032258, + "grad_norm": 2.194812536239624, + "learning_rate": 4.032258064516129e-07, + "loss": 0.1289, + "step": 729 + }, + { + "epoch": 2.943548387096774, + "grad_norm": 1.067501425743103, + "learning_rate": 3.763440860215054e-07, + "loss": 0.0866, + "step": 730 + }, + { + "epoch": 2.9475806451612905, + "grad_norm": 1.4836621284484863, + "learning_rate": 3.4946236559139783e-07, + "loss": 0.0954, + "step": 731 + }, + { + "epoch": 2.9516129032258065, + "grad_norm": 2.3710856437683105, + "learning_rate": 3.2258064516129035e-07, + "loss": 0.1969, + "step": 732 + }, + { + "epoch": 2.9556451612903225, + "grad_norm": 2.83674693107605, + "learning_rate": 2.956989247311828e-07, + "loss": 0.1929, + "step": 733 + }, + { + "epoch": 2.959677419354839, + "grad_norm": 2.1235055923461914, + "learning_rate": 2.688172043010753e-07, + "loss": 0.1497, + "step": 734 + }, + { + "epoch": 2.963709677419355, + "grad_norm": 1.6132428646087646, + "learning_rate": 2.4193548387096775e-07, + "loss": 0.1602, + "step": 735 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 2.551713228225708, + "learning_rate": 2.1505376344086024e-07, + "loss": 0.0789, + "step": 736 + }, + { + "epoch": 2.971774193548387, + "grad_norm": 1.6804261207580566, + "learning_rate": 1.881720430107527e-07, + "loss": 0.1895, + "step": 737 + }, + { + "epoch": 2.975806451612903, + "grad_norm": 1.8240628242492676, + "learning_rate": 1.6129032258064518e-07, + "loss": 0.1558, + "step": 738 + }, + { + "epoch": 2.9798387096774195, + "grad_norm": 1.9709798097610474, + "learning_rate": 1.3440860215053764e-07, + "loss": 0.1245, + "step": 739 + }, + { + "epoch": 2.9838709677419355, + "grad_norm": 1.7331011295318604, + "learning_rate": 1.0752688172043012e-07, + "loss": 0.0999, + "step": 740 + }, + { + "epoch": 2.9879032258064515, + "grad_norm": 1.3876549005508423, + "learning_rate": 8.064516129032259e-08, + "loss": 0.1744, + "step": 741 + }, + { + "epoch": 2.991935483870968, + "grad_norm": 1.998831033706665, + "learning_rate": 5.376344086021506e-08, + "loss": 0.0876, + "step": 742 + }, + { + "epoch": 2.995967741935484, + "grad_norm": 1.9842371940612793, + "learning_rate": 2.688172043010753e-08, + "loss": 0.1036, + "step": 743 + }, + { + "epoch": 3.0, + "grad_norm": 2.3604133129119873, + "learning_rate": 0.0, + "loss": 0.0957, + "step": 744 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.7625, + "eval_f1": 0.9021978021978022, + "eval_loss": 0.14369329810142517, + "eval_runtime": 2.3693, + "eval_samples_per_second": 371.416, + "eval_steps_per_second": 11.818, + "step": 744 + } + ], + "logging_steps": 1, + "max_steps": 744, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 142052951715840.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}