| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9966329966329965, |
| "eval_steps": 500, |
| "global_step": 555, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0053872053872053875, |
| "grad_norm": 21.266149520874023, |
| "learning_rate": 0.0, |
| "loss": 0.9993, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.010774410774410775, |
| "grad_norm": 21.13385009765625, |
| "learning_rate": 5.882352941176471e-07, |
| "loss": 1.0245, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01616161616161616, |
| "grad_norm": 20.182464599609375, |
| "learning_rate": 1.1764705882352942e-06, |
| "loss": 0.9562, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.02154882154882155, |
| "grad_norm": 18.727153778076172, |
| "learning_rate": 1.7647058823529414e-06, |
| "loss": 0.9445, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.026936026936026935, |
| "grad_norm": 16.479658126831055, |
| "learning_rate": 2.3529411764705885e-06, |
| "loss": 0.9854, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.03232323232323232, |
| "grad_norm": 10.075958251953125, |
| "learning_rate": 2.9411764705882355e-06, |
| "loss": 0.7675, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03771043771043771, |
| "grad_norm": 8.65888500213623, |
| "learning_rate": 3.529411764705883e-06, |
| "loss": 0.7297, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0430976430976431, |
| "grad_norm": 8.33163070678711, |
| "learning_rate": 4.11764705882353e-06, |
| "loss": 0.6672, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.048484848484848485, |
| "grad_norm": 7.000586032867432, |
| "learning_rate": 4.705882352941177e-06, |
| "loss": 0.6599, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.05387205387205387, |
| "grad_norm": 6.877265930175781, |
| "learning_rate": 5.294117647058824e-06, |
| "loss": 0.5728, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05925925925925926, |
| "grad_norm": 6.868885040283203, |
| "learning_rate": 5.882352941176471e-06, |
| "loss": 0.6731, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.06464646464646465, |
| "grad_norm": 6.862372875213623, |
| "learning_rate": 6.470588235294119e-06, |
| "loss": 0.7126, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.07003367003367003, |
| "grad_norm": 6.195284843444824, |
| "learning_rate": 7.058823529411766e-06, |
| "loss": 0.644, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.07542087542087542, |
| "grad_norm": 6.2631120681762695, |
| "learning_rate": 7.647058823529411e-06, |
| "loss": 0.5753, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.08080808080808081, |
| "grad_norm": 5.94320011138916, |
| "learning_rate": 8.23529411764706e-06, |
| "loss": 0.6584, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0861952861952862, |
| "grad_norm": 5.2665205001831055, |
| "learning_rate": 8.823529411764707e-06, |
| "loss": 0.6102, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.09158249158249158, |
| "grad_norm": 5.388559341430664, |
| "learning_rate": 9.411764705882354e-06, |
| "loss": 0.6127, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.09696969696969697, |
| "grad_norm": 5.109943866729736, |
| "learning_rate": 1e-05, |
| "loss": 0.6259, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.10235690235690235, |
| "grad_norm": 4.631857395172119, |
| "learning_rate": 9.999914754008063e-06, |
| "loss": 0.6064, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.10774410774410774, |
| "grad_norm": 4.755272388458252, |
| "learning_rate": 9.999659018938999e-06, |
| "loss": 0.5934, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11313131313131314, |
| "grad_norm": 4.383729934692383, |
| "learning_rate": 9.999232803512967e-06, |
| "loss": 0.6137, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.11851851851851852, |
| "grad_norm": 4.2614593505859375, |
| "learning_rate": 9.998636122263227e-06, |
| "loss": 0.5914, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.12390572390572391, |
| "grad_norm": 4.656721591949463, |
| "learning_rate": 9.997868995535658e-06, |
| "loss": 0.599, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.1292929292929293, |
| "grad_norm": 4.374063491821289, |
| "learning_rate": 9.996931449488046e-06, |
| "loss": 0.6489, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.13468013468013468, |
| "grad_norm": 4.434129238128662, |
| "learning_rate": 9.99582351608921e-06, |
| "loss": 0.5895, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.14006734006734006, |
| "grad_norm": 4.682045936584473, |
| "learning_rate": 9.994545233117904e-06, |
| "loss": 0.6253, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.14545454545454545, |
| "grad_norm": 4.347814559936523, |
| "learning_rate": 9.993096644161526e-06, |
| "loss": 0.617, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.15084175084175083, |
| "grad_norm": 4.48855447769165, |
| "learning_rate": 9.991477798614638e-06, |
| "loss": 0.6468, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.15622895622895622, |
| "grad_norm": 4.433114528656006, |
| "learning_rate": 9.989688751677277e-06, |
| "loss": 0.6084, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.16161616161616163, |
| "grad_norm": 3.879382610321045, |
| "learning_rate": 9.987729564353077e-06, |
| "loss": 0.5468, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.16700336700336701, |
| "grad_norm": 4.3543009757995605, |
| "learning_rate": 9.985600303447185e-06, |
| "loss": 0.6268, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.1723905723905724, |
| "grad_norm": 6.505020618438721, |
| "learning_rate": 9.98330104156398e-06, |
| "loss": 0.5947, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.17777777777777778, |
| "grad_norm": 4.169903755187988, |
| "learning_rate": 9.980831857104612e-06, |
| "loss": 0.574, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.18316498316498317, |
| "grad_norm": 4.362861633300781, |
| "learning_rate": 9.978192834264307e-06, |
| "loss": 0.5851, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.18855218855218855, |
| "grad_norm": 3.7690815925598145, |
| "learning_rate": 9.975384063029516e-06, |
| "loss": 0.6023, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.19393939393939394, |
| "grad_norm": 4.335365295410156, |
| "learning_rate": 9.972405639174833e-06, |
| "loss": 0.6267, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.19932659932659932, |
| "grad_norm": 4.149550914764404, |
| "learning_rate": 9.96925766425974e-06, |
| "loss": 0.599, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.2047138047138047, |
| "grad_norm": 4.021537780761719, |
| "learning_rate": 9.965940245625131e-06, |
| "loss": 0.5859, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.2101010101010101, |
| "grad_norm": 4.439505100250244, |
| "learning_rate": 9.962453496389665e-06, |
| "loss": 0.5895, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.21548821548821548, |
| "grad_norm": 4.288372039794922, |
| "learning_rate": 9.958797535445898e-06, |
| "loss": 0.6212, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.22087542087542086, |
| "grad_norm": 4.0634260177612305, |
| "learning_rate": 9.95497248745624e-06, |
| "loss": 0.6061, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.22626262626262628, |
| "grad_norm": 4.286866188049316, |
| "learning_rate": 9.950978482848694e-06, |
| "loss": 0.6458, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.23164983164983166, |
| "grad_norm": 3.877549409866333, |
| "learning_rate": 9.946815657812416e-06, |
| "loss": 0.5868, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.23703703703703705, |
| "grad_norm": 4.321531295776367, |
| "learning_rate": 9.94248415429306e-06, |
| "loss": 0.6158, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.24242424242424243, |
| "grad_norm": 3.8047635555267334, |
| "learning_rate": 9.937984119987958e-06, |
| "loss": 0.5437, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.24781144781144782, |
| "grad_norm": 4.01943826675415, |
| "learning_rate": 9.93331570834106e-06, |
| "loss": 0.5668, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.2531986531986532, |
| "grad_norm": 4.549412250518799, |
| "learning_rate": 9.928479078537722e-06, |
| "loss": 0.6271, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.2585858585858586, |
| "grad_norm": 3.865027904510498, |
| "learning_rate": 9.923474395499266e-06, |
| "loss": 0.6187, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.26397306397306397, |
| "grad_norm": 3.9334516525268555, |
| "learning_rate": 9.91830182987736e-06, |
| "loss": 0.614, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.26936026936026936, |
| "grad_norm": 3.9490811824798584, |
| "learning_rate": 9.912961558048196e-06, |
| "loss": 0.5716, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.27474747474747474, |
| "grad_norm": 3.834277391433716, |
| "learning_rate": 9.907453762106484e-06, |
| "loss": 0.5145, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.2801346801346801, |
| "grad_norm": 3.9712698459625244, |
| "learning_rate": 9.901778629859236e-06, |
| "loss": 0.627, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.2855218855218855, |
| "grad_norm": 4.146055698394775, |
| "learning_rate": 9.895936354819362e-06, |
| "loss": 0.5962, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.2909090909090909, |
| "grad_norm": 4.930230140686035, |
| "learning_rate": 9.889927136199075e-06, |
| "loss": 0.5974, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.2962962962962963, |
| "grad_norm": 4.270641803741455, |
| "learning_rate": 9.883751178903095e-06, |
| "loss": 0.6245, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.30168350168350166, |
| "grad_norm": 8.589272499084473, |
| "learning_rate": 9.877408693521664e-06, |
| "loss": 0.6359, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.30707070707070705, |
| "grad_norm": 8.13204288482666, |
| "learning_rate": 9.870899896323368e-06, |
| "loss": 0.6429, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.31245791245791243, |
| "grad_norm": 7.613426208496094, |
| "learning_rate": 9.864225009247753e-06, |
| "loss": 0.577, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.3178451178451178, |
| "grad_norm": 4.240153789520264, |
| "learning_rate": 9.857384259897768e-06, |
| "loss": 0.6715, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.32323232323232326, |
| "grad_norm": 3.9827535152435303, |
| "learning_rate": 9.850377881532e-06, |
| "loss": 0.5256, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.32861952861952864, |
| "grad_norm": 5.192502975463867, |
| "learning_rate": 9.843206113056715e-06, |
| "loss": 0.5537, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.33400673400673403, |
| "grad_norm": 3.669801950454712, |
| "learning_rate": 9.835869199017725e-06, |
| "loss": 0.6018, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.3393939393939394, |
| "grad_norm": 4.642088890075684, |
| "learning_rate": 9.828367389592034e-06, |
| "loss": 0.5001, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.3447811447811448, |
| "grad_norm": 3.983962297439575, |
| "learning_rate": 9.820700940579312e-06, |
| "loss": 0.624, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.3501683501683502, |
| "grad_norm": 3.97925066947937, |
| "learning_rate": 9.812870113393185e-06, |
| "loss": 0.5945, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.35555555555555557, |
| "grad_norm": 4.082148551940918, |
| "learning_rate": 9.804875175052304e-06, |
| "loss": 0.5847, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.36094276094276095, |
| "grad_norm": 3.4573113918304443, |
| "learning_rate": 9.796716398171248e-06, |
| "loss": 0.5016, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.36632996632996634, |
| "grad_norm": 3.9368677139282227, |
| "learning_rate": 9.788394060951228e-06, |
| "loss": 0.5582, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.3717171717171717, |
| "grad_norm": 3.7513315677642822, |
| "learning_rate": 9.779908447170602e-06, |
| "loss": 0.5525, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.3771043771043771, |
| "grad_norm": 3.7674131393432617, |
| "learning_rate": 9.771259846175195e-06, |
| "loss": 0.5577, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3824915824915825, |
| "grad_norm": 3.596757650375366, |
| "learning_rate": 9.762448552868433e-06, |
| "loss": 0.553, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.3878787878787879, |
| "grad_norm": 3.4366366863250732, |
| "learning_rate": 9.753474867701294e-06, |
| "loss": 0.533, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.39326599326599326, |
| "grad_norm": 3.8846004009246826, |
| "learning_rate": 9.744339096662056e-06, |
| "loss": 0.5755, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.39865319865319865, |
| "grad_norm": 3.593231439590454, |
| "learning_rate": 9.735041551265862e-06, |
| "loss": 0.5424, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.40404040404040403, |
| "grad_norm": 3.5270259380340576, |
| "learning_rate": 9.725582548544106e-06, |
| "loss": 0.5218, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.4094276094276094, |
| "grad_norm": 3.9130117893218994, |
| "learning_rate": 9.715962411033614e-06, |
| "loss": 0.5529, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.4148148148148148, |
| "grad_norm": 3.5708324909210205, |
| "learning_rate": 9.706181466765654e-06, |
| "loss": 0.5047, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.4202020202020202, |
| "grad_norm": 3.6041488647460938, |
| "learning_rate": 9.696240049254744e-06, |
| "loss": 0.4715, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.4255892255892256, |
| "grad_norm": 3.532111644744873, |
| "learning_rate": 9.686138497487282e-06, |
| "loss": 0.5443, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.43097643097643096, |
| "grad_norm": 3.3798911571502686, |
| "learning_rate": 9.675877155909989e-06, |
| "loss": 0.5196, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.43636363636363634, |
| "grad_norm": 3.576612949371338, |
| "learning_rate": 9.66545637441816e-06, |
| "loss": 0.5593, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.4417508417508417, |
| "grad_norm": 3.6367032527923584, |
| "learning_rate": 9.654876508343739e-06, |
| "loss": 0.5199, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.4471380471380471, |
| "grad_norm": 4.221003532409668, |
| "learning_rate": 9.644137918443198e-06, |
| "loss": 0.5799, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.45252525252525255, |
| "grad_norm": 3.6288747787475586, |
| "learning_rate": 9.633240970885231e-06, |
| "loss": 0.5702, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.45791245791245794, |
| "grad_norm": 3.6418979167938232, |
| "learning_rate": 9.622186037238286e-06, |
| "loss": 0.5463, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4632996632996633, |
| "grad_norm": 3.5099191665649414, |
| "learning_rate": 9.610973494457873e-06, |
| "loss": 0.5871, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.4686868686868687, |
| "grad_norm": 3.9148519039154053, |
| "learning_rate": 9.599603724873725e-06, |
| "loss": 0.6149, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.4740740740740741, |
| "grad_norm": 3.3477306365966797, |
| "learning_rate": 9.588077116176756e-06, |
| "loss": 0.5618, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.4794612794612795, |
| "grad_norm": 3.632464647293091, |
| "learning_rate": 9.576394061405847e-06, |
| "loss": 0.5747, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.48484848484848486, |
| "grad_norm": 5.160216808319092, |
| "learning_rate": 9.564554958934432e-06, |
| "loss": 0.6318, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.49023569023569025, |
| "grad_norm": 3.320161819458008, |
| "learning_rate": 9.55256021245692e-06, |
| "loss": 0.5472, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.49562289562289563, |
| "grad_norm": 3.577775716781616, |
| "learning_rate": 9.540410230974943e-06, |
| "loss": 0.584, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.501010101010101, |
| "grad_norm": 3.3152377605438232, |
| "learning_rate": 9.52810542878339e-06, |
| "loss": 0.5268, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.5063973063973064, |
| "grad_norm": 3.468808889389038, |
| "learning_rate": 9.515646225456283e-06, |
| "loss": 0.6323, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.5117845117845118, |
| "grad_norm": 5.061112880706787, |
| "learning_rate": 9.503033045832484e-06, |
| "loss": 0.5041, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.5171717171717172, |
| "grad_norm": 5.695023059844971, |
| "learning_rate": 9.490266320001195e-06, |
| "loss": 0.5678, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.5225589225589226, |
| "grad_norm": 4.4895920753479, |
| "learning_rate": 9.4773464832873e-06, |
| "loss": 0.6127, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.5279461279461279, |
| "grad_norm": 3.6477298736572266, |
| "learning_rate": 9.464273976236518e-06, |
| "loss": 0.539, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.5333333333333333, |
| "grad_norm": 5.325118541717529, |
| "learning_rate": 9.451049244600381e-06, |
| "loss": 0.5428, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.5387205387205387, |
| "grad_norm": 3.778438091278076, |
| "learning_rate": 9.437672739321034e-06, |
| "loss": 0.5781, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5441077441077441, |
| "grad_norm": 3.363888740539551, |
| "learning_rate": 9.424144916515863e-06, |
| "loss": 0.5424, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.5494949494949495, |
| "grad_norm": 3.4057974815368652, |
| "learning_rate": 9.410466237461937e-06, |
| "loss": 0.527, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.5548821548821549, |
| "grad_norm": 3.555009126663208, |
| "learning_rate": 9.396637168580282e-06, |
| "loss": 0.5645, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.5602693602693603, |
| "grad_norm": 3.691166639328003, |
| "learning_rate": 9.382658181419977e-06, |
| "loss": 0.5689, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.5656565656565656, |
| "grad_norm": 3.210749626159668, |
| "learning_rate": 9.36852975264207e-06, |
| "loss": 0.4849, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.571043771043771, |
| "grad_norm": 3.507824659347534, |
| "learning_rate": 9.354252364003334e-06, |
| "loss": 0.5872, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.5764309764309764, |
| "grad_norm": 3.4085872173309326, |
| "learning_rate": 9.339826502339828e-06, |
| "loss": 0.5664, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.5818181818181818, |
| "grad_norm": 3.474592924118042, |
| "learning_rate": 9.32525265955031e-06, |
| "loss": 0.5818, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.5872053872053872, |
| "grad_norm": 3.5888025760650635, |
| "learning_rate": 9.310531332579453e-06, |
| "loss": 0.567, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.5925925925925926, |
| "grad_norm": 3.412595510482788, |
| "learning_rate": 9.295663023400907e-06, |
| "loss": 0.5482, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.597979797979798, |
| "grad_norm": 3.397404193878174, |
| "learning_rate": 9.280648239000174e-06, |
| "loss": 0.5572, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.6033670033670033, |
| "grad_norm": 3.6878013610839844, |
| "learning_rate": 9.265487491357334e-06, |
| "loss": 0.6044, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.6087542087542087, |
| "grad_norm": 3.4067952632904053, |
| "learning_rate": 9.250181297429573e-06, |
| "loss": 0.519, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.6141414141414141, |
| "grad_norm": 3.6102547645568848, |
| "learning_rate": 9.234730179133564e-06, |
| "loss": 0.5897, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.6195286195286195, |
| "grad_norm": 3.254011392593384, |
| "learning_rate": 9.219134663327672e-06, |
| "loss": 0.5444, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6249158249158249, |
| "grad_norm": 3.4662082195281982, |
| "learning_rate": 9.203395281793979e-06, |
| "loss": 0.5689, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.6303030303030303, |
| "grad_norm": 3.225325345993042, |
| "learning_rate": 9.187512571220166e-06, |
| "loss": 0.4967, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.6356902356902356, |
| "grad_norm": 3.3803765773773193, |
| "learning_rate": 9.171487073181198e-06, |
| "loss": 0.5245, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.641077441077441, |
| "grad_norm": 3.078711748123169, |
| "learning_rate": 9.155319334120864e-06, |
| "loss": 0.4871, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.6464646464646465, |
| "grad_norm": 3.5471031665802, |
| "learning_rate": 9.139009905333147e-06, |
| "loss": 0.5674, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6518518518518519, |
| "grad_norm": 3.0351247787475586, |
| "learning_rate": 9.122559342943423e-06, |
| "loss": 0.4854, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.6572390572390573, |
| "grad_norm": 3.3814985752105713, |
| "learning_rate": 9.105968207889493e-06, |
| "loss": 0.5141, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.6626262626262627, |
| "grad_norm": 3.2874019145965576, |
| "learning_rate": 9.089237065902464e-06, |
| "loss": 0.5255, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.6680134680134681, |
| "grad_norm": 3.173571825027466, |
| "learning_rate": 9.072366487487451e-06, |
| "loss": 0.5269, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.6734006734006734, |
| "grad_norm": 3.3994832038879395, |
| "learning_rate": 9.055357047904133e-06, |
| "loss": 0.5768, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.6787878787878788, |
| "grad_norm": 3.376079797744751, |
| "learning_rate": 9.038209327147134e-06, |
| "loss": 0.6, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.6841750841750842, |
| "grad_norm": 3.5709731578826904, |
| "learning_rate": 9.020923909926233e-06, |
| "loss": 0.6137, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.6895622895622896, |
| "grad_norm": 3.0871469974517822, |
| "learning_rate": 9.00350138564645e-06, |
| "loss": 0.5537, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.694949494949495, |
| "grad_norm": 2.978905200958252, |
| "learning_rate": 8.985942348387926e-06, |
| "loss": 0.4888, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.7003367003367004, |
| "grad_norm": 3.196749687194824, |
| "learning_rate": 8.968247396885685e-06, |
| "loss": 0.5279, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7057239057239058, |
| "grad_norm": 3.2792575359344482, |
| "learning_rate": 8.950417134509201e-06, |
| "loss": 0.5749, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.7111111111111111, |
| "grad_norm": 3.157092332839966, |
| "learning_rate": 8.932452169241838e-06, |
| "loss": 0.619, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.7164983164983165, |
| "grad_norm": 3.2496225833892822, |
| "learning_rate": 8.914353113660107e-06, |
| "loss": 0.5495, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.7218855218855219, |
| "grad_norm": 3.2431371212005615, |
| "learning_rate": 8.89612058491279e-06, |
| "loss": 0.5297, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 3.2148752212524414, |
| "learning_rate": 8.877755204699883e-06, |
| "loss": 0.5175, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7326599326599327, |
| "grad_norm": 3.1605641841888428, |
| "learning_rate": 8.859257599251408e-06, |
| "loss": 0.5848, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.7380471380471381, |
| "grad_norm": 3.1001222133636475, |
| "learning_rate": 8.840628399306056e-06, |
| "loss": 0.539, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.7434343434343434, |
| "grad_norm": 3.3802716732025146, |
| "learning_rate": 8.821868240089676e-06, |
| "loss": 0.5782, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.7488215488215488, |
| "grad_norm": 3.0083656311035156, |
| "learning_rate": 8.802977761293625e-06, |
| "loss": 0.5314, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.7542087542087542, |
| "grad_norm": 3.2978479862213135, |
| "learning_rate": 8.783957607052941e-06, |
| "loss": 0.548, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7595959595959596, |
| "grad_norm": 3.177548885345459, |
| "learning_rate": 8.764808425924392e-06, |
| "loss": 0.4653, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.764983164983165, |
| "grad_norm": 3.2603986263275146, |
| "learning_rate": 8.745530870864351e-06, |
| "loss": 0.5768, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.7703703703703704, |
| "grad_norm": 3.4270477294921875, |
| "learning_rate": 8.726125599206543e-06, |
| "loss": 0.5426, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.7757575757575758, |
| "grad_norm": 3.006866693496704, |
| "learning_rate": 8.706593272639616e-06, |
| "loss": 0.5038, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.7811447811447811, |
| "grad_norm": 3.9326441287994385, |
| "learning_rate": 8.686934557184594e-06, |
| "loss": 0.618, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.7865319865319865, |
| "grad_norm": 3.3260936737060547, |
| "learning_rate": 8.667150123172159e-06, |
| "loss": 0.5245, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.7919191919191919, |
| "grad_norm": 3.189055919647217, |
| "learning_rate": 8.647240645219787e-06, |
| "loss": 0.5403, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.7973063973063973, |
| "grad_norm": 3.107164144515991, |
| "learning_rate": 8.62720680220876e-06, |
| "loss": 0.5292, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.8026936026936027, |
| "grad_norm": 3.372941493988037, |
| "learning_rate": 8.607049277261005e-06, |
| "loss": 0.5486, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.8080808080808081, |
| "grad_norm": 3.3730578422546387, |
| "learning_rate": 8.586768757715806e-06, |
| "loss": 0.5845, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.8134680134680135, |
| "grad_norm": 3.1509501934051514, |
| "learning_rate": 8.566365935106367e-06, |
| "loss": 0.5266, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.8188552188552188, |
| "grad_norm": 3.464965581893921, |
| "learning_rate": 8.545841505136224e-06, |
| "loss": 0.5701, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.8242424242424242, |
| "grad_norm": 3.0586905479431152, |
| "learning_rate": 8.525196167655539e-06, |
| "loss": 0.4934, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.8296296296296296, |
| "grad_norm": 3.1889281272888184, |
| "learning_rate": 8.504430626637215e-06, |
| "loss": 0.5937, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.835016835016835, |
| "grad_norm": 3.2143123149871826, |
| "learning_rate": 8.483545590152915e-06, |
| "loss": 0.5358, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.8404040404040404, |
| "grad_norm": 3.3132236003875732, |
| "learning_rate": 8.462541770348896e-06, |
| "loss": 0.5258, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.8457912457912458, |
| "grad_norm": 3.310232400894165, |
| "learning_rate": 8.441419883421742e-06, |
| "loss": 0.5908, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.8511784511784511, |
| "grad_norm": 3.13468599319458, |
| "learning_rate": 8.42018064959393e-06, |
| "loss": 0.4796, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.8565656565656565, |
| "grad_norm": 3.0902316570281982, |
| "learning_rate": 8.398824793089287e-06, |
| "loss": 0.5082, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.8619528619528619, |
| "grad_norm": 3.193399429321289, |
| "learning_rate": 8.377353042108278e-06, |
| "loss": 0.5388, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8673400673400673, |
| "grad_norm": 3.0939056873321533, |
| "learning_rate": 8.355766128803192e-06, |
| "loss": 0.4641, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.8727272727272727, |
| "grad_norm": 3.229541540145874, |
| "learning_rate": 8.334064789253157e-06, |
| "loss": 0.5247, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.8781144781144781, |
| "grad_norm": 3.2554848194122314, |
| "learning_rate": 8.312249763439066e-06, |
| "loss": 0.5491, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.8835016835016835, |
| "grad_norm": 3.2184009552001953, |
| "learning_rate": 8.29032179521832e-06, |
| "loss": 0.6099, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 3.2048168182373047, |
| "learning_rate": 8.268281632299483e-06, |
| "loss": 0.4963, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.8942760942760942, |
| "grad_norm": 3.0308377742767334, |
| "learning_rate": 8.246130026216777e-06, |
| "loss": 0.5222, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.8996632996632996, |
| "grad_norm": 3.189265012741089, |
| "learning_rate": 8.22386773230445e-06, |
| "loss": 0.4913, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.9050505050505051, |
| "grad_norm": 3.2512941360473633, |
| "learning_rate": 8.201495509671036e-06, |
| "loss": 0.5717, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.9104377104377105, |
| "grad_norm": 3.2874414920806885, |
| "learning_rate": 8.179014121173461e-06, |
| "loss": 0.5334, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.9158249158249159, |
| "grad_norm": 3.307884931564331, |
| "learning_rate": 8.156424333391026e-06, |
| "loss": 0.5617, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.9212121212121213, |
| "grad_norm": 3.0463500022888184, |
| "learning_rate": 8.13372691659928e-06, |
| "loss": 0.5305, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.9265993265993266, |
| "grad_norm": 3.3068511486053467, |
| "learning_rate": 8.110922644743747e-06, |
| "loss": 0.549, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.931986531986532, |
| "grad_norm": 3.1428866386413574, |
| "learning_rate": 8.088012295413536e-06, |
| "loss": 0.4856, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.9373737373737374, |
| "grad_norm": 2.96205997467041, |
| "learning_rate": 8.064996649814826e-06, |
| "loss": 0.4599, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.9427609427609428, |
| "grad_norm": 3.3233330249786377, |
| "learning_rate": 8.041876492744239e-06, |
| "loss": 0.5505, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.9481481481481482, |
| "grad_norm": 3.2451870441436768, |
| "learning_rate": 8.018652612562061e-06, |
| "loss": 0.4739, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.9535353535353536, |
| "grad_norm": 3.231306791305542, |
| "learning_rate": 7.99532580116537e-06, |
| "loss": 0.5119, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.958922558922559, |
| "grad_norm": 3.147303342819214, |
| "learning_rate": 7.971896853961043e-06, |
| "loss": 0.496, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.9643097643097643, |
| "grad_norm": 3.530423641204834, |
| "learning_rate": 7.948366569838612e-06, |
| "loss": 0.6025, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.9696969696969697, |
| "grad_norm": 3.5202131271362305, |
| "learning_rate": 7.924735751143044e-06, |
| "loss": 0.4822, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.9750841750841751, |
| "grad_norm": 3.288405656814575, |
| "learning_rate": 7.901005203647373e-06, |
| "loss": 0.5393, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.9804713804713805, |
| "grad_norm": 3.291487693786621, |
| "learning_rate": 7.877175736525217e-06, |
| "loss": 0.6146, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.9858585858585859, |
| "grad_norm": 2.933931350708008, |
| "learning_rate": 7.853248162323208e-06, |
| "loss": 0.4874, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.9912457912457913, |
| "grad_norm": 3.0823869705200195, |
| "learning_rate": 7.829223296933259e-06, |
| "loss": 0.5756, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.9966329966329966, |
| "grad_norm": 2.960385799407959, |
| "learning_rate": 7.805101959564768e-06, |
| "loss": 0.4738, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.0053872053872055, |
| "grad_norm": 5.235624313354492, |
| "learning_rate": 7.780884972716663e-06, |
| "loss": 0.8368, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.0107744107744108, |
| "grad_norm": 2.6471757888793945, |
| "learning_rate": 7.75657316214937e-06, |
| "loss": 0.2894, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.0161616161616163, |
| "grad_norm": 2.7157034873962402, |
| "learning_rate": 7.732167356856656e-06, |
| "loss": 0.3068, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.0215488215488215, |
| "grad_norm": 2.7159922122955322, |
| "learning_rate": 7.70766838903735e-06, |
| "loss": 0.3193, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.026936026936027, |
| "grad_norm": 2.4954445362091064, |
| "learning_rate": 7.683077094066981e-06, |
| "loss": 0.2827, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.0323232323232323, |
| "grad_norm": 2.3092992305755615, |
| "learning_rate": 7.65839431046928e-06, |
| "loss": 0.253, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.0377104377104378, |
| "grad_norm": 2.994446039199829, |
| "learning_rate": 7.63362087988759e-06, |
| "loss": 0.2969, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.043097643097643, |
| "grad_norm": 2.7718987464904785, |
| "learning_rate": 7.608757647056186e-06, |
| "loss": 0.2913, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.0484848484848486, |
| "grad_norm": 2.769294500350952, |
| "learning_rate": 7.583805459771443e-06, |
| "loss": 0.2704, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.0538720538720538, |
| "grad_norm": 3.459955930709839, |
| "learning_rate": 7.5587651688629405e-06, |
| "loss": 0.3051, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.0592592592592593, |
| "grad_norm": 3.5698342323303223, |
| "learning_rate": 7.533637628164456e-06, |
| "loss": 0.2757, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.0646464646464646, |
| "grad_norm": 3.165423631668091, |
| "learning_rate": 7.508423694484841e-06, |
| "loss": 0.2811, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.0700336700336701, |
| "grad_norm": 3.1055243015289307, |
| "learning_rate": 7.483124227578811e-06, |
| "loss": 0.2594, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.0754208754208754, |
| "grad_norm": 3.1683449745178223, |
| "learning_rate": 7.457740090117627e-06, |
| "loss": 0.3102, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.0808080808080809, |
| "grad_norm": 3.477832317352295, |
| "learning_rate": 7.432272147659678e-06, |
| "loss": 0.3035, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.0861952861952862, |
| "grad_norm": 2.9576289653778076, |
| "learning_rate": 7.406721268620975e-06, |
| "loss": 0.2653, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.0915824915824917, |
| "grad_norm": 2.561279773712158, |
| "learning_rate": 7.381088324245526e-06, |
| "loss": 0.2485, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.096969696969697, |
| "grad_norm": 3.1348936557769775, |
| "learning_rate": 7.355374188575639e-06, |
| "loss": 0.2715, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.1023569023569024, |
| "grad_norm": 2.7675235271453857, |
| "learning_rate": 7.3295797384221156e-06, |
| "loss": 0.2805, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.1077441077441077, |
| "grad_norm": 3.367643117904663, |
| "learning_rate": 7.303705853334353e-06, |
| "loss": 0.2897, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.1131313131313132, |
| "grad_norm": 3.007518768310547, |
| "learning_rate": 7.277753415570349e-06, |
| "loss": 0.2699, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.1185185185185185, |
| "grad_norm": 2.9317398071289062, |
| "learning_rate": 7.2517233100666255e-06, |
| "loss": 0.3282, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.123905723905724, |
| "grad_norm": 3.2546324729919434, |
| "learning_rate": 7.225616424408045e-06, |
| "loss": 0.2947, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.1292929292929292, |
| "grad_norm": 2.954130172729492, |
| "learning_rate": 7.199433648797558e-06, |
| "loss": 0.2994, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.1346801346801347, |
| "grad_norm": 2.7771804332733154, |
| "learning_rate": 7.1731758760258315e-06, |
| "loss": 0.229, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.14006734006734, |
| "grad_norm": 2.771481990814209, |
| "learning_rate": 7.146844001440823e-06, |
| "loss": 0.2725, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.1454545454545455, |
| "grad_norm": 2.742431402206421, |
| "learning_rate": 7.120438922917237e-06, |
| "loss": 0.2514, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.1508417508417508, |
| "grad_norm": 2.6713271141052246, |
| "learning_rate": 7.09396154082592e-06, |
| "loss": 0.2485, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.1562289562289563, |
| "grad_norm": 2.492274284362793, |
| "learning_rate": 7.067412758003154e-06, |
| "loss": 0.2278, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.1616161616161615, |
| "grad_norm": 2.8618505001068115, |
| "learning_rate": 7.040793479719864e-06, |
| "loss": 0.2854, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.167003367003367, |
| "grad_norm": 2.6601178646087646, |
| "learning_rate": 7.014104613650767e-06, |
| "loss": 0.2966, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.1723905723905723, |
| "grad_norm": 3.3377082347869873, |
| "learning_rate": 6.987347069843406e-06, |
| "loss": 0.3149, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.1777777777777778, |
| "grad_norm": 2.778550863265991, |
| "learning_rate": 6.96052176068713e-06, |
| "loss": 0.2543, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.183164983164983, |
| "grad_norm": 3.040800094604492, |
| "learning_rate": 6.93362960088197e-06, |
| "loss": 0.2438, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.1885521885521886, |
| "grad_norm": 2.9394142627716064, |
| "learning_rate": 6.906671507407463e-06, |
| "loss": 0.2304, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.1939393939393939, |
| "grad_norm": 2.869964122772217, |
| "learning_rate": 6.879648399491376e-06, |
| "loss": 0.2984, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.1993265993265994, |
| "grad_norm": 2.9759936332702637, |
| "learning_rate": 6.852561198578364e-06, |
| "loss": 0.2603, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.2047138047138046, |
| "grad_norm": 3.310718297958374, |
| "learning_rate": 6.825410828298552e-06, |
| "loss": 0.2233, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.2101010101010101, |
| "grad_norm": 2.7231340408325195, |
| "learning_rate": 6.79819821443604e-06, |
| "loss": 0.2124, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.2154882154882154, |
| "grad_norm": 2.9152019023895264, |
| "learning_rate": 6.7709242848973326e-06, |
| "loss": 0.2757, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.220875420875421, |
| "grad_norm": 2.9841840267181396, |
| "learning_rate": 6.743589969679697e-06, |
| "loss": 0.2853, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.2262626262626264, |
| "grad_norm": 3.3108832836151123, |
| "learning_rate": 6.716196200839465e-06, |
| "loss": 0.2608, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.2316498316498317, |
| "grad_norm": 2.9652819633483887, |
| "learning_rate": 6.6887439124602295e-06, |
| "loss": 0.2598, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.237037037037037, |
| "grad_norm": 2.812822103500366, |
| "learning_rate": 6.661234040621017e-06, |
| "loss": 0.2638, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.2424242424242424, |
| "grad_norm": 3.03281831741333, |
| "learning_rate": 6.63366752336435e-06, |
| "loss": 0.2439, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.247811447811448, |
| "grad_norm": 2.7430481910705566, |
| "learning_rate": 6.606045300664272e-06, |
| "loss": 0.2502, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.2531986531986532, |
| "grad_norm": 3.0615146160125732, |
| "learning_rate": 6.578368314394293e-06, |
| "loss": 0.2494, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.2585858585858585, |
| "grad_norm": 2.689999580383301, |
| "learning_rate": 6.550637508295272e-06, |
| "loss": 0.2309, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.263973063973064, |
| "grad_norm": 3.2054049968719482, |
| "learning_rate": 6.52285382794324e-06, |
| "loss": 0.2942, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.2693602693602695, |
| "grad_norm": 2.9260945320129395, |
| "learning_rate": 6.49501822071715e-06, |
| "loss": 0.2861, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.2747474747474747, |
| "grad_norm": 3.240046262741089, |
| "learning_rate": 6.467131635766585e-06, |
| "loss": 0.2949, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.28013468013468, |
| "grad_norm": 2.6747567653656006, |
| "learning_rate": 6.439195023979381e-06, |
| "loss": 0.2851, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.2855218855218855, |
| "grad_norm": 3.605665445327759, |
| "learning_rate": 6.411209337949214e-06, |
| "loss": 0.3156, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.290909090909091, |
| "grad_norm": 3.0257179737091064, |
| "learning_rate": 6.383175531943106e-06, |
| "loss": 0.2481, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.2962962962962963, |
| "grad_norm": 3.004091739654541, |
| "learning_rate": 6.355094561868902e-06, |
| "loss": 0.2608, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.3016835016835016, |
| "grad_norm": 2.927186965942383, |
| "learning_rate": 6.3269673852426575e-06, |
| "loss": 0.2298, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.307070707070707, |
| "grad_norm": 2.5807888507843018, |
| "learning_rate": 6.298794961156004e-06, |
| "loss": 0.2263, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.3124579124579125, |
| "grad_norm": 2.7014336585998535, |
| "learning_rate": 6.270578250243437e-06, |
| "loss": 0.2931, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.3178451178451178, |
| "grad_norm": 3.1106925010681152, |
| "learning_rate": 6.242318214649556e-06, |
| "loss": 0.2789, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.3232323232323233, |
| "grad_norm": 2.7850258350372314, |
| "learning_rate": 6.214015817996273e-06, |
| "loss": 0.3062, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.3286195286195286, |
| "grad_norm": 2.841632127761841, |
| "learning_rate": 6.185672025349936e-06, |
| "loss": 0.2595, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.334006734006734, |
| "grad_norm": 2.757871150970459, |
| "learning_rate": 6.157287803188432e-06, |
| "loss": 0.2408, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.3393939393939394, |
| "grad_norm": 2.7471070289611816, |
| "learning_rate": 6.128864119368234e-06, |
| "loss": 0.2618, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.3447811447811449, |
| "grad_norm": 3.062896490097046, |
| "learning_rate": 6.100401943091386e-06, |
| "loss": 0.2893, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.3501683501683501, |
| "grad_norm": 2.937164068222046, |
| "learning_rate": 6.0719022448724705e-06, |
| "loss": 0.2735, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3555555555555556, |
| "grad_norm": 3.1469810009002686, |
| "learning_rate": 6.043365996505506e-06, |
| "loss": 0.3295, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.360942760942761, |
| "grad_norm": 2.82350754737854, |
| "learning_rate": 6.014794171030811e-06, |
| "loss": 0.2778, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.3663299663299664, |
| "grad_norm": 3.0384979248046875, |
| "learning_rate": 5.986187742701825e-06, |
| "loss": 0.2678, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.3717171717171717, |
| "grad_norm": 2.782715082168579, |
| "learning_rate": 5.9575476869518945e-06, |
| "loss": 0.2664, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.3771043771043772, |
| "grad_norm": 2.811166763305664, |
| "learning_rate": 5.928874980361005e-06, |
| "loss": 0.2387, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.3824915824915824, |
| "grad_norm": 2.939649820327759, |
| "learning_rate": 5.900170600622477e-06, |
| "loss": 0.2957, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.387878787878788, |
| "grad_norm": 3.0286529064178467, |
| "learning_rate": 5.871435526509647e-06, |
| "loss": 0.2937, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.3932659932659932, |
| "grad_norm": 2.7286617755889893, |
| "learning_rate": 5.8426707378424675e-06, |
| "loss": 0.2543, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.3986531986531987, |
| "grad_norm": 2.7167487144470215, |
| "learning_rate": 5.813877215454118e-06, |
| "loss": 0.2296, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.404040404040404, |
| "grad_norm": 2.8393452167510986, |
| "learning_rate": 5.78505594115755e-06, |
| "loss": 0.2708, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.4094276094276095, |
| "grad_norm": 2.903613567352295, |
| "learning_rate": 5.756207897712011e-06, |
| "loss": 0.267, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.4148148148148147, |
| "grad_norm": 2.823423147201538, |
| "learning_rate": 5.727334068789529e-06, |
| "loss": 0.2774, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.4202020202020202, |
| "grad_norm": 2.7938835620880127, |
| "learning_rate": 5.698435438941382e-06, |
| "loss": 0.2474, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.4255892255892255, |
| "grad_norm": 2.9996232986450195, |
| "learning_rate": 5.669512993564517e-06, |
| "loss": 0.3086, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.430976430976431, |
| "grad_norm": 3.045121908187866, |
| "learning_rate": 5.640567718867951e-06, |
| "loss": 0.2617, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.4363636363636363, |
| "grad_norm": 2.7745134830474854, |
| "learning_rate": 5.611600601839144e-06, |
| "loss": 0.2604, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.4417508417508418, |
| "grad_norm": 2.8739848136901855, |
| "learning_rate": 5.582612630210349e-06, |
| "loss": 0.2774, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.447138047138047, |
| "grad_norm": 2.740999221801758, |
| "learning_rate": 5.553604792424923e-06, |
| "loss": 0.2341, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.4525252525252526, |
| "grad_norm": 2.991398572921753, |
| "learning_rate": 5.524578077603627e-06, |
| "loss": 0.2299, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.457912457912458, |
| "grad_norm": 2.636726140975952, |
| "learning_rate": 5.495533475510901e-06, |
| "loss": 0.2472, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.4632996632996633, |
| "grad_norm": 3.0140764713287354, |
| "learning_rate": 5.4664719765211125e-06, |
| "loss": 0.2597, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.4686868686868686, |
| "grad_norm": 2.988635778427124, |
| "learning_rate": 5.4373945715847845e-06, |
| "loss": 0.2939, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.474074074074074, |
| "grad_norm": 3.1995465755462646, |
| "learning_rate": 5.408302252194806e-06, |
| "loss": 0.2678, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.4794612794612796, |
| "grad_norm": 2.9540798664093018, |
| "learning_rate": 5.379196010352629e-06, |
| "loss": 0.3033, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.4848484848484849, |
| "grad_norm": 3.282701253890991, |
| "learning_rate": 5.3500768385344345e-06, |
| "loss": 0.2588, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.4902356902356901, |
| "grad_norm": 2.9532341957092285, |
| "learning_rate": 5.320945729657299e-06, |
| "loss": 0.289, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.4956228956228956, |
| "grad_norm": 2.660553455352783, |
| "learning_rate": 5.2918036770453285e-06, |
| "loss": 0.2653, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.5010101010101011, |
| "grad_norm": 2.7580904960632324, |
| "learning_rate": 5.262651674395799e-06, |
| "loss": 0.2585, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.5063973063973064, |
| "grad_norm": 2.7895712852478027, |
| "learning_rate": 5.2334907157452605e-06, |
| "loss": 0.2425, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.5117845117845117, |
| "grad_norm": 2.852928876876831, |
| "learning_rate": 5.204321795435656e-06, |
| "loss": 0.2702, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.5171717171717172, |
| "grad_norm": 3.042116403579712, |
| "learning_rate": 5.1751459080803986e-06, |
| "loss": 0.2615, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.5225589225589227, |
| "grad_norm": 2.737823724746704, |
| "learning_rate": 5.145964048530475e-06, |
| "loss": 0.2695, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.527946127946128, |
| "grad_norm": 2.6959354877471924, |
| "learning_rate": 5.11677721184051e-06, |
| "loss": 0.2595, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.5333333333333332, |
| "grad_norm": 3.073336601257324, |
| "learning_rate": 5.08758639323484e-06, |
| "loss": 0.249, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.5387205387205387, |
| "grad_norm": 2.496995449066162, |
| "learning_rate": 5.058392588073583e-06, |
| "loss": 0.2409, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.5441077441077442, |
| "grad_norm": 2.8654353618621826, |
| "learning_rate": 5.029196791818688e-06, |
| "loss": 0.2428, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.5494949494949495, |
| "grad_norm": 2.753993034362793, |
| "learning_rate": 5e-06, |
| "loss": 0.2768, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.5548821548821548, |
| "grad_norm": 2.972564220428467, |
| "learning_rate": 4.970803208181315e-06, |
| "loss": 0.2451, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.5602693602693603, |
| "grad_norm": 3.036773681640625, |
| "learning_rate": 4.941607411926419e-06, |
| "loss": 0.2642, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.5656565656565657, |
| "grad_norm": 3.0601320266723633, |
| "learning_rate": 4.9124136067651615e-06, |
| "loss": 0.2803, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.571043771043771, |
| "grad_norm": 3.3641974925994873, |
| "learning_rate": 4.883222788159491e-06, |
| "loss": 0.289, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.5764309764309763, |
| "grad_norm": 3.0665841102600098, |
| "learning_rate": 4.8540359514695266e-06, |
| "loss": 0.2196, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.5818181818181818, |
| "grad_norm": 2.884730339050293, |
| "learning_rate": 4.824854091919601e-06, |
| "loss": 0.2532, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.5872053872053873, |
| "grad_norm": 3.1136231422424316, |
| "learning_rate": 4.795678204564346e-06, |
| "loss": 0.2545, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.5925925925925926, |
| "grad_norm": 2.821955919265747, |
| "learning_rate": 4.766509284254739e-06, |
| "loss": 0.2524, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.5979797979797978, |
| "grad_norm": 3.191521167755127, |
| "learning_rate": 4.737348325604203e-06, |
| "loss": 0.2638, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.6033670033670033, |
| "grad_norm": 2.8502752780914307, |
| "learning_rate": 4.708196322954673e-06, |
| "loss": 0.2648, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.6087542087542088, |
| "grad_norm": 3.3543736934661865, |
| "learning_rate": 4.679054270342703e-06, |
| "loss": 0.2884, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.614141414141414, |
| "grad_norm": 2.9385459423065186, |
| "learning_rate": 4.649923161465567e-06, |
| "loss": 0.2422, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.6195286195286194, |
| "grad_norm": 2.9000279903411865, |
| "learning_rate": 4.620803989647373e-06, |
| "loss": 0.244, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.6249158249158249, |
| "grad_norm": 2.7263593673706055, |
| "learning_rate": 4.591697747805196e-06, |
| "loss": 0.2452, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.6303030303030304, |
| "grad_norm": 2.7036728858947754, |
| "learning_rate": 4.562605428415216e-06, |
| "loss": 0.2555, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.6356902356902356, |
| "grad_norm": 2.996410608291626, |
| "learning_rate": 4.533528023478888e-06, |
| "loss": 0.2212, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.641077441077441, |
| "grad_norm": 2.6675851345062256, |
| "learning_rate": 4.5044665244891e-06, |
| "loss": 0.2411, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.6464646464646466, |
| "grad_norm": 2.8888285160064697, |
| "learning_rate": 4.475421922396375e-06, |
| "loss": 0.2374, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.651851851851852, |
| "grad_norm": 2.5365850925445557, |
| "learning_rate": 4.446395207575081e-06, |
| "loss": 0.2443, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.6572390572390572, |
| "grad_norm": 2.7890241146087646, |
| "learning_rate": 4.417387369789652e-06, |
| "loss": 0.2219, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.6626262626262627, |
| "grad_norm": 3.0111935138702393, |
| "learning_rate": 4.388399398160857e-06, |
| "loss": 0.2528, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.6680134680134682, |
| "grad_norm": 2.897418260574341, |
| "learning_rate": 4.359432281132051e-06, |
| "loss": 0.2432, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.6734006734006734, |
| "grad_norm": 2.736621618270874, |
| "learning_rate": 4.330487006435485e-06, |
| "loss": 0.2381, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.6787878787878787, |
| "grad_norm": 2.9282073974609375, |
| "learning_rate": 4.301564561058618e-06, |
| "loss": 0.2405, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.6841750841750842, |
| "grad_norm": 2.8673527240753174, |
| "learning_rate": 4.272665931210472e-06, |
| "loss": 0.2638, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.6895622895622897, |
| "grad_norm": 3.049126148223877, |
| "learning_rate": 4.243792102287991e-06, |
| "loss": 0.2505, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.694949494949495, |
| "grad_norm": 3.018843173980713, |
| "learning_rate": 4.214944058842452e-06, |
| "loss": 0.262, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.7003367003367003, |
| "grad_norm": 3.1092209815979004, |
| "learning_rate": 4.186122784545885e-06, |
| "loss": 0.2784, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.7057239057239058, |
| "grad_norm": 3.118446111679077, |
| "learning_rate": 4.157329262157534e-06, |
| "loss": 0.2645, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.7111111111111112, |
| "grad_norm": 3.1034669876098633, |
| "learning_rate": 4.128564473490357e-06, |
| "loss": 0.244, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.7164983164983165, |
| "grad_norm": 3.1367504596710205, |
| "learning_rate": 4.099829399377524e-06, |
| "loss": 0.2526, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.7218855218855218, |
| "grad_norm": 3.1213414669036865, |
| "learning_rate": 4.071125019638998e-06, |
| "loss": 0.2603, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.7272727272727273, |
| "grad_norm": 2.7703611850738525, |
| "learning_rate": 4.0424523130481055e-06, |
| "loss": 0.2302, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.7326599326599328, |
| "grad_norm": 3.022610902786255, |
| "learning_rate": 4.013812257298175e-06, |
| "loss": 0.2637, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.738047138047138, |
| "grad_norm": 2.519594192504883, |
| "learning_rate": 3.985205828969191e-06, |
| "loss": 0.2235, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.7434343434343433, |
| "grad_norm": 2.90838360786438, |
| "learning_rate": 3.956634003494496e-06, |
| "loss": 0.2834, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.7488215488215488, |
| "grad_norm": 2.999645948410034, |
| "learning_rate": 3.9280977551275294e-06, |
| "loss": 0.2463, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.7542087542087543, |
| "grad_norm": 2.5574517250061035, |
| "learning_rate": 3.899598056908615e-06, |
| "loss": 0.2101, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.7595959595959596, |
| "grad_norm": 2.567458391189575, |
| "learning_rate": 3.871135880631769e-06, |
| "loss": 0.2576, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.7649831649831649, |
| "grad_norm": 2.833789110183716, |
| "learning_rate": 3.842712196811569e-06, |
| "loss": 0.2322, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.7703703703703704, |
| "grad_norm": 2.6010053157806396, |
| "learning_rate": 3.8143279746500665e-06, |
| "loss": 0.2227, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.7757575757575759, |
| "grad_norm": 2.8823626041412354, |
| "learning_rate": 3.785984182003728e-06, |
| "loss": 0.2646, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.7811447811447811, |
| "grad_norm": 2.988429546356201, |
| "learning_rate": 3.757681785350445e-06, |
| "loss": 0.2626, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.7865319865319864, |
| "grad_norm": 2.649637222290039, |
| "learning_rate": 3.729421749756564e-06, |
| "loss": 0.2145, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.791919191919192, |
| "grad_norm": 2.893730401992798, |
| "learning_rate": 3.701205038843997e-06, |
| "loss": 0.2727, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.7973063973063974, |
| "grad_norm": 2.917715549468994, |
| "learning_rate": 3.6730326147573425e-06, |
| "loss": 0.2281, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.8026936026936027, |
| "grad_norm": 2.8687551021575928, |
| "learning_rate": 3.6449054381311e-06, |
| "loss": 0.2531, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.808080808080808, |
| "grad_norm": 2.496572256088257, |
| "learning_rate": 3.616824468056896e-06, |
| "loss": 0.2227, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.8134680134680135, |
| "grad_norm": 3.0722904205322266, |
| "learning_rate": 3.5887906620507877e-06, |
| "loss": 0.2803, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.818855218855219, |
| "grad_norm": 2.8439204692840576, |
| "learning_rate": 3.5608049760206203e-06, |
| "loss": 0.2315, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.8242424242424242, |
| "grad_norm": 2.7868878841400146, |
| "learning_rate": 3.532868364233416e-06, |
| "loss": 0.2806, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.8296296296296295, |
| "grad_norm": 2.97046160697937, |
| "learning_rate": 3.504981779282852e-06, |
| "loss": 0.2521, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.835016835016835, |
| "grad_norm": 2.795283794403076, |
| "learning_rate": 3.4771461720567613e-06, |
| "loss": 0.2522, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.8404040404040405, |
| "grad_norm": 2.8273348808288574, |
| "learning_rate": 3.4493624917047284e-06, |
| "loss": 0.2462, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.8457912457912458, |
| "grad_norm": 2.7871997356414795, |
| "learning_rate": 3.4216316856057074e-06, |
| "loss": 0.2334, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.851178451178451, |
| "grad_norm": 2.754995346069336, |
| "learning_rate": 3.3939546993357297e-06, |
| "loss": 0.2533, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.8565656565656565, |
| "grad_norm": 2.96561336517334, |
| "learning_rate": 3.3663324766356524e-06, |
| "loss": 0.261, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.861952861952862, |
| "grad_norm": 2.781203269958496, |
| "learning_rate": 3.3387659593789845e-06, |
| "loss": 0.2312, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.8673400673400673, |
| "grad_norm": 2.9479804039001465, |
| "learning_rate": 3.3112560875397713e-06, |
| "loss": 0.2618, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.8727272727272726, |
| "grad_norm": 2.9663288593292236, |
| "learning_rate": 3.283803799160537e-06, |
| "loss": 0.2554, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.878114478114478, |
| "grad_norm": 2.95430064201355, |
| "learning_rate": 3.256410030320304e-06, |
| "loss": 0.2873, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.8835016835016836, |
| "grad_norm": 2.834928274154663, |
| "learning_rate": 3.2290757151026687e-06, |
| "loss": 0.2407, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.8888888888888888, |
| "grad_norm": 2.61153244972229, |
| "learning_rate": 3.2018017855639605e-06, |
| "loss": 0.2482, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.8942760942760941, |
| "grad_norm": 2.788770914077759, |
| "learning_rate": 3.1745891717014477e-06, |
| "loss": 0.224, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.8996632996632996, |
| "grad_norm": 2.7962043285369873, |
| "learning_rate": 3.147438801421638e-06, |
| "loss": 0.2526, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.905050505050505, |
| "grad_norm": 2.967076539993286, |
| "learning_rate": 3.1203516005086276e-06, |
| "loss": 0.2335, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.9104377104377104, |
| "grad_norm": 2.598158836364746, |
| "learning_rate": 3.093328492592539e-06, |
| "loss": 0.2127, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.9158249158249159, |
| "grad_norm": 2.835001230239868, |
| "learning_rate": 3.0663703991180318e-06, |
| "loss": 0.2273, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.9212121212121214, |
| "grad_norm": 2.9137284755706787, |
| "learning_rate": 3.0394782393128713e-06, |
| "loss": 0.2754, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.9265993265993266, |
| "grad_norm": 3.0207886695861816, |
| "learning_rate": 3.0126529301565945e-06, |
| "loss": 0.2449, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.931986531986532, |
| "grad_norm": 2.987816095352173, |
| "learning_rate": 2.9858953863492334e-06, |
| "loss": 0.2521, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.9373737373737374, |
| "grad_norm": 2.8369038105010986, |
| "learning_rate": 2.9592065202801374e-06, |
| "loss": 0.2383, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.942760942760943, |
| "grad_norm": 2.73996639251709, |
| "learning_rate": 2.9325872419968484e-06, |
| "loss": 0.2536, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.9481481481481482, |
| "grad_norm": 3.1415867805480957, |
| "learning_rate": 2.906038459174081e-06, |
| "loss": 0.2599, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.9535353535353535, |
| "grad_norm": 3.266170024871826, |
| "learning_rate": 2.879561077082764e-06, |
| "loss": 0.2544, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.958922558922559, |
| "grad_norm": 2.9058427810668945, |
| "learning_rate": 2.853155998559179e-06, |
| "loss": 0.244, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.9643097643097645, |
| "grad_norm": 2.8677961826324463, |
| "learning_rate": 2.826824123974171e-06, |
| "loss": 0.2192, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.9696969696969697, |
| "grad_norm": 3.0954580307006836, |
| "learning_rate": 2.800566351202443e-06, |
| "loss": 0.2538, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.975084175084175, |
| "grad_norm": 3.023210287094116, |
| "learning_rate": 2.774383575591956e-06, |
| "loss": 0.248, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.9804713804713805, |
| "grad_norm": 2.7636148929595947, |
| "learning_rate": 2.748276689933377e-06, |
| "loss": 0.2281, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.985858585858586, |
| "grad_norm": 2.7266335487365723, |
| "learning_rate": 2.722246584429652e-06, |
| "loss": 0.2492, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.9912457912457913, |
| "grad_norm": 2.8604986667633057, |
| "learning_rate": 2.6962941466656477e-06, |
| "loss": 0.2358, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.9966329966329965, |
| "grad_norm": 2.7491540908813477, |
| "learning_rate": 2.6704202615778844e-06, |
| "loss": 0.2366, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.0053872053872053, |
| "grad_norm": 5.093502998352051, |
| "learning_rate": 2.6446258114243633e-06, |
| "loss": 0.343, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.010774410774411, |
| "grad_norm": 1.904625415802002, |
| "learning_rate": 2.6189116757544765e-06, |
| "loss": 0.0965, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.0161616161616163, |
| "grad_norm": 1.87295663356781, |
| "learning_rate": 2.593278731379027e-06, |
| "loss": 0.1118, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.0215488215488215, |
| "grad_norm": 2.0098869800567627, |
| "learning_rate": 2.567727852340323e-06, |
| "loss": 0.0975, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.026936026936027, |
| "grad_norm": 1.6401960849761963, |
| "learning_rate": 2.542259909882374e-06, |
| "loss": 0.0918, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.0323232323232325, |
| "grad_norm": 1.9632785320281982, |
| "learning_rate": 2.51687577242119e-06, |
| "loss": 0.0885, |
| "step": 376 |
| }, |
| { |
| "epoch": 2.037710437710438, |
| "grad_norm": 1.801023006439209, |
| "learning_rate": 2.4915763055151615e-06, |
| "loss": 0.0849, |
| "step": 377 |
| }, |
| { |
| "epoch": 2.043097643097643, |
| "grad_norm": 1.8630132675170898, |
| "learning_rate": 2.4663623718355444e-06, |
| "loss": 0.0782, |
| "step": 378 |
| }, |
| { |
| "epoch": 2.0484848484848484, |
| "grad_norm": 1.9627724885940552, |
| "learning_rate": 2.4412348311370616e-06, |
| "loss": 0.0985, |
| "step": 379 |
| }, |
| { |
| "epoch": 2.053872053872054, |
| "grad_norm": 2.127228260040283, |
| "learning_rate": 2.416194540228559e-06, |
| "loss": 0.0885, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.0592592592592593, |
| "grad_norm": 2.282618284225464, |
| "learning_rate": 2.3912423529438145e-06, |
| "loss": 0.0705, |
| "step": 381 |
| }, |
| { |
| "epoch": 2.0646464646464646, |
| "grad_norm": 2.917990207672119, |
| "learning_rate": 2.3663791201124093e-06, |
| "loss": 0.0904, |
| "step": 382 |
| }, |
| { |
| "epoch": 2.07003367003367, |
| "grad_norm": 2.867617130279541, |
| "learning_rate": 2.341605689530723e-06, |
| "loss": 0.0766, |
| "step": 383 |
| }, |
| { |
| "epoch": 2.0754208754208756, |
| "grad_norm": 2.3559350967407227, |
| "learning_rate": 2.316922905933022e-06, |
| "loss": 0.0889, |
| "step": 384 |
| }, |
| { |
| "epoch": 2.080808080808081, |
| "grad_norm": 2.959153890609741, |
| "learning_rate": 2.292331610962649e-06, |
| "loss": 0.0759, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.086195286195286, |
| "grad_norm": 2.87480092048645, |
| "learning_rate": 2.2678326431433456e-06, |
| "loss": 0.0836, |
| "step": 386 |
| }, |
| { |
| "epoch": 2.0915824915824914, |
| "grad_norm": 2.830786943435669, |
| "learning_rate": 2.243426837850631e-06, |
| "loss": 0.1042, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.096969696969697, |
| "grad_norm": 2.9633374214172363, |
| "learning_rate": 2.219115027283339e-06, |
| "loss": 0.0958, |
| "step": 388 |
| }, |
| { |
| "epoch": 2.1023569023569024, |
| "grad_norm": 2.6659820079803467, |
| "learning_rate": 2.194898040435234e-06, |
| "loss": 0.0772, |
| "step": 389 |
| }, |
| { |
| "epoch": 2.1077441077441077, |
| "grad_norm": 2.3520843982696533, |
| "learning_rate": 2.17077670306674e-06, |
| "loss": 0.0564, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.113131313131313, |
| "grad_norm": 2.393596887588501, |
| "learning_rate": 2.146751837676794e-06, |
| "loss": 0.075, |
| "step": 391 |
| }, |
| { |
| "epoch": 2.1185185185185187, |
| "grad_norm": 2.7160770893096924, |
| "learning_rate": 2.122824263474784e-06, |
| "loss": 0.1021, |
| "step": 392 |
| }, |
| { |
| "epoch": 2.123905723905724, |
| "grad_norm": 2.5906686782836914, |
| "learning_rate": 2.098994796352629e-06, |
| "loss": 0.0886, |
| "step": 393 |
| }, |
| { |
| "epoch": 2.1292929292929292, |
| "grad_norm": 2.3228564262390137, |
| "learning_rate": 2.0752642488569557e-06, |
| "loss": 0.0807, |
| "step": 394 |
| }, |
| { |
| "epoch": 2.1346801346801345, |
| "grad_norm": 2.289416790008545, |
| "learning_rate": 2.0516334301613876e-06, |
| "loss": 0.0804, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.1400673400673402, |
| "grad_norm": 2.459120750427246, |
| "learning_rate": 2.028103146038958e-06, |
| "loss": 0.1073, |
| "step": 396 |
| }, |
| { |
| "epoch": 2.1454545454545455, |
| "grad_norm": 2.474850654602051, |
| "learning_rate": 2.004674198834631e-06, |
| "loss": 0.0746, |
| "step": 397 |
| }, |
| { |
| "epoch": 2.1508417508417508, |
| "grad_norm": 2.63972806930542, |
| "learning_rate": 1.98134738743794e-06, |
| "loss": 0.0754, |
| "step": 398 |
| }, |
| { |
| "epoch": 2.156228956228956, |
| "grad_norm": 2.22719407081604, |
| "learning_rate": 1.9581235072557618e-06, |
| "loss": 0.084, |
| "step": 399 |
| }, |
| { |
| "epoch": 2.1616161616161618, |
| "grad_norm": 2.08853816986084, |
| "learning_rate": 1.935003350185174e-06, |
| "loss": 0.0779, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.167003367003367, |
| "grad_norm": 1.9397152662277222, |
| "learning_rate": 1.911987704586466e-06, |
| "loss": 0.07, |
| "step": 401 |
| }, |
| { |
| "epoch": 2.1723905723905723, |
| "grad_norm": 1.917934775352478, |
| "learning_rate": 1.8890773552562564e-06, |
| "loss": 0.0725, |
| "step": 402 |
| }, |
| { |
| "epoch": 2.1777777777777776, |
| "grad_norm": 2.1869399547576904, |
| "learning_rate": 1.8662730834007204e-06, |
| "loss": 0.0745, |
| "step": 403 |
| }, |
| { |
| "epoch": 2.1831649831649833, |
| "grad_norm": 2.0088367462158203, |
| "learning_rate": 1.843575666608976e-06, |
| "loss": 0.091, |
| "step": 404 |
| }, |
| { |
| "epoch": 2.1885521885521886, |
| "grad_norm": 2.3277580738067627, |
| "learning_rate": 1.8209858788265411e-06, |
| "loss": 0.0605, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.193939393939394, |
| "grad_norm": 1.99192214012146, |
| "learning_rate": 1.7985044903289645e-06, |
| "loss": 0.0706, |
| "step": 406 |
| }, |
| { |
| "epoch": 2.199326599326599, |
| "grad_norm": 2.2638256549835205, |
| "learning_rate": 1.7761322676955505e-06, |
| "loss": 0.0728, |
| "step": 407 |
| }, |
| { |
| "epoch": 2.204713804713805, |
| "grad_norm": 2.2363462448120117, |
| "learning_rate": 1.7538699737832237e-06, |
| "loss": 0.0804, |
| "step": 408 |
| }, |
| { |
| "epoch": 2.21010101010101, |
| "grad_norm": 2.1804420948028564, |
| "learning_rate": 1.7317183677005173e-06, |
| "loss": 0.0882, |
| "step": 409 |
| }, |
| { |
| "epoch": 2.2154882154882154, |
| "grad_norm": 2.3650074005126953, |
| "learning_rate": 1.7096782047816806e-06, |
| "loss": 0.0784, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.2208754208754207, |
| "grad_norm": 2.209190845489502, |
| "learning_rate": 1.687750236560936e-06, |
| "loss": 0.087, |
| "step": 411 |
| }, |
| { |
| "epoch": 2.2262626262626264, |
| "grad_norm": 2.8381571769714355, |
| "learning_rate": 1.665935210746844e-06, |
| "loss": 0.0656, |
| "step": 412 |
| }, |
| { |
| "epoch": 2.2316498316498317, |
| "grad_norm": 2.1763696670532227, |
| "learning_rate": 1.6442338711968102e-06, |
| "loss": 0.0884, |
| "step": 413 |
| }, |
| { |
| "epoch": 2.237037037037037, |
| "grad_norm": 2.410494327545166, |
| "learning_rate": 1.622646957891722e-06, |
| "loss": 0.0702, |
| "step": 414 |
| }, |
| { |
| "epoch": 2.242424242424242, |
| "grad_norm": 2.365952491760254, |
| "learning_rate": 1.601175206910715e-06, |
| "loss": 0.0902, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.247811447811448, |
| "grad_norm": 2.4530627727508545, |
| "learning_rate": 1.5798193504060693e-06, |
| "loss": 0.0792, |
| "step": 416 |
| }, |
| { |
| "epoch": 2.253198653198653, |
| "grad_norm": 2.4529592990875244, |
| "learning_rate": 1.5585801165782606e-06, |
| "loss": 0.0863, |
| "step": 417 |
| }, |
| { |
| "epoch": 2.2585858585858585, |
| "grad_norm": 2.298218250274658, |
| "learning_rate": 1.5374582296511054e-06, |
| "loss": 0.0854, |
| "step": 418 |
| }, |
| { |
| "epoch": 2.263973063973064, |
| "grad_norm": 2.545762538909912, |
| "learning_rate": 1.5164544098470862e-06, |
| "loss": 0.0913, |
| "step": 419 |
| }, |
| { |
| "epoch": 2.2693602693602695, |
| "grad_norm": 2.3648526668548584, |
| "learning_rate": 1.4955693733627869e-06, |
| "loss": 0.0795, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.2747474747474747, |
| "grad_norm": 2.335575819015503, |
| "learning_rate": 1.474803832344463e-06, |
| "loss": 0.084, |
| "step": 421 |
| }, |
| { |
| "epoch": 2.28013468013468, |
| "grad_norm": 2.2477426528930664, |
| "learning_rate": 1.4541584948637777e-06, |
| "loss": 0.0876, |
| "step": 422 |
| }, |
| { |
| "epoch": 2.2855218855218853, |
| "grad_norm": 2.9558703899383545, |
| "learning_rate": 1.4336340648936342e-06, |
| "loss": 0.079, |
| "step": 423 |
| }, |
| { |
| "epoch": 2.290909090909091, |
| "grad_norm": 2.1282129287719727, |
| "learning_rate": 1.413231242284195e-06, |
| "loss": 0.0689, |
| "step": 424 |
| }, |
| { |
| "epoch": 2.2962962962962963, |
| "grad_norm": 2.1239535808563232, |
| "learning_rate": 1.3929507227389954e-06, |
| "loss": 0.0701, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.3016835016835016, |
| "grad_norm": 2.0963549613952637, |
| "learning_rate": 1.3727931977912406e-06, |
| "loss": 0.0758, |
| "step": 426 |
| }, |
| { |
| "epoch": 2.3070707070707073, |
| "grad_norm": 3.4831295013427734, |
| "learning_rate": 1.352759354780215e-06, |
| "loss": 0.086, |
| "step": 427 |
| }, |
| { |
| "epoch": 2.3124579124579125, |
| "grad_norm": 2.0869736671447754, |
| "learning_rate": 1.332849876827842e-06, |
| "loss": 0.072, |
| "step": 428 |
| }, |
| { |
| "epoch": 2.317845117845118, |
| "grad_norm": 2.1851084232330322, |
| "learning_rate": 1.3130654428154066e-06, |
| "loss": 0.0644, |
| "step": 429 |
| }, |
| { |
| "epoch": 2.323232323232323, |
| "grad_norm": 1.7817176580429077, |
| "learning_rate": 1.2934067273603855e-06, |
| "loss": 0.0522, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.328619528619529, |
| "grad_norm": 2.0074706077575684, |
| "learning_rate": 1.2738744007934595e-06, |
| "loss": 0.0744, |
| "step": 431 |
| }, |
| { |
| "epoch": 2.334006734006734, |
| "grad_norm": 2.3214468955993652, |
| "learning_rate": 1.2544691291356497e-06, |
| "loss": 0.0759, |
| "step": 432 |
| }, |
| { |
| "epoch": 2.3393939393939394, |
| "grad_norm": 2.294804096221924, |
| "learning_rate": 1.2351915740756087e-06, |
| "loss": 0.068, |
| "step": 433 |
| }, |
| { |
| "epoch": 2.3447811447811446, |
| "grad_norm": 2.0611894130706787, |
| "learning_rate": 1.2160423929470584e-06, |
| "loss": 0.0667, |
| "step": 434 |
| }, |
| { |
| "epoch": 2.3501683501683504, |
| "grad_norm": 2.080531120300293, |
| "learning_rate": 1.1970222387063756e-06, |
| "loss": 0.0749, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.3555555555555556, |
| "grad_norm": 2.0696070194244385, |
| "learning_rate": 1.1781317599103238e-06, |
| "loss": 0.0773, |
| "step": 436 |
| }, |
| { |
| "epoch": 2.360942760942761, |
| "grad_norm": 2.34531569480896, |
| "learning_rate": 1.1593716006939455e-06, |
| "loss": 0.0752, |
| "step": 437 |
| }, |
| { |
| "epoch": 2.366329966329966, |
| "grad_norm": 2.6101057529449463, |
| "learning_rate": 1.140742400748593e-06, |
| "loss": 0.0605, |
| "step": 438 |
| }, |
| { |
| "epoch": 2.371717171717172, |
| "grad_norm": 2.1780221462249756, |
| "learning_rate": 1.1222447953001182e-06, |
| "loss": 0.0638, |
| "step": 439 |
| }, |
| { |
| "epoch": 2.377104377104377, |
| "grad_norm": 2.247965097427368, |
| "learning_rate": 1.1038794150872117e-06, |
| "loss": 0.0714, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.3824915824915824, |
| "grad_norm": 1.9487817287445068, |
| "learning_rate": 1.0856468863398917e-06, |
| "loss": 0.0654, |
| "step": 441 |
| }, |
| { |
| "epoch": 2.3878787878787877, |
| "grad_norm": 2.285243272781372, |
| "learning_rate": 1.0675478307581627e-06, |
| "loss": 0.0706, |
| "step": 442 |
| }, |
| { |
| "epoch": 2.3932659932659934, |
| "grad_norm": 2.08785343170166, |
| "learning_rate": 1.0495828654907991e-06, |
| "loss": 0.0828, |
| "step": 443 |
| }, |
| { |
| "epoch": 2.3986531986531987, |
| "grad_norm": 2.6061668395996094, |
| "learning_rate": 1.0317526031143161e-06, |
| "loss": 0.06, |
| "step": 444 |
| }, |
| { |
| "epoch": 2.404040404040404, |
| "grad_norm": 1.9994468688964844, |
| "learning_rate": 1.014057651612076e-06, |
| "loss": 0.0678, |
| "step": 445 |
| }, |
| { |
| "epoch": 2.4094276094276093, |
| "grad_norm": 2.335872173309326, |
| "learning_rate": 9.964986143535515e-07, |
| "loss": 0.0696, |
| "step": 446 |
| }, |
| { |
| "epoch": 2.414814814814815, |
| "grad_norm": 2.4777722358703613, |
| "learning_rate": 9.790760900737683e-07, |
| "loss": 0.0651, |
| "step": 447 |
| }, |
| { |
| "epoch": 2.4202020202020202, |
| "grad_norm": 2.2628719806671143, |
| "learning_rate": 9.61790672852868e-07, |
| "loss": 0.0789, |
| "step": 448 |
| }, |
| { |
| "epoch": 2.4255892255892255, |
| "grad_norm": 2.406503677368164, |
| "learning_rate": 9.446429520958666e-07, |
| "loss": 0.0812, |
| "step": 449 |
| }, |
| { |
| "epoch": 2.430976430976431, |
| "grad_norm": 2.8397791385650635, |
| "learning_rate": 9.276335125125502e-07, |
| "loss": 0.0678, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.4363636363636365, |
| "grad_norm": 2.485055923461914, |
| "learning_rate": 9.107629340975388e-07, |
| "loss": 0.0619, |
| "step": 451 |
| }, |
| { |
| "epoch": 2.441750841750842, |
| "grad_norm": 2.066659927368164, |
| "learning_rate": 8.940317921105085e-07, |
| "loss": 0.0611, |
| "step": 452 |
| }, |
| { |
| "epoch": 2.447138047138047, |
| "grad_norm": 2.2130823135375977, |
| "learning_rate": 8.774406570565791e-07, |
| "loss": 0.0674, |
| "step": 453 |
| }, |
| { |
| "epoch": 2.4525252525252528, |
| "grad_norm": 2.1492106914520264, |
| "learning_rate": 8.609900946668536e-07, |
| "loss": 0.0744, |
| "step": 454 |
| }, |
| { |
| "epoch": 2.457912457912458, |
| "grad_norm": 2.2511839866638184, |
| "learning_rate": 8.446806658791373e-07, |
| "loss": 0.0689, |
| "step": 455 |
| }, |
| { |
| "epoch": 2.4632996632996633, |
| "grad_norm": 2.078249454498291, |
| "learning_rate": 8.285129268188042e-07, |
| "loss": 0.0726, |
| "step": 456 |
| }, |
| { |
| "epoch": 2.4686868686868686, |
| "grad_norm": 2.2379488945007324, |
| "learning_rate": 8.124874287798352e-07, |
| "loss": 0.0748, |
| "step": 457 |
| }, |
| { |
| "epoch": 2.474074074074074, |
| "grad_norm": 2.272982120513916, |
| "learning_rate": 7.966047182060226e-07, |
| "loss": 0.0549, |
| "step": 458 |
| }, |
| { |
| "epoch": 2.4794612794612796, |
| "grad_norm": 1.9955648183822632, |
| "learning_rate": 7.808653366723296e-07, |
| "loss": 0.0603, |
| "step": 459 |
| }, |
| { |
| "epoch": 2.484848484848485, |
| "grad_norm": 1.8981883525848389, |
| "learning_rate": 7.652698208664377e-07, |
| "loss": 0.0675, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.49023569023569, |
| "grad_norm": 2.4488866329193115, |
| "learning_rate": 7.498187025704296e-07, |
| "loss": 0.0768, |
| "step": 461 |
| }, |
| { |
| "epoch": 2.495622895622896, |
| "grad_norm": 2.1295886039733887, |
| "learning_rate": 7.345125086426675e-07, |
| "loss": 0.0662, |
| "step": 462 |
| }, |
| { |
| "epoch": 2.501010101010101, |
| "grad_norm": 2.2743725776672363, |
| "learning_rate": 7.193517609998263e-07, |
| "loss": 0.0686, |
| "step": 463 |
| }, |
| { |
| "epoch": 2.5063973063973064, |
| "grad_norm": 2.235623836517334, |
| "learning_rate": 7.043369765990943e-07, |
| "loss": 0.0615, |
| "step": 464 |
| }, |
| { |
| "epoch": 2.5117845117845117, |
| "grad_norm": 2.076993942260742, |
| "learning_rate": 6.894686674205481e-07, |
| "loss": 0.0803, |
| "step": 465 |
| }, |
| { |
| "epoch": 2.517171717171717, |
| "grad_norm": 2.2475011348724365, |
| "learning_rate": 6.747473404496902e-07, |
| "loss": 0.0851, |
| "step": 466 |
| }, |
| { |
| "epoch": 2.5225589225589227, |
| "grad_norm": 2.5577120780944824, |
| "learning_rate": 6.601734976601737e-07, |
| "loss": 0.0735, |
| "step": 467 |
| }, |
| { |
| "epoch": 2.527946127946128, |
| "grad_norm": 2.3084797859191895, |
| "learning_rate": 6.457476359966685e-07, |
| "loss": 0.0724, |
| "step": 468 |
| }, |
| { |
| "epoch": 2.533333333333333, |
| "grad_norm": 2.051790237426758, |
| "learning_rate": 6.314702473579309e-07, |
| "loss": 0.0851, |
| "step": 469 |
| }, |
| { |
| "epoch": 2.538720538720539, |
| "grad_norm": 2.8228673934936523, |
| "learning_rate": 6.17341818580024e-07, |
| "loss": 0.0715, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.544107744107744, |
| "grad_norm": 2.070128917694092, |
| "learning_rate": 6.033628314197176e-07, |
| "loss": 0.0615, |
| "step": 471 |
| }, |
| { |
| "epoch": 2.5494949494949495, |
| "grad_norm": 2.154543876647949, |
| "learning_rate": 5.895337625380632e-07, |
| "loss": 0.0646, |
| "step": 472 |
| }, |
| { |
| "epoch": 2.5548821548821548, |
| "grad_norm": 1.9985536336898804, |
| "learning_rate": 5.758550834841381e-07, |
| "loss": 0.0574, |
| "step": 473 |
| }, |
| { |
| "epoch": 2.56026936026936, |
| "grad_norm": 2.2103183269500732, |
| "learning_rate": 5.62327260678967e-07, |
| "loss": 0.0694, |
| "step": 474 |
| }, |
| { |
| "epoch": 2.5656565656565657, |
| "grad_norm": 2.3436076641082764, |
| "learning_rate": 5.489507553996204e-07, |
| "loss": 0.065, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.571043771043771, |
| "grad_norm": 2.371115207672119, |
| "learning_rate": 5.357260237634826e-07, |
| "loss": 0.0804, |
| "step": 476 |
| }, |
| { |
| "epoch": 2.5764309764309763, |
| "grad_norm": 2.1717820167541504, |
| "learning_rate": 5.226535167127e-07, |
| "loss": 0.0744, |
| "step": 477 |
| }, |
| { |
| "epoch": 2.581818181818182, |
| "grad_norm": 2.0997424125671387, |
| "learning_rate": 5.097336799988067e-07, |
| "loss": 0.0582, |
| "step": 478 |
| }, |
| { |
| "epoch": 2.5872053872053873, |
| "grad_norm": 1.9539695978164673, |
| "learning_rate": 4.96966954167517e-07, |
| "loss": 0.0843, |
| "step": 479 |
| }, |
| { |
| "epoch": 2.5925925925925926, |
| "grad_norm": 2.401609182357788, |
| "learning_rate": 4.843537745437188e-07, |
| "loss": 0.0628, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.597979797979798, |
| "grad_norm": 2.3277831077575684, |
| "learning_rate": 4.718945712166123e-07, |
| "loss": 0.0904, |
| "step": 481 |
| }, |
| { |
| "epoch": 2.603367003367003, |
| "grad_norm": 2.537806510925293, |
| "learning_rate": 4.595897690250567e-07, |
| "loss": 0.0653, |
| "step": 482 |
| }, |
| { |
| "epoch": 2.608754208754209, |
| "grad_norm": 2.5211031436920166, |
| "learning_rate": 4.4743978754308027e-07, |
| "loss": 0.0762, |
| "step": 483 |
| }, |
| { |
| "epoch": 2.614141414141414, |
| "grad_norm": 2.538830280303955, |
| "learning_rate": 4.3544504106557026e-07, |
| "loss": 0.0722, |
| "step": 484 |
| }, |
| { |
| "epoch": 2.6195286195286194, |
| "grad_norm": 2.389099597930908, |
| "learning_rate": 4.2360593859415433e-07, |
| "loss": 0.0669, |
| "step": 485 |
| }, |
| { |
| "epoch": 2.624915824915825, |
| "grad_norm": 2.186370372772217, |
| "learning_rate": 4.1192288382324363e-07, |
| "loss": 0.0719, |
| "step": 486 |
| }, |
| { |
| "epoch": 2.6303030303030304, |
| "grad_norm": 2.426302909851074, |
| "learning_rate": 4.003962751262763e-07, |
| "loss": 0.065, |
| "step": 487 |
| }, |
| { |
| "epoch": 2.6356902356902356, |
| "grad_norm": 2.080082893371582, |
| "learning_rate": 3.890265055421283e-07, |
| "loss": 0.0677, |
| "step": 488 |
| }, |
| { |
| "epoch": 2.641077441077441, |
| "grad_norm": 2.4764468669891357, |
| "learning_rate": 3.77813962761715e-07, |
| "loss": 0.0775, |
| "step": 489 |
| }, |
| { |
| "epoch": 2.6464646464646466, |
| "grad_norm": 2.2122390270233154, |
| "learning_rate": 3.6675902911476937e-07, |
| "loss": 0.0754, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.651851851851852, |
| "grad_norm": 2.6265482902526855, |
| "learning_rate": 3.558620815568048e-07, |
| "loss": 0.0631, |
| "step": 491 |
| }, |
| { |
| "epoch": 2.657239057239057, |
| "grad_norm": 2.3554742336273193, |
| "learning_rate": 3.451234916562618e-07, |
| "loss": 0.0653, |
| "step": 492 |
| }, |
| { |
| "epoch": 2.6626262626262625, |
| "grad_norm": 2.077880382537842, |
| "learning_rate": 3.3454362558184075e-07, |
| "loss": 0.0749, |
| "step": 493 |
| }, |
| { |
| "epoch": 2.668013468013468, |
| "grad_norm": 2.258436918258667, |
| "learning_rate": 3.241228440900124e-07, |
| "loss": 0.067, |
| "step": 494 |
| }, |
| { |
| "epoch": 2.6734006734006734, |
| "grad_norm": 2.1589324474334717, |
| "learning_rate": 3.1386150251271897e-07, |
| "loss": 0.0814, |
| "step": 495 |
| }, |
| { |
| "epoch": 2.6787878787878787, |
| "grad_norm": 2.316006898880005, |
| "learning_rate": 3.0375995074525764e-07, |
| "loss": 0.0624, |
| "step": 496 |
| }, |
| { |
| "epoch": 2.6841750841750844, |
| "grad_norm": 2.2028238773345947, |
| "learning_rate": 2.9381853323434627e-07, |
| "loss": 0.0583, |
| "step": 497 |
| }, |
| { |
| "epoch": 2.6895622895622897, |
| "grad_norm": 2.372264862060547, |
| "learning_rate": 2.840375889663871e-07, |
| "loss": 0.0638, |
| "step": 498 |
| }, |
| { |
| "epoch": 2.694949494949495, |
| "grad_norm": 2.3102543354034424, |
| "learning_rate": 2.744174514558956e-07, |
| "loss": 0.0601, |
| "step": 499 |
| }, |
| { |
| "epoch": 2.7003367003367003, |
| "grad_norm": 2.3564910888671875, |
| "learning_rate": 2.6495844873413944e-07, |
| "loss": 0.0721, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.7057239057239055, |
| "grad_norm": 2.442258834838867, |
| "learning_rate": 2.556609033379459e-07, |
| "loss": 0.0616, |
| "step": 501 |
| }, |
| { |
| "epoch": 2.7111111111111112, |
| "grad_norm": 2.313163995742798, |
| "learning_rate": 2.465251322987061e-07, |
| "loss": 0.0634, |
| "step": 502 |
| }, |
| { |
| "epoch": 2.7164983164983165, |
| "grad_norm": 2.4522969722747803, |
| "learning_rate": 2.3755144713156819e-07, |
| "loss": 0.0613, |
| "step": 503 |
| }, |
| { |
| "epoch": 2.721885521885522, |
| "grad_norm": 2.2570788860321045, |
| "learning_rate": 2.287401538248074e-07, |
| "loss": 0.0737, |
| "step": 504 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 2.2716591358184814, |
| "learning_rate": 2.20091552829399e-07, |
| "loss": 0.0639, |
| "step": 505 |
| }, |
| { |
| "epoch": 2.732659932659933, |
| "grad_norm": 2.105753183364868, |
| "learning_rate": 2.1160593904877236e-07, |
| "loss": 0.0625, |
| "step": 506 |
| }, |
| { |
| "epoch": 2.738047138047138, |
| "grad_norm": 2.383596658706665, |
| "learning_rate": 2.0328360182875262e-07, |
| "loss": 0.0682, |
| "step": 507 |
| }, |
| { |
| "epoch": 2.7434343434343433, |
| "grad_norm": 2.4483511447906494, |
| "learning_rate": 1.9512482494769613e-07, |
| "loss": 0.0649, |
| "step": 508 |
| }, |
| { |
| "epoch": 2.7488215488215486, |
| "grad_norm": 2.1391537189483643, |
| "learning_rate": 1.8712988660681498e-07, |
| "loss": 0.0704, |
| "step": 509 |
| }, |
| { |
| "epoch": 2.7542087542087543, |
| "grad_norm": 2.9412190914154053, |
| "learning_rate": 1.7929905942068836e-07, |
| "loss": 0.0717, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.7595959595959596, |
| "grad_norm": 2.366955280303955, |
| "learning_rate": 1.7163261040796797e-07, |
| "loss": 0.0645, |
| "step": 511 |
| }, |
| { |
| "epoch": 2.764983164983165, |
| "grad_norm": 2.511876344680786, |
| "learning_rate": 1.6413080098227562e-07, |
| "loss": 0.0762, |
| "step": 512 |
| }, |
| { |
| "epoch": 2.7703703703703706, |
| "grad_norm": 2.14850115776062, |
| "learning_rate": 1.5679388694328446e-07, |
| "loss": 0.0613, |
| "step": 513 |
| }, |
| { |
| "epoch": 2.775757575757576, |
| "grad_norm": 2.2042980194091797, |
| "learning_rate": 1.4962211846800078e-07, |
| "loss": 0.0648, |
| "step": 514 |
| }, |
| { |
| "epoch": 2.781144781144781, |
| "grad_norm": 2.243152379989624, |
| "learning_rate": 1.426157401022321e-07, |
| "loss": 0.0769, |
| "step": 515 |
| }, |
| { |
| "epoch": 2.7865319865319864, |
| "grad_norm": 2.4439616203308105, |
| "learning_rate": 1.3577499075224821e-07, |
| "loss": 0.0726, |
| "step": 516 |
| }, |
| { |
| "epoch": 2.7919191919191917, |
| "grad_norm": 2.2987587451934814, |
| "learning_rate": 1.2910010367663317e-07, |
| "loss": 0.0665, |
| "step": 517 |
| }, |
| { |
| "epoch": 2.7973063973063974, |
| "grad_norm": 2.111358642578125, |
| "learning_rate": 1.2259130647833627e-07, |
| "loss": 0.0523, |
| "step": 518 |
| }, |
| { |
| "epoch": 2.8026936026936027, |
| "grad_norm": 2.131275177001953, |
| "learning_rate": 1.162488210969065e-07, |
| "loss": 0.0687, |
| "step": 519 |
| }, |
| { |
| "epoch": 2.808080808080808, |
| "grad_norm": 2.1112232208251953, |
| "learning_rate": 1.100728638009263e-07, |
| "loss": 0.0603, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.8134680134680137, |
| "grad_norm": 2.212636947631836, |
| "learning_rate": 1.0406364518063927e-07, |
| "loss": 0.0565, |
| "step": 521 |
| }, |
| { |
| "epoch": 2.818855218855219, |
| "grad_norm": 2.0088913440704346, |
| "learning_rate": 9.822137014076472e-08, |
| "loss": 0.0597, |
| "step": 522 |
| }, |
| { |
| "epoch": 2.824242424242424, |
| "grad_norm": 2.1878387928009033, |
| "learning_rate": 9.254623789351714e-08, |
| "loss": 0.0751, |
| "step": 523 |
| }, |
| { |
| "epoch": 2.8296296296296295, |
| "grad_norm": 2.465935230255127, |
| "learning_rate": 8.703844195180555e-08, |
| "loss": 0.0753, |
| "step": 524 |
| }, |
| { |
| "epoch": 2.8350168350168348, |
| "grad_norm": 2.2098045349121094, |
| "learning_rate": 8.169817012264214e-08, |
| "loss": 0.0586, |
| "step": 525 |
| }, |
| { |
| "epoch": 2.8404040404040405, |
| "grad_norm": 2.3172450065612793, |
| "learning_rate": 7.652560450073454e-08, |
| "loss": 0.0639, |
| "step": 526 |
| }, |
| { |
| "epoch": 2.8457912457912458, |
| "grad_norm": 1.8119255304336548, |
| "learning_rate": 7.152092146227806e-08, |
| "loss": 0.0762, |
| "step": 527 |
| }, |
| { |
| "epoch": 2.851178451178451, |
| "grad_norm": 2.9995367527008057, |
| "learning_rate": 6.668429165893996e-08, |
| "loss": 0.0802, |
| "step": 528 |
| }, |
| { |
| "epoch": 2.8565656565656568, |
| "grad_norm": 2.5341761112213135, |
| "learning_rate": 6.20158800120435e-08, |
| "loss": 0.0751, |
| "step": 529 |
| }, |
| { |
| "epoch": 2.861952861952862, |
| "grad_norm": 2.48641300201416, |
| "learning_rate": 5.7515845706940246e-08, |
| "loss": 0.0678, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.8673400673400673, |
| "grad_norm": 2.129096746444702, |
| "learning_rate": 5.31843421875855e-08, |
| "loss": 0.057, |
| "step": 531 |
| }, |
| { |
| "epoch": 2.8727272727272726, |
| "grad_norm": 1.9396432638168335, |
| "learning_rate": 4.9021517151305875e-08, |
| "loss": 0.0492, |
| "step": 532 |
| }, |
| { |
| "epoch": 2.878114478114478, |
| "grad_norm": 2.2072770595550537, |
| "learning_rate": 4.502751254375992e-08, |
| "loss": 0.0734, |
| "step": 533 |
| }, |
| { |
| "epoch": 2.8835016835016836, |
| "grad_norm": 2.1861319541931152, |
| "learning_rate": 4.120246455410204e-08, |
| "loss": 0.0537, |
| "step": 534 |
| }, |
| { |
| "epoch": 2.888888888888889, |
| "grad_norm": 2.1539671421051025, |
| "learning_rate": 3.7546503610336183e-08, |
| "loss": 0.0496, |
| "step": 535 |
| }, |
| { |
| "epoch": 2.894276094276094, |
| "grad_norm": 1.8679490089416504, |
| "learning_rate": 3.405975437486997e-08, |
| "loss": 0.0702, |
| "step": 536 |
| }, |
| { |
| "epoch": 2.8996632996633, |
| "grad_norm": 2.585775375366211, |
| "learning_rate": 3.074233574026087e-08, |
| "loss": 0.0626, |
| "step": 537 |
| }, |
| { |
| "epoch": 2.905050505050505, |
| "grad_norm": 2.1468751430511475, |
| "learning_rate": 2.7594360825166644e-08, |
| "loss": 0.0575, |
| "step": 538 |
| }, |
| { |
| "epoch": 2.9104377104377104, |
| "grad_norm": 2.1872782707214355, |
| "learning_rate": 2.4615936970485144e-08, |
| "loss": 0.0712, |
| "step": 539 |
| }, |
| { |
| "epoch": 2.915824915824916, |
| "grad_norm": 2.4800572395324707, |
| "learning_rate": 2.180716573569386e-08, |
| "loss": 0.0646, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.9212121212121214, |
| "grad_norm": 2.528630495071411, |
| "learning_rate": 1.9168142895389376e-08, |
| "loss": 0.075, |
| "step": 541 |
| }, |
| { |
| "epoch": 2.9265993265993266, |
| "grad_norm": 2.408411741256714, |
| "learning_rate": 1.6698958436019986e-08, |
| "loss": 0.0717, |
| "step": 542 |
| }, |
| { |
| "epoch": 2.931986531986532, |
| "grad_norm": 2.5206246376037598, |
| "learning_rate": 1.4399696552816477e-08, |
| "loss": 0.088, |
| "step": 543 |
| }, |
| { |
| "epoch": 2.937373737373737, |
| "grad_norm": 2.5237998962402344, |
| "learning_rate": 1.2270435646922763e-08, |
| "loss": 0.0667, |
| "step": 544 |
| }, |
| { |
| "epoch": 2.942760942760943, |
| "grad_norm": 2.4456536769866943, |
| "learning_rate": 1.031124832272301e-08, |
| "loss": 0.0484, |
| "step": 545 |
| }, |
| { |
| "epoch": 2.948148148148148, |
| "grad_norm": 2.1876487731933594, |
| "learning_rate": 8.522201385362528e-09, |
| "loss": 0.0683, |
| "step": 546 |
| }, |
| { |
| "epoch": 2.9535353535353535, |
| "grad_norm": 2.1443562507629395, |
| "learning_rate": 6.903355838475123e-09, |
| "loss": 0.0688, |
| "step": 547 |
| }, |
| { |
| "epoch": 2.958922558922559, |
| "grad_norm": 2.5573830604553223, |
| "learning_rate": 5.454766882097007e-09, |
| "loss": 0.0497, |
| "step": 548 |
| }, |
| { |
| "epoch": 2.9643097643097645, |
| "grad_norm": 2.2723801136016846, |
| "learning_rate": 4.1764839107905074e-09, |
| "loss": 0.0777, |
| "step": 549 |
| }, |
| { |
| "epoch": 2.9696969696969697, |
| "grad_norm": 2.2643778324127197, |
| "learning_rate": 3.068550511955426e-09, |
| "loss": 0.0725, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.975084175084175, |
| "grad_norm": 2.395113945007324, |
| "learning_rate": 2.131004464343556e-09, |
| "loss": 0.0715, |
| "step": 551 |
| }, |
| { |
| "epoch": 2.9804713804713803, |
| "grad_norm": 2.2361373901367188, |
| "learning_rate": 1.3638777367724898e-09, |
| "loss": 0.0843, |
| "step": 552 |
| }, |
| { |
| "epoch": 2.985858585858586, |
| "grad_norm": 2.993990182876587, |
| "learning_rate": 7.671964870337168e-10, |
| "loss": 0.0732, |
| "step": 553 |
| }, |
| { |
| "epoch": 2.9912457912457913, |
| "grad_norm": 2.1068203449249268, |
| "learning_rate": 3.4098106100166616e-10, |
| "loss": 0.0671, |
| "step": 554 |
| }, |
| { |
| "epoch": 2.9966329966329965, |
| "grad_norm": 2.407553195953369, |
| "learning_rate": 8.52459919381543e-11, |
| "loss": 0.0774, |
| "step": 555 |
| }, |
| { |
| "epoch": 2.9966329966329965, |
| "step": 555, |
| "total_flos": 9.477952550322831e+17, |
| "train_loss": 0.30530234318193017, |
| "train_runtime": 3941.2467, |
| "train_samples_per_second": 4.521, |
| "train_steps_per_second": 0.141 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 555, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.477952550322831e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |