{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 474,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004232804232804233,
      "grad_norm": 43.72717354138711,
      "learning_rate": 0.0,
      "loss": 1.6554,
      "step": 1
    },
    {
      "epoch": 0.008465608465608466,
      "grad_norm": 45.20956747041741,
      "learning_rate": 2.0833333333333333e-07,
      "loss": 1.6613,
      "step": 2
    },
    {
      "epoch": 0.012698412698412698,
      "grad_norm": 43.537183963199226,
      "learning_rate": 4.1666666666666667e-07,
      "loss": 1.5809,
      "step": 3
    },
    {
      "epoch": 0.016931216931216932,
      "grad_norm": 44.16788560508001,
      "learning_rate": 6.25e-07,
      "loss": 1.5904,
      "step": 4
    },
    {
      "epoch": 0.021164021164021163,
      "grad_norm": 46.192802343401844,
      "learning_rate": 8.333333333333333e-07,
      "loss": 1.7731,
      "step": 5
    },
    {
      "epoch": 0.025396825396825397,
      "grad_norm": 41.283893260891254,
      "learning_rate": 1.0416666666666667e-06,
      "loss": 1.3879,
      "step": 6
    },
    {
      "epoch": 0.02962962962962963,
      "grad_norm": 41.031222170070116,
      "learning_rate": 1.25e-06,
      "loss": 1.379,
      "step": 7
    },
    {
      "epoch": 0.033862433862433865,
      "grad_norm": 36.224662378650926,
      "learning_rate": 1.4583333333333335e-06,
      "loss": 1.2278,
      "step": 8
    },
    {
      "epoch": 0.0380952380952381,
      "grad_norm": 34.90626982037512,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.1593,
      "step": 9
    },
    {
      "epoch": 0.042328042328042326,
      "grad_norm": 33.784429793622,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.0654,
      "step": 10
    },
    {
      "epoch": 0.04656084656084656,
      "grad_norm": 15.61619058454679,
      "learning_rate": 2.0833333333333334e-06,
      "loss": 0.6672,
      "step": 11
    },
    {
      "epoch": 0.050793650793650794,
      "grad_norm": 16.519814951576077,
      "learning_rate": 2.2916666666666666e-06,
      "loss": 0.7885,
      "step": 12
    },
    {
      "epoch": 0.05502645502645503,
      "grad_norm": 13.171010801124316,
      "learning_rate": 2.5e-06,
      "loss": 0.71,
      "step": 13
    },
    {
      "epoch": 0.05925925925925926,
      "grad_norm": 10.014105616857696,
      "learning_rate": 2.7083333333333334e-06,
      "loss": 0.5411,
      "step": 14
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 5.737900509202217,
      "learning_rate": 2.916666666666667e-06,
      "loss": 0.4377,
      "step": 15
    },
    {
      "epoch": 0.06772486772486773,
      "grad_norm": 5.771623449687533,
      "learning_rate": 3.125e-06,
      "loss": 0.5075,
      "step": 16
    },
    {
      "epoch": 0.07195767195767196,
      "grad_norm": 5.175193297939713,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.4612,
      "step": 17
    },
    {
      "epoch": 0.0761904761904762,
      "grad_norm": 4.172409316471575,
      "learning_rate": 3.5416666666666673e-06,
      "loss": 0.4509,
      "step": 18
    },
    {
      "epoch": 0.08042328042328042,
      "grad_norm": 3.0944559263366096,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.4225,
      "step": 19
    },
    {
      "epoch": 0.08465608465608465,
      "grad_norm": 2.9291701549743854,
      "learning_rate": 3.958333333333333e-06,
      "loss": 0.4555,
      "step": 20
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 4.3100971907175305,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.3314,
      "step": 21
    },
    {
      "epoch": 0.09312169312169312,
      "grad_norm": 3.283513340198027,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 0.3266,
      "step": 22
    },
    {
      "epoch": 0.09735449735449736,
      "grad_norm": 2.581455776581018,
      "learning_rate": 4.583333333333333e-06,
      "loss": 0.3687,
      "step": 23
    },
    {
      "epoch": 0.10158730158730159,
      "grad_norm": 2.9154599797832703,
      "learning_rate": 4.791666666666668e-06,
      "loss": 0.3703,
      "step": 24
    },
    {
      "epoch": 0.10582010582010581,
      "grad_norm": 3.1934063229246834,
      "learning_rate": 5e-06,
      "loss": 0.3491,
      "step": 25
    },
    {
      "epoch": 0.11005291005291006,
      "grad_norm": 2.959988508176276,
      "learning_rate": 5.208333333333334e-06,
      "loss": 0.3126,
      "step": 26
    },
    {
      "epoch": 0.11428571428571428,
      "grad_norm": 2.507296053358847,
      "learning_rate": 5.416666666666667e-06,
      "loss": 0.3653,
      "step": 27
    },
    {
      "epoch": 0.11851851851851852,
      "grad_norm": 2.1595052302076323,
      "learning_rate": 5.625e-06,
      "loss": 0.2977,
      "step": 28
    },
    {
      "epoch": 0.12275132275132275,
      "grad_norm": 2.55785433661827,
      "learning_rate": 5.833333333333334e-06,
      "loss": 0.3175,
      "step": 29
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 2.461287213627725,
      "learning_rate": 6.041666666666667e-06,
      "loss": 0.2476,
      "step": 30
    },
    {
      "epoch": 0.1312169312169312,
      "grad_norm": 2.7466343812327136,
      "learning_rate": 6.25e-06,
      "loss": 0.2898,
      "step": 31
    },
    {
      "epoch": 0.13544973544973546,
      "grad_norm": 2.6506565239032276,
      "learning_rate": 6.458333333333334e-06,
      "loss": 0.2668,
      "step": 32
    },
    {
      "epoch": 0.13968253968253969,
      "grad_norm": 2.445226241184337,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.2737,
      "step": 33
    },
    {
      "epoch": 0.1439153439153439,
      "grad_norm": 1.7323266471999959,
      "learning_rate": 6.875e-06,
      "loss": 0.2488,
      "step": 34
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 2.8002485372037227,
      "learning_rate": 7.083333333333335e-06,
      "loss": 0.2553,
      "step": 35
    },
    {
      "epoch": 0.1523809523809524,
      "grad_norm": 2.5749122065394436,
      "learning_rate": 7.291666666666667e-06,
      "loss": 0.1856,
      "step": 36
    },
    {
      "epoch": 0.15661375661375662,
      "grad_norm": 2.0107150492116785,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.2574,
      "step": 37
    },
    {
      "epoch": 0.16084656084656085,
      "grad_norm": 2.07586369622188,
      "learning_rate": 7.708333333333334e-06,
      "loss": 0.2342,
      "step": 38
    },
    {
      "epoch": 0.16507936507936508,
      "grad_norm": 2.650702366970764,
      "learning_rate": 7.916666666666667e-06,
      "loss": 0.2513,
      "step": 39
    },
    {
      "epoch": 0.1693121693121693,
      "grad_norm": 2.4643245144932457,
      "learning_rate": 8.125000000000001e-06,
      "loss": 0.2118,
      "step": 40
    },
    {
      "epoch": 0.17354497354497356,
      "grad_norm": 2.2583893699241404,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.2598,
      "step": 41
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 1.9649988674548964,
      "learning_rate": 8.541666666666666e-06,
      "loss": 0.2121,
      "step": 42
    },
    {
      "epoch": 0.182010582010582,
      "grad_norm": 2.2512170588798477,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.1935,
      "step": 43
    },
    {
      "epoch": 0.18624338624338624,
      "grad_norm": 2.428750936867218,
      "learning_rate": 8.958333333333334e-06,
      "loss": 0.2479,
      "step": 44
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 2.170136828036097,
      "learning_rate": 9.166666666666666e-06,
      "loss": 0.2397,
      "step": 45
    },
    {
      "epoch": 0.19470899470899472,
      "grad_norm": 2.0585076637804103,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.2467,
      "step": 46
    },
    {
      "epoch": 0.19894179894179895,
      "grad_norm": 3.5115386946858522,
      "learning_rate": 9.583333333333335e-06,
      "loss": 0.1458,
      "step": 47
    },
    {
      "epoch": 0.20317460317460317,
      "grad_norm": 2.4060311560165157,
      "learning_rate": 9.791666666666666e-06,
      "loss": 0.2102,
      "step": 48
    },
    {
      "epoch": 0.2074074074074074,
      "grad_norm": 1.8201349561390552,
      "learning_rate": 1e-05,
      "loss": 0.1991,
      "step": 49
    },
    {
      "epoch": 0.21164021164021163,
      "grad_norm": 1.60595437087767,
      "learning_rate": 9.999969538288953e-06,
      "loss": 0.1734,
      "step": 50
    },
    {
      "epoch": 0.21587301587301588,
      "grad_norm": 1.6521270217539161,
      "learning_rate": 9.999878153526974e-06,
      "loss": 0.1832,
      "step": 51
    },
    {
      "epoch": 0.2201058201058201,
      "grad_norm": 2.3590338162527713,
      "learning_rate": 9.999725846827562e-06,
      "loss": 0.2099,
      "step": 52
    },
    {
      "epoch": 0.22433862433862434,
      "grad_norm": 1.9526628599541012,
      "learning_rate": 9.999512620046523e-06,
      "loss": 0.2356,
      "step": 53
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 1.7003027985463,
      "learning_rate": 9.999238475781957e-06,
      "loss": 0.1916,
      "step": 54
    },
    {
      "epoch": 0.2328042328042328,
      "grad_norm": 2.197253252430344,
      "learning_rate": 9.998903417374228e-06,
      "loss": 0.17,
      "step": 55
    },
    {
      "epoch": 0.23703703703703705,
      "grad_norm": 2.392152390417205,
      "learning_rate": 9.998507448905917e-06,
      "loss": 0.1976,
      "step": 56
    },
    {
      "epoch": 0.24126984126984127,
      "grad_norm": 1.6988210931642431,
      "learning_rate": 9.998050575201772e-06,
      "loss": 0.1848,
      "step": 57
    },
    {
      "epoch": 0.2455026455026455,
      "grad_norm": 1.4811689327728754,
      "learning_rate": 9.997532801828659e-06,
      "loss": 0.1685,
      "step": 58
    },
    {
      "epoch": 0.24973544973544973,
      "grad_norm": 2.3716195293830635,
      "learning_rate": 9.99695413509548e-06,
      "loss": 0.2242,
      "step": 59
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 2.078219190553916,
      "learning_rate": 9.996314582053106e-06,
      "loss": 0.1641,
      "step": 60
    },
    {
      "epoch": 0.2582010582010582,
      "grad_norm": 2.945016339798954,
      "learning_rate": 9.995614150494293e-06,
      "loss": 0.1991,
      "step": 61
    },
    {
      "epoch": 0.2624338624338624,
      "grad_norm": 1.7177524445891639,
      "learning_rate": 9.994852848953574e-06,
      "loss": 0.1764,
      "step": 62
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 1.4250330727718503,
      "learning_rate": 9.994030686707171e-06,
      "loss": 0.143,
      "step": 63
    },
    {
      "epoch": 0.2708994708994709,
      "grad_norm": 1.3236123948431169,
      "learning_rate": 9.993147673772869e-06,
      "loss": 0.1352,
      "step": 64
    },
    {
      "epoch": 0.2751322751322751,
      "grad_norm": 1.4471704995242132,
      "learning_rate": 9.992203820909906e-06,
      "loss": 0.1687,
      "step": 65
    },
    {
      "epoch": 0.27936507936507937,
      "grad_norm": 1.898245103472612,
      "learning_rate": 9.991199139618828e-06,
      "loss": 0.1909,
      "step": 66
    },
    {
      "epoch": 0.28359788359788357,
      "grad_norm": 1.4090770167093434,
      "learning_rate": 9.990133642141359e-06,
      "loss": 0.1742,
      "step": 67
    },
    {
      "epoch": 0.2878306878306878,
      "grad_norm": 1.7072856786986559,
      "learning_rate": 9.989007341460251e-06,
      "loss": 0.1714,
      "step": 68
    },
    {
      "epoch": 0.2920634920634921,
      "grad_norm": 1.811382538600758,
      "learning_rate": 9.987820251299121e-06,
      "loss": 0.1739,
      "step": 69
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 1.3512054929719852,
      "learning_rate": 9.98657238612229e-06,
      "loss": 0.1712,
      "step": 70
    },
    {
      "epoch": 0.30052910052910053,
      "grad_norm": 1.3933604620853732,
      "learning_rate": 9.985263761134602e-06,
      "loss": 0.159,
      "step": 71
    },
    {
      "epoch": 0.3047619047619048,
      "grad_norm": 7.4550791659417275,
      "learning_rate": 9.983894392281237e-06,
      "loss": 0.1697,
      "step": 72
    },
    {
      "epoch": 0.308994708994709,
      "grad_norm": 1.7482631074271633,
      "learning_rate": 9.982464296247523e-06,
      "loss": 0.2084,
      "step": 73
    },
    {
      "epoch": 0.31322751322751324,
      "grad_norm": 1.4279579591947398,
      "learning_rate": 9.980973490458728e-06,
      "loss": 0.1146,
      "step": 74
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 1.752202038891908,
      "learning_rate": 9.979421993079853e-06,
      "loss": 0.2068,
      "step": 75
    },
    {
      "epoch": 0.3216931216931217,
      "grad_norm": 1.3196365121929348,
      "learning_rate": 9.9778098230154e-06,
      "loss": 0.1529,
      "step": 76
    },
    {
      "epoch": 0.32592592592592595,
      "grad_norm": 1.4588919246797958,
      "learning_rate": 9.976136999909156e-06,
      "loss": 0.1447,
      "step": 77
    },
    {
      "epoch": 0.33015873015873015,
      "grad_norm": 1.534245227167607,
      "learning_rate": 9.974403544143942e-06,
      "loss": 0.1407,
      "step": 78
    },
    {
      "epoch": 0.3343915343915344,
      "grad_norm": 1.2573367552977501,
      "learning_rate": 9.972609476841368e-06,
      "loss": 0.128,
      "step": 79
    },
    {
      "epoch": 0.3386243386243386,
      "grad_norm": 1.4699487437745995,
      "learning_rate": 9.970754819861577e-06,
      "loss": 0.1399,
      "step": 80
    },
    {
      "epoch": 0.34285714285714286,
      "grad_norm": 1.9752095010628823,
      "learning_rate": 9.968839595802982e-06,
      "loss": 0.1996,
      "step": 81
    },
    {
      "epoch": 0.3470899470899471,
      "grad_norm": 1.36801818550152,
      "learning_rate": 9.966863828001982e-06,
      "loss": 0.1524,
      "step": 82
    },
    {
      "epoch": 0.3513227513227513,
      "grad_norm": 1.4453966569682166,
      "learning_rate": 9.964827540532685e-06,
      "loss": 0.1635,
      "step": 83
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 1.3826856785129076,
      "learning_rate": 9.962730758206612e-06,
      "loss": 0.1363,
      "step": 84
    },
    {
      "epoch": 0.35978835978835977,
      "grad_norm": 1.8672803480166524,
      "learning_rate": 9.960573506572391e-06,
      "loss": 0.1987,
      "step": 85
    },
    {
      "epoch": 0.364021164021164,
      "grad_norm": 1.6365447793184111,
      "learning_rate": 9.958355811915452e-06,
      "loss": 0.1642,
      "step": 86
    },
    {
      "epoch": 0.3682539682539683,
      "grad_norm": 1.7338201367733481,
      "learning_rate": 9.95607770125771e-06,
      "loss": 0.1709,
      "step": 87
    },
    {
      "epoch": 0.3724867724867725,
      "grad_norm": 1.299464880095663,
      "learning_rate": 9.953739202357219e-06,
      "loss": 0.1323,
      "step": 88
    },
    {
      "epoch": 0.37671957671957673,
      "grad_norm": 1.5426139181892822,
      "learning_rate": 9.951340343707852e-06,
      "loss": 0.15,
      "step": 89
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 1.885701142257975,
      "learning_rate": 9.948881154538946e-06,
      "loss": 0.1644,
      "step": 90
    },
    {
      "epoch": 0.3851851851851852,
      "grad_norm": 1.4591100615878896,
      "learning_rate": 9.946361664814942e-06,
      "loss": 0.1537,
      "step": 91
    },
    {
      "epoch": 0.38941798941798944,
      "grad_norm": 1.1799285767193204,
      "learning_rate": 9.94378190523503e-06,
      "loss": 0.1154,
      "step": 92
    },
    {
      "epoch": 0.39365079365079364,
      "grad_norm": 1.216634736766946,
      "learning_rate": 9.941141907232766e-06,
      "loss": 0.1199,
      "step": 93
    },
    {
      "epoch": 0.3978835978835979,
      "grad_norm": 2.829211639773649,
      "learning_rate": 9.938441702975689e-06,
      "loss": 0.1745,
      "step": 94
    },
    {
      "epoch": 0.4021164021164021,
      "grad_norm": 1.284371191947075,
      "learning_rate": 9.93568132536494e-06,
      "loss": 0.1221,
      "step": 95
    },
    {
      "epoch": 0.40634920634920635,
      "grad_norm": 1.5092414625197825,
      "learning_rate": 9.932860808034847e-06,
      "loss": 0.1551,
      "step": 96
    },
    {
      "epoch": 0.4105820105820106,
      "grad_norm": 1.428323617465579,
      "learning_rate": 9.929980185352525e-06,
      "loss": 0.1233,
      "step": 97
    },
    {
      "epoch": 0.4148148148148148,
      "grad_norm": 1.4247177035890213,
      "learning_rate": 9.927039492417452e-06,
      "loss": 0.1487,
      "step": 98
    },
    {
      "epoch": 0.41904761904761906,
      "grad_norm": 1.4478986915245,
      "learning_rate": 9.924038765061042e-06,
      "loss": 0.1372,
      "step": 99
    },
    {
      "epoch": 0.42328042328042326,
      "grad_norm": 1.700062917316063,
      "learning_rate": 9.92097803984621e-06,
      "loss": 0.1476,
      "step": 100
    },
    {
      "epoch": 0.4275132275132275,
      "grad_norm": 1.4235599018999308,
      "learning_rate": 9.91785735406693e-06,
      "loss": 0.1348,
      "step": 101
    },
    {
      "epoch": 0.43174603174603177,
      "grad_norm": 1.5944688415988961,
      "learning_rate": 9.914676745747772e-06,
      "loss": 0.1321,
      "step": 102
    },
    {
      "epoch": 0.43597883597883597,
      "grad_norm": 1.7126018443966151,
      "learning_rate": 9.911436253643445e-06,
      "loss": 0.1617,
      "step": 103
    },
    {
      "epoch": 0.4402116402116402,
      "grad_norm": 1.5173736456674343,
      "learning_rate": 9.908135917238321e-06,
      "loss": 0.1555,
      "step": 104
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 1.643979121514149,
      "learning_rate": 9.904775776745959e-06,
      "loss": 0.1357,
      "step": 105
    },
    {
      "epoch": 0.4486772486772487,
      "grad_norm": 1.6123058130098344,
      "learning_rate": 9.901355873108611e-06,
      "loss": 0.1611,
      "step": 106
    },
    {
      "epoch": 0.45291005291005293,
      "grad_norm": 1.36764926534103,
      "learning_rate": 9.89787624799672e-06,
      "loss": 0.1369,
      "step": 107
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 1.5312605310786775,
      "learning_rate": 9.894336943808426e-06,
      "loss": 0.1613,
      "step": 108
    },
    {
      "epoch": 0.4613756613756614,
      "grad_norm": 1.48194042128105,
      "learning_rate": 9.890738003669029e-06,
      "loss": 0.1364,
      "step": 109
    },
    {
      "epoch": 0.4656084656084656,
      "grad_norm": 2.3838744962487963,
      "learning_rate": 9.887079471430481e-06,
      "loss": 0.1469,
      "step": 110
    },
    {
      "epoch": 0.46984126984126984,
      "grad_norm": 1.865768936599346,
      "learning_rate": 9.883361391670841e-06,
      "loss": 0.0879,
      "step": 111
    },
    {
      "epoch": 0.4740740740740741,
      "grad_norm": 1.9344837800965173,
      "learning_rate": 9.879583809693737e-06,
      "loss": 0.1396,
      "step": 112
    },
    {
      "epoch": 0.4783068783068783,
      "grad_norm": 1.4564213566749669,
      "learning_rate": 9.875746771527817e-06,
      "loss": 0.1655,
      "step": 113
    },
    {
      "epoch": 0.48253968253968255,
      "grad_norm": 1.416412960445032,
      "learning_rate": 9.871850323926178e-06,
      "loss": 0.1601,
      "step": 114
    },
    {
      "epoch": 0.48677248677248675,
      "grad_norm": 1.2470195823746264,
      "learning_rate": 9.867894514365802e-06,
      "loss": 0.1124,
      "step": 115
    },
    {
      "epoch": 0.491005291005291,
      "grad_norm": 1.6737155479454644,
      "learning_rate": 9.863879391046985e-06,
      "loss": 0.1204,
      "step": 116
    },
    {
      "epoch": 0.49523809523809526,
      "grad_norm": 1.267448596747368,
      "learning_rate": 9.859805002892733e-06,
      "loss": 0.1067,
      "step": 117
    },
    {
      "epoch": 0.49947089947089945,
      "grad_norm": 1.35223310268365,
      "learning_rate": 9.85567139954818e-06,
      "loss": 0.1329,
      "step": 118
    },
    {
      "epoch": 0.5037037037037037,
      "grad_norm": 1.3410113455783514,
      "learning_rate": 9.851478631379982e-06,
      "loss": 0.1324,
      "step": 119
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 1.5555421463772552,
      "learning_rate": 9.847226749475696e-06,
      "loss": 0.1075,
      "step": 120
    },
    {
      "epoch": 0.5121693121693122,
      "grad_norm": 1.425042236100241,
      "learning_rate": 9.842915805643156e-06,
      "loss": 0.1314,
      "step": 121
    },
    {
      "epoch": 0.5164021164021164,
      "grad_norm": 2.1003863770949844,
      "learning_rate": 9.838545852409857e-06,
      "loss": 0.166,
      "step": 122
    },
    {
      "epoch": 0.5206349206349207,
      "grad_norm": 1.0792059058292323,
      "learning_rate": 9.834116943022299e-06,
      "loss": 0.0989,
      "step": 123
    },
    {
      "epoch": 0.5248677248677248,
      "grad_norm": 1.5340606066613167,
      "learning_rate": 9.829629131445342e-06,
      "loss": 0.1125,
      "step": 124
    },
    {
      "epoch": 0.5291005291005291,
      "grad_norm": 1.8203980636061152,
      "learning_rate": 9.825082472361558e-06,
      "loss": 0.1393,
      "step": 125
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 1.8653391457662798,
      "learning_rate": 9.82047702117055e-06,
      "loss": 0.1781,
      "step": 126
    },
    {
      "epoch": 0.5375661375661376,
      "grad_norm": 1.868686455952389,
      "learning_rate": 9.815812833988292e-06,
      "loss": 0.1511,
      "step": 127
    },
    {
      "epoch": 0.5417989417989418,
      "grad_norm": 3.4516742714903956,
      "learning_rate": 9.811089967646427e-06,
      "loss": 0.182,
      "step": 128
    },
    {
      "epoch": 0.546031746031746,
      "grad_norm": 1.302285639983537,
      "learning_rate": 9.806308479691595e-06,
      "loss": 0.1362,
      "step": 129
    },
    {
      "epoch": 0.5502645502645502,
      "grad_norm": 1.3554292892825641,
      "learning_rate": 9.801468428384716e-06,
      "loss": 0.1183,
      "step": 130
    },
    {
      "epoch": 0.5544973544973545,
      "grad_norm": 1.5569558889730544,
      "learning_rate": 9.796569872700287e-06,
      "loss": 0.115,
      "step": 131
    },
    {
      "epoch": 0.5587301587301587,
      "grad_norm": 1.4128905574376962,
      "learning_rate": 9.791612872325667e-06,
      "loss": 0.1424,
      "step": 132
    },
    {
      "epoch": 0.562962962962963,
      "grad_norm": 1.5703621512026078,
      "learning_rate": 9.786597487660336e-06,
      "loss": 0.1426,
      "step": 133
    },
    {
      "epoch": 0.5671957671957671,
      "grad_norm": 1.1134755961640572,
      "learning_rate": 9.781523779815178e-06,
      "loss": 0.1148,
      "step": 134
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 1.6279717460336087,
      "learning_rate": 9.776391810611719e-06,
      "loss": 0.1432,
      "step": 135
    },
    {
      "epoch": 0.5756613756613757,
      "grad_norm": 1.1710680368668092,
      "learning_rate": 9.771201642581384e-06,
      "loss": 0.1112,
      "step": 136
    },
    {
      "epoch": 0.5798941798941799,
      "grad_norm": 1.468212722664273,
      "learning_rate": 9.765953338964736e-06,
      "loss": 0.1467,
      "step": 137
    },
    {
      "epoch": 0.5841269841269842,
      "grad_norm": 1.452581142800487,
      "learning_rate": 9.760646963710694e-06,
      "loss": 0.1686,
      "step": 138
    },
    {
      "epoch": 0.5883597883597883,
      "grad_norm": 1.000120927347695,
      "learning_rate": 9.755282581475769e-06,
      "loss": 0.0845,
      "step": 139
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 1.3458988878420348,
      "learning_rate": 9.749860257623262e-06,
      "loss": 0.1351,
      "step": 140
    },
    {
      "epoch": 0.5968253968253968,
      "grad_norm": 1.2838054499563394,
      "learning_rate": 9.744380058222483e-06,
      "loss": 0.1296,
      "step": 141
    },
    {
      "epoch": 0.6010582010582011,
      "grad_norm": 1.302987625581627,
      "learning_rate": 9.73884205004793e-06,
      "loss": 0.1163,
      "step": 142
    },
    {
      "epoch": 0.6052910052910053,
      "grad_norm": 1.3357457319423431,
      "learning_rate": 9.733246300578482e-06,
      "loss": 0.1345,
      "step": 143
    },
    {
      "epoch": 0.6095238095238096,
      "grad_norm": 1.530634255987009,
      "learning_rate": 9.727592877996585e-06,
      "loss": 0.1248,
      "step": 144
    },
    {
      "epoch": 0.6137566137566137,
      "grad_norm": 1.55621140705952,
      "learning_rate": 9.721881851187406e-06,
      "loss": 0.155,
      "step": 145
    },
    {
      "epoch": 0.617989417989418,
      "grad_norm": 1.5076666049322645,
      "learning_rate": 9.716113289738005e-06,
      "loss": 0.1262,
      "step": 146
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 3.424577642100365,
      "learning_rate": 9.710287263936485e-06,
      "loss": 0.1012,
      "step": 147
    },
    {
      "epoch": 0.6264550264550265,
      "grad_norm": 1.467957894333247,
      "learning_rate": 9.704403844771128e-06,
      "loss": 0.119,
      "step": 148
    },
    {
      "epoch": 0.6306878306878307,
      "grad_norm": 1.9287811168902935,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.1635,
      "step": 149
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 1.5672777610226705,
      "learning_rate": 9.69246511379778e-06,
      "loss": 0.1281,
      "step": 150
    },
    {
      "epoch": 0.6391534391534391,
      "grad_norm": 1.2674787426967342,
      "learning_rate": 9.68640994745946e-06,
      "loss": 0.1114,
      "step": 151
    },
    {
      "epoch": 0.6433862433862434,
      "grad_norm": 1.461753783787954,
      "learning_rate": 9.680297678694867e-06,
      "loss": 0.1439,
      "step": 152
    },
    {
      "epoch": 0.6476190476190476,
      "grad_norm": 1.5617305093987728,
      "learning_rate": 9.674128381980073e-06,
      "loss": 0.1308,
      "step": 153
    },
    {
      "epoch": 0.6518518518518519,
      "grad_norm": 1.5817050995954458,
      "learning_rate": 9.667902132486009e-06,
      "loss": 0.1244,
      "step": 154
    },
    {
      "epoch": 0.656084656084656,
      "grad_norm": 1.0297735518000437,
      "learning_rate": 9.661619006077562e-06,
      "loss": 0.0944,
      "step": 155
    },
    {
      "epoch": 0.6603174603174603,
      "grad_norm": 1.1490229757653174,
      "learning_rate": 9.655279079312643e-06,
      "loss": 0.1071,
      "step": 156
    },
    {
      "epoch": 0.6645502645502646,
      "grad_norm": 1.3820142441126761,
      "learning_rate": 9.648882429441258e-06,
      "loss": 0.1302,
      "step": 157
    },
    {
      "epoch": 0.6687830687830688,
      "grad_norm": 1.3066591757009012,
      "learning_rate": 9.642429134404568e-06,
      "loss": 0.1443,
      "step": 158
    },
    {
      "epoch": 0.6730158730158731,
      "grad_norm": 1.0248040135390748,
      "learning_rate": 9.635919272833938e-06,
      "loss": 0.0812,
      "step": 159
    },
    {
      "epoch": 0.6772486772486772,
      "grad_norm": 1.253548823921826,
      "learning_rate": 9.629352924049975e-06,
      "loss": 0.135,
      "step": 160
    },
    {
      "epoch": 0.6814814814814815,
      "grad_norm": 1.321779244061287,
      "learning_rate": 9.622730168061568e-06,
      "loss": 0.1236,
      "step": 161
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 1.0767529959934636,
      "learning_rate": 9.616051085564905e-06,
      "loss": 0.0846,
      "step": 162
    },
    {
      "epoch": 0.68994708994709,
      "grad_norm": 1.2506064320593104,
      "learning_rate": 9.609315757942504e-06,
      "loss": 0.1048,
      "step": 163
    },
    {
      "epoch": 0.6941798941798942,
      "grad_norm": 1.0490674529865702,
      "learning_rate": 9.602524267262202e-06,
      "loss": 0.1043,
      "step": 164
    },
    {
      "epoch": 0.6984126984126984,
      "grad_norm": 1.1274540124608232,
      "learning_rate": 9.595676696276173e-06,
      "loss": 0.1067,
      "step": 165
    },
    {
      "epoch": 0.7026455026455026,
      "grad_norm": 1.2376351263786731,
      "learning_rate": 9.588773128419907e-06,
      "loss": 0.1278,
      "step": 166
    },
    {
      "epoch": 0.7068783068783069,
      "grad_norm": 1.5158325756912805,
      "learning_rate": 9.581813647811199e-06,
      "loss": 0.1197,
      "step": 167
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 1.9368463333626136,
      "learning_rate": 9.574798339249124e-06,
      "loss": 0.108,
      "step": 168
    },
    {
      "epoch": 0.7153439153439154,
      "grad_norm": 1.3604192116787752,
      "learning_rate": 9.567727288213005e-06,
      "loss": 0.1131,
      "step": 169
    },
    {
      "epoch": 0.7195767195767195,
      "grad_norm": 2.3852673421250494,
      "learning_rate": 9.560600580861366e-06,
      "loss": 0.1146,
      "step": 170
    },
    {
      "epoch": 0.7238095238095238,
      "grad_norm": 1.3447438337348436,
      "learning_rate": 9.553418304030886e-06,
      "loss": 0.1114,
      "step": 171
    },
    {
      "epoch": 0.728042328042328,
      "grad_norm": 1.459045833803003,
      "learning_rate": 9.546180545235344e-06,
      "loss": 0.1467,
      "step": 172
    },
    {
      "epoch": 0.7322751322751323,
      "grad_norm": 1.6596975324831713,
      "learning_rate": 9.538887392664544e-06,
      "loss": 0.1069,
      "step": 173
    },
    {
      "epoch": 0.7365079365079366,
      "grad_norm": 1.3125689917057628,
      "learning_rate": 9.531538935183252e-06,
      "loss": 0.1155,
      "step": 174
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 1.1254939749629098,
      "learning_rate": 9.524135262330098e-06,
      "loss": 0.1014,
      "step": 175
    },
    {
      "epoch": 0.744973544973545,
      "grad_norm": 1.1850635605555897,
      "learning_rate": 9.516676464316505e-06,
      "loss": 0.0973,
      "step": 176
    },
    {
      "epoch": 0.7492063492063492,
      "grad_norm": 1.7947316330954919,
      "learning_rate": 9.50916263202557e-06,
      "loss": 0.1398,
      "step": 177
    },
    {
      "epoch": 0.7534391534391535,
      "grad_norm": 1.5534071768101145,
      "learning_rate": 9.501593857010968e-06,
      "loss": 0.1124,
      "step": 178
    },
    {
      "epoch": 0.7576719576719577,
      "grad_norm": 1.333104024444634,
      "learning_rate": 9.493970231495836e-06,
      "loss": 0.1172,
      "step": 179
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 1.4850034964893315,
      "learning_rate": 9.486291848371642e-06,
      "loss": 0.0939,
      "step": 180
    },
    {
      "epoch": 0.7661375661375661,
      "grad_norm": 1.657420935625152,
      "learning_rate": 9.478558801197065e-06,
      "loss": 0.0969,
      "step": 181
    },
    {
      "epoch": 0.7703703703703704,
      "grad_norm": 1.2670523544038959,
      "learning_rate": 9.470771184196842e-06,
      "loss": 0.0999,
      "step": 182
    },
    {
      "epoch": 0.7746031746031746,
      "grad_norm": 1.3320429466871224,
      "learning_rate": 9.46292909226063e-06,
      "loss": 0.148,
      "step": 183
    },
    {
      "epoch": 0.7788359788359789,
      "grad_norm": 1.2935520644229133,
      "learning_rate": 9.45503262094184e-06,
      "loss": 0.1137,
      "step": 184
    },
    {
      "epoch": 0.783068783068783,
      "grad_norm": 1.2311993159949326,
      "learning_rate": 9.44708186645649e-06,
      "loss": 0.1219,
      "step": 185
    },
    {
      "epoch": 0.7873015873015873,
      "grad_norm": 1.324888001044985,
      "learning_rate": 9.439076925682006e-06,
      "loss": 0.1016,
      "step": 186
    },
    {
      "epoch": 0.7915343915343915,
      "grad_norm": 0.9753462251818058,
      "learning_rate": 9.431017896156074e-06,
      "loss": 0.0939,
      "step": 187
    },
    {
      "epoch": 0.7957671957671958,
      "grad_norm": 1.162015032545452,
      "learning_rate": 9.42290487607542e-06,
      "loss": 0.1008,
      "step": 188
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.339272642650438,
      "learning_rate": 9.414737964294636e-06,
      "loss": 0.1047,
      "step": 189
    },
    {
      "epoch": 0.8042328042328042,
      "grad_norm": 1.7752016458700282,
      "learning_rate": 9.406517260324962e-06,
      "loss": 0.0836,
      "step": 190
    },
    {
      "epoch": 0.8084656084656084,
      "grad_norm": 1.2132508053585902,
      "learning_rate": 9.398242864333084e-06,
      "loss": 0.0893,
      "step": 191
    },
    {
      "epoch": 0.8126984126984127,
      "grad_norm": 1.0549315304944495,
      "learning_rate": 9.389914877139903e-06,
      "loss": 0.0958,
      "step": 192
    },
    {
      "epoch": 0.816931216931217,
      "grad_norm": 1.5784101564965254,
      "learning_rate": 9.381533400219319e-06,
      "loss": 0.0825,
      "step": 193
    },
    {
      "epoch": 0.8211640211640212,
      "grad_norm": 1.2660572838595667,
      "learning_rate": 9.37309853569698e-06,
      "loss": 0.1021,
      "step": 194
    },
    {
      "epoch": 0.8253968253968254,
      "grad_norm": 1.3321730339276188,
      "learning_rate": 9.364610386349048e-06,
      "loss": 0.0993,
      "step": 195
    },
    {
      "epoch": 0.8296296296296296,
      "grad_norm": 1.2635879093213929,
      "learning_rate": 9.356069055600949e-06,
      "loss": 0.0983,
      "step": 196
    },
    {
      "epoch": 0.8338624338624339,
      "grad_norm": 1.4546715685072167,
      "learning_rate": 9.347474647526095e-06,
      "loss": 0.0928,
      "step": 197
    },
    {
      "epoch": 0.8380952380952381,
      "grad_norm": 1.02322670489793,
      "learning_rate": 9.338827266844643e-06,
      "loss": 0.0772,
      "step": 198
    },
    {
      "epoch": 0.8423280423280424,
      "grad_norm": 1.3325182305302485,
      "learning_rate": 9.330127018922195e-06,
      "loss": 0.0987,
      "step": 199
    },
    {
      "epoch": 0.8465608465608465,
      "grad_norm": 1.403989810243998,
      "learning_rate": 9.321374009768525e-06,
      "loss": 0.0958,
      "step": 200
    },
    {
      "epoch": 0.8507936507936508,
      "grad_norm": 1.2529973130678513,
      "learning_rate": 9.312568346036288e-06,
      "loss": 0.0922,
      "step": 201
    },
    {
      "epoch": 0.855026455026455,
      "grad_norm": 1.339942832189008,
      "learning_rate": 9.30371013501972e-06,
      "loss": 0.0847,
      "step": 202
    },
    {
      "epoch": 0.8592592592592593,
      "grad_norm": 1.9827932325299167,
      "learning_rate": 9.294799484653323e-06,
      "loss": 0.0941,
      "step": 203
    },
    {
      "epoch": 0.8634920634920635,
      "grad_norm": 1.1213092752857448,
      "learning_rate": 9.285836503510562e-06,
      "loss": 0.0784,
      "step": 204
    },
    {
      "epoch": 0.8677248677248677,
      "grad_norm": 1.022053544019045,
      "learning_rate": 9.276821300802535e-06,
      "loss": 0.0826,
      "step": 205
    },
    {
      "epoch": 0.8719576719576719,
      "grad_norm": 1.2437584729757036,
      "learning_rate": 9.267753986376638e-06,
      "loss": 0.0988,
      "step": 206
    },
    {
      "epoch": 0.8761904761904762,
      "grad_norm": 1.4060299126574083,
      "learning_rate": 9.25863467071524e-06,
      "loss": 0.0963,
      "step": 207
    },
    {
      "epoch": 0.8804232804232804,
      "grad_norm": 1.1657287674213732,
      "learning_rate": 9.24946346493432e-06,
      "loss": 0.0855,
      "step": 208
    },
    {
      "epoch": 0.8846560846560847,
      "grad_norm": 1.1071987425032126,
      "learning_rate": 9.24024048078213e-06,
      "loss": 0.0828,
      "step": 209
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.376476091762823,
      "learning_rate": 9.230965830637821e-06,
      "loss": 0.1064,
      "step": 210
    },
    {
      "epoch": 0.8931216931216931,
      "grad_norm": 1.3361508869710212,
      "learning_rate": 9.221639627510076e-06,
      "loss": 0.1173,
      "step": 211
    },
    {
      "epoch": 0.8973544973544973,
      "grad_norm": 1.3580696473911005,
      "learning_rate": 9.21226198503574e-06,
      "loss": 0.0949,
      "step": 212
    },
    {
      "epoch": 0.9015873015873016,
      "grad_norm": 1.2783463166289852,
      "learning_rate": 9.202833017478421e-06,
      "loss": 0.0786,
      "step": 213
    },
    {
      "epoch": 0.9058201058201059,
      "grad_norm": 1.1862774845425663,
      "learning_rate": 9.193352839727122e-06,
      "loss": 0.0752,
      "step": 214
    },
    {
      "epoch": 0.91005291005291,
      "grad_norm": 1.0330987153646436,
      "learning_rate": 9.18382156729481e-06,
      "loss": 0.0731,
      "step": 215
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 1.2227259879389292,
      "learning_rate": 9.174239316317034e-06,
      "loss": 0.0852,
      "step": 216
    },
    {
      "epoch": 0.9185185185185185,
      "grad_norm": 1.338767605192061,
      "learning_rate": 9.164606203550498e-06,
      "loss": 0.1114,
      "step": 217
    },
    {
      "epoch": 0.9227513227513228,
      "grad_norm": 1.1914966704194456,
      "learning_rate": 9.154922346371641e-06,
      "loss": 0.0792,
      "step": 218
    },
    {
      "epoch": 0.926984126984127,
      "grad_norm": 1.2077161006608368,
      "learning_rate": 9.145187862775208e-06,
      "loss": 0.0937,
      "step": 219
    },
    {
      "epoch": 0.9312169312169312,
      "grad_norm": 3.455070672821183,
      "learning_rate": 9.13540287137281e-06,
      "loss": 0.1029,
      "step": 220
    },
    {
      "epoch": 0.9354497354497354,
      "grad_norm": 1.3029805107614403,
      "learning_rate": 9.125567491391476e-06,
      "loss": 0.0868,
      "step": 221
    },
    {
      "epoch": 0.9396825396825397,
      "grad_norm": 1.1230441398539233,
      "learning_rate": 9.115681842672211e-06,
      "loss": 0.0792,
      "step": 222
    },
    {
      "epoch": 0.9439153439153439,
      "grad_norm": 1.0274770873560906,
      "learning_rate": 9.10574604566852e-06,
      "loss": 0.0828,
      "step": 223
    },
    {
      "epoch": 0.9481481481481482,
      "grad_norm": 1.1591065147913875,
      "learning_rate": 9.09576022144496e-06,
      "loss": 0.0868,
      "step": 224
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 1.3488128549102674,
      "learning_rate": 9.085724491675642e-06,
      "loss": 0.106,
      "step": 225
    },
    {
      "epoch": 0.9566137566137566,
      "grad_norm": 1.2276156196511196,
      "learning_rate": 9.07563897864277e-06,
      "loss": 0.1007,
      "step": 226
    },
    {
      "epoch": 0.9608465608465608,
      "grad_norm": 1.0373897808991692,
      "learning_rate": 9.065503805235139e-06,
      "loss": 0.0757,
      "step": 227
    },
    {
      "epoch": 0.9650793650793651,
      "grad_norm": 1.5642701263049024,
      "learning_rate": 9.055319094946633e-06,
      "loss": 0.1113,
      "step": 228
    },
    {
      "epoch": 0.9693121693121693,
      "grad_norm": 1.3821470998948688,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.1066,
      "step": 229
    },
    {
      "epoch": 0.9735449735449735,
      "grad_norm": 1.2849131411511094,
      "learning_rate": 9.03480156071901e-06,
      "loss": 0.0996,
      "step": 230
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 1.4937027707955488,
      "learning_rate": 9.02446898677957e-06,
      "loss": 0.1066,
      "step": 231
    },
    {
      "epoch": 0.982010582010582,
      "grad_norm": 0.9718589878629968,
      "learning_rate": 9.014087375955574e-06,
      "loss": 0.0695,
      "step": 232
    },
    {
      "epoch": 0.9862433862433863,
      "grad_norm": 1.0615875981137657,
      "learning_rate": 9.003656854743667e-06,
      "loss": 0.0741,
      "step": 233
    },
    {
      "epoch": 0.9904761904761905,
      "grad_norm": 1.0154034408078385,
      "learning_rate": 8.993177550236464e-06,
      "loss": 0.085,
      "step": 234
    },
    {
      "epoch": 0.9947089947089947,
      "grad_norm": 1.0590350628316874,
      "learning_rate": 8.982649590120982e-06,
      "loss": 0.0666,
      "step": 235
    },
    {
      "epoch": 0.9989417989417989,
      "grad_norm": 1.009502950887791,
      "learning_rate": 8.972073102677091e-06,
      "loss": 0.0667,
      "step": 236
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.5207179670384026,
      "learning_rate": 8.961448216775955e-06,
      "loss": 0.0719,
      "step": 237
    },
    {
      "epoch": 1.0042328042328041,
      "grad_norm": 1.0439827565118902,
      "learning_rate": 8.950775061878453e-06,
      "loss": 0.0617,
      "step": 238
    },
    {
      "epoch": 1.0084656084656085,
      "grad_norm": 1.0127555558104633,
      "learning_rate": 8.94005376803361e-06,
      "loss": 0.0604,
      "step": 239
    },
    {
      "epoch": 1.0126984126984127,
      "grad_norm": 1.0868468474919644,
      "learning_rate": 8.92928446587701e-06,
      "loss": 0.0593,
      "step": 240
    },
    {
      "epoch": 1.016931216931217,
      "grad_norm": 1.1138192394576714,
      "learning_rate": 8.9184672866292e-06,
      "loss": 0.066,
      "step": 241
    },
    {
      "epoch": 1.0211640211640212,
      "grad_norm": 1.0568629136248378,
      "learning_rate": 8.907602362094094e-06,
      "loss": 0.0521,
      "step": 242
    },
    {
      "epoch": 1.0253968253968253,
      "grad_norm": 1.3202632756778692,
      "learning_rate": 8.896689824657371e-06,
      "loss": 0.0737,
      "step": 243
    },
    {
      "epoch": 1.0296296296296297,
      "grad_norm": 1.094241539250586,
      "learning_rate": 8.885729807284855e-06,
      "loss": 0.0577,
      "step": 244
    },
    {
      "epoch": 1.0338624338624338,
      "grad_norm": 0.7772743020562026,
      "learning_rate": 8.874722443520898e-06,
      "loss": 0.0414,
      "step": 245
    },
    {
      "epoch": 1.0380952380952382,
      "grad_norm": 1.0784304127588566,
      "learning_rate": 8.863667867486756e-06,
      "loss": 0.0601,
      "step": 246
    },
    {
      "epoch": 1.0423280423280423,
      "grad_norm": 1.0115870407215455,
      "learning_rate": 8.852566213878947e-06,
      "loss": 0.0672,
      "step": 247
    },
    {
      "epoch": 1.0465608465608465,
      "grad_norm": 0.9393680447714831,
      "learning_rate": 8.841417617967618e-06,
      "loss": 0.0562,
      "step": 248
    },
    {
      "epoch": 1.0507936507936508,
      "grad_norm": 1.000103639091894,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.0505,
      "step": 249
    },
    {
      "epoch": 1.055026455026455,
      "grad_norm": 0.8530141568102377,
      "learning_rate": 8.818980143173212e-06,
      "loss": 0.0445,
      "step": 250
    },
    {
      "epoch": 1.0592592592592593,
      "grad_norm": 0.9092503604322939,
      "learning_rate": 8.807691537683685e-06,
      "loss": 0.0377,
      "step": 251
    },
    {
      "epoch": 1.0634920634920635,
      "grad_norm": 1.3616532372700458,
      "learning_rate": 8.796356536674404e-06,
      "loss": 0.0447,
      "step": 252
    },
    {
      "epoch": 1.0677248677248676,
      "grad_norm": 0.9961095762490313,
      "learning_rate": 8.784975278258783e-06,
      "loss": 0.0422,
      "step": 253
    },
    {
      "epoch": 1.071957671957672,
      "grad_norm": 1.417792501888075,
      "learning_rate": 8.773547901113862e-06,
      "loss": 0.0685,
      "step": 254
    },
    {
      "epoch": 1.0761904761904761,
      "grad_norm": 1.109045645130657,
      "learning_rate": 8.762074544478622e-06,
      "loss": 0.057,
      "step": 255
    },
    {
      "epoch": 1.0804232804232805,
      "grad_norm": 1.1085366846759674,
      "learning_rate": 8.750555348152299e-06,
      "loss": 0.0618,
      "step": 256
    },
    {
      "epoch": 1.0846560846560847,
      "grad_norm": 0.9675368242876587,
      "learning_rate": 8.73899045249266e-06,
      "loss": 0.0698,
      "step": 257
    },
    {
      "epoch": 1.0888888888888888,
      "grad_norm": 0.8969933415528727,
      "learning_rate": 8.727379998414311e-06,
      "loss": 0.0499,
      "step": 258
    },
    {
      "epoch": 1.0931216931216932,
      "grad_norm": 0.9848664591981523,
      "learning_rate": 8.715724127386971e-06,
      "loss": 0.0664,
      "step": 259
    },
    {
      "epoch": 1.0973544973544973,
      "grad_norm": 0.9775428435297511,
      "learning_rate": 8.70402298143375e-06,
      "loss": 0.0467,
      "step": 260
    },
    {
      "epoch": 1.1015873015873017,
      "grad_norm": 0.8332721352250031,
      "learning_rate": 8.692276703129421e-06,
      "loss": 0.035,
      "step": 261
    },
    {
      "epoch": 1.1058201058201058,
      "grad_norm": 0.9472578567420649,
      "learning_rate": 8.680485435598674e-06,
      "loss": 0.0499,
      "step": 262
    },
    {
      "epoch": 1.11005291005291,
      "grad_norm": 0.8344760929082524,
      "learning_rate": 8.668649322514382e-06,
      "loss": 0.0473,
      "step": 263
    },
    {
      "epoch": 1.1142857142857143,
      "grad_norm": 1.0269303477988245,
      "learning_rate": 8.656768508095853e-06,
      "loss": 0.0453,
      "step": 264
    },
    {
      "epoch": 1.1185185185185185,
      "grad_norm": 1.1216585383229603,
      "learning_rate": 8.644843137107058e-06,
      "loss": 0.0445,
      "step": 265
    },
    {
      "epoch": 1.1227513227513228,
      "grad_norm": 1.1375626971477624,
      "learning_rate": 8.632873354854881e-06,
      "loss": 0.0609,
      "step": 266
    },
    {
      "epoch": 1.126984126984127,
      "grad_norm": 1.0980193157423086,
      "learning_rate": 8.620859307187339e-06,
      "loss": 0.0585,
      "step": 267
    },
    {
      "epoch": 1.1312169312169311,
      "grad_norm": 1.0923249482421242,
      "learning_rate": 8.608801140491811e-06,
      "loss": 0.0535,
      "step": 268
    },
    {
      "epoch": 1.1354497354497355,
      "grad_norm": 1.199920331072888,
      "learning_rate": 8.596699001693257e-06,
      "loss": 0.0574,
      "step": 269
    },
    {
      "epoch": 1.1396825396825396,
      "grad_norm": 1.3840691329312167,
      "learning_rate": 8.584553038252415e-06,
      "loss": 0.0603,
      "step": 270
    },
    {
      "epoch": 1.143915343915344,
      "grad_norm": 1.0976468243723367,
      "learning_rate": 8.572363398164017e-06,
      "loss": 0.0495,
      "step": 271
    },
    {
      "epoch": 1.1481481481481481,
      "grad_norm": 0.8361407917583945,
      "learning_rate": 8.560130229954985e-06,
      "loss": 0.0428,
      "step": 272
    },
    {
      "epoch": 1.1523809523809523,
      "grad_norm": 1.1399444462070143,
      "learning_rate": 8.547853682682605e-06,
      "loss": 0.0507,
      "step": 273
    },
    {
      "epoch": 1.1566137566137566,
      "grad_norm": 1.9349212107218514,
      "learning_rate": 8.535533905932739e-06,
      "loss": 0.0399,
      "step": 274
    },
    {
      "epoch": 1.1608465608465608,
      "grad_norm": 1.4185063496527812,
      "learning_rate": 8.523171049817974e-06,
      "loss": 0.0504,
      "step": 275
    },
    {
      "epoch": 1.1650793650793652,
      "grad_norm": 0.9431563961918448,
      "learning_rate": 8.510765264975813e-06,
      "loss": 0.045,
      "step": 276
    },
    {
      "epoch": 1.1693121693121693,
      "grad_norm": 0.9634549342361133,
      "learning_rate": 8.498316702566828e-06,
      "loss": 0.0433,
      "step": 277
    },
    {
      "epoch": 1.1735449735449737,
      "grad_norm": 0.8107013087955712,
      "learning_rate": 8.485825514272824e-06,
      "loss": 0.0396,
      "step": 278
    },
    {
      "epoch": 1.1777777777777778,
      "grad_norm": 1.2143724846797779,
      "learning_rate": 8.473291852294986e-06,
      "loss": 0.0503,
      "step": 279
    },
    {
      "epoch": 1.182010582010582,
      "grad_norm": 1.214634058606961,
      "learning_rate": 8.460715869352035e-06,
      "loss": 0.0492,
      "step": 280
    },
    {
      "epoch": 1.1862433862433863,
      "grad_norm": 1.5326647754225693,
      "learning_rate": 8.44809771867835e-06,
      "loss": 0.0697,
      "step": 281
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 1.0183387396190917,
      "learning_rate": 8.435437554022116e-06,
      "loss": 0.0506,
      "step": 282
    },
    {
      "epoch": 1.1947089947089946,
      "grad_norm": 0.9774623735237468,
      "learning_rate": 8.422735529643445e-06,
      "loss": 0.0478,
      "step": 283
    },
    {
      "epoch": 1.198941798941799,
      "grad_norm": 0.9151100643280784,
      "learning_rate": 8.409991800312493e-06,
      "loss": 0.0521,
      "step": 284
    },
    {
      "epoch": 1.2031746031746031,
      "grad_norm": 0.944795332484615,
      "learning_rate": 8.397206521307584e-06,
      "loss": 0.0462,
      "step": 285
    },
    {
      "epoch": 1.2074074074074075,
      "grad_norm": 0.8708767785679999,
      "learning_rate": 8.384379848413304e-06,
      "loss": 0.04,
      "step": 286
    },
    {
      "epoch": 1.2116402116402116,
      "grad_norm": 0.9060528905235375,
      "learning_rate": 8.371511937918616e-06,
      "loss": 0.0415,
      "step": 287
    },
    {
      "epoch": 1.215873015873016,
      "grad_norm": 0.9029101418348026,
      "learning_rate": 8.358602946614952e-06,
      "loss": 0.0457,
      "step": 288
    },
    {
      "epoch": 1.2201058201058201,
      "grad_norm": 1.1143501235536941,
      "learning_rate": 8.345653031794292e-06,
      "loss": 0.0492,
      "step": 289
    },
    {
      "epoch": 1.2243386243386243,
      "grad_norm": 0.6904563329955595,
      "learning_rate": 8.332662351247262e-06,
      "loss": 0.029,
      "step": 290
    },
    {
      "epoch": 1.2285714285714286,
      "grad_norm": 0.9806076212927931,
      "learning_rate": 8.319631063261209e-06,
      "loss": 0.0565,
      "step": 291
    },
    {
      "epoch": 1.2328042328042328,
      "grad_norm": 0.9637920964858679,
      "learning_rate": 8.30655932661826e-06,
      "loss": 0.0547,
      "step": 292
    },
    {
      "epoch": 1.237037037037037,
      "grad_norm": 0.9864706909349096,
      "learning_rate": 8.293447300593402e-06,
      "loss": 0.0485,
      "step": 293
    },
    {
      "epoch": 1.2412698412698413,
      "grad_norm": 1.0056521811983314,
      "learning_rate": 8.280295144952537e-06,
      "loss": 0.0455,
      "step": 294
    },
    {
      "epoch": 1.2455026455026454,
      "grad_norm": 2.7374704058965413,
      "learning_rate": 8.267103019950529e-06,
      "loss": 0.0402,
      "step": 295
    },
    {
      "epoch": 1.2497354497354498,
      "grad_norm": 1.0675471111168537,
      "learning_rate": 8.253871086329255e-06,
      "loss": 0.0444,
      "step": 296
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 0.8741609150096519,
      "learning_rate": 8.240599505315656e-06,
      "loss": 0.034,
      "step": 297
    },
    {
      "epoch": 1.2582010582010583,
      "grad_norm": 1.0949757974574201,
      "learning_rate": 8.227288438619754e-06,
      "loss": 0.0413,
      "step": 298
    },
    {
      "epoch": 1.2624338624338625,
      "grad_norm": 1.3321265503517306,
      "learning_rate": 8.213938048432697e-06,
      "loss": 0.0432,
      "step": 299
    },
    {
      "epoch": 1.2666666666666666,
      "grad_norm": 1.16809688783282,
      "learning_rate": 8.200548497424779e-06,
      "loss": 0.0457,
      "step": 300
    },
    {
      "epoch": 1.270899470899471,
      "grad_norm": 1.2234242804842506,
      "learning_rate": 8.18711994874345e-06,
      "loss": 0.0351,
      "step": 301
    },
    {
      "epoch": 1.2751322751322751,
      "grad_norm": 0.8455313768916234,
      "learning_rate": 8.173652566011339e-06,
      "loss": 0.0305,
      "step": 302
    },
    {
      "epoch": 1.2793650793650793,
      "grad_norm": 1.264978471670017,
      "learning_rate": 8.160146513324256e-06,
      "loss": 0.0681,
      "step": 303
    },
    {
      "epoch": 1.2835978835978836,
      "grad_norm": 1.0191087783122954,
      "learning_rate": 8.146601955249187e-06,
      "loss": 0.0458,
      "step": 304
    },
    {
      "epoch": 1.2878306878306878,
      "grad_norm": 1.5364104175585032,
      "learning_rate": 8.133019056822303e-06,
      "loss": 0.0478,
      "step": 305
    },
    {
      "epoch": 1.2920634920634921,
      "grad_norm": 1.0186300363769811,
      "learning_rate": 8.119397983546932e-06,
      "loss": 0.0521,
      "step": 306
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 0.9695838159643356,
      "learning_rate": 8.105738901391553e-06,
      "loss": 0.043,
      "step": 307
    },
    {
      "epoch": 1.3005291005291006,
      "grad_norm": 1.1606845112374315,
      "learning_rate": 8.092041976787772e-06,
      "loss": 0.0391,
      "step": 308
    },
    {
      "epoch": 1.3047619047619048,
      "grad_norm": 0.9131906220785553,
      "learning_rate": 8.078307376628292e-06,
      "loss": 0.0503,
      "step": 309
    },
    {
      "epoch": 1.308994708994709,
      "grad_norm": 1.1213236223136132,
      "learning_rate": 8.064535268264883e-06,
      "loss": 0.0563,
      "step": 310
    },
    {
      "epoch": 1.3132275132275133,
      "grad_norm": 1.5919757523939648,
      "learning_rate": 8.05072581950634e-06,
      "loss": 0.0485,
      "step": 311
    },
    {
      "epoch": 1.3174603174603174,
      "grad_norm": 0.8068544658944983,
      "learning_rate": 8.036879198616434e-06,
      "loss": 0.036,
      "step": 312
    },
    {
      "epoch": 1.3216931216931216,
      "grad_norm": 1.32303739260173,
      "learning_rate": 8.022995574311876e-06,
      "loss": 0.0557,
      "step": 313
    },
    {
      "epoch": 1.325925925925926,
      "grad_norm": 1.6279390963252223,
      "learning_rate": 8.009075115760243e-06,
      "loss": 0.0553,
      "step": 314
    },
    {
      "epoch": 1.33015873015873,
      "grad_norm": 0.713797069897446,
      "learning_rate": 7.99511799257793e-06,
      "loss": 0.0397,
      "step": 315
    },
    {
      "epoch": 1.3343915343915345,
      "grad_norm": 0.8079576051295935,
      "learning_rate": 7.981124374828079e-06,
      "loss": 0.04,
      "step": 316
    },
    {
      "epoch": 1.3386243386243386,
      "grad_norm": 1.1435417675953259,
      "learning_rate": 7.967094433018508e-06,
      "loss": 0.0435,
      "step": 317
    },
    {
      "epoch": 1.342857142857143,
      "grad_norm": 1.143163112909474,
      "learning_rate": 7.953028338099628e-06,
      "loss": 0.0474,
      "step": 318
    },
    {
      "epoch": 1.3470899470899471,
      "grad_norm": 0.9269672035843429,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.0313,
      "step": 319
    },
    {
      "epoch": 1.3513227513227513,
      "grad_norm": 1.0466290884719476,
      "learning_rate": 7.92478837493608e-06,
      "loss": 0.0505,
      "step": 320
    },
    {
      "epoch": 1.3555555555555556,
      "grad_norm": 1.256639700693128,
      "learning_rate": 7.910614850786448e-06,
      "loss": 0.0438,
      "step": 321
    },
    {
      "epoch": 1.3597883597883598,
      "grad_norm": 0.802253728130833,
      "learning_rate": 7.896405861713393e-06,
      "loss": 0.0344,
      "step": 322
    },
    {
      "epoch": 1.364021164021164,
      "grad_norm": 1.0700812448503103,
      "learning_rate": 7.882161580848966e-06,
      "loss": 0.0516,
      "step": 323
    },
    {
      "epoch": 1.3682539682539683,
      "grad_norm": 1.0286894659033428,
      "learning_rate": 7.86788218175523e-06,
      "loss": 0.0485,
      "step": 324
    },
    {
      "epoch": 1.3724867724867724,
      "grad_norm": 1.032321094949915,
      "learning_rate": 7.85356783842216e-06,
      "loss": 0.0493,
      "step": 325
    },
    {
      "epoch": 1.3767195767195768,
      "grad_norm": 0.8860252372178579,
      "learning_rate": 7.839218725265507e-06,
      "loss": 0.0413,
      "step": 326
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.9104230123199901,
      "learning_rate": 7.82483501712469e-06,
      "loss": 0.0484,
      "step": 327
    },
    {
      "epoch": 1.3851851851851853,
      "grad_norm": 0.9105868366799567,
      "learning_rate": 7.810416889260653e-06,
      "loss": 0.033,
      "step": 328
    },
    {
      "epoch": 1.3894179894179894,
      "grad_norm": 0.8874460617855023,
      "learning_rate": 7.795964517353734e-06,
      "loss": 0.0418,
      "step": 329
    },
    {
      "epoch": 1.3936507936507936,
      "grad_norm": 0.6979249575618531,
      "learning_rate": 7.781478077501526e-06,
      "loss": 0.0276,
      "step": 330
    },
    {
      "epoch": 1.397883597883598,
      "grad_norm": 1.1401863184171674,
      "learning_rate": 7.76695774621672e-06,
      "loss": 0.0487,
      "step": 331
    },
    {
      "epoch": 1.402116402116402,
      "grad_norm": 1.180237330609796,
      "learning_rate": 7.752403700424978e-06,
      "loss": 0.0481,
      "step": 332
    },
    {
      "epoch": 1.4063492063492062,
      "grad_norm": 0.971305148025189,
      "learning_rate": 7.737816117462752e-06,
      "loss": 0.0533,
      "step": 333
    },
    {
      "epoch": 1.4105820105820106,
      "grad_norm": 1.143064790256319,
      "learning_rate": 7.723195175075136e-06,
      "loss": 0.0481,
      "step": 334
    },
    {
      "epoch": 1.4148148148148147,
      "grad_norm": 1.3039942970009886,
      "learning_rate": 7.7085410514137e-06,
      "loss": 0.0464,
      "step": 335
    },
    {
      "epoch": 1.4190476190476191,
      "grad_norm": 0.7197106725379593,
      "learning_rate": 7.693853925034316e-06,
      "loss": 0.0294,
      "step": 336
    },
    {
      "epoch": 1.4232804232804233,
      "grad_norm": 0.7576529055995104,
      "learning_rate": 7.679133974894984e-06,
      "loss": 0.0363,
      "step": 337
    },
    {
      "epoch": 1.4275132275132276,
      "grad_norm": 1.0996527863122494,
      "learning_rate": 7.66438138035365e-06,
      "loss": 0.0511,
      "step": 338
    },
    {
      "epoch": 1.4317460317460318,
      "grad_norm": 1.2175807102371912,
      "learning_rate": 7.649596321166024e-06,
      "loss": 0.0521,
      "step": 339
    },
    {
      "epoch": 1.435978835978836,
      "grad_norm": 1.1677918243054364,
      "learning_rate": 7.634778977483389e-06,
      "loss": 0.053,
      "step": 340
    },
    {
      "epoch": 1.4402116402116403,
      "grad_norm": 1.3661180396603787,
      "learning_rate": 7.619929529850397e-06,
      "loss": 0.0541,
      "step": 341
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 1.134002971754773,
      "learning_rate": 7.605048159202884e-06,
      "loss": 0.0418,
      "step": 342
    },
    {
      "epoch": 1.4486772486772486,
| "grad_norm": 1.2740661399159647, | |
| "learning_rate": 7.590135046865652e-06, | |
| "loss": 0.0477, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.452910052910053, | |
| "grad_norm": 0.9564673428495316, | |
| "learning_rate": 7.575190374550272e-06, | |
| "loss": 0.0424, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.457142857142857, | |
| "grad_norm": 0.8592538896773881, | |
| "learning_rate": 7.560214324352858e-06, | |
| "loss": 0.0313, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.4613756613756614, | |
| "grad_norm": 1.025295823546742, | |
| "learning_rate": 7.545207078751858e-06, | |
| "loss": 0.05, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.4656084656084656, | |
| "grad_norm": 1.005206182718304, | |
| "learning_rate": 7.530168820605819e-06, | |
| "loss": 0.0366, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.46984126984127, | |
| "grad_norm": 0.817943689379754, | |
| "learning_rate": 7.515099733151177e-06, | |
| "loss": 0.0348, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.474074074074074, | |
| "grad_norm": 0.9487808741793253, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.0462, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.4783068783068782, | |
| "grad_norm": 0.7940195414603822, | |
| "learning_rate": 7.484869805137778e-06, | |
| "loss": 0.0333, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.4825396825396826, | |
| "grad_norm": 2.594857831175931, | |
| "learning_rate": 7.469709332921155e-06, | |
| "loss": 0.0377, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.4867724867724867, | |
| "grad_norm": 0.6298189399907342, | |
| "learning_rate": 7.454518768075705e-06, | |
| "loss": 0.0298, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.491005291005291, | |
| "grad_norm": 1.2746680413135476, | |
| "learning_rate": 7.4392982956936644e-06, | |
| "loss": 0.056, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.4952380952380953, | |
| "grad_norm": 0.822040862171887, | |
| "learning_rate": 7.424048101231687e-06, | |
| "loss": 0.033, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.4994708994708994, | |
| "grad_norm": 1.1242964464799274, | |
| "learning_rate": 7.408768370508577e-06, | |
| "loss": 0.0517, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.5037037037037035, | |
| "grad_norm": 0.8701346706195384, | |
| "learning_rate": 7.393459289703035e-06, | |
| "loss": 0.0315, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.507936507936508, | |
| "grad_norm": 0.9133640557604403, | |
| "learning_rate": 7.378121045351378e-06, | |
| "loss": 0.039, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.5121693121693123, | |
| "grad_norm": 1.0021618381519257, | |
| "learning_rate": 7.362753824345271e-06, | |
| "loss": 0.0394, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.5164021164021164, | |
| "grad_norm": 0.8996476991571326, | |
| "learning_rate": 7.347357813929455e-06, | |
| "loss": 0.0446, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.5206349206349206, | |
| "grad_norm": 2.264884992718341, | |
| "learning_rate": 7.3319332016994575e-06, | |
| "loss": 0.0484, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.524867724867725, | |
| "grad_norm": 1.066146344538063, | |
| "learning_rate": 7.31648017559931e-06, | |
| "loss": 0.0436, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.529100529100529, | |
| "grad_norm": 0.9302808730777476, | |
| "learning_rate": 7.300998923919259e-06, | |
| "loss": 0.0328, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 0.9911847404144593, | |
| "learning_rate": 7.285489635293472e-06, | |
| "loss": 0.0423, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.5375661375661376, | |
| "grad_norm": 1.0364471553258094, | |
| "learning_rate": 7.269952498697734e-06, | |
| "loss": 0.0395, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.541798941798942, | |
| "grad_norm": 0.7662023272437689, | |
| "learning_rate": 7.254387703447154e-06, | |
| "loss": 0.0307, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.5460317460317459, | |
| "grad_norm": 1.0377441871093285, | |
| "learning_rate": 7.238795439193849e-06, | |
| "loss": 0.0389, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.5502645502645502, | |
| "grad_norm": 1.0484977562889242, | |
| "learning_rate": 7.223175895924638e-06, | |
| "loss": 0.0474, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.5544973544973546, | |
| "grad_norm": 1.3320962267108385, | |
| "learning_rate": 7.207529263958727e-06, | |
| "loss": 0.0461, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.5587301587301587, | |
| "grad_norm": 1.1981890101491062, | |
| "learning_rate": 7.191855733945388e-06, | |
| "loss": 0.046, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.5629629629629629, | |
| "grad_norm": 1.0353677690689251, | |
| "learning_rate": 7.176155496861639e-06, | |
| "loss": 0.0467, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.5671957671957673, | |
| "grad_norm": 0.9023734846152021, | |
| "learning_rate": 7.160428744009913e-06, | |
| "loss": 0.032, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 0.9795672938910132, | |
| "learning_rate": 7.1446756670157306e-06, | |
| "loss": 0.042, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.5756613756613755, | |
| "grad_norm": 0.8949456456419879, | |
| "learning_rate": 7.128896457825364e-06, | |
| "loss": 0.034, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.57989417989418, | |
| "grad_norm": 1.0979703990784655, | |
| "learning_rate": 7.113091308703498e-06, | |
| "loss": 0.0356, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.5841269841269843, | |
| "grad_norm": 0.9108245034081557, | |
| "learning_rate": 7.0972604122308865e-06, | |
| "loss": 0.0378, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.5883597883597882, | |
| "grad_norm": 0.9069830623011091, | |
| "learning_rate": 7.081403961302007e-06, | |
| "loss": 0.0382, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.5925925925925926, | |
| "grad_norm": 0.9298727976375222, | |
| "learning_rate": 7.06552214912271e-06, | |
| "loss": 0.0381, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.596825396825397, | |
| "grad_norm": 1.030708836778186, | |
| "learning_rate": 7.049615169207864e-06, | |
| "loss": 0.0369, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.601058201058201, | |
| "grad_norm": 1.0309556409580762, | |
| "learning_rate": 7.033683215379002e-06, | |
| "loss": 0.0439, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.6052910052910052, | |
| "grad_norm": 1.0407782701760113, | |
| "learning_rate": 7.0177264817619514e-06, | |
| "loss": 0.0421, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.6095238095238096, | |
| "grad_norm": 1.3354904716977571, | |
| "learning_rate": 7.0017451627844765e-06, | |
| "loss": 0.0526, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.6137566137566137, | |
| "grad_norm": 0.8800338194269767, | |
| "learning_rate": 6.985739453173903e-06, | |
| "loss": 0.0379, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.6179894179894179, | |
| "grad_norm": 0.752228395119178, | |
| "learning_rate": 6.9697095479547564e-06, | |
| "loss": 0.0306, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.6222222222222222, | |
| "grad_norm": 0.9413579645875557, | |
| "learning_rate": 6.953655642446368e-06, | |
| "loss": 0.0331, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.6264550264550266, | |
| "grad_norm": 0.9570770027126146, | |
| "learning_rate": 6.9375779322605154e-06, | |
| "loss": 0.0387, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.6306878306878307, | |
| "grad_norm": 0.8684490278106084, | |
| "learning_rate": 6.921476613299018e-06, | |
| "loss": 0.0355, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.6349206349206349, | |
| "grad_norm": 1.0632953241977892, | |
| "learning_rate": 6.905351881751372e-06, | |
| "loss": 0.0454, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.6391534391534393, | |
| "grad_norm": 1.15676003887316, | |
| "learning_rate": 6.889203934092337e-06, | |
| "loss": 0.0517, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.6433862433862434, | |
| "grad_norm": 0.8832495191608842, | |
| "learning_rate": 6.873032967079562e-06, | |
| "loss": 0.0358, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.6476190476190475, | |
| "grad_norm": 0.8891845587906834, | |
| "learning_rate": 6.856839177751175e-06, | |
| "loss": 0.0329, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.651851851851852, | |
| "grad_norm": 0.833507703150214, | |
| "learning_rate": 6.840622763423391e-06, | |
| "loss": 0.0314, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.656084656084656, | |
| "grad_norm": 0.8520667024026805, | |
| "learning_rate": 6.824383921688098e-06, | |
| "loss": 0.0336, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.6603174603174602, | |
| "grad_norm": 0.9617042714964271, | |
| "learning_rate": 6.808122850410461e-06, | |
| "loss": 0.0317, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.6645502645502646, | |
| "grad_norm": 0.9734363303715623, | |
| "learning_rate": 6.7918397477265e-06, | |
| "loss": 0.0483, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.668783068783069, | |
| "grad_norm": 1.0180051756915056, | |
| "learning_rate": 6.775534812040686e-06, | |
| "loss": 0.0438, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.673015873015873, | |
| "grad_norm": 0.9663036959934387, | |
| "learning_rate": 6.759208242023509e-06, | |
| "loss": 0.0411, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.6772486772486772, | |
| "grad_norm": 0.6966551244304431, | |
| "learning_rate": 6.7428602366090764e-06, | |
| "loss": 0.0234, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.6814814814814816, | |
| "grad_norm": 1.3759059743950008, | |
| "learning_rate": 6.7264909949926735e-06, | |
| "loss": 0.0291, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.6857142857142857, | |
| "grad_norm": 0.7565996741340549, | |
| "learning_rate": 6.710100716628345e-06, | |
| "loss": 0.0224, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.6899470899470899, | |
| "grad_norm": 1.1887100463923472, | |
| "learning_rate": 6.693689601226458e-06, | |
| "loss": 0.042, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.6941798941798942, | |
| "grad_norm": 0.966051588155485, | |
| "learning_rate": 6.677257848751276e-06, | |
| "loss": 0.0378, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.6984126984126984, | |
| "grad_norm": 0.8772916268098477, | |
| "learning_rate": 6.6608056594185166e-06, | |
| "loss": 0.0335, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.7026455026455025, | |
| "grad_norm": 0.8372311631604785, | |
| "learning_rate": 6.644333233692917e-06, | |
| "loss": 0.0367, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.7068783068783069, | |
| "grad_norm": 0.992879995777924, | |
| "learning_rate": 6.627840772285784e-06, | |
| "loss": 0.045, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.7111111111111112, | |
| "grad_norm": 0.9366130180501334, | |
| "learning_rate": 6.611328476152557e-06, | |
| "loss": 0.0289, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.7153439153439154, | |
| "grad_norm": 0.8915482740408233, | |
| "learning_rate": 6.594796546490351e-06, | |
| "loss": 0.0355, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.7195767195767195, | |
| "grad_norm": 1.3583953457116698, | |
| "learning_rate": 6.578245184735513e-06, | |
| "loss": 0.0461, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.723809523809524, | |
| "grad_norm": 0.8410154802690613, | |
| "learning_rate": 6.561674592561164e-06, | |
| "loss": 0.0352, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.728042328042328, | |
| "grad_norm": 0.9907489907823347, | |
| "learning_rate": 6.545084971874738e-06, | |
| "loss": 0.037, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.7322751322751322, | |
| "grad_norm": 0.7734556527152434, | |
| "learning_rate": 6.5284765248155295e-06, | |
| "loss": 0.0264, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.7365079365079366, | |
| "grad_norm": 0.8517169774537486, | |
| "learning_rate": 6.5118494537522235e-06, | |
| "loss": 0.0336, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.7407407407407407, | |
| "grad_norm": 1.0261903054202124, | |
| "learning_rate": 6.495203961280434e-06, | |
| "loss": 0.0435, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.7449735449735448, | |
| "grad_norm": 0.8195140295058992, | |
| "learning_rate": 6.4785402502202345e-06, | |
| "loss": 0.0319, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.7492063492063492, | |
| "grad_norm": 0.8064178126782999, | |
| "learning_rate": 6.461858523613684e-06, | |
| "loss": 0.0262, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.7534391534391536, | |
| "grad_norm": 1.059308376312486, | |
| "learning_rate": 6.445158984722358e-06, | |
| "loss": 0.0323, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.7576719576719577, | |
| "grad_norm": 0.7467600140287795, | |
| "learning_rate": 6.428441837024868e-06, | |
| "loss": 0.0274, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.7619047619047619, | |
| "grad_norm": 0.8230132283566644, | |
| "learning_rate": 6.411707284214384e-06, | |
| "loss": 0.0236, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.7661375661375662, | |
| "grad_norm": 0.787813130354089, | |
| "learning_rate": 6.3949555301961474e-06, | |
| "loss": 0.0355, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.7703703703703704, | |
| "grad_norm": 1.451682463238513, | |
| "learning_rate": 6.378186779084996e-06, | |
| "loss": 0.0437, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.7746031746031745, | |
| "grad_norm": 0.8747504739561074, | |
| "learning_rate": 6.361401235202872e-06, | |
| "loss": 0.0284, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.7788359788359789, | |
| "grad_norm": 1.070054317826232, | |
| "learning_rate": 6.344599103076329e-06, | |
| "loss": 0.0491, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.783068783068783, | |
| "grad_norm": 0.9839694781276292, | |
| "learning_rate": 6.327780587434045e-06, | |
| "loss": 0.0262, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.7873015873015872, | |
| "grad_norm": 1.9302390005333394, | |
| "learning_rate": 6.310945893204324e-06, | |
| "loss": 0.0381, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.7915343915343915, | |
| "grad_norm": 0.9588230652812939, | |
| "learning_rate": 6.294095225512604e-06, | |
| "loss": 0.0304, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.795767195767196, | |
| "grad_norm": 0.947273510951518, | |
| "learning_rate": 6.277228789678953e-06, | |
| "loss": 0.0468, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.8800021402393514, | |
| "learning_rate": 6.26034679121557e-06, | |
| "loss": 0.0199, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.8042328042328042, | |
| "grad_norm": 1.130364633548092, | |
| "learning_rate": 6.243449435824276e-06, | |
| "loss": 0.0319, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.8084656084656086, | |
| "grad_norm": 1.1066604882510598, | |
| "learning_rate": 6.2265369293940135e-06, | |
| "loss": 0.0329, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.8126984126984127, | |
| "grad_norm": 1.0550706667423015, | |
| "learning_rate": 6.209609477998339e-06, | |
| "loss": 0.0283, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.8169312169312168, | |
| "grad_norm": 0.9811790647444768, | |
| "learning_rate": 6.192667287892905e-06, | |
| "loss": 0.0335, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.8211640211640212, | |
| "grad_norm": 0.7827472464069767, | |
| "learning_rate": 6.17571056551295e-06, | |
| "loss": 0.0328, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.8253968253968254, | |
| "grad_norm": 0.7344784176592902, | |
| "learning_rate": 6.158739517470786e-06, | |
| "loss": 0.0244, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.8296296296296295, | |
| "grad_norm": 1.1298574892350588, | |
| "learning_rate": 6.141754350553279e-06, | |
| "loss": 0.041, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.8338624338624339, | |
| "grad_norm": 0.748246425007082, | |
| "learning_rate": 6.124755271719326e-06, | |
| "loss": 0.0281, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.8380952380952382, | |
| "grad_norm": 1.0435711016816667, | |
| "learning_rate": 6.107742488097338e-06, | |
| "loss": 0.0389, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.8423280423280424, | |
| "grad_norm": 1.0041457638530626, | |
| "learning_rate": 6.090716206982714e-06, | |
| "loss": 0.0328, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.8465608465608465, | |
| "grad_norm": 0.940959041938284, | |
| "learning_rate": 6.073676635835317e-06, | |
| "loss": 0.0274, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.8507936507936509, | |
| "grad_norm": 0.995917260966997, | |
| "learning_rate": 6.056623982276945e-06, | |
| "loss": 0.0302, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.855026455026455, | |
| "grad_norm": 0.6253437609792718, | |
| "learning_rate": 6.039558454088796e-06, | |
| "loss": 0.0275, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.8592592592592592, | |
| "grad_norm": 0.7265389917244132, | |
| "learning_rate": 6.022480259208951e-06, | |
| "loss": 0.0227, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.8634920634920635, | |
| "grad_norm": 1.0390428613384723, | |
| "learning_rate": 6.005389605729824e-06, | |
| "loss": 0.0372, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.8677248677248677, | |
| "grad_norm": 0.8380178245455322, | |
| "learning_rate": 5.988286701895631e-06, | |
| "loss": 0.027, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.8719576719576718, | |
| "grad_norm": 1.0320239189151428, | |
| "learning_rate": 5.97117175609986e-06, | |
| "loss": 0.0273, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.8761904761904762, | |
| "grad_norm": 0.9065424646753207, | |
| "learning_rate": 5.954044976882725e-06, | |
| "loss": 0.0278, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.8804232804232806, | |
| "grad_norm": 0.9178765588368991, | |
| "learning_rate": 5.936906572928625e-06, | |
| "loss": 0.0331, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.8846560846560847, | |
| "grad_norm": 0.7960677399262612, | |
| "learning_rate": 5.919756753063601e-06, | |
| "loss": 0.0193, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 1.280013434964197, | |
| "learning_rate": 5.902595726252801e-06, | |
| "loss": 0.0411, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.8931216931216932, | |
| "grad_norm": 1.1561219072627489, | |
| "learning_rate": 5.885423701597918e-06, | |
| "loss": 0.0243, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.8973544973544973, | |
| "grad_norm": 1.0393388710304052, | |
| "learning_rate": 5.8682408883346535e-06, | |
| "loss": 0.0365, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.9015873015873015, | |
| "grad_norm": 0.8819590478297261, | |
| "learning_rate": 5.851047495830163e-06, | |
| "loss": 0.0302, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.9058201058201059, | |
| "grad_norm": 1.778211477880275, | |
| "learning_rate": 5.8338437335805124e-06, | |
| "loss": 0.0498, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.91005291005291, | |
| "grad_norm": 0.6164685645460388, | |
| "learning_rate": 5.816629811208112e-06, | |
| "loss": 0.02, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.9142857142857141, | |
| "grad_norm": 1.022308010686047, | |
| "learning_rate": 5.799405938459175e-06, | |
| "loss": 0.0388, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.9185185185185185, | |
| "grad_norm": 0.8643023085266177, | |
| "learning_rate": 5.782172325201155e-06, | |
| "loss": 0.0233, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.9227513227513229, | |
| "grad_norm": 1.0448789439073158, | |
| "learning_rate": 5.764929181420191e-06, | |
| "loss": 0.0366, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.926984126984127, | |
| "grad_norm": 1.1136298371281064, | |
| "learning_rate": 5.747676717218549e-06, | |
| "loss": 0.0406, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.9312169312169312, | |
| "grad_norm": 0.8054456826112729, | |
| "learning_rate": 5.730415142812059e-06, | |
| "loss": 0.0204, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.9354497354497355, | |
| "grad_norm": 0.8704692982674614, | |
| "learning_rate": 5.7131446685275595e-06, | |
| "loss": 0.0228, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.9396825396825397, | |
| "grad_norm": 0.7876406159977293, | |
| "learning_rate": 5.695865504800328e-06, | |
| "loss": 0.0272, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.9439153439153438, | |
| "grad_norm": 0.9252100216116378, | |
| "learning_rate": 5.678577862171523e-06, | |
| "loss": 0.0237, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.9481481481481482, | |
| "grad_norm": 0.6893854806674795, | |
| "learning_rate": 5.661281951285613e-06, | |
| "loss": 0.0206, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.9523809523809523, | |
| "grad_norm": 0.7845843824207334, | |
| "learning_rate": 5.643977982887815e-06, | |
| "loss": 0.0312, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.9566137566137565, | |
| "grad_norm": 0.8049990285785104, | |
| "learning_rate": 5.626666167821522e-06, | |
| "loss": 0.0292, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.9608465608465608, | |
| "grad_norm": 1.148625600779929, | |
| "learning_rate": 5.609346717025738e-06, | |
| "loss": 0.0193, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.9650793650793652, | |
| "grad_norm": 0.6413181984555196, | |
| "learning_rate": 5.592019841532507e-06, | |
| "loss": 0.0198, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.9693121693121693, | |
| "grad_norm": 0.8580613271611244, | |
| "learning_rate": 5.5746857524643335e-06, | |
| "loss": 0.0219, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.9735449735449735, | |
| "grad_norm": 0.9676515609676355, | |
| "learning_rate": 5.557344661031628e-06, | |
| "loss": 0.0326, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.9777777777777779, | |
| "grad_norm": 0.9277086173806844, | |
| "learning_rate": 5.539996778530114e-06, | |
| "loss": 0.0331, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.982010582010582, | |
| "grad_norm": 1.2919252511895343, | |
| "learning_rate": 5.522642316338268e-06, | |
| "loss": 0.0298, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.9862433862433861, | |
| "grad_norm": 0.8640757560649422, | |
| "learning_rate": 5.505281485914732e-06, | |
| "loss": 0.0218, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.9904761904761905, | |
| "grad_norm": 1.1249585693174393, | |
| "learning_rate": 5.487914498795748e-06, | |
| "loss": 0.0418, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.9947089947089947, | |
| "grad_norm": 0.810724157613082, | |
| "learning_rate": 5.470541566592573e-06, | |
| "loss": 0.0294, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.9989417989417988, | |
| "grad_norm": 0.8074300914872133, | |
| "learning_rate": 5.453162900988902e-06, | |
| "loss": 0.0221, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.6149507180824806, | |
| "learning_rate": 5.435778713738292e-06, | |
| "loss": 0.0114, | |
| "step": 474 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 948, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 120380300677120.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
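
The listing above is the `trainer_state.json` that the Hugging Face `transformers` Trainer writes alongside each checkpoint. This one was saved at global step 474 of a configured `max_steps` of 948, i.e. at the end of epoch 2.0 of the 4 planned training epochs, with the per-step loss in the 0.01–0.05 range and the learning rate partway through what appears to be a cosine-style decay (about 8.2e-6 down to 5.4e-6 over this section).

Below is a minimal sketch of how one might load and visualize such a log. It assumes the file is saved locally as `trainer_state.json` (the filename and output path are assumptions, not part of the log itself) and that `matplotlib` is installed; the field names come directly from the `log_history` entries above.

```python
# Minimal sketch: plot loss and learning-rate curves from a Trainer state file.
# Assumes the JSON above is saved as "trainer_state.json" (hypothetical path).
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry carries step, epoch, loss, learning_rate, grad_norm.
history = state["log_history"]
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]
lrs = [entry["learning_rate"] for entry in history]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
fig.savefig("trainer_curves.png")  # or plt.show() in an interactive session
```

With `logging_steps` set to 1, every optimizer step is logged, so the resulting curves are dense enough to show both the loss trajectory and the schedule shape without any smoothing.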