diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4584 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4444444444444444, + "eval_steps": 500, + "global_step": 325000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0022222222222222222, + "grad_norm": 4388.67578125, + "learning_rate": 5.988e-07, + "loss": 891.7704, + "step": 500 + }, + { + "epoch": 0.0044444444444444444, + "grad_norm": 1130.9163818359375, + "learning_rate": 1.1988e-06, + "loss": 365.2608, + "step": 1000 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 647.9131469726562, + "learning_rate": 1.7988e-06, + "loss": 143.2146, + "step": 1500 + }, + { + "epoch": 0.008888888888888889, + "grad_norm": 862.5914916992188, + "learning_rate": 2.3988000000000002e-06, + "loss": 101.8926, + "step": 2000 + }, + { + "epoch": 0.011111111111111112, + "grad_norm": 874.10302734375, + "learning_rate": 2.9988e-06, + "loss": 86.5583, + "step": 2500 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 732.438720703125, + "learning_rate": 3.5988e-06, + "loss": 80.9323, + "step": 3000 + }, + { + "epoch": 0.015555555555555555, + "grad_norm": 493.2248229980469, + "learning_rate": 4.1988e-06, + "loss": 73.8484, + "step": 3500 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 678.9496459960938, + "learning_rate": 4.7988e-06, + "loss": 68.8807, + "step": 4000 + }, + { + "epoch": 0.02, + "grad_norm": 2241.881103515625, + "learning_rate": 5.398800000000001e-06, + "loss": 69.1163, + "step": 4500 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 572.26318359375, + "learning_rate": 5.9988e-06, + "loss": 65.9477, + "step": 5000 + }, + { + "epoch": 0.024444444444444446, + "grad_norm": 472.3359069824219, + "learning_rate": 6.5988e-06, + "loss": 60.6877, + "step": 5500 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 713.2996215820312, + "learning_rate": 7.1988000000000004e-06, + "loss": 62.0643, + "step": 6000 + }, + { + "epoch": 0.028888888888888888, + "grad_norm": 399.187255859375, + "learning_rate": 7.7988e-06, + "loss": 58.1376, + "step": 6500 + }, + { + "epoch": 0.03111111111111111, + "grad_norm": 494.1978454589844, + "learning_rate": 8.3988e-06, + "loss": 56.4748, + "step": 7000 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 338.4364318847656, + "learning_rate": 8.998800000000001e-06, + "loss": 59.7625, + "step": 7500 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 287.89202880859375, + "learning_rate": 9.5988e-06, + "loss": 55.0997, + "step": 8000 + }, + { + "epoch": 0.03777777777777778, + "grad_norm": 213.35813903808594, + "learning_rate": 1.01988e-05, + "loss": 53.2111, + "step": 8500 + }, + { + "epoch": 0.04, + "grad_norm": 354.8004455566406, + "learning_rate": 1.07988e-05, + "loss": 53.5394, + "step": 9000 + }, + { + "epoch": 0.042222222222222223, + "grad_norm": 875.28955078125, + "learning_rate": 1.1398800000000002e-05, + "loss": 52.944, + "step": 9500 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 523.3621215820312, + "learning_rate": 1.19988e-05, + "loss": 50.8715, + "step": 10000 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 545.8438720703125, + "learning_rate": 1.25988e-05, + "loss": 51.0906, + "step": 10500 + }, + { + "epoch": 0.04888888888888889, + "grad_norm": 371.3891296386719, + "learning_rate": 1.3198800000000001e-05, + "loss": 49.5472, + "step": 11000 + }, + { + "epoch": 0.051111111111111114, + "grad_norm": 175.73524475097656, + "learning_rate": 1.3798799999999999e-05, + "loss": 47.1287, + "step": 11500 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 335.2581481933594, + "learning_rate": 1.43988e-05, + "loss": 47.6528, + "step": 12000 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1022.18115234375, + "learning_rate": 1.4998800000000001e-05, + "loss": 46.9557, + "step": 12500 + }, + { + "epoch": 0.057777777777777775, + "grad_norm": 380.919677734375, + "learning_rate": 1.55988e-05, + "loss": 44.6385, + "step": 13000 + }, + { + "epoch": 0.06, + "grad_norm": 305.0384826660156, + "learning_rate": 1.61988e-05, + "loss": 44.5282, + "step": 13500 + }, + { + "epoch": 0.06222222222222222, + "grad_norm": 458.19122314453125, + "learning_rate": 1.67988e-05, + "loss": 44.6465, + "step": 14000 + }, + { + "epoch": 0.06444444444444444, + "grad_norm": 143.66160583496094, + "learning_rate": 1.73988e-05, + "loss": 44.0934, + "step": 14500 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 436.7533874511719, + "learning_rate": 1.79988e-05, + "loss": 43.5587, + "step": 15000 + }, + { + "epoch": 0.06888888888888889, + "grad_norm": 455.068359375, + "learning_rate": 1.85988e-05, + "loss": 41.507, + "step": 15500 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 394.86676025390625, + "learning_rate": 1.91988e-05, + "loss": 40.521, + "step": 16000 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 371.15753173828125, + "learning_rate": 1.97988e-05, + "loss": 40.0934, + "step": 16500 + }, + { + "epoch": 0.07555555555555556, + "grad_norm": 476.3223571777344, + "learning_rate": 2.0398800000000002e-05, + "loss": 42.2142, + "step": 17000 + }, + { + "epoch": 0.07777777777777778, + "grad_norm": 498.6954650878906, + "learning_rate": 2.0998800000000003e-05, + "loss": 39.011, + "step": 17500 + }, + { + "epoch": 0.08, + "grad_norm": 327.6210632324219, + "learning_rate": 2.15988e-05, + "loss": 39.5519, + "step": 18000 + }, + { + "epoch": 0.08222222222222222, + "grad_norm": 210.87628173828125, + "learning_rate": 2.2198799999999998e-05, + "loss": 39.4893, + "step": 18500 + }, + { + "epoch": 0.08444444444444445, + "grad_norm": 357.408203125, + "learning_rate": 2.27988e-05, + "loss": 39.7812, + "step": 19000 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 312.556640625, + "learning_rate": 2.33988e-05, + "loss": 37.975, + "step": 19500 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 363.57891845703125, + "learning_rate": 2.39988e-05, + "loss": 36.2815, + "step": 20000 + }, + { + "epoch": 0.09111111111111111, + "grad_norm": 332.95977783203125, + "learning_rate": 2.4598800000000002e-05, + "loss": 36.7108, + "step": 20500 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 483.03765869140625, + "learning_rate": 2.5198800000000003e-05, + "loss": 36.0883, + "step": 21000 + }, + { + "epoch": 0.09555555555555556, + "grad_norm": 266.86065673828125, + "learning_rate": 2.5798799999999998e-05, + "loss": 38.5255, + "step": 21500 + }, + { + "epoch": 0.09777777777777778, + "grad_norm": 371.4537048339844, + "learning_rate": 2.63988e-05, + "loss": 34.8224, + "step": 22000 + }, + { + "epoch": 0.1, + "grad_norm": 1334.1453857421875, + "learning_rate": 2.69988e-05, + "loss": 36.1617, + "step": 22500 + }, + { + "epoch": 0.10222222222222223, + "grad_norm": 234.84649658203125, + "learning_rate": 2.75988e-05, + "loss": 35.088, + "step": 23000 + }, + { + "epoch": 0.10444444444444445, + "grad_norm": 2964.02978515625, + "learning_rate": 2.8198800000000002e-05, + "loss": 34.028, + "step": 23500 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 456.6842956542969, + "learning_rate": 2.8798800000000003e-05, + "loss": 36.25, + "step": 24000 + }, + { + "epoch": 0.10888888888888888, + "grad_norm": 306.76007080078125, + "learning_rate": 2.9398800000000004e-05, + "loss": 33.3643, + "step": 24500 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 818.77783203125, + "learning_rate": 2.9998799999999998e-05, + "loss": 36.2583, + "step": 25000 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 173.24815368652344, + "learning_rate": 2.9999918308948427e-05, + "loss": 36.2218, + "step": 25500 + }, + { + "epoch": 0.11555555555555555, + "grad_norm": 542.15234375, + "learning_rate": 2.9999672581521505e-05, + "loss": 33.669, + "step": 26000 + }, + { + "epoch": 0.11777777777777777, + "grad_norm": 663.7468872070312, + "learning_rate": 2.999926282007839e-05, + "loss": 33.3195, + "step": 26500 + }, + { + "epoch": 0.12, + "grad_norm": 237.98435974121094, + "learning_rate": 2.9998689029100164e-05, + "loss": 34.6775, + "step": 27000 + }, + { + "epoch": 0.12222222222222222, + "grad_norm": 350.93109130859375, + "learning_rate": 2.9997951214861724e-05, + "loss": 32.0158, + "step": 27500 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 648.1705322265625, + "learning_rate": 2.999704938543168e-05, + "loss": 33.583, + "step": 28000 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 263.5220642089844, + "learning_rate": 2.9995983550672296e-05, + "loss": 33.9471, + "step": 28500 + }, + { + "epoch": 0.1288888888888889, + "grad_norm": 193.79708862304688, + "learning_rate": 2.9994753722239374e-05, + "loss": 32.0882, + "step": 29000 + }, + { + "epoch": 0.13111111111111112, + "grad_norm": 584.5958862304688, + "learning_rate": 2.999335991358211e-05, + "loss": 32.2817, + "step": 29500 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 498.8976745605469, + "learning_rate": 2.999180213994299e-05, + "loss": 31.1343, + "step": 30000 + }, + { + "epoch": 0.13555555555555557, + "grad_norm": 492.1926574707031, + "learning_rate": 2.9990080418357563e-05, + "loss": 30.703, + "step": 30500 + }, + { + "epoch": 0.13777777777777778, + "grad_norm": 389.2348937988281, + "learning_rate": 2.99881947676543e-05, + "loss": 32.2483, + "step": 31000 + }, + { + "epoch": 0.14, + "grad_norm": 687.1718139648438, + "learning_rate": 2.9986145208454382e-05, + "loss": 31.1763, + "step": 31500 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 404.84326171875, + "learning_rate": 2.998393176317146e-05, + "loss": 31.7738, + "step": 32000 + }, + { + "epoch": 0.14444444444444443, + "grad_norm": 492.9033203125, + "learning_rate": 2.9981554456011407e-05, + "loss": 31.7717, + "step": 32500 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 393.6338195800781, + "learning_rate": 2.997901331297209e-05, + "loss": 30.5822, + "step": 33000 + }, + { + "epoch": 0.14888888888888888, + "grad_norm": 510.1676025390625, + "learning_rate": 2.9976308361843024e-05, + "loss": 28.6046, + "step": 33500 + }, + { + "epoch": 0.1511111111111111, + "grad_norm": 547.7921142578125, + "learning_rate": 2.997343963220513e-05, + "loss": 29.9463, + "step": 34000 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 481.76092529296875, + "learning_rate": 2.997040715543038e-05, + "loss": 29.8005, + "step": 34500 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 394.83935546875, + "learning_rate": 2.9967210964681447e-05, + "loss": 29.8433, + "step": 35000 + }, + { + "epoch": 0.15777777777777777, + "grad_norm": 223.97235107421875, + "learning_rate": 2.9963851094911362e-05, + "loss": 30.1751, + "step": 35500 + }, + { + "epoch": 0.16, + "grad_norm": 587.9564819335938, + "learning_rate": 2.9960327582863126e-05, + "loss": 28.0523, + "step": 36000 + }, + { + "epoch": 0.1622222222222222, + "grad_norm": 786.5308227539062, + "learning_rate": 2.9956640467069298e-05, + "loss": 30.0858, + "step": 36500 + }, + { + "epoch": 0.16444444444444445, + "grad_norm": 627.6124267578125, + "learning_rate": 2.995278978785159e-05, + "loss": 27.514, + "step": 37000 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 135.85784912109375, + "learning_rate": 2.9948775587320413e-05, + "loss": 29.0652, + "step": 37500 + }, + { + "epoch": 0.1688888888888889, + "grad_norm": 516.0145874023438, + "learning_rate": 2.9944597909374416e-05, + "loss": 28.7626, + "step": 38000 + }, + { + "epoch": 0.1711111111111111, + "grad_norm": 381.4872131347656, + "learning_rate": 2.994025679970002e-05, + "loss": 30.4396, + "step": 38500 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 612.7399291992188, + "learning_rate": 2.99357523057709e-05, + "loss": 26.5003, + "step": 39000 + }, + { + "epoch": 0.17555555555555555, + "grad_norm": 365.5273132324219, + "learning_rate": 2.9931084476847486e-05, + "loss": 27.6445, + "step": 39500 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 117.53230285644531, + "learning_rate": 2.99262533639764e-05, + "loss": 26.8894, + "step": 40000 + }, + { + "epoch": 0.18, + "grad_norm": 895.5122680664062, + "learning_rate": 2.9921259019989926e-05, + "loss": 26.3664, + "step": 40500 + }, + { + "epoch": 0.18222222222222223, + "grad_norm": 493.69683837890625, + "learning_rate": 2.9916101499505408e-05, + "loss": 25.5829, + "step": 41000 + }, + { + "epoch": 0.18444444444444444, + "grad_norm": 469.6036376953125, + "learning_rate": 2.9910780858924657e-05, + "loss": 27.9183, + "step": 41500 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 539.50390625, + "learning_rate": 2.9905297156433357e-05, + "loss": 27.7629, + "step": 42000 + }, + { + "epoch": 0.18888888888888888, + "grad_norm": 127.55433654785156, + "learning_rate": 2.9899650452000393e-05, + "loss": 26.9212, + "step": 42500 + }, + { + "epoch": 0.19111111111111112, + "grad_norm": 361.29010009765625, + "learning_rate": 2.9893840807377214e-05, + "loss": 25.828, + "step": 43000 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 603.46533203125, + "learning_rate": 2.988786828609718e-05, + "loss": 27.1813, + "step": 43500 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 94.64213562011719, + "learning_rate": 2.988173295347481e-05, + "loss": 28.3537, + "step": 44000 + }, + { + "epoch": 0.19777777777777777, + "grad_norm": 1213.6317138671875, + "learning_rate": 2.987543487660513e-05, + "loss": 25.5299, + "step": 44500 + }, + { + "epoch": 0.2, + "grad_norm": 504.8955993652344, + "learning_rate": 2.986897412436289e-05, + "loss": 29.0305, + "step": 45000 + }, + { + "epoch": 0.20222222222222222, + "grad_norm": 734.322021484375, + "learning_rate": 2.9862350767401846e-05, + "loss": 28.3809, + "step": 45500 + }, + { + "epoch": 0.20444444444444446, + "grad_norm": 1137.0435791015625, + "learning_rate": 2.9855564878153972e-05, + "loss": 26.6201, + "step": 46000 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 373.8830871582031, + "learning_rate": 2.984861653082866e-05, + "loss": 25.7129, + "step": 46500 + }, + { + "epoch": 0.2088888888888889, + "grad_norm": 263.8885498046875, + "learning_rate": 2.9841505801411928e-05, + "loss": 26.2681, + "step": 47000 + }, + { + "epoch": 0.2111111111111111, + "grad_norm": 1805.83984375, + "learning_rate": 2.983423276766557e-05, + "loss": 26.6592, + "step": 47500 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 286.2330627441406, + "learning_rate": 2.982679750912632e-05, + "loss": 25.0459, + "step": 48000 + }, + { + "epoch": 0.21555555555555556, + "grad_norm": 219.3948516845703, + "learning_rate": 2.9819200107104972e-05, + "loss": 25.5699, + "step": 48500 + }, + { + "epoch": 0.21777777777777776, + "grad_norm": 412.9397888183594, + "learning_rate": 2.98114406446855e-05, + "loss": 26.1915, + "step": 49000 + }, + { + "epoch": 0.22, + "grad_norm": 602.8424682617188, + "learning_rate": 2.9803519206724136e-05, + "loss": 27.0685, + "step": 49500 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 149.6744384765625, + "learning_rate": 2.9795435879848466e-05, + "loss": 24.8978, + "step": 50000 + }, + { + "epoch": 0.22444444444444445, + "grad_norm": 339.0307312011719, + "learning_rate": 2.9787190752456448e-05, + "loss": 23.1352, + "step": 50500 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 627.1898193359375, + "learning_rate": 2.977878391471548e-05, + "loss": 25.7614, + "step": 51000 + }, + { + "epoch": 0.2288888888888889, + "grad_norm": 959.9122924804688, + "learning_rate": 2.9770215458561394e-05, + "loss": 23.909, + "step": 51500 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 290.6165466308594, + "learning_rate": 2.976148547769745e-05, + "loss": 25.6165, + "step": 52000 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 337.4861755371094, + "learning_rate": 2.9752594067593318e-05, + "loss": 24.7856, + "step": 52500 + }, + { + "epoch": 0.23555555555555555, + "grad_norm": 1252.9945068359375, + "learning_rate": 2.974354132548404e-05, + "loss": 25.353, + "step": 53000 + }, + { + "epoch": 0.23777777777777778, + "grad_norm": 186.39710998535156, + "learning_rate": 2.973432735036895e-05, + "loss": 24.7965, + "step": 53500 + }, + { + "epoch": 0.24, + "grad_norm": 795.011962890625, + "learning_rate": 2.9724952243010605e-05, + "loss": 24.6118, + "step": 54000 + }, + { + "epoch": 0.24222222222222223, + "grad_norm": 217.4955291748047, + "learning_rate": 2.9715416105933675e-05, + "loss": 24.6205, + "step": 54500 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 310.7270812988281, + "learning_rate": 2.970571904342383e-05, + "loss": 24.1833, + "step": 55000 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 250.29307556152344, + "learning_rate": 2.969586116152659e-05, + "loss": 24.082, + "step": 55500 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 243.90106201171875, + "learning_rate": 2.9685842568046167e-05, + "loss": 23.5486, + "step": 56000 + }, + { + "epoch": 0.2511111111111111, + "grad_norm": 281.5003967285156, + "learning_rate": 2.967566337254431e-05, + "loss": 22.6343, + "step": 56500 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 190.99545288085938, + "learning_rate": 2.9665323686339052e-05, + "loss": 25.0189, + "step": 57000 + }, + { + "epoch": 0.25555555555555554, + "grad_norm": 400.95361328125, + "learning_rate": 2.9654823622503557e-05, + "loss": 23.9388, + "step": 57500 + }, + { + "epoch": 0.2577777777777778, + "grad_norm": 74.59510040283203, + "learning_rate": 2.9644163295864836e-05, + "loss": 24.4699, + "step": 58000 + }, + { + "epoch": 0.26, + "grad_norm": 650.9434204101562, + "learning_rate": 2.9633342823002515e-05, + "loss": 22.5825, + "step": 58500 + }, + { + "epoch": 0.26222222222222225, + "grad_norm": 359.67315673828125, + "learning_rate": 2.9622362322247548e-05, + "loss": 24.1618, + "step": 59000 + }, + { + "epoch": 0.2644444444444444, + "grad_norm": 0.0, + "learning_rate": 2.9611221913680935e-05, + "loss": 22.4548, + "step": 59500 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 392.0536804199219, + "learning_rate": 2.9599921719132397e-05, + "loss": 22.0985, + "step": 60000 + }, + { + "epoch": 0.2688888888888889, + "grad_norm": 220.76341247558594, + "learning_rate": 2.9588461862179055e-05, + "loss": 22.2635, + "step": 60500 + }, + { + "epoch": 0.27111111111111114, + "grad_norm": 179.5050048828125, + "learning_rate": 2.9576842468144067e-05, + "loss": 22.9824, + "step": 61000 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 625.1077270507812, + "learning_rate": 2.9565063664095265e-05, + "loss": 23.0385, + "step": 61500 + }, + { + "epoch": 0.27555555555555555, + "grad_norm": 787.576171875, + "learning_rate": 2.955312557884376e-05, + "loss": 23.6391, + "step": 62000 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 287.6144714355469, + "learning_rate": 2.954102834294254e-05, + "loss": 22.4223, + "step": 62500 + }, + { + "epoch": 0.28, + "grad_norm": 598.0758666992188, + "learning_rate": 2.9528772088685042e-05, + "loss": 22.2955, + "step": 63000 + }, + { + "epoch": 0.2822222222222222, + "grad_norm": 567.0135498046875, + "learning_rate": 2.9516356950103695e-05, + "loss": 22.5473, + "step": 63500 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 209.81381225585938, + "learning_rate": 2.950378306296847e-05, + "loss": 23.5631, + "step": 64000 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 413.2209167480469, + "learning_rate": 2.9491050564785384e-05, + "loss": 23.1249, + "step": 64500 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 140.22494506835938, + "learning_rate": 2.9478159594794985e-05, + "loss": 23.2432, + "step": 65000 + }, + { + "epoch": 0.2911111111111111, + "grad_norm": 322.0098571777344, + "learning_rate": 2.946511029397087e-05, + "loss": 23.1568, + "step": 65500 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 204.205810546875, + "learning_rate": 2.945190280501809e-05, + "loss": 23.9367, + "step": 66000 + }, + { + "epoch": 0.29555555555555557, + "grad_norm": 247.4243621826172, + "learning_rate": 2.943853727237164e-05, + "loss": 23.2841, + "step": 66500 + }, + { + "epoch": 0.29777777777777775, + "grad_norm": 767.0619506835938, + "learning_rate": 2.9425013842194833e-05, + "loss": 23.7975, + "step": 67000 + }, + { + "epoch": 0.3, + "grad_norm": 1255.4112548828125, + "learning_rate": 2.9411332662377744e-05, + "loss": 23.7579, + "step": 67500 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 444.0653991699219, + "learning_rate": 2.9397493882535556e-05, + "loss": 22.0943, + "step": 68000 + }, + { + "epoch": 0.30444444444444446, + "grad_norm": 362.8856506347656, + "learning_rate": 2.9383497654006945e-05, + "loss": 22.6397, + "step": 68500 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 450.62237548828125, + "learning_rate": 2.936934412985244e-05, + "loss": 22.2143, + "step": 69000 + }, + { + "epoch": 0.3088888888888889, + "grad_norm": 148.87391662597656, + "learning_rate": 2.9355033464852697e-05, + "loss": 21.7673, + "step": 69500 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 182.1023406982422, + "learning_rate": 2.9340565815506865e-05, + "loss": 22.5551, + "step": 70000 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 289.2044677734375, + "learning_rate": 2.932594134003083e-05, + "loss": 22.7895, + "step": 70500 + }, + { + "epoch": 0.31555555555555553, + "grad_norm": 0.0, + "learning_rate": 2.931116019835553e-05, + "loss": 22.729, + "step": 71000 + }, + { + "epoch": 0.31777777777777777, + "grad_norm": 361.7475891113281, + "learning_rate": 2.9296222552125148e-05, + "loss": 21.4155, + "step": 71500 + }, + { + "epoch": 0.32, + "grad_norm": 391.5496520996094, + "learning_rate": 2.928112856469539e-05, + "loss": 22.2849, + "step": 72000 + }, + { + "epoch": 0.32222222222222224, + "grad_norm": 429.3208923339844, + "learning_rate": 2.9265878401131687e-05, + "loss": 20.7871, + "step": 72500 + }, + { + "epoch": 0.3244444444444444, + "grad_norm": 912.58154296875, + "learning_rate": 2.9250472228207387e-05, + "loss": 20.8959, + "step": 73000 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 145.02476501464844, + "learning_rate": 2.9234910214401926e-05, + "loss": 22.3574, + "step": 73500 + }, + { + "epoch": 0.3288888888888889, + "grad_norm": 313.38629150390625, + "learning_rate": 2.9219192529899e-05, + "loss": 22.3035, + "step": 74000 + }, + { + "epoch": 0.33111111111111113, + "grad_norm": 416.150146484375, + "learning_rate": 2.9203319346584673e-05, + "loss": 22.091, + "step": 74500 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 125.51025390625, + "learning_rate": 2.9187290838045552e-05, + "loss": 21.6607, + "step": 75000 + }, + { + "epoch": 0.33555555555555555, + "grad_norm": 256.96875, + "learning_rate": 2.9171107179566826e-05, + "loss": 21.8178, + "step": 75500 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 1280.6885986328125, + "learning_rate": 2.91547685481304e-05, + "loss": 21.1816, + "step": 76000 + }, + { + "epoch": 0.34, + "grad_norm": 276.4981994628906, + "learning_rate": 2.9138275122412927e-05, + "loss": 21.1474, + "step": 76500 + }, + { + "epoch": 0.3422222222222222, + "grad_norm": 0.0, + "learning_rate": 2.9121627082783864e-05, + "loss": 21.2128, + "step": 77000 + }, + { + "epoch": 0.34444444444444444, + "grad_norm": 779.9710693359375, + "learning_rate": 2.910482461130351e-05, + "loss": 21.6096, + "step": 77500 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 726.4488525390625, + "learning_rate": 2.9087867891721e-05, + "loss": 20.5737, + "step": 78000 + }, + { + "epoch": 0.3488888888888889, + "grad_norm": 867.9049682617188, + "learning_rate": 2.90707571094723e-05, + "loss": 21.431, + "step": 78500 + }, + { + "epoch": 0.3511111111111111, + "grad_norm": 1406.6778564453125, + "learning_rate": 2.905349245167819e-05, + "loss": 22.8944, + "step": 79000 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 30.834983825683594, + "learning_rate": 2.903607410714219e-05, + "loss": 20.6775, + "step": 79500 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 307.98822021484375, + "learning_rate": 2.9018502266348537e-05, + "loss": 19.7868, + "step": 80000 + }, + { + "epoch": 0.35777777777777775, + "grad_norm": 897.4186401367188, + "learning_rate": 2.900077712146006e-05, + "loss": 22.5855, + "step": 80500 + }, + { + "epoch": 0.36, + "grad_norm": 203.12339782714844, + "learning_rate": 2.8982898866316107e-05, + "loss": 21.1752, + "step": 81000 + }, + { + "epoch": 0.3622222222222222, + "grad_norm": 220.3880157470703, + "learning_rate": 2.8964867696430412e-05, + "loss": 21.3629, + "step": 81500 + }, + { + "epoch": 0.36444444444444446, + "grad_norm": 46.697349548339844, + "learning_rate": 2.8946683808988956e-05, + "loss": 21.3887, + "step": 82000 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 179.70164489746094, + "learning_rate": 2.892834740284782e-05, + "loss": 21.825, + "step": 82500 + }, + { + "epoch": 0.3688888888888889, + "grad_norm": 518.6677856445312, + "learning_rate": 2.8909858678531007e-05, + "loss": 20.7174, + "step": 83000 + }, + { + "epoch": 0.3711111111111111, + "grad_norm": 643.6600952148438, + "learning_rate": 2.889121783822824e-05, + "loss": 22.1913, + "step": 83500 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 262.9464111328125, + "learning_rate": 2.887242508579277e-05, + "loss": 22.0347, + "step": 84000 + }, + { + "epoch": 0.37555555555555553, + "grad_norm": 1396.894775390625, + "learning_rate": 2.8853480626739115e-05, + "loss": 20.4351, + "step": 84500 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 597.5517578125, + "learning_rate": 2.883438466824085e-05, + "loss": 19.2972, + "step": 85000 + }, + { + "epoch": 0.38, + "grad_norm": 238.96986389160156, + "learning_rate": 2.8815137419128317e-05, + "loss": 20.8544, + "step": 85500 + }, + { + "epoch": 0.38222222222222224, + "grad_norm": 118.68024444580078, + "learning_rate": 2.8795739089886353e-05, + "loss": 20.0097, + "step": 86000 + }, + { + "epoch": 0.3844444444444444, + "grad_norm": 281.3915100097656, + "learning_rate": 2.877618989265197e-05, + "loss": 19.3276, + "step": 86500 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 412.5798645019531, + "learning_rate": 2.8756490041212067e-05, + "loss": 20.9107, + "step": 87000 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 897.23095703125, + "learning_rate": 2.8736639751001056e-05, + "loss": 21.3243, + "step": 87500 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 1561.7535400390625, + "learning_rate": 2.871663923909853e-05, + "loss": 20.2997, + "step": 88000 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 219.94825744628906, + "learning_rate": 2.8696488724226884e-05, + "loss": 19.0194, + "step": 88500 + }, + { + "epoch": 0.39555555555555555, + "grad_norm": 175.09353637695312, + "learning_rate": 2.8676188426748923e-05, + "loss": 20.7055, + "step": 89000 + }, + { + "epoch": 0.3977777777777778, + "grad_norm": 282.50933837890625, + "learning_rate": 2.8655738568665447e-05, + "loss": 19.1337, + "step": 89500 + }, + { + "epoch": 0.4, + "grad_norm": 60.395172119140625, + "learning_rate": 2.863513937361283e-05, + "loss": 20.728, + "step": 90000 + }, + { + "epoch": 0.4022222222222222, + "grad_norm": 314.94561767578125, + "learning_rate": 2.861439106686056e-05, + "loss": 19.575, + "step": 90500 + }, + { + "epoch": 0.40444444444444444, + "grad_norm": 473.822998046875, + "learning_rate": 2.8593493875308805e-05, + "loss": 20.2208, + "step": 91000 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 412.5682373046875, + "learning_rate": 2.8572448027485896e-05, + "loss": 19.7487, + "step": 91500 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 155.67567443847656, + "learning_rate": 2.855125375354586e-05, + "loss": 18.5899, + "step": 92000 + }, + { + "epoch": 0.4111111111111111, + "grad_norm": 401.43621826171875, + "learning_rate": 2.8529911285265876e-05, + "loss": 21.001, + "step": 92500 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 379.79302978515625, + "learning_rate": 2.8508420856043763e-05, + "loss": 19.6731, + "step": 93000 + }, + { + "epoch": 0.41555555555555557, + "grad_norm": 224.41383361816406, + "learning_rate": 2.8486782700895407e-05, + "loss": 19.2887, + "step": 93500 + }, + { + "epoch": 0.4177777777777778, + "grad_norm": 164.6722412109375, + "learning_rate": 2.8464997056452206e-05, + "loss": 20.0013, + "step": 94000 + }, + { + "epoch": 0.42, + "grad_norm": 241.1973876953125, + "learning_rate": 2.8443064160958483e-05, + "loss": 18.3981, + "step": 94500 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 790.732421875, + "learning_rate": 2.8420984254268863e-05, + "loss": 18.5947, + "step": 95000 + }, + { + "epoch": 0.42444444444444446, + "grad_norm": 446.4692687988281, + "learning_rate": 2.8398757577845665e-05, + "loss": 19.8438, + "step": 95500 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 17.384523391723633, + "learning_rate": 2.837638437475627e-05, + "loss": 19.1518, + "step": 96000 + }, + { + "epoch": 0.4288888888888889, + "grad_norm": 292.8326416015625, + "learning_rate": 2.8353864889670442e-05, + "loss": 18.9518, + "step": 96500 + }, + { + "epoch": 0.4311111111111111, + "grad_norm": 1216.1114501953125, + "learning_rate": 2.8331199368857656e-05, + "loss": 19.3502, + "step": 97000 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 256.9949035644531, + "learning_rate": 2.830838806018442e-05, + "loss": 18.1643, + "step": 97500 + }, + { + "epoch": 0.43555555555555553, + "grad_norm": 203.0587615966797, + "learning_rate": 2.8285431213111548e-05, + "loss": 19.173, + "step": 98000 + }, + { + "epoch": 0.43777777777777777, + "grad_norm": 290.00775146484375, + "learning_rate": 2.826232907869145e-05, + "loss": 20.2496, + "step": 98500 + }, + { + "epoch": 0.44, + "grad_norm": 437.4803771972656, + "learning_rate": 2.823908190956535e-05, + "loss": 19.568, + "step": 99000 + }, + { + "epoch": 0.44222222222222224, + "grad_norm": 79.48589324951172, + "learning_rate": 2.821568995996058e-05, + "loss": 18.2379, + "step": 99500 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 252.00978088378906, + "learning_rate": 2.8192153485687752e-05, + "loss": 19.322, + "step": 100000 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 220.2042999267578, + "learning_rate": 2.8168472744137977e-05, + "loss": 18.7556, + "step": 100500 + }, + { + "epoch": 0.4488888888888889, + "grad_norm": 260.3736572265625, + "learning_rate": 2.814464799428004e-05, + "loss": 18.9124, + "step": 101000 + }, + { + "epoch": 0.45111111111111113, + "grad_norm": 593.2783203125, + "learning_rate": 2.8120679496657602e-05, + "loss": 19.0002, + "step": 101500 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 1167.1844482421875, + "learning_rate": 2.80965675133863e-05, + "loss": 19.2148, + "step": 102000 + }, + { + "epoch": 0.45555555555555555, + "grad_norm": 15.313830375671387, + "learning_rate": 2.8072312308150934e-05, + "loss": 18.2168, + "step": 102500 + }, + { + "epoch": 0.4577777777777778, + "grad_norm": 200.6254119873047, + "learning_rate": 2.8047914146202533e-05, + "loss": 19.3346, + "step": 103000 + }, + { + "epoch": 0.46, + "grad_norm": 426.6332702636719, + "learning_rate": 2.8023373294355492e-05, + "loss": 17.3282, + "step": 103500 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 432.8354187011719, + "learning_rate": 2.799869002098463e-05, + "loss": 19.5463, + "step": 104000 + }, + { + "epoch": 0.46444444444444444, + "grad_norm": 298.2032775878906, + "learning_rate": 2.7973864596022273e-05, + "loss": 18.7725, + "step": 104500 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 403.9524841308594, + "learning_rate": 2.7948897290955293e-05, + "loss": 19.5364, + "step": 105000 + }, + { + "epoch": 0.4688888888888889, + "grad_norm": 51.500240325927734, + "learning_rate": 2.7923788378822135e-05, + "loss": 18.9839, + "step": 105500 + }, + { + "epoch": 0.4711111111111111, + "grad_norm": 521.7046508789062, + "learning_rate": 2.7898538134209837e-05, + "loss": 18.7831, + "step": 106000 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 105.23808288574219, + "learning_rate": 2.787314683325104e-05, + "loss": 18.1615, + "step": 106500 + }, + { + "epoch": 0.47555555555555556, + "grad_norm": 332.540283203125, + "learning_rate": 2.7847614753620926e-05, + "loss": 19.3657, + "step": 107000 + }, + { + "epoch": 0.4777777777777778, + "grad_norm": 901.9822387695312, + "learning_rate": 2.7821942174534243e-05, + "loss": 18.9534, + "step": 107500 + }, + { + "epoch": 0.48, + "grad_norm": 437.5888977050781, + "learning_rate": 2.779612937674219e-05, + "loss": 18.7374, + "step": 108000 + }, + { + "epoch": 0.4822222222222222, + "grad_norm": 438.2900390625, + "learning_rate": 2.7770176642529397e-05, + "loss": 20.7495, + "step": 108500 + }, + { + "epoch": 0.48444444444444446, + "grad_norm": 369.8582763671875, + "learning_rate": 2.7744084255710804e-05, + "loss": 17.091, + "step": 109000 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 734.362548828125, + "learning_rate": 2.7717852501628574e-05, + "loss": 19.0611, + "step": 109500 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 425.8333435058594, + "learning_rate": 2.769148166714897e-05, + "loss": 18.6956, + "step": 110000 + }, + { + "epoch": 0.4911111111111111, + "grad_norm": 273.7350158691406, + "learning_rate": 2.76649720406592e-05, + "loss": 18.9581, + "step": 110500 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 501.64019775390625, + "learning_rate": 2.763832391206431e-05, + "loss": 17.5245, + "step": 111000 + }, + { + "epoch": 0.4955555555555556, + "grad_norm": 1036.9017333984375, + "learning_rate": 2.7611537572783953e-05, + "loss": 17.9539, + "step": 111500 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 63.28369140625, + "learning_rate": 2.7584613315749247e-05, + "loss": 17.5569, + "step": 112000 + }, + { + "epoch": 0.5, + "grad_norm": 144.62741088867188, + "learning_rate": 2.7557551435399554e-05, + "loss": 18.3981, + "step": 112500 + }, + { + "epoch": 0.5022222222222222, + "grad_norm": 50.069549560546875, + "learning_rate": 2.753035222767926e-05, + "loss": 18.6216, + "step": 113000 + }, + { + "epoch": 0.5044444444444445, + "grad_norm": 733.9398193359375, + "learning_rate": 2.7503015990034543e-05, + "loss": 17.1969, + "step": 113500 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 444.6294250488281, + "learning_rate": 2.747554302141012e-05, + "loss": 18.0202, + "step": 114000 + }, + { + "epoch": 0.5088888888888888, + "grad_norm": 59.344337463378906, + "learning_rate": 2.7447933622245974e-05, + "loss": 17.6973, + "step": 114500 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.0, + "learning_rate": 2.742018809447407e-05, + "loss": 18.7046, + "step": 115000 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 421.5881652832031, + "learning_rate": 2.7392306741515056e-05, + "loss": 17.8755, + "step": 115500 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 292.31060791015625, + "learning_rate": 2.736428986827494e-05, + "loss": 18.5183, + "step": 116000 + }, + { + "epoch": 0.5177777777777778, + "grad_norm": 448.3764343261719, + "learning_rate": 2.7336137781141758e-05, + "loss": 18.2446, + "step": 116500 + }, + { + "epoch": 0.52, + "grad_norm": 312.8506164550781, + "learning_rate": 2.730785078798222e-05, + "loss": 17.2551, + "step": 117000 + }, + { + "epoch": 0.5222222222222223, + "grad_norm": 198.42645263671875, + "learning_rate": 2.7279429198138368e-05, + "loss": 17.8948, + "step": 117500 + }, + { + "epoch": 0.5244444444444445, + "grad_norm": 148.22213745117188, + "learning_rate": 2.7250873322424135e-05, + "loss": 17.4501, + "step": 118000 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 537.1702270507812, + "learning_rate": 2.7222183473122015e-05, + "loss": 18.9861, + "step": 118500 + }, + { + "epoch": 0.5288888888888889, + "grad_norm": 363.04833984375, + "learning_rate": 2.71933599639796e-05, + "loss": 18.2579, + "step": 119000 + }, + { + "epoch": 0.5311111111111111, + "grad_norm": 550.2840576171875, + "learning_rate": 2.7164403110206168e-05, + "loss": 17.3876, + "step": 119500 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 99.29381561279297, + "learning_rate": 2.713531322846923e-05, + "loss": 18.4671, + "step": 120000 + }, + { + "epoch": 0.5355555555555556, + "grad_norm": 267.3313293457031, + "learning_rate": 2.7106090636891077e-05, + "loss": 19.6639, + "step": 120500 + }, + { + "epoch": 0.5377777777777778, + "grad_norm": 356.0230407714844, + "learning_rate": 2.7076735655045283e-05, + "loss": 18.553, + "step": 121000 + }, + { + "epoch": 0.54, + "grad_norm": 72.5117416381836, + "learning_rate": 2.7047248603953233e-05, + "loss": 16.9581, + "step": 121500 + }, + { + "epoch": 0.5422222222222223, + "grad_norm": 283.059326171875, + "learning_rate": 2.701762980608059e-05, + "loss": 17.3513, + "step": 122000 + }, + { + "epoch": 0.5444444444444444, + "grad_norm": 455.74267578125, + "learning_rate": 2.698787958533378e-05, + "loss": 18.527, + "step": 122500 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 264.24700927734375, + "learning_rate": 2.6957998267056454e-05, + "loss": 18.6227, + "step": 123000 + }, + { + "epoch": 0.5488888888888889, + "grad_norm": 563.1781005859375, + "learning_rate": 2.692798617802592e-05, + "loss": 17.3232, + "step": 123500 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 488.3459777832031, + "learning_rate": 2.6897843646449575e-05, + "loss": 17.4262, + "step": 124000 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 119.61053466796875, + "learning_rate": 2.6867571001961312e-05, + "loss": 17.022, + "step": 124500 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 239.64756774902344, + "learning_rate": 2.683716857561793e-05, + "loss": 17.9908, + "step": 125000 + }, + { + "epoch": 0.5577777777777778, + "grad_norm": 418.17547607421875, + "learning_rate": 2.6806636699895484e-05, + "loss": 18.6269, + "step": 125500 + }, + { + "epoch": 0.56, + "grad_norm": 551.5980224609375, + "learning_rate": 2.677597570868568e-05, + "loss": 18.3972, + "step": 126000 + }, + { + "epoch": 0.5622222222222222, + "grad_norm": 304.7643127441406, + "learning_rate": 2.6745185937292207e-05, + "loss": 18.2829, + "step": 126500 + }, + { + "epoch": 0.5644444444444444, + "grad_norm": 144.07781982421875, + "learning_rate": 2.6714267722427064e-05, + "loss": 18.218, + "step": 127000 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 353.9224548339844, + "learning_rate": 2.66832214022069e-05, + "loss": 18.1345, + "step": 127500 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 197.71298217773438, + "learning_rate": 2.66520473161493e-05, + "loss": 17.18, + "step": 128000 + }, + { + "epoch": 0.5711111111111111, + "grad_norm": 783.7542114257812, + "learning_rate": 2.6620745805169076e-05, + "loss": 16.7577, + "step": 128500 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 331.999755859375, + "learning_rate": 2.6589317211574535e-05, + "loss": 16.8293, + "step": 129000 + }, + { + "epoch": 0.5755555555555556, + "grad_norm": 386.9215393066406, + "learning_rate": 2.6557761879063737e-05, + "loss": 16.7488, + "step": 129500 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 670.8016357421875, + "learning_rate": 2.652608015272075e-05, + "loss": 16.6633, + "step": 130000 + }, + { + "epoch": 0.58, + "grad_norm": 130.0618133544922, + "learning_rate": 2.6494272379011853e-05, + "loss": 17.5815, + "step": 130500 + }, + { + "epoch": 0.5822222222222222, + "grad_norm": 363.4728698730469, + "learning_rate": 2.6462338905781766e-05, + "loss": 17.5676, + "step": 131000 + }, + { + "epoch": 0.5844444444444444, + "grad_norm": 194.19207763671875, + "learning_rate": 2.6430280082249832e-05, + "loss": 19.0677, + "step": 131500 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 478.40692138671875, + "learning_rate": 2.6398096259006212e-05, + "loss": 16.4278, + "step": 132000 + }, + { + "epoch": 0.5888888888888889, + "grad_norm": 673.5048828125, + "learning_rate": 2.636578778800804e-05, + "loss": 17.7745, + "step": 132500 + }, + { + "epoch": 0.5911111111111111, + "grad_norm": 208.15098571777344, + "learning_rate": 2.633335502257558e-05, + "loss": 17.4536, + "step": 133000 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 1426.62109375, + "learning_rate": 2.6300798317388357e-05, + "loss": 17.152, + "step": 133500 + }, + { + "epoch": 0.5955555555555555, + "grad_norm": 253.73455810546875, + "learning_rate": 2.626811802848128e-05, + "loss": 16.4736, + "step": 134000 + }, + { + "epoch": 0.5977777777777777, + "grad_norm": 890.9122924804688, + "learning_rate": 2.623531451324076e-05, + "loss": 17.913, + "step": 134500 + }, + { + "epoch": 0.6, + "grad_norm": 880.38671875, + "learning_rate": 2.6202388130400772e-05, + "loss": 17.0165, + "step": 135000 + }, + { + "epoch": 0.6022222222222222, + "grad_norm": 284.1332702636719, + "learning_rate": 2.616933924003898e-05, + "loss": 17.0189, + "step": 135500 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 23.394821166992188, + "learning_rate": 2.6136168203572742e-05, + "loss": 17.2017, + "step": 136000 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 790.5655517578125, + "learning_rate": 2.61028753837552e-05, + "loss": 15.7028, + "step": 136500 + }, + { + "epoch": 0.6088888888888889, + "grad_norm": 196.9662628173828, + "learning_rate": 2.6069461144671298e-05, + "loss": 16.4864, + "step": 137000 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 178.7125244140625, + "learning_rate": 2.6035925851733808e-05, + "loss": 17.2559, + "step": 137500 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 402.0807800292969, + "learning_rate": 2.600226987167931e-05, + "loss": 17.2757, + "step": 138000 + }, + { + "epoch": 0.6155555555555555, + "grad_norm": 252.41526794433594, + "learning_rate": 2.5968493572564218e-05, + "loss": 16.8407, + "step": 138500 + }, + { + "epoch": 0.6177777777777778, + "grad_norm": 0.0, + "learning_rate": 2.593459732376072e-05, + "loss": 16.4473, + "step": 139000 + }, + { + "epoch": 0.62, + "grad_norm": 324.2782287597656, + "learning_rate": 2.590058149595277e-05, + "loss": 17.0955, + "step": 139500 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 259.27532958984375, + "learning_rate": 2.5866446461132007e-05, + "loss": 17.8668, + "step": 140000 + }, + { + "epoch": 0.6244444444444445, + "grad_norm": 504.20550537109375, + "learning_rate": 2.5832192592593707e-05, + "loss": 18.1582, + "step": 140500 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 464.8078918457031, + "learning_rate": 2.5797820264932682e-05, + "loss": 16.0802, + "step": 141000 + }, + { + "epoch": 0.6288888888888889, + "grad_norm": 294.2264099121094, + "learning_rate": 2.5763329854039204e-05, + "loss": 16.0784, + "step": 141500 + }, + { + "epoch": 0.6311111111111111, + "grad_norm": 212.64166259765625, + "learning_rate": 2.572872173709488e-05, + "loss": 16.1939, + "step": 142000 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 313.9952087402344, + "learning_rate": 2.5693996292568535e-05, + "loss": 16.6863, + "step": 142500 + }, + { + "epoch": 0.6355555555555555, + "grad_norm": 350.9505615234375, + "learning_rate": 2.565915390021206e-05, + "loss": 15.5249, + "step": 143000 + }, + { + "epoch": 0.6377777777777778, + "grad_norm": 113.72864532470703, + "learning_rate": 2.562419494105628e-05, + "loss": 17.4712, + "step": 143500 + }, + { + "epoch": 0.64, + "grad_norm": 439.85784912109375, + "learning_rate": 2.558911979740677e-05, + "loss": 16.1441, + "step": 144000 + }, + { + "epoch": 0.6422222222222222, + "grad_norm": 107.58014678955078, + "learning_rate": 2.5553928852839686e-05, + "loss": 17.8531, + "step": 144500 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 314.7883605957031, + "learning_rate": 2.5518622492197558e-05, + "loss": 16.5554, + "step": 145000 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 146.2752227783203, + "learning_rate": 2.5483201101585085e-05, + "loss": 17.0876, + "step": 145500 + }, + { + "epoch": 0.6488888888888888, + "grad_norm": 493.06488037109375, + "learning_rate": 2.544766506836492e-05, + "loss": 16.4471, + "step": 146000 + }, + { + "epoch": 0.6511111111111111, + "grad_norm": 331.6954040527344, + "learning_rate": 2.5412014781153433e-05, + "loss": 16.6836, + "step": 146500 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 324.4432373046875, + "learning_rate": 2.537625062981645e-05, + "loss": 16.9327, + "step": 147000 + }, + { + "epoch": 0.6555555555555556, + "grad_norm": 447.0750732421875, + "learning_rate": 2.5340373005465007e-05, + "loss": 16.6021, + "step": 147500 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 74.82227325439453, + "learning_rate": 2.530438230045105e-05, + "loss": 16.6877, + "step": 148000 + }, + { + "epoch": 0.66, + "grad_norm": 408.71380615234375, + "learning_rate": 2.5268278908363157e-05, + "loss": 15.4423, + "step": 148500 + }, + { + "epoch": 0.6622222222222223, + "grad_norm": 434.0395812988281, + "learning_rate": 2.523206322402225e-05, + "loss": 16.9507, + "step": 149000 + }, + { + "epoch": 0.6644444444444444, + "grad_norm": 0.0, + "learning_rate": 2.5195735643477244e-05, + "loss": 17.0505, + "step": 149500 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 744.4578857421875, + "learning_rate": 2.5159296564000744e-05, + "loss": 16.4468, + "step": 150000 + }, + { + "epoch": 0.6688888888888889, + "grad_norm": 203.68789672851562, + "learning_rate": 2.5122746384084683e-05, + "loss": 15.6102, + "step": 150500 + }, + { + "epoch": 0.6711111111111111, + "grad_norm": 304.8150329589844, + "learning_rate": 2.5086085503435973e-05, + "loss": 16.5682, + "step": 151000 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 212.24891662597656, + "learning_rate": 2.504931432297213e-05, + "loss": 16.6716, + "step": 151500 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 143.3702392578125, + "learning_rate": 2.5012433244816894e-05, + "loss": 17.2561, + "step": 152000 + }, + { + "epoch": 0.6777777777777778, + "grad_norm": 82.70915985107422, + "learning_rate": 2.4975442672295827e-05, + "loss": 17.7661, + "step": 152500 + }, + { + "epoch": 0.68, + "grad_norm": 81.59647369384766, + "learning_rate": 2.4938343009931908e-05, + "loss": 15.6807, + "step": 153000 + }, + { + "epoch": 0.6822222222222222, + "grad_norm": 483.339111328125, + "learning_rate": 2.4901134663441088e-05, + "loss": 16.8148, + "step": 153500 + }, + { + "epoch": 0.6844444444444444, + "grad_norm": 0.0, + "learning_rate": 2.4863818039727895e-05, + "loss": 17.1794, + "step": 154000 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 211.79966735839844, + "learning_rate": 2.482639354688094e-05, + "loss": 15.5973, + "step": 154500 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 242.6669464111328, + "learning_rate": 2.4788861594168485e-05, + "loss": 16.9753, + "step": 155000 + }, + { + "epoch": 0.6911111111111111, + "grad_norm": 186.95126342773438, + "learning_rate": 2.475122259203395e-05, + "loss": 15.0561, + "step": 155500 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 332.6864929199219, + "learning_rate": 2.471347695209143e-05, + "loss": 16.4118, + "step": 156000 + }, + { + "epoch": 0.6955555555555556, + "grad_norm": 373.36944580078125, + "learning_rate": 2.4675625087121204e-05, + "loss": 16.9823, + "step": 156500 + }, + { + "epoch": 0.6977777777777778, + "grad_norm": 61.25292205810547, + "learning_rate": 2.4637667411065197e-05, + "loss": 16.2012, + "step": 157000 + }, + { + "epoch": 0.7, + "grad_norm": 549.8672485351562, + "learning_rate": 2.459960433902247e-05, + "loss": 17.6019, + "step": 157500 + }, + { + "epoch": 0.7022222222222222, + "grad_norm": 478.5077209472656, + "learning_rate": 2.4561436287244685e-05, + "loss": 17.6805, + "step": 158000 + }, + { + "epoch": 0.7044444444444444, + "grad_norm": 218.25418090820312, + "learning_rate": 2.4523163673131538e-05, + "loss": 15.3333, + "step": 158500 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 383.55767822265625, + "learning_rate": 2.4484786915226213e-05, + "loss": 16.3707, + "step": 159000 + }, + { + "epoch": 0.7088888888888889, + "grad_norm": 729.36474609375, + "learning_rate": 2.444630643321078e-05, + "loss": 15.4495, + "step": 159500 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 578.4398193359375, + "learning_rate": 2.4407722647901624e-05, + "loss": 17.7177, + "step": 160000 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 284.87823486328125, + "learning_rate": 2.4369035981244836e-05, + "loss": 16.7006, + "step": 160500 + }, + { + "epoch": 0.7155555555555555, + "grad_norm": 287.9507751464844, + "learning_rate": 2.4330246856311613e-05, + "loss": 16.7623, + "step": 161000 + }, + { + "epoch": 0.7177777777777777, + "grad_norm": 518.5828857421875, + "learning_rate": 2.429135569729361e-05, + "loss": 18.6743, + "step": 161500 + }, + { + "epoch": 0.72, + "grad_norm": 741.138916015625, + "learning_rate": 2.42523629294983e-05, + "loss": 15.989, + "step": 162000 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.0, + "learning_rate": 2.4213268979344362e-05, + "loss": 16.102, + "step": 162500 + }, + { + "epoch": 0.7244444444444444, + "grad_norm": 358.8752746582031, + "learning_rate": 2.417407427435696e-05, + "loss": 15.923, + "step": 163000 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 570.9427490234375, + "learning_rate": 2.4134779243163105e-05, + "loss": 16.5887, + "step": 163500 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 435.3963928222656, + "learning_rate": 2.409538431548697e-05, + "loss": 15.2045, + "step": 164000 + }, + { + "epoch": 0.7311111111111112, + "grad_norm": 298.369140625, + "learning_rate": 2.405588992214517e-05, + "loss": 16.1364, + "step": 164500 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 161.4807586669922, + "learning_rate": 2.4016296495042065e-05, + "loss": 16.3397, + "step": 165000 + }, + { + "epoch": 0.7355555555555555, + "grad_norm": 450.2773742675781, + "learning_rate": 2.3976604467165035e-05, + "loss": 14.8856, + "step": 165500 + }, + { + "epoch": 0.7377777777777778, + "grad_norm": 62.63951110839844, + "learning_rate": 2.3936814272579718e-05, + "loss": 16.1214, + "step": 166000 + }, + { + "epoch": 0.74, + "grad_norm": 295.8753662109375, + "learning_rate": 2.389692634642533e-05, + "loss": 16.7177, + "step": 166500 + }, + { + "epoch": 0.7422222222222222, + "grad_norm": 83.56742858886719, + "learning_rate": 2.385694112490983e-05, + "loss": 16.233, + "step": 167000 + }, + { + "epoch": 0.7444444444444445, + "grad_norm": 859.1819458007812, + "learning_rate": 2.381685904530519e-05, + "loss": 16.7252, + "step": 167500 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 414.3497009277344, + "learning_rate": 2.377668054594262e-05, + "loss": 16.0818, + "step": 168000 + }, + { + "epoch": 0.7488888888888889, + "grad_norm": 291.54498291015625, + "learning_rate": 2.373640606620775e-05, + "loss": 14.5691, + "step": 168500 + }, + { + "epoch": 0.7511111111111111, + "grad_norm": 594.7430419921875, + "learning_rate": 2.369603604653583e-05, + "loss": 16.9945, + "step": 169000 + }, + { + "epoch": 0.7533333333333333, + "grad_norm": 202.13864135742188, + "learning_rate": 2.3655570928406937e-05, + "loss": 15.3943, + "step": 169500 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 212.4605712890625, + "learning_rate": 2.361501115434112e-05, + "loss": 16.8734, + "step": 170000 + }, + { + "epoch": 0.7577777777777778, + "grad_norm": 414.1224060058594, + "learning_rate": 2.357435716789356e-05, + "loss": 15.8502, + "step": 170500 + }, + { + "epoch": 0.76, + "grad_norm": 92.9588394165039, + "learning_rate": 2.3533609413649745e-05, + "loss": 16.2583, + "step": 171000 + }, + { + "epoch": 0.7622222222222222, + "grad_norm": 308.7859802246094, + "learning_rate": 2.349276833722059e-05, + "loss": 16.0059, + "step": 171500 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 437.89178466796875, + "learning_rate": 2.345183438523756e-05, + "loss": 16.7771, + "step": 172000 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 28.078920364379883, + "learning_rate": 2.3410808005347798e-05, + "loss": 17.1159, + "step": 172500 + }, + { + "epoch": 0.7688888888888888, + "grad_norm": 243.4501495361328, + "learning_rate": 2.336968964620922e-05, + "loss": 17.4442, + "step": 173000 + }, + { + "epoch": 0.7711111111111111, + "grad_norm": 873.5339965820312, + "learning_rate": 2.3328479757485615e-05, + "loss": 16.389, + "step": 173500 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 487.0278015136719, + "learning_rate": 2.328717878984172e-05, + "loss": 15.1246, + "step": 174000 + }, + { + "epoch": 0.7755555555555556, + "grad_norm": 1256.6805419921875, + "learning_rate": 2.32457871949383e-05, + "loss": 16.0509, + "step": 174500 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 437.3548278808594, + "learning_rate": 2.320430542542721e-05, + "loss": 14.2762, + "step": 175000 + }, + { + "epoch": 0.78, + "grad_norm": 50.979103088378906, + "learning_rate": 2.3162733934946437e-05, + "loss": 15.7425, + "step": 175500 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 461.4090270996094, + "learning_rate": 2.3121073178115136e-05, + "loss": 17.1488, + "step": 176000 + }, + { + "epoch": 0.7844444444444445, + "grad_norm": 163.63095092773438, + "learning_rate": 2.307932361052867e-05, + "loss": 14.9277, + "step": 176500 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 349.4720458984375, + "learning_rate": 2.3037485688753623e-05, + "loss": 15.1278, + "step": 177000 + }, + { + "epoch": 0.7888888888888889, + "grad_norm": 266.4578857421875, + "learning_rate": 2.2995559870322797e-05, + "loss": 14.9445, + "step": 177500 + }, + { + "epoch": 0.7911111111111111, + "grad_norm": 259.8016357421875, + "learning_rate": 2.2953546613730237e-05, + "loss": 15.8992, + "step": 178000 + }, + { + "epoch": 0.7933333333333333, + "grad_norm": 302.3138732910156, + "learning_rate": 2.2911446378426177e-05, + "loss": 16.151, + "step": 178500 + }, + { + "epoch": 0.7955555555555556, + "grad_norm": 302.546142578125, + "learning_rate": 2.286925962481205e-05, + "loss": 15.9711, + "step": 179000 + }, + { + "epoch": 0.7977777777777778, + "grad_norm": 161.2322998046875, + "learning_rate": 2.282698681423543e-05, + "loss": 15.3818, + "step": 179500 + }, + { + "epoch": 0.8, + "grad_norm": 338.44873046875, + "learning_rate": 2.2784628408985005e-05, + "loss": 16.7231, + "step": 180000 + }, + { + "epoch": 0.8022222222222222, + "grad_norm": 331.5046691894531, + "learning_rate": 2.2742184872285507e-05, + "loss": 15.7784, + "step": 180500 + }, + { + "epoch": 0.8044444444444444, + "grad_norm": 532.013671875, + "learning_rate": 2.2699656668292653e-05, + "loss": 15.8937, + "step": 181000 + }, + { + "epoch": 0.8066666666666666, + "grad_norm": 30.83024024963379, + "learning_rate": 2.2657044262088068e-05, + "loss": 14.8331, + "step": 181500 + }, + { + "epoch": 0.8088888888888889, + "grad_norm": 208.97105407714844, + "learning_rate": 2.26143481196742e-05, + "loss": 14.8417, + "step": 182000 + }, + { + "epoch": 0.8111111111111111, + "grad_norm": 178.349609375, + "learning_rate": 2.2571568707969224e-05, + "loss": 15.9551, + "step": 182500 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 191.2917938232422, + "learning_rate": 2.2528706494801933e-05, + "loss": 15.4303, + "step": 183000 + }, + { + "epoch": 0.8155555555555556, + "grad_norm": 379.2752685546875, + "learning_rate": 2.248576194890661e-05, + "loss": 17.1609, + "step": 183500 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 49.782352447509766, + "learning_rate": 2.244273553991795e-05, + "loss": 16.6368, + "step": 184000 + }, + { + "epoch": 0.82, + "grad_norm": 164.4068603515625, + "learning_rate": 2.239962773836585e-05, + "loss": 16.0915, + "step": 184500 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 120.09187316894531, + "learning_rate": 2.2356439015670335e-05, + "loss": 15.3172, + "step": 185000 + }, + { + "epoch": 0.8244444444444444, + "grad_norm": 119.5110855102539, + "learning_rate": 2.2313169844136342e-05, + "loss": 15.7401, + "step": 185500 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 238.2360076904297, + "learning_rate": 2.226982069694861e-05, + "loss": 15.5555, + "step": 186000 + }, + { + "epoch": 0.8288888888888889, + "grad_norm": 234.07911682128906, + "learning_rate": 2.2226392048166467e-05, + "loss": 15.8124, + "step": 186500 + }, + { + "epoch": 0.8311111111111111, + "grad_norm": 0.0, + "learning_rate": 2.218288437271865e-05, + "loss": 14.9297, + "step": 187000 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.0, + "learning_rate": 2.213929814639814e-05, + "loss": 14.9676, + "step": 187500 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 221.94076538085938, + "learning_rate": 2.2095633845856912e-05, + "loss": 14.5759, + "step": 188000 + }, + { + "epoch": 0.8377777777777777, + "grad_norm": 798.3099365234375, + "learning_rate": 2.2051891948600773e-05, + "loss": 16.8336, + "step": 188500 + }, + { + "epoch": 0.84, + "grad_norm": 148.87489318847656, + "learning_rate": 2.2008072932984095e-05, + "loss": 15.6524, + "step": 189000 + }, + { + "epoch": 0.8422222222222222, + "grad_norm": 979.9264526367188, + "learning_rate": 2.196417727820461e-05, + "loss": 14.5125, + "step": 189500 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 273.1609191894531, + "learning_rate": 2.1920205464298174e-05, + "loss": 14.7308, + "step": 190000 + }, + { + "epoch": 0.8466666666666667, + "grad_norm": 494.7351989746094, + "learning_rate": 2.187615797213349e-05, + "loss": 14.448, + "step": 190500 + }, + { + "epoch": 0.8488888888888889, + "grad_norm": 2433.17529296875, + "learning_rate": 2.183203528340689e-05, + "loss": 15.0146, + "step": 191000 + }, + { + "epoch": 0.8511111111111112, + "grad_norm": 446.34490966796875, + "learning_rate": 2.1787837880637014e-05, + "loss": 15.0511, + "step": 191500 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 596.4390869140625, + "learning_rate": 2.1743566247159586e-05, + "loss": 14.3164, + "step": 192000 + }, + { + "epoch": 0.8555555555555555, + "grad_norm": 927.9017333984375, + "learning_rate": 2.1699220867122087e-05, + "loss": 14.7031, + "step": 192500 + }, + { + "epoch": 0.8577777777777778, + "grad_norm": 174.5888671875, + "learning_rate": 2.16548022254785e-05, + "loss": 14.77, + "step": 193000 + }, + { + "epoch": 0.86, + "grad_norm": 346.9240417480469, + "learning_rate": 2.161031080798397e-05, + "loss": 14.618, + "step": 193500 + }, + { + "epoch": 0.8622222222222222, + "grad_norm": 533.3963623046875, + "learning_rate": 2.156574710118951e-05, + "loss": 14.1816, + "step": 194000 + }, + { + "epoch": 0.8644444444444445, + "grad_norm": 234.50579833984375, + "learning_rate": 2.1521111592436673e-05, + "loss": 15.6746, + "step": 194500 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 654.4329833984375, + "learning_rate": 2.1476404769852238e-05, + "loss": 16.4027, + "step": 195000 + }, + { + "epoch": 0.8688888888888889, + "grad_norm": 97.57040405273438, + "learning_rate": 2.143162712234285e-05, + "loss": 14.6315, + "step": 195500 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 347.2988586425781, + "learning_rate": 2.138677913958969e-05, + "loss": 14.8534, + "step": 196000 + }, + { + "epoch": 0.8733333333333333, + "grad_norm": 61.20378112792969, + "learning_rate": 2.1341861312043116e-05, + "loss": 14.0666, + "step": 196500 + }, + { + "epoch": 0.8755555555555555, + "grad_norm": 57.949256896972656, + "learning_rate": 2.1296874130917282e-05, + "loss": 13.8681, + "step": 197000 + }, + { + "epoch": 0.8777777777777778, + "grad_norm": 417.0851745605469, + "learning_rate": 2.1251818088184808e-05, + "loss": 15.6193, + "step": 197500 + }, + { + "epoch": 0.88, + "grad_norm": 261.3269958496094, + "learning_rate": 2.1206693676571347e-05, + "loss": 15.1966, + "step": 198000 + }, + { + "epoch": 0.8822222222222222, + "grad_norm": 105.9546890258789, + "learning_rate": 2.1161501389550242e-05, + "loss": 15.0815, + "step": 198500 + }, + { + "epoch": 0.8844444444444445, + "grad_norm": 453.0606994628906, + "learning_rate": 2.11162417213371e-05, + "loss": 15.7839, + "step": 199000 + }, + { + "epoch": 0.8866666666666667, + "grad_norm": 0.0, + "learning_rate": 2.10709151668844e-05, + "loss": 15.5458, + "step": 199500 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 373.2171630859375, + "learning_rate": 2.1025522221876087e-05, + "loss": 14.8535, + "step": 200000 + }, + { + "epoch": 0.8911111111111111, + "grad_norm": 182.15408325195312, + "learning_rate": 2.098006338272212e-05, + "loss": 15.9142, + "step": 200500 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 159.78123474121094, + "learning_rate": 2.09345391465531e-05, + "loss": 17.2029, + "step": 201000 + }, + { + "epoch": 0.8955555555555555, + "grad_norm": 761.6434326171875, + "learning_rate": 2.0888950011214763e-05, + "loss": 14.7574, + "step": 201500 + }, + { + "epoch": 0.8977777777777778, + "grad_norm": 602.9556274414062, + "learning_rate": 2.0843296475262604e-05, + "loss": 15.3703, + "step": 202000 + }, + { + "epoch": 0.9, + "grad_norm": 44.228267669677734, + "learning_rate": 2.0797579037956364e-05, + "loss": 16.191, + "step": 202500 + }, + { + "epoch": 0.9022222222222223, + "grad_norm": 191.9353485107422, + "learning_rate": 2.075179819925462e-05, + "loss": 15.4188, + "step": 203000 + }, + { + "epoch": 0.9044444444444445, + "grad_norm": 41.51668930053711, + "learning_rate": 2.0705954459809293e-05, + "loss": 14.5222, + "step": 203500 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 281.99273681640625, + "learning_rate": 2.0660048320960164e-05, + "loss": 15.4986, + "step": 204000 + }, + { + "epoch": 0.9088888888888889, + "grad_norm": 3.3990941047668457, + "learning_rate": 2.061408028472942e-05, + "loss": 15.7127, + "step": 204500 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 151.7320556640625, + "learning_rate": 2.0568050853816137e-05, + "loss": 14.9146, + "step": 205000 + }, + { + "epoch": 0.9133333333333333, + "grad_norm": 223.80499267578125, + "learning_rate": 2.0521960531590795e-05, + "loss": 15.3864, + "step": 205500 + }, + { + "epoch": 0.9155555555555556, + "grad_norm": 394.2869567871094, + "learning_rate": 2.0475809822089774e-05, + "loss": 15.7962, + "step": 206000 + }, + { + "epoch": 0.9177777777777778, + "grad_norm": 471.55072021484375, + "learning_rate": 2.0429599230009844e-05, + "loss": 14.9467, + "step": 206500 + }, + { + "epoch": 0.92, + "grad_norm": 773.841552734375, + "learning_rate": 2.0383329260702634e-05, + "loss": 14.1642, + "step": 207000 + }, + { + "epoch": 0.9222222222222223, + "grad_norm": 269.2467346191406, + "learning_rate": 2.0337000420169113e-05, + "loss": 14.8939, + "step": 207500 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 262.891357421875, + "learning_rate": 2.0290613215054063e-05, + "loss": 14.6107, + "step": 208000 + }, + { + "epoch": 0.9266666666666666, + "grad_norm": 370.94036865234375, + "learning_rate": 2.0244168152640522e-05, + "loss": 14.8097, + "step": 208500 + }, + { + "epoch": 0.9288888888888889, + "grad_norm": 526.1622924804688, + "learning_rate": 2.0197665740844254e-05, + "loss": 13.5514, + "step": 209000 + }, + { + "epoch": 0.9311111111111111, + "grad_norm": 402.8370361328125, + "learning_rate": 2.0151106488208185e-05, + "loss": 15.5235, + "step": 209500 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 240.7682647705078, + "learning_rate": 2.0104490903896834e-05, + "loss": 15.7625, + "step": 210000 + }, + { + "epoch": 0.9355555555555556, + "grad_norm": 929.83447265625, + "learning_rate": 2.0057819497690778e-05, + "loss": 13.7892, + "step": 210500 + }, + { + "epoch": 0.9377777777777778, + "grad_norm": 50.330322265625, + "learning_rate": 2.0011092779981027e-05, + "loss": 14.8297, + "step": 211000 + }, + { + "epoch": 0.94, + "grad_norm": 106.34629821777344, + "learning_rate": 1.9964311261763482e-05, + "loss": 14.0396, + "step": 211500 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 519.3964233398438, + "learning_rate": 1.991747545463333e-05, + "loss": 14.4548, + "step": 212000 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 496.7522888183594, + "learning_rate": 1.987058587077946e-05, + "loss": 15.0954, + "step": 212500 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 79.46224975585938, + "learning_rate": 1.9823643022978844e-05, + "loss": 15.5782, + "step": 213000 + }, + { + "epoch": 0.9488888888888889, + "grad_norm": 0.0, + "learning_rate": 1.9776647424590937e-05, + "loss": 14.1761, + "step": 213500 + }, + { + "epoch": 0.9511111111111111, + "grad_norm": 328.0174560546875, + "learning_rate": 1.9729599589552084e-05, + "loss": 14.5482, + "step": 214000 + }, + { + "epoch": 0.9533333333333334, + "grad_norm": 223.33721923828125, + "learning_rate": 1.968250003236987e-05, + "loss": 14.5949, + "step": 214500 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 233.63478088378906, + "learning_rate": 1.9635349268117507e-05, + "loss": 14.8437, + "step": 215000 + }, + { + "epoch": 0.9577777777777777, + "grad_norm": 4.987401485443115, + "learning_rate": 1.9588147812428197e-05, + "loss": 15.7183, + "step": 215500 + }, + { + "epoch": 0.96, + "grad_norm": 341.9475402832031, + "learning_rate": 1.954089618148949e-05, + "loss": 15.5074, + "step": 216000 + }, + { + "epoch": 0.9622222222222222, + "grad_norm": 186.303466796875, + "learning_rate": 1.9493594892037667e-05, + "loss": 14.1594, + "step": 216500 + }, + { + "epoch": 0.9644444444444444, + "grad_norm": 196.6855010986328, + "learning_rate": 1.9446244461352033e-05, + "loss": 16.0385, + "step": 217000 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 536.9638061523438, + "learning_rate": 1.9398845407249326e-05, + "loss": 15.1219, + "step": 217500 + }, + { + "epoch": 0.9688888888888889, + "grad_norm": 369.9173889160156, + "learning_rate": 1.9351398248078004e-05, + "loss": 14.1767, + "step": 218000 + }, + { + "epoch": 0.9711111111111111, + "grad_norm": 36.90256118774414, + "learning_rate": 1.9303903502712592e-05, + "loss": 15.2894, + "step": 218500 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 475.021240234375, + "learning_rate": 1.9256361690548026e-05, + "loss": 14.8856, + "step": 219000 + }, + { + "epoch": 0.9755555555555555, + "grad_norm": 805.5115356445312, + "learning_rate": 1.9208773331493938e-05, + "loss": 14.159, + "step": 219500 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 767.4393310546875, + "learning_rate": 1.9161138945969007e-05, + "loss": 14.6288, + "step": 220000 + }, + { + "epoch": 0.98, + "grad_norm": 122.41221618652344, + "learning_rate": 1.911345905489523e-05, + "loss": 13.795, + "step": 220500 + }, + { + "epoch": 0.9822222222222222, + "grad_norm": 432.9138488769531, + "learning_rate": 1.9065734179692262e-05, + "loss": 14.115, + "step": 221000 + }, + { + "epoch": 0.9844444444444445, + "grad_norm": 630.0858764648438, + "learning_rate": 1.90179648422717e-05, + "loss": 13.5404, + "step": 221500 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 681.5342407226562, + "learning_rate": 1.897015156503135e-05, + "loss": 14.8603, + "step": 222000 + }, + { + "epoch": 0.9888888888888889, + "grad_norm": 18.26776885986328, + "learning_rate": 1.8922294870849566e-05, + "loss": 14.8978, + "step": 222500 + }, + { + "epoch": 0.9911111111111112, + "grad_norm": 610.2125244140625, + "learning_rate": 1.8874395283079478e-05, + "loss": 14.0042, + "step": 223000 + }, + { + "epoch": 0.9933333333333333, + "grad_norm": 236.45591735839844, + "learning_rate": 1.8826453325543308e-05, + "loss": 13.2571, + "step": 223500 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 146.5922393798828, + "learning_rate": 1.877846952252662e-05, + "loss": 14.9317, + "step": 224000 + }, + { + "epoch": 0.9977777777777778, + "grad_norm": 831.205078125, + "learning_rate": 1.8730444398772605e-05, + "loss": 14.2085, + "step": 224500 + }, + { + "epoch": 1.0, + "grad_norm": 465.5499267578125, + "learning_rate": 1.8682378479476307e-05, + "loss": 15.6298, + "step": 225000 + }, + { + "epoch": 1.0022222222222221, + "grad_norm": 130.86990356445312, + "learning_rate": 1.8634272290278932e-05, + "loss": 12.7156, + "step": 225500 + }, + { + "epoch": 1.0044444444444445, + "grad_norm": 394.0591125488281, + "learning_rate": 1.8586126357262054e-05, + "loss": 12.0245, + "step": 226000 + }, + { + "epoch": 1.0066666666666666, + "grad_norm": 144.7230682373047, + "learning_rate": 1.853794120694187e-05, + "loss": 12.68, + "step": 226500 + }, + { + "epoch": 1.008888888888889, + "grad_norm": 108.50147247314453, + "learning_rate": 1.8489717366263487e-05, + "loss": 11.755, + "step": 227000 + }, + { + "epoch": 1.011111111111111, + "grad_norm": 45.11106872558594, + "learning_rate": 1.8441455362595082e-05, + "loss": 12.0449, + "step": 227500 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 321.0522155761719, + "learning_rate": 1.8393155723722205e-05, + "loss": 12.5334, + "step": 228000 + }, + { + "epoch": 1.0155555555555555, + "grad_norm": 409.6867370605469, + "learning_rate": 1.8344818977841967e-05, + "loss": 12.5081, + "step": 228500 + }, + { + "epoch": 1.0177777777777777, + "grad_norm": 293.31866455078125, + "learning_rate": 1.829644565355727e-05, + "loss": 11.9373, + "step": 229000 + }, + { + "epoch": 1.02, + "grad_norm": 182.61883544921875, + "learning_rate": 1.8248036279871043e-05, + "loss": 12.3983, + "step": 229500 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 152.36061096191406, + "learning_rate": 1.819959138618044e-05, + "loss": 13.1577, + "step": 230000 + }, + { + "epoch": 1.0244444444444445, + "grad_norm": 31.093074798583984, + "learning_rate": 1.8151111502271063e-05, + "loss": 13.6112, + "step": 230500 + }, + { + "epoch": 1.0266666666666666, + "grad_norm": 504.9164733886719, + "learning_rate": 1.810259715831115e-05, + "loss": 12.9236, + "step": 231000 + }, + { + "epoch": 1.028888888888889, + "grad_norm": 118.45124053955078, + "learning_rate": 1.8054048884845784e-05, + "loss": 14.7912, + "step": 231500 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 247.5614776611328, + "learning_rate": 1.8005467212791124e-05, + "loss": 13.3697, + "step": 232000 + }, + { + "epoch": 1.0333333333333334, + "grad_norm": 431.06396484375, + "learning_rate": 1.795685267342854e-05, + "loss": 13.0248, + "step": 232500 + }, + { + "epoch": 1.0355555555555556, + "grad_norm": 209.7031707763672, + "learning_rate": 1.7908205798398853e-05, + "loss": 13.0866, + "step": 233000 + }, + { + "epoch": 1.0377777777777777, + "grad_norm": 127.96566009521484, + "learning_rate": 1.7859527119696487e-05, + "loss": 13.5331, + "step": 233500 + }, + { + "epoch": 1.04, + "grad_norm": 117.52790832519531, + "learning_rate": 1.7810817169663676e-05, + "loss": 11.3817, + "step": 234000 + }, + { + "epoch": 1.0422222222222222, + "grad_norm": 1179.1375732421875, + "learning_rate": 1.7762076480984635e-05, + "loss": 12.7315, + "step": 234500 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 357.2664489746094, + "learning_rate": 1.771330558667971e-05, + "loss": 12.4928, + "step": 235000 + }, + { + "epoch": 1.0466666666666666, + "grad_norm": 230.9121551513672, + "learning_rate": 1.766450502009961e-05, + "loss": 13.6869, + "step": 235500 + }, + { + "epoch": 1.048888888888889, + "grad_norm": 236.51214599609375, + "learning_rate": 1.7615675314919504e-05, + "loss": 13.8959, + "step": 236000 + }, + { + "epoch": 1.051111111111111, + "grad_norm": 32.029823303222656, + "learning_rate": 1.7566817005133215e-05, + "loss": 11.7484, + "step": 236500 + }, + { + "epoch": 1.0533333333333332, + "grad_norm": 487.9048767089844, + "learning_rate": 1.7517930625047403e-05, + "loss": 12.8478, + "step": 237000 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 64.5386962890625, + "learning_rate": 1.7469016709275678e-05, + "loss": 13.1321, + "step": 237500 + }, + { + "epoch": 1.0577777777777777, + "grad_norm": 123.01608276367188, + "learning_rate": 1.7420075792732797e-05, + "loss": 12.7279, + "step": 238000 + }, + { + "epoch": 1.06, + "grad_norm": 418.50323486328125, + "learning_rate": 1.7371108410628778e-05, + "loss": 12.7196, + "step": 238500 + }, + { + "epoch": 1.0622222222222222, + "grad_norm": 15.958662986755371, + "learning_rate": 1.732211509846306e-05, + "loss": 12.8302, + "step": 239000 + }, + { + "epoch": 1.0644444444444445, + "grad_norm": 903.5818481445312, + "learning_rate": 1.7273096392018664e-05, + "loss": 12.5959, + "step": 239500 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 132.69081115722656, + "learning_rate": 1.7224052827356306e-05, + "loss": 12.4179, + "step": 240000 + }, + { + "epoch": 1.068888888888889, + "grad_norm": 72.78104400634766, + "learning_rate": 1.7174984940808555e-05, + "loss": 12.6991, + "step": 240500 + }, + { + "epoch": 1.0711111111111111, + "grad_norm": 19.8783016204834, + "learning_rate": 1.7125893268973953e-05, + "loss": 12.3093, + "step": 241000 + }, + { + "epoch": 1.0733333333333333, + "grad_norm": 53.51363754272461, + "learning_rate": 1.707677834871116e-05, + "loss": 12.2946, + "step": 241500 + }, + { + "epoch": 1.0755555555555556, + "grad_norm": 310.8068542480469, + "learning_rate": 1.7027640717133074e-05, + "loss": 12.9432, + "step": 242000 + }, + { + "epoch": 1.0777777777777777, + "grad_norm": 448.7236633300781, + "learning_rate": 1.697848091160096e-05, + "loss": 12.162, + "step": 242500 + }, + { + "epoch": 1.08, + "grad_norm": 802.4764404296875, + "learning_rate": 1.6929299469718585e-05, + "loss": 13.7779, + "step": 243000 + }, + { + "epoch": 1.0822222222222222, + "grad_norm": 429.84564208984375, + "learning_rate": 1.68800969293263e-05, + "loss": 12.5977, + "step": 243500 + }, + { + "epoch": 1.0844444444444445, + "grad_norm": 0.0, + "learning_rate": 1.6830873828495226e-05, + "loss": 11.7274, + "step": 244000 + }, + { + "epoch": 1.0866666666666667, + "grad_norm": 194.27366638183594, + "learning_rate": 1.6781630705521288e-05, + "loss": 13.384, + "step": 244500 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 28.86142921447754, + "learning_rate": 1.67323680989194e-05, + "loss": 12.4926, + "step": 245000 + }, + { + "epoch": 1.0911111111111111, + "grad_norm": 729.71875, + "learning_rate": 1.6683086547417527e-05, + "loss": 12.177, + "step": 245500 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 17.39883804321289, + "learning_rate": 1.663378658995083e-05, + "loss": 11.7948, + "step": 246000 + }, + { + "epoch": 1.0955555555555556, + "grad_norm": 0.0, + "learning_rate": 1.6584468765655737e-05, + "loss": 12.777, + "step": 246500 + }, + { + "epoch": 1.0977777777777777, + "grad_norm": 214.7503204345703, + "learning_rate": 1.653513361386408e-05, + "loss": 12.8227, + "step": 247000 + }, + { + "epoch": 1.1, + "grad_norm": 279.39007568359375, + "learning_rate": 1.6485781674097173e-05, + "loss": 12.6121, + "step": 247500 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 74.43594360351562, + "learning_rate": 1.643641348605992e-05, + "loss": 11.8667, + "step": 248000 + }, + { + "epoch": 1.1044444444444443, + "grad_norm": 35.02223587036133, + "learning_rate": 1.638702958963492e-05, + "loss": 12.2564, + "step": 248500 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 23.571346282958984, + "learning_rate": 1.6337630524876546e-05, + "loss": 11.9732, + "step": 249000 + }, + { + "epoch": 1.1088888888888888, + "grad_norm": 15.899101257324219, + "learning_rate": 1.628821683200506e-05, + "loss": 13.1795, + "step": 249500 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 272.45257568359375, + "learning_rate": 1.6238789051400688e-05, + "loss": 12.9309, + "step": 250000 + }, + { + "epoch": 1.1133333333333333, + "grad_norm": 0.0, + "learning_rate": 1.6189347723597725e-05, + "loss": 12.8293, + "step": 250500 + }, + { + "epoch": 1.1155555555555556, + "grad_norm": 10.567012786865234, + "learning_rate": 1.6139893389278608e-05, + "loss": 11.9302, + "step": 251000 + }, + { + "epoch": 1.1177777777777778, + "grad_norm": 823.9113159179688, + "learning_rate": 1.609042658926801e-05, + "loss": 11.3798, + "step": 251500 + }, + { + "epoch": 1.12, + "grad_norm": 449.7940673828125, + "learning_rate": 1.6040947864526935e-05, + "loss": 12.5211, + "step": 252000 + }, + { + "epoch": 1.1222222222222222, + "grad_norm": 427.29150390625, + "learning_rate": 1.5991457756146786e-05, + "loss": 12.1701, + "step": 252500 + }, + { + "epoch": 1.1244444444444444, + "grad_norm": 108.2233657836914, + "learning_rate": 1.5941956805343463e-05, + "loss": 12.4913, + "step": 253000 + }, + { + "epoch": 1.1266666666666667, + "grad_norm": 92.11042022705078, + "learning_rate": 1.589244555345143e-05, + "loss": 11.8749, + "step": 253500 + }, + { + "epoch": 1.1288888888888888, + "grad_norm": 177.92575073242188, + "learning_rate": 1.584292454191781e-05, + "loss": 13.8006, + "step": 254000 + }, + { + "epoch": 1.1311111111111112, + "grad_norm": 203.5926513671875, + "learning_rate": 1.5793394312296444e-05, + "loss": 12.2695, + "step": 254500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 339.7933654785156, + "learning_rate": 1.5743855406242e-05, + "loss": 12.3823, + "step": 255000 + }, + { + "epoch": 1.1355555555555557, + "grad_norm": 334.1343688964844, + "learning_rate": 1.5694308365504e-05, + "loss": 13.8132, + "step": 255500 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 206.6999969482422, + "learning_rate": 1.5644753731920954e-05, + "loss": 12.8192, + "step": 256000 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 237.3104248046875, + "learning_rate": 1.5595192047414395e-05, + "loss": 11.9175, + "step": 256500 + }, + { + "epoch": 1.1422222222222222, + "grad_norm": 673.7626953125, + "learning_rate": 1.5545623853982966e-05, + "loss": 13.1039, + "step": 257000 + }, + { + "epoch": 1.1444444444444444, + "grad_norm": 40.97128677368164, + "learning_rate": 1.549604969369649e-05, + "loss": 11.9416, + "step": 257500 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 125.23896789550781, + "learning_rate": 1.544647010869003e-05, + "loss": 12.4299, + "step": 258000 + }, + { + "epoch": 1.1488888888888888, + "grad_norm": 297.3369140625, + "learning_rate": 1.5396885641158002e-05, + "loss": 12.2724, + "step": 258500 + }, + { + "epoch": 1.1511111111111112, + "grad_norm": 0.0, + "learning_rate": 1.534729683334818e-05, + "loss": 10.8568, + "step": 259000 + }, + { + "epoch": 1.1533333333333333, + "grad_norm": 222.6666717529297, + "learning_rate": 1.529770422755583e-05, + "loss": 11.321, + "step": 259500 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 258.8761291503906, + "learning_rate": 1.524810836611775e-05, + "loss": 11.3846, + "step": 260000 + }, + { + "epoch": 1.1577777777777778, + "grad_norm": 362.4846496582031, + "learning_rate": 1.5198509791406325e-05, + "loss": 12.1888, + "step": 260500 + }, + { + "epoch": 1.16, + "grad_norm": 325.5453186035156, + "learning_rate": 1.5148909045823626e-05, + "loss": 11.6617, + "step": 261000 + }, + { + "epoch": 1.1622222222222223, + "grad_norm": 346.42791748046875, + "learning_rate": 1.509930667179546e-05, + "loss": 12.4993, + "step": 261500 + }, + { + "epoch": 1.1644444444444444, + "grad_norm": 427.6278991699219, + "learning_rate": 1.5049703211765442e-05, + "loss": 12.6815, + "step": 262000 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 416.53680419921875, + "learning_rate": 1.5000099208189061e-05, + "loss": 12.9896, + "step": 262500 + }, + { + "epoch": 1.1688888888888889, + "grad_norm": 181.99703979492188, + "learning_rate": 1.4950495203527755e-05, + "loss": 12.7223, + "step": 263000 + }, + { + "epoch": 1.1711111111111112, + "grad_norm": 38.73680114746094, + "learning_rate": 1.4900891740242976e-05, + "loss": 12.5012, + "step": 263500 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 527.49267578125, + "learning_rate": 1.4851289360790243e-05, + "loss": 11.8226, + "step": 264000 + }, + { + "epoch": 1.1755555555555555, + "grad_norm": 593.9708862304688, + "learning_rate": 1.480168860761324e-05, + "loss": 11.9695, + "step": 264500 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 743.0066528320312, + "learning_rate": 1.4752090023137843e-05, + "loss": 12.0286, + "step": 265000 + }, + { + "epoch": 1.18, + "grad_norm": 201.2530059814453, + "learning_rate": 1.4702494149766239e-05, + "loss": 10.9088, + "step": 265500 + }, + { + "epoch": 1.1822222222222223, + "grad_norm": 548.9100952148438, + "learning_rate": 1.465290152987095e-05, + "loss": 11.889, + "step": 266000 + }, + { + "epoch": 1.1844444444444444, + "grad_norm": 233.81863403320312, + "learning_rate": 1.4603312705788917e-05, + "loss": 12.1066, + "step": 266500 + }, + { + "epoch": 1.1866666666666668, + "grad_norm": 163.2041015625, + "learning_rate": 1.4553728219815586e-05, + "loss": 12.8837, + "step": 267000 + }, + { + "epoch": 1.1888888888888889, + "grad_norm": 153.75701904296875, + "learning_rate": 1.4504148614198935e-05, + "loss": 11.7215, + "step": 267500 + }, + { + "epoch": 1.1911111111111112, + "grad_norm": 32.576324462890625, + "learning_rate": 1.4454574431133605e-05, + "loss": 12.7392, + "step": 268000 + }, + { + "epoch": 1.1933333333333334, + "grad_norm": 690.4747314453125, + "learning_rate": 1.4405006212754901e-05, + "loss": 12.4667, + "step": 268500 + }, + { + "epoch": 1.1955555555555555, + "grad_norm": 70.4339828491211, + "learning_rate": 1.4355444501132934e-05, + "loss": 12.3897, + "step": 269000 + }, + { + "epoch": 1.1977777777777778, + "grad_norm": 1018.3383178710938, + "learning_rate": 1.430588983826664e-05, + "loss": 11.7094, + "step": 269500 + }, + { + "epoch": 1.2, + "grad_norm": 64.98046112060547, + "learning_rate": 1.4256342766077859e-05, + "loss": 11.031, + "step": 270000 + }, + { + "epoch": 1.2022222222222223, + "grad_norm": 507.530029296875, + "learning_rate": 1.4206803826405453e-05, + "loss": 11.7225, + "step": 270500 + }, + { + "epoch": 1.2044444444444444, + "grad_norm": 396.6742248535156, + "learning_rate": 1.4157273560999311e-05, + "loss": 12.0661, + "step": 271000 + }, + { + "epoch": 1.2066666666666666, + "grad_norm": 741.4268188476562, + "learning_rate": 1.4107752511514499e-05, + "loss": 12.1401, + "step": 271500 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 977.9871826171875, + "learning_rate": 1.405824121950526e-05, + "loss": 11.8266, + "step": 272000 + }, + { + "epoch": 1.211111111111111, + "grad_norm": 172.49072265625, + "learning_rate": 1.4008740226419166e-05, + "loss": 12.024, + "step": 272500 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 148.6393585205078, + "learning_rate": 1.3959250073591146e-05, + "loss": 11.7095, + "step": 273000 + }, + { + "epoch": 1.2155555555555555, + "grad_norm": 50.63189697265625, + "learning_rate": 1.390977130223757e-05, + "loss": 11.5046, + "step": 273500 + }, + { + "epoch": 1.2177777777777778, + "grad_norm": 101.87459564208984, + "learning_rate": 1.3860304453450373e-05, + "loss": 11.3638, + "step": 274000 + }, + { + "epoch": 1.22, + "grad_norm": 274.5159606933594, + "learning_rate": 1.3810850068191069e-05, + "loss": 12.2588, + "step": 274500 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 108.9557876586914, + "learning_rate": 1.3761408687284907e-05, + "loss": 12.7642, + "step": 275000 + }, + { + "epoch": 1.2244444444444444, + "grad_norm": 455.4017028808594, + "learning_rate": 1.3711980851414898e-05, + "loss": 11.3841, + "step": 275500 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 239.2037811279297, + "learning_rate": 1.3662567101115934e-05, + "loss": 12.0606, + "step": 276000 + }, + { + "epoch": 1.228888888888889, + "grad_norm": 56.60507583618164, + "learning_rate": 1.3613167976768886e-05, + "loss": 11.4546, + "step": 276500 + }, + { + "epoch": 1.231111111111111, + "grad_norm": 310.4095458984375, + "learning_rate": 1.3563784018594645e-05, + "loss": 11.4747, + "step": 277000 + }, + { + "epoch": 1.2333333333333334, + "grad_norm": 335.875, + "learning_rate": 1.3514415766648284e-05, + "loss": 11.9081, + "step": 277500 + }, + { + "epoch": 1.2355555555555555, + "grad_norm": 594.1018676757812, + "learning_rate": 1.346506376081308e-05, + "loss": 11.2674, + "step": 278000 + }, + { + "epoch": 1.2377777777777779, + "grad_norm": 275.7675476074219, + "learning_rate": 1.3415728540794674e-05, + "loss": 10.7813, + "step": 278500 + }, + { + "epoch": 1.24, + "grad_norm": 214.95712280273438, + "learning_rate": 1.3366410646115118e-05, + "loss": 12.3449, + "step": 279000 + }, + { + "epoch": 1.2422222222222223, + "grad_norm": 0.0, + "learning_rate": 1.331711061610701e-05, + "loss": 11.6398, + "step": 279500 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 15.316904067993164, + "learning_rate": 1.3267828989907592e-05, + "loss": 11.7452, + "step": 280000 + }, + { + "epoch": 1.2466666666666666, + "grad_norm": 529.526611328125, + "learning_rate": 1.3218566306452813e-05, + "loss": 12.7856, + "step": 280500 + }, + { + "epoch": 1.248888888888889, + "grad_norm": 4.096035480499268, + "learning_rate": 1.31693231044715e-05, + "loss": 11.2883, + "step": 281000 + }, + { + "epoch": 1.251111111111111, + "grad_norm": 641.160888671875, + "learning_rate": 1.3120099922479414e-05, + "loss": 12.2018, + "step": 281500 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 218.7012939453125, + "learning_rate": 1.3070897298773392e-05, + "loss": 11.9625, + "step": 282000 + }, + { + "epoch": 1.2555555555555555, + "grad_norm": 1709.0491943359375, + "learning_rate": 1.3021715771425437e-05, + "loss": 11.9818, + "step": 282500 + }, + { + "epoch": 1.2577777777777777, + "grad_norm": 325.7183532714844, + "learning_rate": 1.2972555878276857e-05, + "loss": 12.171, + "step": 283000 + }, + { + "epoch": 1.26, + "grad_norm": 463.99432373046875, + "learning_rate": 1.292341815693237e-05, + "loss": 12.996, + "step": 283500 + }, + { + "epoch": 1.2622222222222224, + "grad_norm": 30.650217056274414, + "learning_rate": 1.2874303144754219e-05, + "loss": 11.0988, + "step": 284000 + }, + { + "epoch": 1.2644444444444445, + "grad_norm": 308.7669372558594, + "learning_rate": 1.2825211378856311e-05, + "loss": 11.6588, + "step": 284500 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 813.3473510742188, + "learning_rate": 1.2776143396098331e-05, + "loss": 11.7966, + "step": 285000 + }, + { + "epoch": 1.268888888888889, + "grad_norm": 277.6453857421875, + "learning_rate": 1.272709973307988e-05, + "loss": 11.957, + "step": 285500 + }, + { + "epoch": 1.271111111111111, + "grad_norm": 614.5536499023438, + "learning_rate": 1.2678080926134595e-05, + "loss": 12.0953, + "step": 286000 + }, + { + "epoch": 1.2733333333333334, + "grad_norm": 600.1682739257812, + "learning_rate": 1.2629087511324295e-05, + "loss": 12.4912, + "step": 286500 + }, + { + "epoch": 1.2755555555555556, + "grad_norm": 291.91387939453125, + "learning_rate": 1.2580120024433123e-05, + "loss": 11.737, + "step": 287000 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 645.7890625, + "learning_rate": 1.2531179000961662e-05, + "loss": 11.1851, + "step": 287500 + }, + { + "epoch": 1.28, + "grad_norm": 390.1597900390625, + "learning_rate": 1.2482264976121108e-05, + "loss": 11.5208, + "step": 288000 + }, + { + "epoch": 1.2822222222222222, + "grad_norm": 15.699028968811035, + "learning_rate": 1.2433378484827395e-05, + "loss": 12.3516, + "step": 288500 + }, + { + "epoch": 1.2844444444444445, + "grad_norm": 35.82905578613281, + "learning_rate": 1.2384520061695367e-05, + "loss": 11.0025, + "step": 289000 + }, + { + "epoch": 1.2866666666666666, + "grad_norm": 112.55397033691406, + "learning_rate": 1.2335690241032904e-05, + "loss": 11.9212, + "step": 289500 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 143.4647979736328, + "learning_rate": 1.2286889556835105e-05, + "loss": 11.8427, + "step": 290000 + }, + { + "epoch": 1.291111111111111, + "grad_norm": 83.45748138427734, + "learning_rate": 1.2238118542778435e-05, + "loss": 11.4673, + "step": 290500 + }, + { + "epoch": 1.2933333333333334, + "grad_norm": 128.21621704101562, + "learning_rate": 1.2189377732214886e-05, + "loss": 10.8374, + "step": 291000 + }, + { + "epoch": 1.2955555555555556, + "grad_norm": 987.85302734375, + "learning_rate": 1.2140667658166162e-05, + "loss": 12.346, + "step": 291500 + }, + { + "epoch": 1.2977777777777777, + "grad_norm": 250.47520446777344, + "learning_rate": 1.2091988853317817e-05, + "loss": 10.7999, + "step": 292000 + }, + { + "epoch": 1.3, + "grad_norm": 33.65868377685547, + "learning_rate": 1.2043341850013472e-05, + "loss": 12.6021, + "step": 292500 + }, + { + "epoch": 1.3022222222222222, + "grad_norm": 207.2305450439453, + "learning_rate": 1.1994727180248953e-05, + "loss": 12.2435, + "step": 293000 + }, + { + "epoch": 1.3044444444444445, + "grad_norm": 210.83741760253906, + "learning_rate": 1.1946145375666504e-05, + "loss": 11.2422, + "step": 293500 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 289.1300964355469, + "learning_rate": 1.189759696754896e-05, + "loss": 11.7366, + "step": 294000 + }, + { + "epoch": 1.3088888888888888, + "grad_norm": 491.4790954589844, + "learning_rate": 1.1849082486813923e-05, + "loss": 11.8805, + "step": 294500 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 286.23681640625, + "learning_rate": 1.1800602464007995e-05, + "loss": 11.8487, + "step": 295000 + }, + { + "epoch": 1.3133333333333335, + "grad_norm": 150.55995178222656, + "learning_rate": 1.175215742930093e-05, + "loss": 11.2674, + "step": 295500 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 90.90438842773438, + "learning_rate": 1.1703747912479867e-05, + "loss": 12.0513, + "step": 296000 + }, + { + "epoch": 1.3177777777777777, + "grad_norm": 402.916748046875, + "learning_rate": 1.1655374442943526e-05, + "loss": 11.3287, + "step": 296500 + }, + { + "epoch": 1.32, + "grad_norm": 221.00369262695312, + "learning_rate": 1.160703754969642e-05, + "loss": 10.8907, + "step": 297000 + }, + { + "epoch": 1.3222222222222222, + "grad_norm": 84.22000885009766, + "learning_rate": 1.1558737761343074e-05, + "loss": 12.0133, + "step": 297500 + }, + { + "epoch": 1.3244444444444445, + "grad_norm": 19.054018020629883, + "learning_rate": 1.1510475606082226e-05, + "loss": 10.2377, + "step": 298000 + }, + { + "epoch": 1.3266666666666667, + "grad_norm": 453.34326171875, + "learning_rate": 1.1462251611701084e-05, + "loss": 11.93, + "step": 298500 + }, + { + "epoch": 1.3288888888888888, + "grad_norm": 275.5953063964844, + "learning_rate": 1.1414066305569514e-05, + "loss": 13.0519, + "step": 299000 + }, + { + "epoch": 1.3311111111111111, + "grad_norm": 279.2978210449219, + "learning_rate": 1.1365920214634312e-05, + "loss": 11.8949, + "step": 299500 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 278.0643310546875, + "learning_rate": 1.1317813865413409e-05, + "loss": 10.4946, + "step": 300000 + }, + { + "epoch": 1.3355555555555556, + "grad_norm": 685.4400024414062, + "learning_rate": 1.1269747783990135e-05, + "loss": 11.1153, + "step": 300500 + }, + { + "epoch": 1.3377777777777777, + "grad_norm": 312.36724853515625, + "learning_rate": 1.1221722496007462e-05, + "loss": 12.0323, + "step": 301000 + }, + { + "epoch": 1.34, + "grad_norm": 1231.820068359375, + "learning_rate": 1.1173738526662234e-05, + "loss": 10.8594, + "step": 301500 + }, + { + "epoch": 1.3422222222222222, + "grad_norm": 273.9977111816406, + "learning_rate": 1.1125796400699458e-05, + "loss": 11.2889, + "step": 302000 + }, + { + "epoch": 1.3444444444444446, + "grad_norm": 222.45266723632812, + "learning_rate": 1.1077896642406542e-05, + "loss": 11.6009, + "step": 302500 + }, + { + "epoch": 1.3466666666666667, + "grad_norm": 1616.0927734375, + "learning_rate": 1.103003977560757e-05, + "loss": 11.7312, + "step": 303000 + }, + { + "epoch": 1.3488888888888888, + "grad_norm": 172.6010284423828, + "learning_rate": 1.0982226323657565e-05, + "loss": 11.6923, + "step": 303500 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 188.0960235595703, + "learning_rate": 1.093445680943678e-05, + "loss": 10.7696, + "step": 304000 + }, + { + "epoch": 1.3533333333333333, + "grad_norm": 708.9501342773438, + "learning_rate": 1.0886731755344972e-05, + "loss": 11.5035, + "step": 304500 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 112.46131896972656, + "learning_rate": 1.0839051683295682e-05, + "loss": 11.0951, + "step": 305000 + }, + { + "epoch": 1.3577777777777778, + "grad_norm": 42.40409469604492, + "learning_rate": 1.0791417114710543e-05, + "loss": 12.8662, + "step": 305500 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 447.1692810058594, + "learning_rate": 1.074382857051356e-05, + "loss": 11.2495, + "step": 306000 + }, + { + "epoch": 1.3622222222222222, + "grad_norm": 0.0, + "learning_rate": 1.0696286571125437e-05, + "loss": 12.0512, + "step": 306500 + }, + { + "epoch": 1.3644444444444446, + "grad_norm": 1327.175537109375, + "learning_rate": 1.0648791636457847e-05, + "loss": 11.3486, + "step": 307000 + }, + { + "epoch": 1.3666666666666667, + "grad_norm": 114.16178894042969, + "learning_rate": 1.0601344285907797e-05, + "loss": 12.0348, + "step": 307500 + }, + { + "epoch": 1.3688888888888888, + "grad_norm": 410.4014587402344, + "learning_rate": 1.0553945038351914e-05, + "loss": 11.0606, + "step": 308000 + }, + { + "epoch": 1.3711111111111112, + "grad_norm": 205.14894104003906, + "learning_rate": 1.0506594412140768e-05, + "loss": 12.0553, + "step": 308500 + }, + { + "epoch": 1.3733333333333333, + "grad_norm": 70.4958267211914, + "learning_rate": 1.0459292925093228e-05, + "loss": 11.5397, + "step": 309000 + }, + { + "epoch": 1.3755555555555556, + "grad_norm": 194.81698608398438, + "learning_rate": 1.0412041094490767e-05, + "loss": 10.2973, + "step": 309500 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 15.8478364944458, + "learning_rate": 1.0364839437071848e-05, + "loss": 11.748, + "step": 310000 + }, + { + "epoch": 1.38, + "grad_norm": 117.78498840332031, + "learning_rate": 1.0317688469026219e-05, + "loss": 11.4108, + "step": 310500 + }, + { + "epoch": 1.3822222222222222, + "grad_norm": 497.8285217285156, + "learning_rate": 1.0270588705989322e-05, + "loss": 11.4724, + "step": 311000 + }, + { + "epoch": 1.3844444444444444, + "grad_norm": 339.4550476074219, + "learning_rate": 1.0223540663036624e-05, + "loss": 12.0662, + "step": 311500 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 191.90072631835938, + "learning_rate": 1.017654485467797e-05, + "loss": 12.0687, + "step": 312000 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 201.05392456054688, + "learning_rate": 1.0129601794852007e-05, + "loss": 12.6799, + "step": 312500 + }, + { + "epoch": 1.3911111111111112, + "grad_norm": 0.0, + "learning_rate": 1.00827119969205e-05, + "loss": 11.8095, + "step": 313000 + }, + { + "epoch": 1.3933333333333333, + "grad_norm": 110.20486450195312, + "learning_rate": 1.0035875973662787e-05, + "loss": 11.1245, + "step": 313500 + }, + { + "epoch": 1.3955555555555557, + "grad_norm": 142.47384643554688, + "learning_rate": 9.989094237270094e-06, + "loss": 11.5409, + "step": 314000 + }, + { + "epoch": 1.3977777777777778, + "grad_norm": 408.63031005859375, + "learning_rate": 9.942367299340003e-06, + "loss": 11.8593, + "step": 314500 + }, + { + "epoch": 1.4, + "grad_norm": 470.6206970214844, + "learning_rate": 9.89569567087083e-06, + "loss": 11.6008, + "step": 315000 + }, + { + "epoch": 1.4022222222222223, + "grad_norm": 67.78047180175781, + "learning_rate": 9.84907986225601e-06, + "loss": 10.926, + "step": 315500 + }, + { + "epoch": 1.4044444444444444, + "grad_norm": 293.5436706542969, + "learning_rate": 9.802520383278574e-06, + "loss": 10.8069, + "step": 316000 + }, + { + "epoch": 1.4066666666666667, + "grad_norm": 480.68389892578125, + "learning_rate": 9.75601774310551e-06, + "loss": 11.2341, + "step": 316500 + }, + { + "epoch": 1.4088888888888889, + "grad_norm": 235.30406188964844, + "learning_rate": 9.709572450282253e-06, + "loss": 11.3084, + "step": 317000 + }, + { + "epoch": 1.411111111111111, + "grad_norm": 977.0435180664062, + "learning_rate": 9.663185012727075e-06, + "loss": 12.978, + "step": 317500 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 26.692384719848633, + "learning_rate": 9.61685593772556e-06, + "loss": 11.2446, + "step": 318000 + }, + { + "epoch": 1.4155555555555557, + "grad_norm": 691.8837280273438, + "learning_rate": 9.570585731925064e-06, + "loss": 11.2801, + "step": 318500 + }, + { + "epoch": 1.4177777777777778, + "grad_norm": 210.51527404785156, + "learning_rate": 9.524374901329125e-06, + "loss": 10.0809, + "step": 319000 + }, + { + "epoch": 1.42, + "grad_norm": 0.0, + "learning_rate": 9.478223951292001e-06, + "loss": 11.3325, + "step": 319500 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 302.1672668457031, + "learning_rate": 9.432133386513075e-06, + "loss": 10.449, + "step": 320000 + }, + { + "epoch": 1.4244444444444444, + "grad_norm": 210.6238250732422, + "learning_rate": 9.386103711031384e-06, + "loss": 12.6131, + "step": 320500 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 615.0394897460938, + "learning_rate": 9.340135428220081e-06, + "loss": 11.892, + "step": 321000 + }, + { + "epoch": 1.4288888888888889, + "grad_norm": 44.33654022216797, + "learning_rate": 9.294229040780948e-06, + "loss": 11.7791, + "step": 321500 + }, + { + "epoch": 1.431111111111111, + "grad_norm": 423.60943603515625, + "learning_rate": 9.248385050738874e-06, + "loss": 11.8577, + "step": 322000 + }, + { + "epoch": 1.4333333333333333, + "grad_norm": 750.2989501953125, + "learning_rate": 9.202603959436398e-06, + "loss": 11.5078, + "step": 322500 + }, + { + "epoch": 1.4355555555555555, + "grad_norm": 14.56828784942627, + "learning_rate": 9.156886267528198e-06, + "loss": 11.1005, + "step": 323000 + }, + { + "epoch": 1.4377777777777778, + "grad_norm": 98.75641632080078, + "learning_rate": 9.111232474975624e-06, + "loss": 10.4616, + "step": 323500 + }, + { + "epoch": 1.44, + "grad_norm": 58.22282409667969, + "learning_rate": 9.065643081041242e-06, + "loss": 10.8385, + "step": 324000 + }, + { + "epoch": 1.4422222222222223, + "grad_norm": 95.11531066894531, + "learning_rate": 9.020118584283357e-06, + "loss": 10.93, + "step": 324500 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 402.2863464355469, + "learning_rate": 8.974659482550576e-06, + "loss": 10.7504, + "step": 325000 + } + ], + "logging_steps": 500, + "max_steps": 500000, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}