| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 100, | |
| "global_step": 3375, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008888888888888889, | |
| "grad_norm": 4.868620036481513, | |
| "learning_rate": 1.4792899408284025e-07, | |
| "loss": 1.254, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.017777777777777778, | |
| "grad_norm": 4.907376524481664, | |
| "learning_rate": 2.958579881656805e-07, | |
| "loss": 1.2718, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.02666666666666667, | |
| "grad_norm": 4.924364407249927, | |
| "learning_rate": 4.4378698224852073e-07, | |
| "loss": 1.2432, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.035555555555555556, | |
| "grad_norm": 3.4088932051993455, | |
| "learning_rate": 5.91715976331361e-07, | |
| "loss": 1.2112, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.044444444444444446, | |
| "grad_norm": 2.9113656297806103, | |
| "learning_rate": 7.396449704142013e-07, | |
| "loss": 1.2001, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 2.3204535662374544, | |
| "learning_rate": 8.875739644970415e-07, | |
| "loss": 1.1006, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06222222222222222, | |
| "grad_norm": 1.8516910199282557, | |
| "learning_rate": 1.0355029585798817e-06, | |
| "loss": 1.0786, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.07111111111111111, | |
| "grad_norm": 1.8962999657740618, | |
| "learning_rate": 1.183431952662722e-06, | |
| "loss": 1.0518, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.6146752217951077, | |
| "learning_rate": 1.3313609467455623e-06, | |
| "loss": 1.0076, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08888888888888889, | |
| "grad_norm": 1.6900956108641927, | |
| "learning_rate": 1.4792899408284026e-06, | |
| "loss": 1.0064, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.08888888888888889, | |
| "eval_loss": 0.9964859485626221, | |
| "eval_runtime": 37.5504, | |
| "eval_samples_per_second": 53.262, | |
| "eval_steps_per_second": 6.658, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09777777777777778, | |
| "grad_norm": 1.8080972395438724, | |
| "learning_rate": 1.6272189349112426e-06, | |
| "loss": 0.9937, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 1.7684913996303595, | |
| "learning_rate": 1.775147928994083e-06, | |
| "loss": 0.9937, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11555555555555555, | |
| "grad_norm": 1.4352385051753562, | |
| "learning_rate": 1.9230769230769234e-06, | |
| "loss": 0.9882, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12444444444444444, | |
| "grad_norm": 1.5616454174683354, | |
| "learning_rate": 2.0710059171597635e-06, | |
| "loss": 0.9763, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 1.763519993122216, | |
| "learning_rate": 2.2189349112426035e-06, | |
| "loss": 1.0136, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.14222222222222222, | |
| "grad_norm": 1.6007254163395832, | |
| "learning_rate": 2.366863905325444e-06, | |
| "loss": 0.9758, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1511111111111111, | |
| "grad_norm": 1.5948360204769163, | |
| "learning_rate": 2.5147928994082845e-06, | |
| "loss": 0.9505, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.5057997454321306, | |
| "learning_rate": 2.6627218934911246e-06, | |
| "loss": 0.9591, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.1688888888888889, | |
| "grad_norm": 1.5390209924258027, | |
| "learning_rate": 2.8106508875739646e-06, | |
| "loss": 0.9463, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 1.461925655745632, | |
| "learning_rate": 2.958579881656805e-06, | |
| "loss": 0.9226, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "eval_loss": 0.9364051222801208, | |
| "eval_runtime": 36.7175, | |
| "eval_samples_per_second": 54.47, | |
| "eval_steps_per_second": 6.809, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.18666666666666668, | |
| "grad_norm": 1.5374045647524592, | |
| "learning_rate": 3.106508875739645e-06, | |
| "loss": 0.9472, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.19555555555555557, | |
| "grad_norm": 1.5912062989392914, | |
| "learning_rate": 3.2544378698224853e-06, | |
| "loss": 0.9343, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.20444444444444446, | |
| "grad_norm": 1.7723775001947806, | |
| "learning_rate": 3.4023668639053257e-06, | |
| "loss": 0.9391, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 1.6435892953143434, | |
| "learning_rate": 3.550295857988166e-06, | |
| "loss": 0.9368, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 1.4424599436942067, | |
| "learning_rate": 3.6982248520710063e-06, | |
| "loss": 0.9319, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2311111111111111, | |
| "grad_norm": 1.728159212104628, | |
| "learning_rate": 3.846153846153847e-06, | |
| "loss": 0.8866, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.3326845832416128, | |
| "learning_rate": 3.9940828402366864e-06, | |
| "loss": 0.9045, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24888888888888888, | |
| "grad_norm": 1.6886292509720702, | |
| "learning_rate": 4.142011834319527e-06, | |
| "loss": 0.9265, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.2577777777777778, | |
| "grad_norm": 1.6019493123530608, | |
| "learning_rate": 4.289940828402367e-06, | |
| "loss": 0.9331, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 1.6146648266355992, | |
| "learning_rate": 4.437869822485207e-06, | |
| "loss": 0.9237, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "eval_loss": 0.9182068109512329, | |
| "eval_runtime": 36.535, | |
| "eval_samples_per_second": 54.742, | |
| "eval_steps_per_second": 6.843, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.27555555555555555, | |
| "grad_norm": 1.3862729857445228, | |
| "learning_rate": 4.5857988165680475e-06, | |
| "loss": 0.9306, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.28444444444444444, | |
| "grad_norm": 1.651953174906888, | |
| "learning_rate": 4.733727810650888e-06, | |
| "loss": 0.9459, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.29333333333333333, | |
| "grad_norm": 1.755393092768391, | |
| "learning_rate": 4.8816568047337285e-06, | |
| "loss": 0.9519, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.3022222222222222, | |
| "grad_norm": 1.5247145129317299, | |
| "learning_rate": 4.99999464967688e-06, | |
| "loss": 0.9419, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3111111111111111, | |
| "grad_norm": 1.5673209156579126, | |
| "learning_rate": 4.999807390772256e-06, | |
| "loss": 0.9357, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.5928733064190967, | |
| "learning_rate": 4.999352638611963e-06, | |
| "loss": 0.947, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3288888888888889, | |
| "grad_norm": 1.7766805751830876, | |
| "learning_rate": 4.998630441857007e-06, | |
| "loss": 0.9325, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.3377777777777778, | |
| "grad_norm": 1.511653536436131, | |
| "learning_rate": 4.997640877786446e-06, | |
| "loss": 0.906, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3466666666666667, | |
| "grad_norm": 1.4941501475149104, | |
| "learning_rate": 4.996384052289124e-06, | |
| "loss": 0.9349, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 1.5183679914795427, | |
| "learning_rate": 4.994860099852339e-06, | |
| "loss": 0.9025, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "eval_loss": 0.9070032835006714, | |
| "eval_runtime": 36.6, | |
| "eval_samples_per_second": 54.645, | |
| "eval_steps_per_second": 6.831, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.36444444444444446, | |
| "grad_norm": 1.5024543245788338, | |
| "learning_rate": 4.993069183547456e-06, | |
| "loss": 0.9083, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 1.6343439673384825, | |
| "learning_rate": 4.991011495012451e-06, | |
| "loss": 0.9507, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.38222222222222224, | |
| "grad_norm": 1.7262818739661763, | |
| "learning_rate": 4.98868725443141e-06, | |
| "loss": 0.9567, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.39111111111111113, | |
| "grad_norm": 1.6061518007474969, | |
| "learning_rate": 4.986096710510968e-06, | |
| "loss": 0.9333, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.6028615990046051, | |
| "learning_rate": 4.9832401404536915e-06, | |
| "loss": 0.9358, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4088888888888889, | |
| "grad_norm": 1.570005612548181, | |
| "learning_rate": 4.980117849928419e-06, | |
| "loss": 0.8983, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4177777777777778, | |
| "grad_norm": 1.5495976820237416, | |
| "learning_rate": 4.976730173037556e-06, | |
| "loss": 0.9312, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 1.6311647053909046, | |
| "learning_rate": 4.973077472281319e-06, | |
| "loss": 0.9197, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.43555555555555553, | |
| "grad_norm": 1.4775941411466225, | |
| "learning_rate": 4.969160138518946e-06, | |
| "loss": 0.9067, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 1.60402515279859, | |
| "learning_rate": 4.964978590926879e-06, | |
| "loss": 0.9169, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "eval_loss": 0.8998147249221802, | |
| "eval_runtime": 37.1803, | |
| "eval_samples_per_second": 53.792, | |
| "eval_steps_per_second": 6.724, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4533333333333333, | |
| "grad_norm": 1.4700844366599264, | |
| "learning_rate": 4.960533276953902e-06, | |
| "loss": 0.9235, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.4622222222222222, | |
| "grad_norm": 1.6631199340680172, | |
| "learning_rate": 4.955824672273265e-06, | |
| "loss": 0.9018, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.4711111111111111, | |
| "grad_norm": 1.7111177341739554, | |
| "learning_rate": 4.950853280731785e-06, | |
| "loss": 0.9181, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.4535444071016672, | |
| "learning_rate": 4.945619634295929e-06, | |
| "loss": 0.9499, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.4888888888888889, | |
| "grad_norm": 1.3828751580671985, | |
| "learning_rate": 4.940124292994895e-06, | |
| "loss": 0.9081, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.49777777777777776, | |
| "grad_norm": 1.6075817183262429, | |
| "learning_rate": 4.9343678448606816e-06, | |
| "loss": 0.9224, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.5066666666666667, | |
| "grad_norm": 1.5519162793061656, | |
| "learning_rate": 4.928350905865165e-06, | |
| "loss": 0.9026, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.5155555555555555, | |
| "grad_norm": 1.635116289002661, | |
| "learning_rate": 4.92207411985419e-06, | |
| "loss": 0.9227, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.5244444444444445, | |
| "grad_norm": 1.5336828605332502, | |
| "learning_rate": 4.915538158478674e-06, | |
| "loss": 0.9226, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 1.6800188276640182, | |
| "learning_rate": 4.908743721122734e-06, | |
| "loss": 0.8682, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "eval_loss": 0.893156886100769, | |
| "eval_runtime": 36.6444, | |
| "eval_samples_per_second": 54.579, | |
| "eval_steps_per_second": 6.822, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5422222222222223, | |
| "grad_norm": 1.5154383041086907, | |
| "learning_rate": 4.901691534828853e-06, | |
| "loss": 0.8901, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.5511111111111111, | |
| "grad_norm": 1.5069638989634389, | |
| "learning_rate": 4.894382354220077e-06, | |
| "loss": 0.9143, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.5826508156504, | |
| "learning_rate": 4.886816961419272e-06, | |
| "loss": 0.8948, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5688888888888889, | |
| "grad_norm": 1.5966300524059291, | |
| "learning_rate": 4.8789961659654276e-06, | |
| "loss": 0.9182, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5777777777777777, | |
| "grad_norm": 1.5542668195010378, | |
| "learning_rate": 4.870920804727034e-06, | |
| "loss": 0.9145, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 1.5961850069901384, | |
| "learning_rate": 4.862591741812533e-06, | |
| "loss": 0.8982, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5955555555555555, | |
| "grad_norm": 1.4942467173404939, | |
| "learning_rate": 4.8540098684778505e-06, | |
| "loss": 0.9123, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.6044444444444445, | |
| "grad_norm": 1.5917562014794315, | |
| "learning_rate": 4.845176103031035e-06, | |
| "loss": 0.8976, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.6133333333333333, | |
| "grad_norm": 1.6105397844621, | |
| "learning_rate": 4.836091390733983e-06, | |
| "loss": 0.8688, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6222222222222222, | |
| "grad_norm": 1.6497475274540199, | |
| "learning_rate": 4.826756703701298e-06, | |
| "loss": 0.8827, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6222222222222222, | |
| "eval_loss": 0.8888917565345764, | |
| "eval_runtime": 36.7414, | |
| "eval_samples_per_second": 54.435, | |
| "eval_steps_per_second": 6.804, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6311111111111111, | |
| "grad_norm": 1.4922158605433329, | |
| "learning_rate": 4.817173040796263e-06, | |
| "loss": 0.8649, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.6152696298420064, | |
| "learning_rate": 4.807341427523969e-06, | |
| "loss": 0.9177, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.6488888888888888, | |
| "grad_norm": 1.5961353957665738, | |
| "learning_rate": 4.797262915921561e-06, | |
| "loss": 0.8991, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.6577777777777778, | |
| "grad_norm": 1.5667464158602376, | |
| "learning_rate": 4.7869385844456825e-06, | |
| "loss": 0.9503, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 1.5781196814649665, | |
| "learning_rate": 4.776369537857062e-06, | |
| "loss": 0.9105, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6755555555555556, | |
| "grad_norm": 1.503944616712719, | |
| "learning_rate": 4.765556907102306e-06, | |
| "loss": 0.9263, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6844444444444444, | |
| "grad_norm": 1.6473086807633828, | |
| "learning_rate": 4.7545018491928755e-06, | |
| "loss": 0.8819, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 1.4117747098000493, | |
| "learning_rate": 4.743205547081281e-06, | |
| "loss": 0.8922, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.7022222222222222, | |
| "grad_norm": 1.4708303212935516, | |
| "learning_rate": 4.731669209534504e-06, | |
| "loss": 0.9025, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "grad_norm": 1.5712775868269122, | |
| "learning_rate": 4.719894071004645e-06, | |
| "loss": 0.9096, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "eval_loss": 0.8853357434272766, | |
| "eval_runtime": 36.6826, | |
| "eval_samples_per_second": 54.522, | |
| "eval_steps_per_second": 6.815, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.4534761342687224, | |
| "learning_rate": 4.707881391496837e-06, | |
| "loss": 0.9035, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7288888888888889, | |
| "grad_norm": 1.5619341405621416, | |
| "learning_rate": 4.695632456434414e-06, | |
| "loss": 0.8942, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.7377777777777778, | |
| "grad_norm": 1.6530414569806062, | |
| "learning_rate": 4.683148576521363e-06, | |
| "loss": 0.895, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 1.500702857508256, | |
| "learning_rate": 4.670431087602079e-06, | |
| "loss": 0.8826, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.7555555555555555, | |
| "grad_norm": 1.5622670153829135, | |
| "learning_rate": 4.657481350518409e-06, | |
| "loss": 0.9172, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.7644444444444445, | |
| "grad_norm": 1.5483259428719673, | |
| "learning_rate": 4.644300750964045e-06, | |
| "loss": 0.9304, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.7733333333333333, | |
| "grad_norm": 1.468282447027174, | |
| "learning_rate": 4.630890699336244e-06, | |
| "loss": 0.8819, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.7822222222222223, | |
| "grad_norm": 1.671539158314848, | |
| "learning_rate": 4.6172526305849094e-06, | |
| "loss": 0.8929, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7911111111111111, | |
| "grad_norm": 1.6595480420379722, | |
| "learning_rate": 4.603388004059037e-06, | |
| "loss": 0.9401, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.4846657745286256, | |
| "learning_rate": 4.589298303350565e-06, | |
| "loss": 0.9054, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.8810927867889404, | |
| "eval_runtime": 35.9747, | |
| "eval_samples_per_second": 55.595, | |
| "eval_steps_per_second": 6.949, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.8088888888888889, | |
| "grad_norm": 1.8077709952778653, | |
| "learning_rate": 4.574985036135613e-06, | |
| "loss": 0.8756, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.8177777777777778, | |
| "grad_norm": 1.5746787689057065, | |
| "learning_rate": 4.5604497340131635e-06, | |
| "loss": 0.8656, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.8266666666666667, | |
| "grad_norm": 1.4640338579966414, | |
| "learning_rate": 4.545693952341159e-06, | |
| "loss": 0.934, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.8355555555555556, | |
| "grad_norm": 1.6059690989804483, | |
| "learning_rate": 4.5307192700700804e-06, | |
| "loss": 0.9242, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.8444444444444444, | |
| "grad_norm": 1.5097697360744204, | |
| "learning_rate": 4.515527289573986e-06, | |
| "loss": 0.906, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 1.4679308675173353, | |
| "learning_rate": 4.50011963647905e-06, | |
| "loss": 0.9128, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.8622222222222222, | |
| "grad_norm": 1.6181492334082097, | |
| "learning_rate": 4.484497959489608e-06, | |
| "loss": 0.9166, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.8711111111111111, | |
| "grad_norm": 1.558080687611738, | |
| "learning_rate": 4.468663930211743e-06, | |
| "loss": 0.8939, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.615024667449851, | |
| "learning_rate": 4.452619242974408e-06, | |
| "loss": 0.895, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 1.4440060757462254, | |
| "learning_rate": 4.436365614648128e-06, | |
| "loss": 0.86, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "eval_loss": 0.8786353468894958, | |
| "eval_runtime": 35.8226, | |
| "eval_samples_per_second": 55.831, | |
| "eval_steps_per_second": 6.979, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8977777777777778, | |
| "grad_norm": 1.410698981929063, | |
| "learning_rate": 4.4199047844612825e-06, | |
| "loss": 0.9238, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 1.518825110706773, | |
| "learning_rate": 4.4032385138139985e-06, | |
| "loss": 0.9239, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.9155555555555556, | |
| "grad_norm": 1.6639850633807787, | |
| "learning_rate": 4.386368586089674e-06, | |
| "loss": 0.8846, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.9244444444444444, | |
| "grad_norm": 1.6210196882366308, | |
| "learning_rate": 4.369296806464141e-06, | |
| "loss": 0.9081, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 1.5864235639001156, | |
| "learning_rate": 4.3520250017125076e-06, | |
| "loss": 0.8935, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.9422222222222222, | |
| "grad_norm": 1.572481998249584, | |
| "learning_rate": 4.334555020013675e-06, | |
| "loss": 0.8712, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.9511111111111111, | |
| "grad_norm": 1.458386818849127, | |
| "learning_rate": 4.316888730752583e-06, | |
| "loss": 0.9231, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.4402325640506388, | |
| "learning_rate": 4.299028024320166e-06, | |
| "loss": 0.8799, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9688888888888889, | |
| "grad_norm": 1.340308449364537, | |
| "learning_rate": 4.280974811911071e-06, | |
| "loss": 0.9094, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.9777777777777777, | |
| "grad_norm": 1.5272352631206383, | |
| "learning_rate": 4.262731025319159e-06, | |
| "loss": 0.9017, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9777777777777777, | |
| "eval_loss": 0.8765040040016174, | |
| "eval_runtime": 36.05, | |
| "eval_samples_per_second": 55.478, | |
| "eval_steps_per_second": 6.935, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9866666666666667, | |
| "grad_norm": 1.5223097724923493, | |
| "learning_rate": 4.244298616730781e-06, | |
| "loss": 0.906, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.9955555555555555, | |
| "grad_norm": 1.5240087315865598, | |
| "learning_rate": 4.2256795585158894e-06, | |
| "loss": 0.9239, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.0044444444444445, | |
| "grad_norm": 1.4157186107882624, | |
| "learning_rate": 4.2068758430169805e-06, | |
| "loss": 0.8415, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.0133333333333334, | |
| "grad_norm": 1.3850656963075334, | |
| "learning_rate": 4.187889482335905e-06, | |
| "loss": 0.795, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.0222222222222221, | |
| "grad_norm": 1.5545137194154015, | |
| "learning_rate": 4.168722508118562e-06, | |
| "loss": 0.8158, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.031111111111111, | |
| "grad_norm": 1.436097134909851, | |
| "learning_rate": 4.1493769713374995e-06, | |
| "loss": 0.8242, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.4545054070568668, | |
| "learning_rate": 4.12985494207245e-06, | |
| "loss": 0.8222, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.048888888888889, | |
| "grad_norm": 1.6781126937570572, | |
| "learning_rate": 4.110158509288822e-06, | |
| "loss": 0.7896, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.0577777777777777, | |
| "grad_norm": 1.6451559840416778, | |
| "learning_rate": 4.090289780614167e-06, | |
| "loss": 0.8267, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.0666666666666667, | |
| "grad_norm": 1.8165606673067634, | |
| "learning_rate": 4.070250882112652e-06, | |
| "loss": 0.8243, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0666666666666667, | |
| "eval_loss": 0.8799266219139099, | |
| "eval_runtime": 35.8312, | |
| "eval_samples_per_second": 55.817, | |
| "eval_steps_per_second": 6.977, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0755555555555556, | |
| "grad_norm": 1.5731027558902222, | |
| "learning_rate": 4.050043958057561e-06, | |
| "loss": 0.7882, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.0844444444444445, | |
| "grad_norm": 1.5157766166322681, | |
| "learning_rate": 4.029671170701841e-06, | |
| "loss": 0.7994, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.0933333333333333, | |
| "grad_norm": 1.656247414095981, | |
| "learning_rate": 4.009134700046735e-06, | |
| "loss": 0.8028, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.1022222222222222, | |
| "grad_norm": 1.6145210447139136, | |
| "learning_rate": 3.988436743608506e-06, | |
| "loss": 0.7792, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 1.5895335709957146, | |
| "learning_rate": 3.967579516183292e-06, | |
| "loss": 0.8461, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.5213968296343328, | |
| "learning_rate": 3.946565249610108e-06, | |
| "loss": 0.8084, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.1288888888888888, | |
| "grad_norm": 1.6388524059550809, | |
| "learning_rate": 3.925396192532032e-06, | |
| "loss": 0.796, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.1377777777777778, | |
| "grad_norm": 1.6244300431135235, | |
| "learning_rate": 3.90407461015558e-06, | |
| "loss": 0.8123, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.1466666666666667, | |
| "grad_norm": 1.6092762028671914, | |
| "learning_rate": 3.882602784008327e-06, | |
| "loss": 0.7696, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.1555555555555554, | |
| "grad_norm": 1.5021829727375555, | |
| "learning_rate": 3.8609830116947596e-06, | |
| "loss": 0.8015, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1555555555555554, | |
| "eval_loss": 0.8797820210456848, | |
| "eval_runtime": 36.1345, | |
| "eval_samples_per_second": 55.349, | |
| "eval_steps_per_second": 6.919, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1644444444444444, | |
| "grad_norm": 1.615911865378651, | |
| "learning_rate": 3.839217606650426e-06, | |
| "loss": 0.8034, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.1733333333333333, | |
| "grad_norm": 1.7219139523140692, | |
| "learning_rate": 3.817308897894387e-06, | |
| "loss": 0.8028, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.1822222222222223, | |
| "grad_norm": 1.5565441923912047, | |
| "learning_rate": 3.7952592297799904e-06, | |
| "loss": 0.7707, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.1911111111111112, | |
| "grad_norm": 1.5035890539708752, | |
| "learning_rate": 3.7730709617440227e-06, | |
| "loss": 0.7985, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.4799634437203686, | |
| "learning_rate": 3.750746468054227e-06, | |
| "loss": 0.7902, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.208888888888889, | |
| "grad_norm": 1.3993362652699162, | |
| "learning_rate": 3.7282881375552475e-06, | |
| "loss": 0.7858, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.2177777777777778, | |
| "grad_norm": 1.53157940698711, | |
| "learning_rate": 3.70569837341301e-06, | |
| "loss": 0.8245, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.2266666666666666, | |
| "grad_norm": 1.5047456544840674, | |
| "learning_rate": 3.6829795928575703e-06, | |
| "loss": 0.7838, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.2355555555555555, | |
| "grad_norm": 1.6203070493661107, | |
| "learning_rate": 3.6601342269244528e-06, | |
| "loss": 0.8158, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.2444444444444445, | |
| "grad_norm": 1.722919280316014, | |
| "learning_rate": 3.6371647201945216e-06, | |
| "loss": 0.7866, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2444444444444445, | |
| "eval_loss": 0.8785194158554077, | |
| "eval_runtime": 36.4407, | |
| "eval_samples_per_second": 54.884, | |
| "eval_steps_per_second": 6.86, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2533333333333334, | |
| "grad_norm": 1.5717366793745968, | |
| "learning_rate": 3.6140735305323943e-06, | |
| "loss": 0.7952, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.2622222222222224, | |
| "grad_norm": 1.6799739973960937, | |
| "learning_rate": 3.5908631288234374e-06, | |
| "loss": 0.8413, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.271111111111111, | |
| "grad_norm": 1.7727944476099253, | |
| "learning_rate": 3.5675359987093665e-06, | |
| "loss": 0.8119, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.6172985005067175, | |
| "learning_rate": 3.5440946363224855e-06, | |
| "loss": 0.7956, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.2888888888888888, | |
| "grad_norm": 1.346340940135618, | |
| "learning_rate": 3.5205415500185836e-06, | |
| "loss": 0.7975, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.2977777777777777, | |
| "grad_norm": 1.5095896332297247, | |
| "learning_rate": 3.4968792601085296e-06, | |
| "loss": 0.8253, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.3066666666666666, | |
| "grad_norm": 1.7498432683213323, | |
| "learning_rate": 3.473110298588584e-06, | |
| "loss": 0.823, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.3155555555555556, | |
| "grad_norm": 1.5152609108895707, | |
| "learning_rate": 3.4492372088694605e-06, | |
| "loss": 0.838, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.3244444444444445, | |
| "grad_norm": 1.6196695100410112, | |
| "learning_rate": 3.4252625455041684e-06, | |
| "loss": 0.8212, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 1.4865828502504883, | |
| "learning_rate": 3.4011888739146587e-06, | |
| "loss": 0.8163, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "eval_loss": 0.8743957281112671, | |
| "eval_runtime": 36.0452, | |
| "eval_samples_per_second": 55.486, | |
| "eval_steps_per_second": 6.936, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.3422222222222222, | |
| "grad_norm": 1.4717740488802518, | |
| "learning_rate": 3.377018770117315e-06, | |
| "loss": 0.8238, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.3511111111111112, | |
| "grad_norm": 1.559356688866601, | |
| "learning_rate": 3.3527548204472985e-06, | |
| "loss": 0.824, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 1.674377707216799, | |
| "learning_rate": 3.3283996212818015e-06, | |
| "loss": 0.7708, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.3688888888888888, | |
| "grad_norm": 1.6987925635864727, | |
| "learning_rate": 3.303955778762217e-06, | |
| "loss": 0.8285, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.3777777777777778, | |
| "grad_norm": 1.6182932763097473, | |
| "learning_rate": 3.2794259085152703e-06, | |
| "loss": 0.8112, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.3866666666666667, | |
| "grad_norm": 1.5442535633131087, | |
| "learning_rate": 3.254812635373128e-06, | |
| "loss": 0.7727, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.3955555555555557, | |
| "grad_norm": 1.6816544725748876, | |
| "learning_rate": 3.2301185930925318e-06, | |
| "loss": 0.7945, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.4044444444444444, | |
| "grad_norm": 1.8215956260335184, | |
| "learning_rate": 3.205346424072967e-06, | |
| "loss": 0.8047, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.4133333333333333, | |
| "grad_norm": 1.548045904992036, | |
| "learning_rate": 3.180498779073915e-06, | |
| "loss": 0.7649, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.4222222222222223, | |
| "grad_norm": 1.7043703318380858, | |
| "learning_rate": 3.1555783169312048e-06, | |
| "loss": 0.8066, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.4222222222222223, | |
| "eval_loss": 0.8725046515464783, | |
| "eval_runtime": 35.6589, | |
| "eval_samples_per_second": 56.087, | |
| "eval_steps_per_second": 7.011, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.431111111111111, | |
| "grad_norm": 1.629807976853238, | |
| "learning_rate": 3.1305877042725036e-06, | |
| "loss": 0.8237, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.5484379121797882, | |
| "learning_rate": 3.1055296152319732e-06, | |
| "loss": 0.8076, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.448888888888889, | |
| "grad_norm": 1.7262688696739736, | |
| "learning_rate": 3.0804067311641217e-06, | |
| "loss": 0.8333, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.4577777777777778, | |
| "grad_norm": 1.8260535210069693, | |
| "learning_rate": 3.0552217403568855e-06, | |
| "loss": 0.7926, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.4666666666666668, | |
| "grad_norm": 1.6750611032983032, | |
| "learning_rate": 3.0299773377439677e-06, | |
| "loss": 0.7915, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.4755555555555555, | |
| "grad_norm": 1.3990474179417702, | |
| "learning_rate": 3.0046762246164608e-06, | |
| "loss": 0.8013, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.4844444444444445, | |
| "grad_norm": 1.8809571305092, | |
| "learning_rate": 2.979321108333799e-06, | |
| "loss": 0.7652, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.4933333333333334, | |
| "grad_norm": 1.5210473212334878, | |
| "learning_rate": 2.953914702034054e-06, | |
| "loss": 0.7984, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.5022222222222221, | |
| "grad_norm": 1.4249046917639296, | |
| "learning_rate": 2.928459724343613e-06, | |
| "loss": 0.8404, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.511111111111111, | |
| "grad_norm": 1.6177483971885367, | |
| "learning_rate": 2.9029588990862717e-06, | |
| "loss": 0.8194, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.511111111111111, | |
| "eval_loss": 0.8726724982261658, | |
| "eval_runtime": 35.94, | |
| "eval_samples_per_second": 55.648, | |
| "eval_steps_per_second": 6.956, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.5584170495767722, | |
| "learning_rate": 2.8774149549917697e-06, | |
| "loss": 0.7978, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.528888888888889, | |
| "grad_norm": 1.5710118549978462, | |
| "learning_rate": 2.8518306254037996e-06, | |
| "loss": 0.813, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.537777777777778, | |
| "grad_norm": 1.5689927318459866, | |
| "learning_rate": 2.82620864798753e-06, | |
| "loss": 0.8105, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.5466666666666666, | |
| "grad_norm": 1.4832935326504852, | |
| "learning_rate": 2.800551764436652e-06, | |
| "loss": 0.8546, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.5555555555555556, | |
| "grad_norm": 1.5783175731659531, | |
| "learning_rate": 2.774862720180008e-06, | |
| "loss": 0.7933, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.5644444444444443, | |
| "grad_norm": 1.538282343725092, | |
| "learning_rate": 2.749144264087814e-06, | |
| "loss": 0.7878, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.5733333333333333, | |
| "grad_norm": 1.7371423308538598, | |
| "learning_rate": 2.7233991481775173e-06, | |
| "loss": 0.8287, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.5822222222222222, | |
| "grad_norm": 1.4670462319523592, | |
| "learning_rate": 2.697630127319312e-06, | |
| "loss": 0.8091, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.5911111111111111, | |
| "grad_norm": 1.4813586250805493, | |
| "learning_rate": 2.6718399589413533e-06, | |
| "loss": 0.8116, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.4827255891188802, | |
| "learning_rate": 2.6460314027347002e-06, | |
| "loss": 0.8274, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.8705858588218689, | |
| "eval_runtime": 37.1653, | |
| "eval_samples_per_second": 53.814, | |
| "eval_steps_per_second": 6.727, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.608888888888889, | |
| "grad_norm": 1.2776381474133234, | |
| "learning_rate": 2.6202072203580098e-06, | |
| "loss": 0.7884, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.6177777777777778, | |
| "grad_norm": 1.538210347010849, | |
| "learning_rate": 2.594370175142029e-06, | |
| "loss": 0.7876, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.6266666666666667, | |
| "grad_norm": 1.725667854174387, | |
| "learning_rate": 2.5685230317938946e-06, | |
| "loss": 0.7747, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.6355555555555554, | |
| "grad_norm": 1.3174960157067677, | |
| "learning_rate": 2.542668556101305e-06, | |
| "loss": 0.7909, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.6444444444444444, | |
| "grad_norm": 1.5132311970848813, | |
| "learning_rate": 2.516809514636556e-06, | |
| "loss": 0.8031, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.6533333333333333, | |
| "grad_norm": 1.482050621647811, | |
| "learning_rate": 2.4909486744605105e-06, | |
| "loss": 0.787, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.6622222222222223, | |
| "grad_norm": 1.553711235421291, | |
| "learning_rate": 2.4650888028264993e-06, | |
| "loss": 0.801, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.6711111111111112, | |
| "grad_norm": 1.3848153492761215, | |
| "learning_rate": 2.439232666884216e-06, | |
| "loss": 0.7986, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 1.595050634131418, | |
| "learning_rate": 2.413383033383614e-06, | |
| "loss": 0.8155, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.6888888888888889, | |
| "grad_norm": 1.5059920276903438, | |
| "learning_rate": 2.3875426683788497e-06, | |
| "loss": 0.7773, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.6888888888888889, | |
| "eval_loss": 0.8696685433387756, | |
| "eval_runtime": 36.3642, | |
| "eval_samples_per_second": 54.999, | |
| "eval_steps_per_second": 6.875, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.6977777777777778, | |
| "grad_norm": 1.4898946275893297, | |
| "learning_rate": 2.3617143369322988e-06, | |
| "loss": 0.7831, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.7066666666666666, | |
| "grad_norm": 1.7074625969738908, | |
| "learning_rate": 2.33590080281868e-06, | |
| "loss": 0.7772, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.7155555555555555, | |
| "grad_norm": 1.5452190188528385, | |
| "learning_rate": 2.310104828229313e-06, | |
| "loss": 0.7799, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.7244444444444444, | |
| "grad_norm": 1.7059988950700535, | |
| "learning_rate": 2.2843291734765544e-06, | |
| "loss": 0.8215, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.7333333333333334, | |
| "grad_norm": 1.727770499458536, | |
| "learning_rate": 2.2585765966984236e-06, | |
| "loss": 0.8464, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.7422222222222223, | |
| "grad_norm": 1.5915714270418408, | |
| "learning_rate": 2.2328498535634704e-06, | |
| "loss": 0.7807, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.751111111111111, | |
| "grad_norm": 1.5627049232503645, | |
| "learning_rate": 2.2071516969758988e-06, | |
| "loss": 0.7882, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.8357823815684302, | |
| "learning_rate": 2.181484876780996e-06, | |
| "loss": 0.7988, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.7688888888888887, | |
| "grad_norm": 1.5731340393984028, | |
| "learning_rate": 2.1558521394708793e-06, | |
| "loss": 0.8354, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 1.5262237155455842, | |
| "learning_rate": 2.1302562278906106e-06, | |
| "loss": 0.7985, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "eval_loss": 0.8678115010261536, | |
| "eval_runtime": 36.2471, | |
| "eval_samples_per_second": 55.177, | |
| "eval_steps_per_second": 6.897, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.7866666666666666, | |
| "grad_norm": 1.4718386643088002, | |
| "learning_rate": 2.1046998809446932e-06, | |
| "loss": 0.7953, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.7955555555555556, | |
| "grad_norm": 1.5995323653119067, | |
| "learning_rate": 2.0791858333039947e-06, | |
| "loss": 0.806, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.8044444444444445, | |
| "grad_norm": 1.58135246912806, | |
| "learning_rate": 2.0537168151131234e-06, | |
| "loss": 0.795, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.8133333333333335, | |
| "grad_norm": 1.5609289710145282, | |
| "learning_rate": 2.0282955516982865e-06, | |
| "loss": 0.8164, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.8222222222222222, | |
| "grad_norm": 1.4134565673327775, | |
| "learning_rate": 2.002924763275665e-06, | |
| "loss": 0.7827, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.8311111111111111, | |
| "grad_norm": 1.5473112113504999, | |
| "learning_rate": 1.9776071646603355e-06, | |
| "loss": 0.7877, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 1.8489942307077663, | |
| "learning_rate": 1.95234546497577e-06, | |
| "loss": 0.7678, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.8488888888888888, | |
| "grad_norm": 1.8649529470139352, | |
| "learning_rate": 1.9271423673639474e-06, | |
| "loss": 0.7743, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.8577777777777778, | |
| "grad_norm": 1.596641221330504, | |
| "learning_rate": 1.9020005686960962e-06, | |
| "loss": 0.81, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.8666666666666667, | |
| "grad_norm": 1.5772287775631808, | |
| "learning_rate": 1.8769227592841205e-06, | |
| "loss": 0.7761, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.8666666666666667, | |
| "eval_loss": 0.8661899566650391, | |
| "eval_runtime": 36.0247, | |
| "eval_samples_per_second": 55.517, | |
| "eval_steps_per_second": 6.94, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.8755555555555556, | |
| "grad_norm": 1.6860310546652746, | |
| "learning_rate": 1.851911622592717e-06, | |
| "loss": 0.7925, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.8844444444444446, | |
| "grad_norm": 1.496768365281578, | |
| "learning_rate": 1.826969834952234e-06, | |
| "loss": 0.7981, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.8933333333333333, | |
| "grad_norm": 1.5347469355011545, | |
| "learning_rate": 1.8021000652722847e-06, | |
| "loss": 0.8138, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.9022222222222223, | |
| "grad_norm": 1.527775906491713, | |
| "learning_rate": 1.777304974756162e-06, | |
| "loss": 0.7749, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.911111111111111, | |
| "grad_norm": 1.5482837642135612, | |
| "learning_rate": 1.7525872166160735e-06, | |
| "loss": 0.7937, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.3283380313242343, | |
| "learning_rate": 1.7279494357892338e-06, | |
| "loss": 0.812, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.9288888888888889, | |
| "grad_norm": 1.819887679873115, | |
| "learning_rate": 1.7033942686548425e-06, | |
| "loss": 0.8163, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.9377777777777778, | |
| "grad_norm": 1.3392395398864048, | |
| "learning_rate": 1.6789243427519744e-06, | |
| "loss": 0.779, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.9466666666666668, | |
| "grad_norm": 1.4107048094097487, | |
| "learning_rate": 1.6545422764984207e-06, | |
| "loss": 0.8143, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.9555555555555557, | |
| "grad_norm": 1.7233121403066654, | |
| "learning_rate": 1.6302506789105017e-06, | |
| "loss": 0.8017, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.9555555555555557, | |
| "eval_loss": 0.865475058555603, | |
| "eval_runtime": 36.1086, | |
| "eval_samples_per_second": 55.388, | |
| "eval_steps_per_second": 6.924, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.9644444444444444, | |
| "grad_norm": 1.5157145628480404, | |
| "learning_rate": 1.60605214932389e-06, | |
| "loss": 0.7958, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.9733333333333334, | |
| "grad_norm": 1.4682970832921591, | |
| "learning_rate": 1.581949277115466e-06, | |
| "loss": 0.7968, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.982222222222222, | |
| "grad_norm": 1.6162541135699722, | |
| "learning_rate": 1.5579446414262384e-06, | |
| "loss": 0.7914, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.991111111111111, | |
| "grad_norm": 1.7069050244586401, | |
| "learning_rate": 1.5340408108853646e-06, | |
| "loss": 0.7826, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.6052940300197762, | |
| "learning_rate": 1.5102403433352924e-06, | |
| "loss": 0.7933, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.008888888888889, | |
| "grad_norm": 1.4169925983243987, | |
| "learning_rate": 1.4865457855580563e-06, | |
| "loss": 0.7336, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.017777777777778, | |
| "grad_norm": 1.6119178744200524, | |
| "learning_rate": 1.462959673002756e-06, | |
| "loss": 0.7411, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.026666666666667, | |
| "grad_norm": 1.5276893858570764, | |
| "learning_rate": 1.4394845295142524e-06, | |
| "loss": 0.7157, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.0355555555555553, | |
| "grad_norm": 1.6398321349308567, | |
| "learning_rate": 1.4161228670631022e-06, | |
| "loss": 0.6998, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.0444444444444443, | |
| "grad_norm": 1.5117837361418895, | |
| "learning_rate": 1.3928771854767575e-06, | |
| "loss": 0.7595, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.0444444444444443, | |
| "eval_loss": 0.8771167993545532, | |
| "eval_runtime": 36.4262, | |
| "eval_samples_per_second": 54.906, | |
| "eval_steps_per_second": 6.863, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.0533333333333332, | |
| "grad_norm": 1.4021963769227548, | |
| "learning_rate": 1.3697499721720786e-06, | |
| "loss": 0.7053, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.062222222222222, | |
| "grad_norm": 1.599485779658645, | |
| "learning_rate": 1.3467437018891622e-06, | |
| "loss": 0.7226, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.071111111111111, | |
| "grad_norm": 1.6021205768550164, | |
| "learning_rate": 1.3238608364265276e-06, | |
| "loss": 0.7333, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.5545114008887053, | |
| "learning_rate": 1.3011038243776975e-06, | |
| "loss": 0.7438, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.088888888888889, | |
| "grad_norm": 1.636381079268412, | |
| "learning_rate": 1.2784751008691793e-06, | |
| "loss": 0.7404, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.097777777777778, | |
| "grad_norm": 1.4332574632041426, | |
| "learning_rate": 1.2559770872998962e-06, | |
| "loss": 0.7324, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.1066666666666665, | |
| "grad_norm": 1.5030046270555426, | |
| "learning_rate": 1.2336121910820828e-06, | |
| "loss": 0.7435, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.1155555555555554, | |
| "grad_norm": 1.7470180021148172, | |
| "learning_rate": 1.211382805383677e-06, | |
| "loss": 0.7552, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.1244444444444444, | |
| "grad_norm": 1.5916339549467873, | |
| "learning_rate": 1.189291308872243e-06, | |
| "loss": 0.7101, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "grad_norm": 1.8172783810251072, | |
| "learning_rate": 1.1673400654604325e-06, | |
| "loss": 0.7305, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "eval_loss": 0.8782817125320435, | |
| "eval_runtime": 37.4156, | |
| "eval_samples_per_second": 53.454, | |
| "eval_steps_per_second": 6.682, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.1422222222222222, | |
| "grad_norm": 1.6919416300141397, | |
| "learning_rate": 1.1455314240530416e-06, | |
| "loss": 0.7644, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.151111111111111, | |
| "grad_norm": 1.5086154137269185, | |
| "learning_rate": 1.1238677182956606e-06, | |
| "loss": 0.7454, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.5948081894641448, | |
| "learning_rate": 1.1023512663249585e-06, | |
| "loss": 0.7604, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.168888888888889, | |
| "grad_norm": 1.4819840743100468, | |
| "learning_rate": 1.0809843705206364e-06, | |
| "loss": 0.7127, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.1777777777777776, | |
| "grad_norm": 1.6944671193032417, | |
| "learning_rate": 1.0597693172590517e-06, | |
| "loss": 0.7573, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.1866666666666665, | |
| "grad_norm": 1.541348615431903, | |
| "learning_rate": 1.0387083766685716e-06, | |
| "loss": 0.7205, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.1955555555555555, | |
| "grad_norm": 1.6242696709289983, | |
| "learning_rate": 1.0178038023866513e-06, | |
| "loss": 0.7118, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.2044444444444444, | |
| "grad_norm": 1.500556038709764, | |
| "learning_rate": 9.97057831318682e-07, | |
| "loss": 0.7682, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.2133333333333334, | |
| "grad_norm": 1.6664323670898291, | |
| "learning_rate": 9.764726833986332e-07, | |
| "loss": 0.7178, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 1.7295247087941228, | |
| "learning_rate": 9.560505613515063e-07, | |
| "loss": 0.7071, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "eval_loss": 0.8789962530136108, | |
| "eval_runtime": 36.3968, | |
| "eval_samples_per_second": 54.95, | |
| "eval_steps_per_second": 6.869, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.2311111111111113, | |
| "grad_norm": 1.605648408432351, | |
| "learning_rate": 9.357936504576279e-07, | |
| "loss": 0.7544, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.5045989489374607, | |
| "learning_rate": 9.157041183188167e-07, | |
| "loss": 0.7497, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.2488888888888887, | |
| "grad_norm": 1.586179290895908, | |
| "learning_rate": 8.957841146264343e-07, | |
| "loss": 0.7228, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.2577777777777777, | |
| "grad_norm": 1.6244395773314837, | |
| "learning_rate": 8.760357709313602e-07, | |
| "loss": 0.6999, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.2666666666666666, | |
| "grad_norm": 1.4683689082964326, | |
| "learning_rate": 8.564612004159023e-07, | |
| "loss": 0.6985, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.2755555555555556, | |
| "grad_norm": 1.5941817968672853, | |
| "learning_rate": 8.370624976676717e-07, | |
| "loss": 0.729, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.2844444444444445, | |
| "grad_norm": 1.5421068873667845, | |
| "learning_rate": 8.178417384554557e-07, | |
| "loss": 0.7354, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.2933333333333334, | |
| "grad_norm": 1.437158127165305, | |
| "learning_rate": 7.988009795070964e-07, | |
| "loss": 0.7116, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.3022222222222224, | |
| "grad_norm": 1.524186430554368, | |
| "learning_rate": 7.799422582894067e-07, | |
| "loss": 0.7191, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.311111111111111, | |
| "grad_norm": 1.6495374367745645, | |
| "learning_rate": 7.612675927901558e-07, | |
| "loss": 0.7342, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.311111111111111, | |
| "eval_loss": 0.8779633045196533, | |
| "eval_runtime": 36.2671, | |
| "eval_samples_per_second": 55.146, | |
| "eval_steps_per_second": 6.893, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.5355652490456149, | |
| "learning_rate": 7.427789813021271e-07, | |
| "loss": 0.7219, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.328888888888889, | |
| "grad_norm": 1.5957107866076117, | |
| "learning_rate": 7.244784022092957e-07, | |
| "loss": 0.6655, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.3377777777777777, | |
| "grad_norm": 1.632900448971318, | |
| "learning_rate": 7.063678137751265e-07, | |
| "loss": 0.7351, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.3466666666666667, | |
| "grad_norm": 1.6596321264641172, | |
| "learning_rate": 6.884491539330284e-07, | |
| "loss": 0.6971, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.3555555555555556, | |
| "grad_norm": 1.611902880979865, | |
| "learning_rate": 6.707243400789895e-07, | |
| "loss": 0.7094, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.3644444444444446, | |
| "grad_norm": 1.4283956152146977, | |
| "learning_rate": 6.531952688664003e-07, | |
| "loss": 0.7196, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.3733333333333335, | |
| "grad_norm": 1.4988935902436313, | |
| "learning_rate": 6.358638160031027e-07, | |
| "loss": 0.6881, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.3822222222222225, | |
| "grad_norm": 1.4591265664164854, | |
| "learning_rate": 6.187318360506805e-07, | |
| "loss": 0.736, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.391111111111111, | |
| "grad_norm": 1.5350816459153274, | |
| "learning_rate": 6.018011622260079e-07, | |
| "loss": 0.7402, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.5666259856450508, | |
| "learning_rate": 5.850736062050883e-07, | |
| "loss": 0.7255, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 0.877381443977356, | |
| "eval_runtime": 36.3644, | |
| "eval_samples_per_second": 54.999, | |
| "eval_steps_per_second": 6.875, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.408888888888889, | |
| "grad_norm": 1.5260786913555917, | |
| "learning_rate": 5.68550957929194e-07, | |
| "loss": 0.7324, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.417777777777778, | |
| "grad_norm": 1.5807446646796888, | |
| "learning_rate": 5.522349854133297e-07, | |
| "loss": 0.7215, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.4266666666666667, | |
| "grad_norm": 1.449409055029308, | |
| "learning_rate": 5.361274345570505e-07, | |
| "loss": 0.7119, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.4355555555555557, | |
| "grad_norm": 1.5392498745641638, | |
| "learning_rate": 5.202300289576351e-07, | |
| "loss": 0.6946, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 1.693499106185872, | |
| "learning_rate": 5.04544469725656e-07, | |
| "loss": 0.7199, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.453333333333333, | |
| "grad_norm": 1.6004474137695024, | |
| "learning_rate": 4.890724353029491e-07, | |
| "loss": 0.7571, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.462222222222222, | |
| "grad_norm": 1.7437520021683772, | |
| "learning_rate": 4.738155812830114e-07, | |
| "loss": 0.7123, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.471111111111111, | |
| "grad_norm": 1.5046028663576092, | |
| "learning_rate": 4.587755402338434e-07, | |
| "loss": 0.7241, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.6971107366608889, | |
| "learning_rate": 4.439539215232541e-07, | |
| "loss": 0.7508, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.488888888888889, | |
| "grad_norm": 1.4778630633745058, | |
| "learning_rate": 4.293523111466519e-07, | |
| "loss": 0.7483, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.488888888888889, | |
| "eval_loss": 0.8779265880584717, | |
| "eval_runtime": 36.3964, | |
| "eval_samples_per_second": 54.95, | |
| "eval_steps_per_second": 6.869, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.497777777777778, | |
| "grad_norm": 1.620605611623628, | |
| "learning_rate": 4.149722715573343e-07, | |
| "loss": 0.6977, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.506666666666667, | |
| "grad_norm": 1.6034086943258046, | |
| "learning_rate": 4.008153414992924e-07, | |
| "loss": 0.6988, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.5155555555555553, | |
| "grad_norm": 1.6435952136740553, | |
| "learning_rate": 3.868830358425635e-07, | |
| "loss": 0.7421, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.5244444444444447, | |
| "grad_norm": 1.4509202336876796, | |
| "learning_rate": 3.731768454211254e-07, | |
| "loss": 0.6894, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 1.6051037288046612, | |
| "learning_rate": 3.596982368733737e-07, | |
| "loss": 0.7167, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.542222222222222, | |
| "grad_norm": 1.542670069346282, | |
| "learning_rate": 3.464486524851804e-07, | |
| "loss": 0.7186, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.551111111111111, | |
| "grad_norm": 1.571528472589531, | |
| "learning_rate": 3.334295100355611e-07, | |
| "loss": 0.7276, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.4707258460441264, | |
| "learning_rate": 3.2064220264496735e-07, | |
| "loss": 0.7331, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.568888888888889, | |
| "grad_norm": 1.5241920182375372, | |
| "learning_rate": 3.0808809862621414e-07, | |
| "loss": 0.6941, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.5777777777777775, | |
| "grad_norm": 1.4914245546973939, | |
| "learning_rate": 2.9576854133805954e-07, | |
| "loss": 0.7285, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.5777777777777775, | |
| "eval_loss": 0.8775882124900818, | |
| "eval_runtime": 36.3492, | |
| "eval_samples_per_second": 55.022, | |
| "eval_steps_per_second": 6.878, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.586666666666667, | |
| "grad_norm": 1.563575238301146, | |
| "learning_rate": 2.836848490414637e-07, | |
| "loss": 0.7469, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.5955555555555554, | |
| "grad_norm": 1.545598396057516, | |
| "learning_rate": 2.718383147585213e-07, | |
| "loss": 0.7357, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.6044444444444443, | |
| "grad_norm": 1.8665840567581016, | |
| "learning_rate": 2.602302061341069e-07, | |
| "loss": 0.6887, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.6133333333333333, | |
| "grad_norm": 1.8496458894791434, | |
| "learning_rate": 2.4886176530022677e-07, | |
| "loss": 0.7173, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.6222222222222222, | |
| "grad_norm": 1.8326585874144004, | |
| "learning_rate": 2.3773420874310226e-07, | |
| "loss": 0.7295, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.631111111111111, | |
| "grad_norm": 1.6457759357884785, | |
| "learning_rate": 2.2684872717300355e-07, | |
| "loss": 0.7146, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.5545055495774078, | |
| "learning_rate": 2.1620648539683293e-07, | |
| "loss": 0.7417, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.648888888888889, | |
| "grad_norm": 1.7824620672718754, | |
| "learning_rate": 2.0580862219348656e-07, | |
| "loss": 0.7092, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.6577777777777776, | |
| "grad_norm": 1.670989523069054, | |
| "learning_rate": 1.9565625019199785e-07, | |
| "loss": 0.7593, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 1.597030903429691, | |
| "learning_rate": 1.8575045575247918e-07, | |
| "loss": 0.7462, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "eval_loss": 0.8767767548561096, | |
| "eval_runtime": 36.7181, | |
| "eval_samples_per_second": 54.469, | |
| "eval_steps_per_second": 6.809, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.6755555555555555, | |
| "grad_norm": 1.4419419276479524, | |
| "learning_rate": 1.76092298849877e-07, | |
| "loss": 0.7178, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.6844444444444444, | |
| "grad_norm": 1.652273431476157, | |
| "learning_rate": 1.66682812960548e-07, | |
| "loss": 0.7445, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.6933333333333334, | |
| "grad_norm": 1.6221941093351853, | |
| "learning_rate": 1.5752300495167017e-07, | |
| "loss": 0.7323, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.7022222222222223, | |
| "grad_norm": 1.5827263625862764, | |
| "learning_rate": 1.4861385497350472e-07, | |
| "loss": 0.7117, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.7111111111111112, | |
| "grad_norm": 1.586274147952276, | |
| "learning_rate": 1.399563163545123e-07, | |
| "loss": 0.6905, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 1.5819937681513818, | |
| "learning_rate": 1.315513154993431e-07, | |
| "loss": 0.7248, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.728888888888889, | |
| "grad_norm": 1.583127670414582, | |
| "learning_rate": 1.233997517897062e-07, | |
| "loss": 0.7338, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.7377777777777776, | |
| "grad_norm": 1.5851377143113055, | |
| "learning_rate": 1.155024974881297e-07, | |
| "loss": 0.7016, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.7466666666666666, | |
| "grad_norm": 1.629306178995298, | |
| "learning_rate": 1.0786039764462492e-07, | |
| "loss": 0.734, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.7555555555555555, | |
| "grad_norm": 1.88514847397562, | |
| "learning_rate": 1.0047427000626164e-07, | |
| "loss": 0.7338, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.7555555555555555, | |
| "eval_loss": 0.8767729997634888, | |
| "eval_runtime": 36.1716, | |
| "eval_samples_per_second": 55.292, | |
| "eval_steps_per_second": 6.911, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.7644444444444445, | |
| "grad_norm": 1.7789627575897133, | |
| "learning_rate": 9.33449049296628e-08, | |
| "loss": 0.731, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.7733333333333334, | |
| "grad_norm": 1.5683993214305225, | |
| "learning_rate": 8.647306529643378e-08, | |
| "loss": 0.6994, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.7822222222222224, | |
| "grad_norm": 1.7367743332470011, | |
| "learning_rate": 7.985948643152913e-08, | |
| "loss": 0.713, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.7911111111111113, | |
| "grad_norm": 1.5856496471015165, | |
| "learning_rate": 7.35048760245688e-08, | |
| "loss": 0.732, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.5163465560548002, | |
| "learning_rate": 6.740991405411151e-08, | |
| "loss": 0.7603, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.8088888888888888, | |
| "grad_norm": 1.5838531712850452, | |
| "learning_rate": 6.157525271489245e-08, | |
| "loss": 0.719, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.8177777777777777, | |
| "grad_norm": 1.5510855817737712, | |
| "learning_rate": 5.60015163480368e-08, | |
| "loss": 0.7072, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.8266666666666667, | |
| "grad_norm": 1.606625002529381, | |
| "learning_rate": 5.0689301374249045e-08, | |
| "loss": 0.729, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.8355555555555556, | |
| "grad_norm": 1.8722551524392257, | |
| "learning_rate": 4.5639176229995696e-08, | |
| "loss": 0.7129, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.8444444444444446, | |
| "grad_norm": 1.4907886093472633, | |
| "learning_rate": 4.085168130667672e-08, | |
| "loss": 0.7248, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.8444444444444446, | |
| "eval_loss": 0.8769278526306152, | |
| "eval_runtime": 36.1486, | |
| "eval_samples_per_second": 55.327, | |
| "eval_steps_per_second": 6.916, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.8533333333333335, | |
| "grad_norm": 1.7182061158055983, | |
| "learning_rate": 3.6327328892801774e-08, | |
| "loss": 0.7024, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.862222222222222, | |
| "grad_norm": 1.7249109800368647, | |
| "learning_rate": 3.2066603119173255e-08, | |
| "loss": 0.7433, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.871111111111111, | |
| "grad_norm": 1.5746847008202491, | |
| "learning_rate": 2.8069959907080502e-08, | |
| "loss": 0.7267, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.5015420252419114, | |
| "learning_rate": 2.4337826919513808e-08, | |
| "loss": 0.7244, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 1.50527687101669, | |
| "learning_rate": 2.0870603515402986e-08, | |
| "loss": 0.706, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.897777777777778, | |
| "grad_norm": 1.5776025878841073, | |
| "learning_rate": 1.766866070688267e-08, | |
| "loss": 0.7579, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.9066666666666667, | |
| "grad_norm": 1.6091761478348845, | |
| "learning_rate": 1.4732341119592375e-08, | |
| "loss": 0.706, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.9155555555555557, | |
| "grad_norm": 1.6516161509088219, | |
| "learning_rate": 1.2061958956013641e-08, | |
| "loss": 0.7316, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.924444444444444, | |
| "grad_norm": 1.4091978233996505, | |
| "learning_rate": 9.657799961849401e-09, | |
| "loss": 0.7151, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.9333333333333336, | |
| "grad_norm": 1.7591108514188951, | |
| "learning_rate": 7.520121395446223e-09, | |
| "loss": 0.7053, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.9333333333333336, | |
| "eval_loss": 0.8769709467887878, | |
| "eval_runtime": 36.193, | |
| "eval_samples_per_second": 55.259, | |
| "eval_steps_per_second": 6.907, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.942222222222222, | |
| "grad_norm": 1.5676511530944413, | |
| "learning_rate": 5.6491520002668885e-09, | |
| "loss": 0.7137, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.951111111111111, | |
| "grad_norm": 1.6109445584580693, | |
| "learning_rate": 4.045091980413862e-09, | |
| "loss": 0.693, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.596050238626073, | |
| "learning_rate": 2.7081129792061458e-09, | |
| "loss": 0.7183, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.968888888888889, | |
| "grad_norm": 1.4897287860994974, | |
| "learning_rate": 1.6383580608120287e-09, | |
| "loss": 0.7327, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.977777777777778, | |
| "grad_norm": 1.567603974160796, | |
| "learning_rate": 8.35941694941056e-10, | |
| "loss": 0.7431, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.986666666666667, | |
| "grad_norm": 1.4235316742408306, | |
| "learning_rate": 3.0094974459521367e-10, | |
| "loss": 0.73, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.9955555555555557, | |
| "grad_norm": 1.6271857085054227, | |
| "learning_rate": 3.3439456879891255e-11, | |
| "loss": 0.7037, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 3375, | |
| "total_flos": 125861383569408.0, | |
| "train_loss": 0.8214060693670202, | |
| "train_runtime": 4861.6121, | |
| "train_samples_per_second": 11.107, | |
| "train_steps_per_second": 0.694 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3375, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 125861383569408.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |