{
  "best_global_step": 2550,
  "best_metric": 4.99726676940918,
  "best_model_checkpoint": ".../training_output/checkpoint-1000",
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 3129,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009587727708533078,
      "grad_norm": 1.2602713108062744,
      "learning_rate": 2.875399361022364e-07,
      "loss": 5.0879,
      "step": 10
    },
    {
      "epoch": 0.019175455417066157,
      "grad_norm": 1.1363953351974487,
      "learning_rate": 6.070287539936103e-07,
      "loss": 5.1046,
      "step": 20
    },
    {
      "epoch": 0.028763183125599234,
      "grad_norm": 1.1238548755645752,
      "learning_rate": 9.265175718849841e-07,
      "loss": 5.0837,
      "step": 30
    },
    {
      "epoch": 0.038350910834132314,
      "grad_norm": 1.0674521923065186,
      "learning_rate": 1.2460063897763578e-06,
      "loss": 5.0778,
      "step": 40
    },
    {
      "epoch": 0.04793863854266539,
      "grad_norm": 1.0108286142349243,
      "learning_rate": 1.565495207667732e-06,
      "loss": 5.0643,
      "step": 50
    },
    {
      "epoch": 0.04793863854266539,
      "eval_q2q_data_loss": 5.071373462677002,
      "eval_q2q_data_runtime": 8.6567,
      "eval_q2q_data_samples_per_second": 312.475,
      "eval_q2q_data_steps_per_second": 19.638,
      "step": 50
    },
    {
      "epoch": 0.04793863854266539,
      "eval_q2p_data_loss": 5.046911239624023,
      "eval_q2p_data_runtime": 15.4129,
      "eval_q2p_data_samples_per_second": 52.683,
      "eval_q2p_data_steps_per_second": 3.309,
      "step": 50
    },
    {
      "epoch": 0.05752636625119847,
      "grad_norm": 1.051458477973938,
      "learning_rate": 1.8849840255591056e-06,
      "loss": 5.0424,
      "step": 60
    },
    {
      "epoch": 0.06711409395973154,
      "grad_norm": 1.123085856437683,
      "learning_rate": 2.2044728434504793e-06,
      "loss": 5.0255,
      "step": 70
    },
    {
      "epoch": 0.07670182166826463,
      "grad_norm": 0.8094280362129211,
      "learning_rate": 2.5239616613418532e-06,
      "loss": 5.0099,
      "step": 80
    },
    {
      "epoch": 0.0862895493767977,
      "grad_norm": 1.4995239973068237,
      "learning_rate": 2.8434504792332267e-06,
      "loss": 5.0063,
      "step": 90
    },
    {
      "epoch": 0.09587727708533078,
      "grad_norm": 0.6668018698692322,
      "learning_rate": 3.162939297124601e-06,
      "loss": 5.0033,
      "step": 100
    },
    {
      "epoch": 0.09587727708533078,
      "eval_q2q_data_loss": 5.014667510986328,
      "eval_q2q_data_runtime": 8.6334,
      "eval_q2q_data_samples_per_second": 313.318,
      "eval_q2q_data_steps_per_second": 19.691,
      "step": 100
    },
    {
      "epoch": 0.09587727708533078,
      "eval_q2p_data_loss": 5.0004682540893555,
      "eval_q2p_data_runtime": 15.4405,
      "eval_q2p_data_samples_per_second": 52.589,
      "eval_q2p_data_steps_per_second": 3.303,
      "step": 100
    },
    {
      "epoch": 0.10546500479386385,
      "grad_norm": 0.811168909072876,
      "learning_rate": 3.482428115015975e-06,
      "loss": 5.003,
      "step": 110
    },
    {
      "epoch": 0.11505273250239693,
      "grad_norm": 1.420505404472351,
      "learning_rate": 3.8019169329073485e-06,
      "loss": 4.9967,
      "step": 120
    },
    {
      "epoch": 0.12464046021093,
      "grad_norm": 5.024260520935059,
      "learning_rate": 4.121405750798722e-06,
      "loss": 4.998,
      "step": 130
    },
    {
      "epoch": 0.1342281879194631,
      "grad_norm": 4.843268394470215,
      "learning_rate": 4.440894568690096e-06,
      "loss": 5.0012,
      "step": 140
    },
    {
      "epoch": 0.14381591562799617,
      "grad_norm": 0.6666759848594666,
      "learning_rate": 4.76038338658147e-06,
      "loss": 4.9989,
      "step": 150
    },
    {
      "epoch": 0.14381591562799617,
      "eval_q2q_data_loss": 5.009535312652588,
      "eval_q2q_data_runtime": 8.5717,
      "eval_q2q_data_samples_per_second": 315.574,
      "eval_q2q_data_steps_per_second": 19.833,
      "step": 150
    },
    {
      "epoch": 0.14381591562799617,
      "eval_q2p_data_loss": 4.942420959472656,
      "eval_q2p_data_runtime": 15.4905,
      "eval_q2p_data_samples_per_second": 52.419,
      "eval_q2p_data_steps_per_second": 3.292,
      "step": 150
    },
    {
      "epoch": 0.15340364333652926,
      "grad_norm": 0.6130227446556091,
      "learning_rate": 5.079872204472844e-06,
      "loss": 4.9908,
      "step": 160
    },
    {
      "epoch": 0.1629913710450623,
      "grad_norm": 0.7333933711051941,
      "learning_rate": 5.399361022364218e-06,
      "loss": 4.9735,
      "step": 170
    },
    {
      "epoch": 0.1725790987535954,
      "grad_norm": 2.2645883560180664,
      "learning_rate": 5.718849840255591e-06,
      "loss": 4.9965,
      "step": 180
    },
    {
      "epoch": 0.18216682646212848,
      "grad_norm": 0.6750437617301941,
      "learning_rate": 6.038338658146965e-06,
      "loss": 4.9825,
      "step": 190
    },
    {
      "epoch": 0.19175455417066156,
      "grad_norm": 8.299290657043457,
      "learning_rate": 6.35782747603834e-06,
      "loss": 4.9514,
      "step": 200
    },
    {
      "epoch": 0.19175455417066156,
      "eval_q2q_data_loss": 5.007415294647217,
      "eval_q2q_data_runtime": 8.6664,
      "eval_q2q_data_samples_per_second": 312.126,
      "eval_q2q_data_steps_per_second": 19.616,
      "step": 200
    },
    {
      "epoch": 0.19175455417066156,
      "eval_q2p_data_loss": 4.874378204345703,
      "eval_q2p_data_runtime": 15.5099,
      "eval_q2p_data_samples_per_second": 52.354,
      "eval_q2p_data_steps_per_second": 3.288,
      "step": 200
    },
    {
      "epoch": 0.20134228187919462,
      "grad_norm": 1.9930428266525269,
      "learning_rate": 6.677316293929713e-06,
      "loss": 4.9521,
      "step": 210
    },
    {
      "epoch": 0.2109300095877277,
      "grad_norm": 4.539638042449951,
      "learning_rate": 6.996805111821087e-06,
      "loss": 4.968,
      "step": 220
    },
    {
      "epoch": 0.22051773729626079,
      "grad_norm": 0.5192278027534485,
      "learning_rate": 7.316293929712461e-06,
      "loss": 4.96,
      "step": 230
    },
    {
      "epoch": 0.23010546500479387,
      "grad_norm": 4.190878868103027,
      "learning_rate": 7.635782747603835e-06,
      "loss": 4.9758,
      "step": 240
    },
    {
      "epoch": 0.23969319271332695,
      "grad_norm": 0.7492648959159851,
      "learning_rate": 7.955271565495208e-06,
      "loss": 4.9834,
      "step": 250
    },
    {
      "epoch": 0.23969319271332695,
      "eval_q2q_data_loss": 5.00647497177124,
      "eval_q2q_data_runtime": 8.6319,
      "eval_q2q_data_samples_per_second": 313.372,
      "eval_q2q_data_steps_per_second": 19.694,
      "step": 250
    },
    {
      "epoch": 0.23969319271332695,
      "eval_q2p_data_loss": 4.842836856842041,
      "eval_q2p_data_runtime": 15.4423,
      "eval_q2p_data_samples_per_second": 52.583,
      "eval_q2p_data_steps_per_second": 3.303,
      "step": 250
    },
    {
      "epoch": 0.24928092042186,
      "grad_norm": 1.2294269800186157,
      "learning_rate": 8.274760383386582e-06,
      "loss": 4.9273,
      "step": 260
    },
    {
      "epoch": 0.2588686481303931,
      "grad_norm": 1.7497507333755493,
      "learning_rate": 8.594249201277956e-06,
      "loss": 4.9796,
      "step": 270
    },
    {
      "epoch": 0.2684563758389262,
      "grad_norm": 5.415214538574219,
      "learning_rate": 8.91373801916933e-06,
      "loss": 4.9517,
      "step": 280
    },
    {
      "epoch": 0.27804410354745923,
      "grad_norm": 2.2691502571105957,
      "learning_rate": 9.233226837060704e-06,
      "loss": 4.9763,
      "step": 290
    },
    {
      "epoch": 0.28763183125599234,
      "grad_norm": 5.458872318267822,
      "learning_rate": 9.552715654952077e-06,
      "loss": 4.9372,
      "step": 300
    },
    {
      "epoch": 0.28763183125599234,
      "eval_q2q_data_loss": 5.0056328773498535,
      "eval_q2q_data_runtime": 8.5076,
      "eval_q2q_data_samples_per_second": 317.952,
      "eval_q2q_data_steps_per_second": 19.982,
      "step": 300
    },
    {
      "epoch": 0.28763183125599234,
      "eval_q2p_data_loss": 4.825343608856201,
      "eval_q2p_data_runtime": 15.402,
      "eval_q2p_data_samples_per_second": 52.72,
      "eval_q2p_data_steps_per_second": 3.311,
      "step": 300
    },
    {
      "epoch": 0.2972195589645254,
      "grad_norm": 4.435003757476807,
      "learning_rate": 9.87220447284345e-06,
      "loss": 4.9325,
      "step": 310
    },
    {
      "epoch": 0.3068072866730585,
      "grad_norm": 0.34137386083602905,
      "learning_rate": 9.978693181818183e-06,
      "loss": 4.9477,
      "step": 320
    },
    {
      "epoch": 0.31639501438159157,
      "grad_norm": 1.3951576948165894,
      "learning_rate": 9.943181818181819e-06,
      "loss": 4.9455,
      "step": 330
    },
    {
      "epoch": 0.3259827420901246,
      "grad_norm": 8.795852661132812,
      "learning_rate": 9.907670454545455e-06,
      "loss": 4.9258,
      "step": 340
    },
    {
      "epoch": 0.33557046979865773,
      "grad_norm": 0.4223299026489258,
      "learning_rate": 9.872159090909091e-06,
      "loss": 4.9799,
      "step": 350
    },
    {
      "epoch": 0.33557046979865773,
      "eval_q2q_data_loss": 5.004530429840088,
      "eval_q2q_data_runtime": 8.523,
      "eval_q2q_data_samples_per_second": 317.375,
      "eval_q2q_data_steps_per_second": 19.946,
      "step": 350
    },
    {
      "epoch": 0.33557046979865773,
      "eval_q2p_data_loss": 4.843413352966309,
      "eval_q2p_data_runtime": 15.444,
      "eval_q2p_data_samples_per_second": 52.577,
      "eval_q2p_data_steps_per_second": 3.302,
      "step": 350
    },
    {
      "epoch": 0.3451581975071908,
      "grad_norm": 0.3708871006965637,
      "learning_rate": 9.836647727272728e-06,
      "loss": 4.9791,
      "step": 360
    },
    {
      "epoch": 0.3547459252157239,
      "grad_norm": 0.3105733096599579,
      "learning_rate": 9.801136363636364e-06,
      "loss": 4.9437,
      "step": 370
    },
    {
      "epoch": 0.36433365292425696,
      "grad_norm": 0.3218185007572174,
      "learning_rate": 9.765625e-06,
      "loss": 4.9873,
      "step": 380
    },
    {
      "epoch": 0.37392138063279,
      "grad_norm": 0.29383164644241333,
      "learning_rate": 9.730113636363636e-06,
      "loss": 4.9425,
      "step": 390
    },
    {
      "epoch": 0.3835091083413231,
      "grad_norm": 4.873048305511475,
      "learning_rate": 9.694602272727274e-06,
      "loss": 4.9837,
      "step": 400
    },
    {
      "epoch": 0.3835091083413231,
      "eval_q2q_data_loss": 5.004254341125488,
      "eval_q2q_data_runtime": 8.5135,
      "eval_q2q_data_samples_per_second": 317.73,
      "eval_q2q_data_steps_per_second": 19.968,
      "step": 400
    },
    {
      "epoch": 0.3835091083413231,
      "eval_q2p_data_loss": 4.841865539550781,
      "eval_q2p_data_runtime": 15.3899,
      "eval_q2p_data_samples_per_second": 52.762,
      "eval_q2p_data_steps_per_second": 3.314,
      "step": 400
    },
    {
      "epoch": 0.3930968360498562,
      "grad_norm": 0.3491421639919281,
      "learning_rate": 9.65909090909091e-06,
      "loss": 5.0006,
      "step": 410
    },
    {
      "epoch": 0.40268456375838924,
      "grad_norm": 5.751034259796143,
      "learning_rate": 9.623579545454547e-06,
      "loss": 4.9831,
      "step": 420
    },
    {
      "epoch": 0.41227229146692235,
      "grad_norm": 0.34302422404289246,
      "learning_rate": 9.588068181818183e-06,
      "loss": 4.9531,
      "step": 430
    },
    {
      "epoch": 0.4218600191754554,
      "grad_norm": 0.4230528771877289,
      "learning_rate": 9.552556818181818e-06,
      "loss": 4.9856,
      "step": 440
    },
    {
      "epoch": 0.4314477468839885,
      "grad_norm": 17.237260818481445,
      "learning_rate": 9.517045454545454e-06,
      "loss": 4.8996,
      "step": 450
    },
    {
      "epoch": 0.4314477468839885,
      "eval_q2q_data_loss": 5.005645751953125,
      "eval_q2q_data_runtime": 8.483,
      "eval_q2q_data_samples_per_second": 318.872,
      "eval_q2q_data_steps_per_second": 20.04,
      "step": 450
    },
    {
      "epoch": 0.4314477468839885,
      "eval_q2p_data_loss": 4.865195274353027,
      "eval_q2p_data_runtime": 15.3699,
      "eval_q2p_data_samples_per_second": 52.83,
      "eval_q2p_data_steps_per_second": 3.318,
      "step": 450
    },
    {
      "epoch": 0.44103547459252157,
      "grad_norm": 2.0367865562438965,
      "learning_rate": 9.481534090909092e-06,
      "loss": 4.9467,
      "step": 460
    },
    {
      "epoch": 0.4506232023010546,
      "grad_norm": 0.41367027163505554,
      "learning_rate": 9.446022727272728e-06,
      "loss": 4.9724,
      "step": 470
    },
    {
      "epoch": 0.46021093000958774,
      "grad_norm": 11.92837142944336,
      "learning_rate": 9.410511363636365e-06,
      "loss": 4.9797,
      "step": 480
    },
    {
      "epoch": 0.4697986577181208,
      "grad_norm": 0.38374051451683044,
      "learning_rate": 9.375000000000001e-06,
      "loss": 4.9735,
      "step": 490
    },
    {
      "epoch": 0.4793863854266539,
      "grad_norm": 5.73974609375,
      "learning_rate": 9.339488636363637e-06,
      "loss": 4.8765,
      "step": 500
    },
    {
      "epoch": 0.4793863854266539,
      "eval_q2q_data_loss": 5.003554821014404,
      "eval_q2q_data_runtime": 8.5075,
      "eval_q2q_data_samples_per_second": 317.954,
      "eval_q2q_data_steps_per_second": 19.982,
      "step": 500
    },
    {
      "epoch": 0.4793863854266539,
      "eval_q2p_data_loss": 4.845742225646973,
      "eval_q2p_data_runtime": 15.4131,
      "eval_q2p_data_samples_per_second": 52.682,
      "eval_q2p_data_steps_per_second": 3.309,
      "step": 500
    },
    {
      "epoch": 0.48897411313518696,
      "grad_norm": 0.673588216304779,
      "learning_rate": 9.303977272727273e-06,
      "loss": 4.9136,
      "step": 510
    },
    {
      "epoch": 0.49856184084372,
      "grad_norm": 0.6867577433586121,
      "learning_rate": 9.26846590909091e-06,
      "loss": 4.9688,
      "step": 520
    },
    {
      "epoch": 0.5081495685522531,
      "grad_norm": 0.5350639224052429,
      "learning_rate": 9.232954545454546e-06,
      "loss": 4.9436,
      "step": 530
    },
    {
      "epoch": 0.5177372962607862,
      "grad_norm": 0.4116136133670807,
      "learning_rate": 9.197443181818184e-06,
      "loss": 5.0017,
      "step": 540
    },
    {
      "epoch": 0.5273250239693192,
      "grad_norm": 10.749342918395996,
      "learning_rate": 9.161931818181818e-06,
      "loss": 4.9867,
      "step": 550
    },
    {
      "epoch": 0.5273250239693192,
      "eval_q2q_data_loss": 5.004271507263184,
      "eval_q2q_data_runtime": 8.4877,
      "eval_q2q_data_samples_per_second": 318.695,
      "eval_q2q_data_steps_per_second": 20.029,
      "step": 550
    },
    {
      "epoch": 0.5273250239693192,
      "eval_q2p_data_loss": 4.860942363739014,
      "eval_q2p_data_runtime": 15.3408,
      "eval_q2p_data_samples_per_second": 52.931,
      "eval_q2p_data_steps_per_second": 3.324,
      "step": 550
    },
    {
      "epoch": 0.5369127516778524,
      "grad_norm": 0.3119679093360901,
      "learning_rate": 9.126420454545455e-06,
      "loss": 4.9716,
      "step": 560
    },
    {
      "epoch": 0.5465004793863855,
      "grad_norm": 0.2090018391609192,
      "learning_rate": 9.090909090909091e-06,
      "loss": 4.9338,
      "step": 570
    },
    {
      "epoch": 0.5560882070949185,
      "grad_norm": 0.2094723880290985,
      "learning_rate": 9.055397727272727e-06,
      "loss": 4.9975,
      "step": 580
    },
    {
      "epoch": 0.5656759348034516,
      "grad_norm": 0.16981257498264313,
      "learning_rate": 9.019886363636364e-06,
      "loss": 4.9485,
      "step": 590
    },
    {
      "epoch": 0.5752636625119847,
      "grad_norm": 15.281989097595215,
      "learning_rate": 8.984375000000002e-06,
      "loss": 4.8959,
      "step": 600
    },
    {
      "epoch": 0.5752636625119847,
      "eval_q2q_data_loss": 5.002608299255371,
      "eval_q2q_data_runtime": 8.4635,
      "eval_q2q_data_samples_per_second": 319.608,
      "eval_q2q_data_steps_per_second": 20.086,
      "step": 600
    },
    {
      "epoch": 0.5752636625119847,
      "eval_q2p_data_loss": 4.780869483947754,
      "eval_q2p_data_runtime": 15.3652,
      "eval_q2p_data_samples_per_second": 52.847,
      "eval_q2p_data_steps_per_second": 3.319,
      "step": 600
    },
    {
      "epoch": 0.5848513902205177,
      "grad_norm": 16.331180572509766,
      "learning_rate": 8.948863636363638e-06,
      "loss": 4.9769,
      "step": 610
    },
    {
      "epoch": 0.5944391179290508,
      "grad_norm": 0.17700470983982086,
      "learning_rate": 8.913352272727274e-06,
      "loss": 4.9407,
      "step": 620
    },
    {
      "epoch": 0.6040268456375839,
      "grad_norm": 6.958109378814697,
      "learning_rate": 8.87784090909091e-06,
      "loss": 4.9941,
      "step": 630
    },
    {
      "epoch": 0.613614573346117,
      "grad_norm": 5.405721664428711,
      "learning_rate": 8.842329545454547e-06,
      "loss": 4.976,
      "step": 640
    },
    {
      "epoch": 0.62320230105465,
      "grad_norm": 0.2884855270385742,
      "learning_rate": 8.806818181818183e-06,
      "loss": 4.986,
      "step": 650
    },
    {
      "epoch": 0.62320230105465,
      "eval_q2q_data_loss": 5.003030776977539,
      "eval_q2q_data_runtime": 8.5486,
      "eval_q2q_data_samples_per_second": 316.425,
      "eval_q2q_data_steps_per_second": 19.886,
      "step": 650
    },
    {
      "epoch": 0.62320230105465,
      "eval_q2p_data_loss": 4.810172080993652,
      "eval_q2p_data_runtime": 15.3666,
      "eval_q2p_data_samples_per_second": 52.842,
      "eval_q2p_data_steps_per_second": 3.319,
      "step": 650
    },
    {
      "epoch": 0.6327900287631831,
      "grad_norm": 0.44038277864456177,
      "learning_rate": 8.77130681818182e-06,
      "loss": 4.94,
      "step": 660
    },
    {
      "epoch": 0.6423777564717162,
      "grad_norm": 0.35095784068107605,
      "learning_rate": 8.735795454545455e-06,
      "loss": 4.9917,
      "step": 670
    },
    {
      "epoch": 0.6519654841802492,
      "grad_norm": 0.7992573976516724,
      "learning_rate": 8.700284090909092e-06,
      "loss": 4.9938,
      "step": 680
    },
    {
      "epoch": 0.6615532118887824,
      "grad_norm": 12.68810749053955,
      "learning_rate": 8.664772727272728e-06,
      "loss": 4.9373,
      "step": 690
    },
    {
      "epoch": 0.6711409395973155,
      "grad_norm": 8.244370460510254,
      "learning_rate": 8.629261363636364e-06,
      "loss": 5.0235,
      "step": 700
    },
    {
      "epoch": 0.6711409395973155,
      "eval_q2q_data_loss": 5.032140254974365,
      "eval_q2q_data_runtime": 8.4755,
      "eval_q2q_data_samples_per_second": 319.155,
      "eval_q2q_data_steps_per_second": 20.058,
      "step": 700
    },
    {
      "epoch": 0.6711409395973155,
      "eval_q2p_data_loss": 4.879370212554932,
      "eval_q2p_data_runtime": 15.3816,
      "eval_q2p_data_samples_per_second": 52.79,
      "eval_q2p_data_steps_per_second": 3.316,
      "step": 700
    },
    {
      "epoch": 0.6807286673058485,
      "grad_norm": 12.066866874694824,
      "learning_rate": 8.59375e-06,
      "loss": 4.939,
      "step": 710
    },
    {
      "epoch": 0.6903163950143816,
      "grad_norm": 15.054842948913574,
      "learning_rate": 8.558238636363637e-06,
      "loss": 4.9682,
      "step": 720
    },
    {
      "epoch": 0.6999041227229147,
      "grad_norm": 1.6012367010116577,
      "learning_rate": 8.522727272727273e-06,
      "loss": 4.9813,
      "step": 730
    },
    {
      "epoch": 0.7094918504314478,
      "grad_norm": 6.062280654907227,
      "learning_rate": 8.48721590909091e-06,
      "loss": 4.9442,
      "step": 740
    },
    {
      "epoch": 0.7190795781399808,
      "grad_norm": 0.4181146025657654,
      "learning_rate": 8.451704545454547e-06,
      "loss": 4.9354,
      "step": 750
    },
    {
      "epoch": 0.7190795781399808,
      "eval_q2q_data_loss": 5.002427577972412,
      "eval_q2q_data_runtime": 8.4867,
      "eval_q2q_data_samples_per_second": 318.733,
      "eval_q2q_data_steps_per_second": 20.031,
      "step": 750
    },
    {
      "epoch": 0.7190795781399808,
      "eval_q2p_data_loss": 4.805325508117676,
      "eval_q2p_data_runtime": 15.3619,
      "eval_q2p_data_samples_per_second": 52.858,
      "eval_q2p_data_steps_per_second": 3.32,
      "step": 750
    },
    {
      "epoch": 0.7286673058485139,
      "grad_norm": 0.23768964409828186,
      "learning_rate": 8.416193181818184e-06,
      "loss": 4.9105,
      "step": 760
    },
    {
      "epoch": 0.738255033557047,
      "grad_norm": 1.1970841884613037,
      "learning_rate": 8.380681818181818e-06,
      "loss": 4.9271,
      "step": 770
    },
    {
      "epoch": 0.74784276126558,
      "grad_norm": 0.22903920710086823,
      "learning_rate": 8.345170454545454e-06,
      "loss": 4.9476,
      "step": 780
    },
    {
      "epoch": 0.7574304889741131,
      "grad_norm": 9.315869331359863,
      "learning_rate": 8.30965909090909e-06,
      "loss": 4.8887,
      "step": 790
    },
    {
      "epoch": 0.7670182166826462,
      "grad_norm": 0.27411147952079773,
      "learning_rate": 8.274147727272727e-06,
      "loss": 4.9576,
      "step": 800
    },
    {
      "epoch": 0.7670182166826462,
      "eval_q2q_data_loss": 5.001960754394531,
      "eval_q2q_data_runtime": 8.5354,
      "eval_q2q_data_samples_per_second": 316.917,
      "eval_q2q_data_steps_per_second": 19.917,
      "step": 800
    },
    {
      "epoch": 0.7670182166826462,
      "eval_q2p_data_loss": 4.739698886871338,
      "eval_q2p_data_runtime": 15.3694,
      "eval_q2p_data_samples_per_second": 52.832,
      "eval_q2p_data_steps_per_second": 3.318,
      "step": 800
    },
    {
      "epoch": 0.7766059443911792,
      "grad_norm": 11.00167465209961,
      "learning_rate": 8.238636363636365e-06,
      "loss": 4.9577,
      "step": 810
    },
    {
      "epoch": 0.7861936720997124,
      "grad_norm": 0.460358589887619,
      "learning_rate": 8.203125000000001e-06,
      "loss": 4.8974,
      "step": 820
    },
    {
      "epoch": 0.7957813998082455,
      "grad_norm": 10.619705200195312,
      "learning_rate": 8.167613636363637e-06,
      "loss": 5.0033,
      "step": 830
    },
    {
      "epoch": 0.8053691275167785,
      "grad_norm": 0.5667484998703003,
      "learning_rate": 8.132102272727274e-06,
      "loss": 4.976,
      "step": 840
    },
    {
      "epoch": 0.8149568552253116,
      "grad_norm": 12.914066314697266,
      "learning_rate": 8.09659090909091e-06,
      "loss": 4.9915,
      "step": 850
    },
    {
      "epoch": 0.8149568552253116,
      "eval_q2q_data_loss": 5.042208194732666,
      "eval_q2q_data_runtime": 8.496,
      "eval_q2q_data_samples_per_second": 318.386,
      "eval_q2q_data_steps_per_second": 20.009,
      "step": 850
    },
    {
      "epoch": 0.8149568552253116,
      "eval_q2p_data_loss": 4.936696529388428,
      "eval_q2p_data_runtime": 15.4165,
      "eval_q2p_data_samples_per_second": 52.671,
      "eval_q2p_data_steps_per_second": 3.308,
      "step": 850
    },
    {
      "epoch": 0.8245445829338447,
      "grad_norm": 7.874532699584961,
      "learning_rate": 8.061079545454546e-06,
      "loss": 4.9856,
      "step": 860
    },
    {
      "epoch": 0.8341323106423778,
      "grad_norm": 3.6945109367370605,
      "learning_rate": 8.025568181818183e-06,
      "loss": 4.9566,
      "step": 870
    },
    {
      "epoch": 0.8437200383509108,
      "grad_norm": 34.59883117675781,
      "learning_rate": 7.990056818181819e-06,
      "loss": 4.8738,
      "step": 880
    },
    {
      "epoch": 0.8533077660594439,
      "grad_norm": 1.2880325317382812,
      "learning_rate": 7.954545454545455e-06,
      "loss": 4.9258,
      "step": 890
    },
    {
      "epoch": 0.862895493767977,
      "grad_norm": 5.390997886657715,
      "learning_rate": 7.919034090909091e-06,
      "loss": 4.9118,
      "step": 900
    },
    {
      "epoch": 0.862895493767977,
      "eval_q2q_data_loss": 5.003294944763184,
      "eval_q2q_data_runtime": 8.4963,
      "eval_q2q_data_samples_per_second": 318.375,
      "eval_q2q_data_steps_per_second": 20.009,
      "step": 900
    },
    {
      "epoch": 0.862895493767977,
      "eval_q2p_data_loss": 4.794476509094238,
      "eval_q2p_data_runtime": 15.3667,
      "eval_q2p_data_samples_per_second": 52.842,
      "eval_q2p_data_steps_per_second": 3.319,
      "step": 900
    },
    {
      "epoch": 0.87248322147651,
      "grad_norm": 3.2997488975524902,
      "learning_rate": 7.883522727272728e-06,
      "loss": 4.9782,
      "step": 910
    },
    {
      "epoch": 0.8820709491850431,
      "grad_norm": 10.71391773223877,
      "learning_rate": 7.848011363636364e-06,
      "loss": 4.8659,
      "step": 920
    },
    {
      "epoch": 0.8916586768935763,
      "grad_norm": 0.14661180973052979,
      "learning_rate": 7.8125e-06,
      "loss": 4.9197,
      "step": 930
    },
    {
      "epoch": 0.9012464046021093,
      "grad_norm": 0.1432102769613266,
      "learning_rate": 7.776988636363636e-06,
      "loss": 4.9281,
      "step": 940
    },
    {
      "epoch": 0.9108341323106424,
      "grad_norm": 0.13064274191856384,
      "learning_rate": 7.741477272727274e-06,
      "loss": 4.9427,
      "step": 950
    },
    {
      "epoch": 0.9108341323106424,
      "eval_q2q_data_loss": 5.002143383026123,
      "eval_q2q_data_runtime": 8.5053,
      "eval_q2q_data_samples_per_second": 318.036,
      "eval_q2q_data_steps_per_second": 19.988,
      "step": 950
    },
    {
      "epoch": 0.9108341323106424,
      "eval_q2p_data_loss": 4.785708427429199,
      "eval_q2p_data_runtime": 15.3288,
      "eval_q2p_data_samples_per_second": 52.972,
      "eval_q2p_data_steps_per_second": 3.327,
      "step": 950
    },
    {
      "epoch": 0.9204218600191755,
      "grad_norm": 19.881868362426758,
      "learning_rate": 7.70596590909091e-06,
      "loss": 4.8966,
      "step": 960
    },
    {
      "epoch": 0.9300095877277086,
      "grad_norm": 0.11643442511558533,
      "learning_rate": 7.670454545454547e-06,
      "loss": 4.9657,
      "step": 970
    },
    {
      "epoch": 0.9395973154362416,
      "grad_norm": 0.20641827583312988,
      "learning_rate": 7.634943181818183e-06,
      "loss": 4.9597,
      "step": 980
    },
    {
      "epoch": 0.9491850431447747,
      "grad_norm": 0.1226697638630867,
      "learning_rate": 7.599431818181819e-06,
      "loss": 4.9627,
      "step": 990
    },
    {
      "epoch": 0.9587727708533078,
      "grad_norm": 0.17849154770374298,
      "learning_rate": 7.563920454545455e-06,
      "loss": 4.8603,
      "step": 1000
    },
    {
      "epoch": 0.9587727708533078,
      "eval_q2q_data_loss": 5.001661777496338,
      "eval_q2q_data_runtime": 8.4763,
      "eval_q2q_data_samples_per_second": 319.123,
      "eval_q2q_data_steps_per_second": 20.056,
      "step": 1000
    },
    {
      "epoch": 0.9587727708533078,
      "eval_q2p_data_loss": 4.801548004150391,
      "eval_q2p_data_runtime": 15.3711,
      "eval_q2p_data_samples_per_second": 52.827,
      "eval_q2p_data_steps_per_second": 3.318,
      "step": 1000
    },
    {
      "epoch": 0.9683604985618408,
      "grad_norm": 0.11723767966032028,
      "learning_rate": 7.528409090909091e-06,
      "loss": 4.9817,
      "step": 1010
    },
    {
      "epoch": 0.9779482262703739,
      "grad_norm": 0.14676721394062042,
      "learning_rate": 7.4928977272727274e-06,
      "loss": 4.813,
      "step": 1020
    },
    {
      "epoch": 0.987535953978907,
      "grad_norm": 0.18476560711860657,
      "learning_rate": 7.4573863636363646e-06,
      "loss": 4.9688,
      "step": 1030
    },
    {
      "epoch": 0.99712368168744,
      "grad_norm": 12.572381019592285,
      "learning_rate": 7.421875000000001e-06,
      "loss": 4.9802,
      "step": 1040
    },
    {
      "epoch": 1.0067114093959733,
      "grad_norm": 30.89609146118164,
      "learning_rate": 7.386363636363637e-06,
      "loss": 4.8651,
      "step": 1050
    },
    {
      "epoch": 1.0067114093959733,
      "eval_q2q_data_loss": 5.00149393081665,
      "eval_q2q_data_runtime": 8.4886,
      "eval_q2q_data_samples_per_second": 318.661,
      "eval_q2q_data_steps_per_second": 20.027,
      "step": 1050
    },
    {
      "epoch": 1.0067114093959733,
      "eval_q2p_data_loss": 4.796145439147949,
      "eval_q2p_data_runtime": 15.3888,
      "eval_q2p_data_samples_per_second": 52.766,
      "eval_q2p_data_steps_per_second": 3.314,
      "step": 1050
    },
    {
      "epoch": 1.0162991371045063,
      "grad_norm": 15.047320365905762,
      "learning_rate": 7.350852272727273e-06,
      "loss": 4.9286,
      "step": 1060
    },
    {
      "epoch": 1.0258868648130393,
      "grad_norm": 0.20640498399734497,
      "learning_rate": 7.31534090909091e-06,
      "loss": 4.9124,
      "step": 1070
    },
    {
      "epoch": 1.0354745925215725,
      "grad_norm": 5.841845989227295,
      "learning_rate": 7.279829545454547e-06,
      "loss": 4.9927,
      "step": 1080
    },
    {
      "epoch": 1.0450623202301055,
      "grad_norm": 8.321894645690918,
      "learning_rate": 7.244318181818183e-06,
      "loss": 4.9769,
      "step": 1090
    },
    {
      "epoch": 1.0546500479386385,
      "grad_norm": 0.8191462755203247,
      "learning_rate": 7.2088068181818185e-06,
      "loss": 5.0158,
      "step": 1100
    },
    {
      "epoch": 1.0546500479386385,
      "eval_q2q_data_loss": 5.004606246948242,
      "eval_q2q_data_runtime": 8.4874,
      "eval_q2q_data_samples_per_second": 318.708,
      "eval_q2q_data_steps_per_second": 20.03,
      "step": 1100
    },
    {
      "epoch": 1.0546500479386385,
      "eval_q2p_data_loss": 5.120335102081299,
      "eval_q2p_data_runtime": 15.3988,
      "eval_q2p_data_samples_per_second": 52.731,
      "eval_q2p_data_steps_per_second": 3.312,
      "step": 1100
    },
    {
      "epoch": 1.0642377756471717,
      "grad_norm": 6.462870121002197,
      "learning_rate": 7.173295454545455e-06,
      "loss": 5.0234,
      "step": 1110
    },
    {
      "epoch": 1.0738255033557047,
      "grad_norm": 19.973081588745117,
      "learning_rate": 7.137784090909091e-06,
      "loss": 4.9903,
      "step": 1120
    },
    {
      "epoch": 1.0834132310642377,
      "grad_norm": 6.040268898010254,
      "learning_rate": 7.102272727272727e-06,
      "loss": 5.008,
      "step": 1130
    },
    {
      "epoch": 1.093000958772771,
      "grad_norm": 64.06867218017578,
      "learning_rate": 7.066761363636364e-06,
      "loss": 4.9987,
      "step": 1140
    },
    {
      "epoch": 1.102588686481304,
      "grad_norm": 51.97669982910156,
      "learning_rate": 7.031250000000001e-06,
      "loss": 5.0091,
      "step": 1150
    },
    {
      "epoch": 1.102588686481304,
      "eval_q2q_data_loss": 5.01547384262085,
      "eval_q2q_data_runtime": 8.5407,
      "eval_q2q_data_samples_per_second": 316.718,
      "eval_q2q_data_steps_per_second": 19.905,
      "step": 1150
    },
    {
      "epoch": 1.102588686481304,
      "eval_q2p_data_loss": 5.103107929229736,
      "eval_q2p_data_runtime": 15.3784,
      "eval_q2p_data_samples_per_second": 52.801,
      "eval_q2p_data_steps_per_second": 3.316,
      "step": 1150
    },
    {
      "epoch": 1.112176414189837,
      "grad_norm": 10.005661010742188,
      "learning_rate": 6.995738636363637e-06,
      "loss": 5.0562,
      "step": 1160
    },
    {
      "epoch": 1.1217641418983701,
      "grad_norm": 10.467660903930664,
      "learning_rate": 6.960227272727273e-06,
      "loss": 5.0129,
      "step": 1170
    },
    {
      "epoch": 1.1313518696069031,
      "grad_norm": 7.998090744018555,
      "learning_rate": 6.92471590909091e-06,
      "loss": 5.0033,
      "step": 1180
    },
    {
      "epoch": 1.1409395973154361,
      "grad_norm": 3.380247116088867,
      "learning_rate": 6.889204545454547e-06,
      "loss": 4.9961,
      "step": 1190
    },
    {
      "epoch": 1.1505273250239694,
      "grad_norm": 8.895610809326172,
      "learning_rate": 6.853693181818183e-06,
      "loss": 4.988,
      "step": 1200
    },
    {
      "epoch": 1.1505273250239694,
      "eval_q2q_data_loss": 5.000478744506836,
      "eval_q2q_data_runtime": 8.5322,
      "eval_q2q_data_samples_per_second": 317.034,
      "eval_q2q_data_steps_per_second": 19.924,
      "step": 1200
    },
    {
      "epoch": 1.1505273250239694,
      "eval_q2p_data_loss": 5.002507209777832,
      "eval_q2p_data_runtime": 15.3615,
      "eval_q2p_data_samples_per_second": 52.859,
      "eval_q2p_data_steps_per_second": 3.32,
      "step": 1200
    },
    {
      "epoch": 1.1601150527325024,
      "grad_norm": 6.491428852081299,
      "learning_rate": 6.818181818181818e-06,
      "loss": 4.9687,
      "step": 1210
    },
    {
      "epoch": 1.1697027804410354,
      "grad_norm": 4.309035778045654,
      "learning_rate": 6.7826704545454545e-06,
| "loss": 4.9824, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.1792905081495686, | |
| "grad_norm": 2.331423759460449, | |
| "learning_rate": 6.747159090909091e-06, | |
| "loss": 4.9955, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.1888782358581016, | |
| "grad_norm": 3.439713954925537, | |
| "learning_rate": 6.711647727272728e-06, | |
| "loss": 4.9943, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.1984659635666346, | |
| "grad_norm": 7.992236137390137, | |
| "learning_rate": 6.676136363636364e-06, | |
| "loss": 5.0552, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.1984659635666346, | |
| "eval_q2q_data_loss": 5.000186920166016, | |
| "eval_q2q_data_runtime": 8.5162, | |
| "eval_q2q_data_samples_per_second": 317.629, | |
| "eval_q2q_data_steps_per_second": 19.962, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.1984659635666346, | |
| "eval_q2p_data_loss": 5.000546932220459, | |
| "eval_q2p_data_runtime": 15.3961, | |
| "eval_q2p_data_samples_per_second": 52.741, | |
| "eval_q2p_data_steps_per_second": 3.313, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.2080536912751678, | |
| "grad_norm": 3.6224541664123535, | |
| "learning_rate": 6.6406250000000005e-06, | |
| "loss": 5.0073, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.2176414189837008, | |
| "grad_norm": 1.0430936813354492, | |
| "learning_rate": 6.605113636363637e-06, | |
| "loss": 4.9928, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.2272291466922338, | |
| "grad_norm": 3.0630106925964355, | |
| "learning_rate": 6.569602272727274e-06, | |
| "loss": 5.0183, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.236816874400767, | |
| "grad_norm": 4.258161544799805, | |
| "learning_rate": 6.53409090909091e-06, | |
| "loss": 4.9932, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.2464046021093, | |
| "grad_norm": 2.9531047344207764, | |
| "learning_rate": 6.498579545454546e-06, | |
| "loss": 4.9737, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.2464046021093, | |
| "eval_q2q_data_loss": 5.000265121459961, | |
| "eval_q2q_data_runtime": 8.5548, | |
| "eval_q2q_data_samples_per_second": 316.198, | |
| "eval_q2q_data_steps_per_second": 19.872, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.2464046021093, | |
| "eval_q2p_data_loss": 5.00175142288208, | |
| "eval_q2p_data_runtime": 15.3988, | |
| "eval_q2p_data_samples_per_second": 52.731, | |
| "eval_q2p_data_steps_per_second": 3.312, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.255992329817833, | |
| "grad_norm": 7.634608745574951, | |
| "learning_rate": 6.463068181818183e-06, | |
| "loss": 5.012, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.2655800575263663, | |
| "grad_norm": 10.259374618530273, | |
| "learning_rate": 6.427556818181818e-06, | |
| "loss": 5.0138, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.2751677852348993, | |
| "grad_norm": 10.425176620483398, | |
| "learning_rate": 6.392045454545454e-06, | |
| "loss": 5.0107, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.2847555129434325, | |
| "grad_norm": 3.6952784061431885, | |
| "learning_rate": 6.3565340909090915e-06, | |
| "loss": 5.0226, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.2943432406519655, | |
| "grad_norm": 2.3303303718566895, | |
| "learning_rate": 6.321022727272728e-06, | |
| "loss": 4.9827, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.2943432406519655, | |
| "eval_q2q_data_loss": 5.000885009765625, | |
| "eval_q2q_data_runtime": 8.4946, | |
| "eval_q2q_data_samples_per_second": 318.436, | |
| "eval_q2q_data_steps_per_second": 20.013, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.2943432406519655, | |
| "eval_q2p_data_loss": 5.002125263214111, | |
| "eval_q2p_data_runtime": 15.3928, | |
| "eval_q2p_data_samples_per_second": 52.752, | |
| "eval_q2p_data_steps_per_second": 3.313, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.3039309683604985, | |
| "grad_norm": 1.1437593698501587, | |
| "learning_rate": 6.285511363636364e-06, | |
| "loss": 5.0089, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.3135186960690317, | |
| "grad_norm": 3.3491806983947754, | |
| "learning_rate": 6.25e-06, | |
| "loss": 4.9869, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.3231064237775647, | |
| "grad_norm": 4.804921627044678, | |
| "learning_rate": 6.2144886363636366e-06, | |
| "loss": 5.0178, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.332694151486098, | |
| "grad_norm": 3.649508476257324, | |
| "learning_rate": 6.178977272727274e-06, | |
| "loss": 5.0038, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.342281879194631, | |
| "grad_norm": 3.105538845062256, | |
| "learning_rate": 6.14346590909091e-06, | |
| "loss": 4.9761, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.342281879194631, | |
| "eval_q2q_data_loss": 5.000288963317871, | |
| "eval_q2q_data_runtime": 8.4946, | |
| "eval_q2q_data_samples_per_second": 318.436, | |
| "eval_q2q_data_steps_per_second": 20.013, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.342281879194631, | |
| "eval_q2p_data_loss": 5.000768184661865, | |
| "eval_q2p_data_runtime": 15.3448, | |
| "eval_q2p_data_samples_per_second": 52.917, | |
| "eval_q2p_data_steps_per_second": 3.324, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.351869606903164, | |
| "grad_norm": 5.388565540313721, | |
| "learning_rate": 6.107954545454546e-06, | |
| "loss": 5.0025, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.3614573346116972, | |
| "grad_norm": 4.318077564239502, | |
| "learning_rate": 6.0724431818181825e-06, | |
| "loss": 4.9973, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.3710450623202302, | |
| "grad_norm": 5.794456481933594, | |
| "learning_rate": 6.036931818181818e-06, | |
| "loss": 4.9911, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.3806327900287632, | |
| "grad_norm": 7.113480567932129, | |
| "learning_rate": 6.001420454545455e-06, | |
| "loss": 5.0088, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.3902205177372964, | |
| "grad_norm": 4.235409736633301, | |
| "learning_rate": 5.965909090909091e-06, | |
| "loss": 4.986, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.3902205177372964, | |
| "eval_q2q_data_loss": 5.0001349449157715, | |
| "eval_q2q_data_runtime": 8.5502, | |
| "eval_q2q_data_samples_per_second": 316.366, | |
| "eval_q2q_data_steps_per_second": 19.883, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.3902205177372964, | |
| "eval_q2p_data_loss": 5.000503063201904, | |
| "eval_q2p_data_runtime": 15.3601, | |
| "eval_q2p_data_samples_per_second": 52.864, | |
| "eval_q2p_data_steps_per_second": 3.32, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.3998082454458294, | |
| "grad_norm": 0.9855827689170837, | |
| "learning_rate": 5.930397727272728e-06, | |
| "loss": 5.0025, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.4093959731543624, | |
| "grad_norm": 4.243587017059326, | |
| "learning_rate": 5.894886363636364e-06, | |
| "loss": 4.9907, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.4189837008628956, | |
| "grad_norm": 9.807540893554688, | |
| "learning_rate": 5.859375e-06, | |
| "loss": 5.0012, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 3.3579766750335693, | |
| "learning_rate": 5.823863636363637e-06, | |
| "loss": 4.9928, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.4381591562799616, | |
| "grad_norm": 2.363482713699341, | |
| "learning_rate": 5.7883522727272735e-06, | |
| "loss": 4.9955, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.4381591562799616, | |
| "eval_q2q_data_loss": 5.000216960906982, | |
| "eval_q2q_data_runtime": 8.5231, | |
| "eval_q2q_data_samples_per_second": 317.374, | |
| "eval_q2q_data_steps_per_second": 19.946, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.4381591562799616, | |
| "eval_q2p_data_loss": 5.000642776489258, | |
| "eval_q2p_data_runtime": 15.3802, | |
| "eval_q2p_data_samples_per_second": 52.795, | |
| "eval_q2p_data_steps_per_second": 3.316, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.4477468839884948, | |
| "grad_norm": 2.8971104621887207, | |
| "learning_rate": 5.75284090909091e-06, | |
| "loss": 4.9952, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.4573346116970278, | |
| "grad_norm": 4.56306266784668, | |
| "learning_rate": 5.717329545454546e-06, | |
| "loss": 4.9875, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.4669223394055608, | |
| "grad_norm": 3.592824935913086, | |
| "learning_rate": 5.681818181818183e-06, | |
| "loss": 5.0027, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.476510067114094, | |
| "grad_norm": 6.926996231079102, | |
| "learning_rate": 5.646306818181818e-06, | |
| "loss": 4.963, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.486097794822627, | |
| "grad_norm": 8.679203987121582, | |
| "learning_rate": 5.610795454545455e-06, | |
| "loss": 4.9662, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.486097794822627, | |
| "eval_q2q_data_loss": 5.001591205596924, | |
| "eval_q2q_data_runtime": 8.4686, | |
| "eval_q2q_data_samples_per_second": 319.414, | |
| "eval_q2q_data_steps_per_second": 20.074, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.486097794822627, | |
| "eval_q2p_data_loss": 5.006067276000977, | |
| "eval_q2p_data_runtime": 15.3614, | |
| "eval_q2p_data_samples_per_second": 52.86, | |
| "eval_q2p_data_steps_per_second": 3.32, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.49568552253116, | |
| "grad_norm": 11.07398796081543, | |
| "learning_rate": 5.575284090909091e-06, | |
| "loss": 4.9284, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.5052732502396933, | |
| "grad_norm": 13.813140869140625, | |
| "learning_rate": 5.539772727272727e-06, | |
| "loss": 4.9773, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.5148609779482263, | |
| "grad_norm": 32.947540283203125, | |
| "learning_rate": 5.504261363636364e-06, | |
| "loss": 5.0154, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.5244487056567593, | |
| "grad_norm": 57.005271911621094, | |
| "learning_rate": 5.468750000000001e-06, | |
| "loss": 4.9956, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.5340364333652925, | |
| "grad_norm": 21.25840187072754, | |
| "learning_rate": 5.433238636363637e-06, | |
| "loss": 5.0147, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.5340364333652925, | |
| "eval_q2q_data_loss": 5.015188694000244, | |
| "eval_q2q_data_runtime": 8.4996, | |
| "eval_q2q_data_samples_per_second": 318.25, | |
| "eval_q2q_data_steps_per_second": 20.001, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.5340364333652925, | |
| "eval_q2p_data_loss": 5.062190532684326, | |
| "eval_q2p_data_runtime": 15.3191, | |
| "eval_q2p_data_samples_per_second": 53.006, | |
| "eval_q2p_data_steps_per_second": 3.329, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.5436241610738255, | |
| "grad_norm": 23.927370071411133, | |
| "learning_rate": 5.397727272727273e-06, | |
| "loss": 5.0216, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.5532118887823585, | |
| "grad_norm": 29.68376350402832, | |
| "learning_rate": 5.36221590909091e-06, | |
| "loss": 5.0276, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.5627996164908917, | |
| "grad_norm": 56.62722396850586, | |
| "learning_rate": 5.326704545454546e-06, | |
| "loss": 5.0115, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.5723873441994247, | |
| "grad_norm": 30.375343322753906, | |
| "learning_rate": 5.291193181818183e-06, | |
| "loss": 4.9836, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.5819750719079577, | |
| "grad_norm": 7.980493068695068, | |
| "learning_rate": 5.255681818181818e-06, | |
| "loss": 5.0171, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.5819750719079577, | |
| "eval_q2q_data_loss": 5.000085353851318, | |
| "eval_q2q_data_runtime": 8.4882, | |
| "eval_q2q_data_samples_per_second": 318.678, | |
| "eval_q2q_data_steps_per_second": 20.028, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.5819750719079577, | |
| "eval_q2p_data_loss": 5.002185821533203, | |
| "eval_q2p_data_runtime": 15.3825, | |
| "eval_q2p_data_samples_per_second": 52.787, | |
| "eval_q2p_data_steps_per_second": 3.315, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.591562799616491, | |
| "grad_norm": 12.629569053649902, | |
| "learning_rate": 5.220170454545455e-06, | |
| "loss": 5.0266, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.601150527325024, | |
| "grad_norm": 26.266088485717773, | |
| "learning_rate": 5.184659090909091e-06, | |
| "loss": 4.9617, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.610738255033557, | |
| "grad_norm": 12.034894943237305, | |
| "learning_rate": 5.149147727272727e-06, | |
| "loss": 4.9691, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.6203259827420902, | |
| "grad_norm": 27.641963958740234, | |
| "learning_rate": 5.113636363636364e-06, | |
| "loss": 5.0004, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.6299137104506232, | |
| "grad_norm": 30.945240020751953, | |
| "learning_rate": 5.078125000000001e-06, | |
| "loss": 5.0173, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.6299137104506232, | |
| "eval_q2q_data_loss": 5.039857387542725, | |
| "eval_q2q_data_runtime": 8.4631, | |
| "eval_q2q_data_samples_per_second": 319.624, | |
| "eval_q2q_data_steps_per_second": 20.087, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.6299137104506232, | |
| "eval_q2p_data_loss": 5.0407586097717285, | |
| "eval_q2p_data_runtime": 15.3308, | |
| "eval_q2p_data_samples_per_second": 52.965, | |
| "eval_q2p_data_steps_per_second": 3.327, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.6395014381591562, | |
| "grad_norm": 38.697303771972656, | |
| "learning_rate": 5.042613636363637e-06, | |
| "loss": 4.9824, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.6490891658676894, | |
| "grad_norm": 1.1715205907821655, | |
| "learning_rate": 5.007102272727273e-06, | |
| "loss": 5.0099, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.6586768935762224, | |
| "grad_norm": 1.030447006225586, | |
| "learning_rate": 4.9715909090909094e-06, | |
| "loss": 5.003, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.6682646212847554, | |
| "grad_norm": 0.6143599152565002, | |
| "learning_rate": 4.936079545454546e-06, | |
| "loss": 5.0039, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.6778523489932886, | |
| "grad_norm": 0.31595391035079956, | |
| "learning_rate": 4.900568181818182e-06, | |
| "loss": 5.0031, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.6778523489932886, | |
| "eval_q2q_data_loss": 5.0020527839660645, | |
| "eval_q2q_data_runtime": 8.472, | |
| "eval_q2q_data_samples_per_second": 319.285, | |
| "eval_q2q_data_steps_per_second": 20.066, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.6778523489932886, | |
| "eval_q2p_data_loss": 5.010634422302246, | |
| "eval_q2p_data_runtime": 15.3164, | |
| "eval_q2p_data_samples_per_second": 53.015, | |
| "eval_q2p_data_steps_per_second": 3.33, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.6874400767018218, | |
| "grad_norm": 0.3842555284500122, | |
| "learning_rate": 4.865056818181818e-06, | |
| "loss": 4.9992, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.6970278044103546, | |
| "grad_norm": 0.3934996426105499, | |
| "learning_rate": 4.829545454545455e-06, | |
| "loss": 4.9997, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.7066155321188878, | |
| "grad_norm": 0.3144057095050812, | |
| "learning_rate": 4.794034090909092e-06, | |
| "loss": 4.9999, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.716203259827421, | |
| "grad_norm": 0.33490219712257385, | |
| "learning_rate": 4.758522727272727e-06, | |
| "loss": 5.0022, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.7257909875359538, | |
| "grad_norm": 0.35593223571777344, | |
| "learning_rate": 4.723011363636364e-06, | |
| "loss": 4.9988, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.7257909875359538, | |
| "eval_q2q_data_loss": 5.001664638519287, | |
| "eval_q2q_data_runtime": 8.4874, | |
| "eval_q2q_data_samples_per_second": 318.706, | |
| "eval_q2q_data_steps_per_second": 20.03, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.7257909875359538, | |
| "eval_q2p_data_loss": 5.009975433349609, | |
| "eval_q2p_data_runtime": 15.3185, | |
| "eval_q2p_data_samples_per_second": 53.008, | |
| "eval_q2p_data_steps_per_second": 3.329, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.735378715244487, | |
| "grad_norm": 0.5832622051239014, | |
| "learning_rate": 4.6875000000000004e-06, | |
| "loss": 4.9987, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.7449664429530203, | |
| "grad_norm": 0.4001566171646118, | |
| "learning_rate": 4.651988636363637e-06, | |
| "loss": 5.0029, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.754554170661553, | |
| "grad_norm": 1.2833226919174194, | |
| "learning_rate": 4.616477272727273e-06, | |
| "loss": 4.9949, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.7641418983700863, | |
| "grad_norm": 0.7543688416481018, | |
| "learning_rate": 4.580965909090909e-06, | |
| "loss": 4.999, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.7737296260786195, | |
| "grad_norm": 0.7849061489105225, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "loss": 5.0017, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.7737296260786195, | |
| "eval_q2q_data_loss": 5.003254413604736, | |
| "eval_q2q_data_runtime": 8.5165, | |
| "eval_q2q_data_samples_per_second": 317.618, | |
| "eval_q2q_data_steps_per_second": 19.961, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.7737296260786195, | |
| "eval_q2p_data_loss": 4.987276077270508, | |
| "eval_q2p_data_runtime": 15.3548, | |
| "eval_q2p_data_samples_per_second": 52.882, | |
| "eval_q2p_data_steps_per_second": 3.321, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.7833173537871523, | |
| "grad_norm": 12.080714225769043, | |
| "learning_rate": 4.509943181818182e-06, | |
| "loss": 4.9866, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.7929050814956855, | |
| "grad_norm": 1.030135989189148, | |
| "learning_rate": 4.474431818181819e-06, | |
| "loss": 4.9976, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.8024928092042187, | |
| "grad_norm": 2.636124610900879, | |
| "learning_rate": 4.438920454545455e-06, | |
| "loss": 4.9784, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.8120805369127517, | |
| "grad_norm": 51.49758529663086, | |
| "learning_rate": 4.4034090909090914e-06, | |
| "loss": 4.9824, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.8216682646212847, | |
| "grad_norm": 59.32814025878906, | |
| "learning_rate": 4.367897727272728e-06, | |
| "loss": 4.9945, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.8216682646212847, | |
| "eval_q2q_data_loss": 5.014230251312256, | |
| "eval_q2q_data_runtime": 8.519, | |
| "eval_q2q_data_samples_per_second": 317.527, | |
| "eval_q2q_data_steps_per_second": 19.955, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.8216682646212847, | |
| "eval_q2p_data_loss": 5.155740737915039, | |
| "eval_q2p_data_runtime": 15.3763, | |
| "eval_q2p_data_samples_per_second": 52.808, | |
| "eval_q2p_data_steps_per_second": 3.317, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.831255992329818, | |
| "grad_norm": 10.061817169189453, | |
| "learning_rate": 4.332386363636364e-06, | |
| "loss": 4.9445, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.840843720038351, | |
| "grad_norm": 1.1698871850967407, | |
| "learning_rate": 4.296875e-06, | |
| "loss": 4.9477, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.850431447746884, | |
| "grad_norm": 0.6934572458267212, | |
| "learning_rate": 4.2613636363636365e-06, | |
| "loss": 5.0047, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.8600191754554172, | |
| "grad_norm": 18.0229434967041, | |
| "learning_rate": 4.225852272727274e-06, | |
| "loss": 4.9307, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.8696069031639502, | |
| "grad_norm": 8.73933219909668, | |
| "learning_rate": 4.190340909090909e-06, | |
| "loss": 4.9634, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.8696069031639502, | |
| "eval_q2q_data_loss": 5.002269268035889, | |
| "eval_q2q_data_runtime": 8.4962, | |
| "eval_q2q_data_samples_per_second": 318.378, | |
| "eval_q2q_data_steps_per_second": 20.009, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.8696069031639502, | |
| "eval_q2p_data_loss": 4.8260931968688965, | |
| "eval_q2p_data_runtime": 15.3516, | |
| "eval_q2p_data_samples_per_second": 52.894, | |
| "eval_q2p_data_steps_per_second": 3.322, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.8791946308724832, | |
| "grad_norm": 1.5762324333190918, | |
| "learning_rate": 4.154829545454545e-06, | |
| "loss": 4.9791, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.8887823585810164, | |
| "grad_norm": 0.3121432363986969, | |
| "learning_rate": 4.1193181818181825e-06, | |
| "loss": 4.9792, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.8983700862895494, | |
| "grad_norm": 1.5927631855010986, | |
| "learning_rate": 4.083806818181819e-06, | |
| "loss": 4.9041, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.9079578139980824, | |
| "grad_norm": 14.304738998413086, | |
| "learning_rate": 4.048295454545455e-06, | |
| "loss": 4.9349, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.9175455417066156, | |
| "grad_norm": 0.2702763080596924, | |
| "learning_rate": 4.012784090909091e-06, | |
| "loss": 4.8942, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.9175455417066156, | |
| "eval_q2q_data_loss": 5.001285076141357, | |
| "eval_q2q_data_runtime": 8.47, | |
| "eval_q2q_data_samples_per_second": 319.362, | |
| "eval_q2q_data_steps_per_second": 20.071, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.9175455417066156, | |
| "eval_q2p_data_loss": 4.750080585479736, | |
| "eval_q2p_data_runtime": 15.3459, | |
| "eval_q2p_data_samples_per_second": 52.913, | |
| "eval_q2p_data_steps_per_second": 3.323, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.9271332694151486, | |
| "grad_norm": 0.2623966634273529, | |
| "learning_rate": 3.9772727272727275e-06, | |
| "loss": 4.9871, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.9367209971236816, | |
| "grad_norm": 0.24292069673538208, | |
| "learning_rate": 3.941761363636364e-06, | |
| "loss": 4.9631, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.9463087248322148, | |
| "grad_norm": 0.2756921947002411, | |
| "learning_rate": 3.90625e-06, | |
| "loss": 4.9604, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.9558964525407478, | |
| "grad_norm": 0.2825332581996918, | |
| "learning_rate": 3.870738636363637e-06, | |
| "loss": 4.9346, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.9654841802492808, | |
| "grad_norm": 0.2173183411359787, | |
| "learning_rate": 3.8352272727272735e-06, | |
| "loss": 4.9398, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.9654841802492808, | |
| "eval_q2q_data_loss": 5.001183032989502, | |
| "eval_q2q_data_runtime": 8.5081, | |
| "eval_q2q_data_samples_per_second": 317.931, | |
| "eval_q2q_data_steps_per_second": 19.981, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.9654841802492808, | |
| "eval_q2p_data_loss": 4.761696815490723, | |
| "eval_q2p_data_runtime": 15.3478, | |
| "eval_q2p_data_samples_per_second": 52.907, | |
| "eval_q2p_data_steps_per_second": 3.323, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.975071907957814, | |
| "grad_norm": 16.142738342285156, | |
| "learning_rate": 3.7997159090909093e-06, | |
| "loss": 4.9262, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.984659635666347, | |
| "grad_norm": 0.2226814180612564, | |
| "learning_rate": 3.7642045454545456e-06, | |
| "loss": 4.9505, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.99424736337488, | |
| "grad_norm": 0.22450749576091766, | |
| "learning_rate": 3.7286931818181823e-06, | |
| "loss": 4.9667, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.0038350910834133, | |
| "grad_norm": 18.707637786865234, | |
| "learning_rate": 3.6931818181818186e-06, | |
| "loss": 4.8763, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.0134228187919465, | |
| "grad_norm": 0.2756267189979553, | |
| "learning_rate": 3.657670454545455e-06, | |
| "loss": 4.9116, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.0134228187919465, | |
| "eval_q2q_data_loss": 5.001041412353516, | |
| "eval_q2q_data_runtime": 8.4882, | |
| "eval_q2q_data_samples_per_second": 318.678, | |
| "eval_q2q_data_steps_per_second": 20.028, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.0134228187919465, | |
| "eval_q2p_data_loss": 4.771986961364746, | |
| "eval_q2p_data_runtime": 15.3318, | |
| "eval_q2p_data_samples_per_second": 52.962, | |
| "eval_q2p_data_steps_per_second": 3.326, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.0230105465004793, | |
| "grad_norm": 0.19571331143379211, | |
| "learning_rate": 3.6221590909090915e-06, | |
| "loss": 4.9367, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.0325982742090125, | |
| "grad_norm": 0.21739406883716583, | |
| "learning_rate": 3.5866477272727274e-06, | |
| "loss": 4.9546, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.0421860019175457, | |
| "grad_norm": 1.4178483486175537, | |
| "learning_rate": 3.5511363636363636e-06, | |
| "loss": 4.9743, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.0517737296260785, | |
| "grad_norm": 0.20393171906471252, | |
| "learning_rate": 3.5156250000000003e-06, | |
| "loss": 4.9795, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.0613614573346117, | |
| "grad_norm": 0.18679551780223846, | |
| "learning_rate": 3.4801136363636366e-06, | |
| "loss": 4.9647, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.0613614573346117, | |
| "eval_q2q_data_loss": 5.0010271072387695, | |
| "eval_q2q_data_runtime": 8.5086, | |
| "eval_q2q_data_samples_per_second": 317.913, | |
| "eval_q2q_data_steps_per_second": 19.98, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.0613614573346117, | |
| "eval_q2p_data_loss": 4.773245811462402, | |
| "eval_q2p_data_runtime": 15.3323, | |
| "eval_q2p_data_samples_per_second": 52.96, | |
| "eval_q2p_data_steps_per_second": 3.326, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.070949185043145, | |
| "grad_norm": 10.774163246154785, | |
| "learning_rate": 3.4446022727272733e-06, | |
| "loss": 4.9856, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.0805369127516777, | |
| "grad_norm": 0.229711651802063, | |
| "learning_rate": 3.409090909090909e-06, | |
| "loss": 4.9553, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.090124640460211, | |
| "grad_norm": 12.86821174621582, | |
| "learning_rate": 3.3735795454545454e-06, | |
| "loss": 4.9479, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.099712368168744, | |
| "grad_norm": 0.19190755486488342, | |
| "learning_rate": 3.338068181818182e-06, | |
| "loss": 4.9672, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.109300095877277, | |
| "grad_norm": 6.124110698699951, | |
| "learning_rate": 3.3025568181818184e-06, | |
| "loss": 4.9645, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.109300095877277, | |
| "eval_q2q_data_loss": 5.001131057739258, | |
| "eval_q2q_data_runtime": 8.4876, | |
| "eval_q2q_data_samples_per_second": 318.702, | |
| "eval_q2q_data_steps_per_second": 20.029, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.109300095877277, | |
| "eval_q2p_data_loss": 4.75758171081543, | |
| "eval_q2p_data_runtime": 15.4135, | |
| "eval_q2p_data_samples_per_second": 52.681, | |
| "eval_q2p_data_steps_per_second": 3.309, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.11888782358581, | |
| "grad_norm": 3.4443752765655518, | |
| "learning_rate": 3.267045454545455e-06, | |
| "loss": 4.9299, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.1284755512943434, | |
| "grad_norm": 0.27355676889419556, | |
| "learning_rate": 3.2315340909090913e-06, | |
| "loss": 4.9777, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.138063279002876, | |
| "grad_norm": 6.125870227813721, | |
| "learning_rate": 3.196022727272727e-06, | |
| "loss": 4.94, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.1476510067114094, | |
| "grad_norm": 23.490581512451172, | |
| "learning_rate": 3.160511363636364e-06, | |
| "loss": 4.978, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.1572387344199426, | |
| "grad_norm": 9.1142578125, | |
| "learning_rate": 3.125e-06, | |
| "loss": 4.968, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.1572387344199426, | |
| "eval_q2q_data_loss": 4.999406814575195, | |
| "eval_q2q_data_runtime": 8.4764, | |
| "eval_q2q_data_samples_per_second": 319.121, | |
| "eval_q2q_data_steps_per_second": 20.056, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.1572387344199426, | |
| "eval_q2p_data_loss": 4.755669116973877, | |
| "eval_q2p_data_runtime": 15.4053, | |
| "eval_q2p_data_samples_per_second": 52.709, | |
| "eval_q2p_data_steps_per_second": 3.311, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.1668264621284754, | |
| "grad_norm": 0.5820243954658508, | |
| "learning_rate": 3.089488636363637e-06, | |
| "loss": 4.9512, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.1764141898370086, | |
| "grad_norm": 0.20500487089157104, | |
| "learning_rate": 3.053977272727273e-06, | |
| "loss": 4.9539, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.186001917545542, | |
| "grad_norm": 0.18161769211292267, | |
| "learning_rate": 3.018465909090909e-06, | |
| "loss": 4.9508, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.1955896452540746, | |
| "grad_norm": 0.19371207058429718, | |
| "learning_rate": 2.9829545454545457e-06, | |
| "loss": 4.8871, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.205177372962608, | |
| "grad_norm": 0.2863902747631073, | |
| "learning_rate": 2.947443181818182e-06, | |
| "loss": 4.909, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.205177372962608, | |
| "eval_q2q_data_loss": 5.001042366027832, | |
| "eval_q2q_data_runtime": 8.4998, | |
| "eval_q2q_data_samples_per_second": 318.244, | |
| "eval_q2q_data_steps_per_second": 20.001, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.205177372962608, | |
| "eval_q2p_data_loss": 4.744427680969238, | |
| "eval_q2p_data_runtime": 15.3338, | |
| "eval_q2p_data_samples_per_second": 52.955, | |
| "eval_q2p_data_steps_per_second": 3.326, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.214765100671141, | |
| "grad_norm": 0.21279603242874146, | |
| "learning_rate": 2.9119318181818186e-06, | |
| "loss": 4.9587, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.224352828379674, | |
| "grad_norm": 0.18541747331619263, | |
| "learning_rate": 2.876420454545455e-06, | |
| "loss": 4.8956, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.233940556088207, | |
| "grad_norm": 0.22428183257579803, | |
| "learning_rate": 2.8409090909090916e-06, | |
| "loss": 4.9891, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.2435282837967403, | |
| "grad_norm": 12.067822456359863, | |
| "learning_rate": 2.8053977272727274e-06, | |
| "loss": 4.8795, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.253116011505273, | |
| "grad_norm": 7.028346061706543, | |
| "learning_rate": 2.7698863636363637e-06, | |
| "loss": 4.887, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.253116011505273, | |
| "eval_q2q_data_loss": 5.001026630401611, | |
| "eval_q2q_data_runtime": 8.487, | |
| "eval_q2q_data_samples_per_second": 318.721, | |
| "eval_q2q_data_steps_per_second": 20.031, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.253116011505273, | |
| "eval_q2p_data_loss": 4.744780540466309, | |
| "eval_q2p_data_runtime": 15.3798, | |
| "eval_q2p_data_samples_per_second": 52.796, | |
| "eval_q2p_data_steps_per_second": 3.316, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.2627037392138063, | |
| "grad_norm": 0.15497416257858276, | |
| "learning_rate": 2.7343750000000004e-06, | |
| "loss": 4.9723, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.2722914669223395, | |
| "grad_norm": 0.14897240698337555, | |
| "learning_rate": 2.6988636363636367e-06, | |
| "loss": 4.8967, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.2818791946308723, | |
| "grad_norm": 6.019428730010986, | |
| "learning_rate": 2.663352272727273e-06, | |
| "loss": 4.8975, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.2914669223394055, | |
| "grad_norm": 7.852274417877197, | |
| "learning_rate": 2.627840909090909e-06, | |
| "loss": 4.9177, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.3010546500479387, | |
| "grad_norm": 128.83132934570312, | |
| "learning_rate": 2.5923295454545455e-06, | |
| "loss": 4.9272, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.3010546500479387, | |
| "eval_q2q_data_loss": 5.000960350036621, | |
| "eval_q2q_data_runtime": 8.4827, | |
| "eval_q2q_data_samples_per_second": 318.882, | |
| "eval_q2q_data_steps_per_second": 20.041, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.3010546500479387, | |
| "eval_q2p_data_loss": 4.7287445068359375, | |
| "eval_q2p_data_runtime": 15.3674, | |
| "eval_q2p_data_samples_per_second": 52.839, | |
| "eval_q2p_data_steps_per_second": 3.319, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.310642377756472, | |
| "grad_norm": 0.1605680286884308, | |
| "learning_rate": 2.556818181818182e-06, | |
| "loss": 4.9283, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.3202301054650047, | |
| "grad_norm": 25.14031982421875, | |
| "learning_rate": 2.5213068181818184e-06, | |
| "loss": 4.9061, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.329817833173538, | |
| "grad_norm": 0.1336502879858017, | |
| "learning_rate": 2.4857954545454547e-06, | |
| "loss": 4.9279, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.3394055608820707, | |
| "grad_norm": 0.5942106246948242, | |
| "learning_rate": 2.450284090909091e-06, | |
| "loss": 4.9856, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.348993288590604, | |
| "grad_norm": 6.196929454803467, | |
| "learning_rate": 2.4147727272727277e-06, | |
| "loss": 4.8988, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.348993288590604, | |
| "eval_q2q_data_loss": 5.000965118408203, | |
| "eval_q2q_data_runtime": 8.4496, | |
| "eval_q2q_data_samples_per_second": 320.134, | |
| "eval_q2q_data_steps_per_second": 20.119, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.348993288590604, | |
| "eval_q2p_data_loss": 4.726756572723389, | |
| "eval_q2p_data_runtime": 15.3322, | |
| "eval_q2p_data_samples_per_second": 52.96, | |
| "eval_q2p_data_steps_per_second": 3.326, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.358581016299137, | |
| "grad_norm": 0.11395616829395294, | |
| "learning_rate": 2.3792613636363635e-06, | |
| "loss": 4.9269, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.3681687440076704, | |
| "grad_norm": 0.14515432715415955, | |
| "learning_rate": 2.3437500000000002e-06, | |
| "loss": 4.9318, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.377756471716203, | |
| "grad_norm": 2.5160467624664307, | |
| "learning_rate": 2.3082386363636365e-06, | |
| "loss": 4.8814, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.3873441994247364, | |
| "grad_norm": 0.1416112333536148, | |
| "learning_rate": 2.2727272727272728e-06, | |
| "loss": 4.9912, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.396931927133269, | |
| "grad_norm": 10.503127098083496, | |
| "learning_rate": 2.2372159090909095e-06, | |
| "loss": 4.9226, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.396931927133269, | |
| "eval_q2q_data_loss": 5.000875949859619, | |
| "eval_q2q_data_runtime": 8.4684, | |
| "eval_q2q_data_samples_per_second": 319.422, | |
| "eval_q2q_data_steps_per_second": 20.075, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.396931927133269, | |
| "eval_q2p_data_loss": 4.719711780548096, | |
| "eval_q2p_data_runtime": 15.359, | |
| "eval_q2p_data_samples_per_second": 52.868, | |
| "eval_q2p_data_steps_per_second": 3.321, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.4065196548418024, | |
| "grad_norm": 0.14310245215892792, | |
| "learning_rate": 2.2017045454545457e-06, | |
| "loss": 4.9437, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.4161073825503356, | |
| "grad_norm": 0.12047765403985977, | |
| "learning_rate": 2.166193181818182e-06, | |
| "loss": 4.9553, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.425695110258869, | |
| "grad_norm": 0.1301940679550171, | |
| "learning_rate": 2.1306818181818183e-06, | |
| "loss": 4.9355, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.4352828379674016, | |
| "grad_norm": 0.42147210240364075, | |
| "learning_rate": 2.0951704545454545e-06, | |
| "loss": 4.9063, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.444870565675935, | |
| "grad_norm": 44.65216064453125, | |
| "learning_rate": 2.0596590909090912e-06, | |
| "loss": 4.9095, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.444870565675935, | |
| "eval_q2q_data_loss": 4.99726676940918, | |
| "eval_q2q_data_runtime": 8.4873, | |
| "eval_q2q_data_samples_per_second": 318.711, | |
| "eval_q2q_data_steps_per_second": 20.03, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.444870565675935, | |
| "eval_q2p_data_loss": 4.74806547164917, | |
| "eval_q2p_data_runtime": 15.3525, | |
| "eval_q2p_data_samples_per_second": 52.891, | |
| "eval_q2p_data_steps_per_second": 3.322, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.4544582933844676, | |
| "grad_norm": 22.98095703125, | |
| "learning_rate": 2.0241477272727275e-06, | |
| "loss": 4.9624, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.464046021093001, | |
| "grad_norm": 0.5905591249465942, | |
| "learning_rate": 1.9886363636363638e-06, | |
| "loss": 4.9731, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.473633748801534, | |
| "grad_norm": 24.247333526611328, | |
| "learning_rate": 1.953125e-06, | |
| "loss": 4.9156, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.4832214765100673, | |
| "grad_norm": 32.6563720703125, | |
| "learning_rate": 1.9176136363636367e-06, | |
| "loss": 4.8714, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.4928092042186, | |
| "grad_norm": 36.43191146850586, | |
| "learning_rate": 1.8821022727272728e-06, | |
| "loss": 4.9532, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.4928092042186, | |
| "eval_q2q_data_loss": 5.000910758972168, | |
| "eval_q2q_data_runtime": 8.4722, | |
| "eval_q2q_data_samples_per_second": 319.28, | |
| "eval_q2q_data_steps_per_second": 20.066, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.4928092042186, | |
| "eval_q2p_data_loss": 4.732726573944092, | |
| "eval_q2p_data_runtime": 15.3101, | |
| "eval_q2p_data_samples_per_second": 53.037, | |
| "eval_q2p_data_steps_per_second": 3.331, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.5023969319271333, | |
| "grad_norm": 6.501353740692139, | |
| "learning_rate": 1.8465909090909093e-06, | |
| "loss": 4.9196, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.511984659635666, | |
| "grad_norm": 57.751441955566406, | |
| "learning_rate": 1.8110795454545458e-06, | |
| "loss": 4.9477, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.5215723873441993, | |
| "grad_norm": 0.12283805757761002, | |
| "learning_rate": 1.7755681818181818e-06, | |
| "loss": 4.9725, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.5311601150527325, | |
| "grad_norm": 17.9443302154541, | |
| "learning_rate": 1.7400568181818183e-06, | |
| "loss": 4.9483, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.5407478427612658, | |
| "grad_norm": 0.27849340438842773, | |
| "learning_rate": 1.7045454545454546e-06, | |
| "loss": 4.9124, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.5407478427612658, | |
| "eval_q2q_data_loss": 5.000847339630127, | |
| "eval_q2q_data_runtime": 8.4514, | |
| "eval_q2q_data_samples_per_second": 320.064, | |
| "eval_q2q_data_steps_per_second": 20.115, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.5407478427612658, | |
| "eval_q2p_data_loss": 4.775162220001221, | |
| "eval_q2p_data_runtime": 15.3209, | |
| "eval_q2p_data_samples_per_second": 53.0, | |
| "eval_q2p_data_steps_per_second": 3.329, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.5503355704697985, | |
| "grad_norm": 0.1170654371380806, | |
| "learning_rate": 1.669034090909091e-06, | |
| "loss": 4.9056, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.5599232981783318, | |
| "grad_norm": 9.846685409545898, | |
| "learning_rate": 1.6335227272727275e-06, | |
| "loss": 4.9396, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.569511025886865, | |
| "grad_norm": 0.1312805712223053, | |
| "learning_rate": 1.5980113636363636e-06, | |
| "loss": 4.9472, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.5790987535953978, | |
| "grad_norm": 0.16425052285194397, | |
| "learning_rate": 1.5625e-06, | |
| "loss": 4.9322, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.588686481303931, | |
| "grad_norm": 26.310592651367188, | |
| "learning_rate": 1.5269886363636366e-06, | |
| "loss": 4.9147, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.588686481303931, | |
| "eval_q2q_data_loss": 5.000824928283691, | |
| "eval_q2q_data_runtime": 8.4934, | |
| "eval_q2q_data_samples_per_second": 318.482, | |
| "eval_q2q_data_steps_per_second": 20.016, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.588686481303931, | |
| "eval_q2p_data_loss": 4.735974311828613, | |
| "eval_q2p_data_runtime": 15.3216, | |
| "eval_q2p_data_samples_per_second": 52.997, | |
| "eval_q2p_data_steps_per_second": 3.329, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.598274209012464, | |
| "grad_norm": 0.11873164027929306, | |
| "learning_rate": 1.4914772727272728e-06, | |
| "loss": 4.9511, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.607861936720997, | |
| "grad_norm": 0.11559820920228958, | |
| "learning_rate": 1.4559659090909093e-06, | |
| "loss": 4.9229, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.61744966442953, | |
| "grad_norm": 0.1333041489124298, | |
| "learning_rate": 1.4204545454545458e-06, | |
| "loss": 4.9207, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.6270373921380634, | |
| "grad_norm": 0.16187268495559692, | |
| "learning_rate": 1.3849431818181819e-06, | |
| "loss": 4.9695, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.636625119846596, | |
| "grad_norm": 40.309261322021484, | |
| "learning_rate": 1.3494318181818183e-06, | |
| "loss": 4.8886, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.636625119846596, | |
| "eval_q2q_data_loss": 5.0007758140563965, | |
| "eval_q2q_data_runtime": 8.4851, | |
| "eval_q2q_data_samples_per_second": 318.795, | |
| "eval_q2q_data_steps_per_second": 20.035, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.636625119846596, | |
| "eval_q2p_data_loss": 4.76162052154541, | |
| "eval_q2p_data_runtime": 15.3319, | |
| "eval_q2p_data_samples_per_second": 52.961, | |
| "eval_q2p_data_steps_per_second": 3.326, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.6462128475551294, | |
| "grad_norm": 0.11944945156574249, | |
| "learning_rate": 1.3139204545454546e-06, | |
| "loss": 4.9878, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.6558005752636626, | |
| "grad_norm": 0.1411992311477661, | |
| "learning_rate": 1.278409090909091e-06, | |
| "loss": 4.9647, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.665388302972196, | |
| "grad_norm": 0.11750555783510208, | |
| "learning_rate": 1.2428977272727274e-06, | |
| "loss": 4.9552, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.6749760306807286, | |
| "grad_norm": 12.01413631439209, | |
| "learning_rate": 1.2073863636363638e-06, | |
| "loss": 5.0171, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.684563758389262, | |
| "grad_norm": 39.38778305053711, | |
| "learning_rate": 1.1718750000000001e-06, | |
| "loss": 4.9379, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.684563758389262, | |
| "eval_q2q_data_loss": 5.0007734298706055, | |
| "eval_q2q_data_runtime": 8.5072, | |
| "eval_q2q_data_samples_per_second": 317.965, | |
| "eval_q2q_data_steps_per_second": 19.983, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.684563758389262, | |
| "eval_q2p_data_loss": 4.756326198577881, | |
| "eval_q2p_data_runtime": 15.3794, | |
| "eval_q2p_data_samples_per_second": 52.798, | |
| "eval_q2p_data_steps_per_second": 3.316, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.6941514860977946, | |
| "grad_norm": 0.2822560966014862, | |
| "learning_rate": 1.1363636363636364e-06, | |
| "loss": 4.9727, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.703739213806328, | |
| "grad_norm": 0.9750680923461914, | |
| "learning_rate": 1.1008522727272729e-06, | |
| "loss": 4.9798, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.713326941514861, | |
| "grad_norm": 12.072766304016113, | |
| "learning_rate": 1.0653409090909091e-06, | |
| "loss": 4.9726, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.7229146692233943, | |
| "grad_norm": 24.833826065063477, | |
| "learning_rate": 1.0298295454545456e-06, | |
| "loss": 4.956, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.732502396931927, | |
| "grad_norm": 15.921252250671387, | |
| "learning_rate": 9.943181818181819e-07, | |
| "loss": 4.9512, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.732502396931927, | |
| "eval_q2q_data_loss": 5.000742435455322, | |
| "eval_q2q_data_runtime": 8.4355, | |
| "eval_q2q_data_samples_per_second": 320.669, | |
| "eval_q2q_data_steps_per_second": 20.153, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.732502396931927, | |
| "eval_q2p_data_loss": 4.766937255859375, | |
| "eval_q2p_data_runtime": 15.3173, | |
| "eval_q2p_data_samples_per_second": 53.012, | |
| "eval_q2p_data_steps_per_second": 3.33, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.7420901246404603, | |
| "grad_norm": 0.15265218913555145, | |
| "learning_rate": 9.588068181818184e-07, | |
| "loss": 4.9705, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.751677852348993, | |
| "grad_norm": 15.488290786743164, | |
| "learning_rate": 9.232954545454546e-07, | |
| "loss": 4.8603, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.7612655800575263, | |
| "grad_norm": 0.121486134827137, | |
| "learning_rate": 8.877840909090909e-07, | |
| "loss": 4.9764, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.7708533077660595, | |
| "grad_norm": 0.1105041652917862, | |
| "learning_rate": 8.522727272727273e-07, | |
| "loss": 4.9187, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.7804410354745928, | |
| "grad_norm": 0.10993187129497528, | |
| "learning_rate": 8.167613636363638e-07, | |
| "loss": 4.8941, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.7804410354745928, | |
| "eval_q2q_data_loss": 5.000753402709961, | |
| "eval_q2q_data_runtime": 8.462, | |
| "eval_q2q_data_samples_per_second": 319.666, | |
| "eval_q2q_data_steps_per_second": 20.09, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.7804410354745928, | |
| "eval_q2p_data_loss": 4.73110818862915, | |
| "eval_q2p_data_runtime": 15.3141, | |
| "eval_q2p_data_samples_per_second": 53.023, | |
| "eval_q2p_data_steps_per_second": 3.33, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.7900287631831255, | |
| "grad_norm": 0.09844540059566498, | |
| "learning_rate": 7.8125e-07, | |
| "loss": 4.9592, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.7996164908916588, | |
| "grad_norm": 21.05035400390625, | |
| "learning_rate": 7.457386363636364e-07, | |
| "loss": 4.9141, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.8092042186001915, | |
| "grad_norm": 0.11973018944263458, | |
| "learning_rate": 7.102272727272729e-07, | |
| "loss": 4.9198, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.8187919463087248, | |
| "grad_norm": 0.12149699777364731, | |
| "learning_rate": 6.747159090909092e-07, | |
| "loss": 5.0112, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.828379674017258, | |
| "grad_norm": 5.942767143249512, | |
| "learning_rate": 6.392045454545455e-07, | |
| "loss": 4.9778, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.828379674017258, | |
| "eval_q2q_data_loss": 5.00074577331543, | |
| "eval_q2q_data_runtime": 8.4603, | |
| "eval_q2q_data_samples_per_second": 319.73, | |
| "eval_q2q_data_steps_per_second": 20.094, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.828379674017258, | |
| "eval_q2p_data_loss": 4.73326301574707, | |
| "eval_q2p_data_runtime": 15.3687, | |
| "eval_q2p_data_samples_per_second": 52.835, | |
| "eval_q2p_data_steps_per_second": 3.318, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.837967401725791, | |
| "grad_norm": 5.33225679397583, | |
| "learning_rate": 6.036931818181819e-07, | |
| "loss": 4.8999, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.847555129434324, | |
| "grad_norm": 25.030715942382812, | |
| "learning_rate": 5.681818181818182e-07, | |
| "loss": 4.9223, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.1237885057926178, | |
| "learning_rate": 5.326704545454546e-07, | |
| "loss": 4.9369, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.86673058485139, | |
| "grad_norm": 0.09552864730358124, | |
| "learning_rate": 4.971590909090909e-07, | |
| "loss": 4.8722, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.876318312559923, | |
| "grad_norm": 0.1201782152056694, | |
| "learning_rate": 4.616477272727273e-07, | |
| "loss": 4.9299, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.876318312559923, | |
| "eval_q2q_data_loss": 5.000753402709961, | |
| "eval_q2q_data_runtime": 8.4812, | |
| "eval_q2q_data_samples_per_second": 318.942, | |
| "eval_q2q_data_steps_per_second": 20.044, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.876318312559923, | |
| "eval_q2p_data_loss": 4.7280192375183105, | |
| "eval_q2p_data_runtime": 15.3569, | |
| "eval_q2p_data_samples_per_second": 52.875, | |
| "eval_q2p_data_steps_per_second": 3.321, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.8859060402684564, | |
| "grad_norm": 18.201995849609375, | |
| "learning_rate": 4.2613636363636364e-07, | |
| "loss": 4.8457, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.8954937679769897, | |
| "grad_norm": 0.09412606805562973, | |
| "learning_rate": 3.90625e-07, | |
| "loss": 4.8864, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.9050814956855224, | |
| "grad_norm": 21.844467163085938, | |
| "learning_rate": 3.5511363636363645e-07, | |
| "loss": 4.882, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.9146692233940557, | |
| "grad_norm": 0.1089194044470787, | |
| "learning_rate": 3.1960227272727277e-07, | |
| "loss": 4.8897, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.9242569511025884, | |
| "grad_norm": 0.20910155773162842, | |
| "learning_rate": 2.840909090909091e-07, | |
| "loss": 4.9663, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.9242569511025884, | |
| "eval_q2q_data_loss": 5.000741481781006, | |
| "eval_q2q_data_runtime": 8.4976, | |
| "eval_q2q_data_samples_per_second": 318.326, | |
| "eval_q2q_data_steps_per_second": 20.006, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.9242569511025884, | |
| "eval_q2p_data_loss": 4.723778247833252, | |
| "eval_q2p_data_runtime": 15.2952, | |
| "eval_q2p_data_samples_per_second": 53.088, | |
| "eval_q2p_data_steps_per_second": 3.334, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.9338446788111217, | |
| "grad_norm": 0.1785881370306015, | |
| "learning_rate": 2.4857954545454547e-07, | |
| "loss": 4.946, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.943432406519655, | |
| "grad_norm": 26.99447250366211, | |
| "learning_rate": 2.1306818181818182e-07, | |
| "loss": 4.9555, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.953020134228188, | |
| "grad_norm": 0.10196644067764282, | |
| "learning_rate": 1.7755681818181822e-07, | |
| "loss": 4.9005, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.962607861936721, | |
| "grad_norm": 26.543190002441406, | |
| "learning_rate": 1.4204545454545455e-07, | |
| "loss": 4.9097, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.972195589645254, | |
| "grad_norm": 0.12280410528182983, | |
| "learning_rate": 1.0653409090909091e-07, | |
| "loss": 4.924, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.972195589645254, | |
| "eval_q2q_data_loss": 5.000741004943848, | |
| "eval_q2q_data_runtime": 8.473, | |
| "eval_q2q_data_samples_per_second": 319.25, | |
| "eval_q2q_data_steps_per_second": 20.064, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.972195589645254, | |
| "eval_q2p_data_loss": 4.72309684753418, | |
| "eval_q2p_data_runtime": 15.3713, | |
| "eval_q2p_data_samples_per_second": 52.826, | |
| "eval_q2p_data_steps_per_second": 3.318, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.981783317353787, | |
| "grad_norm": 0.0916726365685463, | |
| "learning_rate": 7.102272727272727e-08, | |
| "loss": 4.8929, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.99137104506232, | |
| "grad_norm": 15.717903137207031, | |
| "learning_rate": 3.551136363636364e-08, | |
| "loss": 4.93, | |
| "step": 3120 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3129, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |