| { |
| "best_metric": 1.6921895742416382, |
| "best_model_checkpoint": "./output/checkpoint-4500", |
| "epoch": 0.15508384220219057, |
| "eval_steps": 150, |
| "global_step": 4800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00032309133792123035, |
| "grad_norm": 17.601825714111328, |
| "learning_rate": 2.2360679774997904e-06, |
| "loss": 1.8352, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0006461826758424607, |
| "grad_norm": 7.03536319732666, |
| "learning_rate": 4.472135954999581e-06, |
| "loss": 1.7155, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.000969274013763691, |
| "grad_norm": 9.148643493652344, |
| "learning_rate": 6.70820393249937e-06, |
| "loss": 1.6636, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0012923653516849214, |
| "grad_norm": 8.976370811462402, |
| "learning_rate": 8.944271909999161e-06, |
| "loss": 1.7126, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0016154566896061516, |
| "grad_norm": 8.681303977966309, |
| "learning_rate": 1.118033988749895e-05, |
| "loss": 1.8704, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.001938548027527382, |
| "grad_norm": 10.47424030303955, |
| "learning_rate": 1.341640786499874e-05, |
| "loss": 1.8515, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0022616393654486125, |
| "grad_norm": 13.055950164794922, |
| "learning_rate": 1.565247584249853e-05, |
| "loss": 1.7705, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.0025847307033698428, |
| "grad_norm": 8.785000801086426, |
| "learning_rate": 1.7888543819998323e-05, |
| "loss": 1.91, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.002907822041291073, |
| "grad_norm": 11.010156631469727, |
| "learning_rate": 2.0124611797498112e-05, |
| "loss": 1.8089, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0032309133792123032, |
| "grad_norm": 9.710369110107422, |
| "learning_rate": 2.23606797749979e-05, |
| "loss": 1.8483, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0035540047171335335, |
| "grad_norm": 9.487078666687012, |
| "learning_rate": 2.236044998500671e-05, |
| "loss": 1.8135, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.003877096055054764, |
| "grad_norm": 9.57800579071045, |
| "learning_rate": 2.235976062447891e-05, |
| "loss": 1.6482, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.004200187392975994, |
| "grad_norm": 7.63389253616333, |
| "learning_rate": 2.2358611721751407e-05, |
| "loss": 1.8626, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.004523278730897225, |
| "grad_norm": 11.528849601745605, |
| "learning_rate": 2.2357003324051093e-05, |
| "loss": 1.7989, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.004846370068818455, |
| "grad_norm": 12.541647911071777, |
| "learning_rate": 2.23549354974929e-05, |
| "loss": 1.7222, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.004846370068818455, |
| "eval_loss": 1.74934720993042, |
| "eval_runtime": 45.4353, |
| "eval_samples_per_second": 11.027, |
| "eval_steps_per_second": 11.027, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.0051694614067396855, |
| "grad_norm": 11.141366958618164, |
| "learning_rate": 2.2352408327077078e-05, |
| "loss": 1.7504, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.005492552744660916, |
| "grad_norm": 11.372058868408203, |
| "learning_rate": 2.2349421916685704e-05, |
| "loss": 1.8559, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.005815644082582146, |
| "grad_norm": 8.757671356201172, |
| "learning_rate": 2.234597638907841e-05, |
| "loss": 1.9035, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.006138735420503376, |
| "grad_norm": 8.273791313171387, |
| "learning_rate": 2.2342071885887346e-05, |
| "loss": 1.8452, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.0064618267584246065, |
| "grad_norm": 11.234216690063477, |
| "learning_rate": 2.2337708567611343e-05, |
| "loss": 1.9133, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.006784918096345837, |
| "grad_norm": 9.991055488586426, |
| "learning_rate": 2.233288661360932e-05, |
| "loss": 1.9328, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.007108009434267067, |
| "grad_norm": 10.776007652282715, |
| "learning_rate": 2.232760622209293e-05, |
| "loss": 1.7055, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.007431100772188298, |
| "grad_norm": 7.8985490798950195, |
| "learning_rate": 2.2321867610118378e-05, |
| "loss": 1.7584, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.007754192110109528, |
| "grad_norm": 11.129011154174805, |
| "learning_rate": 2.231567101357753e-05, |
| "loss": 1.7202, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.008077283448030758, |
| "grad_norm": 10.667845726013184, |
| "learning_rate": 2.2309016687188194e-05, |
| "loss": 1.704, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.008400374785951988, |
| "grad_norm": 13.889586448669434, |
| "learning_rate": 2.230190490448367e-05, |
| "loss": 1.7678, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.008723466123873218, |
| "grad_norm": 8.101602554321289, |
| "learning_rate": 2.229433595780149e-05, |
| "loss": 1.6418, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.00904655746179445, |
| "grad_norm": 11.362483978271484, |
| "learning_rate": 2.2286310158271407e-05, |
| "loss": 1.848, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.00936964879971568, |
| "grad_norm": 8.67332935333252, |
| "learning_rate": 2.22778278358026e-05, |
| "loss": 1.7263, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.00969274013763691, |
| "grad_norm": 14.691208839416504, |
| "learning_rate": 2.2268889339070124e-05, |
| "loss": 1.7203, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.00969274013763691, |
| "eval_loss": 1.7411584854125977, |
| "eval_runtime": 49.9553, |
| "eval_samples_per_second": 10.029, |
| "eval_steps_per_second": 10.029, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.01001583147555814, |
| "grad_norm": 13.153305053710938, |
| "learning_rate": 2.2259495035500576e-05, |
| "loss": 1.6979, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.010338922813479371, |
| "grad_norm": 7.682581901550293, |
| "learning_rate": 2.2249645311256972e-05, |
| "loss": 1.8388, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.010662014151400601, |
| "grad_norm": 6.891665935516357, |
| "learning_rate": 2.2239340571222904e-05, |
| "loss": 1.7465, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.010985105489321832, |
| "grad_norm": 10.929823875427246, |
| "learning_rate": 2.2228581238985868e-05, |
| "loss": 1.763, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.011308196827243062, |
| "grad_norm": 9.032389640808105, |
| "learning_rate": 2.2217367756819878e-05, |
| "loss": 1.8118, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.011631288165164292, |
| "grad_norm": 13.457771301269531, |
| "learning_rate": 2.2205700585667257e-05, |
| "loss": 1.84, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.011954379503085522, |
| "grad_norm": 15.551021575927734, |
| "learning_rate": 2.2193580205119724e-05, |
| "loss": 1.7466, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.012277470841006752, |
| "grad_norm": 12.1012601852417, |
| "learning_rate": 2.2181007113398642e-05, |
| "loss": 1.5802, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.012600562178927983, |
| "grad_norm": 11.149864196777344, |
| "learning_rate": 2.216798182733457e-05, |
| "loss": 1.7217, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.012923653516849213, |
| "grad_norm": 10.7238130569458, |
| "learning_rate": 2.2154504882346002e-05, |
| "loss": 1.6624, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.013246744854770443, |
| "grad_norm": 8.965182304382324, |
| "learning_rate": 2.214057683241736e-05, |
| "loss": 1.7672, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.013569836192691673, |
| "grad_norm": 7.6960673332214355, |
| "learning_rate": 2.2126198250076225e-05, |
| "loss": 1.6535, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.013892927530612904, |
| "grad_norm": 11.47298812866211, |
| "learning_rate": 2.2111369726369802e-05, |
| "loss": 1.7447, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.014216018868534134, |
| "grad_norm": 8.105657577514648, |
| "learning_rate": 2.2096091870840613e-05, |
| "loss": 1.7348, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.014539110206455364, |
| "grad_norm": 9.129870414733887, |
| "learning_rate": 2.2080365311501466e-05, |
| "loss": 1.832, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.014539110206455364, |
| "eval_loss": 1.738905906677246, |
| "eval_runtime": 51.3901, |
| "eval_samples_per_second": 9.749, |
| "eval_steps_per_second": 9.749, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.014862201544376596, |
| "grad_norm": 7.760665416717529, |
| "learning_rate": 2.206419069480962e-05, |
| "loss": 1.6879, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.015185292882297826, |
| "grad_norm": 11.239582061767578, |
| "learning_rate": 2.2047568685640212e-05, |
| "loss": 1.7155, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.015508384220219057, |
| "grad_norm": 11.516200065612793, |
| "learning_rate": 2.203049996725894e-05, |
| "loss": 1.8277, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.015831475558140285, |
| "grad_norm": 7.8999834060668945, |
| "learning_rate": 2.2012985241293954e-05, |
| "loss": 1.9313, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.016154566896061515, |
| "grad_norm": 11.626410484313965, |
| "learning_rate": 2.1995025227707044e-05, |
| "loss": 1.8176, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.016477658233982746, |
| "grad_norm": 10.57736873626709, |
| "learning_rate": 2.1976620664764027e-05, |
| "loss": 1.6413, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.016800749571903976, |
| "grad_norm": 10.937850952148438, |
| "learning_rate": 2.1957772309004394e-05, |
| "loss": 1.7362, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.017123840909825206, |
| "grad_norm": 12.991289138793945, |
| "learning_rate": 2.1938480935210228e-05, |
| "loss": 1.8159, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.017446932247746436, |
| "grad_norm": 8.747649192810059, |
| "learning_rate": 2.1918747336374347e-05, |
| "loss": 1.8647, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.017770023585667667, |
| "grad_norm": 7.261239051818848, |
| "learning_rate": 2.189857232366771e-05, |
| "loss": 1.8095, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.0180931149235889, |
| "grad_norm": 11.28962516784668, |
| "learning_rate": 2.1877956726406063e-05, |
| "loss": 1.7219, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.01841620626151013, |
| "grad_norm": 12.047504425048828, |
| "learning_rate": 2.1856901392015874e-05, |
| "loss": 1.9073, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.01873929759943136, |
| "grad_norm": 13.037126541137695, |
| "learning_rate": 2.183540718599946e-05, |
| "loss": 1.8341, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.01906238893735259, |
| "grad_norm": 13.498525619506836, |
| "learning_rate": 2.1813474991899453e-05, |
| "loss": 1.9031, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.01938548027527382, |
| "grad_norm": 8.813241958618164, |
| "learning_rate": 2.1791105711262442e-05, |
| "loss": 1.6959, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.01938548027527382, |
| "eval_loss": 1.7352592945098877, |
| "eval_runtime": 45.3637, |
| "eval_samples_per_second": 11.044, |
| "eval_steps_per_second": 11.044, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.01970857161319505, |
| "grad_norm": 12.009414672851562, |
| "learning_rate": 2.1768300263601945e-05, |
| "loss": 1.9202, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.02003166295111628, |
| "grad_norm": 9.502263069152832, |
| "learning_rate": 2.174505958636059e-05, |
| "loss": 1.8399, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.020354754289037512, |
| "grad_norm": 8.802181243896484, |
| "learning_rate": 2.1721384634871592e-05, |
| "loss": 1.7752, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.020677845626958742, |
| "grad_norm": 10.21849250793457, |
| "learning_rate": 2.169727638231948e-05, |
| "loss": 1.8093, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.021000936964879972, |
| "grad_norm": 13.870856285095215, |
| "learning_rate": 2.1672735819700084e-05, |
| "loss": 1.7692, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.021324028302801203, |
| "grad_norm": 8.835685729980469, |
| "learning_rate": 2.1647763955779823e-05, |
| "loss": 1.7946, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.021647119640722433, |
| "grad_norm": 10.815114974975586, |
| "learning_rate": 2.1622361817054213e-05, |
| "loss": 1.7859, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.021970210978643663, |
| "grad_norm": 7.781187057495117, |
| "learning_rate": 2.1596530447705676e-05, |
| "loss": 1.799, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.022293302316564893, |
| "grad_norm": 7.180160999298096, |
| "learning_rate": 2.157027090956064e-05, |
| "loss": 1.8043, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.022616393654486124, |
| "grad_norm": 10.69900131225586, |
| "learning_rate": 2.1543584282045862e-05, |
| "loss": 1.8702, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.022939484992407354, |
| "grad_norm": 7.213955402374268, |
| "learning_rate": 2.1516471662144077e-05, |
| "loss": 1.7494, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.023262576330328584, |
| "grad_norm": 7.767466068267822, |
| "learning_rate": 2.1488934164348898e-05, |
| "loss": 1.8502, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.023585667668249814, |
| "grad_norm": 11.879374504089355, |
| "learning_rate": 2.1460972920619e-05, |
| "loss": 1.7991, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.023908759006171044, |
| "grad_norm": 14.04179859161377, |
| "learning_rate": 2.143258908033159e-05, |
| "loss": 1.7676, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.024231850344092275, |
| "grad_norm": 13.560479164123535, |
| "learning_rate": 2.140378381023518e-05, |
| "loss": 1.7701, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.024231850344092275, |
| "eval_loss": 1.735255241394043, |
| "eval_runtime": 50.7259, |
| "eval_samples_per_second": 9.877, |
| "eval_steps_per_second": 9.877, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.024554941682013505, |
| "grad_norm": 14.157264709472656, |
| "learning_rate": 2.1374558294401597e-05, |
| "loss": 1.6387, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.024878033019934735, |
| "grad_norm": 9.316694259643555, |
| "learning_rate": 2.134491373417733e-05, |
| "loss": 1.6191, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.025201124357855965, |
| "grad_norm": 11.018717765808105, |
| "learning_rate": 2.1314851348134134e-05, |
| "loss": 1.9314, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.025524215695777196, |
| "grad_norm": 7.385425567626953, |
| "learning_rate": 2.1284372372018963e-05, |
| "loss": 1.8197, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.025847307033698426, |
| "grad_norm": 6.388603210449219, |
| "learning_rate": 2.125347805870314e-05, |
| "loss": 1.6863, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.026170398371619656, |
| "grad_norm": 8.44832706451416, |
| "learning_rate": 2.122216967813088e-05, |
| "loss": 1.9365, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.026493489709540886, |
| "grad_norm": 9.323631286621094, |
| "learning_rate": 2.1190448517267087e-05, |
| "loss": 1.7366, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.026816581047462117, |
| "grad_norm": 8.1749849319458, |
| "learning_rate": 2.115831588004444e-05, |
| "loss": 1.7957, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.027139672385383347, |
| "grad_norm": 8.410055160522461, |
| "learning_rate": 2.1125773087309798e-05, |
| "loss": 1.7546, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.027462763723304577, |
| "grad_norm": 9.194031715393066, |
| "learning_rate": 2.1092821476769906e-05, |
| "loss": 1.7421, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.027785855061225807, |
| "grad_norm": 7.863668918609619, |
| "learning_rate": 2.1059462402936416e-05, |
| "loss": 1.8438, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.028108946399147038, |
| "grad_norm": 8.896843910217285, |
| "learning_rate": 2.102569723707019e-05, |
| "loss": 1.9148, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.028432037737068268, |
| "grad_norm": 12.01693058013916, |
| "learning_rate": 2.0991527367124955e-05, |
| "loss": 1.6743, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.028755129074989498, |
| "grad_norm": 15.558412551879883, |
| "learning_rate": 2.095695419769022e-05, |
| "loss": 1.742, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.02907822041291073, |
| "grad_norm": 9.519794464111328, |
| "learning_rate": 2.0921979149933576e-05, |
| "loss": 1.8417, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02907822041291073, |
| "eval_loss": 1.7262463569641113, |
| "eval_runtime": 46.9684, |
| "eval_samples_per_second": 10.667, |
| "eval_steps_per_second": 10.667, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02940131175083196, |
| "grad_norm": 11.452165603637695, |
| "learning_rate": 2.0886603661542245e-05, |
| "loss": 1.6859, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.029724403088753192, |
| "grad_norm": 14.136131286621094, |
| "learning_rate": 2.0850829186663994e-05, |
| "loss": 1.7964, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.030047494426674422, |
| "grad_norm": 10.132573127746582, |
| "learning_rate": 2.0814657195847375e-05, |
| "loss": 1.8617, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.030370585764595653, |
| "grad_norm": 12.094213485717773, |
| "learning_rate": 2.077808917598125e-05, |
| "loss": 1.7776, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.030693677102516883, |
| "grad_norm": 13.876291275024414, |
| "learning_rate": 2.0741126630233687e-05, |
| "loss": 1.874, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.031016768440438113, |
| "grad_norm": 7.611670017242432, |
| "learning_rate": 2.070377107799017e-05, |
| "loss": 1.818, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.03133985977835934, |
| "grad_norm": 7.113893985748291, |
| "learning_rate": 2.0666024054791137e-05, |
| "loss": 1.795, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.03166295111628057, |
| "grad_norm": 9.59799861907959, |
| "learning_rate": 2.0627887112268875e-05, |
| "loss": 1.7043, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.0319860424542018, |
| "grad_norm": 9.458488464355469, |
| "learning_rate": 2.0589361818083712e-05, |
| "loss": 1.8802, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.03230913379212303, |
| "grad_norm": 18.441673278808594, |
| "learning_rate": 2.0550449755859598e-05, |
| "loss": 1.7685, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.03263222513004426, |
| "grad_norm": 10.183686256408691, |
| "learning_rate": 2.0511152525119014e-05, |
| "loss": 1.7086, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.03295531646796549, |
| "grad_norm": 10.062090873718262, |
| "learning_rate": 2.0471471741217183e-05, |
| "loss": 1.6957, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.03327840780588672, |
| "grad_norm": 13.140121459960938, |
| "learning_rate": 2.0431409035275724e-05, |
| "loss": 1.8548, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.03360149914380795, |
| "grad_norm": 12.769956588745117, |
| "learning_rate": 2.0390966054115558e-05, |
| "loss": 1.8432, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.03392459048172918, |
| "grad_norm": 8.896726608276367, |
| "learning_rate": 2.035014446018924e-05, |
| "loss": 1.8244, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.03392459048172918, |
| "eval_loss": 1.732542634010315, |
| "eval_runtime": 48.4752, |
| "eval_samples_per_second": 10.335, |
| "eval_steps_per_second": 10.335, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.03424768181965041, |
| "grad_norm": 9.040741920471191, |
| "learning_rate": 2.0308945931512606e-05, |
| "loss": 1.8755, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.03457077315757164, |
| "grad_norm": 10.011199951171875, |
| "learning_rate": 2.0267372161595806e-05, |
| "loss": 1.9373, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.03489386449549287, |
| "grad_norm": 11.096688270568848, |
| "learning_rate": 2.022542485937369e-05, |
| "loss": 1.7546, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.0352169558334141, |
| "grad_norm": 8.579634666442871, |
| "learning_rate": 2.0183105749135553e-05, |
| "loss": 1.7737, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.03554004717133533, |
| "grad_norm": 12.04051685333252, |
| "learning_rate": 2.0140416570454266e-05, |
| "loss": 1.7295, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.03586313850925657, |
| "grad_norm": 8.008216857910156, |
| "learning_rate": 2.0097359078114767e-05, |
| "loss": 1.7887, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.0361862298471778, |
| "grad_norm": 10.406676292419434, |
| "learning_rate": 2.0053935042041915e-05, |
| "loss": 1.6403, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.03650932118509903, |
| "grad_norm": 13.819786071777344, |
| "learning_rate": 2.001014624722775e-05, |
| "loss": 1.6146, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.03683241252302026, |
| "grad_norm": 8.4624605178833, |
| "learning_rate": 1.996599449365813e-05, |
| "loss": 1.7913, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.03715550386094149, |
| "grad_norm": 8.123451232910156, |
| "learning_rate": 1.9921481596238703e-05, |
| "loss": 1.735, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.03747859519886272, |
| "grad_norm": 10.989400863647461, |
| "learning_rate": 1.9876609384720335e-05, |
| "loss": 1.8681, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.03780168653678395, |
| "grad_norm": 10.746498107910156, |
| "learning_rate": 1.9831379703623903e-05, |
| "loss": 1.6177, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.03812477787470518, |
| "grad_norm": 10.412644386291504, |
| "learning_rate": 1.978579441216443e-05, |
| "loss": 1.8265, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.03844786921262641, |
| "grad_norm": 8.273632049560547, |
| "learning_rate": 1.9739855384174708e-05, |
| "loss": 1.7172, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.03877096055054764, |
| "grad_norm": 9.549877166748047, |
| "learning_rate": 1.969356450802825e-05, |
| "loss": 1.618, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.03877096055054764, |
| "eval_loss": 1.733805775642395, |
| "eval_runtime": 50.1723, |
| "eval_samples_per_second": 9.986, |
| "eval_steps_per_second": 9.986, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.03909405188846887, |
| "grad_norm": 11.008418083190918, |
| "learning_rate": 1.964692368656166e-05, |
| "loss": 1.7639, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.0394171432263901, |
| "grad_norm": 9.95077133178711, |
| "learning_rate": 1.9599934836996435e-05, |
| "loss": 1.6824, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.03974023456431133, |
| "grad_norm": 9.953926086425781, |
| "learning_rate": 1.9552599890860126e-05, |
| "loss": 1.7993, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.04006332590223256, |
| "grad_norm": 8.160842895507812, |
| "learning_rate": 1.9504920793906985e-05, |
| "loss": 1.6618, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.040386417240153794, |
| "grad_norm": 10.420413970947266, |
| "learning_rate": 1.945689950603793e-05, |
| "loss": 1.774, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.040709508578075024, |
| "grad_norm": 6.669633388519287, |
| "learning_rate": 1.9408538001220032e-05, |
| "loss": 1.6355, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.041032599915996254, |
| "grad_norm": 9.326558113098145, |
| "learning_rate": 1.9359838267405318e-05, |
| "loss": 1.644, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.041355691253917484, |
| "grad_norm": 9.282320976257324, |
| "learning_rate": 1.931080230644911e-05, |
| "loss": 1.7776, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.041678782591838714, |
| "grad_norm": 7.603001594543457, |
| "learning_rate": 1.926143213402771e-05, |
| "loss": 1.7819, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.042001873929759945, |
| "grad_norm": 7.637514114379883, |
| "learning_rate": 1.921172977955552e-05, |
| "loss": 1.7667, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.042324965267681175, |
| "grad_norm": 9.193405151367188, |
| "learning_rate": 1.9161697286101677e-05, |
| "loss": 1.8003, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.042648056605602405, |
| "grad_norm": 7.766101837158203, |
| "learning_rate": 1.9111336710306013e-05, |
| "loss": 1.5436, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.042971147943523635, |
| "grad_norm": 9.50290298461914, |
| "learning_rate": 1.9060650122294554e-05, |
| "loss": 1.7709, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.043294239281444866, |
| "grad_norm": 10.983290672302246, |
| "learning_rate": 1.9009639605594407e-05, |
| "loss": 1.7847, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.043617330619366096, |
| "grad_norm": 9.261622428894043, |
| "learning_rate": 1.8958307257048116e-05, |
| "loss": 1.6588, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.043617330619366096, |
| "eval_loss": 1.7271850109100342, |
| "eval_runtime": 52.5068, |
| "eval_samples_per_second": 9.542, |
| "eval_steps_per_second": 9.542, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.043940421957287326, |
| "grad_norm": 13.047907829284668, |
| "learning_rate": 1.890665518672748e-05, |
| "loss": 1.6819, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.044263513295208556, |
| "grad_norm": 10.242769241333008, |
| "learning_rate": 1.88546855178468e-05, |
| "loss": 1.7274, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.04458660463312979, |
| "grad_norm": 10.755792617797852, |
| "learning_rate": 1.880240038667561e-05, |
| "loss": 1.8454, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.04490969597105102, |
| "grad_norm": 8.716181755065918, |
| "learning_rate": 1.874980194245087e-05, |
| "loss": 1.6762, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.04523278730897225, |
| "grad_norm": 11.95937728881836, |
| "learning_rate": 1.8696892347288606e-05, |
| "loss": 1.6698, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.04555587864689348, |
| "grad_norm": 9.280597686767578, |
| "learning_rate": 1.864367377609504e-05, |
| "loss": 1.7899, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.04587896998481471, |
| "grad_norm": 9.891837120056152, |
| "learning_rate": 1.8590148416477198e-05, |
| "loss": 1.8089, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.04620206132273594, |
| "grad_norm": 16.32816505432129, |
| "learning_rate": 1.8536318468652962e-05, |
| "loss": 1.6734, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.04652515266065717, |
| "grad_norm": 8.544951438903809, |
| "learning_rate": 1.8482186145360648e-05, |
| "loss": 1.7713, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.0468482439985784, |
| "grad_norm": 10.275934219360352, |
| "learning_rate": 1.8427753671768056e-05, |
| "loss": 1.7716, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.04717133533649963, |
| "grad_norm": 11.277603149414062, |
| "learning_rate": 1.8373023285380966e-05, |
| "loss": 1.7318, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.04749442667442086, |
| "grad_norm": 8.100369453430176, |
| "learning_rate": 1.8317997235951204e-05, |
| "loss": 1.7877, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.04781751801234209, |
| "grad_norm": 9.99820613861084, |
| "learning_rate": 1.8262677785384142e-05, |
| "loss": 1.8218, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.04814060935026332, |
| "grad_norm": 12.380565643310547, |
| "learning_rate": 1.8207067207645716e-05, |
| "loss": 1.8257, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.04846370068818455, |
| "grad_norm": 9.522893905639648, |
| "learning_rate": 1.815116778866897e-05, |
| "loss": 1.9182, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.04846370068818455, |
| "eval_loss": 1.7235751152038574, |
| "eval_runtime": 52.6329, |
| "eval_samples_per_second": 9.519, |
| "eval_steps_per_second": 9.519, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.04878679202610578, |
| "grad_norm": 10.540502548217773, |
| "learning_rate": 1.8094981826260064e-05, |
| "loss": 1.787, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.04910988336402701, |
| "grad_norm": 18.479272842407227, |
| "learning_rate": 1.8038511630003865e-05, |
| "loss": 1.6268, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.04943297470194824, |
| "grad_norm": 8.567608833312988, |
| "learning_rate": 1.798175952116895e-05, |
| "loss": 1.8189, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.04975606603986947, |
| "grad_norm": 8.42313003540039, |
| "learning_rate": 1.7924727832612227e-05, |
| "loss": 1.6656, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.0500791573777907, |
| "grad_norm": 11.321965217590332, |
| "learning_rate": 1.786741890868305e-05, |
| "loss": 1.8461, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.05040224871571193, |
| "grad_norm": 13.762293815612793, |
| "learning_rate": 1.7809835105126807e-05, |
| "loss": 1.6322, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.05072534005363316, |
| "grad_norm": 10.186868667602539, |
| "learning_rate": 1.7751978788988123e-05, |
| "loss": 1.7285, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.05104843139155439, |
| "grad_norm": 12.446249008178711, |
| "learning_rate": 1.7693852338513545e-05, |
| "loss": 1.7648, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.05137152272947562, |
| "grad_norm": 12.205934524536133, |
| "learning_rate": 1.7635458143053794e-05, |
| "loss": 1.8282, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.05169461406739685, |
| "grad_norm": 11.627687454223633, |
| "learning_rate": 1.7576798602965525e-05, |
| "loss": 1.7931, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.05201770540531808, |
| "grad_norm": 9.943934440612793, |
| "learning_rate": 1.7517876129512677e-05, |
| "loss": 1.8118, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.05234079674323931, |
| "grad_norm": 9.162341117858887, |
| "learning_rate": 1.7458693144767353e-05, |
| "loss": 2.0104, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.05266388808116054, |
| "grad_norm": 11.957560539245605, |
| "learning_rate": 1.7399252081510248e-05, |
| "loss": 1.7413, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.05298697941908177, |
| "grad_norm": 10.325530052185059, |
| "learning_rate": 1.733955538313066e-05, |
| "loss": 1.9299, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.053310070757003, |
| "grad_norm": 13.011626243591309, |
| "learning_rate": 1.7279605503526047e-05, |
| "loss": 1.8611, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.053310070757003, |
| "eval_loss": 1.7197445631027222, |
| "eval_runtime": 50.0407, |
| "eval_samples_per_second": 10.012, |
| "eval_steps_per_second": 10.012, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.05363316209492423, |
| "grad_norm": 9.317269325256348, |
| "learning_rate": 1.721940490700115e-05, |
| "loss": 1.8115, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.053956253432845463, |
| "grad_norm": 9.647391319274902, |
| "learning_rate": 1.7158956068166697e-05, |
| "loss": 1.8057, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.054279344770766694, |
| "grad_norm": 13.944653511047363, |
| "learning_rate": 1.7098261471837696e-05, |
| "loss": 1.732, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.054602436108687924, |
| "grad_norm": 14.09939956665039, |
| "learning_rate": 1.7037323612931272e-05, |
| "loss": 1.6199, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.054925527446609154, |
| "grad_norm": 15.80185604095459, |
| "learning_rate": 1.697614499636414e-05, |
| "loss": 1.727, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.055248618784530384, |
| "grad_norm": 11.975440979003906, |
| "learning_rate": 1.6914728136949594e-05, |
| "loss": 1.5241, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.055571710122451615, |
| "grad_norm": 9.165396690368652, |
| "learning_rate": 1.6853075559294172e-05, |
| "loss": 1.835, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.055894801460372845, |
| "grad_norm": 6.48829460144043, |
| "learning_rate": 1.6791189797693877e-05, |
| "loss": 1.8819, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.056217892798294075, |
| "grad_norm": 9.731879234313965, |
| "learning_rate": 1.6729073396029965e-05, |
| "loss": 1.7085, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.056540984136215305, |
| "grad_norm": 8.751827239990234, |
| "learning_rate": 1.666672890766442e-05, |
| "loss": 1.7571, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.056864075474136536, |
| "grad_norm": 7.602952480316162, |
| "learning_rate": 1.660415889533497e-05, |
| "loss": 1.732, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.057187166812057766, |
| "grad_norm": 7.001266002655029, |
| "learning_rate": 1.6541365931049757e-05, |
| "loss": 1.7523, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.057510258149978996, |
| "grad_norm": 9.747156143188477, |
| "learning_rate": 1.6478352595981594e-05, |
| "loss": 1.7293, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.057833349487900226, |
| "grad_norm": 9.404924392700195, |
| "learning_rate": 1.6415121480361884e-05, |
| "loss": 1.7708, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.05815644082582146, |
| "grad_norm": 8.594147682189941, |
| "learning_rate": 1.635167518337413e-05, |
| "loss": 1.6754, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.05815644082582146, |
| "eval_loss": 1.7189382314682007, |
| "eval_runtime": 49.5978, |
| "eval_samples_per_second": 10.101, |
| "eval_steps_per_second": 10.101, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.05847953216374269, |
| "grad_norm": 10.03783893585205, |
| "learning_rate": 1.6288016313047095e-05, |
| "loss": 1.7414, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.05880262350166392, |
| "grad_norm": 11.099542617797852, |
| "learning_rate": 1.6224147486147602e-05, |
| "loss": 1.8542, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.05912571483958515, |
| "grad_norm": 10.364195823669434, |
| "learning_rate": 1.616007132807298e-05, |
| "loss": 1.7869, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.059448806177506384, |
| "grad_norm": 8.265610694885254, |
| "learning_rate": 1.6095790472743107e-05, |
| "loss": 1.7011, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.059771897515427615, |
| "grad_norm": 13.766877174377441, |
| "learning_rate": 1.6031307562492174e-05, |
| "loss": 1.6642, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.060094988853348845, |
| "grad_norm": 6.757899761199951, |
| "learning_rate": 1.5966625247960068e-05, |
| "loss": 1.7962, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.060418080191270075, |
| "grad_norm": 10.730036735534668, |
| "learning_rate": 1.5901746187983387e-05, |
| "loss": 1.6452, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.060741171529191305, |
| "grad_norm": 7.790616035461426, |
| "learning_rate": 1.5836673049486175e-05, |
| "loss": 1.7794, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.061064262867112536, |
| "grad_norm": 10.735581398010254, |
| "learning_rate": 1.577140850737029e-05, |
| "loss": 1.7813, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.061387354205033766, |
| "grad_norm": 14.150415420532227, |
| "learning_rate": 1.5705955244405423e-05, |
| "loss": 1.6433, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.061710445542954996, |
| "grad_norm": 9.497598648071289, |
| "learning_rate": 1.564031595111886e-05, |
| "loss": 1.6311, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.062033536880876226, |
| "grad_norm": 10.464118003845215, |
| "learning_rate": 1.557449332568485e-05, |
| "loss": 1.7038, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.06235662821879746, |
| "grad_norm": 10.506887435913086, |
| "learning_rate": 1.5508490073813722e-05, |
| "loss": 1.7814, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.06267971955671868, |
| "grad_norm": 17.466184616088867, |
| "learning_rate": 1.5442308908640636e-05, |
| "loss": 1.6902, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.06300281089463991, |
| "grad_norm": 10.4354829788208, |
| "learning_rate": 1.537595255061408e-05, |
| "loss": 1.7127, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.06300281089463991, |
| "eval_loss": 1.713944435119629, |
| "eval_runtime": 51.0974, |
| "eval_samples_per_second": 9.805, |
| "eval_steps_per_second": 9.805, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.06332590223256114, |
| "grad_norm": 9.260307312011719, |
| "learning_rate": 1.5309423727384037e-05, |
| "loss": 1.6463, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.06364899357048237, |
| "grad_norm": 9.762173652648926, |
| "learning_rate": 1.5242725173689851e-05, |
| "loss": 1.7202, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.0639720849084036, |
| "grad_norm": 16.75164222717285, |
| "learning_rate": 1.5175859631247827e-05, |
| "loss": 1.6976, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.06429517624632483, |
| "grad_norm": 7.676208972930908, |
| "learning_rate": 1.5108829848638515e-05, |
| "loss": 1.7599, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.06461826758424606, |
| "grad_norm": 9.695477485656738, |
| "learning_rate": 1.5041638581193741e-05, |
| "loss": 1.6792, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.06494135892216729, |
| "grad_norm": 16.161441802978516, |
| "learning_rate": 1.4974288590883346e-05, |
| "loss": 1.8177, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.06526445026008852, |
| "grad_norm": 11.451770782470703, |
| "learning_rate": 1.4906782646201634e-05, |
| "loss": 1.7105, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.06558754159800975, |
| "grad_norm": 10.997443199157715, |
| "learning_rate": 1.4839123522053591e-05, |
| "loss": 1.7168, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.06591063293593098, |
| "grad_norm": 6.7931132316589355, |
| "learning_rate": 1.4771313999640806e-05, |
| "loss": 1.769, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.06623372427385221, |
| "grad_norm": 9.296013832092285, |
| "learning_rate": 1.4703356866347155e-05, |
| "loss": 1.8983, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.06655681561177344, |
| "grad_norm": 10.765667915344238, |
| "learning_rate": 1.4635254915624214e-05, |
| "loss": 1.6917, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.06687990694969467, |
| "grad_norm": 11.256792068481445, |
| "learning_rate": 1.4567010946876445e-05, |
| "loss": 1.7493, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.0672029982876159, |
| "grad_norm": 9.741044998168945, |
| "learning_rate": 1.4498627765346109e-05, |
| "loss": 1.8623, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.06752608962553713, |
| "grad_norm": 8.61628532409668, |
| "learning_rate": 1.4430108181997962e-05, |
| "loss": 1.7821, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.06784918096345836, |
| "grad_norm": 6.974494934082031, |
| "learning_rate": 1.4361455013403695e-05, |
| "loss": 1.7919, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.06784918096345836, |
| "eval_loss": 1.7191635370254517, |
| "eval_runtime": 48.5314, |
| "eval_samples_per_second": 10.323, |
| "eval_steps_per_second": 10.323, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.0681722723013796, |
| "grad_norm": 10.361462593078613, |
| "learning_rate": 1.4292671081626183e-05, |
| "loss": 1.7856, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.06849536363930082, |
| "grad_norm": 11.415956497192383, |
| "learning_rate": 1.4223759214103443e-05, |
| "loss": 1.8235, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.06881845497722205, |
| "grad_norm": 10.382593154907227, |
| "learning_rate": 1.4154722243532445e-05, |
| "loss": 1.7848, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.06914154631514328, |
| "grad_norm": 8.885568618774414, |
| "learning_rate": 1.4085563007752654e-05, |
| "loss": 1.7903, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.06946463765306451, |
| "grad_norm": 8.006834030151367, |
| "learning_rate": 1.4016284349629364e-05, |
| "loss": 1.7024, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.06978772899098575, |
| "grad_norm": 10.482210159301758, |
| "learning_rate": 1.3946889116936874e-05, |
| "loss": 1.7564, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.07011082032890698, |
| "grad_norm": 7.895102500915527, |
| "learning_rate": 1.3877380162241394e-05, |
| "loss": 1.5733, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.0704339116668282, |
| "grad_norm": 8.805550575256348, |
| "learning_rate": 1.3807760342783804e-05, |
| "loss": 1.6728, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.07075700300474944, |
| "grad_norm": 7.979333400726318, |
| "learning_rate": 1.37380325203622e-05, |
| "loss": 1.7731, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.07108009434267067, |
| "grad_norm": 9.50521469116211, |
| "learning_rate": 1.3668199561214252e-05, |
| "loss": 1.7482, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.07140318568059191, |
| "grad_norm": 7.414913177490234, |
| "learning_rate": 1.35982643358994e-05, |
| "loss": 1.7075, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.07172627701851314, |
| "grad_norm": 8.384417533874512, |
| "learning_rate": 1.3528229719180835e-05, |
| "loss": 1.8332, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.07204936835643437, |
| "grad_norm": 9.352381706237793, |
| "learning_rate": 1.3458098589907348e-05, |
| "loss": 1.806, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.0723724596943556, |
| "grad_norm": 12.371936798095703, |
| "learning_rate": 1.3387873830894973e-05, |
| "loss": 1.8181, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.07269555103227683, |
| "grad_norm": 8.478230476379395, |
| "learning_rate": 1.3317558328808506e-05, |
| "loss": 1.7851, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.07269555103227683, |
| "eval_loss": 1.712678074836731, |
| "eval_runtime": 50.8101, |
| "eval_samples_per_second": 9.86, |
| "eval_steps_per_second": 9.86, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.07301864237019806, |
| "grad_norm": 7.574065685272217, |
| "learning_rate": 1.3247154974042827e-05, |
| "loss": 1.762, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.07334173370811929, |
| "grad_norm": 11.433499336242676, |
| "learning_rate": 1.3176666660604102e-05, |
| "loss": 1.834, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.07366482504604052, |
| "grad_norm": 7.498250484466553, |
| "learning_rate": 1.3106096285990812e-05, |
| "loss": 1.8071, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.07398791638396175, |
| "grad_norm": 7.674075126647949, |
| "learning_rate": 1.3035446751074653e-05, |
| "loss": 1.7767, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.07431100772188298, |
| "grad_norm": 10.668132781982422, |
| "learning_rate": 1.2964720959981287e-05, |
| "loss": 1.5325, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.07463409905980421, |
| "grad_norm": 8.854166984558105, |
| "learning_rate": 1.2893921819970972e-05, |
| "loss": 1.845, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.07495719039772544, |
| "grad_norm": 6.754697322845459, |
| "learning_rate": 1.2823052241319061e-05, |
| "loss": 1.739, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.07528028173564667, |
| "grad_norm": 7.947122573852539, |
| "learning_rate": 1.2752115137196341e-05, |
| "loss": 1.6849, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.0756033730735679, |
| "grad_norm": 12.404890060424805, |
| "learning_rate": 1.2681113423549334e-05, |
| "loss": 1.784, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.07592646441148913, |
| "grad_norm": 8.453229904174805, |
| "learning_rate": 1.2610050018980385e-05, |
| "loss": 1.6999, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.07624955574941036, |
| "grad_norm": 7.385756015777588, |
| "learning_rate": 1.2538927844627726e-05, |
| "loss": 1.6897, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.0765726470873316, |
| "grad_norm": 9.446147918701172, |
| "learning_rate": 1.2467749824045373e-05, |
| "loss": 1.8576, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.07689573842525282, |
| "grad_norm": 7.881250858306885, |
| "learning_rate": 1.2396518883082966e-05, |
| "loss": 1.6776, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.07721882976317405, |
| "grad_norm": 11.96408462524414, |
| "learning_rate": 1.2325237949765496e-05, |
| "loss": 1.791, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.07754192110109528, |
| "grad_norm": 10.829449653625488, |
| "learning_rate": 1.225390995417295e-05, |
| "loss": 1.7245, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.07754192110109528, |
| "eval_loss": 1.7104594707489014, |
| "eval_runtime": 50.4698, |
| "eval_samples_per_second": 9.927, |
| "eval_steps_per_second": 9.927, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.07786501243901651, |
| "grad_norm": 10.396730422973633, |
| "learning_rate": 1.2182537828319848e-05, |
| "loss": 1.8555, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.07818810377693775, |
| "grad_norm": 9.35103702545166, |
| "learning_rate": 1.2111124506034739e-05, |
| "loss": 1.8687, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.07851119511485898, |
| "grad_norm": 8.117349624633789, |
| "learning_rate": 1.2039672922839598e-05, |
| "loss": 1.7337, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.0788342864527802, |
| "grad_norm": 7.560978889465332, |
| "learning_rate": 1.196818601582915e-05, |
| "loss": 1.6828, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.07915737779070144, |
| "grad_norm": 7.487736225128174, |
| "learning_rate": 1.189666672355015e-05, |
| "loss": 1.7545, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.07948046912862267, |
| "grad_norm": 10.839227676391602, |
| "learning_rate": 1.1825117985880576e-05, |
| "loss": 1.749, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.0798035604665439, |
| "grad_norm": 8.697480201721191, |
| "learning_rate": 1.1753542743908802e-05, |
| "loss": 1.7122, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.08012665180446513, |
| "grad_norm": 11.060961723327637, |
| "learning_rate": 1.1681943939812688e-05, |
| "loss": 1.6293, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.08044974314238636, |
| "grad_norm": 8.913187980651855, |
| "learning_rate": 1.1610324516738626e-05, |
| "loss": 1.7505, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.08077283448030759, |
| "grad_norm": 6.877689361572266, |
| "learning_rate": 1.1538687418680596e-05, |
| "loss": 1.4006, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.08109592581822882, |
| "grad_norm": 11.248068809509277, |
| "learning_rate": 1.1467035590359106e-05, |
| "loss": 1.7784, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.08141901715615005, |
| "grad_norm": 10.510488510131836, |
| "learning_rate": 1.139537197710018e-05, |
| "loss": 1.7669, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.08174210849407128, |
| "grad_norm": 14.91984748840332, |
| "learning_rate": 1.1323699524714278e-05, |
| "loss": 1.6776, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.08206519983199251, |
| "grad_norm": 8.973823547363281, |
| "learning_rate": 1.1252021179375192e-05, |
| "loss": 1.8215, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.08238829116991374, |
| "grad_norm": 8.837857246398926, |
| "learning_rate": 1.118033988749895e-05, |
| "loss": 1.69, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.08238829116991374, |
| "eval_loss": 1.7076596021652222, |
| "eval_runtime": 44.7857, |
| "eval_samples_per_second": 11.187, |
| "eval_steps_per_second": 11.187, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.08271138250783497, |
| "grad_norm": 12.05571174621582, |
| "learning_rate": 1.1108658595622709e-05, |
| "loss": 1.7875, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.0830344738457562, |
| "grad_norm": 17.358158111572266, |
| "learning_rate": 1.1036980250283621e-05, |
| "loss": 1.7574, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.08335756518367743, |
| "grad_norm": 7.721527576446533, |
| "learning_rate": 1.096530779789772e-05, |
| "loss": 1.7518, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.08368065652159866, |
| "grad_norm": 11.574057579040527, |
| "learning_rate": 1.0893644184638797e-05, |
| "loss": 1.7236, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.08400374785951989, |
| "grad_norm": 10.338488578796387, |
| "learning_rate": 1.0821992356317307e-05, |
| "loss": 1.8056, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.08432683919744112, |
| "grad_norm": 9.807657241821289, |
| "learning_rate": 1.0750355258259273e-05, |
| "loss": 1.7627, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.08464993053536235, |
| "grad_norm": 7.947932720184326, |
| "learning_rate": 1.0678735835185219e-05, |
| "loss": 1.805, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.08497302187328358, |
| "grad_norm": 9.967132568359375, |
| "learning_rate": 1.06071370310891e-05, |
| "loss": 1.8368, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.08529611321120481, |
| "grad_norm": 10.970772743225098, |
| "learning_rate": 1.0535561789117327e-05, |
| "loss": 1.7216, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.08561920454912604, |
| "grad_norm": 8.452691078186035, |
| "learning_rate": 1.0464013051447755e-05, |
| "loss": 1.6741, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.08594229588704727, |
| "grad_norm": 10.103203773498535, |
| "learning_rate": 1.0392493759168751e-05, |
| "loss": 1.7487, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.0862653872249685, |
| "grad_norm": 9.015948295593262, |
| "learning_rate": 1.0321006852158306e-05, |
| "loss": 1.7332, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.08658847856288973, |
| "grad_norm": 6.827271938323975, |
| "learning_rate": 1.0249555268963164e-05, |
| "loss": 1.8094, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.08691156990081096, |
| "grad_norm": 10.552309036254883, |
| "learning_rate": 1.0178141946678054e-05, |
| "loss": 1.7288, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.08723466123873219, |
| "grad_norm": 7.483726501464844, |
| "learning_rate": 1.0106769820824951e-05, |
| "loss": 1.7407, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.08723466123873219, |
| "eval_loss": 1.7069541215896606, |
| "eval_runtime": 50.7115, |
| "eval_samples_per_second": 9.879, |
| "eval_steps_per_second": 9.879, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.08755775257665342, |
| "grad_norm": 7.673255920410156, |
| "learning_rate": 1.0035441825232406e-05, |
| "loss": 1.7783, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.08788084391457465, |
| "grad_norm": 7.7522382736206055, |
| "learning_rate": 9.964160891914937e-06, |
| "loss": 1.7612, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.08820393525249588, |
| "grad_norm": 10.859241485595703, |
| "learning_rate": 9.892929950952532e-06, |
| "loss": 1.713, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.08852702659041711, |
| "grad_norm": 8.127120971679688, |
| "learning_rate": 9.821751930370177e-06, |
| "loss": 1.7304, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.08885011792833834, |
| "grad_norm": 9.909963607788086, |
| "learning_rate": 9.750629756017514e-06, |
| "loss": 1.6213, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.08917320926625957, |
| "grad_norm": 10.604835510253906, |
| "learning_rate": 9.679566351448571e-06, |
| "loss": 1.6823, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.0894963006041808, |
| "grad_norm": 16.078346252441406, |
| "learning_rate": 9.608564637801562e-06, |
| "loss": 1.7492, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.08981939194210203, |
| "grad_norm": 12.718731880187988, |
| "learning_rate": 9.537627533678842e-06, |
| "loss": 1.7284, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.09014248328002326, |
| "grad_norm": 7.692546367645264, |
| "learning_rate": 9.466757955026925e-06, |
| "loss": 1.7821, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.0904655746179445, |
| "grad_norm": 10.689606666564941, |
| "learning_rate": 9.395958815016618e-06, |
| "loss": 1.6207, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.09078866595586572, |
| "grad_norm": 8.80530071258545, |
| "learning_rate": 9.325233023923252e-06, |
| "loss": 1.8065, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.09111175729378695, |
| "grad_norm": 9.406537055969238, |
| "learning_rate": 9.25458348900709e-06, |
| "loss": 1.678, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.09143484863170818, |
| "grad_norm": 10.812992095947266, |
| "learning_rate": 9.1840131143938e-06, |
| "loss": 1.6795, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.09175793996962942, |
| "grad_norm": 7.725828170776367, |
| "learning_rate": 9.113524800955074e-06, |
| "loss": 1.7043, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.09208103130755065, |
| "grad_norm": 7.820013046264648, |
| "learning_rate": 9.043121446189398e-06, |
| "loss": 1.683, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.09208103130755065, |
| "eval_loss": 1.7077971696853638, |
| "eval_runtime": 49.9011, |
| "eval_samples_per_second": 10.04, |
| "eval_steps_per_second": 10.04, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.09240412264547188, |
| "grad_norm": 7.655306816101074, |
| "learning_rate": 8.972805944102928e-06, |
| "loss": 1.7857, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.0927272139833931, |
| "grad_norm": 10.282003402709961, |
| "learning_rate": 8.902581185090555e-06, |
| "loss": 1.6909, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.09305030532131434, |
| "grad_norm": 7.6710100173950195, |
| "learning_rate": 8.832450055817064e-06, |
| "loss": 1.7883, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.09337339665923557, |
| "grad_norm": 7.631242275238037, |
| "learning_rate": 8.7624154390985e-06, |
| "loss": 1.6346, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.0936964879971568, |
| "grad_norm": 7.102456092834473, |
| "learning_rate": 8.692480213783649e-06, |
| "loss": 1.8356, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.09401957933507803, |
| "grad_norm": 7.865874767303467, |
| "learning_rate": 8.622647254635703e-06, |
| "loss": 1.821, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.09434267067299926, |
| "grad_norm": 8.540060997009277, |
| "learning_rate": 8.552919432214097e-06, |
| "loss": 1.6552, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.09466576201092049, |
| "grad_norm": 8.773176193237305, |
| "learning_rate": 8.483299612756505e-06, |
| "loss": 1.9238, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.09498885334884172, |
| "grad_norm": 8.697489738464355, |
| "learning_rate": 8.413790658061028e-06, |
| "loss": 1.7673, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.09531194468676295, |
| "grad_norm": 8.043781280517578, |
| "learning_rate": 8.344395425368537e-06, |
| "loss": 1.8077, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.09563503602468418, |
| "grad_norm": 12.204351425170898, |
| "learning_rate": 8.275116767245251e-06, |
| "loss": 1.6457, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.09595812736260541, |
| "grad_norm": 7.778212070465088, |
| "learning_rate": 8.205957531465456e-06, |
| "loss": 1.5633, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.09628121870052664, |
| "grad_norm": 8.191123008728027, |
| "learning_rate": 8.136920560894458e-06, |
| "loss": 1.8152, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.09660431003844787, |
| "grad_norm": 12.325820922851562, |
| "learning_rate": 8.068008693371723e-06, |
| "loss": 1.694, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.0969274013763691, |
| "grad_norm": 10.698838233947754, |
| "learning_rate": 7.999224761594206e-06, |
| "loss": 1.9075, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.0969274013763691, |
| "eval_loss": 1.7048513889312744, |
| "eval_runtime": 49.0658, |
| "eval_samples_per_second": 10.211, |
| "eval_steps_per_second": 10.211, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.09725049271429033, |
| "grad_norm": 7.618257999420166, |
| "learning_rate": 7.930571592999942e-06, |
| "loss": 1.6903, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.09757358405221156, |
| "grad_norm": 7.423304080963135, |
| "learning_rate": 7.86205200965179e-06, |
| "loss": 1.8027, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.09789667539013279, |
| "grad_norm": 10.209698677062988, |
| "learning_rate": 7.793668828121457e-06, |
| "loss": 1.6316, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.09821976672805402, |
| "grad_norm": 8.266161918640137, |
| "learning_rate": 7.725424859373688e-06, |
| "loss": 1.751, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.09854285806597525, |
| "grad_norm": 13.602888107299805, |
| "learning_rate": 7.65732290865075e-06, |
| "loss": 1.9036, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.09886594940389648, |
| "grad_norm": 9.627706527709961, |
| "learning_rate": 7.589365775357096e-06, |
| "loss": 1.5739, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.09918904074181771, |
| "grad_norm": 8.23000431060791, |
| "learning_rate": 7.52155625294431e-06, |
| "loss": 1.8857, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.09951213207973894, |
| "grad_norm": 9.111807823181152, |
| "learning_rate": 7.453897128796269e-06, |
| "loss": 1.6359, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.09983522341766017, |
| "grad_norm": 7.767091274261475, |
| "learning_rate": 7.386391184114558e-06, |
| "loss": 1.7635, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.1001583147555814, |
| "grad_norm": 7.9762797355651855, |
| "learning_rate": 7.319041193804161e-06, |
| "loss": 1.5899, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.10048140609350263, |
| "grad_norm": 15.16207218170166, |
| "learning_rate": 7.2518499263593866e-06, |
| "loss": 1.6362, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.10080449743142386, |
| "grad_norm": 9.359869003295898, |
| "learning_rate": 7.184820143750079e-06, |
| "loss": 1.687, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.10112758876934509, |
| "grad_norm": 8.03466796875, |
| "learning_rate": 7.117954601308052e-06, |
| "loss": 1.6855, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.10145068010726632, |
| "grad_norm": 8.714197158813477, |
| "learning_rate": 7.051256047613866e-06, |
| "loss": 1.6671, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.10177377144518755, |
| "grad_norm": 7.063264846801758, |
| "learning_rate": 6.984727224383822e-06, |
| "loss": 1.7538, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.10177377144518755, |
| "eval_loss": 1.702789545059204, |
| "eval_runtime": 48.5181, |
| "eval_samples_per_second": 10.326, |
| "eval_steps_per_second": 10.326, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.10209686278310878, |
| "grad_norm": 8.911219596862793, |
| "learning_rate": 6.918370866357266e-06, |
| "loss": 1.7791, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.10241995412103001, |
| "grad_norm": 9.150374412536621, |
| "learning_rate": 6.852189701184183e-06, |
| "loss": 1.6636, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.10274304545895124, |
| "grad_norm": 5.501253128051758, |
| "learning_rate": 6.786186449313051e-06, |
| "loss": 1.7729, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.10306613679687247, |
| "grad_norm": 8.506068229675293, |
| "learning_rate": 6.720363823879042e-06, |
| "loss": 1.6332, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.1033892281347937, |
| "grad_norm": 7.7138895988464355, |
| "learning_rate": 6.6547245305924765e-06, |
| "loss": 1.6499, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.10371231947271493, |
| "grad_norm": 8.832993507385254, |
| "learning_rate": 6.589271267627615e-06, |
| "loss": 1.7456, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.10403541081063616, |
| "grad_norm": 7.764387130737305, |
| "learning_rate": 6.524006725511727e-06, |
| "loss": 1.7079, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.1043585021485574, |
| "grad_norm": 9.634687423706055, |
| "learning_rate": 6.4589335870145165e-06, |
| "loss": 1.7564, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.10468159348647862, |
| "grad_norm": 12.632416725158691, |
| "learning_rate": 6.394054527037837e-06, |
| "loss": 1.659, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.10500468482439985, |
| "grad_norm": 6.407688140869141, |
| "learning_rate": 6.329372212505727e-06, |
| "loss": 1.6707, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.10532777616232109, |
| "grad_norm": 14.8198823928833, |
| "learning_rate": 6.264889302254797e-06, |
| "loss": 1.7245, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.10565086750024232, |
| "grad_norm": 9.50849437713623, |
| "learning_rate": 6.200608446924922e-06, |
| "loss": 1.9256, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.10597395883816355, |
| "grad_norm": 10.970282554626465, |
| "learning_rate": 6.136532288850295e-06, |
| "loss": 1.5962, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.10629705017608478, |
| "grad_norm": 12.26534652709961, |
| "learning_rate": 6.072663461950806e-06, |
| "loss": 1.8693, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.106620141514006, |
| "grad_norm": 13.819258689880371, |
| "learning_rate": 6.009004591623776e-06, |
| "loss": 1.728, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.106620141514006, |
| "eval_loss": 1.6982126235961914, |
| "eval_runtime": 52.4496, |
| "eval_samples_per_second": 9.552, |
| "eval_steps_per_second": 9.552, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.10694323285192724, |
| "grad_norm": 13.012596130371094, |
| "learning_rate": 5.945558294636019e-06, |
| "loss": 1.6098, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.10726632418984847, |
| "grad_norm": 9.684890747070312, |
| "learning_rate": 5.882327179016307e-06, |
| "loss": 1.5688, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.1075894155277697, |
| "grad_norm": 8.362608909606934, |
| "learning_rate": 5.819313843948146e-06, |
| "loss": 1.531, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.10791250686569093, |
| "grad_norm": 6.872709274291992, |
| "learning_rate": 5.756520879662929e-06, |
| "loss": 1.8154, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.10823559820361216, |
| "grad_norm": 7.6126275062561035, |
| "learning_rate": 5.693950867333488e-06, |
| "loss": 1.6701, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.10855868954153339, |
| "grad_norm": 11.798931121826172, |
| "learning_rate": 5.6316063789679415e-06, |
| "loss": 1.6676, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.10888178087945462, |
| "grad_norm": 8.72461986541748, |
| "learning_rate": 5.569489977304029e-06, |
| "loss": 1.6311, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.10920487221737585, |
| "grad_norm": 10.633599281311035, |
| "learning_rate": 5.507604215703729e-06, |
| "loss": 1.8677, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.10952796355529708, |
| "grad_norm": 10.684865951538086, |
| "learning_rate": 5.44595163804831e-06, |
| "loss": 1.5938, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.10985105489321831, |
| "grad_norm": 8.996574401855469, |
| "learning_rate": 5.384534778633763e-06, |
| "loss": 1.7584, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.11017414623113954, |
| "grad_norm": 9.916984558105469, |
| "learning_rate": 5.323356162066626e-06, |
| "loss": 1.6673, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.11049723756906077, |
| "grad_norm": 8.516958236694336, |
| "learning_rate": 5.262418303160206e-06, |
| "loss": 1.7681, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.110820328906982, |
| "grad_norm": 9.242239952087402, |
| "learning_rate": 5.201723706831204e-06, |
| "loss": 1.8336, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.11114342024490323, |
| "grad_norm": 8.185578346252441, |
| "learning_rate": 5.141274867996755e-06, |
| "loss": 1.7451, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.11146651158282446, |
| "grad_norm": 7.650656700134277, |
| "learning_rate": 5.081074271471855e-06, |
| "loss": 1.6938, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.11146651158282446, |
| "eval_loss": 1.698238730430603, |
| "eval_runtime": 48.6823, |
| "eval_samples_per_second": 10.291, |
| "eval_steps_per_second": 10.291, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.11178960292074569, |
| "grad_norm": 7.296802997589111, |
| "learning_rate": 5.021124391867241e-06, |
| "loss": 1.8332, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.11211269425866692, |
| "grad_norm": 13.779586791992188, |
| "learning_rate": 4.961427693487654e-06, |
| "loss": 1.7706, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.11243578559658815, |
| "grad_norm": 9.145750045776367, |
| "learning_rate": 4.901986630230549e-06, |
| "loss": 1.6575, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.11275887693450938, |
| "grad_norm": 8.650459289550781, |
| "learning_rate": 4.842803645485228e-06, |
| "loss": 1.7603, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.11308196827243061, |
| "grad_norm": 11.718855857849121, |
| "learning_rate": 4.7838811720323795e-06, |
| "loss": 1.7843, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.11340505961035184, |
| "grad_norm": 11.075895309448242, |
| "learning_rate": 4.725221631944109e-06, |
| "loss": 1.794, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.11372815094827307, |
| "grad_norm": 9.832283020019531, |
| "learning_rate": 4.666827436484355e-06, |
| "loss": 1.7766, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.1140512422861943, |
| "grad_norm": 8.829045295715332, |
| "learning_rate": 4.60870098600978e-06, |
| "loss": 1.6138, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.11437433362411553, |
| "grad_norm": 12.860816955566406, |
| "learning_rate": 4.550844669871095e-06, |
| "loss": 1.6731, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.11469742496203676, |
| "grad_norm": 10.444979667663574, |
| "learning_rate": 4.493260866314851e-06, |
| "loss": 1.7394, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.11502051629995799, |
| "grad_norm": 12.831863403320312, |
| "learning_rate": 4.435951942385671e-06, |
| "loss": 1.8, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.11534360763787922, |
| "grad_norm": 11.071577072143555, |
| "learning_rate": 4.378920253828953e-06, |
| "loss": 1.7539, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.11566669897580045, |
| "grad_norm": 6.664394378662109, |
| "learning_rate": 4.322168144994041e-06, |
| "loss": 1.741, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.11598979031372168, |
| "grad_norm": 9.661178588867188, |
| "learning_rate": 4.265697948737836e-06, |
| "loss": 1.6935, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.11631288165164291, |
| "grad_norm": 7.41594934463501, |
| "learning_rate": 4.209511986328935e-06, |
| "loss": 1.6957, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.11631288165164291, |
| "eval_loss": 1.6976414918899536, |
| "eval_runtime": 49.1478, |
| "eval_samples_per_second": 10.194, |
| "eval_steps_per_second": 10.194, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.11663597298956414, |
| "grad_norm": 8.112351417541504, |
| "learning_rate": 4.153612567352186e-06, |
| "loss": 1.6652, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.11695906432748537, |
| "grad_norm": 8.311092376708984, |
| "learning_rate": 4.098001989613763e-06, |
| "loss": 1.7136, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.1172821556654066, |
| "grad_norm": 10.395671844482422, |
| "learning_rate": 4.042682539046698e-06, |
| "loss": 1.7171, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.11760524700332783, |
| "grad_norm": 10.424238204956055, |
| "learning_rate": 3.987656489616937e-06, |
| "loss": 1.7596, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.11792833834124906, |
| "grad_norm": 7.314935684204102, |
| "learning_rate": 3.932926103229849e-06, |
| "loss": 1.6147, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.1182514296791703, |
| "grad_norm": 10.446200370788574, |
| "learning_rate": 3.878493629637249e-06, |
| "loss": 1.6668, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.11857452101709154, |
| "grad_norm": 7.69391393661499, |
| "learning_rate": 3.824361306344942e-06, |
| "loss": 1.8314, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.11889761235501277, |
| "grad_norm": 9.985037803649902, |
| "learning_rate": 3.7705313585207056e-06, |
| "loss": 1.747, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.119220703692934, |
| "grad_norm": 8.62292194366455, |
| "learning_rate": 3.717005998902859e-06, |
| "loss": 1.8816, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.11954379503085523, |
| "grad_norm": 8.038886070251465, |
| "learning_rate": 3.6637874277092946e-06, |
| "loss": 1.7347, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.11986688636877646, |
| "grad_norm": 9.587797164916992, |
| "learning_rate": 3.610877832547034e-06, |
| "loss": 1.7352, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.12018997770669769, |
| "grad_norm": 7.319656848907471, |
| "learning_rate": 3.5582793883222923e-06, |
| "loss": 1.8048, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.12051306904461892, |
| "grad_norm": 11.947138786315918, |
| "learning_rate": 3.5059942571511037e-06, |
| "loss": 1.6342, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.12083616038254015, |
| "grad_norm": 10.702777862548828, |
| "learning_rate": 3.4540245882704213e-06, |
| "loss": 1.6056, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.12115925172046138, |
| "grad_norm": 8.408270835876465, |
| "learning_rate": 3.4023725179497848e-06, |
| "loss": 1.6606, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.12115925172046138, |
| "eval_loss": 1.6962512731552124, |
| "eval_runtime": 49.6842, |
| "eval_samples_per_second": 10.084, |
| "eval_steps_per_second": 10.084, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.12148234305838261, |
| "grad_norm": 8.158541679382324, |
| "learning_rate": 3.351040169403499e-06, |
| "loss": 1.5587, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.12180543439630384, |
| "grad_norm": 7.553473949432373, |
| "learning_rate": 3.30002965270335e-06, |
| "loss": 1.5088, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.12212852573422507, |
| "grad_norm": 7.5580267906188965, |
| "learning_rate": 3.2493430646918865e-06, |
| "loss": 1.6262, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.1224516170721463, |
| "grad_norm": 9.70615005493164, |
| "learning_rate": 3.1989824888962225e-06, |
| "loss": 1.8517, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.12277470841006753, |
| "grad_norm": 10.053832054138184, |
| "learning_rate": 3.1489499954423797e-06, |
| "loss": 1.8122, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.12309779974798876, |
| "grad_norm": 7.846384048461914, |
| "learning_rate": 3.0992476409701936e-06, |
| "loss": 1.7561, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.12342089108590999, |
| "grad_norm": 8.455656051635742, |
| "learning_rate": 3.0498774685487882e-06, |
| "loss": 1.5963, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.12374398242383122, |
| "grad_norm": 9.971628189086914, |
| "learning_rate": 3.000841507592583e-06, |
| "loss": 1.7193, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.12406707376175245, |
| "grad_norm": 9.318702697753906, |
| "learning_rate": 2.9521417737778717e-06, |
| "loss": 1.6967, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.12439016509967368, |
| "grad_norm": 7.269077301025391, |
| "learning_rate": 2.9037802689599704e-06, |
| "loss": 1.7184, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.12471325643759491, |
| "grad_norm": 9.587904930114746, |
| "learning_rate": 2.855758981090918e-06, |
| "loss": 1.6776, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.12503634777551614, |
| "grad_norm": 8.016568183898926, |
| "learning_rate": 2.8080798841377743e-06, |
| "loss": 1.6396, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.12535943911343736, |
| "grad_norm": 9.123952865600586, |
| "learning_rate": 2.7607449380014703e-06, |
| "loss": 1.7405, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.1256825304513586, |
| "grad_norm": 9.068984031677246, |
| "learning_rate": 2.713756088436244e-06, |
| "loss": 1.6592, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.12600562178927982, |
| "grad_norm": 8.082880020141602, |
| "learning_rate": 2.6671152669696515e-06, |
| "loss": 1.6929, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.12600562178927982, |
| "eval_loss": 1.6942435503005981, |
| "eval_runtime": 45.0696, |
| "eval_samples_per_second": 11.116, |
| "eval_steps_per_second": 11.116, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.12632871312720106, |
| "grad_norm": 9.814338684082031, |
| "learning_rate": 2.6208243908231916e-06, |
| "loss": 1.6763, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.12665180446512228, |
| "grad_norm": 7.890253067016602, |
| "learning_rate": 2.57488536283347e-06, |
| "loss": 1.7072, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.12697489580304352, |
| "grad_norm": 7.890439510345459, |
| "learning_rate": 2.5293000713739977e-06, |
| "loss": 1.8804, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.12729798714096474, |
| "grad_norm": 7.131802082061768, |
| "learning_rate": 2.4840703902775642e-06, |
| "loss": 1.7909, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.12762107847888599, |
| "grad_norm": 7.198910713195801, |
| "learning_rate": 2.4391981787592005e-06, |
| "loss": 1.7132, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.1279441698168072, |
| "grad_norm": 10.655743598937988, |
| "learning_rate": 2.3946852813397737e-06, |
| "loss": 1.6567, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.12826726115472845, |
| "grad_norm": 16.728164672851562, |
| "learning_rate": 2.3505335277701494e-06, |
| "loss": 1.7548, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.12859035249264966, |
| "grad_norm": 10.79677963256836, |
| "learning_rate": 2.306744732955991e-06, |
| "loss": 1.7443, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.1289134438305709, |
| "grad_norm": 11.85908317565918, |
| "learning_rate": 2.2633206968831374e-06, |
| "loss": 1.6827, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.12923653516849212, |
| "grad_norm": 7.376532554626465, |
| "learning_rate": 2.220263204543635e-06, |
| "loss": 1.6962, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.12955962650641337, |
| "grad_norm": 9.859908103942871, |
| "learning_rate": 2.1775740258623492e-06, |
| "loss": 1.7198, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.12988271784433458, |
| "grad_norm": 8.99048900604248, |
| "learning_rate": 2.1352549156242126e-06, |
| "loss": 1.8287, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.13020580918225583, |
| "grad_norm": 10.75970458984375, |
| "learning_rate": 2.0933076134020958e-06, |
| "loss": 1.7958, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.13052890052017704, |
| "grad_norm": 11.328814506530762, |
| "learning_rate": 2.0517338434852946e-06, |
| "loss": 1.6765, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.1308519918580983, |
| "grad_norm": 9.293211936950684, |
| "learning_rate": 2.010535314808659e-06, |
| "loss": 1.7851, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.1308519918580983, |
| "eval_loss": 1.6941063404083252, |
| "eval_runtime": 50.0573, |
| "eval_samples_per_second": 10.009, |
| "eval_steps_per_second": 10.009, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.1311750831960195, |
| "grad_norm": 9.791179656982422, |
| "learning_rate": 1.9697137208823396e-06, |
| "loss": 1.8117, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.13149817453394075, |
| "grad_norm": 8.991856575012207, |
| "learning_rate": 1.9292707397221775e-06, |
| "loss": 1.6096, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.13182126587186196, |
| "grad_norm": 9.120182991027832, |
| "learning_rate": 1.8892080337807171e-06, |
| "loss": 1.7314, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.1321443572097832, |
| "grad_norm": 10.0064697265625, |
| "learning_rate": 1.8495272498788887e-06, |
| "loss": 1.6805, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.13246744854770443, |
| "grad_norm": 6.52011251449585, |
| "learning_rate": 1.8102300191383008e-06, |
| "loss": 1.6526, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.13279053988562567, |
| "grad_norm": 7.259543418884277, |
| "learning_rate": 1.7713179569141897e-06, |
| "loss": 1.8271, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.13311363122354689, |
| "grad_norm": 10.133992195129395, |
| "learning_rate": 1.7327926627290298e-06, |
| "loss": 1.7711, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.13343672256146813, |
| "grad_norm": 7.308538913726807, |
| "learning_rate": 1.6946557202067662e-06, |
| "loss": 1.6875, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.13375981389938935, |
| "grad_norm": 10.048575401306152, |
| "learning_rate": 1.6569086970077352e-06, |
| "loss": 1.705, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.1340829052373106, |
| "grad_norm": 10.202974319458008, |
| "learning_rate": 1.6195531447642177e-06, |
| "loss": 1.755, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.1344059965752318, |
| "grad_norm": 9.808621406555176, |
| "learning_rate": 1.582590599016653e-06, |
| "loss": 1.7341, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.13472908791315305, |
| "grad_norm": 13.069711685180664, |
| "learning_rate": 1.5460225791505258e-06, |
| "loss": 1.621, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.13505217925107427, |
| "grad_norm": 9.029712677001953, |
| "learning_rate": 1.509850588333905e-06, |
| "loss": 1.7657, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.1353752705889955, |
| "grad_norm": 9.384182929992676, |
| "learning_rate": 1.4740761134556557e-06, |
| "loss": 1.7284, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.13569836192691673, |
| "grad_norm": 12.77877426147461, |
| "learning_rate": 1.4387006250643236e-06, |
| "loss": 1.6129, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.13569836192691673, |
| "eval_loss": 1.6926029920578003, |
| "eval_runtime": 45.4397, |
| "eval_samples_per_second": 11.026, |
| "eval_steps_per_second": 11.026, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.13602145326483797, |
| "grad_norm": 7.0634846687316895, |
| "learning_rate": 1.4037255773076804e-06, |
| "loss": 1.7018, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.1363445446027592, |
| "grad_norm": 9.692054748535156, |
| "learning_rate": 1.3691524078729481e-06, |
| "loss": 1.7729, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.13666763594068043, |
| "grad_norm": 7.554727554321289, |
| "learning_rate": 1.3349825379277099e-06, |
| "loss": 1.5513, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.13699072727860165, |
| "grad_norm": 10.388784408569336, |
| "learning_rate": 1.3012173720614862e-06, |
| "loss": 1.7794, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.1373138186165229, |
| "grad_norm": 8.613759994506836, |
| "learning_rate": 1.267858298227995e-06, |
| "loss": 1.8116, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.1376369099544441, |
| "grad_norm": 9.194704055786133, |
| "learning_rate": 1.2349066876881063e-06, |
| "loss": 1.7516, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.13796000129236535, |
| "grad_norm": 7.637813091278076, |
| "learning_rate": 1.202363894953462e-06, |
| "loss": 1.8919, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.13828309263028657, |
| "grad_norm": 7.157520771026611, |
| "learning_rate": 1.1702312577308133e-06, |
| "loss": 1.8295, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.1386061839682078, |
| "grad_norm": 7.413049221038818, |
| "learning_rate": 1.1385100968670189e-06, |
| "loss": 1.6244, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.13892927530612903, |
| "grad_norm": 13.795228958129883, |
| "learning_rate": 1.107201716294762e-06, |
| "loss": 1.6479, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.13925236664405027, |
| "grad_norm": 12.399872779846191, |
| "learning_rate": 1.076307402978938e-06, |
| "loss": 1.6793, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.1395754579819715, |
| "grad_norm": 8.875575065612793, |
| "learning_rate": 1.0458284268637652e-06, |
| "loss": 1.7353, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.13989854931989273, |
| "grad_norm": 7.911688804626465, |
| "learning_rate": 1.0157660408205728e-06, |
| "loss": 1.6169, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.14022164065781395, |
| "grad_norm": 9.808359146118164, |
| "learning_rate": 9.861214805963042e-07, |
| "loss": 1.7996, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.1405447319957352, |
| "grad_norm": 10.307291030883789, |
| "learning_rate": 9.568959647627223e-07, |
| "loss": 1.6639, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.1405447319957352, |
| "eval_loss": 1.6928385496139526, |
| "eval_runtime": 49.1761, |
| "eval_samples_per_second": 10.188, |
| "eval_steps_per_second": 10.188, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.1408678233336564, |
| "grad_norm": 12.124625205993652, |
| "learning_rate": 9.280906946663111e-07, |
| "loss": 1.797, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.14119091467157766, |
| "grad_norm": 10.66218376159668, |
| "learning_rate": 8.997068543789051e-07, |
| "loss": 1.6894, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.14151400600949887, |
| "grad_norm": 11.1690673828125, |
| "learning_rate": 8.717456106490042e-07, |
| "loss": 1.7584, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.14183709734742012, |
| "grad_norm": 9.9891357421875, |
| "learning_rate": 8.442081128538243e-07, |
| "loss": 1.6333, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.14216018868534133, |
| "grad_norm": 8.535771369934082, |
| "learning_rate": 8.170954929520389e-07, |
| "loss": 1.6837, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.14248328002326258, |
| "grad_norm": 9.602922439575195, |
| "learning_rate": 7.904088654372622e-07, |
| "loss": 1.6985, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.14280637136118382, |
| "grad_norm": 6.4636688232421875, |
| "learning_rate": 7.641493272922243e-07, |
| "loss": 1.7214, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.14312946269910504, |
| "grad_norm": 10.1285982131958, |
| "learning_rate": 7.383179579436903e-07, |
| "loss": 1.7285, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.14345255403702628, |
| "grad_norm": 10.605436325073242, |
| "learning_rate": 7.129158192180766e-07, |
| "loss": 1.6752, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.1437756453749475, |
| "grad_norm": 8.596016883850098, |
| "learning_rate": 6.879439552978142e-07, |
| "loss": 1.7964, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.14409873671286874, |
| "grad_norm": 10.723006248474121, |
| "learning_rate": 6.634033926784221e-07, |
| "loss": 1.7373, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.14442182805078996, |
| "grad_norm": 7.131843090057373, |
| "learning_rate": 6.392951401263069e-07, |
| "loss": 1.7181, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.1447449193887112, |
| "grad_norm": 8.122928619384766, |
| "learning_rate": 6.156201886373113e-07, |
| "loss": 1.7157, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.14506801072663242, |
| "grad_norm": 7.9331583976745605, |
| "learning_rate": 5.923795113959569e-07, |
| "loss": 1.6884, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.14539110206455366, |
| "grad_norm": 10.320080757141113, |
| "learning_rate": 5.695740637354591e-07, |
| "loss": 1.7235, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.14539110206455366, |
| "eval_loss": 1.6921895742416382, |
| "eval_runtime": 49.8365, |
| "eval_samples_per_second": 10.053, |
| "eval_steps_per_second": 10.053, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.14571419340247488, |
| "grad_norm": 7.294159889221191, |
| "learning_rate": 5.472047830984499e-07, |
| "loss": 1.7577, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.14603728474039612, |
| "grad_norm": 9.507523536682129, |
| "learning_rate": 5.252725889984403e-07, |
| "loss": 1.7748, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.14636037607831734, |
| "grad_norm": 10.296547889709473, |
| "learning_rate": 5.037783829820298e-07, |
| "loss": 1.6676, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.14668346741623858, |
| "grad_norm": 10.683934211730957, |
| "learning_rate": 4.827230485918372e-07, |
| "loss": 1.703, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.1470065587541598, |
| "grad_norm": 13.149202346801758, |
| "learning_rate": 4.6210745133019236e-07, |
| "loss": 1.8596, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.14732965009208104, |
| "grad_norm": 10.627421379089355, |
| "learning_rate": 4.419324386235529e-07, |
| "loss": 1.5863, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.14765274143000226, |
| "grad_norm": 8.185441970825195, |
| "learning_rate": 4.2219883978767386e-07, |
| "loss": 1.7421, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.1479758327679235, |
| "grad_norm": 6.5582804679870605, |
| "learning_rate": 4.029074659935082e-07, |
| "loss": 1.7486, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.14829892410584472, |
| "grad_norm": 7.293984413146973, |
| "learning_rate": 3.8405911023387444e-07, |
| "loss": 1.7631, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.14862201544376596, |
| "grad_norm": 10.495855331420898, |
| "learning_rate": 3.6565454729085526e-07, |
| "loss": 1.8289, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.14894510678168718, |
| "grad_norm": 7.07685661315918, |
| "learning_rate": 3.4769453370394753e-07, |
| "loss": 1.6386, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.14926819811960843, |
| "grad_norm": 8.069764137268066, |
| "learning_rate": 3.301798077389637e-07, |
| "loss": 1.585, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.14959128945752964, |
| "grad_norm": 8.399779319763184, |
| "learning_rate": 3.1311108935768926e-07, |
| "loss": 1.5544, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.14991438079545089, |
| "grad_norm": 7.10072660446167, |
| "learning_rate": 2.964890801882817e-07, |
| "loss": 1.7765, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.1502374721333721, |
| "grad_norm": 12.693696022033691, |
| "learning_rate": 2.8031446349643393e-07, |
| "loss": 1.5691, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.1502374721333721, |
| "eval_loss": 1.6924811601638794, |
| "eval_runtime": 50.4888, |
| "eval_samples_per_second": 9.923, |
| "eval_steps_per_second": 9.923, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.15056056347129335, |
| "grad_norm": 8.841912269592285, |
| "learning_rate": 2.645879041572891e-07, |
| "loss": 1.6589, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.15088365480921456, |
| "grad_norm": 7.690126895904541, |
| "learning_rate": 2.4931004862810295e-07, |
| "loss": 1.7137, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.1512067461471358, |
| "grad_norm": 14.600467681884766, |
| "learning_rate": 2.3448152492167586e-07, |
| "loss": 1.8001, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.15152983748505702, |
| "grad_norm": 8.619688034057617, |
| "learning_rate": 2.201029425805393e-07, |
| "loss": 1.7615, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.15185292882297827, |
| "grad_norm": 12.033727645874023, |
| "learning_rate": 2.061748926518972e-07, |
| "loss": 1.6317, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.15217602016089948, |
| "grad_norm": 9.276659965515137, |
| "learning_rate": 1.9269794766333073e-07, |
| "loss": 1.6155, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.15249911149882073, |
| "grad_norm": 8.645523071289062, |
| "learning_rate": 1.7967266159925864e-07, |
| "loss": 1.5958, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.15282220283674194, |
| "grad_norm": 13.718961715698242, |
| "learning_rate": 1.670995698781777e-07, |
| "loss": 1.5768, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.1531452941746632, |
| "grad_norm": 12.2525634765625, |
| "learning_rate": 1.549791893306424e-07, |
| "loss": 1.571, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.1534683855125844, |
| "grad_norm": 7.851583003997803, |
| "learning_rate": 1.4331201817802332e-07, |
| "loss": 1.7923, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.15379147685050565, |
| "grad_norm": 10.048659324645996, |
| "learning_rate": 1.320985360120322e-07, |
| "loss": 1.7102, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.15411456818842686, |
| "grad_norm": 9.430795669555664, |
| "learning_rate": 1.2133920377499848e-07, |
| "loss": 1.6879, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.1544376595263481, |
| "grad_norm": 12.329809188842773, |
| "learning_rate": 1.1103446374092981e-07, |
| "loss": 1.7557, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.15476075086426933, |
| "grad_norm": 11.180129051208496, |
| "learning_rate": 1.0118473949732765e-07, |
| "loss": 1.7791, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.15508384220219057, |
| "grad_norm": 8.690634727478027, |
| "learning_rate": 9.179043592777716e-08, |
| "loss": 1.6464, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.15508384220219057, |
| "eval_loss": 1.6925097703933716, |
| "eval_runtime": 44.8573, |
| "eval_samples_per_second": 11.169, |
| "eval_steps_per_second": 11.169, |
| "step": 4800 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 150, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.0155209275981824e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |