{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.07272727272727272,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003636363636363636,
      "grad_norm": 0.47871133685112,
      "learning_rate": 4.999090909090909e-05,
      "loss": 1.4412,
      "step": 10
    },
    {
      "epoch": 0.0007272727272727272,
      "grad_norm": 0.4764420688152313,
      "learning_rate": 4.998181818181818e-05,
      "loss": 1.3082,
      "step": 20
    },
    {
      "epoch": 0.001090909090909091,
      "grad_norm": 0.6560420989990234,
      "learning_rate": 4.997272727272728e-05,
      "loss": 1.3592,
      "step": 30
    },
    {
      "epoch": 0.0014545454545454545,
      "grad_norm": 0.7296062111854553,
      "learning_rate": 4.996363636363637e-05,
      "loss": 1.1219,
      "step": 40
    },
    {
      "epoch": 0.0018181818181818182,
      "grad_norm": 0.46472349762916565,
      "learning_rate": 4.995454545454546e-05,
      "loss": 1.0707,
      "step": 50
    },
    {
      "epoch": 0.002181818181818182,
      "grad_norm": 0.5377005338668823,
      "learning_rate": 4.994545454545454e-05,
      "loss": 1.1453,
      "step": 60
    },
    {
      "epoch": 0.0025454545454545456,
      "grad_norm": 0.5077579021453857,
      "learning_rate": 4.993636363636364e-05,
      "loss": 1.1129,
      "step": 70
    },
    {
      "epoch": 0.002909090909090909,
      "grad_norm": 0.5722835063934326,
      "learning_rate": 4.992727272727273e-05,
      "loss": 1.1029,
      "step": 80
    },
    {
      "epoch": 0.0032727272727272726,
      "grad_norm": 0.5486398339271545,
      "learning_rate": 4.991818181818182e-05,
      "loss": 1.0021,
      "step": 90
    },
    {
      "epoch": 0.0036363636363636364,
      "grad_norm": 0.6710432171821594,
      "learning_rate": 4.990909090909091e-05,
      "loss": 1.0766,
      "step": 100
    },
    {
      "epoch": 0.004,
      "grad_norm": 0.45256921648979187,
      "learning_rate": 4.99e-05,
      "loss": 1.0424,
      "step": 110
    },
    {
      "epoch": 0.004363636363636364,
      "grad_norm": 0.5884903073310852,
      "learning_rate": 4.98909090909091e-05,
      "loss": 1.1271,
      "step": 120
    },
    {
      "epoch": 0.0047272727272727275,
      "grad_norm": 0.8489357233047485,
      "learning_rate": 4.988181818181819e-05,
      "loss": 1.1285,
      "step": 130
    },
    {
      "epoch": 0.005090909090909091,
      "grad_norm": 0.6435407996177673,
      "learning_rate": 4.987272727272728e-05,
      "loss": 1.1967,
      "step": 140
    },
    {
      "epoch": 0.005454545454545455,
      "grad_norm": 0.5722768306732178,
      "learning_rate": 4.986363636363637e-05,
      "loss": 1.0342,
      "step": 150
    },
    {
      "epoch": 0.005818181818181818,
      "grad_norm": 0.4862392544746399,
      "learning_rate": 4.985454545454546e-05,
      "loss": 0.9976,
      "step": 160
    },
    {
      "epoch": 0.0061818181818181816,
      "grad_norm": 0.7204858064651489,
      "learning_rate": 4.984545454545455e-05,
      "loss": 1.0818,
      "step": 170
    },
    {
      "epoch": 0.006545454545454545,
      "grad_norm": 0.727836549282074,
      "learning_rate": 4.983636363636364e-05,
      "loss": 1.0893,
      "step": 180
    },
    {
      "epoch": 0.006909090909090909,
      "grad_norm": 0.4965401589870453,
      "learning_rate": 4.982727272727273e-05,
      "loss": 1.01,
      "step": 190
    },
    {
      "epoch": 0.007272727272727273,
      "grad_norm": 0.6770114898681641,
      "learning_rate": 4.981818181818182e-05,
      "loss": 0.9949,
      "step": 200
    },
    {
      "epoch": 0.0076363636363636364,
      "grad_norm": 0.795165479183197,
      "learning_rate": 4.980909090909091e-05,
      "loss": 1.1506,
      "step": 210
    },
    {
      "epoch": 0.008,
      "grad_norm": 0.7894571423530579,
      "learning_rate": 4.9800000000000004e-05,
      "loss": 1.0309,
      "step": 220
    },
    {
      "epoch": 0.008363636363636363,
      "grad_norm": 0.9198132753372192,
      "learning_rate": 4.9790909090909094e-05,
      "loss": 0.9611,
      "step": 230
    },
    {
      "epoch": 0.008727272727272728,
      "grad_norm": 0.8672171831130981,
      "learning_rate": 4.9781818181818184e-05,
      "loss": 1.1689,
      "step": 240
    },
    {
      "epoch": 0.00909090909090909,
      "grad_norm": 0.6119312644004822,
      "learning_rate": 4.9772727272727275e-05,
      "loss": 0.9891,
      "step": 250
    },
    {
      "epoch": 0.009454545454545455,
      "grad_norm": 0.7098168134689331,
      "learning_rate": 4.9763636363636365e-05,
      "loss": 0.9369,
      "step": 260
    },
    {
      "epoch": 0.009818181818181818,
      "grad_norm": 0.7419410347938538,
      "learning_rate": 4.975454545454546e-05,
      "loss": 1.0293,
      "step": 270
    },
    {
      "epoch": 0.010181818181818183,
      "grad_norm": 0.6659941673278809,
      "learning_rate": 4.9745454545454545e-05,
      "loss": 1.0936,
      "step": 280
    },
    {
      "epoch": 0.010545454545454545,
      "grad_norm": 0.8974291682243347,
      "learning_rate": 4.9736363636363635e-05,
      "loss": 1.1088,
      "step": 290
    },
    {
      "epoch": 0.01090909090909091,
      "grad_norm": 0.7319221496582031,
      "learning_rate": 4.9727272727272725e-05,
      "loss": 0.9916,
      "step": 300
    },
    {
      "epoch": 0.011272727272727273,
      "grad_norm": 0.9040235280990601,
      "learning_rate": 4.971818181818182e-05,
      "loss": 1.1141,
      "step": 310
    },
    {
      "epoch": 0.011636363636363636,
      "grad_norm": 0.7666190266609192,
      "learning_rate": 4.970909090909091e-05,
      "loss": 1.0309,
      "step": 320
    },
    {
      "epoch": 0.012,
      "grad_norm": 0.7975471615791321,
      "learning_rate": 4.97e-05,
      "loss": 1.0,
      "step": 330
    },
    {
      "epoch": 0.012363636363636363,
      "grad_norm": 0.7558386325836182,
      "learning_rate": 4.969090909090909e-05,
      "loss": 1.1283,
      "step": 340
    },
    {
      "epoch": 0.012727272727272728,
      "grad_norm": 0.76316899061203,
      "learning_rate": 4.968181818181818e-05,
      "loss": 1.01,
      "step": 350
    },
    {
      "epoch": 0.01309090909090909,
      "grad_norm": 0.5065900683403015,
      "learning_rate": 4.967272727272728e-05,
      "loss": 1.002,
      "step": 360
    },
    {
      "epoch": 0.013454545454545455,
      "grad_norm": 1.0256422758102417,
      "learning_rate": 4.966363636363637e-05,
      "loss": 1.0912,
      "step": 370
    },
    {
      "epoch": 0.013818181818181818,
      "grad_norm": 0.734935998916626,
      "learning_rate": 4.965454545454546e-05,
      "loss": 1.102,
      "step": 380
    },
    {
      "epoch": 0.014181818181818183,
      "grad_norm": 0.6911085844039917,
      "learning_rate": 4.964545454545455e-05,
      "loss": 1.0479,
      "step": 390
    },
    {
      "epoch": 0.014545454545454545,
      "grad_norm": 0.778714120388031,
      "learning_rate": 4.963636363636364e-05,
      "loss": 1.0379,
      "step": 400
    },
    {
      "epoch": 0.014909090909090908,
      "grad_norm": 0.6882097721099854,
      "learning_rate": 4.962727272727273e-05,
      "loss": 0.9732,
      "step": 410
    },
    {
      "epoch": 0.015272727272727273,
      "grad_norm": 0.747656524181366,
      "learning_rate": 4.961818181818182e-05,
      "loss": 1.0602,
      "step": 420
    },
    {
      "epoch": 0.015636363636363636,
      "grad_norm": 0.7968394160270691,
      "learning_rate": 4.960909090909091e-05,
      "loss": 1.1314,
      "step": 430
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.9477614760398865,
      "learning_rate": 4.96e-05,
      "loss": 1.0488,
      "step": 440
    },
    {
      "epoch": 0.016363636363636365,
      "grad_norm": 0.7730783820152283,
      "learning_rate": 4.9590909090909096e-05,
      "loss": 1.1314,
      "step": 450
    },
    {
      "epoch": 0.016727272727272726,
      "grad_norm": 1.1297029256820679,
      "learning_rate": 4.9581818181818186e-05,
      "loss": 1.0943,
      "step": 460
    },
    {
      "epoch": 0.01709090909090909,
      "grad_norm": 0.7387447357177734,
      "learning_rate": 4.9572727272727276e-05,
      "loss": 1.0711,
      "step": 470
    },
    {
      "epoch": 0.017454545454545455,
      "grad_norm": 0.5228479504585266,
      "learning_rate": 4.9563636363636367e-05,
      "loss": 1.0711,
      "step": 480
    },
    {
      "epoch": 0.01781818181818182,
      "grad_norm": 0.5137602090835571,
      "learning_rate": 4.9554545454545457e-05,
      "loss": 0.9242,
      "step": 490
    },
    {
      "epoch": 0.01818181818181818,
      "grad_norm": 0.859491229057312,
      "learning_rate": 4.9545454545454553e-05,
      "loss": 1.124,
      "step": 500
    },
    {
      "epoch": 0.018545454545454546,
      "grad_norm": 0.7495261430740356,
      "learning_rate": 4.953636363636364e-05,
      "loss": 1.1184,
      "step": 510
    },
    {
      "epoch": 0.01890909090909091,
      "grad_norm": 0.8758722543716431,
      "learning_rate": 4.952727272727273e-05,
      "loss": 1.1215,
      "step": 520
    },
    {
      "epoch": 0.01927272727272727,
      "grad_norm": 0.7047644257545471,
      "learning_rate": 4.951818181818182e-05,
      "loss": 1.0416,
      "step": 530
    },
    {
      "epoch": 0.019636363636363636,
      "grad_norm": 0.9179386496543884,
      "learning_rate": 4.9509090909090914e-05,
      "loss": 1.0404,
      "step": 540
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.8945435881614685,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 1.0211,
      "step": 550
    },
    {
      "epoch": 0.020363636363636365,
      "grad_norm": 0.49766021966934204,
      "learning_rate": 4.9490909090909094e-05,
      "loss": 0.9262,
      "step": 560
    },
    {
      "epoch": 0.020727272727272726,
      "grad_norm": 0.8015190958976746,
      "learning_rate": 4.9481818181818184e-05,
      "loss": 1.085,
      "step": 570
    },
    {
      "epoch": 0.02109090909090909,
      "grad_norm": 0.8135480284690857,
      "learning_rate": 4.9472727272727274e-05,
      "loss": 0.9673,
      "step": 580
    },
    {
      "epoch": 0.021454545454545455,
      "grad_norm": 0.7892040610313416,
      "learning_rate": 4.946363636363637e-05,
      "loss": 1.0301,
      "step": 590
    },
    {
      "epoch": 0.02181818181818182,
      "grad_norm": 0.6644595265388489,
      "learning_rate": 4.945454545454546e-05,
      "loss": 0.9969,
      "step": 600
    },
    {
      "epoch": 0.02218181818181818,
      "grad_norm": 0.6504725813865662,
      "learning_rate": 4.944545454545455e-05,
      "loss": 1.0686,
      "step": 610
    },
    {
      "epoch": 0.022545454545454546,
      "grad_norm": 0.5474579930305481,
      "learning_rate": 4.9436363636363634e-05,
      "loss": 1.0338,
      "step": 620
    },
    {
      "epoch": 0.02290909090909091,
      "grad_norm": 1.133872151374817,
      "learning_rate": 4.9427272727272724e-05,
      "loss": 1.0355,
      "step": 630
    },
    {
      "epoch": 0.02327272727272727,
      "grad_norm": 0.9010602235794067,
      "learning_rate": 4.941818181818182e-05,
      "loss": 1.0205,
      "step": 640
    },
    {
      "epoch": 0.023636363636363636,
      "grad_norm": 0.6271716952323914,
      "learning_rate": 4.940909090909091e-05,
      "loss": 0.9688,
      "step": 650
    },
    {
      "epoch": 0.024,
      "grad_norm": 0.7340686917304993,
      "learning_rate": 4.94e-05,
      "loss": 1.058,
      "step": 660
    },
    {
      "epoch": 0.024363636363636365,
      "grad_norm": 0.9811447262763977,
      "learning_rate": 4.939090909090909e-05,
      "loss": 1.0941,
      "step": 670
    },
    {
      "epoch": 0.024727272727272726,
      "grad_norm": 0.8026047348976135,
      "learning_rate": 4.938181818181818e-05,
      "loss": 1.0148,
      "step": 680
    },
    {
      "epoch": 0.02509090909090909,
      "grad_norm": 0.8320631980895996,
      "learning_rate": 4.937272727272728e-05,
      "loss": 1.0332,
      "step": 690
    },
    {
      "epoch": 0.025454545454545455,
      "grad_norm": 1.0077319145202637,
      "learning_rate": 4.936363636363637e-05,
      "loss": 0.882,
      "step": 700
    },
    {
      "epoch": 0.025818181818181817,
      "grad_norm": 0.792972981929779,
      "learning_rate": 4.935454545454546e-05,
      "loss": 0.9613,
      "step": 710
    },
    {
      "epoch": 0.02618181818181818,
      "grad_norm": 0.9640710353851318,
      "learning_rate": 4.934545454545455e-05,
      "loss": 0.9879,
      "step": 720
    },
    {
      "epoch": 0.026545454545454546,
      "grad_norm": 0.6762244701385498,
      "learning_rate": 4.933636363636364e-05,
      "loss": 0.9643,
      "step": 730
    },
    {
      "epoch": 0.02690909090909091,
      "grad_norm": 0.8114360570907593,
      "learning_rate": 4.932727272727273e-05,
      "loss": 1.0049,
      "step": 740
    },
    {
      "epoch": 0.02727272727272727,
      "grad_norm": 0.6531985998153687,
      "learning_rate": 4.931818181818182e-05,
      "loss": 0.9141,
      "step": 750
    },
    {
      "epoch": 0.027636363636363636,
      "grad_norm": 0.846741795539856,
      "learning_rate": 4.930909090909091e-05,
      "loss": 0.9738,
      "step": 760
    },
    {
      "epoch": 0.028,
      "grad_norm": 0.8146620392799377,
      "learning_rate": 4.93e-05,
      "loss": 1.0508,
      "step": 770
    },
    {
      "epoch": 0.028363636363636365,
      "grad_norm": 0.8905590176582336,
      "learning_rate": 4.9290909090909096e-05,
      "loss": 0.9883,
      "step": 780
    },
    {
      "epoch": 0.028727272727272726,
      "grad_norm": 0.820004403591156,
      "learning_rate": 4.9281818181818186e-05,
      "loss": 1.1111,
      "step": 790
    },
    {
      "epoch": 0.02909090909090909,
      "grad_norm": 0.6964171528816223,
      "learning_rate": 4.9272727272727276e-05,
      "loss": 0.9889,
      "step": 800
    },
    {
      "epoch": 0.029454545454545455,
      "grad_norm": 0.7463963627815247,
      "learning_rate": 4.9263636363636366e-05,
      "loss": 1.0017,
      "step": 810
    },
    {
      "epoch": 0.029818181818181817,
      "grad_norm": 1.1425820589065552,
      "learning_rate": 4.9254545454545456e-05,
      "loss": 1.0291,
      "step": 820
    },
    {
      "epoch": 0.03018181818181818,
      "grad_norm": 0.8121577501296997,
      "learning_rate": 4.924545454545455e-05,
      "loss": 1.1896,
      "step": 830
    },
    {
      "epoch": 0.030545454545454546,
      "grad_norm": 0.7760348320007324,
      "learning_rate": 4.9236363636363636e-05,
      "loss": 1.0086,
      "step": 840
    },
    {
      "epoch": 0.03090909090909091,
      "grad_norm": 1.0461779832839966,
      "learning_rate": 4.9227272727272726e-05,
      "loss": 1.067,
      "step": 850
    },
    {
      "epoch": 0.03127272727272727,
      "grad_norm": 0.7449011206626892,
      "learning_rate": 4.9218181818181816e-05,
      "loss": 0.9166,
      "step": 860
    },
    {
      "epoch": 0.031636363636363636,
      "grad_norm": 0.8614558577537537,
      "learning_rate": 4.920909090909091e-05,
      "loss": 1.0984,
      "step": 870
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.9124776721000671,
      "learning_rate": 4.92e-05,
      "loss": 0.9428,
      "step": 880
    },
    {
      "epoch": 0.032363636363636365,
      "grad_norm": 0.7346855998039246,
      "learning_rate": 4.919090909090909e-05,
      "loss": 0.9891,
      "step": 890
    },
    {
      "epoch": 0.03272727272727273,
      "grad_norm": 0.6888474822044373,
      "learning_rate": 4.9181818181818183e-05,
      "loss": 1.1186,
      "step": 900
    },
    {
      "epoch": 0.03309090909090909,
      "grad_norm": 0.5666208267211914,
      "learning_rate": 4.9172727272727273e-05,
      "loss": 0.9862,
      "step": 910
    },
    {
      "epoch": 0.03345454545454545,
      "grad_norm": 0.6969447731971741,
      "learning_rate": 4.916363636363637e-05,
      "loss": 1.09,
      "step": 920
    },
    {
      "epoch": 0.03381818181818182,
      "grad_norm": 0.7007094025611877,
      "learning_rate": 4.915454545454546e-05,
      "loss": 0.9951,
      "step": 930
    },
    {
      "epoch": 0.03418181818181818,
      "grad_norm": 0.8678598999977112,
      "learning_rate": 4.914545454545455e-05,
      "loss": 0.9574,
      "step": 940
    },
    {
      "epoch": 0.034545454545454546,
      "grad_norm": 0.6946694850921631,
      "learning_rate": 4.913636363636364e-05,
      "loss": 1.0867,
      "step": 950
    },
    {
      "epoch": 0.03490909090909091,
      "grad_norm": 0.7773414254188538,
      "learning_rate": 4.912727272727273e-05,
      "loss": 1.0025,
      "step": 960
    },
    {
      "epoch": 0.035272727272727275,
      "grad_norm": 0.8114984631538391,
      "learning_rate": 4.911818181818182e-05,
      "loss": 0.9855,
      "step": 970
    },
    {
      "epoch": 0.03563636363636364,
      "grad_norm": 0.6157673597335815,
      "learning_rate": 4.910909090909091e-05,
      "loss": 0.9988,
      "step": 980
    },
    {
      "epoch": 0.036,
      "grad_norm": 0.8051506280899048,
      "learning_rate": 4.91e-05,
      "loss": 0.9818,
      "step": 990
    },
    {
      "epoch": 0.03636363636363636,
      "grad_norm": 0.8241102695465088,
      "learning_rate": 4.909090909090909e-05,
      "loss": 0.9893,
      "step": 1000
    },
    {
      "epoch": 0.036727272727272726,
      "grad_norm": 0.7396079897880554,
      "learning_rate": 4.908181818181818e-05,
      "loss": 0.9775,
      "step": 1010
    },
    {
      "epoch": 0.03709090909090909,
      "grad_norm": 0.730338454246521,
      "learning_rate": 4.907272727272728e-05,
      "loss": 1.0172,
      "step": 1020
    },
    {
      "epoch": 0.037454545454545456,
      "grad_norm": 0.9117375016212463,
      "learning_rate": 4.906363636363637e-05,
      "loss": 1.0273,
      "step": 1030
    },
    {
      "epoch": 0.03781818181818182,
      "grad_norm": 0.699874997138977,
      "learning_rate": 4.905454545454546e-05,
      "loss": 0.9398,
      "step": 1040
    },
    {
      "epoch": 0.038181818181818185,
      "grad_norm": 0.724445641040802,
      "learning_rate": 4.904545454545455e-05,
      "loss": 0.9617,
      "step": 1050
    },
    {
      "epoch": 0.03854545454545454,
      "grad_norm": 0.7848714590072632,
      "learning_rate": 4.903636363636364e-05,
      "loss": 1.0037,
      "step": 1060
    },
    {
      "epoch": 0.03890909090909091,
      "grad_norm": 1.0549767017364502,
      "learning_rate": 4.902727272727273e-05,
      "loss": 1.0691,
      "step": 1070
    },
    {
      "epoch": 0.03927272727272727,
      "grad_norm": 0.8227099776268005,
      "learning_rate": 4.901818181818182e-05,
      "loss": 1.0037,
      "step": 1080
    },
    {
      "epoch": 0.039636363636363636,
      "grad_norm": 1.0453096628189087,
      "learning_rate": 4.900909090909091e-05,
      "loss": 1.0145,
      "step": 1090
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.8218994140625,
      "learning_rate": 4.9e-05,
      "loss": 0.9857,
      "step": 1100
    },
    {
      "epoch": 0.040363636363636365,
      "grad_norm": 0.76604825258255,
      "learning_rate": 4.8990909090909095e-05,
      "loss": 0.9576,
      "step": 1110
    },
    {
      "epoch": 0.04072727272727273,
      "grad_norm": 0.7639636397361755,
      "learning_rate": 4.8981818181818185e-05,
      "loss": 1.0402,
      "step": 1120
    },
    {
      "epoch": 0.04109090909090909,
      "grad_norm": 0.8489947319030762,
      "learning_rate": 4.8972727272727275e-05,
      "loss": 1.023,
      "step": 1130
    },
    {
      "epoch": 0.04145454545454545,
      "grad_norm": 0.7406574487686157,
      "learning_rate": 4.8963636363636365e-05,
      "loss": 1.0324,
      "step": 1140
    },
    {
      "epoch": 0.04181818181818182,
      "grad_norm": 0.6394333243370056,
      "learning_rate": 4.8954545454545456e-05,
      "loss": 0.9831,
      "step": 1150
    },
    {
      "epoch": 0.04218181818181818,
      "grad_norm": 0.9677541851997375,
      "learning_rate": 4.894545454545455e-05,
      "loss": 0.9928,
      "step": 1160
    },
    {
      "epoch": 0.042545454545454546,
      "grad_norm": 0.7677833437919617,
      "learning_rate": 4.893636363636364e-05,
      "loss": 0.9301,
      "step": 1170
    },
    {
      "epoch": 0.04290909090909091,
      "grad_norm": 0.6206602454185486,
      "learning_rate": 4.8927272727272726e-05,
      "loss": 1.0012,
      "step": 1180
    },
    {
      "epoch": 0.043272727272727275,
      "grad_norm": 0.8425969481468201,
      "learning_rate": 4.8918181818181816e-05,
      "loss": 1.1275,
      "step": 1190
    },
    {
      "epoch": 0.04363636363636364,
      "grad_norm": 0.7413673400878906,
      "learning_rate": 4.890909090909091e-05,
      "loss": 1.0117,
      "step": 1200
    },
    {
      "epoch": 0.044,
      "grad_norm": 0.9616818428039551,
      "learning_rate": 4.89e-05,
      "loss": 0.9658,
      "step": 1210
    },
    {
      "epoch": 0.04436363636363636,
      "grad_norm": 0.7987754344940186,
      "learning_rate": 4.889090909090909e-05,
      "loss": 0.9262,
      "step": 1220
    },
    {
      "epoch": 0.04472727272727273,
      "grad_norm": 0.8735281825065613,
      "learning_rate": 4.888181818181818e-05,
      "loss": 0.9414,
      "step": 1230
    },
    {
      "epoch": 0.04509090909090909,
      "grad_norm": 0.7942488789558411,
      "learning_rate": 4.887272727272727e-05,
      "loss": 1.0506,
      "step": 1240
    },
    {
      "epoch": 0.045454545454545456,
      "grad_norm": 0.8182056546211243,
      "learning_rate": 4.886363636363637e-05,
      "loss": 1.0721,
      "step": 1250
    },
    {
      "epoch": 0.04581818181818182,
      "grad_norm": 0.7209412455558777,
      "learning_rate": 4.885454545454546e-05,
      "loss": 1.0205,
      "step": 1260
    },
    {
      "epoch": 0.046181818181818185,
      "grad_norm": 0.824597954750061,
      "learning_rate": 4.884545454545455e-05,
      "loss": 0.9563,
      "step": 1270
    },
    {
      "epoch": 0.04654545454545454,
      "grad_norm": 0.6784900426864624,
      "learning_rate": 4.883636363636364e-05,
      "loss": 1.0971,
      "step": 1280
    },
    {
      "epoch": 0.04690909090909091,
      "grad_norm": 0.680962324142456,
      "learning_rate": 4.882727272727273e-05,
      "loss": 1.0043,
      "step": 1290
    },
    {
      "epoch": 0.04727272727272727,
      "grad_norm": 0.7917009592056274,
      "learning_rate": 4.881818181818182e-05,
      "loss": 1.053,
      "step": 1300
    },
    {
      "epoch": 0.047636363636363636,
      "grad_norm": 0.8048487305641174,
      "learning_rate": 4.880909090909091e-05,
      "loss": 1.0842,
      "step": 1310
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.7836024165153503,
      "learning_rate": 4.88e-05,
      "loss": 1.0537,
      "step": 1320
    },
    {
      "epoch": 0.048363636363636366,
      "grad_norm": 0.6501603126525879,
      "learning_rate": 4.879090909090909e-05,
      "loss": 1.0084,
      "step": 1330
    },
    {
      "epoch": 0.04872727272727273,
      "grad_norm": 1.0370051860809326,
      "learning_rate": 4.878181818181819e-05,
      "loss": 0.9449,
      "step": 1340
    },
    {
      "epoch": 0.04909090909090909,
      "grad_norm": 0.8803650736808777,
      "learning_rate": 4.877272727272728e-05,
      "loss": 1.0154,
      "step": 1350
    },
    {
      "epoch": 0.04945454545454545,
      "grad_norm": 0.8375502824783325,
      "learning_rate": 4.876363636363637e-05,
      "loss": 0.9437,
      "step": 1360
    },
    {
      "epoch": 0.04981818181818182,
      "grad_norm": 0.8356055617332458,
      "learning_rate": 4.875454545454546e-05,
      "loss": 0.9443,
      "step": 1370
    },
    {
      "epoch": 0.05018181818181818,
      "grad_norm": 0.9145257472991943,
      "learning_rate": 4.874545454545455e-05,
      "loss": 1.0326,
      "step": 1380
    },
    {
      "epoch": 0.050545454545454546,
      "grad_norm": 0.7850838899612427,
      "learning_rate": 4.8736363636363644e-05,
      "loss": 0.9943,
      "step": 1390
    },
    {
      "epoch": 0.05090909090909091,
      "grad_norm": 0.8555696606636047,
      "learning_rate": 4.872727272727273e-05,
      "loss": 1.0791,
      "step": 1400
    },
    {
      "epoch": 0.051272727272727275,
      "grad_norm": 0.6297926306724548,
      "learning_rate": 4.871818181818182e-05,
      "loss": 0.9766,
      "step": 1410
    },
    {
      "epoch": 0.05163636363636363,
      "grad_norm": 0.7590431571006775,
      "learning_rate": 4.870909090909091e-05,
      "loss": 1.0184,
      "step": 1420
    },
    {
      "epoch": 0.052,
      "grad_norm": 0.7111615538597107,
      "learning_rate": 4.87e-05,
      "loss": 0.9545,
      "step": 1430
    },
    {
      "epoch": 0.05236363636363636,
      "grad_norm": 0.9266930222511292,
      "learning_rate": 4.8690909090909095e-05,
      "loss": 0.9373,
      "step": 1440
    },
    {
      "epoch": 0.05272727272727273,
      "grad_norm": 0.7775997519493103,
      "learning_rate": 4.8681818181818185e-05,
      "loss": 1.084,
      "step": 1450
    },
    {
      "epoch": 0.05309090909090909,
      "grad_norm": 0.7686333656311035,
      "learning_rate": 4.8672727272727275e-05,
      "loss": 0.9543,
      "step": 1460
    },
    {
      "epoch": 0.053454545454545456,
      "grad_norm": 0.8209460973739624,
      "learning_rate": 4.8663636363636365e-05,
      "loss": 1.0379,
      "step": 1470
    },
    {
      "epoch": 0.05381818181818182,
      "grad_norm": 0.5993338227272034,
      "learning_rate": 4.8654545454545455e-05,
      "loss": 0.9836,
      "step": 1480
    },
    {
      "epoch": 0.054181818181818185,
      "grad_norm": 0.8321117758750916,
      "learning_rate": 4.864545454545455e-05,
      "loss": 0.9789,
      "step": 1490
    },
    {
      "epoch": 0.05454545454545454,
      "grad_norm": 0.7136039733886719,
      "learning_rate": 4.863636363636364e-05,
      "loss": 0.941,
      "step": 1500
    },
    {
      "epoch": 0.05490909090909091,
      "grad_norm": 0.6142838001251221,
      "learning_rate": 4.8627272727272725e-05,
      "loss": 1.0277,
      "step": 1510
    },
    {
      "epoch": 0.05527272727272727,
      "grad_norm": 0.7588635087013245,
      "learning_rate": 4.8618181818181815e-05,
      "loss": 1.0854,
      "step": 1520
    },
    {
      "epoch": 0.05563636363636364,
      "grad_norm": 0.9304160475730896,
      "learning_rate": 4.860909090909091e-05,
      "loss": 1.0713,
      "step": 1530
    },
    {
      "epoch": 0.056,
      "grad_norm": 0.6855290532112122,
      "learning_rate": 4.86e-05,
      "loss": 0.9658,
      "step": 1540
    },
    {
      "epoch": 0.056363636363636366,
      "grad_norm": 0.7574965357780457,
      "learning_rate": 4.859090909090909e-05,
      "loss": 0.9271,
      "step": 1550
    },
    {
      "epoch": 0.05672727272727273,
      "grad_norm": 0.6834359765052795,
      "learning_rate": 4.858181818181818e-05,
      "loss": 0.9395,
      "step": 1560
    },
    {
      "epoch": 0.05709090909090909,
      "grad_norm": 0.5964385271072388,
      "learning_rate": 4.857272727272727e-05,
      "loss": 1.0156,
      "step": 1570
    },
    {
      "epoch": 0.05745454545454545,
      "grad_norm": 0.6349005699157715,
      "learning_rate": 4.856363636363637e-05,
      "loss": 0.9195,
      "step": 1580
    },
    {
      "epoch": 0.05781818181818182,
      "grad_norm": 0.7871122360229492,
      "learning_rate": 4.855454545454546e-05,
      "loss": 1.008,
      "step": 1590
    },
    {
      "epoch": 0.05818181818181818,
      "grad_norm": 0.743977427482605,
      "learning_rate": 4.854545454545455e-05,
      "loss": 1.0277,
      "step": 1600
    },
    {
      "epoch": 0.058545454545454546,
      "grad_norm": 0.8460306525230408,
      "learning_rate": 4.853636363636364e-05,
      "loss": 1.0178,
      "step": 1610
    },
    {
      "epoch": 0.05890909090909091,
      "grad_norm": 0.6202099323272705,
      "learning_rate": 4.852727272727273e-05,
      "loss": 0.9564,
      "step": 1620
    },
    {
      "epoch": 0.059272727272727276,
      "grad_norm": 0.6819918155670166,
      "learning_rate": 4.851818181818182e-05,
      "loss": 1.008,
      "step": 1630
    },
    {
      "epoch": 0.05963636363636363,
      "grad_norm": 0.8113404512405396,
      "learning_rate": 4.850909090909091e-05,
      "loss": 1.0469,
      "step": 1640
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.7554293274879456,
      "learning_rate": 4.85e-05,
      "loss": 0.9373,
      "step": 1650
    },
    {
      "epoch": 0.06036363636363636,
      "grad_norm": 0.7795326709747314,
      "learning_rate": 4.849090909090909e-05,
      "loss": 1.0633,
      "step": 1660
    },
    {
      "epoch": 0.06072727272727273,
      "grad_norm": 0.6483361721038818,
      "learning_rate": 4.848181818181819e-05,
      "loss": 1.0172,
      "step": 1670
    },
    {
      "epoch": 0.06109090909090909,
      "grad_norm": 0.7647920846939087,
      "learning_rate": 4.847272727272728e-05,
      "loss": 0.9861,
      "step": 1680
    },
    {
      "epoch": 0.061454545454545456,
      "grad_norm": 0.9443255066871643,
      "learning_rate": 4.846363636363637e-05,
      "loss": 1.0828,
      "step": 1690
    },
    {
      "epoch": 0.06181818181818182,
      "grad_norm": 0.8193866610527039,
      "learning_rate": 4.845454545454546e-05,
      "loss": 1.0604,
      "step": 1700
    },
    {
      "epoch": 0.06218181818181818,
      "grad_norm": 0.707897961139679,
      "learning_rate": 4.844545454545455e-05,
      "loss": 0.9975,
      "step": 1710
    },
    {
      "epoch": 0.06254545454545454,
      "grad_norm": 0.9864387512207031,
      "learning_rate": 4.8436363636363644e-05,
      "loss": 1.0488,
      "step": 1720
    },
    {
      "epoch": 0.06290909090909091,
      "grad_norm": 0.741073727607727,
      "learning_rate": 4.8427272727272734e-05,
      "loss": 0.9912,
      "step": 1730
    },
    {
      "epoch": 0.06327272727272727,
      "grad_norm": 0.6622138619422913,
      "learning_rate": 4.841818181818182e-05,
      "loss": 0.9904,
      "step": 1740
    },
    {
      "epoch": 0.06363636363636363,
      "grad_norm": 0.9334690570831299,
      "learning_rate": 4.840909090909091e-05,
      "loss": 0.9775,
      "step": 1750
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.5777163505554199,
      "learning_rate": 4.8400000000000004e-05,
      "loss": 0.951,
      "step": 1760
    },
    {
      "epoch": 0.06436363636363636,
      "grad_norm": 0.9769054651260376,
      "learning_rate": 4.8390909090909094e-05,
      "loss": 0.9883,
      "step": 1770
    },
    {
      "epoch": 0.06472727272727273,
      "grad_norm": 0.8204796314239502,
      "learning_rate": 4.8381818181818184e-05,
      "loss": 0.9746,
      "step": 1780
    },
    {
      "epoch": 0.06509090909090909,
      "grad_norm": 0.8641470670700073,
      "learning_rate": 4.8372727272727274e-05,
      "loss": 1.0391,
      "step": 1790
    },
    {
      "epoch": 0.06545454545454546,
      "grad_norm": 0.732994794845581,
      "learning_rate": 4.8363636363636364e-05,
      "loss": 0.9494,
      "step": 1800
    },
    {
      "epoch": 0.06581818181818182,
      "grad_norm": 0.5632991194725037,
      "learning_rate": 4.835454545454546e-05,
      "loss": 1.1303,
      "step": 1810
    },
    {
      "epoch": 0.06618181818181817,
      "grad_norm": 0.6839861869812012,
      "learning_rate": 4.834545454545455e-05,
      "loss": 0.9652,
      "step": 1820
    },
    {
      "epoch": 0.06654545454545455,
      "grad_norm": 0.5407667756080627,
      "learning_rate": 4.833636363636364e-05,
      "loss": 0.9885,
      "step": 1830
    },
    {
      "epoch": 0.0669090909090909,
      "grad_norm": 0.8409451246261597,
      "learning_rate": 4.832727272727273e-05,
      "loss": 1.0197,
      "step": 1840
    },
    {
      "epoch": 0.06727272727272728,
      "grad_norm": 0.8460031151771545,
      "learning_rate": 4.8318181818181815e-05,
      "loss": 1.0137,
      "step": 1850
    },
    {
      "epoch": 0.06763636363636363,
      "grad_norm": 0.6154618859291077,
      "learning_rate": 4.830909090909091e-05,
      "loss": 0.9316,
      "step": 1860
    },
    {
      "epoch": 0.068,
      "grad_norm": 0.7288620471954346,
      "learning_rate": 4.83e-05,
      "loss": 0.957,
      "step": 1870
    },
    {
      "epoch": 0.06836363636363636,
      "grad_norm": 0.6734368205070496,
      "learning_rate": 4.829090909090909e-05,
      "loss": 0.9178,
      "step": 1880
    },
    {
      "epoch": 0.06872727272727273,
      "grad_norm": 0.5886709690093994,
      "learning_rate": 4.828181818181818e-05,
      "loss": 0.8715,
      "step": 1890
    },
    {
      "epoch": 0.06909090909090909,
      "grad_norm": 0.6626171469688416,
      "learning_rate": 4.827272727272727e-05,
      "loss": 1.0303,
      "step": 1900
    },
    {
      "epoch": 0.06945454545454545,
      "grad_norm": 0.6848061680793762,
      "learning_rate": 4.826363636363637e-05,
      "loss": 0.9477,
      "step": 1910
    },
    {
      "epoch": 0.06981818181818182,
      "grad_norm": 0.9979369640350342,
      "learning_rate": 4.825454545454546e-05,
      "loss": 1.1021,
      "step": 1920
    },
    {
      "epoch": 0.07018181818181818,
      "grad_norm": 0.8587698340415955,
      "learning_rate": 4.824545454545455e-05,
      "loss": 0.9975,
      "step": 1930
    },
    {
      "epoch": 0.07054545454545455,
      "grad_norm": 0.7505869269371033,
      "learning_rate": 4.823636363636364e-05,
      "loss": 0.902,
      "step": 1940
    },
    {
      "epoch": 0.07090909090909091,
      "grad_norm": 1.0160633325576782,
      "learning_rate": 4.822727272727273e-05,
      "loss": 1.0432,
      "step": 1950
    },
    {
      "epoch": 0.07127272727272728,
      "grad_norm": 0.6609899401664734,
      "learning_rate": 4.821818181818182e-05,
      "loss": 0.9576,
      "step": 1960
    },
    {
      "epoch": 0.07163636363636364,
      "grad_norm": 0.8230269551277161,
      "learning_rate": 4.820909090909091e-05,
      "loss": 1.0172,
      "step": 1970
    },
    {
      "epoch": 0.072,
      "grad_norm": 0.7711009979248047,
      "learning_rate": 4.82e-05,
      "loss": 1.133,
      "step": 1980
    },
    {
      "epoch": 0.07236363636363637,
      "grad_norm": 0.8839887380599976,
      "learning_rate": 4.819090909090909e-05,
      "loss": 1.0662,
      "step": 1990
    },
    {
      "epoch": 0.07272727272727272,
      "grad_norm": 0.6672515869140625,
      "learning_rate": 4.8181818181818186e-05,
      "loss": 0.9764,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 55000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2566081788222917e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}