| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 30.0, | |
| "eval_steps": 500, | |
| "global_step": 26250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 1.9886202812194824, | |
| "learning_rate": 0.00029886857142857144, | |
| "loss": 1.2287, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 1.3383790254592896, | |
| "learning_rate": 0.0002977257142857143, | |
| "loss": 1.0151, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 1.4753262996673584, | |
| "learning_rate": 0.0002965828571428571, | |
| "loss": 0.9255, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 1.6120792627334595, | |
| "learning_rate": 0.00029544, | |
| "loss": 0.9156, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 1.7478028535842896, | |
| "learning_rate": 0.00029429714285714284, | |
| "loss": 0.8849, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 1.4654852151870728, | |
| "learning_rate": 0.0002931542857142857, | |
| "loss": 0.847, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.6262428760528564, | |
| "learning_rate": 0.0002920114285714285, | |
| "loss": 0.8322, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9142857142857143, | |
| "grad_norm": 1.2275465726852417, | |
| "learning_rate": 0.0002908685714285714, | |
| "loss": 0.8382, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.0285714285714285, | |
| "grad_norm": 0.8290926218032837, | |
| "learning_rate": 0.00028972571428571424, | |
| "loss": 0.8152, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.1428571428571428, | |
| "grad_norm": 1.5321491956710815, | |
| "learning_rate": 0.0002885828571428571, | |
| "loss": 0.766, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.2571428571428571, | |
| "grad_norm": 1.1432169675827026, | |
| "learning_rate": 0.00028743999999999997, | |
| "loss": 0.7438, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.3714285714285714, | |
| "grad_norm": 3.4796738624572754, | |
| "learning_rate": 0.00028629714285714286, | |
| "loss": 0.7422, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.4857142857142858, | |
| "grad_norm": 0.8977236747741699, | |
| "learning_rate": 0.0002851542857142857, | |
| "loss": 0.7047, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.8644102811813354, | |
| "learning_rate": 0.00028401142857142854, | |
| "loss": 0.7545, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.7142857142857144, | |
| "grad_norm": 1.1881957054138184, | |
| "learning_rate": 0.00028286857142857143, | |
| "loss": 0.7159, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.8285714285714287, | |
| "grad_norm": 1.2069973945617676, | |
| "learning_rate": 0.00028172571428571427, | |
| "loss": 0.6974, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.9428571428571428, | |
| "grad_norm": 1.1324719190597534, | |
| "learning_rate": 0.0002805828571428571, | |
| "loss": 0.7014, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.057142857142857, | |
| "grad_norm": 1.6258182525634766, | |
| "learning_rate": 0.00027944, | |
| "loss": 0.6531, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.1714285714285713, | |
| "grad_norm": 1.6277265548706055, | |
| "learning_rate": 0.00027829714285714283, | |
| "loss": 0.6167, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.2857142857142856, | |
| "grad_norm": 1.05016028881073, | |
| "learning_rate": 0.00027715428571428567, | |
| "loss": 0.6365, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.4809651374816895, | |
| "learning_rate": 0.00027601142857142856, | |
| "loss": 0.64, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.5142857142857142, | |
| "grad_norm": 0.9632180333137512, | |
| "learning_rate": 0.0002748685714285714, | |
| "loss": 0.631, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.6285714285714286, | |
| "grad_norm": 1.4610211849212646, | |
| "learning_rate": 0.00027372571428571423, | |
| "loss": 0.6183, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.742857142857143, | |
| "grad_norm": 0.95488440990448, | |
| "learning_rate": 0.0002725828571428571, | |
| "loss": 0.6116, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 1.4659332036972046, | |
| "learning_rate": 0.00027144, | |
| "loss": 0.6281, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.9714285714285715, | |
| "grad_norm": 1.0329852104187012, | |
| "learning_rate": 0.00027029714285714285, | |
| "loss": 0.6289, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.085714285714286, | |
| "grad_norm": 1.1207853555679321, | |
| "learning_rate": 0.0002691542857142857, | |
| "loss": 0.5658, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 1.1021409034729004, | |
| "learning_rate": 0.00026801142857142853, | |
| "loss": 0.5508, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.314285714285714, | |
| "grad_norm": 1.3904248476028442, | |
| "learning_rate": 0.0002668685714285714, | |
| "loss": 0.5705, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3.4285714285714284, | |
| "grad_norm": 1.0779317617416382, | |
| "learning_rate": 0.00026572571428571426, | |
| "loss": 0.5586, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.5428571428571427, | |
| "grad_norm": 1.5445743799209595, | |
| "learning_rate": 0.0002645828571428571, | |
| "loss": 0.5457, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 3.657142857142857, | |
| "grad_norm": 1.100846290588379, | |
| "learning_rate": 0.00026344, | |
| "loss": 0.5495, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.7714285714285714, | |
| "grad_norm": 1.7807058095932007, | |
| "learning_rate": 0.0002622971428571428, | |
| "loss": 0.5752, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.8857142857142857, | |
| "grad_norm": 1.18954336643219, | |
| "learning_rate": 0.00026115428571428566, | |
| "loss": 0.5443, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 1.0324475765228271, | |
| "learning_rate": 0.00026001142857142855, | |
| "loss": 0.5344, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 4.114285714285714, | |
| "grad_norm": 1.2776315212249756, | |
| "learning_rate": 0.0002588685714285714, | |
| "loss": 0.4963, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 4.228571428571429, | |
| "grad_norm": 1.8203849792480469, | |
| "learning_rate": 0.0002577257142857142, | |
| "loss": 0.4796, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 4.3428571428571425, | |
| "grad_norm": 1.0043076276779175, | |
| "learning_rate": 0.0002565828571428571, | |
| "loss": 0.4828, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 4.457142857142857, | |
| "grad_norm": 1.506948471069336, | |
| "learning_rate": 0.00025544, | |
| "loss": 0.5112, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 4.571428571428571, | |
| "grad_norm": 1.1585667133331299, | |
| "learning_rate": 0.00025429714285714284, | |
| "loss": 0.4957, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 4.685714285714286, | |
| "grad_norm": 1.3756364583969116, | |
| "learning_rate": 0.0002531542857142857, | |
| "loss": 0.5106, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 0.9732717871665955, | |
| "learning_rate": 0.00025201142857142857, | |
| "loss": 0.483, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 4.914285714285715, | |
| "grad_norm": 1.2413655519485474, | |
| "learning_rate": 0.0002508685714285714, | |
| "loss": 0.5, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 5.0285714285714285, | |
| "grad_norm": 1.0629197359085083, | |
| "learning_rate": 0.00024972571428571425, | |
| "loss": 0.4751, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 5.142857142857143, | |
| "grad_norm": 1.3967541456222534, | |
| "learning_rate": 0.00024858285714285714, | |
| "loss": 0.4427, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 5.257142857142857, | |
| "grad_norm": 1.1337262392044067, | |
| "learning_rate": 0.00024744, | |
| "loss": 0.4413, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 5.371428571428572, | |
| "grad_norm": 1.4365642070770264, | |
| "learning_rate": 0.0002462971428571428, | |
| "loss": 0.4342, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 5.485714285714286, | |
| "grad_norm": 1.1386431455612183, | |
| "learning_rate": 0.0002451542857142857, | |
| "loss": 0.4302, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 1.6757099628448486, | |
| "learning_rate": 0.00024401142857142854, | |
| "loss": 0.448, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 5.714285714285714, | |
| "grad_norm": 1.2465338706970215, | |
| "learning_rate": 0.00024286857142857143, | |
| "loss": 0.4401, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 5.828571428571428, | |
| "grad_norm": 1.482226014137268, | |
| "learning_rate": 0.00024172571428571427, | |
| "loss": 0.4613, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 5.942857142857143, | |
| "grad_norm": 2.177501678466797, | |
| "learning_rate": 0.00024058285714285713, | |
| "loss": 0.4364, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 6.057142857142857, | |
| "grad_norm": 1.069575309753418, | |
| "learning_rate": 0.00023944, | |
| "loss": 0.4228, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 6.171428571428572, | |
| "grad_norm": 1.0357013940811157, | |
| "learning_rate": 0.00023829714285714283, | |
| "loss": 0.3895, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 6.285714285714286, | |
| "grad_norm": 1.3480335474014282, | |
| "learning_rate": 0.0002371542857142857, | |
| "loss": 0.4087, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 1.4253255128860474, | |
| "learning_rate": 0.00023601142857142856, | |
| "loss": 0.4058, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 6.514285714285714, | |
| "grad_norm": 1.293028712272644, | |
| "learning_rate": 0.0002348685714285714, | |
| "loss": 0.4066, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 6.628571428571428, | |
| "grad_norm": 1.0373749732971191, | |
| "learning_rate": 0.00023372571428571426, | |
| "loss": 0.4091, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 6.742857142857143, | |
| "grad_norm": 1.2542062997817993, | |
| "learning_rate": 0.0002325828571428571, | |
| "loss": 0.3926, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 6.857142857142857, | |
| "grad_norm": 1.4273713827133179, | |
| "learning_rate": 0.00023143999999999997, | |
| "loss": 0.3899, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 6.9714285714285715, | |
| "grad_norm": 1.169870376586914, | |
| "learning_rate": 0.00023029714285714283, | |
| "loss": 0.4092, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 7.085714285714285, | |
| "grad_norm": 1.1924947500228882, | |
| "learning_rate": 0.00022915428571428567, | |
| "loss": 0.3754, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 1.349892497062683, | |
| "learning_rate": 0.00022801142857142856, | |
| "loss": 0.3537, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 7.314285714285714, | |
| "grad_norm": 1.2282274961471558, | |
| "learning_rate": 0.00022686857142857142, | |
| "loss": 0.3639, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 7.428571428571429, | |
| "grad_norm": 1.1712229251861572, | |
| "learning_rate": 0.0002257257142857143, | |
| "loss": 0.3688, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 7.542857142857143, | |
| "grad_norm": 2.2879111766815186, | |
| "learning_rate": 0.00022458285714285712, | |
| "loss": 0.3559, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 7.6571428571428575, | |
| "grad_norm": 1.1137748956680298, | |
| "learning_rate": 0.00022344, | |
| "loss": 0.3574, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 7.771428571428571, | |
| "grad_norm": 1.289184808731079, | |
| "learning_rate": 0.00022229714285714285, | |
| "loss": 0.3653, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 7.885714285714286, | |
| "grad_norm": 1.6281535625457764, | |
| "learning_rate": 0.0002211542857142857, | |
| "loss": 0.3476, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 1.3460326194763184, | |
| "learning_rate": 0.00022001142857142855, | |
| "loss": 0.3606, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 8.114285714285714, | |
| "grad_norm": 1.2988694906234741, | |
| "learning_rate": 0.0002188685714285714, | |
| "loss": 0.3064, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 8.228571428571428, | |
| "grad_norm": 1.059658408164978, | |
| "learning_rate": 0.00021772571428571426, | |
| "loss": 0.333, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 8.342857142857143, | |
| "grad_norm": 1.6318204402923584, | |
| "learning_rate": 0.00021658285714285712, | |
| "loss": 0.3313, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 8.457142857142857, | |
| "grad_norm": 1.5393931865692139, | |
| "learning_rate": 0.00021543999999999996, | |
| "loss": 0.3133, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 8.571428571428571, | |
| "grad_norm": 1.6614785194396973, | |
| "learning_rate": 0.00021429714285714282, | |
| "loss": 0.3393, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 8.685714285714285, | |
| "grad_norm": 1.206910490989685, | |
| "learning_rate": 0.0002131542857142857, | |
| "loss": 0.3279, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 1.3397912979125977, | |
| "learning_rate": 0.00021201142857142858, | |
| "loss": 0.3342, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 8.914285714285715, | |
| "grad_norm": 1.1242660284042358, | |
| "learning_rate": 0.0002108685714285714, | |
| "loss": 0.3237, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 9.028571428571428, | |
| "grad_norm": 1.088550329208374, | |
| "learning_rate": 0.00020972571428571428, | |
| "loss": 0.3128, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 9.142857142857142, | |
| "grad_norm": 1.238797664642334, | |
| "learning_rate": 0.00020858285714285714, | |
| "loss": 0.2904, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 9.257142857142856, | |
| "grad_norm": 1.502020239830017, | |
| "learning_rate": 0.00020743999999999998, | |
| "loss": 0.2967, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 9.371428571428572, | |
| "grad_norm": 1.6706186532974243, | |
| "learning_rate": 0.00020629714285714284, | |
| "loss": 0.2889, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 9.485714285714286, | |
| "grad_norm": 1.4780162572860718, | |
| "learning_rate": 0.00020515428571428568, | |
| "loss": 0.3039, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 1.3221495151519775, | |
| "learning_rate": 0.00020401142857142854, | |
| "loss": 0.3087, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 9.714285714285714, | |
| "grad_norm": 1.9556879997253418, | |
| "learning_rate": 0.0002028685714285714, | |
| "loss": 0.3006, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 9.82857142857143, | |
| "grad_norm": 2.5484800338745117, | |
| "learning_rate": 0.00020172571428571425, | |
| "loss": 0.2918, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 9.942857142857143, | |
| "grad_norm": 1.202545166015625, | |
| "learning_rate": 0.0002005828571428571, | |
| "loss": 0.2955, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 10.057142857142857, | |
| "grad_norm": 1.2312058210372925, | |
| "learning_rate": 0.00019943999999999997, | |
| "loss": 0.2711, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 10.17142857142857, | |
| "grad_norm": 0.9658439755439758, | |
| "learning_rate": 0.00019829714285714287, | |
| "loss": 0.2536, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 10.285714285714286, | |
| "grad_norm": 1.4016692638397217, | |
| "learning_rate": 0.0001971542857142857, | |
| "loss": 0.2712, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 10.4, | |
| "grad_norm": 1.0595059394836426, | |
| "learning_rate": 0.00019601142857142857, | |
| "loss": 0.2631, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 10.514285714285714, | |
| "grad_norm": 1.6047881841659546, | |
| "learning_rate": 0.00019486857142857143, | |
| "loss": 0.2755, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 10.628571428571428, | |
| "grad_norm": 1.8473031520843506, | |
| "learning_rate": 0.00019372571428571427, | |
| "loss": 0.2783, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 10.742857142857144, | |
| "grad_norm": 1.9216639995574951, | |
| "learning_rate": 0.00019258285714285713, | |
| "loss": 0.2734, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 10.857142857142858, | |
| "grad_norm": 1.0926100015640259, | |
| "learning_rate": 0.00019143999999999997, | |
| "loss": 0.2778, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 10.971428571428572, | |
| "grad_norm": 1.4277852773666382, | |
| "learning_rate": 0.00019029714285714283, | |
| "loss": 0.2678, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 11.085714285714285, | |
| "grad_norm": 1.273311734199524, | |
| "learning_rate": 0.0001891542857142857, | |
| "loss": 0.2468, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 11.2, | |
| "grad_norm": 1.50382399559021, | |
| "learning_rate": 0.00018801142857142854, | |
| "loss": 0.2328, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 11.314285714285715, | |
| "grad_norm": 2.2691490650177, | |
| "learning_rate": 0.0001868685714285714, | |
| "loss": 0.246, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 11.428571428571429, | |
| "grad_norm": 1.2265568971633911, | |
| "learning_rate": 0.00018572571428571426, | |
| "loss": 0.2492, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 11.542857142857143, | |
| "grad_norm": 1.1587599515914917, | |
| "learning_rate": 0.0001845828571428571, | |
| "loss": 0.2453, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 11.657142857142857, | |
| "grad_norm": 1.1462079286575317, | |
| "learning_rate": 0.00018344, | |
| "loss": 0.253, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 11.771428571428572, | |
| "grad_norm": 1.9905078411102295, | |
| "learning_rate": 0.00018229714285714286, | |
| "loss": 0.2556, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 11.885714285714286, | |
| "grad_norm": 1.667157769203186, | |
| "learning_rate": 0.00018115428571428572, | |
| "loss": 0.249, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 1.08194899559021, | |
| "learning_rate": 0.00018001142857142856, | |
| "loss": 0.2404, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 12.114285714285714, | |
| "grad_norm": 1.2508606910705566, | |
| "learning_rate": 0.00017886857142857142, | |
| "loss": 0.2171, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 12.228571428571428, | |
| "grad_norm": 1.10196053981781, | |
| "learning_rate": 0.00017772571428571426, | |
| "loss": 0.2162, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 12.342857142857143, | |
| "grad_norm": 1.325040340423584, | |
| "learning_rate": 0.00017658285714285712, | |
| "loss": 0.2165, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 12.457142857142857, | |
| "grad_norm": 1.4882842302322388, | |
| "learning_rate": 0.00017544, | |
| "loss": 0.2207, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 12.571428571428571, | |
| "grad_norm": 1.2574632167816162, | |
| "learning_rate": 0.00017429714285714282, | |
| "loss": 0.2292, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 12.685714285714285, | |
| "grad_norm": 1.5161538124084473, | |
| "learning_rate": 0.0001731542857142857, | |
| "loss": 0.2216, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 12.8, | |
| "grad_norm": 1.5018984079360962, | |
| "learning_rate": 0.00017201142857142855, | |
| "loss": 0.2273, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 12.914285714285715, | |
| "grad_norm": 1.2884104251861572, | |
| "learning_rate": 0.0001708685714285714, | |
| "loss": 0.2243, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 13.028571428571428, | |
| "grad_norm": 1.378460168838501, | |
| "learning_rate": 0.00016972571428571428, | |
| "loss": 0.225, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 13.142857142857142, | |
| "grad_norm": 1.5688245296478271, | |
| "learning_rate": 0.00016858285714285715, | |
| "loss": 0.1992, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 13.257142857142856, | |
| "grad_norm": 1.3006786108016968, | |
| "learning_rate": 0.00016744, | |
| "loss": 0.2068, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 13.371428571428572, | |
| "grad_norm": 1.047890543937683, | |
| "learning_rate": 0.00016629714285714285, | |
| "loss": 0.1957, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 13.485714285714286, | |
| "grad_norm": 0.9967881441116333, | |
| "learning_rate": 0.0001651542857142857, | |
| "loss": 0.2058, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 13.6, | |
| "grad_norm": 1.4264742136001587, | |
| "learning_rate": 0.00016401142857142855, | |
| "loss": 0.2063, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 13.714285714285714, | |
| "grad_norm": 1.3013545274734497, | |
| "learning_rate": 0.0001628685714285714, | |
| "loss": 0.2032, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 13.82857142857143, | |
| "grad_norm": 1.3055994510650635, | |
| "learning_rate": 0.00016172571428571428, | |
| "loss": 0.2162, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 13.942857142857143, | |
| "grad_norm": 1.4893743991851807, | |
| "learning_rate": 0.00016058285714285711, | |
| "loss": 0.2081, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 14.057142857142857, | |
| "grad_norm": 1.4137383699417114, | |
| "learning_rate": 0.00015943999999999998, | |
| "loss": 0.1841, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 14.17142857142857, | |
| "grad_norm": 1.8859280347824097, | |
| "learning_rate": 0.00015829714285714284, | |
| "loss": 0.1857, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 14.285714285714286, | |
| "grad_norm": 1.5282500982284546, | |
| "learning_rate": 0.00015715428571428568, | |
| "loss": 0.1904, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 14.4, | |
| "grad_norm": 0.9001047015190125, | |
| "learning_rate": 0.00015601142857142854, | |
| "loss": 0.1853, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 14.514285714285714, | |
| "grad_norm": 1.1927658319473267, | |
| "learning_rate": 0.00015486857142857143, | |
| "loss": 0.1874, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 14.628571428571428, | |
| "grad_norm": 1.1758664846420288, | |
| "learning_rate": 0.0001537257142857143, | |
| "loss": 0.1754, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 14.742857142857144, | |
| "grad_norm": 1.1734734773635864, | |
| "learning_rate": 0.00015258285714285714, | |
| "loss": 0.1868, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 14.857142857142858, | |
| "grad_norm": 0.8678969740867615, | |
| "learning_rate": 0.00015144, | |
| "loss": 0.1795, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 14.971428571428572, | |
| "grad_norm": 2.2901735305786133, | |
| "learning_rate": 0.00015029714285714284, | |
| "loss": 0.1965, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 15.085714285714285, | |
| "grad_norm": 1.0252338647842407, | |
| "learning_rate": 0.0001491542857142857, | |
| "loss": 0.1649, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 15.2, | |
| "grad_norm": 1.1025043725967407, | |
| "learning_rate": 0.00014801142857142857, | |
| "loss": 0.1722, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 15.314285714285715, | |
| "grad_norm": 1.2872519493103027, | |
| "learning_rate": 0.0001468685714285714, | |
| "loss": 0.1694, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 15.428571428571429, | |
| "grad_norm": 2.815004348754883, | |
| "learning_rate": 0.00014572571428571427, | |
| "loss": 0.1758, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 15.542857142857143, | |
| "grad_norm": 1.7336875200271606, | |
| "learning_rate": 0.00014458285714285713, | |
| "loss": 0.1698, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 15.657142857142857, | |
| "grad_norm": 1.5906660556793213, | |
| "learning_rate": 0.00014344, | |
| "loss": 0.1554, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 15.771428571428572, | |
| "grad_norm": 1.0536751747131348, | |
| "learning_rate": 0.00014229714285714286, | |
| "loss": 0.1667, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 15.885714285714286, | |
| "grad_norm": 2.1150689125061035, | |
| "learning_rate": 0.0001411542857142857, | |
| "loss": 0.1725, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 2.3918333053588867, | |
| "learning_rate": 0.00014001142857142856, | |
| "loss": 0.1855, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 16.114285714285714, | |
| "grad_norm": 1.5179935693740845, | |
| "learning_rate": 0.00013886857142857143, | |
| "loss": 0.1509, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 16.228571428571428, | |
| "grad_norm": 1.0404243469238281, | |
| "learning_rate": 0.00013772571428571426, | |
| "loss": 0.1545, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 16.34285714285714, | |
| "grad_norm": 1.7319324016571045, | |
| "learning_rate": 0.00013658285714285713, | |
| "loss": 0.1553, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 16.457142857142856, | |
| "grad_norm": 1.694320797920227, | |
| "learning_rate": 0.00013544, | |
| "loss": 0.1543, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 16.571428571428573, | |
| "grad_norm": 1.1056307554244995, | |
| "learning_rate": 0.00013429714285714285, | |
| "loss": 0.1639, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 16.685714285714287, | |
| "grad_norm": 1.5444873571395874, | |
| "learning_rate": 0.0001331542857142857, | |
| "loss": 0.1575, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 16.8, | |
| "grad_norm": 1.3893969058990479, | |
| "learning_rate": 0.00013201142857142856, | |
| "loss": 0.163, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 16.914285714285715, | |
| "grad_norm": 1.6132880449295044, | |
| "learning_rate": 0.00013086857142857142, | |
| "loss": 0.1537, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 17.02857142857143, | |
| "grad_norm": 1.5396114587783813, | |
| "learning_rate": 0.00012972571428571426, | |
| "loss": 0.1548, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 17.142857142857142, | |
| "grad_norm": 1.0118181705474854, | |
| "learning_rate": 0.00012858285714285715, | |
| "loss": 0.1417, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 17.257142857142856, | |
| "grad_norm": 1.0827256441116333, | |
| "learning_rate": 0.00012743999999999999, | |
| "loss": 0.1418, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 17.37142857142857, | |
| "grad_norm": 1.5309821367263794, | |
| "learning_rate": 0.00012629714285714285, | |
| "loss": 0.1375, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 17.485714285714284, | |
| "grad_norm": 1.1401481628417969, | |
| "learning_rate": 0.00012515428571428571, | |
| "loss": 0.1465, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 17.6, | |
| "grad_norm": 1.150075912475586, | |
| "learning_rate": 0.00012401142857142855, | |
| "loss": 0.1421, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 17.714285714285715, | |
| "grad_norm": 1.1666033267974854, | |
| "learning_rate": 0.00012286857142857142, | |
| "loss": 0.1527, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 17.82857142857143, | |
| "grad_norm": 0.8309689164161682, | |
| "learning_rate": 0.00012172571428571428, | |
| "loss": 0.1505, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 17.942857142857143, | |
| "grad_norm": 1.5989408493041992, | |
| "learning_rate": 0.00012058285714285713, | |
| "loss": 0.1436, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 18.057142857142857, | |
| "grad_norm": 0.6489440202713013, | |
| "learning_rate": 0.00011944, | |
| "loss": 0.1375, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 18.17142857142857, | |
| "grad_norm": 1.6196086406707764, | |
| "learning_rate": 0.00011829714285714285, | |
| "loss": 0.1352, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 18.285714285714285, | |
| "grad_norm": 1.2241395711898804, | |
| "learning_rate": 0.0001171542857142857, | |
| "loss": 0.1313, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 18.4, | |
| "grad_norm": 0.9825499653816223, | |
| "learning_rate": 0.00011601142857142856, | |
| "loss": 0.1346, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 18.514285714285712, | |
| "grad_norm": 1.8467905521392822, | |
| "learning_rate": 0.00011486857142857142, | |
| "loss": 0.1334, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 18.62857142857143, | |
| "grad_norm": 1.0591373443603516, | |
| "learning_rate": 0.00011372571428571428, | |
| "loss": 0.1392, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 18.742857142857144, | |
| "grad_norm": 2.0883259773254395, | |
| "learning_rate": 0.00011258285714285714, | |
| "loss": 0.134, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 18.857142857142858, | |
| "grad_norm": 1.0738496780395508, | |
| "learning_rate": 0.00011143999999999999, | |
| "loss": 0.1338, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 18.97142857142857, | |
| "grad_norm": 1.3434749841690063, | |
| "learning_rate": 0.00011029714285714284, | |
| "loss": 0.1296, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 19.085714285714285, | |
| "grad_norm": 0.6826351284980774, | |
| "learning_rate": 0.0001091542857142857, | |
| "loss": 0.1221, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 19.2, | |
| "grad_norm": 1.7247623205184937, | |
| "learning_rate": 0.00010801142857142856, | |
| "loss": 0.1212, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 19.314285714285713, | |
| "grad_norm": 1.4077801704406738, | |
| "learning_rate": 0.00010686857142857142, | |
| "loss": 0.1204, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 19.428571428571427, | |
| "grad_norm": 0.789215624332428, | |
| "learning_rate": 0.00010572571428571428, | |
| "loss": 0.1184, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 19.542857142857144, | |
| "grad_norm": 2.175265073776245, | |
| "learning_rate": 0.00010458285714285713, | |
| "loss": 0.1231, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 19.65714285714286, | |
| "grad_norm": 1.1125848293304443, | |
| "learning_rate": 0.00010343999999999999, | |
| "loss": 0.1227, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 19.771428571428572, | |
| "grad_norm": 1.0844342708587646, | |
| "learning_rate": 0.00010229714285714285, | |
| "loss": 0.1255, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 19.885714285714286, | |
| "grad_norm": 0.6973736882209778, | |
| "learning_rate": 0.0001011542857142857, | |
| "loss": 0.1277, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.9680613279342651, | |
| "learning_rate": 0.00010001142857142856, | |
| "loss": 0.1254, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 20.114285714285714, | |
| "grad_norm": 1.4217584133148193, | |
| "learning_rate": 9.886857142857143e-05, | |
| "loss": 0.1108, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 20.228571428571428, | |
| "grad_norm": 1.2597243785858154, | |
| "learning_rate": 9.772571428571428e-05, | |
| "loss": 0.1209, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 20.34285714285714, | |
| "grad_norm": 1.3436779975891113, | |
| "learning_rate": 9.658285714285713e-05, | |
| "loss": 0.1206, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 20.457142857142856, | |
| "grad_norm": 1.175439715385437, | |
| "learning_rate": 9.544e-05, | |
| "loss": 0.1164, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 20.571428571428573, | |
| "grad_norm": 1.3990012407302856, | |
| "learning_rate": 9.429714285714284e-05, | |
| "loss": 0.1144, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 20.685714285714287, | |
| "grad_norm": 1.0105007886886597, | |
| "learning_rate": 9.31542857142857e-05, | |
| "loss": 0.1135, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 20.8, | |
| "grad_norm": 1.1308010816574097, | |
| "learning_rate": 9.201142857142857e-05, | |
| "loss": 0.1141, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 20.914285714285715, | |
| "grad_norm": 0.7414535284042358, | |
| "learning_rate": 9.086857142857142e-05, | |
| "loss": 0.1208, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 21.02857142857143, | |
| "grad_norm": 0.8291124701499939, | |
| "learning_rate": 8.972571428571427e-05, | |
| "loss": 0.1049, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 21.142857142857142, | |
| "grad_norm": 0.7733851671218872, | |
| "learning_rate": 8.858285714285714e-05, | |
| "loss": 0.1071, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 21.257142857142856, | |
| "grad_norm": 2.2193784713745117, | |
| "learning_rate": 8.743999999999999e-05, | |
| "loss": 0.102, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 21.37142857142857, | |
| "grad_norm": 1.0695987939834595, | |
| "learning_rate": 8.629714285714284e-05, | |
| "loss": 0.106, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 21.485714285714284, | |
| "grad_norm": 1.0461671352386475, | |
| "learning_rate": 8.515428571428572e-05, | |
| "loss": 0.107, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 21.6, | |
| "grad_norm": 1.5757765769958496, | |
| "learning_rate": 8.401142857142857e-05, | |
| "loss": 0.106, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 21.714285714285715, | |
| "grad_norm": 1.9472708702087402, | |
| "learning_rate": 8.286857142857142e-05, | |
| "loss": 0.108, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 21.82857142857143, | |
| "grad_norm": 0.9918070435523987, | |
| "learning_rate": 8.172571428571428e-05, | |
| "loss": 0.1152, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 21.942857142857143, | |
| "grad_norm": 1.0311241149902344, | |
| "learning_rate": 8.058285714285713e-05, | |
| "loss": 0.1047, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 22.057142857142857, | |
| "grad_norm": 1.0554375648498535, | |
| "learning_rate": 7.943999999999998e-05, | |
| "loss": 0.1076, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 22.17142857142857, | |
| "grad_norm": 0.7874680757522583, | |
| "learning_rate": 7.829714285714286e-05, | |
| "loss": 0.0997, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 22.285714285714285, | |
| "grad_norm": 1.1105536222457886, | |
| "learning_rate": 7.715428571428571e-05, | |
| "loss": 0.1, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 22.4, | |
| "grad_norm": 1.0579336881637573, | |
| "learning_rate": 7.601142857142856e-05, | |
| "loss": 0.0963, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 22.514285714285712, | |
| "grad_norm": 1.0726921558380127, | |
| "learning_rate": 7.486857142857143e-05, | |
| "loss": 0.1005, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 22.62857142857143, | |
| "grad_norm": 1.0421086549758911, | |
| "learning_rate": 7.372571428571428e-05, | |
| "loss": 0.0976, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 22.742857142857144, | |
| "grad_norm": 1.198748230934143, | |
| "learning_rate": 7.258285714285714e-05, | |
| "loss": 0.1059, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 22.857142857142858, | |
| "grad_norm": 1.479467749595642, | |
| "learning_rate": 7.144e-05, | |
| "loss": 0.0993, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 22.97142857142857, | |
| "grad_norm": 1.1370179653167725, | |
| "learning_rate": 7.029714285714284e-05, | |
| "loss": 0.1021, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 23.085714285714285, | |
| "grad_norm": 0.9663624167442322, | |
| "learning_rate": 6.915428571428571e-05, | |
| "loss": 0.0956, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 23.2, | |
| "grad_norm": 0.6943888664245605, | |
| "learning_rate": 6.801142857142857e-05, | |
| "loss": 0.0994, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 23.314285714285713, | |
| "grad_norm": 1.3604183197021484, | |
| "learning_rate": 6.686857142857142e-05, | |
| "loss": 0.0963, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 23.428571428571427, | |
| "grad_norm": 1.3610256910324097, | |
| "learning_rate": 6.572571428571427e-05, | |
| "loss": 0.0956, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 23.542857142857144, | |
| "grad_norm": 1.2277686595916748, | |
| "learning_rate": 6.458285714285714e-05, | |
| "loss": 0.0953, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 23.65714285714286, | |
| "grad_norm": 1.1448625326156616, | |
| "learning_rate": 6.343999999999999e-05, | |
| "loss": 0.0954, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 23.771428571428572, | |
| "grad_norm": 0.8833436369895935, | |
| "learning_rate": 6.229714285714285e-05, | |
| "loss": 0.0968, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 23.885714285714286, | |
| "grad_norm": 1.0425817966461182, | |
| "learning_rate": 6.115428571428572e-05, | |
| "loss": 0.094, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 1.7617619037628174, | |
| "learning_rate": 6.001142857142857e-05, | |
| "loss": 0.098, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 24.114285714285714, | |
| "grad_norm": 0.7041512131690979, | |
| "learning_rate": 5.886857142857142e-05, | |
| "loss": 0.0891, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 24.228571428571428, | |
| "grad_norm": 1.3548294305801392, | |
| "learning_rate": 5.772571428571428e-05, | |
| "loss": 0.0937, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 24.34285714285714, | |
| "grad_norm": 1.0486685037612915, | |
| "learning_rate": 5.658285714285714e-05, | |
| "loss": 0.0947, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 24.457142857142856, | |
| "grad_norm": 1.6215624809265137, | |
| "learning_rate": 5.543999999999999e-05, | |
| "loss": 0.0934, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 24.571428571428573, | |
| "grad_norm": 1.0335862636566162, | |
| "learning_rate": 5.4297142857142855e-05, | |
| "loss": 0.088, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 24.685714285714287, | |
| "grad_norm": 1.3150044679641724, | |
| "learning_rate": 5.315428571428571e-05, | |
| "loss": 0.0957, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 24.8, | |
| "grad_norm": 1.3469773530960083, | |
| "learning_rate": 5.201142857142856e-05, | |
| "loss": 0.0893, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 24.914285714285715, | |
| "grad_norm": 0.7123535871505737, | |
| "learning_rate": 5.086857142857143e-05, | |
| "loss": 0.0833, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 25.02857142857143, | |
| "grad_norm": 0.9019558429718018, | |
| "learning_rate": 4.9725714285714285e-05, | |
| "loss": 0.084, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 25.142857142857142, | |
| "grad_norm": 1.2943990230560303, | |
| "learning_rate": 4.8582857142857136e-05, | |
| "loss": 0.0786, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 25.257142857142856, | |
| "grad_norm": 1.431429386138916, | |
| "learning_rate": 4.743999999999999e-05, | |
| "loss": 0.0882, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 25.37142857142857, | |
| "grad_norm": 0.8068431615829468, | |
| "learning_rate": 4.629714285714286e-05, | |
| "loss": 0.0846, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 25.485714285714284, | |
| "grad_norm": 1.0088834762573242, | |
| "learning_rate": 4.515428571428571e-05, | |
| "loss": 0.0832, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 25.6, | |
| "grad_norm": 1.1686701774597168, | |
| "learning_rate": 4.4011428571428565e-05, | |
| "loss": 0.0761, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 25.714285714285715, | |
| "grad_norm": 1.5640618801116943, | |
| "learning_rate": 4.286857142857143e-05, | |
| "loss": 0.0894, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 25.82857142857143, | |
| "grad_norm": 1.4497385025024414, | |
| "learning_rate": 4.172571428571428e-05, | |
| "loss": 0.0917, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 25.942857142857143, | |
| "grad_norm": 1.151632308959961, | |
| "learning_rate": 4.058285714285714e-05, | |
| "loss": 0.0826, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 26.057142857142857, | |
| "grad_norm": 1.5499285459518433, | |
| "learning_rate": 3.944e-05, | |
| "loss": 0.0826, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 26.17142857142857, | |
| "grad_norm": 1.4620537757873535, | |
| "learning_rate": 3.829714285714285e-05, | |
| "loss": 0.0843, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 26.285714285714285, | |
| "grad_norm": 0.7379088401794434, | |
| "learning_rate": 3.715428571428571e-05, | |
| "loss": 0.0748, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 26.4, | |
| "grad_norm": 0.5435966849327087, | |
| "learning_rate": 3.601142857142857e-05, | |
| "loss": 0.0758, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 26.514285714285712, | |
| "grad_norm": 0.7340735197067261, | |
| "learning_rate": 3.4868571428571425e-05, | |
| "loss": 0.0836, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 26.62857142857143, | |
| "grad_norm": 0.8306871056556702, | |
| "learning_rate": 3.372571428571428e-05, | |
| "loss": 0.0848, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 26.742857142857144, | |
| "grad_norm": 1.6395269632339478, | |
| "learning_rate": 3.258285714285714e-05, | |
| "loss": 0.0804, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 26.857142857142858, | |
| "grad_norm": 1.2032523155212402, | |
| "learning_rate": 3.144e-05, | |
| "loss": 0.0804, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 26.97142857142857, | |
| "grad_norm": 1.6825361251831055, | |
| "learning_rate": 3.0297142857142855e-05, | |
| "loss": 0.0827, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 27.085714285714285, | |
| "grad_norm": 1.4704478979110718, | |
| "learning_rate": 2.9154285714285712e-05, | |
| "loss": 0.0804, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 27.2, | |
| "grad_norm": 1.5141932964324951, | |
| "learning_rate": 2.801142857142857e-05, | |
| "loss": 0.0748, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 27.314285714285713, | |
| "grad_norm": 0.6547297835350037, | |
| "learning_rate": 2.6868571428571427e-05, | |
| "loss": 0.0791, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 27.428571428571427, | |
| "grad_norm": 1.112829327583313, | |
| "learning_rate": 2.5725714285714284e-05, | |
| "loss": 0.0781, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 27.542857142857144, | |
| "grad_norm": 1.1256695985794067, | |
| "learning_rate": 2.458285714285714e-05, | |
| "loss": 0.0828, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 27.65714285714286, | |
| "grad_norm": 1.4872969388961792, | |
| "learning_rate": 2.344e-05, | |
| "loss": 0.0793, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 27.771428571428572, | |
| "grad_norm": 1.0138152837753296, | |
| "learning_rate": 2.2297142857142857e-05, | |
| "loss": 0.0794, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 27.885714285714286, | |
| "grad_norm": 1.5701348781585693, | |
| "learning_rate": 2.115428571428571e-05, | |
| "loss": 0.079, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 1.4342325925827026, | |
| "learning_rate": 2.001142857142857e-05, | |
| "loss": 0.0815, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 28.114285714285714, | |
| "grad_norm": 0.6444075107574463, | |
| "learning_rate": 1.886857142857143e-05, | |
| "loss": 0.082, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 28.228571428571428, | |
| "grad_norm": 1.483933448791504, | |
| "learning_rate": 1.7725714285714283e-05, | |
| "loss": 0.0743, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 28.34285714285714, | |
| "grad_norm": 0.6548141241073608, | |
| "learning_rate": 1.6582857142857144e-05, | |
| "loss": 0.0755, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 28.457142857142856, | |
| "grad_norm": 1.0112509727478027, | |
| "learning_rate": 1.5439999999999998e-05, | |
| "loss": 0.0752, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 28.571428571428573, | |
| "grad_norm": 0.8533725738525391, | |
| "learning_rate": 1.4297142857142855e-05, | |
| "loss": 0.0819, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 28.685714285714287, | |
| "grad_norm": 1.2526988983154297, | |
| "learning_rate": 1.3154285714285713e-05, | |
| "loss": 0.0725, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 28.8, | |
| "grad_norm": 0.8007093667984009, | |
| "learning_rate": 1.2011428571428572e-05, | |
| "loss": 0.0764, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 28.914285714285715, | |
| "grad_norm": 0.9422992467880249, | |
| "learning_rate": 1.0868571428571428e-05, | |
| "loss": 0.0743, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 29.02857142857143, | |
| "grad_norm": 0.9128634333610535, | |
| "learning_rate": 9.725714285714285e-06, | |
| "loss": 0.0754, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 29.142857142857142, | |
| "grad_norm": 1.241011619567871, | |
| "learning_rate": 8.582857142857142e-06, | |
| "loss": 0.077, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 29.257142857142856, | |
| "grad_norm": 1.006628394126892, | |
| "learning_rate": 7.439999999999999e-06, | |
| "loss": 0.0689, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 29.37142857142857, | |
| "grad_norm": 1.3787076473236084, | |
| "learning_rate": 6.2971428571428565e-06, | |
| "loss": 0.0789, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 29.485714285714284, | |
| "grad_norm": 1.1843293905258179, | |
| "learning_rate": 5.154285714285714e-06, | |
| "loss": 0.0746, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 29.6, | |
| "grad_norm": 0.8705450296401978, | |
| "learning_rate": 4.0114285714285705e-06, | |
| "loss": 0.0754, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 29.714285714285715, | |
| "grad_norm": 1.1954331398010254, | |
| "learning_rate": 2.868571428571428e-06, | |
| "loss": 0.0716, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 29.82857142857143, | |
| "grad_norm": 0.8840579986572266, | |
| "learning_rate": 1.7257142857142856e-06, | |
| "loss": 0.0802, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 29.942857142857143, | |
| "grad_norm": 0.8856578469276428, | |
| "learning_rate": 5.828571428571428e-07, | |
| "loss": 0.076, | |
| "step": 26200 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 26250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 30, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7105444577280000.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
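
For reference, a minimal sketch of consuming this state, assuming it is saved as `trainer_state.json` (the name the Hugging Face `Trainer` writes inside each checkpoint directory); the output filename `training_curve.png` and the plotting choices are illustrative, not part of the original file. It reads `log_history` and plots the per-step training loss alongside the linearly decaying learning rate recorded above:

```python
# Sketch: read a Trainer state file and plot its log_history.
# Assumptions (not from the original file): the JSON is stored as
# "trainer_state.json" and matplotlib is available; the output name is illustrative.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep entries that carry a training loss (every logged step in this file does).
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, ax_loss = plt.subplots(figsize=(8, 4))
ax_loss.plot(steps, losses, label="training loss")
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")

ax_lr = ax_loss.twinx()  # second y-axis for the decaying learning rate
ax_lr.plot(steps, lrs, color="gray", alpha=0.5, label="learning rate")
ax_lr.set_ylabel("learning rate")

fig.tight_layout()
fig.savefig("training_curve.png", dpi=150)
```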