{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.728622631848438,
  "eval_steps": 99999999,
  "global_step": 19000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.051203277009728626,
      "grad_norm": 4.247892379760742,
      "learning_rate": 2.6315789473684212e-05,
      "loss": 7.4934,
      "step": 100
    },
    {
      "epoch": 0.10240655401945725,
      "grad_norm": 3.517791748046875,
      "learning_rate": 4.997341839447103e-05,
      "loss": 5.579,
      "step": 200
    },
    {
      "epoch": 0.10240655401945725,
      "eval_accuracy": 0.31893968280934165,
      "eval_loss": 4.767695903778076,
      "eval_runtime": 94.8557,
      "eval_samples_per_second": 353.632,
      "eval_steps_per_second": 22.107,
      "step": 200
    },
    {
      "epoch": 0.15360983102918588,
      "grad_norm": 2.3053529262542725,
      "learning_rate": 4.970760233918128e-05,
      "loss": 4.9703,
      "step": 300
    },
    {
      "epoch": 0.2048131080389145,
      "grad_norm": 2.0182411670684814,
      "learning_rate": 4.944178628389155e-05,
      "loss": 4.7716,
      "step": 400
    },
    {
      "epoch": 0.2048131080389145,
      "eval_accuracy": 0.354354102424925,
      "eval_loss": 4.33852481842041,
      "eval_runtime": 95.6535,
      "eval_samples_per_second": 350.682,
      "eval_steps_per_second": 21.923,
      "step": 400
    },
    {
      "epoch": 0.2560163850486431,
      "grad_norm": 1.6917961835861206,
      "learning_rate": 4.917597022860181e-05,
      "loss": 4.5491,
      "step": 500
    },
    {
      "epoch": 0.30721966205837176,
      "grad_norm": 1.3638228178024292,
      "learning_rate": 4.8910154173312074e-05,
      "loss": 4.5162,
      "step": 600
    },
    {
      "epoch": 0.30721966205837176,
      "eval_accuracy": 0.35930259313000934,
      "eval_loss": 4.177200794219971,
      "eval_runtime": 95.6652,
      "eval_samples_per_second": 350.64,
      "eval_steps_per_second": 21.92,
      "step": 600
    },
    {
      "epoch": 0.35842293906810035,
      "grad_norm": 1.5296661853790283,
      "learning_rate": 4.8644338118022334e-05,
      "loss": 4.5039,
      "step": 700
    },
    {
      "epoch": 0.409626216077829,
      "grad_norm": 1.1248077154159546,
      "learning_rate": 4.837852206273259e-05,
      "loss": 4.4056,
      "step": 800
    },
    {
      "epoch": 0.409626216077829,
      "eval_accuracy": 0.36928340870045734,
      "eval_loss": 4.075441837310791,
      "eval_runtime": 96.0011,
      "eval_samples_per_second": 349.412,
      "eval_steps_per_second": 21.843,
      "step": 800
    },
    {
      "epoch": 0.4608294930875576,
      "grad_norm": 1.6162877082824707,
      "learning_rate": 4.811270600744285e-05,
      "loss": 4.3569,
      "step": 900
    },
    {
      "epoch": 0.5120327700972862,
      "grad_norm": 1.6072983741760254,
      "learning_rate": 4.784688995215311e-05,
      "loss": 4.3138,
      "step": 1000
    },
    {
      "epoch": 0.5120327700972862,
      "eval_accuracy": 0.3626070708659433,
      "eval_loss": 4.014294624328613,
      "eval_runtime": 95.7344,
      "eval_samples_per_second": 350.386,
      "eval_steps_per_second": 21.904,
      "step": 1000
    },
    {
      "epoch": 0.5632360471070148,
      "grad_norm": 1.4564248323440552,
      "learning_rate": 4.758107389686337e-05,
      "loss": 4.2611,
      "step": 1100
    },
    {
      "epoch": 0.6144393241167435,
      "grad_norm": 1.2788530588150024,
      "learning_rate": 4.731525784157364e-05,
      "loss": 4.2148,
      "step": 1200
    },
    {
      "epoch": 0.6144393241167435,
      "eval_accuracy": 0.35539686636426476,
      "eval_loss": 3.9600954055786133,
      "eval_runtime": 95.926,
      "eval_samples_per_second": 349.686,
      "eval_steps_per_second": 21.861,
      "step": 1200
    },
    {
      "epoch": 0.6656426011264721,
      "grad_norm": 1.3278919458389282,
      "learning_rate": 4.7049441786283896e-05,
      "loss": 4.2095,
      "step": 1300
    },
    {
      "epoch": 0.7168458781362007,
      "grad_norm": 1.6991857290267944,
      "learning_rate": 4.678362573099415e-05,
      "loss": 4.1925,
      "step": 1400
    },
    {
      "epoch": 0.7168458781362007,
      "eval_accuracy": 0.3723133981106336,
      "eval_loss": 3.901859998703003,
      "eval_runtime": 95.2539,
      "eval_samples_per_second": 352.154,
      "eval_steps_per_second": 22.015,
      "step": 1400
    },
    {
      "epoch": 0.7680491551459293,
      "grad_norm": 1.7155863046646118,
      "learning_rate": 4.6517809675704415e-05,
      "loss": 4.1556,
      "step": 1500
    },
    {
      "epoch": 0.819252432155658,
      "grad_norm": 1.6039901971817017,
      "learning_rate": 4.6251993620414674e-05,
      "loss": 4.0293,
      "step": 1600
    },
    {
      "epoch": 0.819252432155658,
      "eval_accuracy": 0.37494784430112066,
      "eval_loss": 3.857882499694824,
      "eval_runtime": 95.7224,
      "eval_samples_per_second": 350.43,
      "eval_steps_per_second": 21.907,
      "step": 1600
    },
    {
      "epoch": 0.8704557091653866,
      "grad_norm": 1.6892971992492676,
      "learning_rate": 4.598617756512494e-05,
      "loss": 3.939,
      "step": 1700
    },
    {
      "epoch": 0.9216589861751152,
      "grad_norm": 1.5192146301269531,
      "learning_rate": 4.57203615098352e-05,
      "loss": 3.9407,
      "step": 1800
    },
    {
      "epoch": 0.9216589861751152,
      "eval_accuracy": 0.37819625757774467,
      "eval_loss": 3.8101186752319336,
      "eval_runtime": 95.9641,
      "eval_samples_per_second": 349.547,
      "eval_steps_per_second": 21.852,
      "step": 1800
    },
    {
      "epoch": 0.9728622631848438,
      "grad_norm": 1.6878364086151123,
      "learning_rate": 4.545454545454546e-05,
      "loss": 3.9125,
      "step": 1900
    },
    {
      "epoch": 1.0240655401945724,
      "grad_norm": 1.6546193361282349,
      "learning_rate": 4.518872939925572e-05,
      "loss": 3.8371,
      "step": 2000
    },
    {
      "epoch": 1.0240655401945724,
      "eval_accuracy": 0.37209030706755225,
      "eval_loss": 3.787036180496216,
      "eval_runtime": 95.7788,
      "eval_samples_per_second": 350.224,
      "eval_steps_per_second": 21.894,
      "step": 2000
    },
    {
      "epoch": 1.075268817204301,
      "grad_norm": 1.744416356086731,
      "learning_rate": 4.492291334396598e-05,
      "loss": 3.8068,
      "step": 2100
    },
    {
      "epoch": 1.1264720942140296,
      "grad_norm": 1.6214791536331177,
      "learning_rate": 4.4657097288676236e-05,
      "loss": 3.7642,
      "step": 2200
    },
    {
      "epoch": 1.1776753712237582,
      "grad_norm": 1.5377932786941528,
      "learning_rate": 4.43912812333865e-05,
      "loss": 3.7599,
      "step": 2300
    },
    {
      "epoch": 1.228878648233487,
      "grad_norm": 1.8464128971099854,
      "learning_rate": 4.412546517809676e-05,
      "loss": 3.6853,
      "step": 2400
    },
    {
      "epoch": 1.2800819252432156,
      "grad_norm": 1.901739239692688,
      "learning_rate": 4.3859649122807014e-05,
      "loss": 3.6537,
      "step": 2500
    },
    {
      "epoch": 1.3312852022529442,
      "grad_norm": 1.936339020729065,
      "learning_rate": 4.359383306751728e-05,
      "loss": 3.6377,
      "step": 2600
    },
    {
      "epoch": 1.3824884792626728,
      "grad_norm": 1.8641879558563232,
      "learning_rate": 4.332801701222754e-05,
      "loss": 3.5523,
      "step": 2700
    },
    {
      "epoch": 1.4336917562724014,
      "grad_norm": 2.087099075317383,
      "learning_rate": 4.3062200956937806e-05,
      "loss": 3.5007,
      "step": 2800
    },
    {
      "epoch": 1.48489503328213,
      "grad_norm": 1.7824835777282715,
      "learning_rate": 4.2796384901648065e-05,
      "loss": 3.4422,
      "step": 2900
    },
    {
      "epoch": 1.5360983102918588,
      "grad_norm": 2.0298142433166504,
      "learning_rate": 4.253056884635832e-05,
      "loss": 3.4254,
      "step": 3000
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 2.227325439453125,
      "learning_rate": 4.2264752791068584e-05,
      "loss": 3.4087,
      "step": 3100
    },
    {
      "epoch": 1.638504864311316,
      "grad_norm": 2.2700014114379883,
      "learning_rate": 4.199893673577884e-05,
      "loss": 3.4023,
      "step": 3200
    },
    {
      "epoch": 1.6897081413210446,
      "grad_norm": 2.290431022644043,
      "learning_rate": 4.17331206804891e-05,
      "loss": 3.3579,
      "step": 3300
    },
    {
      "epoch": 1.7409114183307732,
      "grad_norm": 2.5916295051574707,
      "learning_rate": 4.146730462519937e-05,
      "loss": 3.2602,
      "step": 3400
    },
    {
      "epoch": 1.7921146953405018,
      "grad_norm": 2.0242862701416016,
      "learning_rate": 4.120148856990963e-05,
      "loss": 3.1941,
      "step": 3500
    },
    {
      "epoch": 1.8433179723502304,
      "grad_norm": 1.8922170400619507,
      "learning_rate": 4.093567251461988e-05,
      "loss": 3.1774,
      "step": 3600
    },
    {
      "epoch": 1.894521249359959,
      "grad_norm": 2.2697248458862305,
      "learning_rate": 4.0669856459330146e-05,
      "loss": 3.2001,
      "step": 3700
    },
    {
      "epoch": 1.9457245263696876,
      "grad_norm": 2.68994402885437,
      "learning_rate": 4.0404040404040405e-05,
      "loss": 3.1232,
      "step": 3800
    },
    {
      "epoch": 1.9969278033794162,
      "grad_norm": 2.16379451751709,
      "learning_rate": 4.0138224348750665e-05,
      "loss": 3.101,
      "step": 3900
    },
    {
      "epoch": 2.048131080389145,
      "grad_norm": 1.908273458480835,
      "learning_rate": 3.987240829346093e-05,
      "loss": 3.0659,
      "step": 4000
    },
    {
      "epoch": 2.048131080389145,
      "eval_accuracy": 0.4085060110901451,
      "eval_loss": 3.467172384262085,
      "eval_runtime": 95.3375,
      "eval_samples_per_second": 351.845,
      "eval_steps_per_second": 21.996,
      "step": 4000
    },
    {
      "epoch": 2.0993343573988734,
      "grad_norm": 2.217175006866455,
      "learning_rate": 3.960659223817118e-05,
      "loss": 3.009,
      "step": 4100
    },
    {
      "epoch": 2.150537634408602,
      "grad_norm": 2.744868755340576,
      "learning_rate": 3.934077618288145e-05,
      "loss": 3.0265,
      "step": 4200
    },
    {
      "epoch": 2.2017409114183306,
      "grad_norm": 2.3460192680358887,
      "learning_rate": 3.907496012759171e-05,
      "loss": 2.9379,
      "step": 4300
    },
    {
      "epoch": 2.252944188428059,
      "grad_norm": 2.120131015777588,
      "learning_rate": 3.880914407230197e-05,
      "loss": 2.9437,
      "step": 4400
    },
    {
      "epoch": 2.3041474654377883,
      "grad_norm": 2.3475334644317627,
      "learning_rate": 3.8543328017012234e-05,
      "loss": 2.933,
      "step": 4500
    },
    {
      "epoch": 2.3553507424475164,
      "grad_norm": 2.450538396835327,
      "learning_rate": 3.8277511961722486e-05,
      "loss": 2.9619,
      "step": 4600
    },
    {
      "epoch": 2.4065540194572455,
      "grad_norm": 2.027897596359253,
      "learning_rate": 3.8011695906432746e-05,
      "loss": 2.9179,
      "step": 4700
    },
    {
      "epoch": 2.457757296466974,
      "grad_norm": 2.3188517093658447,
      "learning_rate": 3.774587985114301e-05,
      "loss": 2.8655,
      "step": 4800
    },
    {
      "epoch": 2.5089605734767026,
      "grad_norm": 1.909847617149353,
      "learning_rate": 3.748006379585327e-05,
      "loss": 2.8649,
      "step": 4900
    },
    {
      "epoch": 2.5601638504864312,
      "grad_norm": 1.8613382577896118,
      "learning_rate": 3.721424774056353e-05,
      "loss": 2.8555,
      "step": 5000
    },
    {
      "epoch": 2.61136712749616,
      "grad_norm": 1.7689636945724487,
      "learning_rate": 3.6948431685273796e-05,
      "loss": 2.8957,
      "step": 5100
    },
    {
      "epoch": 2.6625704045058884,
      "grad_norm": 1.8662793636322021,
      "learning_rate": 3.668261562998405e-05,
      "loss": 2.8132,
      "step": 5200
    },
    {
      "epoch": 2.713773681515617,
      "grad_norm": 1.9725576639175415,
      "learning_rate": 3.6416799574694315e-05,
      "loss": 2.8121,
      "step": 5300
    },
    {
      "epoch": 2.7649769585253456,
      "grad_norm": 1.7306541204452515,
      "learning_rate": 3.6150983519404574e-05,
      "loss": 2.8154,
      "step": 5400
    },
    {
      "epoch": 2.8161802355350742,
      "grad_norm": 1.824539065361023,
      "learning_rate": 3.5885167464114834e-05,
      "loss": 2.776,
      "step": 5500
    },
    {
      "epoch": 2.867383512544803,
      "grad_norm": 1.9677956104278564,
      "learning_rate": 3.56193514088251e-05,
      "loss": 2.8155,
      "step": 5600
    },
    {
      "epoch": 2.9185867895545314,
      "grad_norm": 1.9770982265472412,
      "learning_rate": 3.535353535353535e-05,
      "loss": 2.7638,
      "step": 5700
    },
    {
      "epoch": 2.96979006656426,
      "grad_norm": 2.1280829906463623,
      "learning_rate": 3.508771929824561e-05,
      "loss": 2.7771,
      "step": 5800
    },
    {
      "epoch": 3.0209933435739886,
      "grad_norm": 1.7677674293518066,
      "learning_rate": 3.482190324295588e-05,
      "loss": 2.7382,
      "step": 5900
    },
    {
      "epoch": 3.0721966205837172,
      "grad_norm": 1.6372385025024414,
      "learning_rate": 3.455608718766614e-05,
      "loss": 2.6866,
      "step": 6000
    },
    {
      "epoch": 3.0721966205837172,
      "eval_accuracy": 0.4316442393272171,
      "eval_loss": 3.2849764823913574,
      "eval_runtime": 95.1067,
      "eval_samples_per_second": 352.699,
      "eval_steps_per_second": 22.049,
      "step": 6000
    },
    {
      "epoch": 3.123399897593446,
      "grad_norm": 1.9155375957489014,
      "learning_rate": 3.4290271132376396e-05,
      "loss": 2.6794,
      "step": 6100
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 1.5887531042099,
      "learning_rate": 3.402445507708666e-05,
      "loss": 2.6785,
      "step": 6200
    },
    {
      "epoch": 3.225806451612903,
      "grad_norm": 1.7831392288208008,
      "learning_rate": 3.3758639021796915e-05,
      "loss": 2.682,
      "step": 6300
    },
    {
      "epoch": 3.277009728622632,
      "grad_norm": 1.6347112655639648,
      "learning_rate": 3.349282296650718e-05,
      "loss": 2.6161,
      "step": 6400
    },
    {
      "epoch": 3.32821300563236,
      "grad_norm": 1.774921178817749,
      "learning_rate": 3.322700691121744e-05,
      "loss": 2.6727,
      "step": 6500
    },
    {
      "epoch": 3.3794162826420893,
      "grad_norm": 1.7162151336669922,
      "learning_rate": 3.29611908559277e-05,
      "loss": 2.692,
      "step": 6600
    },
    {
      "epoch": 3.430619559651818,
      "grad_norm": 1.7720268964767456,
      "learning_rate": 3.2695374800637965e-05,
      "loss": 2.6795,
      "step": 6700
    },
    {
      "epoch": 3.4818228366615465,
      "grad_norm": 1.7839425802230835,
      "learning_rate": 3.242955874534822e-05,
      "loss": 2.6487,
      "step": 6800
    },
    {
      "epoch": 3.533026113671275,
      "grad_norm": 1.7593103647232056,
      "learning_rate": 3.216374269005848e-05,
      "loss": 2.6821,
      "step": 6900
    },
    {
      "epoch": 3.5842293906810037,
      "grad_norm": 1.838611125946045,
      "learning_rate": 3.189792663476874e-05,
      "loss": 2.6547,
      "step": 7000
    },
    {
      "epoch": 3.6354326676907323,
      "grad_norm": 1.5734059810638428,
      "learning_rate": 3.1632110579479e-05,
      "loss": 2.6343,
      "step": 7100
    },
    {
      "epoch": 3.686635944700461,
      "grad_norm": 1.7309696674346924,
      "learning_rate": 3.136629452418926e-05,
      "loss": 2.6379,
      "step": 7200
    },
    {
      "epoch": 3.7378392217101895,
      "grad_norm": 1.837202787399292,
      "learning_rate": 3.110047846889952e-05,
      "loss": 2.6094,
      "step": 7300
    },
    {
      "epoch": 3.789042498719918,
      "grad_norm": 1.6872186660766602,
      "learning_rate": 3.083466241360978e-05,
      "loss": 2.5951,
      "step": 7400
    },
    {
      "epoch": 3.8402457757296466,
      "grad_norm": 1.7772648334503174,
      "learning_rate": 3.056884635832004e-05,
      "loss": 2.5935,
      "step": 7500
    },
    {
      "epoch": 3.8914490527393752,
      "grad_norm": 1.7157689332962036,
      "learning_rate": 3.0303030303030306e-05,
      "loss": 2.6076,
      "step": 7600
    },
    {
      "epoch": 3.942652329749104,
      "grad_norm": 1.7484747171401978,
      "learning_rate": 3.0037214247740565e-05,
      "loss": 2.6294,
      "step": 7700
    },
    {
      "epoch": 3.9938556067588324,
      "grad_norm": 1.6567468643188477,
      "learning_rate": 2.9771398192450828e-05,
      "loss": 2.603,
      "step": 7800
    },
    {
      "epoch": 4.0450588837685615,
      "grad_norm": 1.6986713409423828,
      "learning_rate": 2.9505582137161087e-05,
      "loss": 2.5767,
      "step": 7900
    },
    {
      "epoch": 4.09626216077829,
      "grad_norm": 1.9400880336761475,
      "learning_rate": 2.9239766081871346e-05,
      "loss": 2.5063,
      "step": 8000
    },
    {
      "epoch": 4.09626216077829,
      "eval_accuracy": 0.4372182483806064,
      "eval_loss": 3.1963303089141846,
      "eval_runtime": 95.395,
      "eval_samples_per_second": 351.633,
      "eval_steps_per_second": 21.982,
      "step": 8000
    },
    {
      "epoch": 4.147465437788019,
      "grad_norm": 1.7278245687484741,
      "learning_rate": 2.8976608187134503e-05,
      "loss": 2.5088,
      "step": 8100
    },
    {
      "epoch": 4.198668714797747,
      "grad_norm": 1.778096318244934,
      "learning_rate": 2.8710792131844765e-05,
      "loss": 2.5284,
      "step": 8200
    },
    {
      "epoch": 4.249871991807476,
      "grad_norm": 1.990983009338379,
      "learning_rate": 2.8444976076555025e-05,
      "loss": 2.5181,
      "step": 8300
    },
    {
      "epoch": 4.301075268817204,
      "grad_norm": 1.577682375907898,
      "learning_rate": 2.8179160021265287e-05,
      "loss": 2.4916,
      "step": 8400
    },
    {
      "epoch": 4.352278545826933,
      "grad_norm": 1.6727075576782227,
      "learning_rate": 2.7913343965975547e-05,
      "loss": 2.5346,
      "step": 8500
    },
    {
      "epoch": 4.403481822836661,
      "grad_norm": 1.6889344453811646,
      "learning_rate": 2.7647527910685806e-05,
      "loss": 2.5164,
      "step": 8600
    },
    {
      "epoch": 4.45468509984639,
      "grad_norm": 1.763092041015625,
      "learning_rate": 2.738171185539607e-05,
      "loss": 2.5073,
      "step": 8700
    },
    {
      "epoch": 4.505888376856118,
      "grad_norm": 1.6232457160949707,
      "learning_rate": 2.7115895800106328e-05,
      "loss": 2.515,
      "step": 8800
    },
    {
      "epoch": 4.5570916538658475,
      "grad_norm": 1.7539464235305786,
      "learning_rate": 2.6850079744816587e-05,
      "loss": 2.538,
      "step": 8900
    },
    {
      "epoch": 4.6082949308755765,
      "grad_norm": 1.729683518409729,
      "learning_rate": 2.658426368952685e-05,
      "loss": 2.5301,
      "step": 9000
    },
    {
      "epoch": 4.659498207885305,
      "grad_norm": 1.753625750541687,
      "learning_rate": 2.631844763423711e-05,
      "loss": 2.5267,
      "step": 9100
    },
    {
      "epoch": 4.710701484895033,
      "grad_norm": 1.7453761100769043,
      "learning_rate": 2.605263157894737e-05,
      "loss": 2.5207,
      "step": 9200
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 1.9901796579360962,
      "learning_rate": 2.578681552365763e-05,
      "loss": 2.4987,
      "step": 9300
    },
    {
      "epoch": 4.813108038914491,
      "grad_norm": 1.6312249898910522,
      "learning_rate": 2.552099946836789e-05,
      "loss": 2.4943,
      "step": 9400
    },
    {
      "epoch": 4.864311315924219,
      "grad_norm": 1.7069945335388184,
      "learning_rate": 2.525518341307815e-05,
      "loss": 2.4666,
      "step": 9500
    },
    {
      "epoch": 4.915514592933948,
      "grad_norm": 1.7387930154800415,
      "learning_rate": 2.4989367357788412e-05,
      "loss": 2.4932,
      "step": 9600
    },
    {
      "epoch": 4.966717869943676,
      "grad_norm": 1.6212061643600464,
      "learning_rate": 2.472355130249867e-05,
      "loss": 2.52,
      "step": 9700
    },
    {
      "epoch": 5.017921146953405,
      "grad_norm": 1.7624274492263794,
      "learning_rate": 2.445773524720893e-05,
      "loss": 2.4938,
      "step": 9800
    },
    {
      "epoch": 5.0691244239631335,
      "grad_norm": 1.824257254600525,
      "learning_rate": 2.4191919191919194e-05,
      "loss": 2.4275,
      "step": 9900
    },
    {
      "epoch": 5.1203277009728625,
      "grad_norm": 1.616184115409851,
      "learning_rate": 2.3926103136629456e-05,
      "loss": 2.4139,
      "step": 10000
    },
    {
      "epoch": 5.1203277009728625,
      "eval_accuracy": 0.4442466080127022,
      "eval_loss": 3.1405584812164307,
      "eval_runtime": 95.5986,
      "eval_samples_per_second": 350.884,
      "eval_steps_per_second": 21.935,
      "step": 10000
    },
    {
      "epoch": 5.171530977982591,
      "grad_norm": 1.764162302017212,
      "learning_rate": 2.3660287081339712e-05,
      "loss": 2.4878,
      "step": 10100
    },
    {
      "epoch": 5.22273425499232,
      "grad_norm": 1.6120530366897583,
      "learning_rate": 2.3394471026049975e-05,
      "loss": 2.4391,
      "step": 10200
    },
    {
      "epoch": 5.273937532002048,
      "grad_norm": 1.6949564218521118,
      "learning_rate": 2.3128654970760234e-05,
      "loss": 2.4131,
      "step": 10300
    },
    {
      "epoch": 5.325140809011777,
      "grad_norm": 1.667853832244873,
      "learning_rate": 2.2862838915470493e-05,
      "loss": 2.4112,
      "step": 10400
    },
    {
      "epoch": 5.376344086021505,
      "grad_norm": 1.427404522895813,
      "learning_rate": 2.2597022860180756e-05,
      "loss": 2.4242,
      "step": 10500
    },
    {
      "epoch": 5.427547363031234,
      "grad_norm": 1.6098235845565796,
      "learning_rate": 2.2331206804891015e-05,
      "loss": 2.429,
      "step": 10600
    },
    {
      "epoch": 5.478750640040962,
      "grad_norm": 1.8480441570281982,
      "learning_rate": 2.2065390749601278e-05,
      "loss": 2.4439,
      "step": 10700
    },
    {
      "epoch": 5.529953917050691,
      "grad_norm": 1.8656944036483765,
      "learning_rate": 2.1799574694311537e-05,
      "loss": 2.4164,
      "step": 10800
    },
    {
      "epoch": 5.58115719406042,
      "grad_norm": 1.9880622625350952,
      "learning_rate": 2.1533758639021797e-05,
      "loss": 2.465,
      "step": 10900
    },
    {
      "epoch": 5.6323604710701485,
      "grad_norm": 1.8432066440582275,
      "learning_rate": 2.126794258373206e-05,
      "loss": 2.4216,
      "step": 11000
    },
    {
      "epoch": 5.683563748079877,
      "grad_norm": 1.7552499771118164,
      "learning_rate": 2.100212652844232e-05,
      "loss": 2.379,
      "step": 11100
    },
    {
      "epoch": 5.734767025089606,
      "grad_norm": 1.7043852806091309,
      "learning_rate": 2.0736310473152578e-05,
      "loss": 2.4308,
      "step": 11200
    },
    {
      "epoch": 5.785970302099335,
      "grad_norm": 1.8842682838439941,
      "learning_rate": 2.047049441786284e-05,
      "loss": 2.412,
      "step": 11300
    },
    {
      "epoch": 5.837173579109063,
      "grad_norm": 1.6765940189361572,
      "learning_rate": 2.02046783625731e-05,
      "loss": 2.4299,
      "step": 11400
    },
    {
      "epoch": 5.888376856118792,
      "grad_norm": 1.7585973739624023,
      "learning_rate": 1.993886230728336e-05,
      "loss": 2.4078,
      "step": 11500
    },
    {
      "epoch": 5.93958013312852,
      "grad_norm": 1.788201928138733,
      "learning_rate": 1.9673046251993622e-05,
      "loss": 2.4186,
      "step": 11600
    },
    {
      "epoch": 5.990783410138249,
      "grad_norm": 1.6128724813461304,
      "learning_rate": 1.940723019670388e-05,
      "loss": 2.3844,
      "step": 11700
    },
    {
      "epoch": 6.041986687147977,
      "grad_norm": 1.7053714990615845,
      "learning_rate": 1.9141414141414144e-05,
      "loss": 2.3299,
      "step": 11800
    },
    {
      "epoch": 6.093189964157706,
      "grad_norm": 2.06257963180542,
      "learning_rate": 1.8875598086124403e-05,
      "loss": 2.3666,
      "step": 11900
    },
    {
      "epoch": 6.1443932411674345,
      "grad_norm": 1.8419615030288696,
      "learning_rate": 1.8609782030834662e-05,
      "loss": 2.3246,
      "step": 12000
    },
    {
      "epoch": 6.1443932411674345,
      "eval_accuracy": 0.44839520298251256,
      "eval_loss": 3.115219831466675,
      "eval_runtime": 95.2063,
      "eval_samples_per_second": 352.329,
      "eval_steps_per_second": 22.026,
      "step": 12000
    },
    {
      "epoch": 6.1955965181771635,
      "grad_norm": 1.4872585535049438,
      "learning_rate": 1.834662413609782e-05,
      "loss": 2.3234,
      "step": 12100
    },
    {
      "epoch": 6.246799795186892,
      "grad_norm": 1.6765344142913818,
      "learning_rate": 1.808080808080808e-05,
      "loss": 2.353,
      "step": 12200
    },
    {
      "epoch": 6.298003072196621,
      "grad_norm": 1.611560583114624,
      "learning_rate": 1.7814992025518344e-05,
      "loss": 2.3758,
      "step": 12300
    },
    {
      "epoch": 6.349206349206349,
      "grad_norm": 1.7853387594223022,
      "learning_rate": 1.75491759702286e-05,
      "loss": 2.3232,
      "step": 12400
    },
    {
      "epoch": 6.400409626216078,
      "grad_norm": 1.614193320274353,
      "learning_rate": 1.7283359914938863e-05,
      "loss": 2.3759,
      "step": 12500
    },
    {
      "epoch": 6.451612903225806,
      "grad_norm": 1.8242110013961792,
      "learning_rate": 1.7017543859649125e-05,
      "loss": 2.4106,
      "step": 12600
    },
    {
      "epoch": 6.502816180235535,
      "grad_norm": 1.674735188484192,
      "learning_rate": 1.6751727804359385e-05,
      "loss": 2.3834,
      "step": 12700
    },
    {
      "epoch": 6.554019457245264,
      "grad_norm": 1.8332616090774536,
      "learning_rate": 1.6485911749069644e-05,
      "loss": 2.3631,
      "step": 12800
    },
    {
      "epoch": 6.605222734254992,
      "grad_norm": 1.8917430639266968,
      "learning_rate": 1.6220095693779903e-05,
      "loss": 2.3549,
      "step": 12900
    },
    {
      "epoch": 6.65642601126472,
      "grad_norm": 1.9428924322128296,
      "learning_rate": 1.5954279638490166e-05,
      "loss": 2.3205,
      "step": 13000
    },
    {
      "epoch": 6.7076292882744495,
      "grad_norm": 1.639167308807373,
      "learning_rate": 1.568846358320043e-05,
      "loss": 2.3646,
      "step": 13100
    },
    {
      "epoch": 6.7588325652841785,
      "grad_norm": 1.6185379028320312,
      "learning_rate": 1.5422647527910685e-05,
      "loss": 2.3757,
      "step": 13200
    },
    {
      "epoch": 6.810035842293907,
      "grad_norm": 1.6808964014053345,
      "learning_rate": 1.5156831472620947e-05,
      "loss": 2.3542,
      "step": 13300
    },
    {
      "epoch": 6.861239119303636,
      "grad_norm": 1.4692785739898682,
      "learning_rate": 1.4891015417331208e-05,
      "loss": 2.3476,
      "step": 13400
    },
    {
      "epoch": 6.912442396313364,
      "grad_norm": 1.6999351978302002,
      "learning_rate": 1.4625199362041468e-05,
      "loss": 2.3441,
      "step": 13500
    },
    {
      "epoch": 6.963645673323093,
      "grad_norm": 1.7041444778442383,
      "learning_rate": 1.4359383306751729e-05,
      "loss": 2.3725,
      "step": 13600
    },
    {
      "epoch": 7.014848950332821,
      "grad_norm": 1.773606538772583,
      "learning_rate": 1.409356725146199e-05,
      "loss": 2.3379,
      "step": 13700
    },
    {
      "epoch": 7.06605222734255,
      "grad_norm": 1.7857049703598022,
      "learning_rate": 1.382775119617225e-05,
      "loss": 2.3471,
      "step": 13800
    },
    {
      "epoch": 7.117255504352278,
      "grad_norm": 1.7172592878341675,
      "learning_rate": 1.356193514088251e-05,
      "loss": 2.2639,
      "step": 13900
    },
    {
      "epoch": 7.168458781362007,
      "grad_norm": 2.0828044414520264,
      "learning_rate": 1.329611908559277e-05,
      "loss": 2.3111,
      "step": 14000
    },
    {
      "epoch": 7.168458781362007,
      "eval_accuracy": 0.44894773835621105,
      "eval_loss": 3.0879344940185547,
      "eval_runtime": 95.1844,
      "eval_samples_per_second": 352.411,
      "eval_steps_per_second": 22.031,
      "step": 14000
    },
    {
      "epoch": 7.2196620583717355,
      "grad_norm": 1.7171036005020142,
      "learning_rate": 1.3032961190855927e-05,
      "loss": 2.3203,
      "step": 14100
    },
    {
      "epoch": 7.2708653353814645,
      "grad_norm": 1.8851195573806763,
      "learning_rate": 1.2767145135566188e-05,
      "loss": 2.3111,
      "step": 14200
    },
    {
      "epoch": 7.322068612391193,
      "grad_norm": 1.8158164024353027,
      "learning_rate": 1.250132908027645e-05,
      "loss": 2.3227,
      "step": 14300
    },
    {
      "epoch": 7.373271889400922,
      "grad_norm": 1.833143711090088,
      "learning_rate": 1.223551302498671e-05,
      "loss": 2.2796,
      "step": 14400
    },
    {
      "epoch": 7.42447516641065,
      "grad_norm": 1.5996191501617432,
      "learning_rate": 1.196969696969697e-05,
      "loss": 2.2753,
      "step": 14500
    },
    {
      "epoch": 7.475678443420379,
      "grad_norm": 1.7543715238571167,
      "learning_rate": 1.170388091440723e-05,
      "loss": 2.3191,
      "step": 14600
    },
    {
      "epoch": 7.526881720430108,
      "grad_norm": 1.8224458694458008,
      "learning_rate": 1.1438064859117491e-05,
      "loss": 2.3362,
      "step": 14700
    },
    {
      "epoch": 7.578084997439836,
      "grad_norm": 1.825265884399414,
      "learning_rate": 1.1172248803827752e-05,
      "loss": 2.2788,
      "step": 14800
    },
    {
      "epoch": 7.629288274449565,
      "grad_norm": 1.6064096689224243,
      "learning_rate": 1.0906432748538012e-05,
      "loss": 2.3287,
      "step": 14900
    },
    {
      "epoch": 7.680491551459293,
      "grad_norm": 1.9973961114883423,
      "learning_rate": 1.0640616693248273e-05,
      "loss": 2.3054,
      "step": 15000
    },
    {
      "epoch": 7.731694828469022,
      "grad_norm": 1.9129321575164795,
      "learning_rate": 1.0374800637958534e-05,
      "loss": 2.2897,
      "step": 15100
    },
    {
      "epoch": 7.7828981054787505,
      "grad_norm": 1.7142568826675415,
      "learning_rate": 1.0108984582668795e-05,
      "loss": 2.2763,
      "step": 15200
    },
    {
      "epoch": 7.8341013824884795,
      "grad_norm": 1.8948873281478882,
      "learning_rate": 9.843168527379054e-06,
      "loss": 2.2831,
      "step": 15300
    },
    {
      "epoch": 7.885304659498208,
      "grad_norm": 1.671781301498413,
      "learning_rate": 9.577352472089313e-06,
      "loss": 2.301,
      "step": 15400
    },
    {
      "epoch": 7.936507936507937,
      "grad_norm": 1.8178682327270508,
      "learning_rate": 9.311536416799576e-06,
      "loss": 2.3297,
      "step": 15500
    },
    {
      "epoch": 7.987711213517665,
      "grad_norm": 1.7934144735336304,
      "learning_rate": 9.045720361509835e-06,
      "loss": 2.2703,
      "step": 15600
    },
    {
      "epoch": 8.038914490527393,
      "grad_norm": 1.7798469066619873,
      "learning_rate": 8.779904306220096e-06,
      "loss": 2.3071,
      "step": 15700
    },
    {
      "epoch": 8.090117767537123,
      "grad_norm": 2.005934238433838,
      "learning_rate": 8.514088250930355e-06,
      "loss": 2.2579,
      "step": 15800
    },
    {
      "epoch": 8.141321044546851,
      "grad_norm": 1.7841377258300781,
      "learning_rate": 8.248272195640618e-06,
      "loss": 2.2942,
      "step": 15900
    },
    {
      "epoch": 8.19252432155658,
      "grad_norm": 1.814175009727478,
      "learning_rate": 7.982456140350877e-06,
      "loss": 2.2761,
      "step": 16000
    },
    {
      "epoch": 8.19252432155658,
      "eval_accuracy": 0.45419183636131977,
      "eval_loss": 3.0668067932128906,
      "eval_runtime": 95.2855,
      "eval_samples_per_second": 352.037,
      "eval_steps_per_second": 22.008,
      "step": 16000
    },
    {
      "epoch": 8.243727598566307,
      "grad_norm": 1.7196694612503052,
      "learning_rate": 7.719298245614036e-06,
      "loss": 2.2475,
      "step": 16100
    },
    {
      "epoch": 8.294930875576037,
      "grad_norm": 1.780168890953064,
      "learning_rate": 7.453482190324296e-06,
      "loss": 2.2623,
      "step": 16200
    },
    {
      "epoch": 8.346134152585766,
      "grad_norm": 1.7353187799453735,
      "learning_rate": 7.187666135034557e-06,
      "loss": 2.2436,
      "step": 16300
    },
    {
      "epoch": 8.397337429595494,
      "grad_norm": 1.8766587972640991,
      "learning_rate": 6.921850079744817e-06,
      "loss": 2.2514,
      "step": 16400
    },
    {
      "epoch": 8.448540706605222,
      "grad_norm": 1.8156400918960571,
      "learning_rate": 6.656034024455077e-06,
      "loss": 2.2647,
      "step": 16500
    },
    {
      "epoch": 8.499743983614952,
      "grad_norm": 1.7604610919952393,
      "learning_rate": 6.390217969165338e-06,
      "loss": 2.2633,
      "step": 16600
    },
    {
      "epoch": 8.55094726062468,
      "grad_norm": 1.9719775915145874,
      "learning_rate": 6.124401913875599e-06,
      "loss": 2.2729,
      "step": 16700
    },
    {
      "epoch": 8.602150537634408,
      "grad_norm": 1.7817537784576416,
      "learning_rate": 5.858585858585859e-06,
      "loss": 2.2605,
      "step": 16800
    },
    {
      "epoch": 8.653353814644138,
      "grad_norm": 2.052873373031616,
      "learning_rate": 5.59276980329612e-06,
      "loss": 2.2612,
      "step": 16900
    },
    {
      "epoch": 8.704557091653866,
      "grad_norm": 1.7299405336380005,
      "learning_rate": 5.326953748006379e-06,
      "loss": 2.2544,
      "step": 17000
    },
    {
      "epoch": 8.755760368663594,
      "grad_norm": 1.68231999874115,
      "learning_rate": 5.06113769271664e-06,
      "loss": 2.2786,
      "step": 17100
    },
    {
      "epoch": 8.806963645673322,
      "grad_norm": 1.740615725517273,
      "learning_rate": 4.7953216374269005e-06,
      "loss": 2.2514,
      "step": 17200
    },
    {
      "epoch": 8.858166922683052,
      "grad_norm": 1.6444553136825562,
      "learning_rate": 4.5295055821371614e-06,
      "loss": 2.2666,
      "step": 17300
    },
    {
      "epoch": 8.90937019969278,
      "grad_norm": 1.6717356443405151,
      "learning_rate": 4.263689526847422e-06,
      "loss": 2.2459,
      "step": 17400
    },
    {
      "epoch": 8.960573476702509,
      "grad_norm": 1.9801234006881714,
      "learning_rate": 3.9978734715576826e-06,
      "loss": 2.2494,
      "step": 17500
    },
    {
      "epoch": 9.011776753712237,
      "grad_norm": 1.8575870990753174,
      "learning_rate": 3.732057416267943e-06,
      "loss": 2.2438,
      "step": 17600
    },
    {
      "epoch": 9.062980030721967,
      "grad_norm": 1.7382246255874634,
      "learning_rate": 3.4662413609782037e-06,
      "loss": 2.2627,
      "step": 17700
    },
    {
      "epoch": 9.114183307731695,
      "grad_norm": 1.739556074142456,
      "learning_rate": 3.2004253056884642e-06,
      "loss": 2.249,
      "step": 17800
    },
    {
      "epoch": 9.165386584741423,
      "grad_norm": 1.9629456996917725,
      "learning_rate": 2.9346092503987244e-06,
      "loss": 2.2321,
      "step": 17900
    },
    {
      "epoch": 9.216589861751151,
      "grad_norm": 1.9510672092437744,
      "learning_rate": 2.668793195108985e-06,
      "loss": 2.2231,
      "step": 18000
    },
    {
      "epoch": 9.216589861751151,
      "eval_accuracy": 0.4517146740233816,
      "eval_loss": 3.069502830505371,
      "eval_runtime": 95.8923,
      "eval_samples_per_second": 349.809,
      "eval_steps_per_second": 21.868,
      "step": 18000
    },
    {
      "epoch": 9.267793138760881,
      "grad_norm": 1.7088594436645508,
      "learning_rate": 2.4056353003721427e-06,
      "loss": 2.2545,
      "step": 18100
    },
    {
      "epoch": 9.31899641577061,
      "grad_norm": 1.7437766790390015,
      "learning_rate": 2.1398192450824032e-06,
      "loss": 2.232,
      "step": 18200
    },
    {
      "epoch": 9.370199692780337,
      "grad_norm": 1.936566948890686,
      "learning_rate": 1.8740031897926634e-06,
      "loss": 2.2572,
      "step": 18300
    },
    {
      "epoch": 9.421402969790067,
      "grad_norm": 1.9586554765701294,
      "learning_rate": 1.608187134502924e-06,
      "loss": 2.2504,
      "step": 18400
    },
    {
      "epoch": 9.472606246799796,
      "grad_norm": 1.8036390542984009,
      "learning_rate": 1.3423710792131845e-06,
      "loss": 2.2402,
      "step": 18500
    },
    {
      "epoch": 9.523809523809524,
      "grad_norm": 1.866513967514038,
      "learning_rate": 1.076555023923445e-06,
      "loss": 2.2471,
      "step": 18600
    },
    {
      "epoch": 9.575012800819252,
      "grad_norm": 1.9675647020339966,
      "learning_rate": 8.107389686337055e-07,
      "loss": 2.2286,
      "step": 18700
    },
    {
      "epoch": 9.626216077828982,
      "grad_norm": 1.8157950639724731,
      "learning_rate": 5.44922913343966e-07,
      "loss": 2.2222,
      "step": 18800
    },
    {
      "epoch": 9.67741935483871,
      "grad_norm": 1.8483563661575317,
      "learning_rate": 2.7910685805422647e-07,
      "loss": 2.2518,
      "step": 18900
    },
    {
      "epoch": 9.728622631848438,
      "grad_norm": 1.9589660167694092,
      "learning_rate": 1.3290802764486977e-08,
      "loss": 2.2022,
      "step": 19000
    },
    {
      "epoch": 9.728622631848438,
      "step": 19000,
      "total_flos": 7.9413964701696e+16,
      "train_loss": 2.775124670731394,
      "train_runtime": 3271.669,
      "train_samples_per_second": 92.919,
      "train_steps_per_second": 5.807
    }
  ],
  "logging_steps": 100,
  "max_steps": 19000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 99999999,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.9413964701696e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}