{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995721009841677,
  "eval_steps": 876,
  "global_step": 1752,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "eval_loss": 2.8475096225738525,
      "eval_runtime": 103.0812,
      "eval_samples_per_second": 110.845,
      "eval_steps_per_second": 13.863,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 8.440493443386991,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.8492,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.9490346173447717,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.5819,
      "step": 20
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.600282990359268,
      "learning_rate": 6e-06,
      "loss": 2.368,
      "step": 30
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.5128342355183866,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.285,
      "step": 40
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.374859750679287,
      "learning_rate": 1e-05,
      "loss": 2.2456,
      "step": 50
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.5897286646420046,
      "learning_rate": 1.2e-05,
      "loss": 2.2245,
      "step": 60
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.5989674455070406,
      "learning_rate": 1.4e-05,
      "loss": 2.2195,
      "step": 70
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.5945958733352381,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 2.1831,
      "step": 80
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.694220003324143,
      "learning_rate": 1.8e-05,
      "loss": 2.1767,
      "step": 90
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.4554411206177154,
      "learning_rate": 2e-05,
      "loss": 2.1882,
      "step": 100
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.512788547176528,
      "learning_rate": 1.9998191841174705e-05,
      "loss": 2.1663,
      "step": 110
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.4884413661778526,
      "learning_rate": 1.999276801858648e-05,
      "loss": 2.1768,
      "step": 120
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.3566762491802051,
      "learning_rate": 1.998373049366187e-05,
      "loss": 2.1667,
      "step": 130
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.3202292242087335,
      "learning_rate": 1.9971082534656958e-05,
      "loss": 2.1643,
      "step": 140
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.5085528727448396,
      "learning_rate": 1.995482871547548e-05,
      "loss": 2.1487,
      "step": 150
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.5895534610050568,
      "learning_rate": 1.9934974914014765e-05,
      "loss": 2.1724,
      "step": 160
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.3805999898228127,
      "learning_rate": 1.9911528310040073e-05,
      "loss": 2.1537,
      "step": 170
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.4290062832106343,
      "learning_rate": 1.9884497382588185e-05,
      "loss": 2.1519,
      "step": 180
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.3924244336208988,
      "learning_rate": 1.985389190690111e-05,
      "loss": 2.1671,
      "step": 190
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.746424181546391,
      "learning_rate": 1.9819722950891034e-05,
      "loss": 2.1265,
      "step": 200
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.487352260526166,
      "learning_rate": 1.9782002871137835e-05,
      "loss": 2.1444,
      "step": 210
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.8190326103740864,
      "learning_rate": 1.974074530842053e-05,
      "loss": 2.1355,
      "step": 220
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.714020487946669,
      "learning_rate": 1.9695965182784347e-05,
      "loss": 2.1142,
      "step": 230
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.5433895344149444,
      "learning_rate": 1.9647678688145163e-05,
      "loss": 2.1351,
      "step": 240
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.512497351400695,
      "learning_rate": 1.9595903286433256e-05,
      "loss": 2.1321,
      "step": 250
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.5487574537580002,
      "learning_rate": 1.9540657701278536e-05,
      "loss": 2.1423,
      "step": 260
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.4010705826153367,
      "learning_rate": 1.948196191123948e-05,
      "loss": 2.1352,
      "step": 270
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.6179557586451234,
      "learning_rate": 1.9419837142578228e-05,
      "loss": 2.1248,
      "step": 280
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.4561046353587423,
      "learning_rate": 1.9354305861584542e-05,
      "loss": 2.1301,
      "step": 290
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.2669388875042749,
      "learning_rate": 1.928539176645122e-05,
      "loss": 2.1379,
      "step": 300
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.341753008560977,
      "learning_rate": 1.921311977870413e-05,
      "loss": 2.1439,
      "step": 310
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.3365022464568894,
      "learning_rate": 1.9137516034189768e-05,
      "loss": 2.1262,
      "step": 320
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.2734810511458647,
      "learning_rate": 1.9058607873623697e-05,
      "loss": 2.1266,
      "step": 330
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.3720854684658654,
      "learning_rate": 1.897642383270331e-05,
      "loss": 2.1343,
      "step": 340
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.121677779284525,
      "learning_rate": 1.8890993631788384e-05,
      "loss": 2.1352,
      "step": 350
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.189239436645775,
      "learning_rate": 1.880234816515326e-05,
      "loss": 2.0924,
      "step": 360
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.3089885528086298,
      "learning_rate": 1.8710519489814503e-05,
      "loss": 2.1121,
      "step": 370
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.2381050831578697,
      "learning_rate": 1.8615540813938063e-05,
      "loss": 2.1225,
      "step": 380
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.3316405801410958,
      "learning_rate": 1.851744648483014e-05,
      "loss": 2.1203,
      "step": 390
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.1917580911147747,
      "learning_rate": 1.84162719765161e-05,
      "loss": 2.1079,
      "step": 400
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.0924602070988052,
      "learning_rate": 1.831205387691198e-05,
      "loss": 2.111,
      "step": 410
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.4364738066604248,
      "learning_rate": 1.8204829874593083e-05,
      "loss": 2.0951,
      "step": 420
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.1895141642997293,
      "learning_rate": 1.809463874516462e-05,
      "loss": 2.104,
      "step": 430
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.0767130864233465,
      "learning_rate": 1.798152033723923e-05,
      "loss": 2.0951,
      "step": 440
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.2427794223386772,
      "learning_rate": 1.786551555802643e-05,
      "loss": 2.1186,
      "step": 450
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.2682001982680597,
      "learning_rate": 1.774666635853927e-05,
      "loss": 2.0969,
      "step": 460
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.3411561694518368,
      "learning_rate": 1.762501571842355e-05,
      "loss": 2.0851,
      "step": 470
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.1660818376097224,
      "learning_rate": 1.7500607630414973e-05,
      "loss": 2.0842,
      "step": 480
    },
    {
      "epoch": 0.28,
      "grad_norm": 26.782646263514767,
      "learning_rate": 1.7373487084429988e-05,
      "loss": 2.109,
      "step": 490
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.0725118823204605,
      "learning_rate": 1.7243700051296016e-05,
      "loss": 2.1033,
      "step": 500
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.1473544876658683,
      "learning_rate": 1.7111293466126938e-05,
      "loss": 2.1067,
      "step": 510
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.1233164996213143,
      "learning_rate": 1.6976315211349848e-05,
      "loss": 2.1005,
      "step": 520
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.6171587137595458,
      "learning_rate": 1.6838814099389268e-05,
      "loss": 2.1043,
      "step": 530
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.372813395267995,
      "learning_rate": 1.669883985501501e-05,
      "loss": 2.1002,
      "step": 540
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.1127836389394197,
      "learning_rate": 1.6556443097360136e-05,
      "loss": 2.0851,
      "step": 550
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.717610258588088,
      "learning_rate": 1.641167532161545e-05,
      "loss": 2.0845,
      "step": 560
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.5653769104194137,
      "learning_rate": 1.6264588880407218e-05,
      "loss": 2.0724,
      "step": 570
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.2316818487469028,
      "learning_rate": 1.6115236964864798e-05,
      "loss": 2.078,
      "step": 580
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.1213742181427604,
      "learning_rate": 1.5963673585385016e-05,
      "loss": 2.0748,
      "step": 590
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.1502465686154286,
      "learning_rate": 1.580995355210031e-05,
      "loss": 2.093,
      "step": 600
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.0993269956575817,
      "learning_rate": 1.565413245505765e-05,
      "loss": 2.0444,
      "step": 610
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.2751286594255482,
      "learning_rate": 1.5496266644115386e-05,
      "loss": 2.0861,
      "step": 620
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.446189093106462,
      "learning_rate": 1.5336413208565373e-05,
      "loss": 2.0779,
      "step": 630
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.2051699362872939,
      "learning_rate": 1.5174629956487659e-05,
      "loss": 2.0606,
      "step": 640
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.1973773050996088,
      "learning_rate": 1.5010975393845257e-05,
      "loss": 2.053,
      "step": 650
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.1884541045802726,
      "learning_rate": 1.4845508703326504e-05,
      "loss": 2.0756,
      "step": 660
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.073704568611228,
      "learning_rate": 1.4678289722942757e-05,
      "loss": 2.1002,
      "step": 670
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.2085026626189295,
      "learning_rate": 1.4509378924389044e-05,
      "loss": 2.0443,
      "step": 680
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.120124358877359,
      "learning_rate": 1.4338837391175582e-05,
      "loss": 2.0861,
      "step": 690
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.1688083159685203,
      "learning_rate": 1.4166726796538044e-05,
      "loss": 2.0521,
      "step": 700
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.1839022965360089,
      "learning_rate": 1.3993109381134553e-05,
      "loss": 2.0825,
      "step": 710
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.204413246457808,
      "learning_rate": 1.3818047930537491e-05,
      "loss": 2.0638,
      "step": 720
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.0984747546501181,
      "learning_rate": 1.3641605752528225e-05,
      "loss": 2.0668,
      "step": 730
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.2424062933194955,
      "learning_rate": 1.3463846654203021e-05,
      "loss": 2.0704,
      "step": 740
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.2287388164827397,
      "learning_rate": 1.3284834918898362e-05,
      "loss": 2.0604,
      "step": 750
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.2727536904439012,
      "learning_rate": 1.3104635282944054e-05,
      "loss": 2.062,
      "step": 760
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.2445619190644972,
      "learning_rate": 1.2923312912252509e-05,
      "loss": 2.0675,
      "step": 770
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.209175703891303,
      "learning_rate": 1.2740933378752685e-05,
      "loss": 2.0813,
      "step": 780
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.05801221856849,
      "learning_rate": 1.2557562636677195e-05,
      "loss": 2.0291,
      "step": 790
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.2744525431893252,
      "learning_rate": 1.2373266998711152e-05,
      "loss": 2.07,
      "step": 800
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.1119947967102999,
      "learning_rate": 1.2188113112011407e-05,
      "loss": 2.0593,
      "step": 810
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.0106196828749796,
      "learning_rate": 1.2002167934104815e-05,
      "loss": 2.0631,
      "step": 820
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.0010665987533642,
      "learning_rate": 1.1815498708674266e-05,
      "loss": 2.0714,
      "step": 830
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.2516027979222362,
      "learning_rate": 1.162817294124124e-05,
      "loss": 2.0464,
      "step": 840
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.0156082708830316,
      "learning_rate": 1.144025837475365e-05,
      "loss": 2.0557,
      "step": 850
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.1250221739850519,
      "learning_rate": 1.1251822965087856e-05,
      "loss": 2.0496,
      "step": 860
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.2672537523919118,
      "learning_rate": 1.1062934856473655e-05,
      "loss": 2.0448,
      "step": 870
    },
    {
      "epoch": 0.5,
      "eval_loss": 2.0418317317962646,
      "eval_runtime": 103.7763,
      "eval_samples_per_second": 110.102,
      "eval_steps_per_second": 13.77,
      "step": 876
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.1236822586139523,
      "learning_rate": 1.0873662356851164e-05,
      "loss": 2.0371,
      "step": 880
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.992435580318823,
      "learning_rate": 1.0684073913168502e-05,
      "loss": 2.0444,
      "step": 890
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.102811565572559,
      "learning_rate": 1.0494238086629184e-05,
      "loss": 2.0619,
      "step": 900
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.1541097046897795,
      "learning_rate": 1.0304223527898244e-05,
      "loss": 2.0344,
      "step": 910
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0366360190903774,
      "learning_rate": 1.0114098952275935e-05,
      "loss": 2.0665,
      "step": 920
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.0161399852210633,
      "learning_rate": 9.923933114848125e-06,
      "loss": 2.036,
      "step": 930
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.24836412296511,
      "learning_rate": 9.733794785622254e-06,
      "loss": 2.0575,
      "step": 940
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1159767378128298,
      "learning_rate": 9.543752724657924e-06,
      "loss": 2.0264,
      "step": 950
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.089755633213397,
      "learning_rate": 9.353875657201084e-06,
      "loss": 2.0253,
      "step": 960
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.106234215385007,
      "learning_rate": 9.164232248830777e-06,
      "loss": 2.0188,
      "step": 970
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.0275698738246675,
      "learning_rate": 8.974891080627504e-06,
      "loss": 2.0551,
      "step": 980
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.1622793874021287,
      "learning_rate": 8.785920624372122e-06,
      "loss": 2.036,
      "step": 990
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.1956825372131208,
      "learning_rate": 8.597389217784268e-06,
      "loss": 2.0121,
      "step": 1000
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.1603630656961061,
      "learning_rate": 8.409365039809282e-06,
      "loss": 2.0522,
      "step": 1010
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.9807193257053537,
      "learning_rate": 8.221916085962511e-06,
      "loss": 2.0383,
      "step": 1020
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.1576541284187967,
      "learning_rate": 8.03511014374e-06,
      "loss": 2.0338,
      "step": 1030
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.058077247137416,
      "learning_rate": 7.849014768104354e-06,
      "loss": 2.0087,
      "step": 1040
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.0492342153111835,
      "learning_rate": 7.663697257054736e-06,
      "loss": 2.0375,
      "step": 1050
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.0607146652851884,
      "learning_rate": 7.479224627289765e-06,
      "loss": 2.027,
      "step": 1060
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.0606337357914948,
      "learning_rate": 7.295663589972139e-06,
      "loss": 2.0304,
      "step": 1070
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.2458990385743096,
      "learning_rate": 7.113080526603793e-06,
      "loss": 2.0247,
      "step": 1080
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.0520000264708274,
      "learning_rate": 6.93154146502019e-06,
      "loss": 2.0347,
      "step": 1090
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.0190764685112967,
      "learning_rate": 6.7511120555126055e-06,
      "loss": 2.0245,
      "step": 1100
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.0288266074715982,
      "learning_rate": 6.571857547086864e-06,
      "loss": 2.0269,
      "step": 1110
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.0406522209538531,
      "learning_rate": 6.393842763867248e-06,
      "loss": 2.0148,
      "step": 1120
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.9707392990243359,
      "learning_rate": 6.2171320816540144e-06,
      "loss": 2.0242,
      "step": 1130
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.0459965944310712,
      "learning_rate": 6.041789404643078e-06,
      "loss": 2.0217,
      "step": 1140
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.1477415350002944,
      "learning_rate": 5.867878142316221e-06,
      "loss": 2.0396,
      "step": 1150
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.02016191198618,
      "learning_rate": 5.695461186510194e-06,
      "loss": 2.01,
      "step": 1160
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.0555922529805126,
      "learning_rate": 5.524600888673058e-06,
      "loss": 2.0279,
      "step": 1170
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.047057359776084,
      "learning_rate": 5.355359037315893e-06,
      "loss": 2.0288,
      "step": 1180
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.2084733042646434,
      "learning_rate": 5.187796835668137e-06,
      "loss": 2.0069,
      "step": 1190
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.0679386080767506,
      "learning_rate": 5.021974879544522e-06,
      "loss": 2.0239,
      "step": 1200
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.1309906953604523,
      "learning_rate": 4.857953135431723e-06,
      "loss": 1.9917,
      "step": 1210
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.0606942502452434,
      "learning_rate": 4.695790918802577e-06,
      "loss": 2.0103,
      "step": 1220
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.222543190695613,
      "learning_rate": 4.535546872665707e-06,
      "loss": 2.0211,
      "step": 1230
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.1395641397720997,
      "learning_rate": 4.377278946358363e-06,
      "loss": 2.0281,
      "step": 1240
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.2221683190418635,
      "learning_rate": 4.2210443745900806e-06,
      "loss": 1.9907,
      "step": 1250
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.080522060394201,
      "learning_rate": 4.066899656744816e-06,
      "loss": 1.9982,
      "step": 1260
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.9711133864519806,
      "learning_rate": 3.914900536448959e-06,
      "loss": 1.9983,
      "step": 1270
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.1380541178528216,
      "learning_rate": 3.7651019814126656e-06,
      "loss": 2.0097,
      "step": 1280
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.9633220950938465,
      "learning_rate": 3.617558163551802e-06,
      "loss": 1.986,
      "step": 1290
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.0500879776289904,
      "learning_rate": 3.4723224393976353e-06,
      "loss": 2.0258,
      "step": 1300
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.064203126844793,
      "learning_rate": 3.329447330801455e-06,
      "loss": 2.03,
      "step": 1310
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.9922208622268893,
      "learning_rate": 3.1889845059409552e-06,
      "loss": 2.0059,
      "step": 1320
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.153139782016341,
      "learning_rate": 3.0509847606354215e-06,
      "loss": 2.024,
      "step": 1330
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.4479646282996403,
      "learning_rate": 2.91549799997632e-06,
      "loss": 1.996,
      "step": 1340
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.0338025335111782,
      "learning_rate": 2.782573220280055e-06,
      "loss": 2.0027,
      "step": 1350
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.063732584831313,
      "learning_rate": 2.6522584913693295e-06,
      "loss": 1.999,
      "step": 1360
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.18151388005813,
      "learning_rate": 2.5246009391895665e-06,
      "loss": 2.0197,
      "step": 1370
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.96078239213874,
      "learning_rate": 2.3996467287666914e-06,
      "loss": 1.9811,
      "step": 1380
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.2378184274393051,
      "learning_rate": 2.277441047512361e-06,
      "loss": 2.0001,
      "step": 1390
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.0558611234794726,
      "learning_rate": 2.1580280888828e-06,
      "loss": 1.9939,
      "step": 1400
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9611056812337401,
      "learning_rate": 2.041451036397002e-06,
      "loss": 1.9958,
      "step": 1410
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.031950356244068,
      "learning_rate": 1.9277520480202205e-06,
      "loss": 2.0299,
      "step": 1420
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.2064256312461334,
      "learning_rate": 1.81697224091831e-06,
      "loss": 1.9985,
      "step": 1430
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.06230601717853,
      "learning_rate": 1.7091516765884464e-06,
      "loss": 2.0173,
      "step": 1440
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.2012632430707513,
      "learning_rate": 1.6043293463716202e-06,
      "loss": 2.0259,
      "step": 1450
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.0228988400529624,
      "learning_rate": 1.5025431573521209e-06,
      "loss": 2.0066,
      "step": 1460
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.0275759044730708,
      "learning_rate": 1.4038299186491444e-06,
      "loss": 2.0044,
      "step": 1470
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.0956109756099108,
      "learning_rate": 1.308225328105439e-06,
      "loss": 1.9951,
      "step": 1480
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.5121064882610806,
      "learning_rate": 1.215763959377827e-06,
      "loss": 1.9667,
      "step": 1490
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.9788828191357646,
      "learning_rate": 1.1264792494342858e-06,
      "loss": 2.0037,
      "step": 1500
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.9967966044408001,
      "learning_rate": 1.0404034864620605e-06,
      "loss": 1.9866,
      "step": 1510
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.0099926855076669,
      "learning_rate": 9.575677981912457e-07,
      "loss": 1.9836,
      "step": 1520
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.002362361634915,
      "learning_rate": 8.780021406380012e-07,
      "loss": 1.9983,
      "step": 1530
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.9435464306765974,
      "learning_rate": 8.017352872715078e-07,
      "loss": 1.9797,
      "step": 1540
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.999977176523449,
      "learning_rate": 7.287948186085614e-07,
      "loss": 2.0142,
      "step": 1550
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.152097320833946,
      "learning_rate": 6.592071122395849e-07,
      "loss": 2.0097,
      "step": 1560
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.0857203698752966,
      "learning_rate": 5.929973332896677e-07,
      "loss": 2.0183,
      "step": 1570
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.0543333030471254,
      "learning_rate": 5.301894253180295e-07,
      "loss": 1.985,
      "step": 1580
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.0212489054938816,
      "learning_rate": 4.708061016592924e-07,
      "loss": 1.9888,
      "step": 1590
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.2687381408479979,
      "learning_rate": 4.1486883720960436e-07,
      "loss": 1.997,
      "step": 1600
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.0473866672283625,
      "learning_rate": 3.6239786066064264e-07,
      "loss": 2.0062,
      "step": 1610
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.9105990124075911,
      "learning_rate": 3.1341214718426885e-07,
      "loss": 1.999,
      "step": 1620
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.0250683907923996,
      "learning_rate": 2.6792941157051446e-07,
      "loss": 1.9727,
      "step": 1630
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.9011974189916996,
      "learning_rate": 2.2596610182133328e-07,
      "loss": 1.998,
      "step": 1640
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.0603603261183756,
      "learning_rate": 1.8753739320250153e-07,
      "loss": 2.0138,
      "step": 1650
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.9978845922018089,
      "learning_rate": 1.5265718275574658e-07,
      "loss": 1.9968,
      "step": 1660
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.092063001361788,
      "learning_rate": 1.2133808427313486e-07,
      "loss": 2.011,
      "step": 1670
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.0628189984992609,
      "learning_rate": 9.359142373553287e-08,
      "loss": 2.0037,
      "step": 1680
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.9584594600500118,
      "learning_rate": 6.942723521676465e-08,
      "loss": 1.9823,
      "step": 1690
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.0728091649423657,
      "learning_rate": 4.88542572549755e-08,
      "loss": 1.9732,
      "step": 1700
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.0209005535735403,
      "learning_rate": 3.187992969249876e-08,
      "loss": 2.0,
      "step": 1710
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.0849633622627997,
      "learning_rate": 1.851039098537122e-08,
      "loss": 1.9958,
      "step": 1720
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.9962734868875971,
      "learning_rate": 8.750475983472228e-09,
      "loss": 1.9992,
      "step": 1730
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.083369781103426,
      "learning_rate": 2.6037141820933752e-09,
      "loss": 1.9757,
      "step": 1740
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.9782925803672103,
      "learning_rate": 7.232844555282725e-11,
      "loss": 1.9782,
      "step": 1750
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.987204909324646,
      "eval_runtime": 103.685,
      "eval_samples_per_second": 110.199,
      "eval_steps_per_second": 13.782,
      "step": 1752
    }
  ],
  "logging_steps": 10,
  "max_steps": 1752,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 876,
  "total_flos": 366806984294400.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}