{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.025, "grad_norm": 8.230839729309082, "learning_rate": 4.9755e-05, "loss": 4.8912, "step": 50},
    {"epoch": 0.05, "grad_norm": 4.688719749450684, "learning_rate": 4.9505e-05, "loss": 1.6864, "step": 100},
    {"epoch": 0.075, "grad_norm": 5.002687454223633, "learning_rate": 4.9255e-05, "loss": 1.0988, "step": 150},
    {"epoch": 0.1, "grad_norm": 4.780168533325195, "learning_rate": 4.9005e-05, "loss": 0.8088, "step": 200},
    {"epoch": 0.125, "grad_norm": 4.83399772644043, "learning_rate": 4.8755e-05, "loss": 0.6977, "step": 250},
    {"epoch": 0.15, "grad_norm": 3.855978488922119, "learning_rate": 4.8505e-05, "loss": 0.5413, "step": 300},
    {"epoch": 0.175, "grad_norm": 2.9968976974487305, "learning_rate": 4.8255e-05, "loss": 0.4831, "step": 350},
    {"epoch": 0.2, "grad_norm": 3.2543575763702393, "learning_rate": 4.8005e-05, "loss": 0.464, "step": 400},
    {"epoch": 0.225, "grad_norm": 2.6646761894226074, "learning_rate": 4.7755e-05, "loss": 0.4362, "step": 450},
    {"epoch": 0.25, "grad_norm": 3.7676384449005127, "learning_rate": 4.7505e-05, "loss": 0.4083, "step": 500},
    {"epoch": 0.275, "grad_norm": 3.2820680141448975, "learning_rate": 4.725500000000001e-05, "loss": 0.3694, "step": 550},
    {"epoch": 0.3, "grad_norm": 3.288299322128296, "learning_rate": 4.7005e-05, "loss": 0.3735, "step": 600},
    {"epoch": 0.325, "grad_norm": 3.515395402908325, "learning_rate": 4.6755e-05, "loss": 0.3563, "step": 650},
    {"epoch": 0.35, "grad_norm": 4.2986159324646, "learning_rate": 4.6505e-05, "loss": 0.3822, "step": 700},
    {"epoch": 0.375, "grad_norm": 2.9694278240203857, "learning_rate": 4.6255000000000004e-05, "loss": 0.3447, "step": 750},
    {"epoch": 0.4, "grad_norm": 2.727344512939453, "learning_rate": 4.6005000000000004e-05, "loss": 0.3232, "step": 800},
    {"epoch": 0.425, "grad_norm": 2.925055503845215, "learning_rate": 4.5755000000000005e-05, "loss": 0.3168, "step": 850},
    {"epoch": 0.45, "grad_norm": 1.7304507493972778, "learning_rate": 4.5505000000000006e-05, "loss": 0.3269, "step": 900},
    {"epoch": 0.475, "grad_norm": 2.8221561908721924, "learning_rate": 4.5255000000000006e-05, "loss": 0.3053, "step": 950},
    {"epoch": 0.5, "grad_norm": 3.467146396636963, "learning_rate": 4.5005e-05, "loss": 0.3055, "step": 1000},
    {"epoch": 0.525, "grad_norm": 2.420332908630371, "learning_rate": 4.4755e-05, "loss": 0.3226, "step": 1050},
    {"epoch": 0.55, "grad_norm": 2.0012848377227783, "learning_rate": 4.4505e-05, "loss": 0.318, "step": 1100},
    {"epoch": 0.575, "grad_norm": 3.340888500213623, "learning_rate": 4.4255e-05, "loss": 0.3017, "step": 1150},
    {"epoch": 0.6, "grad_norm": 2.88847279548645, "learning_rate": 4.4005e-05, "loss": 0.3067, "step": 1200},
    {"epoch": 0.625, "grad_norm": 3.5503618717193604, "learning_rate": 4.3755000000000004e-05, "loss": 0.2866, "step": 1250},
    {"epoch": 0.65, "grad_norm": 1.97746741771698, "learning_rate": 4.3505000000000004e-05, "loss": 0.2764, "step": 1300},
    {"epoch": 0.675, "grad_norm": 2.513754367828369, "learning_rate": 4.3255e-05, "loss": 0.2738, "step": 1350},
    {"epoch": 0.7, "grad_norm": 1.928359866142273, "learning_rate": 4.3005e-05, "loss": 0.272, "step": 1400},
    {"epoch": 0.725, "grad_norm": 3.0566446781158447, "learning_rate": 4.2755e-05, "loss": 0.2858, "step": 1450},
    {"epoch": 0.75, "grad_norm": 3.126354932785034, "learning_rate": 4.2505e-05, "loss": 0.2687, "step": 1500},
    {"epoch": 0.775, "grad_norm": 2.6933629512786865, "learning_rate": 4.2255e-05, "loss": 0.2625, "step": 1550},
    {"epoch": 0.8, "grad_norm": 1.8243509531021118, "learning_rate": 4.2005e-05, "loss": 0.26, "step": 1600},
    {"epoch": 0.825, "grad_norm": 1.5713759660720825, "learning_rate": 4.1755e-05, "loss": 0.2658, "step": 1650},
    {"epoch": 0.85, "grad_norm": 3.0764338970184326, "learning_rate": 4.1504999999999996e-05, "loss": 0.2391, "step": 1700},
    {"epoch": 0.875, "grad_norm": 1.8904582262039185, "learning_rate": 4.1255e-05, "loss": 0.2362, "step": 1750},
    {"epoch": 0.9, "grad_norm": 2.638389825820923, "learning_rate": 4.1005000000000005e-05, "loss": 0.2635, "step": 1800},
    {"epoch": 0.925, "grad_norm": 3.567256450653076, "learning_rate": 4.0755000000000005e-05, "loss": 0.2487, "step": 1850},
    {"epoch": 0.95, "grad_norm": 1.0650864839553833, "learning_rate": 4.0505000000000006e-05, "loss": 0.2561, "step": 1900},
    {"epoch": 0.975, "grad_norm": 1.547837257385254, "learning_rate": 4.025500000000001e-05, "loss": 0.2529, "step": 1950},
    {"epoch": 1.0, "grad_norm": 1.24297297000885, "learning_rate": 4.0005e-05, "loss": 0.2592, "step": 2000},
    {"epoch": 1.0, "eval_loss": 0.17051538825035095, "eval_runtime": 64.5766, "eval_samples_per_second": 15.485, "eval_steps_per_second": 3.871, "step": 2000},
    {"epoch": 1.025, "grad_norm": 2.5889508724212646, "learning_rate": 3.9755e-05, "loss": 0.2366, "step": 2050},
    {"epoch": 1.05, "grad_norm": 2.5479001998901367, "learning_rate": 3.9505e-05, "loss": 0.2362, "step": 2100},
    {"epoch": 1.075, "grad_norm": 1.6057442426681519, "learning_rate": 3.9255e-05, "loss": 0.2418, "step": 2150},
    {"epoch": 1.1, "grad_norm": 6.403870582580566, "learning_rate": 3.9005000000000003e-05, "loss": 0.239, "step": 2200},
    {"epoch": 1.125, "grad_norm": 1.7852602005004883, "learning_rate": 3.8755000000000004e-05, "loss": 0.2317, "step": 2250},
    {"epoch": 1.15, "grad_norm": 1.732895851135254, "learning_rate": 3.8505000000000005e-05, "loss": 0.2362, "step": 2300},
    {"epoch": 1.175, "grad_norm": 1.529485821723938, "learning_rate": 3.8255e-05, "loss": 0.2168, "step": 2350},
    {"epoch": 1.2, "grad_norm": 1.7006224393844604, "learning_rate": 3.8005e-05, "loss": 0.2307, "step": 2400},
    {"epoch": 1.225, "grad_norm": 1.7951953411102295, "learning_rate": 3.7755e-05, "loss": 0.2161, "step": 2450},
    {"epoch": 1.25, "grad_norm": 2.2083799839019775, "learning_rate": 3.7505e-05, "loss": 0.2013, "step": 2500},
    {"epoch": 1.275, "grad_norm": 3.326378107070923, "learning_rate": 3.7255e-05, "loss": 0.2323, "step": 2550},
    {"epoch": 1.3, "grad_norm": 2.719395160675049, "learning_rate": 3.7005e-05, "loss": 0.2244, "step": 2600},
    {"epoch": 1.325, "grad_norm": 1.8044047355651855, "learning_rate": 3.6755e-05, "loss": 0.2256, "step": 2650},
    {"epoch": 1.35, "grad_norm": 2.9023215770721436, "learning_rate": 3.6505e-05, "loss": 0.2171, "step": 2700},
    {"epoch": 1.375, "grad_norm": 3.302866220474243, "learning_rate": 3.6255e-05, "loss": 0.2366, "step": 2750},
    {"epoch": 1.4, "grad_norm": 1.5518901348114014, "learning_rate": 3.6005e-05, "loss": 0.2179, "step": 2800},
    {"epoch": 1.425, "grad_norm": 2.0442991256713867, "learning_rate": 3.5755e-05, "loss": 0.2054, "step": 2850},
    {"epoch": 1.45, "grad_norm": 2.274137020111084, "learning_rate": 3.5505e-05, "loss": 0.2158, "step": 2900},
    {"epoch": 1.475, "grad_norm": 2.449765205383301, "learning_rate": 3.5255e-05, "loss": 0.2241, "step": 2950},
    {"epoch": 1.5, "grad_norm": 1.705886960029602, "learning_rate": 3.5005e-05, "loss": 0.2107, "step": 3000},
    {"epoch": 1.525, "grad_norm": 1.9636527299880981, "learning_rate": 3.4755e-05, "loss": 0.2127, "step": 3050},
    {"epoch": 1.55, "grad_norm": 2.048348903656006, "learning_rate": 3.4505e-05, "loss": 0.2244, "step": 3100},
    {"epoch": 1.575, "grad_norm": 2.6252517700195312, "learning_rate": 3.4255e-05, "loss": 0.2038, "step": 3150},
    {"epoch": 1.6, "grad_norm": 5.407904624938965, "learning_rate": 3.4005000000000004e-05, "loss": 0.2146, "step": 3200},
    {"epoch": 1.625, "grad_norm": 2.5721547603607178, "learning_rate": 3.3755000000000005e-05, "loss": 0.2236, "step": 3250},
    {"epoch": 1.65, "grad_norm": 2.8725879192352295, "learning_rate": 3.3505000000000005e-05, "loss": 0.2178, "step": 3300},
    {"epoch": 1.675, "grad_norm": 3.0591113567352295, "learning_rate": 3.3255000000000006e-05, "loss": 0.208, "step": 3350},
    {"epoch": 1.7, "grad_norm": 2.7534451484680176, "learning_rate": 3.3005e-05, "loss": 0.2073, "step": 3400},
    {"epoch": 1.725, "grad_norm": 2.5346784591674805, "learning_rate": 3.2755e-05, "loss": 0.1929, "step": 3450},
    {"epoch": 1.75, "grad_norm": 4.595905780792236, "learning_rate": 3.2505e-05, "loss": 0.2034, "step": 3500},
    {"epoch": 1.775, "grad_norm": 2.5899410247802734, "learning_rate": 3.2255e-05, "loss": 0.2061, "step": 3550},
    {"epoch": 1.8, "grad_norm": 2.3893258571624756, "learning_rate": 3.2005e-05, "loss": 0.1996, "step": 3600},
    {"epoch": 1.825, "grad_norm": 1.27448570728302, "learning_rate": 3.1755000000000003e-05, "loss": 0.2008, "step": 3650},
    {"epoch": 1.85, "grad_norm": 2.1095352172851562, "learning_rate": 3.1505000000000004e-05, "loss": 0.194, "step": 3700},
    {"epoch": 1.875, "grad_norm": 3.202643632888794, "learning_rate": 3.1255e-05, "loss": 0.1924, "step": 3750},
    {"epoch": 1.9, "grad_norm": 2.5014896392822266, "learning_rate": 3.1005e-05, "loss": 0.1814, "step": 3800},
    {"epoch": 1.925, "grad_norm": 1.8779760599136353, "learning_rate": 3.0755e-05, "loss": 0.1876, "step": 3850},
    {"epoch": 1.95, "grad_norm": 1.9704879522323608, "learning_rate": 3.0505e-05, "loss": 0.2035, "step": 3900},
    {"epoch": 1.975, "grad_norm": 1.5915486812591553, "learning_rate": 3.0255e-05, "loss": 0.1871, "step": 3950},
    {"epoch": 2.0, "grad_norm": 1.360573172569275, "learning_rate": 3.0004999999999998e-05, "loss": 0.1721, "step": 4000},
    {"epoch": 2.0, "eval_loss": 0.14247964322566986, "eval_runtime": 57.8322, "eval_samples_per_second": 17.291, "eval_steps_per_second": 4.323, "step": 4000},
    {"epoch": 2.025, "grad_norm": 1.7904751300811768, "learning_rate": 2.9755e-05, "loss": 0.1838, "step": 4050},
    {"epoch": 2.05, "grad_norm": 2.8318934440612793, "learning_rate": 2.9505e-05, "loss": 0.1726, "step": 4100},
    {"epoch": 2.075, "grad_norm": 2.9856910705566406, "learning_rate": 2.9255e-05, "loss": 0.1935, "step": 4150},
    {"epoch": 2.1, "grad_norm": 1.2206937074661255, "learning_rate": 2.9004999999999998e-05, "loss": 0.173, "step": 4200},
    {"epoch": 2.125, "grad_norm": 1.7264775037765503, "learning_rate": 2.8754999999999998e-05, "loss": 0.2022, "step": 4250},
    {"epoch": 2.15, "grad_norm": 1.7826628684997559, "learning_rate": 2.8505000000000002e-05, "loss": 0.1703, "step": 4300},
    {"epoch": 2.175, "grad_norm": 1.6034504175186157, "learning_rate": 2.8255000000000003e-05, "loss": 0.1809, "step": 4350},
    {"epoch": 2.2, "grad_norm": 2.0662107467651367, "learning_rate": 2.8005000000000004e-05, "loss": 0.1707, "step": 4400},
    {"epoch": 2.225, "grad_norm": 2.3653218746185303, "learning_rate": 2.7755000000000004e-05, "loss": 0.176, "step": 4450},
    {"epoch": 2.25, "grad_norm": 1.6109884977340698, "learning_rate": 2.7505000000000002e-05, "loss": 0.1823, "step": 4500},
    {"epoch": 2.275, "grad_norm": 1.0589327812194824, "learning_rate": 2.7255000000000002e-05, "loss": 0.172, "step": 4550},
    {"epoch": 2.3, "grad_norm": 1.8202016353607178, "learning_rate": 2.7005000000000003e-05, "loss": 0.1794, "step": 4600},
    {"epoch": 2.325, "grad_norm": 1.9865130186080933, "learning_rate": 2.6755000000000004e-05, "loss": 0.1734, "step": 4650},
    {"epoch": 2.35, "grad_norm": 1.5040825605392456, "learning_rate": 2.6505e-05, "loss": 0.1757, "step": 4700},
    {"epoch": 2.375, "grad_norm": 1.8152176141738892, "learning_rate": 2.6255000000000002e-05, "loss": 0.2076, "step": 4750},
    {"epoch": 2.4, "grad_norm": 1.0795916318893433, "learning_rate": 2.6005000000000003e-05, "loss": 0.1967, "step": 4800},
    {"epoch": 2.425, "grad_norm": 2.887174129486084, "learning_rate": 2.5755e-05, "loss": 0.189, "step": 4850},
    {"epoch": 2.45, "grad_norm": 1.6766159534454346, "learning_rate": 2.5505e-05, "loss": 0.1768, "step": 4900},
    {"epoch": 2.475, "grad_norm": 1.336639404296875, "learning_rate": 2.5255e-05, "loss": 0.1686, "step": 4950},
    {"epoch": 2.5, "grad_norm": 1.3105192184448242, "learning_rate": 2.5005000000000002e-05, "loss": 0.1906, "step": 5000},
    {"epoch": 2.525, "grad_norm": 3.529892683029175, "learning_rate": 2.4755e-05, "loss": 0.1802, "step": 5050},
    {"epoch": 2.55, "grad_norm": 2.232103109359741, "learning_rate": 2.4505e-05, "loss": 0.1843, "step": 5100},
    {"epoch": 2.575, "grad_norm": 3.0913138389587402, "learning_rate": 2.4255e-05, "loss": 0.1825, "step": 5150},
    {"epoch": 2.6, "grad_norm": 2.240821361541748, "learning_rate": 2.4005e-05, "loss": 0.1681, "step": 5200},
    {"epoch": 2.625, "grad_norm": 1.1728991270065308, "learning_rate": 2.3755000000000002e-05, "loss": 0.1711, "step": 5250},
    {"epoch": 2.65, "grad_norm": 2.0626940727233887, "learning_rate": 2.3505000000000003e-05, "loss": 0.1685, "step": 5300},
    {"epoch": 2.675, "grad_norm": 3.6568186283111572, "learning_rate": 2.3255e-05, "loss": 0.1835, "step": 5350},
    {"epoch": 2.7, "grad_norm": 2.1457467079162598, "learning_rate": 2.3005e-05, "loss": 0.1844, "step": 5400},
    {"epoch": 2.725, "grad_norm": 2.00649094581604, "learning_rate": 2.2755e-05, "loss": 0.1853, "step": 5450},
    {"epoch": 2.75, "grad_norm": 1.8169037103652954, "learning_rate": 2.2505000000000002e-05, "loss": 0.1808, "step": 5500},
    {"epoch": 2.775, "grad_norm": 1.9334608316421509, "learning_rate": 2.2255e-05, "loss": 0.1579, "step": 5550},
    {"epoch": 2.8, "grad_norm": 1.9589757919311523, "learning_rate": 2.2005e-05, "loss": 0.149, "step": 5600},
    {"epoch": 2.825, "grad_norm": 1.5965051651000977, "learning_rate": 2.1755e-05, "loss": 0.1826, "step": 5650},
    {"epoch": 2.85, "grad_norm": 1.2711127996444702, "learning_rate": 2.1505e-05, "loss": 0.1633, "step": 5700},
    {"epoch": 2.875, "grad_norm": 1.6561341285705566, "learning_rate": 2.1255e-05, "loss": 0.174, "step": 5750},
    {"epoch": 2.9, "grad_norm": 1.9388906955718994, "learning_rate": 2.1005e-05, "loss": 0.1692, "step": 5800},
    {"epoch": 2.925, "grad_norm": 2.6603758335113525, "learning_rate": 2.0755000000000004e-05, "loss": 0.1672, "step": 5850},
    {"epoch": 2.95, "grad_norm": 3.2559964656829834, "learning_rate": 2.0505e-05, "loss": 0.1757, "step": 5900},
    {"epoch": 2.975, "grad_norm": 1.919048547744751, "learning_rate": 2.0255000000000002e-05, "loss": 0.1735, "step": 5950},
    {"epoch": 3.0, "grad_norm": 3.5572509765625, "learning_rate": 2.0005000000000002e-05, "loss": 0.1922, "step": 6000},
    {"epoch": 3.0, "eval_loss": 0.12903086841106415, "eval_runtime": 57.4845, "eval_samples_per_second": 17.396, "eval_steps_per_second": 4.349, "step": 6000},
    {"epoch": 3.025, "grad_norm": 1.5934720039367676, "learning_rate": 1.9755e-05, "loss": 0.1699, "step": 6050},
    {"epoch": 3.05, "grad_norm": 2.1117281913757324, "learning_rate": 1.9505e-05, "loss": 0.1738, "step": 6100},
    {"epoch": 3.075, "grad_norm": 3.2797703742980957, "learning_rate": 1.9255e-05, "loss": 0.1694, "step": 6150},
    {"epoch": 3.1, "grad_norm": 1.5360231399536133, "learning_rate": 1.9005000000000002e-05, "loss": 0.1512, "step": 6200},
    {"epoch": 3.125, "grad_norm": 1.4587829113006592, "learning_rate": 1.8755e-05, "loss": 0.1697, "step": 6250},
    {"epoch": 3.15, "grad_norm": 3.891972064971924, "learning_rate": 1.8505e-05, "loss": 0.1733, "step": 6300},
    {"epoch": 3.175, "grad_norm": 3.0549535751342773, "learning_rate": 1.8255e-05, "loss": 0.1514, "step": 6350},
    {"epoch": 3.2, "grad_norm": 1.4268131256103516, "learning_rate": 1.8005e-05, "loss": 0.1844, "step": 6400},
    {"epoch": 3.225, "grad_norm": 1.3441020250320435, "learning_rate": 1.7755000000000002e-05, "loss": 0.1581, "step": 6450},
    {"epoch": 3.25, "grad_norm": 1.2834066152572632, "learning_rate": 1.7505000000000003e-05, "loss": 0.1426, "step": 6500},
    {"epoch": 3.275, "grad_norm": 2.298339366912842, "learning_rate": 1.7255000000000003e-05, "loss": 0.1597, "step": 6550},
    {"epoch": 3.3, "grad_norm": 2.287705659866333, "learning_rate": 1.7005e-05, "loss": 0.1716, "step": 6600},
    {"epoch": 3.325, "grad_norm": 1.9559084177017212, "learning_rate": 1.6755e-05, "loss": 0.142, "step": 6650},
    {"epoch": 3.35, "grad_norm": 1.5680958032608032, "learning_rate": 1.6505000000000002e-05, "loss": 0.1573, "step": 6700},
    {"epoch": 3.375, "grad_norm": 0.831417441368103, "learning_rate": 1.6255e-05, "loss": 0.1687, "step": 6750},
    {"epoch": 3.4, "grad_norm": 2.815106153488159, "learning_rate": 1.6005e-05, "loss": 0.1577, "step": 6800},
    {"epoch": 3.425, "grad_norm": 3.7050323486328125, "learning_rate": 1.5755e-05, "loss": 0.1719, "step": 6850},
    {"epoch": 3.45, "grad_norm": 2.6608221530914307, "learning_rate": 1.5505e-05, "loss": 0.1644, "step": 6900},
    {"epoch": 3.475, "grad_norm": 2.1603200435638428, "learning_rate": 1.5255e-05, "loss": 0.1501, "step": 6950},
    {"epoch": 3.5, "grad_norm": 5.807039260864258, "learning_rate": 1.5005e-05, "loss": 0.1557, "step": 7000},
    {"epoch": 3.525, "grad_norm": 2.840177297592163, "learning_rate": 1.4755e-05, "loss": 0.1632, "step": 7050},
    {"epoch": 3.55, "grad_norm": 1.3503317832946777, "learning_rate": 1.4505000000000003e-05, "loss": 0.1546, "step": 7100},
    {"epoch": 3.575, "grad_norm": 2.2482845783233643, "learning_rate": 1.4255000000000002e-05, "loss": 0.1729, "step": 7150},
    {"epoch": 3.6, "grad_norm": 2.3148233890533447, "learning_rate": 1.4005000000000002e-05, "loss": 0.1611, "step": 7200},
    {"epoch": 3.625, "grad_norm": 1.579547643661499, "learning_rate": 1.3755000000000001e-05, "loss": 0.1448, "step": 7250},
    {"epoch": 3.65, "grad_norm": 2.187784433364868, "learning_rate": 1.3505e-05, "loss": 0.1517, "step": 7300},
    {"epoch": 3.675, "grad_norm": 2.1139655113220215, "learning_rate": 1.3255000000000001e-05, "loss": 0.1499, "step": 7350},
    {"epoch": 3.7, "grad_norm": 2.0396082401275635, "learning_rate": 1.3005e-05, "loss": 0.1669, "step": 7400},
    {"epoch": 3.725, "grad_norm": 3.240251064300537, "learning_rate": 1.2755e-05, "loss": 0.154, "step": 7450},
    {"epoch": 3.75, "grad_norm": 1.9571031332015991, "learning_rate": 1.2505e-05, "loss": 0.1563, "step": 7500},
    {"epoch": 3.775, "grad_norm": 3.260962963104248, "learning_rate": 1.2255e-05, "loss": 0.169, "step": 7550},
    {"epoch": 3.8, "grad_norm": 1.8954269886016846, "learning_rate": 1.2005000000000001e-05, "loss": 0.157, "step": 7600},
    {"epoch": 3.825, "grad_norm": 2.130872964859009, "learning_rate": 1.1755e-05, "loss": 0.1573, "step": 7650},
    {"epoch": 3.85, "grad_norm": 2.272871494293213, "learning_rate": 1.1505e-05, "loss": 0.1459, "step": 7700},
    {"epoch": 3.875, "grad_norm": 2.3315696716308594, "learning_rate": 1.1255e-05, "loss": 0.1647, "step": 7750},
    {"epoch": 3.9, "grad_norm": 1.6526434421539307, "learning_rate": 1.1005e-05, "loss": 0.1436, "step": 7800},
    {"epoch": 3.925, "grad_norm": 0.8647462129592896, "learning_rate": 1.0755000000000001e-05, "loss": 0.1579, "step": 7850},
    {"epoch": 3.95, "grad_norm": 2.794496774673462, "learning_rate": 1.0505e-05, "loss": 0.17, "step": 7900},
    {"epoch": 3.975, "grad_norm": 1.3235639333724976, "learning_rate": 1.0255000000000001e-05, "loss": 0.1565, "step": 7950},
    {"epoch": 4.0, "grad_norm": 2.952364206314087, "learning_rate": 1.0005e-05, "loss": 0.1558, "step": 8000},
    {"epoch": 4.0, "eval_loss": 0.12239066511392593, "eval_runtime": 65.1197, "eval_samples_per_second": 15.356, "eval_steps_per_second": 3.839, "step": 8000},
    {"epoch": 4.025, "grad_norm": 2.830843210220337, "learning_rate": 9.755e-06, "loss": 0.1528, "step": 8050},
    {"epoch": 4.05, "grad_norm": 3.2282028198242188, "learning_rate": 9.505e-06, "loss": 0.1545, "step": 8100},
    {"epoch": 4.075, "grad_norm": 2.520493268966675, "learning_rate": 9.255e-06, "loss": 0.1643, "step": 8150},
    {"epoch": 4.1, "grad_norm": 1.4111956357955933, "learning_rate": 9.005000000000001e-06, "loss": 0.164, "step": 8200},
    {"epoch": 4.125, "grad_norm": 2.735272169113159, "learning_rate": 8.755e-06, "loss": 0.1411, "step": 8250},
    {"epoch": 4.15, "grad_norm": 1.5018057823181152, "learning_rate": 8.505e-06, "loss": 0.1607, "step": 8300},
    {"epoch": 4.175, "grad_norm": 2.3549647331237793, "learning_rate": 8.255e-06, "loss": 0.1589, "step": 8350},
    {"epoch": 4.2, "grad_norm": 1.8919823169708252, "learning_rate": 8.005e-06, "loss": 0.1354, "step": 8400},
    {"epoch": 4.225, "grad_norm": 1.9286110401153564, "learning_rate": 7.755e-06, "loss": 0.1523, "step": 8450},
    {"epoch": 4.25, "grad_norm": 1.6039056777954102, "learning_rate": 7.505000000000001e-06, "loss": 0.141, "step": 8500},
    {"epoch": 4.275, "grad_norm": 1.682285189628601, "learning_rate": 7.255000000000001e-06, "loss": 0.1542, "step": 8550},
    {"epoch": 4.3, "grad_norm": 2.6263935565948486, "learning_rate": 7.005000000000001e-06, "loss": 0.1622, "step": 8600},
    {"epoch": 4.325, "grad_norm": 1.4278345108032227, "learning_rate": 6.7550000000000005e-06, "loss": 0.1381, "step": 8650},
    {"epoch": 4.35, "grad_norm": 2.0380184650421143, "learning_rate": 6.505e-06, "loss": 0.1579, "step": 8700},
    {"epoch": 4.375, "grad_norm": 2.6025896072387695, "learning_rate": 6.254999999999999e-06, "loss": 0.1742, "step": 8750},
    {"epoch": 4.4, "grad_norm": 2.4303677082061768, "learning_rate": 6.005e-06, "loss": 0.139, "step": 8800},
    {"epoch": 4.425, "grad_norm": 1.5707917213439941, "learning_rate": 5.755e-06, "loss": 0.1442, "step": 8850},
    {"epoch": 4.45, "grad_norm": 3.853732109069824, "learning_rate": 5.505000000000001e-06, "loss": 0.1486, "step": 8900},
    {"epoch": 4.475, "grad_norm": 2.794379234313965, "learning_rate": 5.2550000000000005e-06, "loss": 0.1599, "step": 8950},
    {"epoch": 4.5, "grad_norm": 1.6828454732894897, "learning_rate": 5.005e-06, "loss": 0.1338, "step": 9000},
    {"epoch": 4.525, "grad_norm": 1.348374843597412, "learning_rate": 4.755e-06, "loss": 0.1308, "step": 9050},
    {"epoch": 4.55, "grad_norm": 3.534005880355835, "learning_rate": 4.505e-06, "loss": 0.1622, "step": 9100},
    {"epoch": 4.575, "grad_norm": 3.5305874347686768, "learning_rate": 4.255e-06, "loss": 0.1527, "step": 9150},
    {"epoch": 4.6, "grad_norm": 3.4382681846618652, "learning_rate": 4.005000000000001e-06, "loss": 0.1546, "step": 9200},
    {"epoch": 4.625, "grad_norm": 3.7415757179260254, "learning_rate": 3.755e-06, "loss": 0.1595, "step": 9250},
    {"epoch": 4.65, "grad_norm": 1.252262830734253, "learning_rate": 3.505e-06, "loss": 0.1273, "step": 9300},
    {"epoch": 4.675, "grad_norm": 1.3889789581298828, "learning_rate": 3.2550000000000006e-06, "loss": 0.1519, "step": 9350},
    {"epoch": 4.7, "grad_norm": 1.1047695875167847, "learning_rate": 3.005e-06, "loss": 0.1472, "step": 9400},
    {"epoch": 4.725, "grad_norm": 1.56324303150177, "learning_rate": 2.7550000000000003e-06, "loss": 0.1217, "step": 9450},
    {"epoch": 4.75, "grad_norm": 2.3788673877716064, "learning_rate": 2.505e-06, "loss": 0.1534, "step": 9500},
    {"epoch": 4.775, "grad_norm": 1.8210504055023193, "learning_rate": 2.255e-06, "loss": 0.1424, "step": 9550},
    {"epoch": 4.8, "grad_norm": 1.365644097328186, "learning_rate": 2.005e-06, "loss": 0.1686, "step": 9600},
    {"epoch": 4.825, "grad_norm": 1.9589169025421143, "learning_rate": 1.7550000000000001e-06, "loss": 0.1543, "step": 9650},
    {"epoch": 4.85, "grad_norm": 2.8975677490234375, "learning_rate": 1.505e-06, "loss": 0.1449, "step": 9700},
    {"epoch": 4.875, "grad_norm": 0.7453130483627319, "learning_rate": 1.255e-06, "loss": 0.1548, "step": 9750},
    {"epoch": 4.9, "grad_norm": 1.493669867515564, "learning_rate": 1.0050000000000001e-06, "loss": 0.1537, "step": 9800},
    {"epoch": 4.925, "grad_norm": 2.2871060371398926, "learning_rate": 7.550000000000001e-07, "loss": 0.1611, "step": 9850},
    {"epoch": 4.95, "grad_norm": 1.1288138628005981, "learning_rate": 5.05e-07, "loss": 0.1619, "step": 9900},
    {"epoch": 4.975, "grad_norm": 1.5970274209976196, "learning_rate": 2.5500000000000005e-07, "loss": 0.1349, "step": 9950},
    {"epoch": 5.0, "grad_norm": 2.328122615814209, "learning_rate": 5e-09, "loss": 0.1619, "step": 10000},
    {"epoch": 5.0, "eval_loss": 0.12139008939266205, "eval_runtime": 58.7123, "eval_samples_per_second": 17.032, "eval_steps_per_second": 4.258, "step": 10000}
  ],
  "logging_steps": 50,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 338354503680000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}