{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 8.230839729309082, "learning_rate": 4.9755e-05, "loss": 4.8912, "step": 50 }, { "epoch": 0.05, "grad_norm": 4.688719749450684, "learning_rate": 4.9505e-05, "loss": 1.6864, "step": 100 }, { "epoch": 0.075, "grad_norm": 5.002687454223633, "learning_rate": 4.9255e-05, "loss": 1.0988, "step": 150 }, { "epoch": 0.1, "grad_norm": 4.780168533325195, "learning_rate": 4.9005e-05, "loss": 0.8088, "step": 200 }, { "epoch": 0.125, "grad_norm": 4.83399772644043, "learning_rate": 4.8755e-05, "loss": 0.6977, "step": 250 }, { "epoch": 0.15, "grad_norm": 3.855978488922119, "learning_rate": 4.8505e-05, "loss": 0.5413, "step": 300 }, { "epoch": 0.175, "grad_norm": 2.9968976974487305, "learning_rate": 4.8255e-05, "loss": 0.4831, "step": 350 }, { "epoch": 0.2, "grad_norm": 3.2543575763702393, "learning_rate": 4.8005e-05, "loss": 0.464, "step": 400 }, { "epoch": 0.225, "grad_norm": 2.6646761894226074, "learning_rate": 4.7755e-05, "loss": 0.4362, "step": 450 }, { "epoch": 0.25, "grad_norm": 3.7676384449005127, "learning_rate": 4.7505e-05, "loss": 0.4083, "step": 500 }, { "epoch": 0.275, "grad_norm": 3.2820680141448975, "learning_rate": 4.725500000000001e-05, "loss": 0.3694, "step": 550 }, { "epoch": 0.3, "grad_norm": 3.288299322128296, "learning_rate": 4.7005e-05, "loss": 0.3735, "step": 600 }, { "epoch": 0.325, "grad_norm": 3.515395402908325, "learning_rate": 4.6755e-05, "loss": 0.3563, "step": 650 }, { "epoch": 0.35, "grad_norm": 4.2986159324646, "learning_rate": 4.6505e-05, "loss": 0.3822, "step": 700 }, { "epoch": 0.375, "grad_norm": 2.9694278240203857, "learning_rate": 4.6255000000000004e-05, "loss": 0.3447, "step": 750 }, { "epoch": 0.4, "grad_norm": 2.727344512939453, "learning_rate": 4.6005000000000004e-05, "loss": 0.3232, "step": 800 }, { "epoch": 0.425, "grad_norm": 2.925055503845215, "learning_rate": 4.5755000000000005e-05, "loss": 0.3168, "step": 850 }, { "epoch": 0.45, "grad_norm": 1.7304507493972778, "learning_rate": 4.5505000000000006e-05, "loss": 0.3269, "step": 900 }, { "epoch": 0.475, "grad_norm": 2.8221561908721924, "learning_rate": 4.5255000000000006e-05, "loss": 0.3053, "step": 950 }, { "epoch": 0.5, "grad_norm": 3.467146396636963, "learning_rate": 4.5005e-05, "loss": 0.3055, "step": 1000 }, { "epoch": 0.525, "grad_norm": 2.420332908630371, "learning_rate": 4.4755e-05, "loss": 0.3226, "step": 1050 }, { "epoch": 0.55, "grad_norm": 2.0012848377227783, "learning_rate": 4.4505e-05, "loss": 0.318, "step": 1100 }, { "epoch": 0.575, "grad_norm": 3.340888500213623, "learning_rate": 4.4255e-05, "loss": 0.3017, "step": 1150 }, { "epoch": 0.6, "grad_norm": 2.88847279548645, "learning_rate": 4.4005e-05, "loss": 0.3067, "step": 1200 }, { "epoch": 0.625, "grad_norm": 3.5503618717193604, "learning_rate": 4.3755000000000004e-05, "loss": 0.2866, "step": 1250 }, { "epoch": 0.65, "grad_norm": 1.97746741771698, "learning_rate": 4.3505000000000004e-05, "loss": 0.2764, "step": 1300 }, { "epoch": 0.675, "grad_norm": 2.513754367828369, "learning_rate": 4.3255e-05, "loss": 0.2738, "step": 1350 }, { "epoch": 0.7, "grad_norm": 1.928359866142273, "learning_rate": 4.3005e-05, "loss": 0.272, "step": 1400 }, { "epoch": 0.725, "grad_norm": 3.0566446781158447, "learning_rate": 4.2755e-05, "loss": 0.2858, "step": 1450 }, { "epoch": 0.75, "grad_norm": 3.126354932785034, "learning_rate": 4.2505e-05, "loss": 0.2687, "step": 1500 }, { "epoch": 0.775, "grad_norm": 2.6933629512786865, "learning_rate": 4.2255e-05, "loss": 0.2625, "step": 1550 }, { "epoch": 0.8, "grad_norm": 1.8243509531021118, "learning_rate": 4.2005e-05, "loss": 0.26, "step": 1600 }, { "epoch": 0.825, "grad_norm": 1.5713759660720825, "learning_rate": 4.1755e-05, "loss": 0.2658, "step": 1650 }, { "epoch": 0.85, "grad_norm": 3.0764338970184326, "learning_rate": 4.1504999999999996e-05, "loss": 0.2391, "step": 1700 }, { "epoch": 0.875, "grad_norm": 1.8904582262039185, "learning_rate": 4.1255e-05, "loss": 0.2362, "step": 1750 }, { "epoch": 0.9, "grad_norm": 2.638389825820923, "learning_rate": 4.1005000000000005e-05, "loss": 0.2635, "step": 1800 }, { "epoch": 0.925, "grad_norm": 3.567256450653076, "learning_rate": 4.0755000000000005e-05, "loss": 0.2487, "step": 1850 }, { "epoch": 0.95, "grad_norm": 1.0650864839553833, "learning_rate": 4.0505000000000006e-05, "loss": 0.2561, "step": 1900 }, { "epoch": 0.975, "grad_norm": 1.547837257385254, "learning_rate": 4.025500000000001e-05, "loss": 0.2529, "step": 1950 }, { "epoch": 1.0, "grad_norm": 1.24297297000885, "learning_rate": 4.0005e-05, "loss": 0.2592, "step": 2000 }, { "epoch": 1.0, "eval_loss": 0.17051538825035095, "eval_runtime": 64.5766, "eval_samples_per_second": 15.485, "eval_steps_per_second": 3.871, "step": 2000 }, { "epoch": 1.025, "grad_norm": 2.5889508724212646, "learning_rate": 3.9755e-05, "loss": 0.2366, "step": 2050 }, { "epoch": 1.05, "grad_norm": 2.5479001998901367, "learning_rate": 3.9505e-05, "loss": 0.2362, "step": 2100 }, { "epoch": 1.075, "grad_norm": 1.6057442426681519, "learning_rate": 3.9255e-05, "loss": 0.2418, "step": 2150 }, { "epoch": 1.1, "grad_norm": 6.403870582580566, "learning_rate": 3.9005000000000003e-05, "loss": 0.239, "step": 2200 }, { "epoch": 1.125, "grad_norm": 1.7852602005004883, "learning_rate": 3.8755000000000004e-05, "loss": 0.2317, "step": 2250 }, { "epoch": 1.15, "grad_norm": 1.732895851135254, "learning_rate": 3.8505000000000005e-05, "loss": 0.2362, "step": 2300 }, { "epoch": 1.175, "grad_norm": 1.529485821723938, "learning_rate": 3.8255e-05, "loss": 0.2168, "step": 2350 }, { "epoch": 1.2, "grad_norm": 1.7006224393844604, "learning_rate": 3.8005e-05, "loss": 0.2307, "step": 2400 }, { "epoch": 1.225, "grad_norm": 1.7951953411102295, "learning_rate": 3.7755e-05, "loss": 0.2161, "step": 2450 }, { "epoch": 1.25, "grad_norm": 2.2083799839019775, "learning_rate": 3.7505e-05, "loss": 0.2013, "step": 2500 }, { "epoch": 1.275, "grad_norm": 3.326378107070923, "learning_rate": 3.7255e-05, "loss": 0.2323, "step": 2550 }, { "epoch": 1.3, "grad_norm": 2.719395160675049, "learning_rate": 3.7005e-05, "loss": 0.2244, "step": 2600 }, { "epoch": 1.325, "grad_norm": 1.8044047355651855, "learning_rate": 3.6755e-05, "loss": 0.2256, "step": 2650 }, { "epoch": 1.35, "grad_norm": 2.9023215770721436, "learning_rate": 3.6505e-05, "loss": 0.2171, "step": 2700 }, { "epoch": 1.375, "grad_norm": 3.302866220474243, "learning_rate": 3.6255e-05, "loss": 0.2366, "step": 2750 }, { "epoch": 1.4, "grad_norm": 1.5518901348114014, "learning_rate": 3.6005e-05, "loss": 0.2179, "step": 2800 }, { "epoch": 1.425, "grad_norm": 2.0442991256713867, "learning_rate": 3.5755e-05, "loss": 0.2054, "step": 2850 }, { "epoch": 1.45, "grad_norm": 2.274137020111084, "learning_rate": 3.5505e-05, "loss": 0.2158, "step": 2900 }, { "epoch": 1.475, "grad_norm": 2.449765205383301, "learning_rate": 3.5255e-05, "loss": 0.2241, "step": 2950 }, { "epoch": 1.5, "grad_norm": 1.705886960029602, "learning_rate": 3.5005e-05, "loss": 0.2107, "step": 3000 }, { "epoch": 1.525, "grad_norm": 1.9636527299880981, "learning_rate": 3.4755e-05, "loss": 0.2127, "step": 3050 }, { "epoch": 1.55, "grad_norm": 2.048348903656006, "learning_rate": 3.4505e-05, "loss": 0.2244, "step": 3100 }, { "epoch": 1.575, "grad_norm": 2.6252517700195312, "learning_rate": 3.4255e-05, "loss": 0.2038, "step": 3150 }, { "epoch": 1.6, "grad_norm": 5.407904624938965, "learning_rate": 3.4005000000000004e-05, "loss": 0.2146, "step": 3200 }, { "epoch": 1.625, "grad_norm": 2.5721547603607178, "learning_rate": 3.3755000000000005e-05, "loss": 0.2236, "step": 3250 }, { "epoch": 1.65, "grad_norm": 2.8725879192352295, "learning_rate": 3.3505000000000005e-05, "loss": 0.2178, "step": 3300 }, { "epoch": 1.675, "grad_norm": 3.0591113567352295, "learning_rate": 3.3255000000000006e-05, "loss": 0.208, "step": 3350 }, { "epoch": 1.7, "grad_norm": 2.7534451484680176, "learning_rate": 3.3005e-05, "loss": 0.2073, "step": 3400 }, { "epoch": 1.725, "grad_norm": 2.5346784591674805, "learning_rate": 3.2755e-05, "loss": 0.1929, "step": 3450 }, { "epoch": 1.75, "grad_norm": 4.595905780792236, "learning_rate": 3.2505e-05, "loss": 0.2034, "step": 3500 }, { "epoch": 1.775, "grad_norm": 2.5899410247802734, "learning_rate": 3.2255e-05, "loss": 0.2061, "step": 3550 }, { "epoch": 1.8, "grad_norm": 2.3893258571624756, "learning_rate": 3.2005e-05, "loss": 0.1996, "step": 3600 }, { "epoch": 1.825, "grad_norm": 1.27448570728302, "learning_rate": 3.1755000000000003e-05, "loss": 0.2008, "step": 3650 }, { "epoch": 1.85, "grad_norm": 2.1095352172851562, "learning_rate": 3.1505000000000004e-05, "loss": 0.194, "step": 3700 }, { "epoch": 1.875, "grad_norm": 3.202643632888794, "learning_rate": 3.1255e-05, "loss": 0.1924, "step": 3750 }, { "epoch": 1.9, "grad_norm": 2.5014896392822266, "learning_rate": 3.1005e-05, "loss": 0.1814, "step": 3800 }, { "epoch": 1.925, "grad_norm": 1.8779760599136353, "learning_rate": 3.0755e-05, "loss": 0.1876, "step": 3850 }, { "epoch": 1.95, "grad_norm": 1.9704879522323608, "learning_rate": 3.0505e-05, "loss": 0.2035, "step": 3900 }, { "epoch": 1.975, "grad_norm": 1.5915486812591553, "learning_rate": 3.0255e-05, "loss": 0.1871, "step": 3950 }, { "epoch": 2.0, "grad_norm": 1.360573172569275, "learning_rate": 3.0004999999999998e-05, "loss": 0.1721, "step": 4000 }, { "epoch": 2.0, "eval_loss": 0.14247964322566986, "eval_runtime": 57.8322, "eval_samples_per_second": 17.291, "eval_steps_per_second": 4.323, "step": 4000 }, { "epoch": 2.025, "grad_norm": 1.7904751300811768, "learning_rate": 2.9755e-05, "loss": 0.1838, "step": 4050 }, { "epoch": 2.05, "grad_norm": 2.8318934440612793, "learning_rate": 2.9505e-05, "loss": 0.1726, "step": 4100 }, { "epoch": 2.075, "grad_norm": 2.9856910705566406, "learning_rate": 2.9255e-05, "loss": 0.1935, "step": 4150 }, { "epoch": 2.1, "grad_norm": 1.2206937074661255, "learning_rate": 2.9004999999999998e-05, "loss": 0.173, "step": 4200 }, { "epoch": 2.125, "grad_norm": 1.7264775037765503, "learning_rate": 2.8754999999999998e-05, "loss": 0.2022, "step": 4250 }, { "epoch": 2.15, "grad_norm": 1.7826628684997559, "learning_rate": 2.8505000000000002e-05, "loss": 0.1703, "step": 4300 }, { "epoch": 2.175, "grad_norm": 1.6034504175186157, "learning_rate": 2.8255000000000003e-05, "loss": 0.1809, "step": 4350 }, { "epoch": 2.2, "grad_norm": 2.0662107467651367, "learning_rate": 2.8005000000000004e-05, "loss": 0.1707, "step": 4400 }, { "epoch": 2.225, "grad_norm": 2.3653218746185303, "learning_rate": 2.7755000000000004e-05, "loss": 0.176, "step": 4450 }, { "epoch": 2.25, "grad_norm": 1.6109884977340698, "learning_rate": 2.7505000000000002e-05, "loss": 0.1823, "step": 4500 }, { "epoch": 2.275, "grad_norm": 1.0589327812194824, "learning_rate": 2.7255000000000002e-05, "loss": 0.172, "step": 4550 }, { "epoch": 2.3, "grad_norm": 1.8202016353607178, "learning_rate": 2.7005000000000003e-05, "loss": 0.1794, "step": 4600 }, { "epoch": 2.325, "grad_norm": 1.9865130186080933, "learning_rate": 2.6755000000000004e-05, "loss": 0.1734, "step": 4650 }, { "epoch": 2.35, "grad_norm": 1.5040825605392456, "learning_rate": 2.6505e-05, "loss": 0.1757, "step": 4700 }, { "epoch": 2.375, "grad_norm": 1.8152176141738892, "learning_rate": 2.6255000000000002e-05, "loss": 0.2076, "step": 4750 }, { "epoch": 2.4, "grad_norm": 1.0795916318893433, "learning_rate": 2.6005000000000003e-05, "loss": 0.1967, "step": 4800 }, { "epoch": 2.425, "grad_norm": 2.887174129486084, "learning_rate": 2.5755e-05, "loss": 0.189, "step": 4850 }, { "epoch": 2.45, "grad_norm": 1.6766159534454346, "learning_rate": 2.5505e-05, "loss": 0.1768, "step": 4900 }, { "epoch": 2.475, "grad_norm": 1.336639404296875, "learning_rate": 2.5255e-05, "loss": 0.1686, "step": 4950 }, { "epoch": 2.5, "grad_norm": 1.3105192184448242, "learning_rate": 2.5005000000000002e-05, "loss": 0.1906, "step": 5000 }, { "epoch": 2.525, "grad_norm": 3.529892683029175, "learning_rate": 2.4755e-05, "loss": 0.1802, "step": 5050 }, { "epoch": 2.55, "grad_norm": 2.232103109359741, "learning_rate": 2.4505e-05, "loss": 0.1843, "step": 5100 }, { "epoch": 2.575, "grad_norm": 3.0913138389587402, "learning_rate": 2.4255e-05, "loss": 0.1825, "step": 5150 }, { "epoch": 2.6, "grad_norm": 2.240821361541748, "learning_rate": 2.4005e-05, "loss": 0.1681, "step": 5200 }, { "epoch": 2.625, "grad_norm": 1.1728991270065308, "learning_rate": 2.3755000000000002e-05, "loss": 0.1711, "step": 5250 }, { "epoch": 2.65, "grad_norm": 2.0626940727233887, "learning_rate": 2.3505000000000003e-05, "loss": 0.1685, "step": 5300 }, { "epoch": 2.675, "grad_norm": 3.6568186283111572, "learning_rate": 2.3255e-05, "loss": 0.1835, "step": 5350 }, { "epoch": 2.7, "grad_norm": 2.1457467079162598, "learning_rate": 2.3005e-05, "loss": 0.1844, "step": 5400 }, { "epoch": 2.725, "grad_norm": 2.00649094581604, "learning_rate": 2.2755e-05, "loss": 0.1853, "step": 5450 }, { "epoch": 2.75, "grad_norm": 1.8169037103652954, "learning_rate": 2.2505000000000002e-05, "loss": 0.1808, "step": 5500 }, { "epoch": 2.775, "grad_norm": 1.9334608316421509, "learning_rate": 2.2255e-05, "loss": 0.1579, "step": 5550 }, { "epoch": 2.8, "grad_norm": 1.9589757919311523, "learning_rate": 2.2005e-05, "loss": 0.149, "step": 5600 }, { "epoch": 2.825, "grad_norm": 1.5965051651000977, "learning_rate": 2.1755e-05, "loss": 0.1826, "step": 5650 }, { "epoch": 2.85, "grad_norm": 1.2711127996444702, "learning_rate": 2.1505e-05, "loss": 0.1633, "step": 5700 }, { "epoch": 2.875, "grad_norm": 1.6561341285705566, "learning_rate": 2.1255e-05, "loss": 0.174, "step": 5750 }, { "epoch": 2.9, "grad_norm": 1.9388906955718994, "learning_rate": 2.1005e-05, "loss": 0.1692, "step": 5800 }, { "epoch": 2.925, "grad_norm": 2.6603758335113525, "learning_rate": 2.0755000000000004e-05, "loss": 0.1672, "step": 5850 }, { "epoch": 2.95, "grad_norm": 3.2559964656829834, "learning_rate": 2.0505e-05, "loss": 0.1757, "step": 5900 }, { "epoch": 2.975, "grad_norm": 1.919048547744751, "learning_rate": 2.0255000000000002e-05, "loss": 0.1735, "step": 5950 }, { "epoch": 3.0, "grad_norm": 3.5572509765625, "learning_rate": 2.0005000000000002e-05, "loss": 0.1922, "step": 6000 }, { "epoch": 3.0, "eval_loss": 0.12903086841106415, "eval_runtime": 57.4845, "eval_samples_per_second": 17.396, "eval_steps_per_second": 4.349, "step": 6000 }, { "epoch": 3.025, "grad_norm": 1.5934720039367676, "learning_rate": 1.9755e-05, "loss": 0.1699, "step": 6050 }, { "epoch": 3.05, "grad_norm": 2.1117281913757324, "learning_rate": 1.9505e-05, "loss": 0.1738, "step": 6100 }, { "epoch": 3.075, "grad_norm": 3.2797703742980957, "learning_rate": 1.9255e-05, "loss": 0.1694, "step": 6150 }, { "epoch": 3.1, "grad_norm": 1.5360231399536133, "learning_rate": 1.9005000000000002e-05, "loss": 0.1512, "step": 6200 }, { "epoch": 3.125, "grad_norm": 1.4587829113006592, "learning_rate": 1.8755e-05, "loss": 0.1697, "step": 6250 }, { "epoch": 3.15, "grad_norm": 3.891972064971924, "learning_rate": 1.8505e-05, "loss": 0.1733, "step": 6300 }, { "epoch": 3.175, "grad_norm": 3.0549535751342773, "learning_rate": 1.8255e-05, "loss": 0.1514, "step": 6350 }, { "epoch": 3.2, "grad_norm": 1.4268131256103516, "learning_rate": 1.8005e-05, "loss": 0.1844, "step": 6400 }, { "epoch": 3.225, "grad_norm": 1.3441020250320435, "learning_rate": 1.7755000000000002e-05, "loss": 0.1581, "step": 6450 }, { "epoch": 3.25, "grad_norm": 1.2834066152572632, "learning_rate": 1.7505000000000003e-05, "loss": 0.1426, "step": 6500 }, { "epoch": 3.275, "grad_norm": 2.298339366912842, "learning_rate": 1.7255000000000003e-05, "loss": 0.1597, "step": 6550 }, { "epoch": 3.3, "grad_norm": 2.287705659866333, "learning_rate": 1.7005e-05, "loss": 0.1716, "step": 6600 }, { "epoch": 3.325, "grad_norm": 1.9559084177017212, "learning_rate": 1.6755e-05, "loss": 0.142, "step": 6650 }, { "epoch": 3.35, "grad_norm": 1.5680958032608032, "learning_rate": 1.6505000000000002e-05, "loss": 0.1573, "step": 6700 }, { "epoch": 3.375, "grad_norm": 0.831417441368103, "learning_rate": 1.6255e-05, "loss": 0.1687, "step": 6750 }, { "epoch": 3.4, "grad_norm": 2.815106153488159, "learning_rate": 1.6005e-05, "loss": 0.1577, "step": 6800 }, { "epoch": 3.425, "grad_norm": 3.7050323486328125, "learning_rate": 1.5755e-05, "loss": 0.1719, "step": 6850 }, { "epoch": 3.45, "grad_norm": 2.6608221530914307, "learning_rate": 1.5505e-05, "loss": 0.1644, "step": 6900 }, { "epoch": 3.475, "grad_norm": 2.1603200435638428, "learning_rate": 1.5255e-05, "loss": 0.1501, "step": 6950 }, { "epoch": 3.5, "grad_norm": 5.807039260864258, "learning_rate": 1.5005e-05, "loss": 0.1557, "step": 7000 }, { "epoch": 3.525, "grad_norm": 2.840177297592163, "learning_rate": 1.4755e-05, "loss": 0.1632, "step": 7050 }, { "epoch": 3.55, "grad_norm": 1.3503317832946777, "learning_rate": 1.4505000000000003e-05, "loss": 0.1546, "step": 7100 }, { "epoch": 3.575, "grad_norm": 2.2482845783233643, "learning_rate": 1.4255000000000002e-05, "loss": 0.1729, "step": 7150 }, { "epoch": 3.6, "grad_norm": 2.3148233890533447, "learning_rate": 1.4005000000000002e-05, "loss": 0.1611, "step": 7200 }, { "epoch": 3.625, "grad_norm": 1.579547643661499, "learning_rate": 1.3755000000000001e-05, "loss": 0.1448, "step": 7250 }, { "epoch": 3.65, "grad_norm": 2.187784433364868, "learning_rate": 1.3505e-05, "loss": 0.1517, "step": 7300 }, { "epoch": 3.675, "grad_norm": 2.1139655113220215, "learning_rate": 1.3255000000000001e-05, "loss": 0.1499, "step": 7350 }, { "epoch": 3.7, "grad_norm": 2.0396082401275635, "learning_rate": 1.3005e-05, "loss": 0.1669, "step": 7400 }, { "epoch": 3.725, "grad_norm": 3.240251064300537, "learning_rate": 1.2755e-05, "loss": 0.154, "step": 7450 }, { "epoch": 3.75, "grad_norm": 1.9571031332015991, "learning_rate": 1.2505e-05, "loss": 0.1563, "step": 7500 }, { "epoch": 3.775, "grad_norm": 3.260962963104248, "learning_rate": 1.2255e-05, "loss": 0.169, "step": 7550 }, { "epoch": 3.8, "grad_norm": 1.8954269886016846, "learning_rate": 1.2005000000000001e-05, "loss": 0.157, "step": 7600 }, { "epoch": 3.825, "grad_norm": 2.130872964859009, "learning_rate": 1.1755e-05, "loss": 0.1573, "step": 7650 }, { "epoch": 3.85, "grad_norm": 2.272871494293213, "learning_rate": 1.1505e-05, "loss": 0.1459, "step": 7700 }, { "epoch": 3.875, "grad_norm": 2.3315696716308594, "learning_rate": 1.1255e-05, "loss": 0.1647, "step": 7750 }, { "epoch": 3.9, "grad_norm": 1.6526434421539307, "learning_rate": 1.1005e-05, "loss": 0.1436, "step": 7800 }, { "epoch": 3.925, "grad_norm": 0.8647462129592896, "learning_rate": 1.0755000000000001e-05, "loss": 0.1579, "step": 7850 }, { "epoch": 3.95, "grad_norm": 2.794496774673462, "learning_rate": 1.0505e-05, "loss": 0.17, "step": 7900 }, { "epoch": 3.975, "grad_norm": 1.3235639333724976, "learning_rate": 1.0255000000000001e-05, "loss": 0.1565, "step": 7950 }, { "epoch": 4.0, "grad_norm": 2.952364206314087, "learning_rate": 1.0005e-05, "loss": 0.1558, "step": 8000 }, { "epoch": 4.0, "eval_loss": 0.12239066511392593, "eval_runtime": 65.1197, "eval_samples_per_second": 15.356, "eval_steps_per_second": 3.839, "step": 8000 }, { "epoch": 4.025, "grad_norm": 2.830843210220337, "learning_rate": 9.755e-06, "loss": 0.1528, "step": 8050 }, { "epoch": 4.05, "grad_norm": 3.2282028198242188, "learning_rate": 9.505e-06, "loss": 0.1545, "step": 8100 }, { "epoch": 4.075, "grad_norm": 2.520493268966675, "learning_rate": 9.255e-06, "loss": 0.1643, "step": 8150 }, { "epoch": 4.1, "grad_norm": 1.4111956357955933, "learning_rate": 9.005000000000001e-06, "loss": 0.164, "step": 8200 }, { "epoch": 4.125, "grad_norm": 2.735272169113159, "learning_rate": 8.755e-06, "loss": 0.1411, "step": 8250 }, { "epoch": 4.15, "grad_norm": 1.5018057823181152, "learning_rate": 8.505e-06, "loss": 0.1607, "step": 8300 }, { "epoch": 4.175, "grad_norm": 2.3549647331237793, "learning_rate": 8.255e-06, "loss": 0.1589, "step": 8350 }, { "epoch": 4.2, "grad_norm": 1.8919823169708252, "learning_rate": 8.005e-06, "loss": 0.1354, "step": 8400 }, { "epoch": 4.225, "grad_norm": 1.9286110401153564, "learning_rate": 7.755e-06, "loss": 0.1523, "step": 8450 }, { "epoch": 4.25, "grad_norm": 1.6039056777954102, "learning_rate": 7.505000000000001e-06, "loss": 0.141, "step": 8500 }, { "epoch": 4.275, "grad_norm": 1.682285189628601, "learning_rate": 7.255000000000001e-06, "loss": 0.1542, "step": 8550 }, { "epoch": 4.3, "grad_norm": 2.6263935565948486, "learning_rate": 7.005000000000001e-06, "loss": 0.1622, "step": 8600 }, { "epoch": 4.325, "grad_norm": 1.4278345108032227, "learning_rate": 6.7550000000000005e-06, "loss": 0.1381, "step": 8650 }, { "epoch": 4.35, "grad_norm": 2.0380184650421143, "learning_rate": 6.505e-06, "loss": 0.1579, "step": 8700 }, { "epoch": 4.375, "grad_norm": 2.6025896072387695, "learning_rate": 6.254999999999999e-06, "loss": 0.1742, "step": 8750 }, { "epoch": 4.4, "grad_norm": 2.4303677082061768, "learning_rate": 6.005e-06, "loss": 0.139, "step": 8800 }, { "epoch": 4.425, "grad_norm": 1.5707917213439941, "learning_rate": 5.755e-06, "loss": 0.1442, "step": 8850 }, { "epoch": 4.45, "grad_norm": 3.853732109069824, "learning_rate": 5.505000000000001e-06, "loss": 0.1486, "step": 8900 }, { "epoch": 4.475, "grad_norm": 2.794379234313965, "learning_rate": 5.2550000000000005e-06, "loss": 0.1599, "step": 8950 }, { "epoch": 4.5, "grad_norm": 1.6828454732894897, "learning_rate": 5.005e-06, "loss": 0.1338, "step": 9000 }, { "epoch": 4.525, "grad_norm": 1.348374843597412, "learning_rate": 4.755e-06, "loss": 0.1308, "step": 9050 }, { "epoch": 4.55, "grad_norm": 3.534005880355835, "learning_rate": 4.505e-06, "loss": 0.1622, "step": 9100 }, { "epoch": 4.575, "grad_norm": 3.5305874347686768, "learning_rate": 4.255e-06, "loss": 0.1527, "step": 9150 }, { "epoch": 4.6, "grad_norm": 3.4382681846618652, "learning_rate": 4.005000000000001e-06, "loss": 0.1546, "step": 9200 }, { "epoch": 4.625, "grad_norm": 3.7415757179260254, "learning_rate": 3.755e-06, "loss": 0.1595, "step": 9250 }, { "epoch": 4.65, "grad_norm": 1.252262830734253, "learning_rate": 3.505e-06, "loss": 0.1273, "step": 9300 }, { "epoch": 4.675, "grad_norm": 1.3889789581298828, "learning_rate": 3.2550000000000006e-06, "loss": 0.1519, "step": 9350 }, { "epoch": 4.7, "grad_norm": 1.1047695875167847, "learning_rate": 3.005e-06, "loss": 0.1472, "step": 9400 }, { "epoch": 4.725, "grad_norm": 1.56324303150177, "learning_rate": 2.7550000000000003e-06, "loss": 0.1217, "step": 9450 }, { "epoch": 4.75, "grad_norm": 2.3788673877716064, "learning_rate": 2.505e-06, "loss": 0.1534, "step": 9500 }, { "epoch": 4.775, "grad_norm": 1.8210504055023193, "learning_rate": 2.255e-06, "loss": 0.1424, "step": 9550 }, { "epoch": 4.8, "grad_norm": 1.365644097328186, "learning_rate": 2.005e-06, "loss": 0.1686, "step": 9600 }, { "epoch": 4.825, "grad_norm": 1.9589169025421143, "learning_rate": 1.7550000000000001e-06, "loss": 0.1543, "step": 9650 }, { "epoch": 4.85, "grad_norm": 2.8975677490234375, "learning_rate": 1.505e-06, "loss": 0.1449, "step": 9700 }, { "epoch": 4.875, "grad_norm": 0.7453130483627319, "learning_rate": 1.255e-06, "loss": 0.1548, "step": 9750 }, { "epoch": 4.9, "grad_norm": 1.493669867515564, "learning_rate": 1.0050000000000001e-06, "loss": 0.1537, "step": 9800 }, { "epoch": 4.925, "grad_norm": 2.2871060371398926, "learning_rate": 7.550000000000001e-07, "loss": 0.1611, "step": 9850 }, { "epoch": 4.95, "grad_norm": 1.1288138628005981, "learning_rate": 5.05e-07, "loss": 0.1619, "step": 9900 }, { "epoch": 4.975, "grad_norm": 1.5970274209976196, "learning_rate": 2.5500000000000005e-07, "loss": 0.1349, "step": 9950 }, { "epoch": 5.0, "grad_norm": 2.328122615814209, "learning_rate": 5e-09, "loss": 0.1619, "step": 10000 }, { "epoch": 5.0, "eval_loss": 0.12139008939266205, "eval_runtime": 58.7123, "eval_samples_per_second": 17.032, "eval_steps_per_second": 4.258, "step": 10000 } ], "logging_steps": 50, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 338354503680000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }