| { |
| "best_global_step": 2400, |
| "best_metric": 0.4560001492500305, |
| "best_model_checkpoint": "./phi4-mini-ifc-FULL-2xRTX3090-20250623-182201/checkpoint-2400", |
| "epoch": 2.9993898718730936, |
| "eval_steps": 100, |
| "global_step": 2457, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03050640634533252, |
| "grad_norm": 2.6699628829956055, |
| "learning_rate": 1.9512195121951222e-05, |
| "loss": 3.4284, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.06101281269066504, |
| "grad_norm": 3.3811914920806885, |
| "learning_rate": 3.983739837398374e-05, |
| "loss": 2.0523, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09151921903599756, |
| "grad_norm": 1.4363490343093872, |
| "learning_rate": 6.016260162601627e-05, |
| "loss": 1.1186, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.12202562538133008, |
| "grad_norm": 0.9445595741271973, |
| "learning_rate": 8.048780487804879e-05, |
| "loss": 0.7767, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.12202562538133008, |
| "eval_loss": 0.8299265503883362, |
| "eval_runtime": 871.0818, |
| "eval_samples_per_second": 4.192, |
| "eval_steps_per_second": 0.699, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1525320317266626, |
| "grad_norm": 0.9062737822532654, |
| "learning_rate": 0.0001008130081300813, |
| "loss": 0.6824, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.18303843807199513, |
| "grad_norm": 0.6746254563331604, |
| "learning_rate": 0.00012113821138211383, |
| "loss": 0.5972, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.21354484441732763, |
| "grad_norm": 0.6552643179893494, |
| "learning_rate": 0.00014146341463414634, |
| "loss": 0.5609, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.24405125076266015, |
| "grad_norm": 0.654053270816803, |
| "learning_rate": 0.00016178861788617888, |
| "loss": 0.5291, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.24405125076266015, |
| "eval_loss": 0.6649972200393677, |
| "eval_runtime": 877.5464, |
| "eval_samples_per_second": 4.162, |
| "eval_steps_per_second": 0.694, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2745576571079927, |
| "grad_norm": 0.7661640048027039, |
| "learning_rate": 0.00018211382113821138, |
| "loss": 0.5005, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.3050640634533252, |
| "grad_norm": 0.5404186248779297, |
| "learning_rate": 0.00019999909148078624, |
| "loss": 0.5128, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.33557046979865773, |
| "grad_norm": 0.6695619225502014, |
| "learning_rate": 0.00019992086820059076, |
| "loss": 0.4854, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.36607687614399026, |
| "grad_norm": 0.6187945008277893, |
| "learning_rate": 0.00019971657461388795, |
| "loss": 0.4872, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.36607687614399026, |
| "eval_loss": 0.6011127233505249, |
| "eval_runtime": 894.271, |
| "eval_samples_per_second": 4.084, |
| "eval_steps_per_second": 0.681, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.39658328248932273, |
| "grad_norm": 0.5339706540107727, |
| "learning_rate": 0.00019938646847819693, |
| "loss": 0.458, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.42708968883465526, |
| "grad_norm": 0.5768976211547852, |
| "learning_rate": 0.00019893096628891503, |
| "loss": 0.4696, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4575960951799878, |
| "grad_norm": 0.4378525912761688, |
| "learning_rate": 0.00019835064275382507, |
| "loss": 0.4474, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.4881025015253203, |
| "grad_norm": 0.637508749961853, |
| "learning_rate": 0.00019764623006798555, |
| "loss": 0.4469, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4881025015253203, |
| "eval_loss": 0.572228193283081, |
| "eval_runtime": 1153.4356, |
| "eval_samples_per_second": 3.166, |
| "eval_steps_per_second": 0.528, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5186089078706528, |
| "grad_norm": 0.4896605908870697, |
| "learning_rate": 0.00019681861698991922, |
| "loss": 0.4232, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.5491153142159854, |
| "grad_norm": 0.5206599235534668, |
| "learning_rate": 0.0001958688477202651, |
| "loss": 0.4537, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5796217205613179, |
| "grad_norm": 0.3808891177177429, |
| "learning_rate": 0.00019479812058430883, |
| "loss": 0.4206, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.6101281269066504, |
| "grad_norm": 0.4902428090572357, |
| "learning_rate": 0.00019360778652005416, |
| "loss": 0.4821, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6101281269066504, |
| "eval_loss": 0.547258734703064, |
| "eval_runtime": 1090.047, |
| "eval_samples_per_second": 3.35, |
| "eval_steps_per_second": 0.559, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6406345332519829, |
| "grad_norm": 0.48209357261657715, |
| "learning_rate": 0.00019229934737374232, |
| "loss": 0.4004, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.6711409395973155, |
| "grad_norm": 0.3675863742828369, |
| "learning_rate": 0.00019087445400497042, |
| "loss": 0.405, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.701647345942648, |
| "grad_norm": 0.5796023011207581, |
| "learning_rate": 0.00018933490420379947, |
| "loss": 0.4028, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.7321537522879805, |
| "grad_norm": 0.44113054871559143, |
| "learning_rate": 0.00018768264042248013, |
| "loss": 0.3989, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7321537522879805, |
| "eval_loss": 0.5261130332946777, |
| "eval_runtime": 1094.835, |
| "eval_samples_per_second": 3.336, |
| "eval_steps_per_second": 0.556, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.762660158633313, |
| "grad_norm": 0.4297366440296173, |
| "learning_rate": 0.0001859197473246576, |
| "loss": 0.3941, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.7931665649786455, |
| "grad_norm": 0.48033469915390015, |
| "learning_rate": 0.00018404844915514867, |
| "loss": 0.406, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.823672971323978, |
| "grad_norm": 0.4652460217475891, |
| "learning_rate": 0.00018207110693360868, |
| "loss": 0.3799, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.8541793776693105, |
| "grad_norm": 0.3977755606174469, |
| "learning_rate": 0.00017999021547562943, |
| "loss": 0.3809, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.8541793776693105, |
| "eval_loss": 0.5122238993644714, |
| "eval_runtime": 1006.0944, |
| "eval_samples_per_second": 3.63, |
| "eval_steps_per_second": 0.605, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.884685784014643, |
| "grad_norm": 0.4193669855594635, |
| "learning_rate": 0.00017780840024502693, |
| "loss": 0.3872, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.9151921903599756, |
| "grad_norm": 0.5903205275535583, |
| "learning_rate": 0.00017552841404128947, |
| "loss": 0.3786, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.9456985967053081, |
| "grad_norm": 0.4734324514865875, |
| "learning_rate": 0.0001731531335263669, |
| "loss": 0.3463, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.9762050030506406, |
| "grad_norm": 0.590374231338501, |
| "learning_rate": 0.00017068555559518163, |
| "loss": 0.373, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.9762050030506406, |
| "eval_loss": 0.5041590332984924, |
| "eval_runtime": 1029.9504, |
| "eval_samples_per_second": 3.546, |
| "eval_steps_per_second": 0.591, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.00732153752288, |
| "grad_norm": 0.35001957416534424, |
| "learning_rate": 0.0001681287935944421, |
| "loss": 0.4096, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.0378279438682123, |
| "grad_norm": 0.715282142162323, |
| "learning_rate": 0.00016548607339452853, |
| "loss": 0.362, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.068334350213545, |
| "grad_norm": 0.39849480986595154, |
| "learning_rate": 0.0001627607293194077, |
| "loss": 0.3521, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.0988407565588774, |
| "grad_norm": 0.46415719389915466, |
| "learning_rate": 0.00015995619993971122, |
| "loss": 0.3523, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.0988407565588774, |
| "eval_loss": 0.4963458776473999, |
| "eval_runtime": 1026.062, |
| "eval_samples_per_second": 3.559, |
| "eval_steps_per_second": 0.594, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.1293471629042098, |
| "grad_norm": 0.4831911623477936, |
| "learning_rate": 0.00015707602373428626, |
| "loss": 0.3414, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.1598535692495424, |
| "grad_norm": 0.4268845021724701, |
| "learning_rate": 0.0001541238346256912, |
| "loss": 0.3456, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.1903599755948748, |
| "grad_norm": 0.4288281202316284, |
| "learning_rate": 0.00015110335739527045, |
| "loss": 0.3139, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.2208663819402075, |
| "grad_norm": 0.4492523968219757, |
| "learning_rate": 0.00014801840298359217, |
| "loss": 0.3559, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.2208663819402075, |
| "eval_loss": 0.4898269474506378, |
| "eval_runtime": 1025.9893, |
| "eval_samples_per_second": 3.559, |
| "eval_steps_per_second": 0.594, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.2513727882855399, |
| "grad_norm": 0.4456675052642822, |
| "learning_rate": 0.00014487286368217915, |
| "loss": 0.3524, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.2818791946308725, |
| "grad_norm": 0.4612857401371002, |
| "learning_rate": 0.00014167070822259867, |
| "loss": 0.3376, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.312385600976205, |
| "grad_norm": 0.3742743134498596, |
| "learning_rate": 0.00013841597676910816, |
| "loss": 0.3461, |
| "step": 1075 |
| }, |
| { |
| "epoch": 1.3428920073215376, |
| "grad_norm": 0.4589081406593323, |
| "learning_rate": 0.0001351127758211739, |
| "loss": 0.3294, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.3428920073215376, |
| "eval_loss": 0.4819416105747223, |
| "eval_runtime": 1027.6928, |
| "eval_samples_per_second": 3.554, |
| "eval_steps_per_second": 0.593, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.37339841366687, |
| "grad_norm": 0.4929927587509155, |
| "learning_rate": 0.0001317652730322948, |
| "loss": 0.3518, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.4039048200122026, |
| "grad_norm": 0.4028312861919403, |
| "learning_rate": 0.00012837769195166756, |
| "loss": 0.3156, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.434411226357535, |
| "grad_norm": 0.47802016139030457, |
| "learning_rate": 0.00012495430669532862, |
| "loss": 0.3314, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.4649176327028677, |
| "grad_norm": 0.4614291191101074, |
| "learning_rate": 0.00012149943655349567, |
| "loss": 0.3499, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.4649176327028677, |
| "eval_loss": 0.4747631251811981, |
| "eval_runtime": 1024.8622, |
| "eval_samples_per_second": 3.563, |
| "eval_steps_per_second": 0.594, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.4954240390482, |
| "grad_norm": 0.3750015199184418, |
| "learning_rate": 0.00011801744054091276, |
| "loss": 0.3173, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.5259304453935325, |
| "grad_norm": 0.4086696207523346, |
| "learning_rate": 0.00011451271189707497, |
| "loss": 0.3086, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.5564368517388651, |
| "grad_norm": 0.5094380974769592, |
| "learning_rate": 0.00011098967254327173, |
| "loss": 0.3135, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.5869432580841978, |
| "grad_norm": 0.6511872410774231, |
| "learning_rate": 0.00010745276750344218, |
| "loss": 0.3239, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.5869432580841978, |
| "eval_loss": 0.4694528877735138, |
| "eval_runtime": 1025.1911, |
| "eval_samples_per_second": 3.562, |
| "eval_steps_per_second": 0.594, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.6174496644295302, |
| "grad_norm": 0.40622764825820923, |
| "learning_rate": 0.00010390645929588196, |
| "loss": 0.3124, |
| "step": 1325 |
| }, |
| { |
| "epoch": 1.6479560707748626, |
| "grad_norm": 0.42357689142227173, |
| "learning_rate": 0.0001003552223028772, |
| "loss": 0.2789, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.6784624771201953, |
| "grad_norm": 0.4693623483181, |
| "learning_rate": 9.680353712536995e-05, |
| "loss": 0.2955, |
| "step": 1375 |
| }, |
| { |
| "epoch": 1.7089688834655279, |
| "grad_norm": 0.39055606722831726, |
| "learning_rate": 9.325588492977734e-05, |
| "loss": 0.2731, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.7089688834655279, |
| "eval_loss": 0.47373446822166443, |
| "eval_runtime": 964.0616, |
| "eval_samples_per_second": 3.788, |
| "eval_steps_per_second": 0.632, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.7394752898108603, |
| "grad_norm": 0.35268428921699524, |
| "learning_rate": 8.971674179409714e-05, |
| "loss": 0.2899, |
| "step": 1425 |
| }, |
| { |
| "epoch": 1.7699816961561927, |
| "grad_norm": 0.6586225628852844, |
| "learning_rate": 8.619057306043388e-05, |
| "loss": 0.2802, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.8004881025015254, |
| "grad_norm": 0.43787944316864014, |
| "learning_rate": 8.268182770106981e-05, |
| "loss": 0.2951, |
| "step": 1475 |
| }, |
| { |
| "epoch": 1.830994508846858, |
| "grad_norm": 0.4165858328342438, |
| "learning_rate": 7.91949327051903e-05, |
| "loss": 0.269, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.830994508846858, |
| "eval_loss": 0.47291475534439087, |
| "eval_runtime": 993.1212, |
| "eval_samples_per_second": 3.677, |
| "eval_steps_per_second": 0.613, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.8615009151921904, |
| "grad_norm": 0.38626885414123535, |
| "learning_rate": 7.573428749334481e-05, |
| "loss": 0.2769, |
| "step": 1525 |
| }, |
| { |
| "epoch": 1.8920073215375228, |
| "grad_norm": 0.38758519291877747, |
| "learning_rate": 7.230425836669183e-05, |
| "loss": 0.2807, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.9225137278828552, |
| "grad_norm": 0.387899786233902, |
| "learning_rate": 6.890917299802986e-05, |
| "loss": 0.265, |
| "step": 1575 |
| }, |
| { |
| "epoch": 1.9530201342281879, |
| "grad_norm": 0.37289875745773315, |
| "learning_rate": 6.555331497156672e-05, |
| "loss": 0.2492, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.9530201342281879, |
| "eval_loss": 0.4726342558860779, |
| "eval_runtime": 1054.2196, |
| "eval_samples_per_second": 3.464, |
| "eval_steps_per_second": 0.578, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.9835265405735205, |
| "grad_norm": 0.6593974232673645, |
| "learning_rate": 6.224091837831467e-05, |
| "loss": 0.2924, |
| "step": 1625 |
| }, |
| { |
| "epoch": 2.01464307504576, |
| "grad_norm": 0.4300393760204315, |
| "learning_rate": 5.897616247393181e-05, |
| "loss": 0.2949, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.045149481391092, |
| "grad_norm": 0.5719879865646362, |
| "learning_rate": 5.5763166405748855e-05, |
| "loss": 0.2876, |
| "step": 1675 |
| }, |
| { |
| "epoch": 2.0756558877364246, |
| "grad_norm": 0.37001481652259827, |
| "learning_rate": 5.260598401563508e-05, |
| "loss": 0.286, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.0756558877364246, |
| "eval_loss": 0.4663603603839874, |
| "eval_runtime": 1040.9239, |
| "eval_samples_per_second": 3.508, |
| "eval_steps_per_second": 0.585, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.106162294081757, |
| "grad_norm": 0.291629433631897, |
| "learning_rate": 4.950859872525999e-05, |
| "loss": 0.2641, |
| "step": 1725 |
| }, |
| { |
| "epoch": 2.13666870042709, |
| "grad_norm": 0.5670744776725769, |
| "learning_rate": 4.647491851020414e-05, |
| "loss": 0.287, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.1671751067724223, |
| "grad_norm": 0.43546485900878906, |
| "learning_rate": 4.350877096926107e-05, |
| "loss": 0.3091, |
| "step": 1775 |
| }, |
| { |
| "epoch": 2.1976815131177547, |
| "grad_norm": 0.35784369707107544, |
| "learning_rate": 4.061389849514965e-05, |
| "loss": 0.2824, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.1976815131177547, |
| "eval_loss": 0.4643763601779938, |
| "eval_runtime": 1066.3493, |
| "eval_samples_per_second": 3.425, |
| "eval_steps_per_second": 0.571, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.228187919463087, |
| "grad_norm": 0.46165725588798523, |
| "learning_rate": 3.7793953552732294e-05, |
| "loss": 0.2716, |
| "step": 1825 |
| }, |
| { |
| "epoch": 2.2586943258084196, |
| "grad_norm": 0.4466606378555298, |
| "learning_rate": 3.505249407069414e-05, |
| "loss": 0.2543, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.2892007321537524, |
| "grad_norm": 0.4068796932697296, |
| "learning_rate": 3.239297895249955e-05, |
| "loss": 0.2624, |
| "step": 1875 |
| }, |
| { |
| "epoch": 2.319707138499085, |
| "grad_norm": 0.4468878209590912, |
| "learning_rate": 2.9818763712288354e-05, |
| "loss": 0.2812, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.319707138499085, |
| "eval_loss": 0.46269848942756653, |
| "eval_runtime": 1022.7507, |
| "eval_samples_per_second": 3.571, |
| "eval_steps_per_second": 0.595, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.3502135448444172, |
| "grad_norm": 0.5847700834274292, |
| "learning_rate": 2.733309624121877e-05, |
| "loss": 0.27, |
| "step": 1925 |
| }, |
| { |
| "epoch": 2.3807199511897497, |
| "grad_norm": 0.45481202006340027, |
| "learning_rate": 2.4939112709598324e-05, |
| "loss": 0.2931, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.4112263575350825, |
| "grad_norm": 0.5133854150772095, |
| "learning_rate": 2.2639833609973182e-05, |
| "loss": 0.2639, |
| "step": 1975 |
| }, |
| { |
| "epoch": 2.441732763880415, |
| "grad_norm": 0.4334775507450104, |
| "learning_rate": 2.0438159946168167e-05, |
| "loss": 0.2716, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.441732763880415, |
| "eval_loss": 0.45944076776504517, |
| "eval_runtime": 967.2644, |
| "eval_samples_per_second": 3.776, |
| "eval_steps_per_second": 0.63, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.4722391702257474, |
| "grad_norm": 0.6098644733428955, |
| "learning_rate": 1.833686957308619e-05, |
| "loss": 0.2661, |
| "step": 2025 |
| }, |
| { |
| "epoch": 2.5027455765710798, |
| "grad_norm": 0.6568627953529358, |
| "learning_rate": 1.633861369188431e-05, |
| "loss": 0.277, |
| "step": 2050 |
| }, |
| { |
| "epoch": 2.533251982916412, |
| "grad_norm": 0.9738965034484863, |
| "learning_rate": 1.4445913504949603e-05, |
| "loss": 0.2594, |
| "step": 2075 |
| }, |
| { |
| "epoch": 2.563758389261745, |
| "grad_norm": 0.5284731388092041, |
| "learning_rate": 1.2661157034894267e-05, |
| "loss": 0.2898, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.563758389261745, |
| "eval_loss": 0.45677462220191956, |
| "eval_runtime": 943.8598, |
| "eval_samples_per_second": 3.869, |
| "eval_steps_per_second": 0.645, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.5942647956070775, |
| "grad_norm": 0.4135531187057495, |
| "learning_rate": 1.098659611158399e-05, |
| "loss": 0.2904, |
| "step": 2125 |
| }, |
| { |
| "epoch": 2.62477120195241, |
| "grad_norm": 0.5006974935531616, |
| "learning_rate": 9.424343531000968e-06, |
| "loss": 0.2595, |
| "step": 2150 |
| }, |
| { |
| "epoch": 2.6552776082977427, |
| "grad_norm": 0.36891356110572815, |
| "learning_rate": 7.9763703895259e-06, |
| "loss": 0.2404, |
| "step": 2175 |
| }, |
| { |
| "epoch": 2.685784014643075, |
| "grad_norm": 0.5526717305183411, |
| "learning_rate": 6.644503597003126e-06, |
| "loss": 0.2766, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.685784014643075, |
| "eval_loss": 0.4571227431297302, |
| "eval_runtime": 964.8941, |
| "eval_samples_per_second": 3.785, |
| "eval_steps_per_second": 0.631, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.7162904209884076, |
| "grad_norm": 0.37439122796058655, |
| "learning_rate": 5.430423571725396e-06, |
| "loss": 0.2767, |
| "step": 2225 |
| }, |
| { |
| "epoch": 2.74679682733374, |
| "grad_norm": 0.5347726941108704, |
| "learning_rate": 4.3356621202479855e-06, |
| "loss": 0.2782, |
| "step": 2250 |
| }, |
| { |
| "epoch": 2.7773032336790724, |
| "grad_norm": 0.5072506070137024, |
| "learning_rate": 3.3616005047058197e-06, |
| "loss": 0.2525, |
| "step": 2275 |
| }, |
| { |
| "epoch": 2.8078096400244053, |
| "grad_norm": 0.4093267023563385, |
| "learning_rate": 2.5094677000732205e-06, |
| "loss": 0.2584, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.8078096400244053, |
| "eval_loss": 0.4563479721546173, |
| "eval_runtime": 988.2982, |
| "eval_samples_per_second": 3.695, |
| "eval_steps_per_second": 0.616, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.8383160463697377, |
| "grad_norm": 0.3554539084434509, |
| "learning_rate": 1.7803388435642666e-06, |
| "loss": 0.2956, |
| "step": 2325 |
| }, |
| { |
| "epoch": 2.86882245271507, |
| "grad_norm": 0.3955806791782379, |
| "learning_rate": 1.1751338781305854e-06, |
| "loss": 0.2595, |
| "step": 2350 |
| }, |
| { |
| "epoch": 2.899328859060403, |
| "grad_norm": 0.47958260774612427, |
| "learning_rate": 6.946163917680327e-07, |
| "loss": 0.2693, |
| "step": 2375 |
| }, |
| { |
| "epoch": 2.9298352654057354, |
| "grad_norm": 0.395571768283844, |
| "learning_rate": 3.393926540965264e-07, |
| "loss": 0.2578, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.9298352654057354, |
| "eval_loss": 0.4560001492500305, |
| "eval_runtime": 984.0097, |
| "eval_samples_per_second": 3.711, |
| "eval_steps_per_second": 0.619, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.9603416717510678, |
| "grad_norm": 0.4586999714374542, |
| "learning_rate": 1.0991085142886271e-07, |
| "loss": 0.2722, |
| "step": 2425 |
| }, |
| { |
| "epoch": 2.9908480780964, |
| "grad_norm": 0.45692864060401917, |
| "learning_rate": 6.4605212932611344e-09, |
| "loss": 0.2744, |
| "step": 2450 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 2457, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.869883433513124e+17, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|