{ "best_global_step": 4000, "best_metric": 0.17307986319065094, "best_model_checkpoint": "/data/alamparan/mattext_ckpt/results/2026-02-05/01-13-34/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-4000", "epoch": 3.875968992248062, "eval_steps": 50, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04844961240310078, "grad_norm": 1.166297197341919, "learning_rate": 0.00019981007751937986, "loss": 5.9313, "step": 50 }, { "epoch": 0.04844961240310078, "eval_loss": 4.05467414855957, "eval_runtime": 86.2208, "eval_samples_per_second": 220.411, "eval_steps_per_second": 4.593, "step": 50 }, { "epoch": 0.09689922480620156, "grad_norm": 0.7852576375007629, "learning_rate": 0.00019961627906976747, "loss": 3.8511, "step": 100 }, { "epoch": 0.09689922480620156, "eval_loss": 3.614811658859253, "eval_runtime": 105.419, "eval_samples_per_second": 180.271, "eval_steps_per_second": 3.756, "step": 100 }, { "epoch": 0.14534883720930233, "grad_norm": 0.9356492161750793, "learning_rate": 0.00019942248062015505, "loss": 3.5665, "step": 150 }, { "epoch": 0.14534883720930233, "eval_loss": 3.42499041557312, "eval_runtime": 119.6728, "eval_samples_per_second": 158.8, "eval_steps_per_second": 3.309, "step": 150 }, { "epoch": 0.1937984496124031, "grad_norm": 0.8641079068183899, "learning_rate": 0.00019922868217054265, "loss": 3.4112, "step": 200 }, { "epoch": 0.1937984496124031, "eval_loss": 3.2944159507751465, "eval_runtime": 107.5464, "eval_samples_per_second": 176.705, "eval_steps_per_second": 3.682, "step": 200 }, { "epoch": 0.24224806201550386, "grad_norm": 0.8657866716384888, "learning_rate": 0.00019903488372093023, "loss": 3.2915, "step": 250 }, { "epoch": 0.24224806201550386, "eval_loss": 3.1811130046844482, "eval_runtime": 111.4126, "eval_samples_per_second": 170.573, "eval_steps_per_second": 3.554, "step": 250 }, { "epoch": 0.29069767441860467, "grad_norm": 1.086035132408142, "learning_rate": 0.00019884108527131784, "loss": 3.2011, "step": 300 }, { "epoch": 0.29069767441860467, "eval_loss": 3.0944509506225586, "eval_runtime": 109.0889, "eval_samples_per_second": 174.207, "eval_steps_per_second": 3.63, "step": 300 }, { "epoch": 0.3391472868217054, "grad_norm": 1.14714515209198, "learning_rate": 0.00019864728682170545, "loss": 3.1197, "step": 350 }, { "epoch": 0.3391472868217054, "eval_loss": 3.013719081878662, "eval_runtime": 113.2854, "eval_samples_per_second": 167.753, "eval_steps_per_second": 3.496, "step": 350 }, { "epoch": 0.3875968992248062, "grad_norm": 1.3692086935043335, "learning_rate": 0.00019845348837209303, "loss": 3.0293, "step": 400 }, { "epoch": 0.3875968992248062, "eval_loss": 2.9276790618896484, "eval_runtime": 109.3164, "eval_samples_per_second": 173.844, "eval_steps_per_second": 3.623, "step": 400 }, { "epoch": 0.436046511627907, "grad_norm": 1.0482667684555054, "learning_rate": 0.00019825968992248064, "loss": 2.937, "step": 450 }, { "epoch": 0.436046511627907, "eval_loss": 2.824265718460083, "eval_runtime": 113.3929, "eval_samples_per_second": 167.594, "eval_steps_per_second": 3.492, "step": 450 }, { "epoch": 0.4844961240310077, "grad_norm": 1.4893537759780884, "learning_rate": 0.00019806589147286822, "loss": 2.8401, "step": 500 }, { "epoch": 0.4844961240310077, "eval_loss": 2.681960344314575, "eval_runtime": 112.2413, "eval_samples_per_second": 169.314, "eval_steps_per_second": 3.528, "step": 500 }, { "epoch": 0.5329457364341085, "grad_norm": 1.3668580055236816, "learning_rate": 0.00019787209302325582, "loss": 2.6953, "step": 550 }, { "epoch": 0.5329457364341085, "eval_loss": 2.482776165008545, "eval_runtime": 109.465, "eval_samples_per_second": 173.608, "eval_steps_per_second": 3.618, "step": 550 }, { "epoch": 0.5813953488372093, "grad_norm": 1.7991057634353638, "learning_rate": 0.00019767829457364343, "loss": 2.4792, "step": 600 }, { "epoch": 0.5813953488372093, "eval_loss": 2.0755560398101807, "eval_runtime": 111.9949, "eval_samples_per_second": 169.686, "eval_steps_per_second": 3.536, "step": 600 }, { "epoch": 0.6298449612403101, "grad_norm": 1.6921905279159546, "learning_rate": 0.000197484496124031, "loss": 1.9556, "step": 650 }, { "epoch": 0.6298449612403101, "eval_loss": 1.431490421295166, "eval_runtime": 111.5482, "eval_samples_per_second": 170.366, "eval_steps_per_second": 3.55, "step": 650 }, { "epoch": 0.6782945736434108, "grad_norm": 1.6035951375961304, "learning_rate": 0.00019729069767441862, "loss": 1.4683, "step": 700 }, { "epoch": 0.6782945736434108, "eval_loss": 1.0505075454711914, "eval_runtime": 110.2108, "eval_samples_per_second": 172.433, "eval_steps_per_second": 3.593, "step": 700 }, { "epoch": 0.7267441860465116, "grad_norm": 1.1903802156448364, "learning_rate": 0.0001970968992248062, "loss": 1.1292, "step": 750 }, { "epoch": 0.7267441860465116, "eval_loss": 0.881108820438385, "eval_runtime": 112.8095, "eval_samples_per_second": 168.461, "eval_steps_per_second": 3.51, "step": 750 }, { "epoch": 0.7751937984496124, "grad_norm": 1.0895042419433594, "learning_rate": 0.0001969031007751938, "loss": 0.9955, "step": 800 }, { "epoch": 0.7751937984496124, "eval_loss": 0.7912827730178833, "eval_runtime": 113.3802, "eval_samples_per_second": 167.613, "eval_steps_per_second": 3.493, "step": 800 }, { "epoch": 0.8236434108527132, "grad_norm": 1.1862001419067383, "learning_rate": 0.0001967093023255814, "loss": 0.9121, "step": 850 }, { "epoch": 0.8236434108527132, "eval_loss": 0.7358818650245667, "eval_runtime": 110.3147, "eval_samples_per_second": 172.271, "eval_steps_per_second": 3.59, "step": 850 }, { "epoch": 0.872093023255814, "grad_norm": 0.9009504318237305, "learning_rate": 0.000196515503875969, "loss": 0.8213, "step": 900 }, { "epoch": 0.872093023255814, "eval_loss": 0.673936665058136, "eval_runtime": 110.4965, "eval_samples_per_second": 171.987, "eval_steps_per_second": 3.584, "step": 900 }, { "epoch": 0.9205426356589147, "grad_norm": 0.8950819373130798, "learning_rate": 0.0001963217054263566, "loss": 0.7789, "step": 950 }, { "epoch": 0.9205426356589147, "eval_loss": 0.6290402412414551, "eval_runtime": 126.6737, "eval_samples_per_second": 150.023, "eval_steps_per_second": 3.126, "step": 950 }, { "epoch": 0.9689922480620154, "grad_norm": 0.8964680433273315, "learning_rate": 0.00019612790697674418, "loss": 0.7162, "step": 1000 }, { "epoch": 0.9689922480620154, "eval_loss": 0.6034494042396545, "eval_runtime": 107.8336, "eval_samples_per_second": 176.235, "eval_steps_per_second": 3.672, "step": 1000 }, { "epoch": 1.0174418604651163, "grad_norm": 0.8578282594680786, "learning_rate": 0.0001959341085271318, "loss": 0.6918, "step": 1050 }, { "epoch": 1.0174418604651163, "eval_loss": 0.571689784526825, "eval_runtime": 109.0331, "eval_samples_per_second": 174.296, "eval_steps_per_second": 3.632, "step": 1050 }, { "epoch": 1.0658914728682172, "grad_norm": 0.8138054609298706, "learning_rate": 0.00019574031007751937, "loss": 0.6908, "step": 1100 }, { "epoch": 1.0658914728682172, "eval_loss": 0.5466533899307251, "eval_runtime": 107.3453, "eval_samples_per_second": 177.036, "eval_steps_per_second": 3.689, "step": 1100 }, { "epoch": 1.1143410852713178, "grad_norm": 0.8871294856071472, "learning_rate": 0.00019554651162790698, "loss": 0.649, "step": 1150 }, { "epoch": 1.1143410852713178, "eval_loss": 0.523186206817627, "eval_runtime": 110.6186, "eval_samples_per_second": 171.798, "eval_steps_per_second": 3.58, "step": 1150 }, { "epoch": 1.1627906976744187, "grad_norm": 0.7663435339927673, "learning_rate": 0.0001953527131782946, "loss": 0.5908, "step": 1200 }, { "epoch": 1.1627906976744187, "eval_loss": 0.5085064172744751, "eval_runtime": 108.5509, "eval_samples_per_second": 175.07, "eval_steps_per_second": 3.648, "step": 1200 }, { "epoch": 1.2112403100775193, "grad_norm": 0.7912157773971558, "learning_rate": 0.00019515891472868217, "loss": 0.5644, "step": 1250 }, { "epoch": 1.2112403100775193, "eval_loss": 0.4843023419380188, "eval_runtime": 107.8466, "eval_samples_per_second": 176.213, "eval_steps_per_second": 3.672, "step": 1250 }, { "epoch": 1.2596899224806202, "grad_norm": 0.7282177805900574, "learning_rate": 0.00019496511627906978, "loss": 0.5577, "step": 1300 }, { "epoch": 1.2596899224806202, "eval_loss": 0.46449020504951477, "eval_runtime": 106.8549, "eval_samples_per_second": 177.849, "eval_steps_per_second": 3.706, "step": 1300 }, { "epoch": 1.308139534883721, "grad_norm": 0.6605934500694275, "learning_rate": 0.00019477131782945736, "loss": 0.5335, "step": 1350 }, { "epoch": 1.308139534883721, "eval_loss": 0.4484124481678009, "eval_runtime": 126.0012, "eval_samples_per_second": 150.824, "eval_steps_per_second": 3.143, "step": 1350 }, { "epoch": 1.3565891472868217, "grad_norm": 0.6995412707328796, "learning_rate": 0.00019457751937984496, "loss": 0.5396, "step": 1400 }, { "epoch": 1.3565891472868217, "eval_loss": 0.4377157688140869, "eval_runtime": 118.8732, "eval_samples_per_second": 159.868, "eval_steps_per_second": 3.331, "step": 1400 }, { "epoch": 1.4050387596899225, "grad_norm": 0.6287787556648254, "learning_rate": 0.00019438372093023257, "loss": 0.4988, "step": 1450 }, { "epoch": 1.4050387596899225, "eval_loss": 0.41537100076675415, "eval_runtime": 117.3214, "eval_samples_per_second": 161.982, "eval_steps_per_second": 3.375, "step": 1450 }, { "epoch": 1.4534883720930232, "grad_norm": 0.7416621446609497, "learning_rate": 0.00019418992248062015, "loss": 0.4888, "step": 1500 }, { "epoch": 1.4534883720930232, "eval_loss": 0.4060279130935669, "eval_runtime": 118.3486, "eval_samples_per_second": 160.577, "eval_steps_per_second": 3.346, "step": 1500 }, { "epoch": 1.501937984496124, "grad_norm": 0.70711749792099, "learning_rate": 0.00019399612403100776, "loss": 0.4664, "step": 1550 }, { "epoch": 1.501937984496124, "eval_loss": 0.38495033979415894, "eval_runtime": 116.7883, "eval_samples_per_second": 162.722, "eval_steps_per_second": 3.391, "step": 1550 }, { "epoch": 1.550387596899225, "grad_norm": 0.6704577207565308, "learning_rate": 0.00019380232558139534, "loss": 0.4331, "step": 1600 }, { "epoch": 1.550387596899225, "eval_loss": 0.36785683035850525, "eval_runtime": 134.2231, "eval_samples_per_second": 141.585, "eval_steps_per_second": 2.95, "step": 1600 }, { "epoch": 1.5988372093023255, "grad_norm": 0.7335864305496216, "learning_rate": 0.00019360852713178295, "loss": 0.4298, "step": 1650 }, { "epoch": 1.5988372093023255, "eval_loss": 0.34367096424102783, "eval_runtime": 193.7622, "eval_samples_per_second": 98.079, "eval_steps_per_second": 2.044, "step": 1650 }, { "epoch": 1.6472868217054264, "grad_norm": 0.6677731871604919, "learning_rate": 0.00019341472868217055, "loss": 0.3978, "step": 1700 }, { "epoch": 1.6472868217054264, "eval_loss": 0.32630470395088196, "eval_runtime": 242.3944, "eval_samples_per_second": 78.401, "eval_steps_per_second": 1.634, "step": 1700 }, { "epoch": 1.695736434108527, "grad_norm": 0.8195075392723083, "learning_rate": 0.00019322093023255813, "loss": 0.3778, "step": 1750 }, { "epoch": 1.695736434108527, "eval_loss": 0.31104475259780884, "eval_runtime": 231.8205, "eval_samples_per_second": 81.977, "eval_steps_per_second": 1.708, "step": 1750 }, { "epoch": 1.744186046511628, "grad_norm": 0.735170841217041, "learning_rate": 0.00019302713178294574, "loss": 0.35, "step": 1800 }, { "epoch": 1.744186046511628, "eval_loss": 0.2956381142139435, "eval_runtime": 236.8085, "eval_samples_per_second": 80.25, "eval_steps_per_second": 1.672, "step": 1800 }, { "epoch": 1.7926356589147288, "grad_norm": 0.5745160579681396, "learning_rate": 0.00019283333333333332, "loss": 0.3407, "step": 1850 }, { "epoch": 1.7926356589147288, "eval_loss": 0.2869073152542114, "eval_runtime": 232.6087, "eval_samples_per_second": 81.699, "eval_steps_per_second": 1.702, "step": 1850 }, { "epoch": 1.8410852713178296, "grad_norm": 0.6333633661270142, "learning_rate": 0.00019263953488372093, "loss": 0.3317, "step": 1900 }, { "epoch": 1.8410852713178296, "eval_loss": 0.2736239731311798, "eval_runtime": 235.3635, "eval_samples_per_second": 80.743, "eval_steps_per_second": 1.683, "step": 1900 }, { "epoch": 1.8895348837209303, "grad_norm": 0.6406286358833313, "learning_rate": 0.00019244573643410854, "loss": 0.3247, "step": 1950 }, { "epoch": 1.8895348837209303, "eval_loss": 0.26805564761161804, "eval_runtime": 235.5356, "eval_samples_per_second": 80.684, "eval_steps_per_second": 1.681, "step": 1950 }, { "epoch": 1.937984496124031, "grad_norm": 0.4822098910808563, "learning_rate": 0.00019225193798449612, "loss": 0.3057, "step": 2000 }, { "epoch": 1.937984496124031, "eval_loss": 0.26311546564102173, "eval_runtime": 235.1083, "eval_samples_per_second": 80.831, "eval_steps_per_second": 1.684, "step": 2000 }, { "epoch": 1.9864341085271318, "grad_norm": 0.6586690545082092, "learning_rate": 0.00019205813953488375, "loss": 0.2954, "step": 2050 }, { "epoch": 1.9864341085271318, "eval_loss": 0.2547176778316498, "eval_runtime": 235.1319, "eval_samples_per_second": 80.823, "eval_steps_per_second": 1.684, "step": 2050 }, { "epoch": 2.0348837209302326, "grad_norm": 0.648765504360199, "learning_rate": 0.00019186434108527133, "loss": 0.3008, "step": 2100 }, { "epoch": 2.0348837209302326, "eval_loss": 0.2547691762447357, "eval_runtime": 227.135, "eval_samples_per_second": 83.668, "eval_steps_per_second": 1.743, "step": 2100 }, { "epoch": 2.0833333333333335, "grad_norm": 0.6317530870437622, "learning_rate": 0.00019167054263565894, "loss": 0.2911, "step": 2150 }, { "epoch": 2.0833333333333335, "eval_loss": 0.24894124269485474, "eval_runtime": 225.2054, "eval_samples_per_second": 84.385, "eval_steps_per_second": 1.758, "step": 2150 }, { "epoch": 2.1317829457364343, "grad_norm": 0.5900191068649292, "learning_rate": 0.00019147674418604652, "loss": 0.2873, "step": 2200 }, { "epoch": 2.1317829457364343, "eval_loss": 0.24607273936271667, "eval_runtime": 231.2435, "eval_samples_per_second": 82.182, "eval_steps_per_second": 1.712, "step": 2200 }, { "epoch": 2.1802325581395348, "grad_norm": 0.5394904017448425, "learning_rate": 0.00019128294573643413, "loss": 0.2716, "step": 2250 }, { "epoch": 2.1802325581395348, "eval_loss": 0.2443438619375229, "eval_runtime": 225.1673, "eval_samples_per_second": 84.399, "eval_steps_per_second": 1.759, "step": 2250 }, { "epoch": 2.2286821705426356, "grad_norm": 0.5458412766456604, "learning_rate": 0.00019108914728682174, "loss": 0.2727, "step": 2300 }, { "epoch": 2.2286821705426356, "eval_loss": 0.23843063414096832, "eval_runtime": 233.0169, "eval_samples_per_second": 81.556, "eval_steps_per_second": 1.699, "step": 2300 }, { "epoch": 2.2771317829457365, "grad_norm": 0.6243239641189575, "learning_rate": 0.00019089534883720932, "loss": 0.2741, "step": 2350 }, { "epoch": 2.2771317829457365, "eval_loss": 0.23107607662677765, "eval_runtime": 223.8881, "eval_samples_per_second": 84.882, "eval_steps_per_second": 1.769, "step": 2350 }, { "epoch": 2.3255813953488373, "grad_norm": 0.6094734072685242, "learning_rate": 0.00019070155038759692, "loss": 0.2719, "step": 2400 }, { "epoch": 2.3255813953488373, "eval_loss": 0.23176899552345276, "eval_runtime": 219.8841, "eval_samples_per_second": 86.427, "eval_steps_per_second": 1.801, "step": 2400 }, { "epoch": 2.374031007751938, "grad_norm": 0.6149279475212097, "learning_rate": 0.0001905077519379845, "loss": 0.2641, "step": 2450 }, { "epoch": 2.374031007751938, "eval_loss": 0.22322338819503784, "eval_runtime": 224.8759, "eval_samples_per_second": 84.509, "eval_steps_per_second": 1.761, "step": 2450 }, { "epoch": 2.4224806201550386, "grad_norm": 0.5470075607299805, "learning_rate": 0.0001903139534883721, "loss": 0.2558, "step": 2500 }, { "epoch": 2.4224806201550386, "eval_loss": 0.22380074858665466, "eval_runtime": 220.6655, "eval_samples_per_second": 86.121, "eval_steps_per_second": 1.795, "step": 2500 }, { "epoch": 2.4709302325581395, "grad_norm": 0.6210835576057434, "learning_rate": 0.00019012015503875972, "loss": 0.2544, "step": 2550 }, { "epoch": 2.4709302325581395, "eval_loss": 0.22070536017417908, "eval_runtime": 223.1476, "eval_samples_per_second": 85.163, "eval_steps_per_second": 1.775, "step": 2550 }, { "epoch": 2.5193798449612403, "grad_norm": 0.43341922760009766, "learning_rate": 0.0001899263565891473, "loss": 0.2496, "step": 2600 }, { "epoch": 2.5193798449612403, "eval_loss": 0.2163931280374527, "eval_runtime": 232.0825, "eval_samples_per_second": 81.885, "eval_steps_per_second": 1.706, "step": 2600 }, { "epoch": 2.567829457364341, "grad_norm": 0.4871957302093506, "learning_rate": 0.0001897325581395349, "loss": 0.2506, "step": 2650 }, { "epoch": 2.567829457364341, "eval_loss": 0.21461744606494904, "eval_runtime": 219.2336, "eval_samples_per_second": 86.684, "eval_steps_per_second": 1.806, "step": 2650 }, { "epoch": 2.616279069767442, "grad_norm": 0.5310043692588806, "learning_rate": 0.0001895387596899225, "loss": 0.247, "step": 2700 }, { "epoch": 2.616279069767442, "eval_loss": 0.21458372473716736, "eval_runtime": 224.6247, "eval_samples_per_second": 84.603, "eval_steps_per_second": 1.763, "step": 2700 }, { "epoch": 2.6647286821705425, "grad_norm": 0.6074191927909851, "learning_rate": 0.0001893449612403101, "loss": 0.2455, "step": 2750 }, { "epoch": 2.6647286821705425, "eval_loss": 0.21306496858596802, "eval_runtime": 223.479, "eval_samples_per_second": 85.037, "eval_steps_per_second": 1.772, "step": 2750 }, { "epoch": 2.7131782945736433, "grad_norm": 0.5458905100822449, "learning_rate": 0.00018915116279069768, "loss": 0.243, "step": 2800 }, { "epoch": 2.7131782945736433, "eval_loss": 0.2097616195678711, "eval_runtime": 220.5231, "eval_samples_per_second": 86.177, "eval_steps_per_second": 1.796, "step": 2800 }, { "epoch": 2.761627906976744, "grad_norm": 0.4153260588645935, "learning_rate": 0.00018895736434108528, "loss": 0.2356, "step": 2850 }, { "epoch": 2.761627906976744, "eval_loss": 0.20619072020053864, "eval_runtime": 214.0517, "eval_samples_per_second": 88.782, "eval_steps_per_second": 1.85, "step": 2850 }, { "epoch": 2.810077519379845, "grad_norm": 0.4979017674922943, "learning_rate": 0.0001887635658914729, "loss": 0.2361, "step": 2900 }, { "epoch": 2.810077519379845, "eval_loss": 0.20226989686489105, "eval_runtime": 216.8896, "eval_samples_per_second": 87.621, "eval_steps_per_second": 1.826, "step": 2900 }, { "epoch": 2.858527131782946, "grad_norm": 0.5476083755493164, "learning_rate": 0.00018856976744186047, "loss": 0.2347, "step": 2950 }, { "epoch": 2.858527131782946, "eval_loss": 0.2028292417526245, "eval_runtime": 214.8481, "eval_samples_per_second": 88.453, "eval_steps_per_second": 1.843, "step": 2950 }, { "epoch": 2.9069767441860463, "grad_norm": 0.5912747979164124, "learning_rate": 0.00018837596899224808, "loss": 0.2367, "step": 3000 }, { "epoch": 2.9069767441860463, "eval_loss": 0.1990610510110855, "eval_runtime": 219.8884, "eval_samples_per_second": 86.426, "eval_steps_per_second": 1.801, "step": 3000 }, { "epoch": 2.955426356589147, "grad_norm": 0.4696279466152191, "learning_rate": 0.00018818217054263566, "loss": 0.2292, "step": 3050 }, { "epoch": 2.955426356589147, "eval_loss": 0.20148637890815735, "eval_runtime": 225.0994, "eval_samples_per_second": 84.425, "eval_steps_per_second": 1.759, "step": 3050 }, { "epoch": 3.003875968992248, "grad_norm": 0.47198325395584106, "learning_rate": 0.00018798837209302327, "loss": 0.2286, "step": 3100 }, { "epoch": 3.003875968992248, "eval_loss": 0.1965794712305069, "eval_runtime": 216.7977, "eval_samples_per_second": 87.658, "eval_steps_per_second": 1.827, "step": 3100 }, { "epoch": 3.052325581395349, "grad_norm": 0.49365678429603577, "learning_rate": 0.00018779457364341087, "loss": 0.2239, "step": 3150 }, { "epoch": 3.052325581395349, "eval_loss": 0.1976451575756073, "eval_runtime": 215.2246, "eval_samples_per_second": 88.298, "eval_steps_per_second": 1.84, "step": 3150 }, { "epoch": 3.10077519379845, "grad_norm": 0.5356510281562805, "learning_rate": 0.00018760077519379845, "loss": 0.224, "step": 3200 }, { "epoch": 3.10077519379845, "eval_loss": 0.1925686150789261, "eval_runtime": 218.8392, "eval_samples_per_second": 86.84, "eval_steps_per_second": 1.81, "step": 3200 }, { "epoch": 3.14922480620155, "grad_norm": 0.46862003207206726, "learning_rate": 0.00018740697674418606, "loss": 0.212, "step": 3250 }, { "epoch": 3.14922480620155, "eval_loss": 0.19242693483829498, "eval_runtime": 216.1261, "eval_samples_per_second": 87.93, "eval_steps_per_second": 1.832, "step": 3250 }, { "epoch": 3.197674418604651, "grad_norm": 0.40870046615600586, "learning_rate": 0.00018721317829457364, "loss": 0.2188, "step": 3300 }, { "epoch": 3.197674418604651, "eval_loss": 0.18868175148963928, "eval_runtime": 214.9811, "eval_samples_per_second": 88.398, "eval_steps_per_second": 1.842, "step": 3300 }, { "epoch": 3.246124031007752, "grad_norm": 0.5424318909645081, "learning_rate": 0.00018701937984496125, "loss": 0.2157, "step": 3350 }, { "epoch": 3.246124031007752, "eval_loss": 0.1919330656528473, "eval_runtime": 217.7522, "eval_samples_per_second": 87.274, "eval_steps_per_second": 1.819, "step": 3350 }, { "epoch": 3.294573643410853, "grad_norm": 0.41225236654281616, "learning_rate": 0.00018682558139534886, "loss": 0.2156, "step": 3400 }, { "epoch": 3.294573643410853, "eval_loss": 0.18903516232967377, "eval_runtime": 221.615, "eval_samples_per_second": 85.752, "eval_steps_per_second": 1.787, "step": 3400 }, { "epoch": 3.3430232558139537, "grad_norm": 0.4955701231956482, "learning_rate": 0.00018663178294573644, "loss": 0.2087, "step": 3450 }, { "epoch": 3.3430232558139537, "eval_loss": 0.18453241884708405, "eval_runtime": 217.9492, "eval_samples_per_second": 87.195, "eval_steps_per_second": 1.817, "step": 3450 }, { "epoch": 3.391472868217054, "grad_norm": 0.4107048809528351, "learning_rate": 0.00018643798449612405, "loss": 0.2128, "step": 3500 }, { "epoch": 3.391472868217054, "eval_loss": 0.18335753679275513, "eval_runtime": 222.918, "eval_samples_per_second": 85.251, "eval_steps_per_second": 1.776, "step": 3500 }, { "epoch": 3.439922480620155, "grad_norm": 0.44138869643211365, "learning_rate": 0.00018624418604651163, "loss": 0.2069, "step": 3550 }, { "epoch": 3.439922480620155, "eval_loss": 0.18529056012630463, "eval_runtime": 226.8247, "eval_samples_per_second": 83.783, "eval_steps_per_second": 1.746, "step": 3550 }, { "epoch": 3.488372093023256, "grad_norm": 0.41599947214126587, "learning_rate": 0.00018605038759689923, "loss": 0.2069, "step": 3600 }, { "epoch": 3.488372093023256, "eval_loss": 0.18560050427913666, "eval_runtime": 237.1428, "eval_samples_per_second": 80.137, "eval_steps_per_second": 1.67, "step": 3600 }, { "epoch": 3.5368217054263567, "grad_norm": 0.39731481671333313, "learning_rate": 0.00018585658914728684, "loss": 0.2074, "step": 3650 }, { "epoch": 3.5368217054263567, "eval_loss": 0.1789853423833847, "eval_runtime": 220.4447, "eval_samples_per_second": 86.208, "eval_steps_per_second": 1.796, "step": 3650 }, { "epoch": 3.5852713178294575, "grad_norm": 0.46680784225463867, "learning_rate": 0.00018566279069767442, "loss": 0.2042, "step": 3700 }, { "epoch": 3.5852713178294575, "eval_loss": 0.1809505820274353, "eval_runtime": 218.2028, "eval_samples_per_second": 87.093, "eval_steps_per_second": 1.815, "step": 3700 }, { "epoch": 3.633720930232558, "grad_norm": 0.552648663520813, "learning_rate": 0.00018546899224806203, "loss": 0.2089, "step": 3750 }, { "epoch": 3.633720930232558, "eval_loss": 0.17837607860565186, "eval_runtime": 221.244, "eval_samples_per_second": 85.896, "eval_steps_per_second": 1.79, "step": 3750 }, { "epoch": 3.682170542635659, "grad_norm": 0.5326802134513855, "learning_rate": 0.0001852751937984496, "loss": 0.1985, "step": 3800 }, { "epoch": 3.682170542635659, "eval_loss": 0.17835576832294464, "eval_runtime": 227.413, "eval_samples_per_second": 83.566, "eval_steps_per_second": 1.741, "step": 3800 }, { "epoch": 3.7306201550387597, "grad_norm": 0.5640744566917419, "learning_rate": 0.00018508139534883722, "loss": 0.201, "step": 3850 }, { "epoch": 3.7306201550387597, "eval_loss": 0.17722435295581818, "eval_runtime": 220.9949, "eval_samples_per_second": 85.993, "eval_steps_per_second": 1.792, "step": 3850 }, { "epoch": 3.7790697674418605, "grad_norm": 0.41974276304244995, "learning_rate": 0.00018488759689922482, "loss": 0.1945, "step": 3900 }, { "epoch": 3.7790697674418605, "eval_loss": 0.17568857967853546, "eval_runtime": 227.7226, "eval_samples_per_second": 83.452, "eval_steps_per_second": 1.739, "step": 3900 }, { "epoch": 3.8275193798449614, "grad_norm": 0.4548977315425873, "learning_rate": 0.0001846937984496124, "loss": 0.2035, "step": 3950 }, { "epoch": 3.8275193798449614, "eval_loss": 0.17479188740253448, "eval_runtime": 225.9318, "eval_samples_per_second": 84.114, "eval_steps_per_second": 1.753, "step": 3950 }, { "epoch": 3.875968992248062, "grad_norm": 0.44886454939842224, "learning_rate": 0.0001845, "loss": 0.1965, "step": 4000 }, { "epoch": 3.875968992248062, "eval_loss": 0.17307986319065094, "eval_runtime": 221.697, "eval_samples_per_second": 85.721, "eval_steps_per_second": 1.786, "step": 4000 } ], "logging_steps": 50, "max_steps": 51600, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.034795399365018e+16, "train_batch_size": 96, "trial_name": null, "trial_params": null }