Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 66238, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 3.019414837404511e-05, | |
| "grad_norm": 5.361182689666748, | |
| "learning_rate": 0.0, | |
| "loss": 4.5864, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.015097074187022556, | |
| "grad_norm": 4.397288799285889, | |
| "learning_rate": 4.5202004951989857e-07, | |
| "loss": 4.4178, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.015097074187022556, | |
| "eval_cosine_accuracy": 0.9432226419448853, | |
| "eval_loss": 1.708533525466919, | |
| "eval_runtime": 24.4327, | |
| "eval_samples_per_second": 387.104, | |
| "eval_steps_per_second": 0.778, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03019414837404511, | |
| "grad_norm": 3.148864984512329, | |
| "learning_rate": 9.049459508424422e-07, | |
| "loss": 4.1295, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.03019414837404511, | |
| "eval_cosine_accuracy": 0.9505180716514587, | |
| "eval_loss": 1.5775120258331299, | |
| "eval_runtime": 24.5488, | |
| "eval_samples_per_second": 385.274, | |
| "eval_steps_per_second": 0.774, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.045291222561067664, | |
| "grad_norm": 2.853407382965088, | |
| "learning_rate": 1.3578718521649858e-06, | |
| "loss": 3.9183, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.045291222561067664, | |
| "eval_cosine_accuracy": 0.9493550658226013, | |
| "eval_loss": 1.5401873588562012, | |
| "eval_runtime": 24.6069, | |
| "eval_samples_per_second": 384.364, | |
| "eval_steps_per_second": 0.772, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.06038829674809022, | |
| "grad_norm": 2.850583791732788, | |
| "learning_rate": 1.8107977534875296e-06, | |
| "loss": 3.7909, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.06038829674809022, | |
| "eval_cosine_accuracy": 0.9497779607772827, | |
| "eval_loss": 1.5130109786987305, | |
| "eval_runtime": 24.9452, | |
| "eval_samples_per_second": 379.151, | |
| "eval_steps_per_second": 0.762, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.07548537093511277, | |
| "grad_norm": 2.7371764183044434, | |
| "learning_rate": 2.263723654810073e-06, | |
| "loss": 3.6971, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.07548537093511277, | |
| "eval_cosine_accuracy": 0.9504123330116272, | |
| "eval_loss": 1.4736826419830322, | |
| "eval_runtime": 24.9145, | |
| "eval_samples_per_second": 379.618, | |
| "eval_steps_per_second": 0.763, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.09058244512213533, | |
| "grad_norm": 2.864781379699707, | |
| "learning_rate": 2.7166495561326167e-06, | |
| "loss": 3.6265, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.09058244512213533, | |
| "eval_cosine_accuracy": 0.9510467052459717, | |
| "eval_loss": 1.441090703010559, | |
| "eval_runtime": 25.0444, | |
| "eval_samples_per_second": 377.65, | |
| "eval_steps_per_second": 0.759, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.10567951930915788, | |
| "grad_norm": 2.8439295291900635, | |
| "learning_rate": 3.1695754574551603e-06, | |
| "loss": 3.5424, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.10567951930915788, | |
| "eval_cosine_accuracy": 0.952421247959137, | |
| "eval_loss": 1.419554591178894, | |
| "eval_runtime": 24.765, | |
| "eval_samples_per_second": 381.91, | |
| "eval_steps_per_second": 0.767, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.12077659349618045, | |
| "grad_norm": 2.7819902896881104, | |
| "learning_rate": 3.6225013587777042e-06, | |
| "loss": 3.4728, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.12077659349618045, | |
| "eval_cosine_accuracy": 0.9529498815536499, | |
| "eval_loss": 1.405138373374939, | |
| "eval_runtime": 24.8221, | |
| "eval_samples_per_second": 381.031, | |
| "eval_steps_per_second": 0.765, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.135873667683203, | |
| "grad_norm": 3.0670077800750732, | |
| "learning_rate": 4.075427260100247e-06, | |
| "loss": 3.4177, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.135873667683203, | |
| "eval_cosine_accuracy": 0.9542186260223389, | |
| "eval_loss": 1.3949991464614868, | |
| "eval_runtime": 24.7597, | |
| "eval_samples_per_second": 381.991, | |
| "eval_steps_per_second": 0.767, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.15097074187022555, | |
| "grad_norm": 3.037975549697876, | |
| "learning_rate": 4.528353161422791e-06, | |
| "loss": 3.3723, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.15097074187022555, | |
| "eval_cosine_accuracy": 0.954430103302002, | |
| "eval_loss": 1.400901436805725, | |
| "eval_runtime": 24.8332, | |
| "eval_samples_per_second": 380.861, | |
| "eval_steps_per_second": 0.765, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1660678160572481, | |
| "grad_norm": 2.9103686809539795, | |
| "learning_rate": 4.9812790627453345e-06, | |
| "loss": 3.314, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.1660678160572481, | |
| "eval_cosine_accuracy": 0.9536899924278259, | |
| "eval_loss": 1.376301884651184, | |
| "eval_runtime": 24.9503, | |
| "eval_samples_per_second": 379.074, | |
| "eval_steps_per_second": 0.762, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.18116489024427065, | |
| "grad_norm": 2.823807716369629, | |
| "learning_rate": 5.4332991122652334e-06, | |
| "loss": 3.2727, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.18116489024427065, | |
| "eval_cosine_accuracy": 0.9555931687355042, | |
| "eval_loss": 1.3673756122589111, | |
| "eval_runtime": 24.7308, | |
| "eval_samples_per_second": 382.438, | |
| "eval_steps_per_second": 0.768, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.1962619644312932, | |
| "grad_norm": 2.8840878009796143, | |
| "learning_rate": 5.886225013587777e-06, | |
| "loss": 3.2443, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.1962619644312932, | |
| "eval_cosine_accuracy": 0.9550644755363464, | |
| "eval_loss": 1.3680225610733032, | |
| "eval_runtime": 24.7458, | |
| "eval_samples_per_second": 382.207, | |
| "eval_steps_per_second": 0.768, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.21135903861831576, | |
| "grad_norm": 2.826021194458008, | |
| "learning_rate": 6.3391509149103205e-06, | |
| "loss": 3.2021, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.21135903861831576, | |
| "eval_cosine_accuracy": 0.9558045864105225, | |
| "eval_loss": 1.3505871295928955, | |
| "eval_runtime": 24.9126, | |
| "eval_samples_per_second": 379.648, | |
| "eval_steps_per_second": 0.763, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.22645611280533834, | |
| "grad_norm": 2.8310251235961914, | |
| "learning_rate": 6.792076816232864e-06, | |
| "loss": 3.1722, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.22645611280533834, | |
| "eval_cosine_accuracy": 0.9562275409698486, | |
| "eval_loss": 1.3529975414276123, | |
| "eval_runtime": 25.1169, | |
| "eval_samples_per_second": 376.559, | |
| "eval_steps_per_second": 0.756, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.2415531869923609, | |
| "grad_norm": 2.9824509620666504, | |
| "learning_rate": 7.2450027175554085e-06, | |
| "loss": 3.141, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.2415531869923609, | |
| "eval_cosine_accuracy": 0.9558045864105225, | |
| "eval_loss": 1.360655665397644, | |
| "eval_runtime": 24.8793, | |
| "eval_samples_per_second": 380.155, | |
| "eval_steps_per_second": 0.764, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.25665026117938344, | |
| "grad_norm": 2.9972078800201416, | |
| "learning_rate": 7.697928618877951e-06, | |
| "loss": 3.1142, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.25665026117938344, | |
| "eval_cosine_accuracy": 0.9562275409698486, | |
| "eval_loss": 1.3441089391708374, | |
| "eval_runtime": 24.8523, | |
| "eval_samples_per_second": 380.568, | |
| "eval_steps_per_second": 0.765, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.271747335366406, | |
| "grad_norm": 2.882063865661621, | |
| "learning_rate": 8.14994866839785e-06, | |
| "loss": 3.0868, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.271747335366406, | |
| "eval_cosine_accuracy": 0.9574962854385376, | |
| "eval_loss": 1.3387176990509033, | |
| "eval_runtime": 24.9253, | |
| "eval_samples_per_second": 379.453, | |
| "eval_steps_per_second": 0.762, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.28684440955342855, | |
| "grad_norm": 3.041818857192993, | |
| "learning_rate": 8.602874569720394e-06, | |
| "loss": 3.0581, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.28684440955342855, | |
| "eval_cosine_accuracy": 0.9586593508720398, | |
| "eval_loss": 1.318073034286499, | |
| "eval_runtime": 24.7817, | |
| "eval_samples_per_second": 381.652, | |
| "eval_steps_per_second": 0.767, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.3019414837404511, | |
| "grad_norm": 3.2543036937713623, | |
| "learning_rate": 9.055800471042937e-06, | |
| "loss": 3.0356, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.3019414837404511, | |
| "eval_cosine_accuracy": 0.9595052003860474, | |
| "eval_loss": 1.313987135887146, | |
| "eval_runtime": 24.8717, | |
| "eval_samples_per_second": 380.272, | |
| "eval_steps_per_second": 0.764, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.31703855792747365, | |
| "grad_norm": 3.0757057666778564, | |
| "learning_rate": 9.50872637236548e-06, | |
| "loss": 3.0144, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.31703855792747365, | |
| "eval_cosine_accuracy": 0.9592937231063843, | |
| "eval_loss": 1.3049250841140747, | |
| "eval_runtime": 45.4301, | |
| "eval_samples_per_second": 208.188, | |
| "eval_steps_per_second": 0.418, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.3321356321144962, | |
| "grad_norm": 3.0446619987487793, | |
| "learning_rate": 9.961652273688024e-06, | |
| "loss": 2.9902, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.3321356321144962, | |
| "eval_cosine_accuracy": 0.9595052003860474, | |
| "eval_loss": 1.308658480644226, | |
| "eval_runtime": 25.5656, | |
| "eval_samples_per_second": 369.95, | |
| "eval_steps_per_second": 0.743, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.34723270630151876, | |
| "grad_norm": 2.9077670574188232, | |
| "learning_rate": 1.041457817501057e-05, | |
| "loss": 2.9743, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.34723270630151876, | |
| "eval_cosine_accuracy": 0.9600338339805603, | |
| "eval_loss": 1.3093079328536987, | |
| "eval_runtime": 25.135, | |
| "eval_samples_per_second": 376.287, | |
| "eval_steps_per_second": 0.756, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.3623297804885413, | |
| "grad_norm": 3.023320198059082, | |
| "learning_rate": 1.0867504076333111e-05, | |
| "loss": 2.9547, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.3623297804885413, | |
| "eval_cosine_accuracy": 0.9597166180610657, | |
| "eval_loss": 1.2953948974609375, | |
| "eval_runtime": 25.1058, | |
| "eval_samples_per_second": 376.726, | |
| "eval_steps_per_second": 0.757, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.37742685467556386, | |
| "grad_norm": 3.1133012771606445, | |
| "learning_rate": 1.1320429977655657e-05, | |
| "loss": 2.9464, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.37742685467556386, | |
| "eval_cosine_accuracy": 0.9593994617462158, | |
| "eval_loss": 1.2992297410964966, | |
| "eval_runtime": 24.7741, | |
| "eval_samples_per_second": 381.77, | |
| "eval_steps_per_second": 0.767, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.3925239288625864, | |
| "grad_norm": 2.912932872772217, | |
| "learning_rate": 1.1772450027175554e-05, | |
| "loss": 2.9343, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.3925239288625864, | |
| "eval_cosine_accuracy": 0.9616197943687439, | |
| "eval_loss": 1.283966302871704, | |
| "eval_runtime": 24.707, | |
| "eval_samples_per_second": 382.807, | |
| "eval_steps_per_second": 0.769, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.40762100304960897, | |
| "grad_norm": 3.116110324859619, | |
| "learning_rate": 1.22253759284981e-05, | |
| "loss": 2.9116, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.40762100304960897, | |
| "eval_cosine_accuracy": 0.9605624675750732, | |
| "eval_loss": 1.2977007627487183, | |
| "eval_runtime": 24.8066, | |
| "eval_samples_per_second": 381.269, | |
| "eval_steps_per_second": 0.766, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.4227180772366315, | |
| "grad_norm": 2.9808337688446045, | |
| "learning_rate": 1.2678301829820641e-05, | |
| "loss": 2.8881, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.4227180772366315, | |
| "eval_cosine_accuracy": 0.9615140557289124, | |
| "eval_loss": 1.2986812591552734, | |
| "eval_runtime": 24.7178, | |
| "eval_samples_per_second": 382.639, | |
| "eval_steps_per_second": 0.769, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.43781515142365407, | |
| "grad_norm": 2.966411828994751, | |
| "learning_rate": 1.3131227731143186e-05, | |
| "loss": 2.8767, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.43781515142365407, | |
| "eval_cosine_accuracy": 0.9616197943687439, | |
| "eval_loss": 1.2957323789596558, | |
| "eval_runtime": 24.7859, | |
| "eval_samples_per_second": 381.588, | |
| "eval_steps_per_second": 0.767, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.4529122256106767, | |
| "grad_norm": 2.91023588180542, | |
| "learning_rate": 1.3583247780663084e-05, | |
| "loss": 2.8615, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.4529122256106767, | |
| "eval_cosine_accuracy": 0.9615140557289124, | |
| "eval_loss": 1.3000255823135376, | |
| "eval_runtime": 24.7801, | |
| "eval_samples_per_second": 381.677, | |
| "eval_steps_per_second": 0.767, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.46800929979769923, | |
| "grad_norm": 3.1636173725128174, | |
| "learning_rate": 1.4036173681985627e-05, | |
| "loss": 2.8495, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.46800929979769923, | |
| "eval_cosine_accuracy": 0.9614083170890808, | |
| "eval_loss": 1.3052477836608887, | |
| "eval_runtime": 24.727, | |
| "eval_samples_per_second": 382.497, | |
| "eval_steps_per_second": 0.768, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.4831063739847218, | |
| "grad_norm": 2.824103355407715, | |
| "learning_rate": 1.448909958330817e-05, | |
| "loss": 2.8396, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.4831063739847218, | |
| "eval_cosine_accuracy": 0.963311493396759, | |
| "eval_loss": 1.3045930862426758, | |
| "eval_runtime": 24.7919, | |
| "eval_samples_per_second": 381.495, | |
| "eval_steps_per_second": 0.766, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.49820344817174433, | |
| "grad_norm": 2.780132532119751, | |
| "learning_rate": 1.4942025484630714e-05, | |
| "loss": 2.8268, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.49820344817174433, | |
| "eval_cosine_accuracy": 0.961831271648407, | |
| "eval_loss": 1.3152496814727783, | |
| "eval_runtime": 24.7154, | |
| "eval_samples_per_second": 382.676, | |
| "eval_steps_per_second": 0.769, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.5133005223587669, | |
| "grad_norm": 3.0766918659210205, | |
| "learning_rate": 1.5394045534150613e-05, | |
| "loss": 2.8204, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.5133005223587669, | |
| "eval_cosine_accuracy": 0.9627828001976013, | |
| "eval_loss": 1.3005013465881348, | |
| "eval_runtime": 24.862, | |
| "eval_samples_per_second": 380.42, | |
| "eval_steps_per_second": 0.764, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.5283975965457894, | |
| "grad_norm": 3.02154541015625, | |
| "learning_rate": 1.5846971435473157e-05, | |
| "loss": 2.8037, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.5283975965457894, | |
| "eval_cosine_accuracy": 0.9629942774772644, | |
| "eval_loss": 1.298316240310669, | |
| "eval_runtime": 24.7624, | |
| "eval_samples_per_second": 381.95, | |
| "eval_steps_per_second": 0.767, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.543494670732812, | |
| "grad_norm": 2.928821086883545, | |
| "learning_rate": 1.62998973367957e-05, | |
| "loss": 2.7931, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.543494670732812, | |
| "eval_cosine_accuracy": 0.9620426893234253, | |
| "eval_loss": 1.3049898147583008, | |
| "eval_runtime": 24.7029, | |
| "eval_samples_per_second": 382.87, | |
| "eval_steps_per_second": 0.769, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.5585917449198345, | |
| "grad_norm": 2.72847843170166, | |
| "learning_rate": 1.6752823238118247e-05, | |
| "loss": 2.7864, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.5585917449198345, | |
| "eval_cosine_accuracy": 0.962571382522583, | |
| "eval_loss": 1.3096941709518433, | |
| "eval_runtime": 24.7148, | |
| "eval_samples_per_second": 382.686, | |
| "eval_steps_per_second": 0.769, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.5736888191068571, | |
| "grad_norm": 2.908419609069824, | |
| "learning_rate": 1.7204843287638144e-05, | |
| "loss": 2.7714, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.5736888191068571, | |
| "eval_cosine_accuracy": 0.9614083170890808, | |
| "eval_loss": 1.3053295612335205, | |
| "eval_runtime": 24.766, | |
| "eval_samples_per_second": 381.895, | |
| "eval_steps_per_second": 0.767, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.5887858932938796, | |
| "grad_norm": 3.065990686416626, | |
| "learning_rate": 1.7657769188960685e-05, | |
| "loss": 2.7555, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.5887858932938796, | |
| "eval_cosine_accuracy": 0.9626771211624146, | |
| "eval_loss": 1.2939248085021973, | |
| "eval_runtime": 24.9445, | |
| "eval_samples_per_second": 379.162, | |
| "eval_steps_per_second": 0.762, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.6038829674809022, | |
| "grad_norm": 2.832427501678467, | |
| "learning_rate": 1.8110695090283228e-05, | |
| "loss": 2.7539, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.6038829674809022, | |
| "eval_cosine_accuracy": 0.9627828001976013, | |
| "eval_loss": 1.2884334325790405, | |
| "eval_runtime": 25.3776, | |
| "eval_samples_per_second": 372.691, | |
| "eval_steps_per_second": 0.749, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.6189800416679248, | |
| "grad_norm": 2.8832485675811768, | |
| "learning_rate": 1.8563620991605775e-05, | |
| "loss": 2.7374, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.6189800416679248, | |
| "eval_cosine_accuracy": 0.9632057547569275, | |
| "eval_loss": 1.2746211290359497, | |
| "eval_runtime": 25.079, | |
| "eval_samples_per_second": 377.129, | |
| "eval_steps_per_second": 0.758, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.6340771158549473, | |
| "grad_norm": 2.9037137031555176, | |
| "learning_rate": 1.9015641041125672e-05, | |
| "loss": 2.7352, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.6340771158549473, | |
| "eval_cosine_accuracy": 0.9629942774772644, | |
| "eval_loss": 1.276559829711914, | |
| "eval_runtime": 25.1377, | |
| "eval_samples_per_second": 376.248, | |
| "eval_steps_per_second": 0.756, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.6491741900419699, | |
| "grad_norm": 2.797557830810547, | |
| "learning_rate": 1.9468566942448216e-05, | |
| "loss": 2.7224, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.6491741900419699, | |
| "eval_cosine_accuracy": 0.9640516042709351, | |
| "eval_loss": 1.2923390865325928, | |
| "eval_runtime": 24.7534, | |
| "eval_samples_per_second": 382.09, | |
| "eval_steps_per_second": 0.768, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.6642712642289924, | |
| "grad_norm": 2.7658629417419434, | |
| "learning_rate": 1.992149284377076e-05, | |
| "loss": 2.7047, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.6642712642289924, | |
| "eval_cosine_accuracy": 0.963100016117096, | |
| "eval_loss": 1.2848035097122192, | |
| "eval_runtime": 26.2586, | |
| "eval_samples_per_second": 360.187, | |
| "eval_steps_per_second": 0.724, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.679368338416015, | |
| "grad_norm": 2.8352978229522705, | |
| "learning_rate": 2.0374418745093303e-05, | |
| "loss": 2.7037, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.679368338416015, | |
| "eval_cosine_accuracy": 0.9647917151451111, | |
| "eval_loss": 1.294427514076233, | |
| "eval_runtime": 25.7103, | |
| "eval_samples_per_second": 367.868, | |
| "eval_steps_per_second": 0.739, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.6944654126030375, | |
| "grad_norm": 2.6762068271636963, | |
| "learning_rate": 2.0827344646415847e-05, | |
| "loss": 2.6983, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.6944654126030375, | |
| "eval_cosine_accuracy": 0.9636286497116089, | |
| "eval_loss": 1.282760739326477, | |
| "eval_runtime": 25.5021, | |
| "eval_samples_per_second": 370.871, | |
| "eval_steps_per_second": 0.745, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.7095624867900601, | |
| "grad_norm": 2.7681946754455566, | |
| "learning_rate": 2.1279364695935747e-05, | |
| "loss": 2.6881, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.7095624867900601, | |
| "eval_cosine_accuracy": 0.9646859765052795, | |
| "eval_loss": 1.2692368030548096, | |
| "eval_runtime": 24.7607, | |
| "eval_samples_per_second": 381.976, | |
| "eval_steps_per_second": 0.767, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.7246595609770826, | |
| "grad_norm": 2.740656852722168, | |
| "learning_rate": 2.1732290597258287e-05, | |
| "loss": 2.6803, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.7246595609770826, | |
| "eval_cosine_accuracy": 0.9648974537849426, | |
| "eval_loss": 1.289289951324463, | |
| "eval_runtime": 24.8054, | |
| "eval_samples_per_second": 381.288, | |
| "eval_steps_per_second": 0.766, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.7397566351641052, | |
| "grad_norm": 2.850194215774536, | |
| "learning_rate": 2.218521649858083e-05, | |
| "loss": 2.663, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.7397566351641052, | |
| "eval_cosine_accuracy": 0.9658490419387817, | |
| "eval_loss": 1.2865654230117798, | |
| "eval_runtime": 24.7982, | |
| "eval_samples_per_second": 381.399, | |
| "eval_steps_per_second": 0.766, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.7548537093511277, | |
| "grad_norm": 2.8644144535064697, | |
| "learning_rate": 2.2638142399903378e-05, | |
| "loss": 2.67, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.7548537093511277, | |
| "eval_cosine_accuracy": 0.9647917151451111, | |
| "eval_loss": 1.2818125486373901, | |
| "eval_runtime": 24.8907, | |
| "eval_samples_per_second": 379.981, | |
| "eval_steps_per_second": 0.763, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.7699507835381503, | |
| "grad_norm": 3.004762887954712, | |
| "learning_rate": 2.309106830122592e-05, | |
| "loss": 2.6578, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.7699507835381503, | |
| "eval_cosine_accuracy": 0.9642630815505981, | |
| "eval_loss": 1.2803616523742676, | |
| "eval_runtime": 26.9665, | |
| "eval_samples_per_second": 350.732, | |
| "eval_steps_per_second": 0.705, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.7850478577251728, | |
| "grad_norm": 2.760558605194092, | |
| "learning_rate": 2.3543994202548465e-05, | |
| "loss": 2.6533, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.7850478577251728, | |
| "eval_cosine_accuracy": 0.9646859765052795, | |
| "eval_loss": 1.2608861923217773, | |
| "eval_runtime": 25.6588, | |
| "eval_samples_per_second": 368.607, | |
| "eval_steps_per_second": 0.74, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.8001449319121954, | |
| "grad_norm": 2.9594552516937256, | |
| "learning_rate": 2.3996920103871005e-05, | |
| "loss": 2.6466, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.8001449319121954, | |
| "eval_cosine_accuracy": 0.9634172320365906, | |
| "eval_loss": 1.2851191759109497, | |
| "eval_runtime": 25.6335, | |
| "eval_samples_per_second": 368.971, | |
| "eval_steps_per_second": 0.741, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.8152420060992179, | |
| "grad_norm": 2.670184373855591, | |
| "learning_rate": 2.4449846005193552e-05, | |
| "loss": 2.6269, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.8152420060992179, | |
| "eval_cosine_accuracy": 0.9658490419387817, | |
| "eval_loss": 1.260596752166748, | |
| "eval_runtime": 25.5908, | |
| "eval_samples_per_second": 369.586, | |
| "eval_steps_per_second": 0.742, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.8303390802862405, | |
| "grad_norm": 2.615385055541992, | |
| "learning_rate": 2.490186605471345e-05, | |
| "loss": 2.6246, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.8303390802862405, | |
| "eval_cosine_accuracy": 0.9644744992256165, | |
| "eval_loss": 1.2606145143508911, | |
| "eval_runtime": 25.6253, | |
| "eval_samples_per_second": 369.089, | |
| "eval_steps_per_second": 0.741, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.845436154473263, | |
| "grad_norm": 2.8142237663269043, | |
| "learning_rate": 2.5353886104233347e-05, | |
| "loss": 2.6253, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.845436154473263, | |
| "eval_cosine_accuracy": 0.9644744992256165, | |
| "eval_loss": 1.239962100982666, | |
| "eval_runtime": 25.5806, | |
| "eval_samples_per_second": 369.733, | |
| "eval_steps_per_second": 0.743, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.8605332286602856, | |
| "grad_norm": 2.9930691719055176, | |
| "learning_rate": 2.580681200555589e-05, | |
| "loss": 2.6173, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.8605332286602856, | |
| "eval_cosine_accuracy": 0.9635229706764221, | |
| "eval_loss": 1.2646968364715576, | |
| "eval_runtime": 25.6919, | |
| "eval_samples_per_second": 368.132, | |
| "eval_steps_per_second": 0.74, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.8756303028473081, | |
| "grad_norm": 2.7095680236816406, | |
| "learning_rate": 2.6259737906878434e-05, | |
| "loss": 2.6139, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.8756303028473081, | |
| "eval_cosine_accuracy": 0.9641573429107666, | |
| "eval_loss": 1.239478349685669, | |
| "eval_runtime": 25.6695, | |
| "eval_samples_per_second": 368.452, | |
| "eval_steps_per_second": 0.74, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.8907273770343307, | |
| "grad_norm": 2.694718599319458, | |
| "learning_rate": 2.671266380820098e-05, | |
| "loss": 2.6031, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.8907273770343307, | |
| "eval_cosine_accuracy": 0.9655318260192871, | |
| "eval_loss": 1.2557895183563232, | |
| "eval_runtime": 25.52, | |
| "eval_samples_per_second": 370.611, | |
| "eval_steps_per_second": 0.745, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.9058244512213534, | |
| "grad_norm": 2.7020938396453857, | |
| "learning_rate": 2.7165589709523524e-05, | |
| "loss": 2.5981, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.9058244512213534, | |
| "eval_cosine_accuracy": 0.9648974537849426, | |
| "eval_loss": 1.2560282945632935, | |
| "eval_runtime": 25.5053, | |
| "eval_samples_per_second": 370.825, | |
| "eval_steps_per_second": 0.745, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.9209215254083759, | |
| "grad_norm": 2.7241218090057373, | |
| "learning_rate": 2.7618515610846064e-05, | |
| "loss": 2.5947, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.9209215254083759, | |
| "eval_cosine_accuracy": 0.9668005704879761, | |
| "eval_loss": 1.2227962017059326, | |
| "eval_runtime": 25.4374, | |
| "eval_samples_per_second": 371.815, | |
| "eval_steps_per_second": 0.747, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.9360185995953985, | |
| "grad_norm": 2.6481659412384033, | |
| "learning_rate": 2.8071441512168608e-05, | |
| "loss": 2.5854, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.9360185995953985, | |
| "eval_cosine_accuracy": 0.9651089310646057, | |
| "eval_loss": 1.2358726263046265, | |
| "eval_runtime": 25.5016, | |
| "eval_samples_per_second": 370.878, | |
| "eval_steps_per_second": 0.745, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.951115673782421, | |
| "grad_norm": 2.7532155513763428, | |
| "learning_rate": 2.8524367413491155e-05, | |
| "loss": 2.577, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.951115673782421, | |
| "eval_cosine_accuracy": 0.9650031924247742, | |
| "eval_loss": 1.2355579137802124, | |
| "eval_runtime": 25.495, | |
| "eval_samples_per_second": 370.975, | |
| "eval_steps_per_second": 0.745, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.9662127479694436, | |
| "grad_norm": 2.8746631145477295, | |
| "learning_rate": 2.8976387463011052e-05, | |
| "loss": 2.5767, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.9662127479694436, | |
| "eval_cosine_accuracy": 0.9651089310646057, | |
| "eval_loss": 1.2236350774765015, | |
| "eval_runtime": 25.8758, | |
| "eval_samples_per_second": 365.515, | |
| "eval_steps_per_second": 0.734, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.9813098221564661, | |
| "grad_norm": 2.3970577716827393, | |
| "learning_rate": 2.9429313364333596e-05, | |
| "loss": 3.0097, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.9813098221564661, | |
| "eval_cosine_accuracy": 0.9570733904838562, | |
| "eval_loss": 1.2788145542144775, | |
| "eval_runtime": 25.8609, | |
| "eval_samples_per_second": 365.725, | |
| "eval_steps_per_second": 0.735, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.9964068963434887, | |
| "grad_norm": 4.550646781921387, | |
| "learning_rate": 2.9881333413853493e-05, | |
| "loss": 2.9935, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.9964068963434887, | |
| "eval_cosine_accuracy": 0.9425882697105408, | |
| "eval_loss": 1.4643794298171997, | |
| "eval_runtime": 25.682, | |
| "eval_samples_per_second": 368.274, | |
| "eval_steps_per_second": 0.74, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.011503970530511, | |
| "grad_norm": 2.425041913986206, | |
| "learning_rate": 2.9998979053152034e-05, | |
| "loss": 2.5754, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.011503970530511, | |
| "eval_cosine_accuracy": 0.9621484279632568, | |
| "eval_loss": 1.261721134185791, | |
| "eval_runtime": 25.4189, | |
| "eval_samples_per_second": 372.085, | |
| "eval_steps_per_second": 0.747, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.0266010447175338, | |
| "grad_norm": 2.5277788639068604, | |
| "learning_rate": 2.9994338030101077e-05, | |
| "loss": 2.5581, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.0266010447175338, | |
| "eval_cosine_accuracy": 0.9632057547569275, | |
| "eval_loss": 1.2438400983810425, | |
| "eval_runtime": 25.4498, | |
| "eval_samples_per_second": 371.634, | |
| "eval_steps_per_second": 0.747, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.0416981189045562, | |
| "grad_norm": 2.544344663619995, | |
| "learning_rate": 2.9985949414871202e-05, | |
| "loss": 2.55, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.0416981189045562, | |
| "eval_cosine_accuracy": 0.9640516042709351, | |
| "eval_loss": 1.2398865222930908, | |
| "eval_runtime": 25.6677, | |
| "eval_samples_per_second": 368.478, | |
| "eval_steps_per_second": 0.74, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.0567951930915789, | |
| "grad_norm": 2.7393147945404053, | |
| "learning_rate": 2.997381530406107e-05, | |
| "loss": 2.5385, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.0567951930915789, | |
| "eval_cosine_accuracy": 0.9623599052429199, | |
| "eval_loss": 1.2346230745315552, | |
| "eval_runtime": 25.4184, | |
| "eval_samples_per_second": 372.092, | |
| "eval_steps_per_second": 0.747, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.0718922672786013, | |
| "grad_norm": 2.5983469486236572, | |
| "learning_rate": 2.995793873039533e-05, | |
| "loss": 2.5364, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.0718922672786013, | |
| "eval_cosine_accuracy": 0.9646859765052795, | |
| "eval_loss": 1.2296756505966187, | |
| "eval_runtime": 26.1785, | |
| "eval_samples_per_second": 361.289, | |
| "eval_steps_per_second": 0.726, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.086989341465624, | |
| "grad_norm": 2.7347781658172607, | |
| "learning_rate": 2.9938323661966688e-05, | |
| "loss": 2.5405, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.086989341465624, | |
| "eval_cosine_accuracy": 0.963100016117096, | |
| "eval_loss": 1.242247462272644, | |
| "eval_runtime": 25.9571, | |
| "eval_samples_per_second": 364.371, | |
| "eval_steps_per_second": 0.732, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.1020864156526464, | |
| "grad_norm": 2.627800941467285, | |
| "learning_rate": 2.9915025420967164e-05, | |
| "loss": 2.5253, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.1020864156526464, | |
| "eval_cosine_accuracy": 0.9658490419387817, | |
| "eval_loss": 1.2386823892593384, | |
| "eval_runtime": 25.7101, | |
| "eval_samples_per_second": 367.871, | |
| "eval_steps_per_second": 0.739, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.117183489839669, | |
| "grad_norm": 2.651385545730591, | |
| "learning_rate": 2.9887956452640508e-05, | |
| "loss": 2.5242, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.117183489839669, | |
| "eval_cosine_accuracy": 0.9655318260192871, | |
| "eval_loss": 1.2386800050735474, | |
| "eval_runtime": 25.5529, | |
| "eval_samples_per_second": 370.134, | |
| "eval_steps_per_second": 0.744, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.1322805640266917, | |
| "grad_norm": 2.608771800994873, | |
| "learning_rate": 2.9857166480488943e-05, | |
| "loss": 2.522, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.1322805640266917, | |
| "eval_cosine_accuracy": 0.9636286497116089, | |
| "eval_loss": 1.248028039932251, | |
| "eval_runtime": 25.7727, | |
| "eval_samples_per_second": 366.978, | |
| "eval_steps_per_second": 0.737, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.1473776382137142, | |
| "grad_norm": 2.6256067752838135, | |
| "learning_rate": 2.982266319996783e-05, | |
| "loss": 2.5117, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.1473776382137142, | |
| "eval_cosine_accuracy": 0.964580237865448, | |
| "eval_loss": 1.2536381483078003, | |
| "eval_runtime": 25.6468, | |
| "eval_samples_per_second": 368.778, | |
| "eval_steps_per_second": 0.741, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.1624747124007366, | |
| "grad_norm": 2.654273271560669, | |
| "learning_rate": 2.9784455234613855e-05, | |
| "loss": 2.5106, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.1624747124007366, | |
| "eval_cosine_accuracy": 0.9640516042709351, | |
| "eval_loss": 1.2445483207702637, | |
| "eval_runtime": 25.6195, | |
| "eval_samples_per_second": 369.172, | |
| "eval_steps_per_second": 0.742, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.1775717865877593, | |
| "grad_norm": 2.5933237075805664, | |
| "learning_rate": 2.9742552133889753e-05, | |
| "loss": 2.5084, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.1775717865877593, | |
| "eval_cosine_accuracy": 0.9639458656311035, | |
| "eval_loss": 1.2357311248779297, | |
| "eval_runtime": 25.6068, | |
| "eval_samples_per_second": 369.355, | |
| "eval_steps_per_second": 0.742, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.192668860774782, | |
| "grad_norm": 2.682175636291504, | |
| "learning_rate": 2.9696964370797545e-05, | |
| "loss": 2.508, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.192668860774782, | |
| "eval_cosine_accuracy": 0.963840126991272, | |
| "eval_loss": 1.2286454439163208, | |
| "eval_runtime": 25.9048, | |
| "eval_samples_per_second": 365.107, | |
| "eval_steps_per_second": 0.733, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.2077659349618044, | |
| "grad_norm": 2.6224212646484375, | |
| "learning_rate": 2.964770333926099e-05, | |
| "loss": 2.5002, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.2077659349618044, | |
| "eval_cosine_accuracy": 0.9632057547569275, | |
| "eval_loss": 1.2165610790252686, | |
| "eval_runtime": 25.7909, | |
| "eval_samples_per_second": 366.719, | |
| "eval_steps_per_second": 0.737, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.2228630091488268, | |
| "grad_norm": 2.700840473175049, | |
| "learning_rate": 2.959489084024884e-05, | |
| "loss": 2.4964, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.2228630091488268, | |
| "eval_cosine_accuracy": 0.9658490419387817, | |
| "eval_loss": 1.210123062133789, | |
| "eval_runtime": 27.3723, | |
| "eval_samples_per_second": 345.531, | |
| "eval_steps_per_second": 0.694, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.2379600833358495, | |
| "grad_norm": 2.504908323287964, | |
| "learning_rate": 2.9538328404465586e-05, | |
| "loss": 2.4908, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.2379600833358495, | |
| "eval_cosine_accuracy": 0.9654260873794556, | |
| "eval_loss": 1.2111036777496338, | |
| "eval_runtime": 27.9784, | |
| "eval_samples_per_second": 338.046, | |
| "eval_steps_per_second": 0.679, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.2530571575228722, | |
| "grad_norm": 2.440549850463867, | |
| "learning_rate": 2.9478132348731005e-05, | |
| "loss": 2.49, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.2530571575228722, | |
| "eval_cosine_accuracy": 0.9643687605857849, | |
| "eval_loss": 1.2143770456314087, | |
| "eval_runtime": 26.0195, | |
| "eval_samples_per_second": 363.496, | |
| "eval_steps_per_second": 0.73, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.2681542317098946, | |
| "grad_norm": 2.454235076904297, | |
| "learning_rate": 2.9414317718075254e-05, | |
| "loss": 2.4876, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.2681542317098946, | |
| "eval_cosine_accuracy": 0.9656375646591187, | |
| "eval_loss": 1.2297664880752563, | |
| "eval_runtime": 26.1652, | |
| "eval_samples_per_second": 361.473, | |
| "eval_steps_per_second": 0.726, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.283251305896917, | |
| "grad_norm": 2.669309616088867, | |
| "learning_rate": 2.934703888081519e-05, | |
| "loss": 2.4891, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.283251305896917, | |
| "eval_cosine_accuracy": 0.9665891528129578, | |
| "eval_loss": 1.2173755168914795, | |
| "eval_runtime": 26.9222, | |
| "eval_samples_per_second": 351.308, | |
| "eval_steps_per_second": 0.706, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.2983483800839397, | |
| "grad_norm": 2.5629279613494873, | |
| "learning_rate": 2.9276043003176936e-05, | |
| "loss": 2.4832, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.2983483800839397, | |
| "eval_cosine_accuracy": 0.9662719368934631, | |
| "eval_loss": 1.228603720664978, | |
| "eval_runtime": 25.6824, | |
| "eval_samples_per_second": 368.268, | |
| "eval_steps_per_second": 0.74, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.3134454542709624, | |
| "grad_norm": 2.510059356689453, | |
| "learning_rate": 2.9201479059579577e-05, | |
| "loss": 2.4836, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.3134454542709624, | |
| "eval_cosine_accuracy": 0.9675406813621521, | |
| "eval_loss": 1.2030081748962402, | |
| "eval_runtime": 37.7702, | |
| "eval_samples_per_second": 250.409, | |
| "eval_steps_per_second": 0.503, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.3285425284579848, | |
| "grad_norm": 2.399697780609131, | |
| "learning_rate": 2.9123365686074353e-05, | |
| "loss": 2.4719, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.3285425284579848, | |
| "eval_cosine_accuracy": 0.9673292636871338, | |
| "eval_loss": 1.2092621326446533, | |
| "eval_runtime": 25.8999, | |
| "eval_samples_per_second": 365.175, | |
| "eval_steps_per_second": 0.734, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.3436396026450075, | |
| "grad_norm": 2.5001821517944336, | |
| "learning_rate": 2.9041722405835078e-05, | |
| "loss": 2.4712, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.3436396026450075, | |
| "eval_cosine_accuracy": 0.9651089310646057, | |
| "eval_loss": 1.2292317152023315, | |
| "eval_runtime": 25.6099, | |
| "eval_samples_per_second": 369.311, | |
| "eval_steps_per_second": 0.742, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.35873667683203, | |
| "grad_norm": 2.411466598510742, | |
| "learning_rate": 2.8956743418323756e-05, | |
| "loss": 2.4672, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.35873667683203, | |
| "eval_cosine_accuracy": 0.9637343883514404, | |
| "eval_loss": 1.2306199073791504, | |
| "eval_runtime": 25.7851, | |
| "eval_samples_per_second": 366.801, | |
| "eval_steps_per_second": 0.737, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.3738337510190526, | |
| "grad_norm": 2.5372798442840576, | |
| "learning_rate": 2.886810937262726e-05, | |
| "loss": 2.466, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.3738337510190526, | |
| "eval_cosine_accuracy": 0.9664834141731262, | |
| "eval_loss": 1.2176159620285034, | |
| "eval_runtime": 25.7619, | |
| "eval_samples_per_second": 367.131, | |
| "eval_steps_per_second": 0.738, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.388930825206075, | |
| "grad_norm": 2.5013914108276367, | |
| "learning_rate": 2.877600921738216e-05, | |
| "loss": 2.4531, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.388930825206075, | |
| "eval_cosine_accuracy": 0.9668005704879761, | |
| "eval_loss": 1.2011289596557617, | |
| "eval_runtime": 26.2915, | |
| "eval_samples_per_second": 359.737, | |
| "eval_steps_per_second": 0.723, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.4040278993930977, | |
| "grad_norm": 2.503460645675659, | |
| "learning_rate": 2.8680465971532057e-05, | |
| "loss": 2.4532, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.4040278993930977, | |
| "eval_cosine_accuracy": 0.9676464200019836, | |
| "eval_loss": 1.2036337852478027, | |
| "eval_runtime": 26.2087, | |
| "eval_samples_per_second": 360.873, | |
| "eval_steps_per_second": 0.725, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.4191249735801201, | |
| "grad_norm": 2.513176679611206, | |
| "learning_rate": 2.8581704835553698e-05, | |
| "loss": 2.4583, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.4191249735801201, | |
| "eval_cosine_accuracy": 0.9673292636871338, | |
| "eval_loss": 1.2205578088760376, | |
| "eval_runtime": 26.2508, | |
| "eval_samples_per_second": 360.294, | |
| "eval_steps_per_second": 0.724, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.4342220477671428, | |
| "grad_norm": 2.671133279800415, | |
| "learning_rate": 2.8479354665239807e-05, | |
| "loss": 2.4531, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.4342220477671428, | |
| "eval_cosine_accuracy": 0.9661661982536316, | |
| "eval_loss": 1.209764838218689, | |
| "eval_runtime": 26.2234, | |
| "eval_samples_per_second": 360.67, | |
| "eval_steps_per_second": 0.725, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.4493191219541652, | |
| "grad_norm": 2.3808069229125977, | |
| "learning_rate": 2.8373635548326275e-05, | |
| "loss": 2.447, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.4493191219541652, | |
| "eval_cosine_accuracy": 0.9660604596138, | |
| "eval_loss": 1.2082879543304443, | |
| "eval_runtime": 26.2685, | |
| "eval_samples_per_second": 360.051, | |
| "eval_steps_per_second": 0.723, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.464416196141188, | |
| "grad_norm": 2.3825228214263916, | |
| "learning_rate": 2.826457390759583e-05, | |
| "loss": 2.4447, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.464416196141188, | |
| "eval_cosine_accuracy": 0.9655318260192871, | |
| "eval_loss": 1.2263509035110474, | |
| "eval_runtime": 26.4189, | |
| "eval_samples_per_second": 358.001, | |
| "eval_steps_per_second": 0.719, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.4795132703282103, | |
| "grad_norm": 2.4729747772216797, | |
| "learning_rate": 2.81524250451576e-05, | |
| "loss": 2.4429, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.4795132703282103, | |
| "eval_cosine_accuracy": 0.9662719368934631, | |
| "eval_loss": 1.2109318971633911, | |
| "eval_runtime": 26.3423, | |
| "eval_samples_per_second": 359.042, | |
| "eval_steps_per_second": 0.721, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.494610344515233, | |
| "grad_norm": 2.4820468425750732, | |
| "learning_rate": 2.8036767505747488e-05, | |
| "loss": 2.4432, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.494610344515233, | |
| "eval_cosine_accuracy": 0.9655318260192871, | |
| "eval_loss": 1.2026188373565674, | |
| "eval_runtime": 26.1704, | |
| "eval_samples_per_second": 361.4, | |
| "eval_steps_per_second": 0.726, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.5097074187022557, | |
| "grad_norm": 2.7118115425109863, | |
| "learning_rate": 2.7917851637237094e-05, | |
| "loss": 2.4349, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.5097074187022557, | |
| "eval_cosine_accuracy": 0.9655318260192871, | |
| "eval_loss": 1.2329705953598022, | |
| "eval_runtime": 26.0247, | |
| "eval_samples_per_second": 363.423, | |
| "eval_steps_per_second": 0.73, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.524804492889278, | |
| "grad_norm": 2.5479893684387207, | |
| "learning_rate": 2.7795707160723676e-05, | |
| "loss": 2.4413, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.524804492889278, | |
| "eval_cosine_accuracy": 0.9640516042709351, | |
| "eval_loss": 1.217067003250122, | |
| "eval_runtime": 26.3236, | |
| "eval_samples_per_second": 359.298, | |
| "eval_steps_per_second": 0.722, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.5399015670763005, | |
| "grad_norm": 2.5022010803222656, | |
| "learning_rate": 2.7670364604242808e-05, | |
| "loss": 2.4314, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.5399015670763005, | |
| "eval_cosine_accuracy": 0.9639458656311035, | |
| "eval_loss": 1.1970478296279907, | |
| "eval_runtime": 26.0308, | |
| "eval_samples_per_second": 363.339, | |
| "eval_steps_per_second": 0.73, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.5549986412633232, | |
| "grad_norm": 2.469269275665283, | |
| "learning_rate": 2.754211545295838e-05, | |
| "loss": 2.4311, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.5549986412633232, | |
| "eval_cosine_accuracy": 0.9654260873794556, | |
| "eval_loss": 1.2095298767089844, | |
| "eval_runtime": 26.0282, | |
| "eval_samples_per_second": 363.375, | |
| "eval_steps_per_second": 0.73, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.5700957154503459, | |
| "grad_norm": 2.4655797481536865, | |
| "learning_rate": 2.7410477746743892e-05, | |
| "loss": 2.4308, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.5700957154503459, | |
| "eval_cosine_accuracy": 0.9664834141731262, | |
| "eval_loss": 1.1810933351516724, | |
| "eval_runtime": 26.3782, | |
| "eval_samples_per_second": 358.553, | |
| "eval_steps_per_second": 0.72, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.5851927896373683, | |
| "grad_norm": 2.4508461952209473, | |
| "learning_rate": 2.7275738242420885e-05, | |
| "loss": 2.4266, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.5851927896373683, | |
| "eval_cosine_accuracy": 0.9650031924247742, | |
| "eval_loss": 1.1934112310409546, | |
| "eval_runtime": 26.3979, | |
| "eval_samples_per_second": 358.286, | |
| "eval_steps_per_second": 0.72, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.6002898638243908, | |
| "grad_norm": 2.5196404457092285, | |
| "learning_rate": 2.713793061594835e-05, | |
| "loss": 2.4273, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.6002898638243908, | |
| "eval_cosine_accuracy": 0.9658490419387817, | |
| "eval_loss": 1.1869851350784302, | |
| "eval_runtime": 26.2175, | |
| "eval_samples_per_second": 360.751, | |
| "eval_steps_per_second": 0.725, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.6153869380114134, | |
| "grad_norm": 2.488192081451416, | |
| "learning_rate": 2.699737399706483e-05, | |
| "loss": 2.4187, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.6153869380114134, | |
| "eval_cosine_accuracy": 0.9654260873794556, | |
| "eval_loss": 1.191279649734497, | |
| "eval_runtime": 25.5662, | |
| "eval_samples_per_second": 369.941, | |
| "eval_steps_per_second": 0.743, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.630484012198436, | |
| "grad_norm": 2.568108558654785, | |
| "learning_rate": 2.6853540174198254e-05, | |
| "loss": 2.4229, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.630484012198436, | |
| "eval_cosine_accuracy": 0.9655318260192871, | |
| "eval_loss": 1.198256254196167, | |
| "eval_runtime": 25.1113, | |
| "eval_samples_per_second": 376.643, | |
| "eval_steps_per_second": 0.757, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.6455810863854585, | |
| "grad_norm": 2.6221678256988525, | |
| "learning_rate": 2.670704027601641e-05, | |
| "loss": 2.4225, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 1.6455810863854585, | |
| "eval_cosine_accuracy": 0.9662719368934631, | |
| "eval_loss": 1.1917139291763306, | |
| "eval_runtime": 25.0122, | |
| "eval_samples_per_second": 378.135, | |
| "eval_steps_per_second": 0.76, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 1.660678160572481, | |
| "grad_norm": 2.439676284790039, | |
| "learning_rate": 2.6557323756112153e-05, | |
| "loss": 2.4171, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.660678160572481, | |
| "eval_cosine_accuracy": 0.965320348739624, | |
| "eval_loss": 1.203063726425171, | |
| "eval_runtime": 24.9951, | |
| "eval_samples_per_second": 378.394, | |
| "eval_steps_per_second": 0.76, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.6757752347595036, | |
| "grad_norm": 2.397371530532837, | |
| "learning_rate": 2.6404718670135114e-05, | |
| "loss": 2.4091, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 1.6757752347595036, | |
| "eval_cosine_accuracy": 0.9660604596138, | |
| "eval_loss": 1.1928857564926147, | |
| "eval_runtime": 24.9359, | |
| "eval_samples_per_second": 379.292, | |
| "eval_steps_per_second": 0.762, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 1.6908723089465263, | |
| "grad_norm": 2.433346748352051, | |
| "learning_rate": 2.6249263159257348e-05, | |
| "loss": 2.4143, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.6908723089465263, | |
| "eval_cosine_accuracy": 0.9655318260192871, | |
| "eval_loss": 1.1848769187927246, | |
| "eval_runtime": 25.0231, | |
| "eval_samples_per_second": 377.971, | |
| "eval_steps_per_second": 0.759, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.7059693831335487, | |
| "grad_norm": 2.4731833934783936, | |
| "learning_rate": 2.6090996077068493e-05, | |
| "loss": 2.4049, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 1.7059693831335487, | |
| "eval_cosine_accuracy": 0.9664834141731262, | |
| "eval_loss": 1.168144941329956, | |
| "eval_runtime": 25.7021, | |
| "eval_samples_per_second": 367.985, | |
| "eval_steps_per_second": 0.739, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 1.7210664573205712, | |
| "grad_norm": 2.548151969909668, | |
| "learning_rate": 2.592995697986495e-05, | |
| "loss": 2.4052, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.7210664573205712, | |
| "eval_cosine_accuracy": 0.9677521586418152, | |
| "eval_loss": 1.1782138347625732, | |
| "eval_runtime": 25.2357, | |
| "eval_samples_per_second": 374.787, | |
| "eval_steps_per_second": 0.753, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.7361635315075938, | |
| "grad_norm": 2.4638047218322754, | |
| "learning_rate": 2.5766186116763416e-05, | |
| "loss": 2.4113, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.7361635315075938, | |
| "eval_cosine_accuracy": 0.9657433032989502, | |
| "eval_loss": 1.1835379600524902, | |
| "eval_runtime": 25.2634, | |
| "eval_samples_per_second": 374.376, | |
| "eval_steps_per_second": 0.752, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.7512606056946165, | |
| "grad_norm": 2.5692343711853027, | |
| "learning_rate": 2.559972441964127e-05, | |
| "loss": 2.4055, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.7512606056946165, | |
| "eval_cosine_accuracy": 0.9666948318481445, | |
| "eval_loss": 1.1879708766937256, | |
| "eval_runtime": 25.2328, | |
| "eval_samples_per_second": 374.83, | |
| "eval_steps_per_second": 0.753, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.766357679881639, | |
| "grad_norm": 2.5655770301818848, | |
| "learning_rate": 2.543095433070693e-05, | |
| "loss": 2.3996, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 1.766357679881639, | |
| "eval_cosine_accuracy": 0.9665891528129578, | |
| "eval_loss": 1.2095972299575806, | |
| "eval_runtime": 24.9713, | |
| "eval_samples_per_second": 378.755, | |
| "eval_steps_per_second": 0.761, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 1.7814547540686614, | |
| "grad_norm": 2.4523584842681885, | |
| "learning_rate": 2.525924161220836e-05, | |
| "loss": 2.397, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.7814547540686614, | |
| "eval_cosine_accuracy": 0.9679636359214783, | |
| "eval_loss": 1.1942859888076782, | |
| "eval_runtime": 25.0212, | |
| "eval_samples_per_second": 378.0, | |
| "eval_steps_per_second": 0.759, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.796551828255684, | |
| "grad_norm": 2.3042044639587402, | |
| "learning_rate": 2.508496476226562e-05, | |
| "loss": 2.3954, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 1.796551828255684, | |
| "eval_cosine_accuracy": 0.9660604596138, | |
| "eval_loss": 1.1957042217254639, | |
| "eval_runtime": 24.9958, | |
| "eval_samples_per_second": 378.383, | |
| "eval_steps_per_second": 0.76, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 1.8116489024427067, | |
| "grad_norm": 2.443878412246704, | |
| "learning_rate": 2.4908167338557513e-05, | |
| "loss": 2.3955, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.8116489024427067, | |
| "eval_cosine_accuracy": 0.9656375646591187, | |
| "eval_loss": 1.1924560070037842, | |
| "eval_runtime": 25.298, | |
| "eval_samples_per_second": 373.864, | |
| "eval_steps_per_second": 0.751, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.8267459766297292, | |
| "grad_norm": 2.3578927516937256, | |
| "learning_rate": 2.4728893528739477e-05, | |
| "loss": 2.3901, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 1.8267459766297292, | |
| "eval_cosine_accuracy": 0.9663776755332947, | |
| "eval_loss": 1.1898130178451538, | |
| "eval_runtime": 25.5116, | |
| "eval_samples_per_second": 370.734, | |
| "eval_steps_per_second": 0.745, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 1.8418430508167516, | |
| "grad_norm": 2.4540822505950928, | |
| "learning_rate": 2.4547553946809237e-05, | |
| "loss": 2.3966, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 1.8418430508167516, | |
| "eval_cosine_accuracy": 0.9666948318481445, | |
| "eval_loss": 1.205179214477539, | |
| "eval_runtime": 25.3455, | |
| "eval_samples_per_second": 373.162, | |
| "eval_steps_per_second": 0.75, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 1.8569401250037743, | |
| "grad_norm": 2.4436283111572266, | |
| "learning_rate": 2.4363467118878128e-05, | |
| "loss": 2.3934, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 1.8569401250037743, | |
| "eval_cosine_accuracy": 0.9668005704879761, | |
| "eval_loss": 1.1965515613555908, | |
| "eval_runtime": 25.134, | |
| "eval_samples_per_second": 376.303, | |
| "eval_steps_per_second": 0.756, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 1.872037199190797, | |
| "grad_norm": 2.504840612411499, | |
| "learning_rate": 2.4177040043843623e-05, | |
| "loss": 2.39, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 1.872037199190797, | |
| "eval_cosine_accuracy": 0.9674350023269653, | |
| "eval_loss": 1.1888070106506348, | |
| "eval_runtime": 25.0974, | |
| "eval_samples_per_second": 376.852, | |
| "eval_steps_per_second": 0.757, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 1.8871342733778194, | |
| "grad_norm": 2.423356771469116, | |
| "learning_rate": 2.39883193161367e-05, | |
| "loss": 2.3842, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 1.8871342733778194, | |
| "eval_cosine_accuracy": 0.9675406813621521, | |
| "eval_loss": 1.1591609716415405, | |
| "eval_runtime": 25.3547, | |
| "eval_samples_per_second": 373.027, | |
| "eval_steps_per_second": 0.749, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 1.9022313475648418, | |
| "grad_norm": null, | |
| "learning_rate": 2.3797352103449708e-05, | |
| "loss": 2.3834, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 1.9022313475648418, | |
| "eval_cosine_accuracy": 0.9675406813621521, | |
| "eval_loss": 1.157774567604065, | |
| "eval_runtime": 25.3953, | |
| "eval_samples_per_second": 372.432, | |
| "eval_steps_per_second": 0.748, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 1.9173284217518645, | |
| "grad_norm": 2.5663857460021973, | |
| "learning_rate": 2.360457462924386e-05, | |
| "loss": 2.3848, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 1.9173284217518645, | |
| "eval_cosine_accuracy": 0.9673292636871338, | |
| "eval_loss": 1.1656157970428467, | |
| "eval_runtime": 25.2211, | |
| "eval_samples_per_second": 375.004, | |
| "eval_steps_per_second": 0.753, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 1.9324254959388871, | |
| "grad_norm": 2.4691641330718994, | |
| "learning_rate": 2.3409262436047722e-05, | |
| "loss": 2.3845, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 1.9324254959388871, | |
| "eval_cosine_accuracy": 0.9679636359214783, | |
| "eval_loss": 1.1666511297225952, | |
| "eval_runtime": 25.1314, | |
| "eval_samples_per_second": 376.341, | |
| "eval_steps_per_second": 0.756, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 1.9475225701259096, | |
| "grad_norm": 2.5696003437042236, | |
| "learning_rate": 2.321184848376942e-05, | |
| "loss": 2.3858, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 1.9475225701259096, | |
| "eval_cosine_accuracy": 0.9668005704879761, | |
| "eval_loss": 1.1749593019485474, | |
| "eval_runtime": 25.1663, | |
| "eval_samples_per_second": 375.82, | |
| "eval_steps_per_second": 0.755, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 1.962619644312932, | |
| "grad_norm": 2.4359042644500732, | |
| "learning_rate": 2.3012382112832128e-05, | |
| "loss": 2.3825, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 1.962619644312932, | |
| "eval_cosine_accuracy": 0.9670120477676392, | |
| "eval_loss": 1.177405595779419, | |
| "eval_runtime": 25.6293, | |
| "eval_samples_per_second": 369.031, | |
| "eval_steps_per_second": 0.741, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 1.9777167184999547, | |
| "grad_norm": 2.386305093765259, | |
| "learning_rate": 2.281131807967385e-05, | |
| "loss": 2.6758, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 1.9777167184999547, | |
| "eval_cosine_accuracy": 0.9617255330085754, | |
| "eval_loss": 1.2431585788726807, | |
| "eval_runtime": 25.5435, | |
| "eval_samples_per_second": 370.27, | |
| "eval_steps_per_second": 0.744, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 1.9928137926869773, | |
| "grad_norm": 4.19161319732666, | |
| "learning_rate": 2.2607900785946504e-05, | |
| "loss": 3.0304, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 1.9928137926869773, | |
| "eval_cosine_accuracy": 0.9528441429138184, | |
| "eval_loss": 1.3187161684036255, | |
| "eval_runtime": 26.0183, | |
| "eval_samples_per_second": 363.514, | |
| "eval_steps_per_second": 0.73, | |
| "step": 66000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 132476, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 512, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |