LamaDiab's picture
Training checkpoint - Epoch 2, Step 66238
5fb1621 verified
Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 66238,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.019414837404511e-05,
"grad_norm": 5.361182689666748,
"learning_rate": 0.0,
"loss": 4.5864,
"step": 1
},
{
"epoch": 0.015097074187022556,
"grad_norm": 4.397288799285889,
"learning_rate": 4.5202004951989857e-07,
"loss": 4.4178,
"step": 500
},
{
"epoch": 0.015097074187022556,
"eval_cosine_accuracy": 0.9432226419448853,
"eval_loss": 1.708533525466919,
"eval_runtime": 24.4327,
"eval_samples_per_second": 387.104,
"eval_steps_per_second": 0.778,
"step": 500
},
{
"epoch": 0.03019414837404511,
"grad_norm": 3.148864984512329,
"learning_rate": 9.049459508424422e-07,
"loss": 4.1295,
"step": 1000
},
{
"epoch": 0.03019414837404511,
"eval_cosine_accuracy": 0.9505180716514587,
"eval_loss": 1.5775120258331299,
"eval_runtime": 24.5488,
"eval_samples_per_second": 385.274,
"eval_steps_per_second": 0.774,
"step": 1000
},
{
"epoch": 0.045291222561067664,
"grad_norm": 2.853407382965088,
"learning_rate": 1.3578718521649858e-06,
"loss": 3.9183,
"step": 1500
},
{
"epoch": 0.045291222561067664,
"eval_cosine_accuracy": 0.9493550658226013,
"eval_loss": 1.5401873588562012,
"eval_runtime": 24.6069,
"eval_samples_per_second": 384.364,
"eval_steps_per_second": 0.772,
"step": 1500
},
{
"epoch": 0.06038829674809022,
"grad_norm": 2.850583791732788,
"learning_rate": 1.8107977534875296e-06,
"loss": 3.7909,
"step": 2000
},
{
"epoch": 0.06038829674809022,
"eval_cosine_accuracy": 0.9497779607772827,
"eval_loss": 1.5130109786987305,
"eval_runtime": 24.9452,
"eval_samples_per_second": 379.151,
"eval_steps_per_second": 0.762,
"step": 2000
},
{
"epoch": 0.07548537093511277,
"grad_norm": 2.7371764183044434,
"learning_rate": 2.263723654810073e-06,
"loss": 3.6971,
"step": 2500
},
{
"epoch": 0.07548537093511277,
"eval_cosine_accuracy": 0.9504123330116272,
"eval_loss": 1.4736826419830322,
"eval_runtime": 24.9145,
"eval_samples_per_second": 379.618,
"eval_steps_per_second": 0.763,
"step": 2500
},
{
"epoch": 0.09058244512213533,
"grad_norm": 2.864781379699707,
"learning_rate": 2.7166495561326167e-06,
"loss": 3.6265,
"step": 3000
},
{
"epoch": 0.09058244512213533,
"eval_cosine_accuracy": 0.9510467052459717,
"eval_loss": 1.441090703010559,
"eval_runtime": 25.0444,
"eval_samples_per_second": 377.65,
"eval_steps_per_second": 0.759,
"step": 3000
},
{
"epoch": 0.10567951930915788,
"grad_norm": 2.8439295291900635,
"learning_rate": 3.1695754574551603e-06,
"loss": 3.5424,
"step": 3500
},
{
"epoch": 0.10567951930915788,
"eval_cosine_accuracy": 0.952421247959137,
"eval_loss": 1.419554591178894,
"eval_runtime": 24.765,
"eval_samples_per_second": 381.91,
"eval_steps_per_second": 0.767,
"step": 3500
},
{
"epoch": 0.12077659349618045,
"grad_norm": 2.7819902896881104,
"learning_rate": 3.6225013587777042e-06,
"loss": 3.4728,
"step": 4000
},
{
"epoch": 0.12077659349618045,
"eval_cosine_accuracy": 0.9529498815536499,
"eval_loss": 1.405138373374939,
"eval_runtime": 24.8221,
"eval_samples_per_second": 381.031,
"eval_steps_per_second": 0.765,
"step": 4000
},
{
"epoch": 0.135873667683203,
"grad_norm": 3.0670077800750732,
"learning_rate": 4.075427260100247e-06,
"loss": 3.4177,
"step": 4500
},
{
"epoch": 0.135873667683203,
"eval_cosine_accuracy": 0.9542186260223389,
"eval_loss": 1.3949991464614868,
"eval_runtime": 24.7597,
"eval_samples_per_second": 381.991,
"eval_steps_per_second": 0.767,
"step": 4500
},
{
"epoch": 0.15097074187022555,
"grad_norm": 3.037975549697876,
"learning_rate": 4.528353161422791e-06,
"loss": 3.3723,
"step": 5000
},
{
"epoch": 0.15097074187022555,
"eval_cosine_accuracy": 0.954430103302002,
"eval_loss": 1.400901436805725,
"eval_runtime": 24.8332,
"eval_samples_per_second": 380.861,
"eval_steps_per_second": 0.765,
"step": 5000
},
{
"epoch": 0.1660678160572481,
"grad_norm": 2.9103686809539795,
"learning_rate": 4.9812790627453345e-06,
"loss": 3.314,
"step": 5500
},
{
"epoch": 0.1660678160572481,
"eval_cosine_accuracy": 0.9536899924278259,
"eval_loss": 1.376301884651184,
"eval_runtime": 24.9503,
"eval_samples_per_second": 379.074,
"eval_steps_per_second": 0.762,
"step": 5500
},
{
"epoch": 0.18116489024427065,
"grad_norm": 2.823807716369629,
"learning_rate": 5.4332991122652334e-06,
"loss": 3.2727,
"step": 6000
},
{
"epoch": 0.18116489024427065,
"eval_cosine_accuracy": 0.9555931687355042,
"eval_loss": 1.3673756122589111,
"eval_runtime": 24.7308,
"eval_samples_per_second": 382.438,
"eval_steps_per_second": 0.768,
"step": 6000
},
{
"epoch": 0.1962619644312932,
"grad_norm": 2.8840878009796143,
"learning_rate": 5.886225013587777e-06,
"loss": 3.2443,
"step": 6500
},
{
"epoch": 0.1962619644312932,
"eval_cosine_accuracy": 0.9550644755363464,
"eval_loss": 1.3680225610733032,
"eval_runtime": 24.7458,
"eval_samples_per_second": 382.207,
"eval_steps_per_second": 0.768,
"step": 6500
},
{
"epoch": 0.21135903861831576,
"grad_norm": 2.826021194458008,
"learning_rate": 6.3391509149103205e-06,
"loss": 3.2021,
"step": 7000
},
{
"epoch": 0.21135903861831576,
"eval_cosine_accuracy": 0.9558045864105225,
"eval_loss": 1.3505871295928955,
"eval_runtime": 24.9126,
"eval_samples_per_second": 379.648,
"eval_steps_per_second": 0.763,
"step": 7000
},
{
"epoch": 0.22645611280533834,
"grad_norm": 2.8310251235961914,
"learning_rate": 6.792076816232864e-06,
"loss": 3.1722,
"step": 7500
},
{
"epoch": 0.22645611280533834,
"eval_cosine_accuracy": 0.9562275409698486,
"eval_loss": 1.3529975414276123,
"eval_runtime": 25.1169,
"eval_samples_per_second": 376.559,
"eval_steps_per_second": 0.756,
"step": 7500
},
{
"epoch": 0.2415531869923609,
"grad_norm": 2.9824509620666504,
"learning_rate": 7.2450027175554085e-06,
"loss": 3.141,
"step": 8000
},
{
"epoch": 0.2415531869923609,
"eval_cosine_accuracy": 0.9558045864105225,
"eval_loss": 1.360655665397644,
"eval_runtime": 24.8793,
"eval_samples_per_second": 380.155,
"eval_steps_per_second": 0.764,
"step": 8000
},
{
"epoch": 0.25665026117938344,
"grad_norm": 2.9972078800201416,
"learning_rate": 7.697928618877951e-06,
"loss": 3.1142,
"step": 8500
},
{
"epoch": 0.25665026117938344,
"eval_cosine_accuracy": 0.9562275409698486,
"eval_loss": 1.3441089391708374,
"eval_runtime": 24.8523,
"eval_samples_per_second": 380.568,
"eval_steps_per_second": 0.765,
"step": 8500
},
{
"epoch": 0.271747335366406,
"grad_norm": 2.882063865661621,
"learning_rate": 8.14994866839785e-06,
"loss": 3.0868,
"step": 9000
},
{
"epoch": 0.271747335366406,
"eval_cosine_accuracy": 0.9574962854385376,
"eval_loss": 1.3387176990509033,
"eval_runtime": 24.9253,
"eval_samples_per_second": 379.453,
"eval_steps_per_second": 0.762,
"step": 9000
},
{
"epoch": 0.28684440955342855,
"grad_norm": 3.041818857192993,
"learning_rate": 8.602874569720394e-06,
"loss": 3.0581,
"step": 9500
},
{
"epoch": 0.28684440955342855,
"eval_cosine_accuracy": 0.9586593508720398,
"eval_loss": 1.318073034286499,
"eval_runtime": 24.7817,
"eval_samples_per_second": 381.652,
"eval_steps_per_second": 0.767,
"step": 9500
},
{
"epoch": 0.3019414837404511,
"grad_norm": 3.2543036937713623,
"learning_rate": 9.055800471042937e-06,
"loss": 3.0356,
"step": 10000
},
{
"epoch": 0.3019414837404511,
"eval_cosine_accuracy": 0.9595052003860474,
"eval_loss": 1.313987135887146,
"eval_runtime": 24.8717,
"eval_samples_per_second": 380.272,
"eval_steps_per_second": 0.764,
"step": 10000
},
{
"epoch": 0.31703855792747365,
"grad_norm": 3.0757057666778564,
"learning_rate": 9.50872637236548e-06,
"loss": 3.0144,
"step": 10500
},
{
"epoch": 0.31703855792747365,
"eval_cosine_accuracy": 0.9592937231063843,
"eval_loss": 1.3049250841140747,
"eval_runtime": 45.4301,
"eval_samples_per_second": 208.188,
"eval_steps_per_second": 0.418,
"step": 10500
},
{
"epoch": 0.3321356321144962,
"grad_norm": 3.0446619987487793,
"learning_rate": 9.961652273688024e-06,
"loss": 2.9902,
"step": 11000
},
{
"epoch": 0.3321356321144962,
"eval_cosine_accuracy": 0.9595052003860474,
"eval_loss": 1.308658480644226,
"eval_runtime": 25.5656,
"eval_samples_per_second": 369.95,
"eval_steps_per_second": 0.743,
"step": 11000
},
{
"epoch": 0.34723270630151876,
"grad_norm": 2.9077670574188232,
"learning_rate": 1.041457817501057e-05,
"loss": 2.9743,
"step": 11500
},
{
"epoch": 0.34723270630151876,
"eval_cosine_accuracy": 0.9600338339805603,
"eval_loss": 1.3093079328536987,
"eval_runtime": 25.135,
"eval_samples_per_second": 376.287,
"eval_steps_per_second": 0.756,
"step": 11500
},
{
"epoch": 0.3623297804885413,
"grad_norm": 3.023320198059082,
"learning_rate": 1.0867504076333111e-05,
"loss": 2.9547,
"step": 12000
},
{
"epoch": 0.3623297804885413,
"eval_cosine_accuracy": 0.9597166180610657,
"eval_loss": 1.2953948974609375,
"eval_runtime": 25.1058,
"eval_samples_per_second": 376.726,
"eval_steps_per_second": 0.757,
"step": 12000
},
{
"epoch": 0.37742685467556386,
"grad_norm": 3.1133012771606445,
"learning_rate": 1.1320429977655657e-05,
"loss": 2.9464,
"step": 12500
},
{
"epoch": 0.37742685467556386,
"eval_cosine_accuracy": 0.9593994617462158,
"eval_loss": 1.2992297410964966,
"eval_runtime": 24.7741,
"eval_samples_per_second": 381.77,
"eval_steps_per_second": 0.767,
"step": 12500
},
{
"epoch": 0.3925239288625864,
"grad_norm": 2.912932872772217,
"learning_rate": 1.1772450027175554e-05,
"loss": 2.9343,
"step": 13000
},
{
"epoch": 0.3925239288625864,
"eval_cosine_accuracy": 0.9616197943687439,
"eval_loss": 1.283966302871704,
"eval_runtime": 24.707,
"eval_samples_per_second": 382.807,
"eval_steps_per_second": 0.769,
"step": 13000
},
{
"epoch": 0.40762100304960897,
"grad_norm": 3.116110324859619,
"learning_rate": 1.22253759284981e-05,
"loss": 2.9116,
"step": 13500
},
{
"epoch": 0.40762100304960897,
"eval_cosine_accuracy": 0.9605624675750732,
"eval_loss": 1.2977007627487183,
"eval_runtime": 24.8066,
"eval_samples_per_second": 381.269,
"eval_steps_per_second": 0.766,
"step": 13500
},
{
"epoch": 0.4227180772366315,
"grad_norm": 2.9808337688446045,
"learning_rate": 1.2678301829820641e-05,
"loss": 2.8881,
"step": 14000
},
{
"epoch": 0.4227180772366315,
"eval_cosine_accuracy": 0.9615140557289124,
"eval_loss": 1.2986812591552734,
"eval_runtime": 24.7178,
"eval_samples_per_second": 382.639,
"eval_steps_per_second": 0.769,
"step": 14000
},
{
"epoch": 0.43781515142365407,
"grad_norm": 2.966411828994751,
"learning_rate": 1.3131227731143186e-05,
"loss": 2.8767,
"step": 14500
},
{
"epoch": 0.43781515142365407,
"eval_cosine_accuracy": 0.9616197943687439,
"eval_loss": 1.2957323789596558,
"eval_runtime": 24.7859,
"eval_samples_per_second": 381.588,
"eval_steps_per_second": 0.767,
"step": 14500
},
{
"epoch": 0.4529122256106767,
"grad_norm": 2.91023588180542,
"learning_rate": 1.3583247780663084e-05,
"loss": 2.8615,
"step": 15000
},
{
"epoch": 0.4529122256106767,
"eval_cosine_accuracy": 0.9615140557289124,
"eval_loss": 1.3000255823135376,
"eval_runtime": 24.7801,
"eval_samples_per_second": 381.677,
"eval_steps_per_second": 0.767,
"step": 15000
},
{
"epoch": 0.46800929979769923,
"grad_norm": 3.1636173725128174,
"learning_rate": 1.4036173681985627e-05,
"loss": 2.8495,
"step": 15500
},
{
"epoch": 0.46800929979769923,
"eval_cosine_accuracy": 0.9614083170890808,
"eval_loss": 1.3052477836608887,
"eval_runtime": 24.727,
"eval_samples_per_second": 382.497,
"eval_steps_per_second": 0.768,
"step": 15500
},
{
"epoch": 0.4831063739847218,
"grad_norm": 2.824103355407715,
"learning_rate": 1.448909958330817e-05,
"loss": 2.8396,
"step": 16000
},
{
"epoch": 0.4831063739847218,
"eval_cosine_accuracy": 0.963311493396759,
"eval_loss": 1.3045930862426758,
"eval_runtime": 24.7919,
"eval_samples_per_second": 381.495,
"eval_steps_per_second": 0.766,
"step": 16000
},
{
"epoch": 0.49820344817174433,
"grad_norm": 2.780132532119751,
"learning_rate": 1.4942025484630714e-05,
"loss": 2.8268,
"step": 16500
},
{
"epoch": 0.49820344817174433,
"eval_cosine_accuracy": 0.961831271648407,
"eval_loss": 1.3152496814727783,
"eval_runtime": 24.7154,
"eval_samples_per_second": 382.676,
"eval_steps_per_second": 0.769,
"step": 16500
},
{
"epoch": 0.5133005223587669,
"grad_norm": 3.0766918659210205,
"learning_rate": 1.5394045534150613e-05,
"loss": 2.8204,
"step": 17000
},
{
"epoch": 0.5133005223587669,
"eval_cosine_accuracy": 0.9627828001976013,
"eval_loss": 1.3005013465881348,
"eval_runtime": 24.862,
"eval_samples_per_second": 380.42,
"eval_steps_per_second": 0.764,
"step": 17000
},
{
"epoch": 0.5283975965457894,
"grad_norm": 3.02154541015625,
"learning_rate": 1.5846971435473157e-05,
"loss": 2.8037,
"step": 17500
},
{
"epoch": 0.5283975965457894,
"eval_cosine_accuracy": 0.9629942774772644,
"eval_loss": 1.298316240310669,
"eval_runtime": 24.7624,
"eval_samples_per_second": 381.95,
"eval_steps_per_second": 0.767,
"step": 17500
},
{
"epoch": 0.543494670732812,
"grad_norm": 2.928821086883545,
"learning_rate": 1.62998973367957e-05,
"loss": 2.7931,
"step": 18000
},
{
"epoch": 0.543494670732812,
"eval_cosine_accuracy": 0.9620426893234253,
"eval_loss": 1.3049898147583008,
"eval_runtime": 24.7029,
"eval_samples_per_second": 382.87,
"eval_steps_per_second": 0.769,
"step": 18000
},
{
"epoch": 0.5585917449198345,
"grad_norm": 2.72847843170166,
"learning_rate": 1.6752823238118247e-05,
"loss": 2.7864,
"step": 18500
},
{
"epoch": 0.5585917449198345,
"eval_cosine_accuracy": 0.962571382522583,
"eval_loss": 1.3096941709518433,
"eval_runtime": 24.7148,
"eval_samples_per_second": 382.686,
"eval_steps_per_second": 0.769,
"step": 18500
},
{
"epoch": 0.5736888191068571,
"grad_norm": 2.908419609069824,
"learning_rate": 1.7204843287638144e-05,
"loss": 2.7714,
"step": 19000
},
{
"epoch": 0.5736888191068571,
"eval_cosine_accuracy": 0.9614083170890808,
"eval_loss": 1.3053295612335205,
"eval_runtime": 24.766,
"eval_samples_per_second": 381.895,
"eval_steps_per_second": 0.767,
"step": 19000
},
{
"epoch": 0.5887858932938796,
"grad_norm": 3.065990686416626,
"learning_rate": 1.7657769188960685e-05,
"loss": 2.7555,
"step": 19500
},
{
"epoch": 0.5887858932938796,
"eval_cosine_accuracy": 0.9626771211624146,
"eval_loss": 1.2939248085021973,
"eval_runtime": 24.9445,
"eval_samples_per_second": 379.162,
"eval_steps_per_second": 0.762,
"step": 19500
},
{
"epoch": 0.6038829674809022,
"grad_norm": 2.832427501678467,
"learning_rate": 1.8110695090283228e-05,
"loss": 2.7539,
"step": 20000
},
{
"epoch": 0.6038829674809022,
"eval_cosine_accuracy": 0.9627828001976013,
"eval_loss": 1.2884334325790405,
"eval_runtime": 25.3776,
"eval_samples_per_second": 372.691,
"eval_steps_per_second": 0.749,
"step": 20000
},
{
"epoch": 0.6189800416679248,
"grad_norm": 2.8832485675811768,
"learning_rate": 1.8563620991605775e-05,
"loss": 2.7374,
"step": 20500
},
{
"epoch": 0.6189800416679248,
"eval_cosine_accuracy": 0.9632057547569275,
"eval_loss": 1.2746211290359497,
"eval_runtime": 25.079,
"eval_samples_per_second": 377.129,
"eval_steps_per_second": 0.758,
"step": 20500
},
{
"epoch": 0.6340771158549473,
"grad_norm": 2.9037137031555176,
"learning_rate": 1.9015641041125672e-05,
"loss": 2.7352,
"step": 21000
},
{
"epoch": 0.6340771158549473,
"eval_cosine_accuracy": 0.9629942774772644,
"eval_loss": 1.276559829711914,
"eval_runtime": 25.1377,
"eval_samples_per_second": 376.248,
"eval_steps_per_second": 0.756,
"step": 21000
},
{
"epoch": 0.6491741900419699,
"grad_norm": 2.797557830810547,
"learning_rate": 1.9468566942448216e-05,
"loss": 2.7224,
"step": 21500
},
{
"epoch": 0.6491741900419699,
"eval_cosine_accuracy": 0.9640516042709351,
"eval_loss": 1.2923390865325928,
"eval_runtime": 24.7534,
"eval_samples_per_second": 382.09,
"eval_steps_per_second": 0.768,
"step": 21500
},
{
"epoch": 0.6642712642289924,
"grad_norm": 2.7658629417419434,
"learning_rate": 1.992149284377076e-05,
"loss": 2.7047,
"step": 22000
},
{
"epoch": 0.6642712642289924,
"eval_cosine_accuracy": 0.963100016117096,
"eval_loss": 1.2848035097122192,
"eval_runtime": 26.2586,
"eval_samples_per_second": 360.187,
"eval_steps_per_second": 0.724,
"step": 22000
},
{
"epoch": 0.679368338416015,
"grad_norm": 2.8352978229522705,
"learning_rate": 2.0374418745093303e-05,
"loss": 2.7037,
"step": 22500
},
{
"epoch": 0.679368338416015,
"eval_cosine_accuracy": 0.9647917151451111,
"eval_loss": 1.294427514076233,
"eval_runtime": 25.7103,
"eval_samples_per_second": 367.868,
"eval_steps_per_second": 0.739,
"step": 22500
},
{
"epoch": 0.6944654126030375,
"grad_norm": 2.6762068271636963,
"learning_rate": 2.0827344646415847e-05,
"loss": 2.6983,
"step": 23000
},
{
"epoch": 0.6944654126030375,
"eval_cosine_accuracy": 0.9636286497116089,
"eval_loss": 1.282760739326477,
"eval_runtime": 25.5021,
"eval_samples_per_second": 370.871,
"eval_steps_per_second": 0.745,
"step": 23000
},
{
"epoch": 0.7095624867900601,
"grad_norm": 2.7681946754455566,
"learning_rate": 2.1279364695935747e-05,
"loss": 2.6881,
"step": 23500
},
{
"epoch": 0.7095624867900601,
"eval_cosine_accuracy": 0.9646859765052795,
"eval_loss": 1.2692368030548096,
"eval_runtime": 24.7607,
"eval_samples_per_second": 381.976,
"eval_steps_per_second": 0.767,
"step": 23500
},
{
"epoch": 0.7246595609770826,
"grad_norm": 2.740656852722168,
"learning_rate": 2.1732290597258287e-05,
"loss": 2.6803,
"step": 24000
},
{
"epoch": 0.7246595609770826,
"eval_cosine_accuracy": 0.9648974537849426,
"eval_loss": 1.289289951324463,
"eval_runtime": 24.8054,
"eval_samples_per_second": 381.288,
"eval_steps_per_second": 0.766,
"step": 24000
},
{
"epoch": 0.7397566351641052,
"grad_norm": 2.850194215774536,
"learning_rate": 2.218521649858083e-05,
"loss": 2.663,
"step": 24500
},
{
"epoch": 0.7397566351641052,
"eval_cosine_accuracy": 0.9658490419387817,
"eval_loss": 1.2865654230117798,
"eval_runtime": 24.7982,
"eval_samples_per_second": 381.399,
"eval_steps_per_second": 0.766,
"step": 24500
},
{
"epoch": 0.7548537093511277,
"grad_norm": 2.8644144535064697,
"learning_rate": 2.2638142399903378e-05,
"loss": 2.67,
"step": 25000
},
{
"epoch": 0.7548537093511277,
"eval_cosine_accuracy": 0.9647917151451111,
"eval_loss": 1.2818125486373901,
"eval_runtime": 24.8907,
"eval_samples_per_second": 379.981,
"eval_steps_per_second": 0.763,
"step": 25000
},
{
"epoch": 0.7699507835381503,
"grad_norm": 3.004762887954712,
"learning_rate": 2.309106830122592e-05,
"loss": 2.6578,
"step": 25500
},
{
"epoch": 0.7699507835381503,
"eval_cosine_accuracy": 0.9642630815505981,
"eval_loss": 1.2803616523742676,
"eval_runtime": 26.9665,
"eval_samples_per_second": 350.732,
"eval_steps_per_second": 0.705,
"step": 25500
},
{
"epoch": 0.7850478577251728,
"grad_norm": 2.760558605194092,
"learning_rate": 2.3543994202548465e-05,
"loss": 2.6533,
"step": 26000
},
{
"epoch": 0.7850478577251728,
"eval_cosine_accuracy": 0.9646859765052795,
"eval_loss": 1.2608861923217773,
"eval_runtime": 25.6588,
"eval_samples_per_second": 368.607,
"eval_steps_per_second": 0.74,
"step": 26000
},
{
"epoch": 0.8001449319121954,
"grad_norm": 2.9594552516937256,
"learning_rate": 2.3996920103871005e-05,
"loss": 2.6466,
"step": 26500
},
{
"epoch": 0.8001449319121954,
"eval_cosine_accuracy": 0.9634172320365906,
"eval_loss": 1.2851191759109497,
"eval_runtime": 25.6335,
"eval_samples_per_second": 368.971,
"eval_steps_per_second": 0.741,
"step": 26500
},
{
"epoch": 0.8152420060992179,
"grad_norm": 2.670184373855591,
"learning_rate": 2.4449846005193552e-05,
"loss": 2.6269,
"step": 27000
},
{
"epoch": 0.8152420060992179,
"eval_cosine_accuracy": 0.9658490419387817,
"eval_loss": 1.260596752166748,
"eval_runtime": 25.5908,
"eval_samples_per_second": 369.586,
"eval_steps_per_second": 0.742,
"step": 27000
},
{
"epoch": 0.8303390802862405,
"grad_norm": 2.615385055541992,
"learning_rate": 2.490186605471345e-05,
"loss": 2.6246,
"step": 27500
},
{
"epoch": 0.8303390802862405,
"eval_cosine_accuracy": 0.9644744992256165,
"eval_loss": 1.2606145143508911,
"eval_runtime": 25.6253,
"eval_samples_per_second": 369.089,
"eval_steps_per_second": 0.741,
"step": 27500
},
{
"epoch": 0.845436154473263,
"grad_norm": 2.8142237663269043,
"learning_rate": 2.5353886104233347e-05,
"loss": 2.6253,
"step": 28000
},
{
"epoch": 0.845436154473263,
"eval_cosine_accuracy": 0.9644744992256165,
"eval_loss": 1.239962100982666,
"eval_runtime": 25.5806,
"eval_samples_per_second": 369.733,
"eval_steps_per_second": 0.743,
"step": 28000
},
{
"epoch": 0.8605332286602856,
"grad_norm": 2.9930691719055176,
"learning_rate": 2.580681200555589e-05,
"loss": 2.6173,
"step": 28500
},
{
"epoch": 0.8605332286602856,
"eval_cosine_accuracy": 0.9635229706764221,
"eval_loss": 1.2646968364715576,
"eval_runtime": 25.6919,
"eval_samples_per_second": 368.132,
"eval_steps_per_second": 0.74,
"step": 28500
},
{
"epoch": 0.8756303028473081,
"grad_norm": 2.7095680236816406,
"learning_rate": 2.6259737906878434e-05,
"loss": 2.6139,
"step": 29000
},
{
"epoch": 0.8756303028473081,
"eval_cosine_accuracy": 0.9641573429107666,
"eval_loss": 1.239478349685669,
"eval_runtime": 25.6695,
"eval_samples_per_second": 368.452,
"eval_steps_per_second": 0.74,
"step": 29000
},
{
"epoch": 0.8907273770343307,
"grad_norm": 2.694718599319458,
"learning_rate": 2.671266380820098e-05,
"loss": 2.6031,
"step": 29500
},
{
"epoch": 0.8907273770343307,
"eval_cosine_accuracy": 0.9655318260192871,
"eval_loss": 1.2557895183563232,
"eval_runtime": 25.52,
"eval_samples_per_second": 370.611,
"eval_steps_per_second": 0.745,
"step": 29500
},
{
"epoch": 0.9058244512213534,
"grad_norm": 2.7020938396453857,
"learning_rate": 2.7165589709523524e-05,
"loss": 2.5981,
"step": 30000
},
{
"epoch": 0.9058244512213534,
"eval_cosine_accuracy": 0.9648974537849426,
"eval_loss": 1.2560282945632935,
"eval_runtime": 25.5053,
"eval_samples_per_second": 370.825,
"eval_steps_per_second": 0.745,
"step": 30000
},
{
"epoch": 0.9209215254083759,
"grad_norm": 2.7241218090057373,
"learning_rate": 2.7618515610846064e-05,
"loss": 2.5947,
"step": 30500
},
{
"epoch": 0.9209215254083759,
"eval_cosine_accuracy": 0.9668005704879761,
"eval_loss": 1.2227962017059326,
"eval_runtime": 25.4374,
"eval_samples_per_second": 371.815,
"eval_steps_per_second": 0.747,
"step": 30500
},
{
"epoch": 0.9360185995953985,
"grad_norm": 2.6481659412384033,
"learning_rate": 2.8071441512168608e-05,
"loss": 2.5854,
"step": 31000
},
{
"epoch": 0.9360185995953985,
"eval_cosine_accuracy": 0.9651089310646057,
"eval_loss": 1.2358726263046265,
"eval_runtime": 25.5016,
"eval_samples_per_second": 370.878,
"eval_steps_per_second": 0.745,
"step": 31000
},
{
"epoch": 0.951115673782421,
"grad_norm": 2.7532155513763428,
"learning_rate": 2.8524367413491155e-05,
"loss": 2.577,
"step": 31500
},
{
"epoch": 0.951115673782421,
"eval_cosine_accuracy": 0.9650031924247742,
"eval_loss": 1.2355579137802124,
"eval_runtime": 25.495,
"eval_samples_per_second": 370.975,
"eval_steps_per_second": 0.745,
"step": 31500
},
{
"epoch": 0.9662127479694436,
"grad_norm": 2.8746631145477295,
"learning_rate": 2.8976387463011052e-05,
"loss": 2.5767,
"step": 32000
},
{
"epoch": 0.9662127479694436,
"eval_cosine_accuracy": 0.9651089310646057,
"eval_loss": 1.2236350774765015,
"eval_runtime": 25.8758,
"eval_samples_per_second": 365.515,
"eval_steps_per_second": 0.734,
"step": 32000
},
{
"epoch": 0.9813098221564661,
"grad_norm": 2.3970577716827393,
"learning_rate": 2.9429313364333596e-05,
"loss": 3.0097,
"step": 32500
},
{
"epoch": 0.9813098221564661,
"eval_cosine_accuracy": 0.9570733904838562,
"eval_loss": 1.2788145542144775,
"eval_runtime": 25.8609,
"eval_samples_per_second": 365.725,
"eval_steps_per_second": 0.735,
"step": 32500
},
{
"epoch": 0.9964068963434887,
"grad_norm": 4.550646781921387,
"learning_rate": 2.9881333413853493e-05,
"loss": 2.9935,
"step": 33000
},
{
"epoch": 0.9964068963434887,
"eval_cosine_accuracy": 0.9425882697105408,
"eval_loss": 1.4643794298171997,
"eval_runtime": 25.682,
"eval_samples_per_second": 368.274,
"eval_steps_per_second": 0.74,
"step": 33000
},
{
"epoch": 1.011503970530511,
"grad_norm": 2.425041913986206,
"learning_rate": 2.9998979053152034e-05,
"loss": 2.5754,
"step": 33500
},
{
"epoch": 1.011503970530511,
"eval_cosine_accuracy": 0.9621484279632568,
"eval_loss": 1.261721134185791,
"eval_runtime": 25.4189,
"eval_samples_per_second": 372.085,
"eval_steps_per_second": 0.747,
"step": 33500
},
{
"epoch": 1.0266010447175338,
"grad_norm": 2.5277788639068604,
"learning_rate": 2.9994338030101077e-05,
"loss": 2.5581,
"step": 34000
},
{
"epoch": 1.0266010447175338,
"eval_cosine_accuracy": 0.9632057547569275,
"eval_loss": 1.2438400983810425,
"eval_runtime": 25.4498,
"eval_samples_per_second": 371.634,
"eval_steps_per_second": 0.747,
"step": 34000
},
{
"epoch": 1.0416981189045562,
"grad_norm": 2.544344663619995,
"learning_rate": 2.9985949414871202e-05,
"loss": 2.55,
"step": 34500
},
{
"epoch": 1.0416981189045562,
"eval_cosine_accuracy": 0.9640516042709351,
"eval_loss": 1.2398865222930908,
"eval_runtime": 25.6677,
"eval_samples_per_second": 368.478,
"eval_steps_per_second": 0.74,
"step": 34500
},
{
"epoch": 1.0567951930915789,
"grad_norm": 2.7393147945404053,
"learning_rate": 2.997381530406107e-05,
"loss": 2.5385,
"step": 35000
},
{
"epoch": 1.0567951930915789,
"eval_cosine_accuracy": 0.9623599052429199,
"eval_loss": 1.2346230745315552,
"eval_runtime": 25.4184,
"eval_samples_per_second": 372.092,
"eval_steps_per_second": 0.747,
"step": 35000
},
{
"epoch": 1.0718922672786013,
"grad_norm": 2.5983469486236572,
"learning_rate": 2.995793873039533e-05,
"loss": 2.5364,
"step": 35500
},
{
"epoch": 1.0718922672786013,
"eval_cosine_accuracy": 0.9646859765052795,
"eval_loss": 1.2296756505966187,
"eval_runtime": 26.1785,
"eval_samples_per_second": 361.289,
"eval_steps_per_second": 0.726,
"step": 35500
},
{
"epoch": 1.086989341465624,
"grad_norm": 2.7347781658172607,
"learning_rate": 2.9938323661966688e-05,
"loss": 2.5405,
"step": 36000
},
{
"epoch": 1.086989341465624,
"eval_cosine_accuracy": 0.963100016117096,
"eval_loss": 1.242247462272644,
"eval_runtime": 25.9571,
"eval_samples_per_second": 364.371,
"eval_steps_per_second": 0.732,
"step": 36000
},
{
"epoch": 1.1020864156526464,
"grad_norm": 2.627800941467285,
"learning_rate": 2.9915025420967164e-05,
"loss": 2.5253,
"step": 36500
},
{
"epoch": 1.1020864156526464,
"eval_cosine_accuracy": 0.9658490419387817,
"eval_loss": 1.2386823892593384,
"eval_runtime": 25.7101,
"eval_samples_per_second": 367.871,
"eval_steps_per_second": 0.739,
"step": 36500
},
{
"epoch": 1.117183489839669,
"grad_norm": 2.651385545730591,
"learning_rate": 2.9887956452640508e-05,
"loss": 2.5242,
"step": 37000
},
{
"epoch": 1.117183489839669,
"eval_cosine_accuracy": 0.9655318260192871,
"eval_loss": 1.2386800050735474,
"eval_runtime": 25.5529,
"eval_samples_per_second": 370.134,
"eval_steps_per_second": 0.744,
"step": 37000
},
{
"epoch": 1.1322805640266917,
"grad_norm": 2.608771800994873,
"learning_rate": 2.9857166480488943e-05,
"loss": 2.522,
"step": 37500
},
{
"epoch": 1.1322805640266917,
"eval_cosine_accuracy": 0.9636286497116089,
"eval_loss": 1.248028039932251,
"eval_runtime": 25.7727,
"eval_samples_per_second": 366.978,
"eval_steps_per_second": 0.737,
"step": 37500
},
{
"epoch": 1.1473776382137142,
"grad_norm": 2.6256067752838135,
"learning_rate": 2.982266319996783e-05,
"loss": 2.5117,
"step": 38000
},
{
"epoch": 1.1473776382137142,
"eval_cosine_accuracy": 0.964580237865448,
"eval_loss": 1.2536381483078003,
"eval_runtime": 25.6468,
"eval_samples_per_second": 368.778,
"eval_steps_per_second": 0.741,
"step": 38000
},
{
"epoch": 1.1624747124007366,
"grad_norm": 2.654273271560669,
"learning_rate": 2.9784455234613855e-05,
"loss": 2.5106,
"step": 38500
},
{
"epoch": 1.1624747124007366,
"eval_cosine_accuracy": 0.9640516042709351,
"eval_loss": 1.2445483207702637,
"eval_runtime": 25.6195,
"eval_samples_per_second": 369.172,
"eval_steps_per_second": 0.742,
"step": 38500
},
{
"epoch": 1.1775717865877593,
"grad_norm": 2.5933237075805664,
"learning_rate": 2.9742552133889753e-05,
"loss": 2.5084,
"step": 39000
},
{
"epoch": 1.1775717865877593,
"eval_cosine_accuracy": 0.9639458656311035,
"eval_loss": 1.2357311248779297,
"eval_runtime": 25.6068,
"eval_samples_per_second": 369.355,
"eval_steps_per_second": 0.742,
"step": 39000
},
{
"epoch": 1.192668860774782,
"grad_norm": 2.682175636291504,
"learning_rate": 2.9696964370797545e-05,
"loss": 2.508,
"step": 39500
},
{
"epoch": 1.192668860774782,
"eval_cosine_accuracy": 0.963840126991272,
"eval_loss": 1.2286454439163208,
"eval_runtime": 25.9048,
"eval_samples_per_second": 365.107,
"eval_steps_per_second": 0.733,
"step": 39500
},
{
"epoch": 1.2077659349618044,
"grad_norm": 2.6224212646484375,
"learning_rate": 2.964770333926099e-05,
"loss": 2.5002,
"step": 40000
},
{
"epoch": 1.2077659349618044,
"eval_cosine_accuracy": 0.9632057547569275,
"eval_loss": 1.2165610790252686,
"eval_runtime": 25.7909,
"eval_samples_per_second": 366.719,
"eval_steps_per_second": 0.737,
"step": 40000
},
{
"epoch": 1.2228630091488268,
"grad_norm": 2.700840473175049,
"learning_rate": 2.959489084024884e-05,
"loss": 2.4964,
"step": 40500
},
{
"epoch": 1.2228630091488268,
"eval_cosine_accuracy": 0.9658490419387817,
"eval_loss": 1.210123062133789,
"eval_runtime": 27.3723,
"eval_samples_per_second": 345.531,
"eval_steps_per_second": 0.694,
"step": 40500
},
{
"epoch": 1.2379600833358495,
"grad_norm": 2.504908323287964,
"learning_rate": 2.9538328404465586e-05,
"loss": 2.4908,
"step": 41000
},
{
"epoch": 1.2379600833358495,
"eval_cosine_accuracy": 0.9654260873794556,
"eval_loss": 1.2111036777496338,
"eval_runtime": 27.9784,
"eval_samples_per_second": 338.046,
"eval_steps_per_second": 0.679,
"step": 41000
},
{
"epoch": 1.2530571575228722,
"grad_norm": 2.440549850463867,
"learning_rate": 2.9478132348731005e-05,
"loss": 2.49,
"step": 41500
},
{
"epoch": 1.2530571575228722,
"eval_cosine_accuracy": 0.9643687605857849,
"eval_loss": 1.2143770456314087,
"eval_runtime": 26.0195,
"eval_samples_per_second": 363.496,
"eval_steps_per_second": 0.73,
"step": 41500
},
{
"epoch": 1.2681542317098946,
"grad_norm": 2.454235076904297,
"learning_rate": 2.9414317718075254e-05,
"loss": 2.4876,
"step": 42000
},
{
"epoch": 1.2681542317098946,
"eval_cosine_accuracy": 0.9656375646591187,
"eval_loss": 1.2297664880752563,
"eval_runtime": 26.1652,
"eval_samples_per_second": 361.473,
"eval_steps_per_second": 0.726,
"step": 42000
},
{
"epoch": 1.283251305896917,
"grad_norm": 2.669309616088867,
"learning_rate": 2.934703888081519e-05,
"loss": 2.4891,
"step": 42500
},
{
"epoch": 1.283251305896917,
"eval_cosine_accuracy": 0.9665891528129578,
"eval_loss": 1.2173755168914795,
"eval_runtime": 26.9222,
"eval_samples_per_second": 351.308,
"eval_steps_per_second": 0.706,
"step": 42500
},
{
"epoch": 1.2983483800839397,
"grad_norm": 2.5629279613494873,
"learning_rate": 2.9276043003176936e-05,
"loss": 2.4832,
"step": 43000
},
{
"epoch": 1.2983483800839397,
"eval_cosine_accuracy": 0.9662719368934631,
"eval_loss": 1.228603720664978,
"eval_runtime": 25.6824,
"eval_samples_per_second": 368.268,
"eval_steps_per_second": 0.74,
"step": 43000
},
{
"epoch": 1.3134454542709624,
"grad_norm": 2.510059356689453,
"learning_rate": 2.9201479059579577e-05,
"loss": 2.4836,
"step": 43500
},
{
"epoch": 1.3134454542709624,
"eval_cosine_accuracy": 0.9675406813621521,
"eval_loss": 1.2030081748962402,
"eval_runtime": 37.7702,
"eval_samples_per_second": 250.409,
"eval_steps_per_second": 0.503,
"step": 43500
},
{
"epoch": 1.3285425284579848,
"grad_norm": 2.399697780609131,
"learning_rate": 2.9123365686074353e-05,
"loss": 2.4719,
"step": 44000
},
{
"epoch": 1.3285425284579848,
"eval_cosine_accuracy": 0.9673292636871338,
"eval_loss": 1.2092621326446533,
"eval_runtime": 25.8999,
"eval_samples_per_second": 365.175,
"eval_steps_per_second": 0.734,
"step": 44000
},
{
"epoch": 1.3436396026450075,
"grad_norm": 2.5001821517944336,
"learning_rate": 2.9041722405835078e-05,
"loss": 2.4712,
"step": 44500
},
{
"epoch": 1.3436396026450075,
"eval_cosine_accuracy": 0.9651089310646057,
"eval_loss": 1.2292317152023315,
"eval_runtime": 25.6099,
"eval_samples_per_second": 369.311,
"eval_steps_per_second": 0.742,
"step": 44500
},
{
"epoch": 1.35873667683203,
"grad_norm": 2.411466598510742,
"learning_rate": 2.8956743418323756e-05,
"loss": 2.4672,
"step": 45000
},
{
"epoch": 1.35873667683203,
"eval_cosine_accuracy": 0.9637343883514404,
"eval_loss": 1.2306199073791504,
"eval_runtime": 25.7851,
"eval_samples_per_second": 366.801,
"eval_steps_per_second": 0.737,
"step": 45000
},
{
"epoch": 1.3738337510190526,
"grad_norm": 2.5372798442840576,
"learning_rate": 2.886810937262726e-05,
"loss": 2.466,
"step": 45500
},
{
"epoch": 1.3738337510190526,
"eval_cosine_accuracy": 0.9664834141731262,
"eval_loss": 1.2176159620285034,
"eval_runtime": 25.7619,
"eval_samples_per_second": 367.131,
"eval_steps_per_second": 0.738,
"step": 45500
},
{
"epoch": 1.388930825206075,
"grad_norm": 2.5013914108276367,
"learning_rate": 2.877600921738216e-05,
"loss": 2.4531,
"step": 46000
},
{
"epoch": 1.388930825206075,
"eval_cosine_accuracy": 0.9668005704879761,
"eval_loss": 1.2011289596557617,
"eval_runtime": 26.2915,
"eval_samples_per_second": 359.737,
"eval_steps_per_second": 0.723,
"step": 46000
},
{
"epoch": 1.4040278993930977,
"grad_norm": 2.503460645675659,
"learning_rate": 2.8680465971532057e-05,
"loss": 2.4532,
"step": 46500
},
{
"epoch": 1.4040278993930977,
"eval_cosine_accuracy": 0.9676464200019836,
"eval_loss": 1.2036337852478027,
"eval_runtime": 26.2087,
"eval_samples_per_second": 360.873,
"eval_steps_per_second": 0.725,
"step": 46500
},
{
"epoch": 1.4191249735801201,
"grad_norm": 2.513176679611206,
"learning_rate": 2.8581704835553698e-05,
"loss": 2.4583,
"step": 47000
},
{
"epoch": 1.4191249735801201,
"eval_cosine_accuracy": 0.9673292636871338,
"eval_loss": 1.2205578088760376,
"eval_runtime": 26.2508,
"eval_samples_per_second": 360.294,
"eval_steps_per_second": 0.724,
"step": 47000
},
{
"epoch": 1.4342220477671428,
"grad_norm": 2.671133279800415,
"learning_rate": 2.8479354665239807e-05,
"loss": 2.4531,
"step": 47500
},
{
"epoch": 1.4342220477671428,
"eval_cosine_accuracy": 0.9661661982536316,
"eval_loss": 1.209764838218689,
"eval_runtime": 26.2234,
"eval_samples_per_second": 360.67,
"eval_steps_per_second": 0.725,
"step": 47500
},
{
"epoch": 1.4493191219541652,
"grad_norm": 2.3808069229125977,
"learning_rate": 2.8373635548326275e-05,
"loss": 2.447,
"step": 48000
},
{
"epoch": 1.4493191219541652,
"eval_cosine_accuracy": 0.9660604596138,
"eval_loss": 1.2082879543304443,
"eval_runtime": 26.2685,
"eval_samples_per_second": 360.051,
"eval_steps_per_second": 0.723,
"step": 48000
},
{
"epoch": 1.464416196141188,
"grad_norm": 2.3825228214263916,
"learning_rate": 2.826457390759583e-05,
"loss": 2.4447,
"step": 48500
},
{
"epoch": 1.464416196141188,
"eval_cosine_accuracy": 0.9655318260192871,
"eval_loss": 1.2263509035110474,
"eval_runtime": 26.4189,
"eval_samples_per_second": 358.001,
"eval_steps_per_second": 0.719,
"step": 48500
},
{
"epoch": 1.4795132703282103,
"grad_norm": 2.4729747772216797,
"learning_rate": 2.81524250451576e-05,
"loss": 2.4429,
"step": 49000
},
{
"epoch": 1.4795132703282103,
"eval_cosine_accuracy": 0.9662719368934631,
"eval_loss": 1.2109318971633911,
"eval_runtime": 26.3423,
"eval_samples_per_second": 359.042,
"eval_steps_per_second": 0.721,
"step": 49000
},
{
"epoch": 1.494610344515233,
"grad_norm": 2.4820468425750732,
"learning_rate": 2.8036767505747488e-05,
"loss": 2.4432,
"step": 49500
},
{
"epoch": 1.494610344515233,
"eval_cosine_accuracy": 0.9655318260192871,
"eval_loss": 1.2026188373565674,
"eval_runtime": 26.1704,
"eval_samples_per_second": 361.4,
"eval_steps_per_second": 0.726,
"step": 49500
},
{
"epoch": 1.5097074187022557,
"grad_norm": 2.7118115425109863,
"learning_rate": 2.7917851637237094e-05,
"loss": 2.4349,
"step": 50000
},
{
"epoch": 1.5097074187022557,
"eval_cosine_accuracy": 0.9655318260192871,
"eval_loss": 1.2329705953598022,
"eval_runtime": 26.0247,
"eval_samples_per_second": 363.423,
"eval_steps_per_second": 0.73,
"step": 50000
},
{
"epoch": 1.524804492889278,
"grad_norm": 2.5479893684387207,
"learning_rate": 2.7795707160723676e-05,
"loss": 2.4413,
"step": 50500
},
{
"epoch": 1.524804492889278,
"eval_cosine_accuracy": 0.9640516042709351,
"eval_loss": 1.217067003250122,
"eval_runtime": 26.3236,
"eval_samples_per_second": 359.298,
"eval_steps_per_second": 0.722,
"step": 50500
},
{
"epoch": 1.5399015670763005,
"grad_norm": 2.5022010803222656,
"learning_rate": 2.7670364604242808e-05,
"loss": 2.4314,
"step": 51000
},
{
"epoch": 1.5399015670763005,
"eval_cosine_accuracy": 0.9639458656311035,
"eval_loss": 1.1970478296279907,
"eval_runtime": 26.0308,
"eval_samples_per_second": 363.339,
"eval_steps_per_second": 0.73,
"step": 51000
},
{
"epoch": 1.5549986412633232,
"grad_norm": 2.469269275665283,
"learning_rate": 2.754211545295838e-05,
"loss": 2.4311,
"step": 51500
},
{
"epoch": 1.5549986412633232,
"eval_cosine_accuracy": 0.9654260873794556,
"eval_loss": 1.2095298767089844,
"eval_runtime": 26.0282,
"eval_samples_per_second": 363.375,
"eval_steps_per_second": 0.73,
"step": 51500
},
{
"epoch": 1.5700957154503459,
"grad_norm": 2.4655797481536865,
"learning_rate": 2.7410477746743892e-05,
"loss": 2.4308,
"step": 52000
},
{
"epoch": 1.5700957154503459,
"eval_cosine_accuracy": 0.9664834141731262,
"eval_loss": 1.1810933351516724,
"eval_runtime": 26.3782,
"eval_samples_per_second": 358.553,
"eval_steps_per_second": 0.72,
"step": 52000
},
{
"epoch": 1.5851927896373683,
"grad_norm": 2.4508461952209473,
"learning_rate": 2.7275738242420885e-05,
"loss": 2.4266,
"step": 52500
},
{
"epoch": 1.5851927896373683,
"eval_cosine_accuracy": 0.9650031924247742,
"eval_loss": 1.1934112310409546,
"eval_runtime": 26.3979,
"eval_samples_per_second": 358.286,
"eval_steps_per_second": 0.72,
"step": 52500
},
{
"epoch": 1.6002898638243908,
"grad_norm": 2.5196404457092285,
"learning_rate": 2.713793061594835e-05,
"loss": 2.4273,
"step": 53000
},
{
"epoch": 1.6002898638243908,
"eval_cosine_accuracy": 0.9658490419387817,
"eval_loss": 1.1869851350784302,
"eval_runtime": 26.2175,
"eval_samples_per_second": 360.751,
"eval_steps_per_second": 0.725,
"step": 53000
},
{
"epoch": 1.6153869380114134,
"grad_norm": 2.488192081451416,
"learning_rate": 2.699737399706483e-05,
"loss": 2.4187,
"step": 53500
},
{
"epoch": 1.6153869380114134,
"eval_cosine_accuracy": 0.9654260873794556,
"eval_loss": 1.191279649734497,
"eval_runtime": 25.5662,
"eval_samples_per_second": 369.941,
"eval_steps_per_second": 0.743,
"step": 53500
},
{
"epoch": 1.630484012198436,
"grad_norm": 2.568108558654785,
"learning_rate": 2.6853540174198254e-05,
"loss": 2.4229,
"step": 54000
},
{
"epoch": 1.630484012198436,
"eval_cosine_accuracy": 0.9655318260192871,
"eval_loss": 1.198256254196167,
"eval_runtime": 25.1113,
"eval_samples_per_second": 376.643,
"eval_steps_per_second": 0.757,
"step": 54000
},
{
"epoch": 1.6455810863854585,
"grad_norm": 2.6221678256988525,
"learning_rate": 2.670704027601641e-05,
"loss": 2.4225,
"step": 54500
},
{
"epoch": 1.6455810863854585,
"eval_cosine_accuracy": 0.9662719368934631,
"eval_loss": 1.1917139291763306,
"eval_runtime": 25.0122,
"eval_samples_per_second": 378.135,
"eval_steps_per_second": 0.76,
"step": 54500
},
{
"epoch": 1.660678160572481,
"grad_norm": 2.439676284790039,
"learning_rate": 2.6557323756112153e-05,
"loss": 2.4171,
"step": 55000
},
{
"epoch": 1.660678160572481,
"eval_cosine_accuracy": 0.965320348739624,
"eval_loss": 1.203063726425171,
"eval_runtime": 24.9951,
"eval_samples_per_second": 378.394,
"eval_steps_per_second": 0.76,
"step": 55000
},
{
"epoch": 1.6757752347595036,
"grad_norm": 2.397371530532837,
"learning_rate": 2.6404718670135114e-05,
"loss": 2.4091,
"step": 55500
},
{
"epoch": 1.6757752347595036,
"eval_cosine_accuracy": 0.9660604596138,
"eval_loss": 1.1928857564926147,
"eval_runtime": 24.9359,
"eval_samples_per_second": 379.292,
"eval_steps_per_second": 0.762,
"step": 55500
},
{
"epoch": 1.6908723089465263,
"grad_norm": 2.433346748352051,
"learning_rate": 2.6249263159257348e-05,
"loss": 2.4143,
"step": 56000
},
{
"epoch": 1.6908723089465263,
"eval_cosine_accuracy": 0.9655318260192871,
"eval_loss": 1.1848769187927246,
"eval_runtime": 25.0231,
"eval_samples_per_second": 377.971,
"eval_steps_per_second": 0.759,
"step": 56000
},
{
"epoch": 1.7059693831335487,
"grad_norm": 2.4731833934783936,
"learning_rate": 2.6090996077068493e-05,
"loss": 2.4049,
"step": 56500
},
{
"epoch": 1.7059693831335487,
"eval_cosine_accuracy": 0.9664834141731262,
"eval_loss": 1.168144941329956,
"eval_runtime": 25.7021,
"eval_samples_per_second": 367.985,
"eval_steps_per_second": 0.739,
"step": 56500
},
{
"epoch": 1.7210664573205712,
"grad_norm": 2.548151969909668,
"learning_rate": 2.592995697986495e-05,
"loss": 2.4052,
"step": 57000
},
{
"epoch": 1.7210664573205712,
"eval_cosine_accuracy": 0.9677521586418152,
"eval_loss": 1.1782138347625732,
"eval_runtime": 25.2357,
"eval_samples_per_second": 374.787,
"eval_steps_per_second": 0.753,
"step": 57000
},
{
"epoch": 1.7361635315075938,
"grad_norm": 2.4638047218322754,
"learning_rate": 2.5766186116763416e-05,
"loss": 2.4113,
"step": 57500
},
{
"epoch": 1.7361635315075938,
"eval_cosine_accuracy": 0.9657433032989502,
"eval_loss": 1.1835379600524902,
"eval_runtime": 25.2634,
"eval_samples_per_second": 374.376,
"eval_steps_per_second": 0.752,
"step": 57500
},
{
"epoch": 1.7512606056946165,
"grad_norm": 2.5692343711853027,
"learning_rate": 2.559972441964127e-05,
"loss": 2.4055,
"step": 58000
},
{
"epoch": 1.7512606056946165,
"eval_cosine_accuracy": 0.9666948318481445,
"eval_loss": 1.1879708766937256,
"eval_runtime": 25.2328,
"eval_samples_per_second": 374.83,
"eval_steps_per_second": 0.753,
"step": 58000
},
{
"epoch": 1.766357679881639,
"grad_norm": 2.5655770301818848,
"learning_rate": 2.543095433070693e-05,
"loss": 2.3996,
"step": 58500
},
{
"epoch": 1.766357679881639,
"eval_cosine_accuracy": 0.9665891528129578,
"eval_loss": 1.2095972299575806,
"eval_runtime": 24.9713,
"eval_samples_per_second": 378.755,
"eval_steps_per_second": 0.761,
"step": 58500
},
{
"epoch": 1.7814547540686614,
"grad_norm": 2.4523584842681885,
"learning_rate": 2.525924161220836e-05,
"loss": 2.397,
"step": 59000
},
{
"epoch": 1.7814547540686614,
"eval_cosine_accuracy": 0.9679636359214783,
"eval_loss": 1.1942859888076782,
"eval_runtime": 25.0212,
"eval_samples_per_second": 378.0,
"eval_steps_per_second": 0.759,
"step": 59000
},
{
"epoch": 1.796551828255684,
"grad_norm": 2.3042044639587402,
"learning_rate": 2.508496476226562e-05,
"loss": 2.3954,
"step": 59500
},
{
"epoch": 1.796551828255684,
"eval_cosine_accuracy": 0.9660604596138,
"eval_loss": 1.1957042217254639,
"eval_runtime": 24.9958,
"eval_samples_per_second": 378.383,
"eval_steps_per_second": 0.76,
"step": 59500
},
{
"epoch": 1.8116489024427067,
"grad_norm": 2.443878412246704,
"learning_rate": 2.4908167338557513e-05,
"loss": 2.3955,
"step": 60000
},
{
"epoch": 1.8116489024427067,
"eval_cosine_accuracy": 0.9656375646591187,
"eval_loss": 1.1924560070037842,
"eval_runtime": 25.298,
"eval_samples_per_second": 373.864,
"eval_steps_per_second": 0.751,
"step": 60000
},
{
"epoch": 1.8267459766297292,
"grad_norm": 2.3578927516937256,
"learning_rate": 2.4728893528739477e-05,
"loss": 2.3901,
"step": 60500
},
{
"epoch": 1.8267459766297292,
"eval_cosine_accuracy": 0.9663776755332947,
"eval_loss": 1.1898130178451538,
"eval_runtime": 25.5116,
"eval_samples_per_second": 370.734,
"eval_steps_per_second": 0.745,
"step": 60500
},
{
"epoch": 1.8418430508167516,
"grad_norm": 2.4540822505950928,
"learning_rate": 2.4547553946809237e-05,
"loss": 2.3966,
"step": 61000
},
{
"epoch": 1.8418430508167516,
"eval_cosine_accuracy": 0.9666948318481445,
"eval_loss": 1.205179214477539,
"eval_runtime": 25.3455,
"eval_samples_per_second": 373.162,
"eval_steps_per_second": 0.75,
"step": 61000
},
{
"epoch": 1.8569401250037743,
"grad_norm": 2.4436283111572266,
"learning_rate": 2.4363467118878128e-05,
"loss": 2.3934,
"step": 61500
},
{
"epoch": 1.8569401250037743,
"eval_cosine_accuracy": 0.9668005704879761,
"eval_loss": 1.1965515613555908,
"eval_runtime": 25.134,
"eval_samples_per_second": 376.303,
"eval_steps_per_second": 0.756,
"step": 61500
},
{
"epoch": 1.872037199190797,
"grad_norm": 2.504840612411499,
"learning_rate": 2.4177040043843623e-05,
"loss": 2.39,
"step": 62000
},
{
"epoch": 1.872037199190797,
"eval_cosine_accuracy": 0.9674350023269653,
"eval_loss": 1.1888070106506348,
"eval_runtime": 25.0974,
"eval_samples_per_second": 376.852,
"eval_steps_per_second": 0.757,
"step": 62000
},
{
"epoch": 1.8871342733778194,
"grad_norm": 2.423356771469116,
"learning_rate": 2.39883193161367e-05,
"loss": 2.3842,
"step": 62500
},
{
"epoch": 1.8871342733778194,
"eval_cosine_accuracy": 0.9675406813621521,
"eval_loss": 1.1591609716415405,
"eval_runtime": 25.3547,
"eval_samples_per_second": 373.027,
"eval_steps_per_second": 0.749,
"step": 62500
},
{
"epoch": 1.9022313475648418,
"grad_norm": null,
"learning_rate": 2.3797352103449708e-05,
"loss": 2.3834,
"step": 63000
},
{
"epoch": 1.9022313475648418,
"eval_cosine_accuracy": 0.9675406813621521,
"eval_loss": 1.157774567604065,
"eval_runtime": 25.3953,
"eval_samples_per_second": 372.432,
"eval_steps_per_second": 0.748,
"step": 63000
},
{
"epoch": 1.9173284217518645,
"grad_norm": 2.5663857460021973,
"learning_rate": 2.360457462924386e-05,
"loss": 2.3848,
"step": 63500
},
{
"epoch": 1.9173284217518645,
"eval_cosine_accuracy": 0.9673292636871338,
"eval_loss": 1.1656157970428467,
"eval_runtime": 25.2211,
"eval_samples_per_second": 375.004,
"eval_steps_per_second": 0.753,
"step": 63500
},
{
"epoch": 1.9324254959388871,
"grad_norm": 2.4691641330718994,
"learning_rate": 2.3409262436047722e-05,
"loss": 2.3845,
"step": 64000
},
{
"epoch": 1.9324254959388871,
"eval_cosine_accuracy": 0.9679636359214783,
"eval_loss": 1.1666511297225952,
"eval_runtime": 25.1314,
"eval_samples_per_second": 376.341,
"eval_steps_per_second": 0.756,
"step": 64000
},
{
"epoch": 1.9475225701259096,
"grad_norm": 2.5696003437042236,
"learning_rate": 2.321184848376942e-05,
"loss": 2.3858,
"step": 64500
},
{
"epoch": 1.9475225701259096,
"eval_cosine_accuracy": 0.9668005704879761,
"eval_loss": 1.1749593019485474,
"eval_runtime": 25.1663,
"eval_samples_per_second": 375.82,
"eval_steps_per_second": 0.755,
"step": 64500
},
{
"epoch": 1.962619644312932,
"grad_norm": 2.4359042644500732,
"learning_rate": 2.3012382112832128e-05,
"loss": 2.3825,
"step": 65000
},
{
"epoch": 1.962619644312932,
"eval_cosine_accuracy": 0.9670120477676392,
"eval_loss": 1.177405595779419,
"eval_runtime": 25.6293,
"eval_samples_per_second": 369.031,
"eval_steps_per_second": 0.741,
"step": 65000
},
{
"epoch": 1.9777167184999547,
"grad_norm": 2.386305093765259,
"learning_rate": 2.281131807967385e-05,
"loss": 2.6758,
"step": 65500
},
{
"epoch": 1.9777167184999547,
"eval_cosine_accuracy": 0.9617255330085754,
"eval_loss": 1.2431585788726807,
"eval_runtime": 25.5435,
"eval_samples_per_second": 370.27,
"eval_steps_per_second": 0.744,
"step": 65500
},
{
"epoch": 1.9928137926869773,
"grad_norm": 4.19161319732666,
"learning_rate": 2.2607900785946504e-05,
"loss": 3.0304,
"step": 66000
},
{
"epoch": 1.9928137926869773,
"eval_cosine_accuracy": 0.9528441429138184,
"eval_loss": 1.3187161684036255,
"eval_runtime": 26.0183,
"eval_samples_per_second": 363.514,
"eval_steps_per_second": 0.73,
"step": 66000
}
],
"logging_steps": 500,
"max_steps": 132476,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}