{ "best_metric": 0.3867943286895752, "best_model_checkpoint": "./model_fine-tune/glot/mbert/npi-Deva/checkpoint-99000", "epoch": 22.307345651194233, "eval_steps": 500, "global_step": 99000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11266336187471834, "grad_norm": 2.965751886367798, "learning_rate": 9.95e-05, "loss": 1.2301, "step": 500 }, { "epoch": 0.11266336187471834, "eval_accuracy": 0.773184921040326, "eval_loss": 1.0424206256866455, "eval_runtime": 220.277, "eval_samples_per_second": 125.542, "eval_steps_per_second": 3.927, "step": 500 }, { "epoch": 0.22532672374943669, "grad_norm": 2.740410327911377, "learning_rate": 9.900000000000001e-05, "loss": 1.0543, "step": 1000 }, { "epoch": 0.22532672374943669, "eval_accuracy": 0.7909380298838988, "eval_loss": 0.9471855163574219, "eval_runtime": 220.5287, "eval_samples_per_second": 125.399, "eval_steps_per_second": 3.922, "step": 1000 }, { "epoch": 0.337990085624155, "grad_norm": 2.7863168716430664, "learning_rate": 9.850000000000001e-05, "loss": 0.9779, "step": 1500 }, { "epoch": 0.337990085624155, "eval_accuracy": 0.8025380313108481, "eval_loss": 0.8843335509300232, "eval_runtime": 220.1694, "eval_samples_per_second": 125.603, "eval_steps_per_second": 3.929, "step": 1500 }, { "epoch": 0.45065344749887337, "grad_norm": 2.5414557456970215, "learning_rate": 9.8e-05, "loss": 0.926, "step": 2000 }, { "epoch": 0.45065344749887337, "eval_accuracy": 0.8101656739468518, "eval_loss": 0.8543083667755127, "eval_runtime": 220.7797, "eval_samples_per_second": 125.256, "eval_steps_per_second": 3.918, "step": 2000 }, { "epoch": 0.5633168093735917, "grad_norm": 2.377737283706665, "learning_rate": 9.75e-05, "loss": 0.8865, "step": 2500 }, { "epoch": 0.5633168093735917, "eval_accuracy": 0.8173040702951884, "eval_loss": 0.810703456401825, "eval_runtime": 220.8366, "eval_samples_per_second": 125.224, "eval_steps_per_second": 3.917, "step": 2500 }, { "epoch": 0.67598017124831, "grad_norm": 2.3488988876342773, "learning_rate": 9.7e-05, "loss": 0.8609, "step": 3000 }, { "epoch": 0.67598017124831, "eval_accuracy": 0.821973743458109, "eval_loss": 0.7787520289421082, "eval_runtime": 221.2787, "eval_samples_per_second": 124.974, "eval_steps_per_second": 3.909, "step": 3000 }, { "epoch": 0.7886435331230284, "grad_norm": 2.1220295429229736, "learning_rate": 9.65e-05, "loss": 0.8342, "step": 3500 }, { "epoch": 0.7886435331230284, "eval_accuracy": 0.8262216641689282, "eval_loss": 0.762144923210144, "eval_runtime": 220.8289, "eval_samples_per_second": 125.228, "eval_steps_per_second": 3.917, "step": 3500 }, { "epoch": 0.9013068949977467, "grad_norm": 2.0968008041381836, "learning_rate": 9.6e-05, "loss": 0.819, "step": 4000 }, { "epoch": 0.9013068949977467, "eval_accuracy": 0.8298798163116309, "eval_loss": 0.7466955184936523, "eval_runtime": 220.013, "eval_samples_per_second": 125.693, "eval_steps_per_second": 3.932, "step": 4000 }, { "epoch": 1.0139702568724651, "grad_norm": 2.1498773097991943, "learning_rate": 9.55e-05, "loss": 0.7979, "step": 4500 }, { "epoch": 1.0139702568724651, "eval_accuracy": 0.8318434647099575, "eval_loss": 0.7348815202713013, "eval_runtime": 220.8229, "eval_samples_per_second": 125.232, "eval_steps_per_second": 3.917, "step": 4500 }, { "epoch": 1.1266336187471835, "grad_norm": 2.163381576538086, "learning_rate": 9.5e-05, "loss": 0.7814, "step": 5000 }, { "epoch": 1.1266336187471835, "eval_accuracy": 0.8349864178467281, "eval_loss": 0.7180664539337158, "eval_runtime": 219.7442, "eval_samples_per_second": 125.846, "eval_steps_per_second": 3.936, "step": 5000 }, { "epoch": 1.2392969806219019, "grad_norm": 2.431119680404663, "learning_rate": 9.449999999999999e-05, "loss": 0.7665, "step": 5500 }, { "epoch": 1.2392969806219019, "eval_accuracy": 0.8369151584278837, "eval_loss": 0.7159287333488464, "eval_runtime": 220.9495, "eval_samples_per_second": 125.16, "eval_steps_per_second": 3.915, "step": 5500 }, { "epoch": 1.35196034249662, "grad_norm": 2.2182135581970215, "learning_rate": 9.4e-05, "loss": 0.7555, "step": 6000 }, { "epoch": 1.35196034249662, "eval_accuracy": 0.8390386817390197, "eval_loss": 0.6999027132987976, "eval_runtime": 221.2257, "eval_samples_per_second": 125.004, "eval_steps_per_second": 3.91, "step": 6000 }, { "epoch": 1.4646237043713384, "grad_norm": 2.1569323539733887, "learning_rate": 9.350000000000001e-05, "loss": 0.7479, "step": 6500 }, { "epoch": 1.4646237043713384, "eval_accuracy": 0.8422681179825285, "eval_loss": 0.689199686050415, "eval_runtime": 222.1646, "eval_samples_per_second": 124.475, "eval_steps_per_second": 3.894, "step": 6500 }, { "epoch": 1.5772870662460567, "grad_norm": 2.1323976516723633, "learning_rate": 9.300000000000001e-05, "loss": 0.733, "step": 7000 }, { "epoch": 1.5772870662460567, "eval_accuracy": 0.8439326355101854, "eval_loss": 0.6770957112312317, "eval_runtime": 220.0047, "eval_samples_per_second": 125.697, "eval_steps_per_second": 3.932, "step": 7000 }, { "epoch": 1.6899504281207751, "grad_norm": 5.134857177734375, "learning_rate": 9.250000000000001e-05, "loss": 0.7254, "step": 7500 }, { "epoch": 1.6899504281207751, "eval_accuracy": 0.8453398865939418, "eval_loss": 0.6668263077735901, "eval_runtime": 220.3226, "eval_samples_per_second": 125.516, "eval_steps_per_second": 3.926, "step": 7500 }, { "epoch": 1.8026137899954935, "grad_norm": 2.0616443157196045, "learning_rate": 9.200000000000001e-05, "loss": 0.7155, "step": 8000 }, { "epoch": 1.8026137899954935, "eval_accuracy": 0.8463415002486427, "eval_loss": 0.6613638997077942, "eval_runtime": 221.4149, "eval_samples_per_second": 124.897, "eval_steps_per_second": 3.907, "step": 8000 }, { "epoch": 1.9152771518702119, "grad_norm": 1.8310041427612305, "learning_rate": 9.15e-05, "loss": 0.7057, "step": 8500 }, { "epoch": 1.9152771518702119, "eval_accuracy": 0.8482940895875467, "eval_loss": 0.6528915762901306, "eval_runtime": 222.056, "eval_samples_per_second": 124.536, "eval_steps_per_second": 3.895, "step": 8500 }, { "epoch": 2.0279405137449302, "grad_norm": 1.93686842918396, "learning_rate": 9.1e-05, "loss": 0.7005, "step": 9000 }, { "epoch": 2.0279405137449302, "eval_accuracy": 0.849507663539711, "eval_loss": 0.6522949934005737, "eval_runtime": 221.8006, "eval_samples_per_second": 124.68, "eval_steps_per_second": 3.9, "step": 9000 }, { "epoch": 2.1406038756196484, "grad_norm": 6.06415319442749, "learning_rate": 9.05e-05, "loss": 0.6884, "step": 9500 }, { "epoch": 2.1406038756196484, "eval_accuracy": 0.8502759566677828, "eval_loss": 0.6491975784301758, "eval_runtime": 221.9876, "eval_samples_per_second": 124.575, "eval_steps_per_second": 3.897, "step": 9500 }, { "epoch": 2.253267237494367, "grad_norm": 1.9235719442367554, "learning_rate": 9e-05, "loss": 0.6821, "step": 10000 }, { "epoch": 2.253267237494367, "eval_accuracy": 0.8524909295282275, "eval_loss": 0.6343050599098206, "eval_runtime": 222.2817, "eval_samples_per_second": 124.41, "eval_steps_per_second": 3.891, "step": 10000 }, { "epoch": 2.365930599369085, "grad_norm": 1.8421759605407715, "learning_rate": 8.950000000000001e-05, "loss": 0.6767, "step": 10500 }, { "epoch": 2.365930599369085, "eval_accuracy": 0.8534436334949597, "eval_loss": 0.623904287815094, "eval_runtime": 221.9784, "eval_samples_per_second": 124.58, "eval_steps_per_second": 3.897, "step": 10500 }, { "epoch": 2.4785939612438037, "grad_norm": 1.9507330656051636, "learning_rate": 8.900000000000001e-05, "loss": 0.6792, "step": 11000 }, { "epoch": 2.4785939612438037, "eval_accuracy": 0.8552298873542783, "eval_loss": 0.6220438480377197, "eval_runtime": 221.6861, "eval_samples_per_second": 124.744, "eval_steps_per_second": 3.902, "step": 11000 }, { "epoch": 2.591257323118522, "grad_norm": 2.11086106300354, "learning_rate": 8.850000000000001e-05, "loss": 0.668, "step": 11500 }, { "epoch": 2.591257323118522, "eval_accuracy": 0.8557371543230742, "eval_loss": 0.6222216486930847, "eval_runtime": 221.2659, "eval_samples_per_second": 124.981, "eval_steps_per_second": 3.909, "step": 11500 }, { "epoch": 2.70392068499324, "grad_norm": 2.1847715377807617, "learning_rate": 8.800000000000001e-05, "loss": 0.6636, "step": 12000 }, { "epoch": 2.70392068499324, "eval_accuracy": 0.8559949812795903, "eval_loss": 0.6197636127471924, "eval_runtime": 222.3204, "eval_samples_per_second": 124.388, "eval_steps_per_second": 3.891, "step": 12000 }, { "epoch": 2.8165840468679586, "grad_norm": 2.1351499557495117, "learning_rate": 8.75e-05, "loss": 0.6576, "step": 12500 }, { "epoch": 2.8165840468679586, "eval_accuracy": 0.8577423671742627, "eval_loss": 0.6103814840316772, "eval_runtime": 222.1201, "eval_samples_per_second": 124.5, "eval_steps_per_second": 3.894, "step": 12500 }, { "epoch": 2.9292474087426768, "grad_norm": 3.9510111808776855, "learning_rate": 8.7e-05, "loss": 0.6488, "step": 13000 }, { "epoch": 2.9292474087426768, "eval_accuracy": 0.858561338833408, "eval_loss": 0.6049174070358276, "eval_runtime": 221.9998, "eval_samples_per_second": 124.568, "eval_steps_per_second": 3.896, "step": 13000 }, { "epoch": 3.0419107706173953, "grad_norm": 1.8234397172927856, "learning_rate": 8.65e-05, "loss": 0.6438, "step": 13500 }, { "epoch": 3.0419107706173953, "eval_accuracy": 0.8591052959636053, "eval_loss": 0.6051846742630005, "eval_runtime": 220.9713, "eval_samples_per_second": 125.147, "eval_steps_per_second": 3.915, "step": 13500 }, { "epoch": 3.1545741324921135, "grad_norm": 1.9275134801864624, "learning_rate": 8.6e-05, "loss": 0.6369, "step": 14000 }, { "epoch": 3.1545741324921135, "eval_accuracy": 0.8599297280864646, "eval_loss": 0.6021236181259155, "eval_runtime": 222.1682, "eval_samples_per_second": 124.473, "eval_steps_per_second": 3.893, "step": 14000 }, { "epoch": 3.267237494366832, "grad_norm": 2.4342575073242188, "learning_rate": 8.55e-05, "loss": 0.6375, "step": 14500 }, { "epoch": 3.267237494366832, "eval_accuracy": 0.8612232782302242, "eval_loss": 0.5935059785842896, "eval_runtime": 221.4028, "eval_samples_per_second": 124.904, "eval_steps_per_second": 3.907, "step": 14500 }, { "epoch": 3.3799008562415502, "grad_norm": 1.8208547830581665, "learning_rate": 8.5e-05, "loss": 0.6327, "step": 15000 }, { "epoch": 3.3799008562415502, "eval_accuracy": 0.8619705169680766, "eval_loss": 0.5865727663040161, "eval_runtime": 221.4519, "eval_samples_per_second": 124.876, "eval_steps_per_second": 3.906, "step": 15000 }, { "epoch": 3.492564218116269, "grad_norm": 1.8497122526168823, "learning_rate": 8.450000000000001e-05, "loss": 0.6289, "step": 15500 }, { "epoch": 3.492564218116269, "eval_accuracy": 0.8624703434485368, "eval_loss": 0.5854940414428711, "eval_runtime": 222.074, "eval_samples_per_second": 124.526, "eval_steps_per_second": 3.895, "step": 15500 }, { "epoch": 3.605227579990987, "grad_norm": 1.7389825582504272, "learning_rate": 8.4e-05, "loss": 0.6231, "step": 16000 }, { "epoch": 3.605227579990987, "eval_accuracy": 0.8635307164001665, "eval_loss": 0.5809486508369446, "eval_runtime": 222.3436, "eval_samples_per_second": 124.375, "eval_steps_per_second": 3.89, "step": 16000 }, { "epoch": 3.717890941865705, "grad_norm": 1.7109190225601196, "learning_rate": 8.35e-05, "loss": 0.6193, "step": 16500 }, { "epoch": 3.717890941865705, "eval_accuracy": 0.8642588913962003, "eval_loss": 0.5757493376731873, "eval_runtime": 220.862, "eval_samples_per_second": 125.209, "eval_steps_per_second": 3.916, "step": 16500 }, { "epoch": 3.8305543037404237, "grad_norm": 2.09114408493042, "learning_rate": 8.3e-05, "loss": 0.619, "step": 17000 }, { "epoch": 3.8305543037404237, "eval_accuracy": 0.8644031427528578, "eval_loss": 0.5797725319862366, "eval_runtime": 220.9835, "eval_samples_per_second": 125.141, "eval_steps_per_second": 3.914, "step": 17000 }, { "epoch": 3.943217665615142, "grad_norm": 6.745112419128418, "learning_rate": 8.25e-05, "loss": 0.6127, "step": 17500 }, { "epoch": 3.943217665615142, "eval_accuracy": 0.8645245282957764, "eval_loss": 0.5759025812149048, "eval_runtime": 222.2291, "eval_samples_per_second": 124.439, "eval_steps_per_second": 3.892, "step": 17500 }, { "epoch": 4.0558810274898605, "grad_norm": 1.7710591554641724, "learning_rate": 8.2e-05, "loss": 0.6081, "step": 18000 }, { "epoch": 4.0558810274898605, "eval_accuracy": 0.8658915432042757, "eval_loss": 0.5714759230613708, "eval_runtime": 221.6135, "eval_samples_per_second": 124.785, "eval_steps_per_second": 3.903, "step": 18000 }, { "epoch": 4.168544389364579, "grad_norm": 1.8267593383789062, "learning_rate": 8.15e-05, "loss": 0.5988, "step": 18500 }, { "epoch": 4.168544389364579, "eval_accuracy": 0.8665697779685045, "eval_loss": 0.5671255588531494, "eval_runtime": 221.0373, "eval_samples_per_second": 125.11, "eval_steps_per_second": 3.913, "step": 18500 }, { "epoch": 4.281207751239297, "grad_norm": 1.6686463356018066, "learning_rate": 8.1e-05, "loss": 0.5981, "step": 19000 }, { "epoch": 4.281207751239297, "eval_accuracy": 0.8667210799508446, "eval_loss": 0.5654014348983765, "eval_runtime": 221.1716, "eval_samples_per_second": 125.034, "eval_steps_per_second": 3.911, "step": 19000 }, { "epoch": 4.393871113114015, "grad_norm": 1.6965349912643433, "learning_rate": 8.05e-05, "loss": 0.599, "step": 19500 }, { "epoch": 4.393871113114015, "eval_accuracy": 0.8677269725072129, "eval_loss": 0.5655470490455627, "eval_runtime": 221.4343, "eval_samples_per_second": 124.886, "eval_steps_per_second": 3.906, "step": 19500 }, { "epoch": 4.506534474988734, "grad_norm": 1.653952956199646, "learning_rate": 8e-05, "loss": 0.5976, "step": 20000 }, { "epoch": 4.506534474988734, "eval_accuracy": 0.8685987876288259, "eval_loss": 0.5560412406921387, "eval_runtime": 220.5715, "eval_samples_per_second": 125.374, "eval_steps_per_second": 3.922, "step": 20000 }, { "epoch": 4.619197836863452, "grad_norm": 1.7568910121917725, "learning_rate": 7.950000000000001e-05, "loss": 0.5941, "step": 20500 }, { "epoch": 4.619197836863452, "eval_accuracy": 0.868412802308659, "eval_loss": 0.5624808669090271, "eval_runtime": 220.7945, "eval_samples_per_second": 125.248, "eval_steps_per_second": 3.918, "step": 20500 }, { "epoch": 4.73186119873817, "grad_norm": 1.7545663118362427, "learning_rate": 7.900000000000001e-05, "loss": 0.5871, "step": 21000 }, { "epoch": 4.73186119873817, "eval_accuracy": 0.8700149406874658, "eval_loss": 0.5546574592590332, "eval_runtime": 220.2428, "eval_samples_per_second": 125.561, "eval_steps_per_second": 3.927, "step": 21000 }, { "epoch": 4.844524560612888, "grad_norm": 1.9459997415542603, "learning_rate": 7.850000000000001e-05, "loss": 0.5891, "step": 21500 }, { "epoch": 4.844524560612888, "eval_accuracy": 0.8703311716376315, "eval_loss": 0.5456222295761108, "eval_runtime": 220.5867, "eval_samples_per_second": 125.366, "eval_steps_per_second": 3.921, "step": 21500 }, { "epoch": 4.957187922487607, "grad_norm": 1.9034132957458496, "learning_rate": 7.800000000000001e-05, "loss": 0.5828, "step": 22000 }, { "epoch": 4.957187922487607, "eval_accuracy": 0.8704027728365514, "eval_loss": 0.549776554107666, "eval_runtime": 221.4908, "eval_samples_per_second": 124.854, "eval_steps_per_second": 3.905, "step": 22000 }, { "epoch": 5.069851284362326, "grad_norm": 1.881596565246582, "learning_rate": 7.75e-05, "loss": 0.5767, "step": 22500 }, { "epoch": 5.069851284362326, "eval_accuracy": 0.8711589106147363, "eval_loss": 0.5461272597312927, "eval_runtime": 220.457, "eval_samples_per_second": 125.439, "eval_steps_per_second": 3.924, "step": 22500 }, { "epoch": 5.182514646237044, "grad_norm": 1.9157260656356812, "learning_rate": 7.7e-05, "loss": 0.5731, "step": 23000 }, { "epoch": 5.182514646237044, "eval_accuracy": 0.871975417070376, "eval_loss": 0.5400785207748413, "eval_runtime": 220.8692, "eval_samples_per_second": 125.205, "eval_steps_per_second": 3.916, "step": 23000 }, { "epoch": 5.295178008111762, "grad_norm": 1.9823201894760132, "learning_rate": 7.65e-05, "loss": 0.5736, "step": 23500 }, { "epoch": 5.295178008111762, "eval_accuracy": 0.8723751389743424, "eval_loss": 0.5401638746261597, "eval_runtime": 221.6042, "eval_samples_per_second": 124.79, "eval_steps_per_second": 3.903, "step": 23500 }, { "epoch": 5.40784136998648, "grad_norm": 1.905613660812378, "learning_rate": 7.6e-05, "loss": 0.5747, "step": 24000 }, { "epoch": 5.40784136998648, "eval_accuracy": 0.8724923660478054, "eval_loss": 0.5441656112670898, "eval_runtime": 221.3067, "eval_samples_per_second": 124.958, "eval_steps_per_second": 3.909, "step": 24000 }, { "epoch": 5.520504731861199, "grad_norm": 1.5278126001358032, "learning_rate": 7.55e-05, "loss": 0.5681, "step": 24500 }, { "epoch": 5.520504731861199, "eval_accuracy": 0.8728878650306285, "eval_loss": 0.538100004196167, "eval_runtime": 222.0369, "eval_samples_per_second": 124.547, "eval_steps_per_second": 3.896, "step": 24500 }, { "epoch": 5.633168093735917, "grad_norm": 1.6478660106658936, "learning_rate": 7.500000000000001e-05, "loss": 0.5658, "step": 25000 }, { "epoch": 5.633168093735917, "eval_accuracy": 0.8736624848239579, "eval_loss": 0.5357881784439087, "eval_runtime": 220.4147, "eval_samples_per_second": 125.463, "eval_steps_per_second": 3.924, "step": 25000 }, { "epoch": 5.745831455610635, "grad_norm": 3.0473523139953613, "learning_rate": 7.450000000000001e-05, "loss": 0.5644, "step": 25500 }, { "epoch": 5.745831455610635, "eval_accuracy": 0.8743903767129565, "eval_loss": 0.5344362854957581, "eval_runtime": 221.5481, "eval_samples_per_second": 124.822, "eval_steps_per_second": 3.904, "step": 25500 }, { "epoch": 5.8584948174853535, "grad_norm": 1.8053028583526611, "learning_rate": 7.4e-05, "loss": 0.5622, "step": 26000 }, { "epoch": 5.8584948174853535, "eval_accuracy": 0.874178054098396, "eval_loss": 0.5315510630607605, "eval_runtime": 221.4537, "eval_samples_per_second": 124.875, "eval_steps_per_second": 3.906, "step": 26000 }, { "epoch": 5.9711581793600725, "grad_norm": 1.5863131284713745, "learning_rate": 7.35e-05, "loss": 0.5578, "step": 26500 }, { "epoch": 5.9711581793600725, "eval_accuracy": 0.8753070050808498, "eval_loss": 0.5271232724189758, "eval_runtime": 221.5103, "eval_samples_per_second": 124.843, "eval_steps_per_second": 3.905, "step": 26500 }, { "epoch": 6.083821541234791, "grad_norm": 1.7924689054489136, "learning_rate": 7.3e-05, "loss": 0.5546, "step": 27000 }, { "epoch": 6.083821541234791, "eval_accuracy": 0.8749559789605048, "eval_loss": 0.5305372476577759, "eval_runtime": 220.6828, "eval_samples_per_second": 125.311, "eval_steps_per_second": 3.92, "step": 27000 }, { "epoch": 6.196484903109509, "grad_norm": 1.6176671981811523, "learning_rate": 7.25e-05, "loss": 0.5553, "step": 27500 }, { "epoch": 6.196484903109509, "eval_accuracy": 0.8752024294778373, "eval_loss": 0.5255776047706604, "eval_runtime": 220.8919, "eval_samples_per_second": 125.192, "eval_steps_per_second": 3.916, "step": 27500 }, { "epoch": 6.309148264984227, "grad_norm": 1.855047583580017, "learning_rate": 7.2e-05, "loss": 0.5506, "step": 28000 }, { "epoch": 6.309148264984227, "eval_accuracy": 0.8761331460452507, "eval_loss": 0.52358478307724, "eval_runtime": 220.7028, "eval_samples_per_second": 125.3, "eval_steps_per_second": 3.919, "step": 28000 }, { "epoch": 6.421811626858945, "grad_norm": 1.6553348302841187, "learning_rate": 7.15e-05, "loss": 0.5439, "step": 28500 }, { "epoch": 6.421811626858945, "eval_accuracy": 0.8768096662621753, "eval_loss": 0.5175614953041077, "eval_runtime": 221.3868, "eval_samples_per_second": 124.913, "eval_steps_per_second": 3.907, "step": 28500 }, { "epoch": 6.534474988733664, "grad_norm": 1.8099743127822876, "learning_rate": 7.1e-05, "loss": 0.5486, "step": 29000 }, { "epoch": 6.534474988733664, "eval_accuracy": 0.8767345488093528, "eval_loss": 0.5191013216972351, "eval_runtime": 221.0646, "eval_samples_per_second": 125.095, "eval_steps_per_second": 3.913, "step": 29000 }, { "epoch": 6.647138350608382, "grad_norm": 1.7723827362060547, "learning_rate": 7.05e-05, "loss": 0.5442, "step": 29500 }, { "epoch": 6.647138350608382, "eval_accuracy": 0.8777180592418201, "eval_loss": 0.5211535096168518, "eval_runtime": 222.034, "eval_samples_per_second": 124.548, "eval_steps_per_second": 3.896, "step": 29500 }, { "epoch": 6.7598017124831005, "grad_norm": 1.7134077548980713, "learning_rate": 7e-05, "loss": 0.5412, "step": 30000 }, { "epoch": 6.7598017124831005, "eval_accuracy": 0.8771853054768167, "eval_loss": 0.5161250829696655, "eval_runtime": 221.7362, "eval_samples_per_second": 124.716, "eval_steps_per_second": 3.901, "step": 30000 }, { "epoch": 6.872465074357819, "grad_norm": 1.7683045864105225, "learning_rate": 6.95e-05, "loss": 0.5402, "step": 30500 }, { "epoch": 6.872465074357819, "eval_accuracy": 0.8773839402820733, "eval_loss": 0.5139411687850952, "eval_runtime": 220.8209, "eval_samples_per_second": 125.233, "eval_steps_per_second": 3.917, "step": 30500 }, { "epoch": 6.985128436232538, "grad_norm": 1.8624660968780518, "learning_rate": 6.9e-05, "loss": 0.5395, "step": 31000 }, { "epoch": 6.985128436232538, "eval_accuracy": 0.8783624777319803, "eval_loss": 0.5147821307182312, "eval_runtime": 221.2924, "eval_samples_per_second": 124.966, "eval_steps_per_second": 3.909, "step": 31000 }, { "epoch": 7.097791798107256, "grad_norm": 1.6134588718414307, "learning_rate": 6.850000000000001e-05, "loss": 0.5323, "step": 31500 }, { "epoch": 7.097791798107256, "eval_accuracy": 0.8783227617479554, "eval_loss": 0.5111725330352783, "eval_runtime": 221.33, "eval_samples_per_second": 124.945, "eval_steps_per_second": 3.908, "step": 31500 }, { "epoch": 7.210455159981974, "grad_norm": 1.8190521001815796, "learning_rate": 6.800000000000001e-05, "loss": 0.5341, "step": 32000 }, { "epoch": 7.210455159981974, "eval_accuracy": 0.8789992011172492, "eval_loss": 0.5084385275840759, "eval_runtime": 221.6067, "eval_samples_per_second": 124.789, "eval_steps_per_second": 3.903, "step": 32000 }, { "epoch": 7.323118521856692, "grad_norm": 1.7497199773788452, "learning_rate": 6.750000000000001e-05, "loss": 0.5325, "step": 32500 }, { "epoch": 7.323118521856692, "eval_accuracy": 0.8800775404890228, "eval_loss": 0.5039363503456116, "eval_runtime": 222.1191, "eval_samples_per_second": 124.501, "eval_steps_per_second": 3.894, "step": 32500 }, { "epoch": 7.43578188373141, "grad_norm": 1.6325268745422363, "learning_rate": 6.7e-05, "loss": 0.5309, "step": 33000 }, { "epoch": 7.43578188373141, "eval_accuracy": 0.8801297464169966, "eval_loss": 0.505262017250061, "eval_runtime": 221.5433, "eval_samples_per_second": 124.824, "eval_steps_per_second": 3.904, "step": 33000 }, { "epoch": 7.548445245606128, "grad_norm": 1.7531828880310059, "learning_rate": 6.65e-05, "loss": 0.5283, "step": 33500 }, { "epoch": 7.548445245606128, "eval_accuracy": 0.8802065110814512, "eval_loss": 0.5030723810195923, "eval_runtime": 221.3827, "eval_samples_per_second": 124.915, "eval_steps_per_second": 3.907, "step": 33500 }, { "epoch": 7.661108607480847, "grad_norm": 1.7174723148345947, "learning_rate": 6.6e-05, "loss": 0.5254, "step": 34000 }, { "epoch": 7.661108607480847, "eval_accuracy": 0.8808179417817528, "eval_loss": 0.5008535385131836, "eval_runtime": 220.9595, "eval_samples_per_second": 125.154, "eval_steps_per_second": 3.915, "step": 34000 }, { "epoch": 7.773771969355566, "grad_norm": 1.4874796867370605, "learning_rate": 6.55e-05, "loss": 0.5217, "step": 34500 }, { "epoch": 7.773771969355566, "eval_accuracy": 0.8805426353661477, "eval_loss": 0.5023674368858337, "eval_runtime": 221.8005, "eval_samples_per_second": 124.68, "eval_steps_per_second": 3.9, "step": 34500 }, { "epoch": 7.886435331230284, "grad_norm": 1.721684455871582, "learning_rate": 6.500000000000001e-05, "loss": 0.5268, "step": 35000 }, { "epoch": 7.886435331230284, "eval_accuracy": 0.8809166125769631, "eval_loss": 0.49913424253463745, "eval_runtime": 221.9778, "eval_samples_per_second": 124.58, "eval_steps_per_second": 3.897, "step": 35000 }, { "epoch": 7.999098693105002, "grad_norm": 1.6311215162277222, "learning_rate": 6.450000000000001e-05, "loss": 0.5279, "step": 35500 }, { "epoch": 7.999098693105002, "eval_accuracy": 0.8812218745311298, "eval_loss": 0.49702906608581543, "eval_runtime": 221.8919, "eval_samples_per_second": 124.628, "eval_steps_per_second": 3.898, "step": 35500 }, { "epoch": 8.111762054979721, "grad_norm": 1.843680739402771, "learning_rate": 6.400000000000001e-05, "loss": 0.5113, "step": 36000 }, { "epoch": 8.111762054979721, "eval_accuracy": 0.8819420345077154, "eval_loss": 0.4924590289592743, "eval_runtime": 220.9686, "eval_samples_per_second": 125.149, "eval_steps_per_second": 3.915, "step": 36000 }, { "epoch": 8.224425416854439, "grad_norm": 2.186274290084839, "learning_rate": 6.35e-05, "loss": 0.519, "step": 36500 }, { "epoch": 8.224425416854439, "eval_accuracy": 0.8821893135016634, "eval_loss": 0.49259641766548157, "eval_runtime": 221.6028, "eval_samples_per_second": 124.791, "eval_steps_per_second": 3.903, "step": 36500 }, { "epoch": 8.337088778729157, "grad_norm": 1.6842992305755615, "learning_rate": 6.3e-05, "loss": 0.5142, "step": 37000 }, { "epoch": 8.337088778729157, "eval_accuracy": 0.8831766874688345, "eval_loss": 0.49090540409088135, "eval_runtime": 221.5262, "eval_samples_per_second": 124.834, "eval_steps_per_second": 3.905, "step": 37000 }, { "epoch": 8.449752140603875, "grad_norm": 1.69620943069458, "learning_rate": 6.25e-05, "loss": 0.5118, "step": 37500 }, { "epoch": 8.449752140603875, "eval_accuracy": 0.8829115546267233, "eval_loss": 0.48741188645362854, "eval_runtime": 220.7417, "eval_samples_per_second": 125.278, "eval_steps_per_second": 3.919, "step": 37500 }, { "epoch": 8.562415502478594, "grad_norm": 1.7428566217422485, "learning_rate": 6.2e-05, "loss": 0.513, "step": 38000 }, { "epoch": 8.562415502478594, "eval_accuracy": 0.883224147742304, "eval_loss": 0.4888308644294739, "eval_runtime": 221.978, "eval_samples_per_second": 124.58, "eval_steps_per_second": 3.897, "step": 38000 }, { "epoch": 8.675078864353312, "grad_norm": 1.7894220352172852, "learning_rate": 6.15e-05, "loss": 0.5158, "step": 38500 }, { "epoch": 8.675078864353312, "eval_accuracy": 0.8831726096386725, "eval_loss": 0.48892539739608765, "eval_runtime": 221.6541, "eval_samples_per_second": 124.762, "eval_steps_per_second": 3.902, "step": 38500 }, { "epoch": 8.78774222622803, "grad_norm": 1.8389184474945068, "learning_rate": 6.1e-05, "loss": 0.5074, "step": 39000 }, { "epoch": 8.78774222622803, "eval_accuracy": 0.8836604617832933, "eval_loss": 0.4904680550098419, "eval_runtime": 220.5534, "eval_samples_per_second": 125.385, "eval_steps_per_second": 3.922, "step": 39000 }, { "epoch": 8.90040558810275, "grad_norm": 1.6285669803619385, "learning_rate": 6.05e-05, "loss": 0.5115, "step": 39500 }, { "epoch": 8.90040558810275, "eval_accuracy": 0.8834682744822249, "eval_loss": 0.4873930513858795, "eval_runtime": 221.8348, "eval_samples_per_second": 124.66, "eval_steps_per_second": 3.899, "step": 39500 }, { "epoch": 9.013068949977468, "grad_norm": 1.892903447151184, "learning_rate": 6e-05, "loss": 0.506, "step": 40000 }, { "epoch": 9.013068949977468, "eval_accuracy": 0.8844121521679462, "eval_loss": 0.48553282022476196, "eval_runtime": 221.0439, "eval_samples_per_second": 125.106, "eval_steps_per_second": 3.913, "step": 40000 }, { "epoch": 9.125732311852186, "grad_norm": 1.5161460638046265, "learning_rate": 5.95e-05, "loss": 0.5006, "step": 40500 }, { "epoch": 9.125732311852186, "eval_accuracy": 0.8845845321702142, "eval_loss": 0.4854166805744171, "eval_runtime": 221.5376, "eval_samples_per_second": 124.828, "eval_steps_per_second": 3.905, "step": 40500 }, { "epoch": 9.238395673726904, "grad_norm": 1.6559338569641113, "learning_rate": 5.9e-05, "loss": 0.4998, "step": 41000 }, { "epoch": 9.238395673726904, "eval_accuracy": 0.8846773258713508, "eval_loss": 0.47937873005867004, "eval_runtime": 221.7432, "eval_samples_per_second": 124.712, "eval_steps_per_second": 3.901, "step": 41000 }, { "epoch": 9.351059035601622, "grad_norm": 1.5425843000411987, "learning_rate": 5.85e-05, "loss": 0.4993, "step": 41500 }, { "epoch": 9.351059035601622, "eval_accuracy": 0.8852505184740784, "eval_loss": 0.47994357347488403, "eval_runtime": 220.8861, "eval_samples_per_second": 125.196, "eval_steps_per_second": 3.916, "step": 41500 }, { "epoch": 9.46372239747634, "grad_norm": 1.6957345008850098, "learning_rate": 5.8e-05, "loss": 0.4978, "step": 42000 }, { "epoch": 9.46372239747634, "eval_accuracy": 0.8847172732012654, "eval_loss": 0.48131656646728516, "eval_runtime": 222.0591, "eval_samples_per_second": 124.534, "eval_steps_per_second": 3.895, "step": 42000 }, { "epoch": 9.576385759351059, "grad_norm": 1.9139741659164429, "learning_rate": 5.7499999999999995e-05, "loss": 0.4989, "step": 42500 }, { "epoch": 9.576385759351059, "eval_accuracy": 0.8862352978048973, "eval_loss": 0.4748667776584625, "eval_runtime": 221.9766, "eval_samples_per_second": 124.581, "eval_steps_per_second": 3.897, "step": 42500 }, { "epoch": 9.689049121225777, "grad_norm": 1.770585536956787, "learning_rate": 5.6999999999999996e-05, "loss": 0.4974, "step": 43000 }, { "epoch": 9.689049121225777, "eval_accuracy": 0.8855722252421147, "eval_loss": 0.4763648211956024, "eval_runtime": 220.554, "eval_samples_per_second": 125.384, "eval_steps_per_second": 3.922, "step": 43000 }, { "epoch": 9.801712483100495, "grad_norm": 1.6551371812820435, "learning_rate": 5.65e-05, "loss": 0.4978, "step": 43500 }, { "epoch": 9.801712483100495, "eval_accuracy": 0.8858765050235756, "eval_loss": 0.47770920395851135, "eval_runtime": 221.8932, "eval_samples_per_second": 124.628, "eval_steps_per_second": 3.898, "step": 43500 }, { "epoch": 9.914375844975215, "grad_norm": 1.6118969917297363, "learning_rate": 5.6000000000000006e-05, "loss": 0.4942, "step": 44000 }, { "epoch": 9.914375844975215, "eval_accuracy": 0.8865845660569847, "eval_loss": 0.47676002979278564, "eval_runtime": 221.7004, "eval_samples_per_second": 124.736, "eval_steps_per_second": 3.902, "step": 44000 }, { "epoch": 10.027039206849933, "grad_norm": 1.8588035106658936, "learning_rate": 5.550000000000001e-05, "loss": 0.4955, "step": 44500 }, { "epoch": 10.027039206849933, "eval_accuracy": 0.8870998796760368, "eval_loss": 0.47594934701919556, "eval_runtime": 221.976, "eval_samples_per_second": 124.581, "eval_steps_per_second": 3.897, "step": 44500 }, { "epoch": 10.139702568724651, "grad_norm": 1.6966643333435059, "learning_rate": 5.500000000000001e-05, "loss": 0.489, "step": 45000 }, { "epoch": 10.139702568724651, "eval_accuracy": 0.8869448016018396, "eval_loss": 0.477344274520874, "eval_runtime": 221.5008, "eval_samples_per_second": 124.848, "eval_steps_per_second": 3.905, "step": 45000 }, { "epoch": 10.25236593059937, "grad_norm": 1.7615017890930176, "learning_rate": 5.45e-05, "loss": 0.4849, "step": 45500 }, { "epoch": 10.25236593059937, "eval_accuracy": 0.8868306586288885, "eval_loss": 0.4725435972213745, "eval_runtime": 221.7608, "eval_samples_per_second": 124.702, "eval_steps_per_second": 3.901, "step": 45500 }, { "epoch": 10.365029292474087, "grad_norm": 1.7889434099197388, "learning_rate": 5.4000000000000005e-05, "loss": 0.4818, "step": 46000 }, { "epoch": 10.365029292474087, "eval_accuracy": 0.887833900017014, "eval_loss": 0.4671822190284729, "eval_runtime": 222.1274, "eval_samples_per_second": 124.496, "eval_steps_per_second": 3.894, "step": 46000 }, { "epoch": 10.477692654348806, "grad_norm": 1.7761868238449097, "learning_rate": 5.3500000000000006e-05, "loss": 0.4864, "step": 46500 }, { "epoch": 10.477692654348806, "eval_accuracy": 0.887966177980069, "eval_loss": 0.46516725420951843, "eval_runtime": 221.4768, "eval_samples_per_second": 124.862, "eval_steps_per_second": 3.906, "step": 46500 }, { "epoch": 10.590356016223524, "grad_norm": 1.7193918228149414, "learning_rate": 5.300000000000001e-05, "loss": 0.4854, "step": 47000 }, { "epoch": 10.590356016223524, "eval_accuracy": 0.8878875431862944, "eval_loss": 0.4649243652820587, "eval_runtime": 221.9203, "eval_samples_per_second": 124.612, "eval_steps_per_second": 3.898, "step": 47000 }, { "epoch": 10.703019378098242, "grad_norm": 1.681303858757019, "learning_rate": 5.25e-05, "loss": 0.4842, "step": 47500 }, { "epoch": 10.703019378098242, "eval_accuracy": 0.8880860212733241, "eval_loss": 0.4627833366394043, "eval_runtime": 220.7325, "eval_samples_per_second": 125.283, "eval_steps_per_second": 3.919, "step": 47500 }, { "epoch": 10.81568273997296, "grad_norm": 1.689483642578125, "learning_rate": 5.2000000000000004e-05, "loss": 0.4853, "step": 48000 }, { "epoch": 10.81568273997296, "eval_accuracy": 0.8884850427627177, "eval_loss": 0.4670482873916626, "eval_runtime": 222.2087, "eval_samples_per_second": 124.451, "eval_steps_per_second": 3.893, "step": 48000 }, { "epoch": 10.92834610184768, "grad_norm": 1.6489872932434082, "learning_rate": 5.1500000000000005e-05, "loss": 0.4825, "step": 48500 }, { "epoch": 10.92834610184768, "eval_accuracy": 0.8886944679602043, "eval_loss": 0.4673362970352173, "eval_runtime": 221.8366, "eval_samples_per_second": 124.659, "eval_steps_per_second": 3.899, "step": 48500 }, { "epoch": 11.041009463722398, "grad_norm": 1.6207237243652344, "learning_rate": 5.1000000000000006e-05, "loss": 0.4783, "step": 49000 }, { "epoch": 11.041009463722398, "eval_accuracy": 0.8887642317859056, "eval_loss": 0.46382275223731995, "eval_runtime": 222.0203, "eval_samples_per_second": 124.556, "eval_steps_per_second": 3.896, "step": 49000 }, { "epoch": 11.153672825597116, "grad_norm": 1.7849069833755493, "learning_rate": 5.05e-05, "loss": 0.4755, "step": 49500 }, { "epoch": 11.153672825597116, "eval_accuracy": 0.8889988225245398, "eval_loss": 0.4611697793006897, "eval_runtime": 221.8086, "eval_samples_per_second": 124.675, "eval_steps_per_second": 3.9, "step": 49500 }, { "epoch": 11.266336187471834, "grad_norm": 1.7341585159301758, "learning_rate": 5e-05, "loss": 0.4766, "step": 50000 }, { "epoch": 11.266336187471834, "eval_accuracy": 0.8896719975387671, "eval_loss": 0.45947107672691345, "eval_runtime": 221.6153, "eval_samples_per_second": 124.784, "eval_steps_per_second": 3.903, "step": 50000 }, { "epoch": 11.378999549346553, "grad_norm": 1.6157374382019043, "learning_rate": 4.9500000000000004e-05, "loss": 0.4758, "step": 50500 }, { "epoch": 11.378999549346553, "eval_accuracy": 0.8899909483321304, "eval_loss": 0.4591013193130493, "eval_runtime": 221.8952, "eval_samples_per_second": 124.626, "eval_steps_per_second": 3.898, "step": 50500 }, { "epoch": 11.49166291122127, "grad_norm": 1.3931312561035156, "learning_rate": 4.9e-05, "loss": 0.4749, "step": 51000 }, { "epoch": 11.49166291122127, "eval_accuracy": 0.8898331334878133, "eval_loss": 0.4599143862724304, "eval_runtime": 221.5141, "eval_samples_per_second": 124.841, "eval_steps_per_second": 3.905, "step": 51000 }, { "epoch": 11.604326273095989, "grad_norm": 1.5027562379837036, "learning_rate": 4.85e-05, "loss": 0.4696, "step": 51500 }, { "epoch": 11.604326273095989, "eval_accuracy": 0.8903252192404275, "eval_loss": 0.4557996988296509, "eval_runtime": 221.4742, "eval_samples_per_second": 124.863, "eval_steps_per_second": 3.906, "step": 51500 }, { "epoch": 11.716989634970707, "grad_norm": 2.007624864578247, "learning_rate": 4.8e-05, "loss": 0.4731, "step": 52000 }, { "epoch": 11.716989634970707, "eval_accuracy": 0.8906047731898101, "eval_loss": 0.4601598381996155, "eval_runtime": 221.6161, "eval_samples_per_second": 124.783, "eval_steps_per_second": 3.903, "step": 52000 }, { "epoch": 11.829652996845425, "grad_norm": 1.623124361038208, "learning_rate": 4.75e-05, "loss": 0.4705, "step": 52500 }, { "epoch": 11.829652996845425, "eval_accuracy": 0.8907063641623542, "eval_loss": 0.4568343460559845, "eval_runtime": 221.7063, "eval_samples_per_second": 124.733, "eval_steps_per_second": 3.902, "step": 52500 }, { "epoch": 11.942316358720145, "grad_norm": 1.7550790309906006, "learning_rate": 4.7e-05, "loss": 0.4712, "step": 53000 }, { "epoch": 11.942316358720145, "eval_accuracy": 0.8906701808811146, "eval_loss": 0.4544416666030884, "eval_runtime": 221.7786, "eval_samples_per_second": 124.692, "eval_steps_per_second": 3.9, "step": 53000 }, { "epoch": 12.054979720594863, "grad_norm": 1.8783979415893555, "learning_rate": 4.6500000000000005e-05, "loss": 0.4672, "step": 53500 }, { "epoch": 12.054979720594863, "eval_accuracy": 0.8910758036453728, "eval_loss": 0.45520085096359253, "eval_runtime": 221.6441, "eval_samples_per_second": 124.768, "eval_steps_per_second": 3.903, "step": 53500 }, { "epoch": 12.167643082469581, "grad_norm": 1.7316193580627441, "learning_rate": 4.600000000000001e-05, "loss": 0.4643, "step": 54000 }, { "epoch": 12.167643082469581, "eval_accuracy": 0.8910305824271204, "eval_loss": 0.4555051028728485, "eval_runtime": 221.45, "eval_samples_per_second": 124.877, "eval_steps_per_second": 3.906, "step": 54000 }, { "epoch": 12.2803064443443, "grad_norm": 1.6475858688354492, "learning_rate": 4.55e-05, "loss": 0.4634, "step": 54500 }, { "epoch": 12.2803064443443, "eval_accuracy": 0.8916132904164534, "eval_loss": 0.450579971075058, "eval_runtime": 221.3289, "eval_samples_per_second": 124.945, "eval_steps_per_second": 3.908, "step": 54500 }, { "epoch": 12.392969806219018, "grad_norm": 1.6666234731674194, "learning_rate": 4.5e-05, "loss": 0.4629, "step": 55000 }, { "epoch": 12.392969806219018, "eval_accuracy": 0.8920182501631405, "eval_loss": 0.4492991268634796, "eval_runtime": 220.8234, "eval_samples_per_second": 125.231, "eval_steps_per_second": 3.917, "step": 55000 }, { "epoch": 12.505633168093736, "grad_norm": 2.040255308151245, "learning_rate": 4.4500000000000004e-05, "loss": 0.4577, "step": 55500 }, { "epoch": 12.505633168093736, "eval_accuracy": 0.8917890008025184, "eval_loss": 0.45352259278297424, "eval_runtime": 222.1131, "eval_samples_per_second": 124.504, "eval_steps_per_second": 3.894, "step": 55500 }, { "epoch": 12.618296529968454, "grad_norm": 1.6200906038284302, "learning_rate": 4.4000000000000006e-05, "loss": 0.4597, "step": 56000 }, { "epoch": 12.618296529968454, "eval_accuracy": 0.8922171868098442, "eval_loss": 0.45517975091934204, "eval_runtime": 221.5067, "eval_samples_per_second": 124.845, "eval_steps_per_second": 3.905, "step": 56000 }, { "epoch": 12.730959891843172, "grad_norm": 1.8632248640060425, "learning_rate": 4.35e-05, "loss": 0.4624, "step": 56500 }, { "epoch": 12.730959891843172, "eval_accuracy": 0.8927531961352251, "eval_loss": 0.44637200236320496, "eval_runtime": 221.488, "eval_samples_per_second": 124.856, "eval_steps_per_second": 3.905, "step": 56500 }, { "epoch": 12.84362325371789, "grad_norm": 1.6908427476882935, "learning_rate": 4.3e-05, "loss": 0.46, "step": 57000 }, { "epoch": 12.84362325371789, "eval_accuracy": 0.8920658168036726, "eval_loss": 0.44909459352493286, "eval_runtime": 221.6242, "eval_samples_per_second": 124.779, "eval_steps_per_second": 3.903, "step": 57000 }, { "epoch": 12.95628661559261, "grad_norm": 1.7786799669265747, "learning_rate": 4.25e-05, "loss": 0.4586, "step": 57500 }, { "epoch": 12.95628661559261, "eval_accuracy": 0.8929494005257145, "eval_loss": 0.4447159469127655, "eval_runtime": 221.7737, "eval_samples_per_second": 124.695, "eval_steps_per_second": 3.9, "step": 57500 }, { "epoch": 13.068949977467328, "grad_norm": 1.7628467082977295, "learning_rate": 4.2e-05, "loss": 0.4558, "step": 58000 }, { "epoch": 13.068949977467328, "eval_accuracy": 0.8926064267317521, "eval_loss": 0.4458833336830139, "eval_runtime": 221.4227, "eval_samples_per_second": 124.892, "eval_steps_per_second": 3.907, "step": 58000 }, { "epoch": 13.181613339342046, "grad_norm": 1.5658234357833862, "learning_rate": 4.15e-05, "loss": 0.4542, "step": 58500 }, { "epoch": 13.181613339342046, "eval_accuracy": 0.8932430818063928, "eval_loss": 0.4461354613304138, "eval_runtime": 221.5061, "eval_samples_per_second": 124.845, "eval_steps_per_second": 3.905, "step": 58500 }, { "epoch": 13.294276701216765, "grad_norm": 1.5327554941177368, "learning_rate": 4.1e-05, "loss": 0.455, "step": 59000 }, { "epoch": 13.294276701216765, "eval_accuracy": 0.8931742171282963, "eval_loss": 0.4385415017604828, "eval_runtime": 220.6336, "eval_samples_per_second": 125.339, "eval_steps_per_second": 3.921, "step": 59000 }, { "epoch": 13.406940063091483, "grad_norm": 1.804396390914917, "learning_rate": 4.05e-05, "loss": 0.4506, "step": 59500 }, { "epoch": 13.406940063091483, "eval_accuracy": 0.8937421063264991, "eval_loss": 0.4429979622364044, "eval_runtime": 221.8018, "eval_samples_per_second": 124.679, "eval_steps_per_second": 3.9, "step": 59500 }, { "epoch": 13.519603424966201, "grad_norm": 1.8558369874954224, "learning_rate": 4e-05, "loss": 0.4542, "step": 60000 }, { "epoch": 13.519603424966201, "eval_accuracy": 0.8935920533223497, "eval_loss": 0.4469524025917053, "eval_runtime": 220.934, "eval_samples_per_second": 125.169, "eval_steps_per_second": 3.915, "step": 60000 }, { "epoch": 13.632266786840919, "grad_norm": 1.7201515436172485, "learning_rate": 3.9500000000000005e-05, "loss": 0.4535, "step": 60500 }, { "epoch": 13.632266786840919, "eval_accuracy": 0.8939343684906524, "eval_loss": 0.44035276770591736, "eval_runtime": 220.7553, "eval_samples_per_second": 125.27, "eval_steps_per_second": 3.918, "step": 60500 }, { "epoch": 13.744930148715637, "grad_norm": 1.5173367261886597, "learning_rate": 3.9000000000000006e-05, "loss": 0.4519, "step": 61000 }, { "epoch": 13.744930148715637, "eval_accuracy": 0.8938849251143673, "eval_loss": 0.44248726963996887, "eval_runtime": 220.2187, "eval_samples_per_second": 125.575, "eval_steps_per_second": 3.928, "step": 61000 }, { "epoch": 13.857593510590355, "grad_norm": 1.6886624097824097, "learning_rate": 3.85e-05, "loss": 0.4492, "step": 61500 }, { "epoch": 13.857593510590355, "eval_accuracy": 0.8941456344925675, "eval_loss": 0.44254130125045776, "eval_runtime": 220.9103, "eval_samples_per_second": 125.182, "eval_steps_per_second": 3.916, "step": 61500 }, { "epoch": 13.970256872465074, "grad_norm": 1.560421109199524, "learning_rate": 3.8e-05, "loss": 0.4495, "step": 62000 }, { "epoch": 13.970256872465074, "eval_accuracy": 0.8943100925877457, "eval_loss": 0.43967217206954956, "eval_runtime": 221.8474, "eval_samples_per_second": 124.653, "eval_steps_per_second": 3.899, "step": 62000 }, { "epoch": 14.082920234339793, "grad_norm": 2.146169662475586, "learning_rate": 3.7500000000000003e-05, "loss": 0.4438, "step": 62500 }, { "epoch": 14.082920234339793, "eval_accuracy": 0.8951792684420733, "eval_loss": 0.43378108739852905, "eval_runtime": 221.3932, "eval_samples_per_second": 124.909, "eval_steps_per_second": 3.907, "step": 62500 }, { "epoch": 14.195583596214512, "grad_norm": 1.6352427005767822, "learning_rate": 3.7e-05, "loss": 0.4437, "step": 63000 }, { "epoch": 14.195583596214512, "eval_accuracy": 0.8944950022699799, "eval_loss": 0.4368970990180969, "eval_runtime": 221.6512, "eval_samples_per_second": 124.764, "eval_steps_per_second": 3.903, "step": 63000 }, { "epoch": 14.30824695808923, "grad_norm": 1.6570438146591187, "learning_rate": 3.65e-05, "loss": 0.4407, "step": 63500 }, { "epoch": 14.30824695808923, "eval_accuracy": 0.8954035156096067, "eval_loss": 0.43203291296958923, "eval_runtime": 220.9732, "eval_samples_per_second": 125.146, "eval_steps_per_second": 3.915, "step": 63500 }, { "epoch": 14.420910319963948, "grad_norm": 1.6666638851165771, "learning_rate": 3.6e-05, "loss": 0.4409, "step": 64000 }, { "epoch": 14.420910319963948, "eval_accuracy": 0.8951364676803809, "eval_loss": 0.43579936027526855, "eval_runtime": 221.9618, "eval_samples_per_second": 124.589, "eval_steps_per_second": 3.897, "step": 64000 }, { "epoch": 14.533573681838666, "grad_norm": 1.5540229082107544, "learning_rate": 3.55e-05, "loss": 0.4425, "step": 64500 }, { "epoch": 14.533573681838666, "eval_accuracy": 0.8954575425427099, "eval_loss": 0.432124525308609, "eval_runtime": 222.2861, "eval_samples_per_second": 124.407, "eval_steps_per_second": 3.891, "step": 64500 }, { "epoch": 14.646237043713384, "grad_norm": 1.6039586067199707, "learning_rate": 3.5e-05, "loss": 0.4375, "step": 65000 }, { "epoch": 14.646237043713384, "eval_accuracy": 0.8959293125901446, "eval_loss": 0.4307084083557129, "eval_runtime": 221.3855, "eval_samples_per_second": 124.913, "eval_steps_per_second": 3.907, "step": 65000 }, { "epoch": 14.758900405588102, "grad_norm": 1.4141193628311157, "learning_rate": 3.45e-05, "loss": 0.4412, "step": 65500 }, { "epoch": 14.758900405588102, "eval_accuracy": 0.8955789560165742, "eval_loss": 0.4335871934890747, "eval_runtime": 220.8948, "eval_samples_per_second": 125.191, "eval_steps_per_second": 3.916, "step": 65500 }, { "epoch": 14.87156376746282, "grad_norm": 1.479407548904419, "learning_rate": 3.4000000000000007e-05, "loss": 0.4402, "step": 66000 }, { "epoch": 14.87156376746282, "eval_accuracy": 0.8960790351082009, "eval_loss": 0.428357869386673, "eval_runtime": 221.3572, "eval_samples_per_second": 124.929, "eval_steps_per_second": 3.908, "step": 66000 }, { "epoch": 14.984227129337539, "grad_norm": 1.6063992977142334, "learning_rate": 3.35e-05, "loss": 0.4386, "step": 66500 }, { "epoch": 14.984227129337539, "eval_accuracy": 0.8961887828028311, "eval_loss": 0.42679697275161743, "eval_runtime": 220.1773, "eval_samples_per_second": 125.599, "eval_steps_per_second": 3.929, "step": 66500 }, { "epoch": 15.096890491212259, "grad_norm": 1.7383469343185425, "learning_rate": 3.3e-05, "loss": 0.4342, "step": 67000 }, { "epoch": 15.096890491212259, "eval_accuracy": 0.8963356120392529, "eval_loss": 0.43058517575263977, "eval_runtime": 221.0267, "eval_samples_per_second": 125.116, "eval_steps_per_second": 3.914, "step": 67000 }, { "epoch": 15.209553853086977, "grad_norm": 1.4529184103012085, "learning_rate": 3.2500000000000004e-05, "loss": 0.4355, "step": 67500 }, { "epoch": 15.209553853086977, "eval_accuracy": 0.8963537523060265, "eval_loss": 0.4319207966327667, "eval_runtime": 221.0822, "eval_samples_per_second": 125.085, "eval_steps_per_second": 3.913, "step": 67500 }, { "epoch": 15.322217214961695, "grad_norm": 1.5925979614257812, "learning_rate": 3.2000000000000005e-05, "loss": 0.434, "step": 68000 }, { "epoch": 15.322217214961695, "eval_accuracy": 0.8967275614111444, "eval_loss": 0.4326106905937195, "eval_runtime": 221.1171, "eval_samples_per_second": 125.065, "eval_steps_per_second": 3.912, "step": 68000 }, { "epoch": 15.434880576836413, "grad_norm": 1.5591844320297241, "learning_rate": 3.15e-05, "loss": 0.4299, "step": 68500 }, { "epoch": 15.434880576836413, "eval_accuracy": 0.8968720289079662, "eval_loss": 0.42554253339767456, "eval_runtime": 221.0599, "eval_samples_per_second": 125.097, "eval_steps_per_second": 3.913, "step": 68500 }, { "epoch": 15.547543938711131, "grad_norm": 1.6964800357818604, "learning_rate": 3.1e-05, "loss": 0.4302, "step": 69000 }, { "epoch": 15.547543938711131, "eval_accuracy": 0.8968983814075581, "eval_loss": 0.43178391456604004, "eval_runtime": 220.2723, "eval_samples_per_second": 125.545, "eval_steps_per_second": 3.927, "step": 69000 }, { "epoch": 15.66020730058585, "grad_norm": 1.7176204919815063, "learning_rate": 3.05e-05, "loss": 0.4317, "step": 69500 }, { "epoch": 15.66020730058585, "eval_accuracy": 0.8971937797880897, "eval_loss": 0.42581045627593994, "eval_runtime": 221.117, "eval_samples_per_second": 125.065, "eval_steps_per_second": 3.912, "step": 69500 }, { "epoch": 15.772870662460567, "grad_norm": 1.4802976846694946, "learning_rate": 3e-05, "loss": 0.4335, "step": 70000 }, { "epoch": 15.772870662460567, "eval_accuracy": 0.8972389358742884, "eval_loss": 0.4227333068847656, "eval_runtime": 220.2787, "eval_samples_per_second": 125.541, "eval_steps_per_second": 3.927, "step": 70000 }, { "epoch": 15.885534024335286, "grad_norm": 1.4625871181488037, "learning_rate": 2.95e-05, "loss": 0.4313, "step": 70500 }, { "epoch": 15.885534024335286, "eval_accuracy": 0.8974343061715017, "eval_loss": 0.420085072517395, "eval_runtime": 221.0709, "eval_samples_per_second": 125.091, "eval_steps_per_second": 3.913, "step": 70500 }, { "epoch": 15.998197386210004, "grad_norm": 1.4574440717697144, "learning_rate": 2.9e-05, "loss": 0.4288, "step": 71000 }, { "epoch": 15.998197386210004, "eval_accuracy": 0.8976216192291354, "eval_loss": 0.42089083790779114, "eval_runtime": 221.0359, "eval_samples_per_second": 125.111, "eval_steps_per_second": 3.913, "step": 71000 }, { "epoch": 16.110860748084722, "grad_norm": 1.415560245513916, "learning_rate": 2.8499999999999998e-05, "loss": 0.4245, "step": 71500 }, { "epoch": 16.110860748084722, "eval_accuracy": 0.8975563777935319, "eval_loss": 0.42667824029922485, "eval_runtime": 220.6503, "eval_samples_per_second": 125.33, "eval_steps_per_second": 3.92, "step": 71500 }, { "epoch": 16.223524109959442, "grad_norm": 1.6393336057662964, "learning_rate": 2.8000000000000003e-05, "loss": 0.4271, "step": 72000 }, { "epoch": 16.223524109959442, "eval_accuracy": 0.8984646682572308, "eval_loss": 0.4213043749332428, "eval_runtime": 219.8857, "eval_samples_per_second": 125.765, "eval_steps_per_second": 3.934, "step": 72000 }, { "epoch": 16.336187471834158, "grad_norm": 1.6446831226348877, "learning_rate": 2.7500000000000004e-05, "loss": 0.4234, "step": 72500 }, { "epoch": 16.336187471834158, "eval_accuracy": 0.8985838129375973, "eval_loss": 0.42193278670310974, "eval_runtime": 221.0608, "eval_samples_per_second": 125.097, "eval_steps_per_second": 3.913, "step": 72500 }, { "epoch": 16.448850833708878, "grad_norm": 1.725674033164978, "learning_rate": 2.7000000000000002e-05, "loss": 0.4251, "step": 73000 }, { "epoch": 16.448850833708878, "eval_accuracy": 0.8987038252593214, "eval_loss": 0.4171987771987915, "eval_runtime": 220.8886, "eval_samples_per_second": 125.194, "eval_steps_per_second": 3.916, "step": 73000 }, { "epoch": 16.561514195583594, "grad_norm": 1.5979257822036743, "learning_rate": 2.6500000000000004e-05, "loss": 0.4217, "step": 73500 }, { "epoch": 16.561514195583594, "eval_accuracy": 0.8987482542802462, "eval_loss": 0.418377161026001, "eval_runtime": 221.121, "eval_samples_per_second": 125.063, "eval_steps_per_second": 3.912, "step": 73500 }, { "epoch": 16.674177557458314, "grad_norm": 1.4892100095748901, "learning_rate": 2.6000000000000002e-05, "loss": 0.4205, "step": 74000 }, { "epoch": 16.674177557458314, "eval_accuracy": 0.8989114915507669, "eval_loss": 0.4165091812610626, "eval_runtime": 222.5924, "eval_samples_per_second": 124.236, "eval_steps_per_second": 3.886, "step": 74000 }, { "epoch": 16.786840919333034, "grad_norm": 1.4461849927902222, "learning_rate": 2.5500000000000003e-05, "loss": 0.4228, "step": 74500 }, { "epoch": 16.786840919333034, "eval_accuracy": 0.8989125542022752, "eval_loss": 0.4175247848033905, "eval_runtime": 222.3981, "eval_samples_per_second": 124.345, "eval_steps_per_second": 3.889, "step": 74500 }, { "epoch": 16.89950428120775, "grad_norm": 1.768370509147644, "learning_rate": 2.5e-05, "loss": 0.421, "step": 75000 }, { "epoch": 16.89950428120775, "eval_accuracy": 0.8991065894891168, "eval_loss": 0.41619065403938293, "eval_runtime": 222.6024, "eval_samples_per_second": 124.23, "eval_steps_per_second": 3.886, "step": 75000 }, { "epoch": 17.01216764308247, "grad_norm": 1.4250850677490234, "learning_rate": 2.45e-05, "loss": 0.4178, "step": 75500 }, { "epoch": 17.01216764308247, "eval_accuracy": 0.8994014895612595, "eval_loss": 0.4117368161678314, "eval_runtime": 222.14, "eval_samples_per_second": 124.489, "eval_steps_per_second": 3.894, "step": 75500 }, { "epoch": 17.124831004957187, "grad_norm": 1.4036965370178223, "learning_rate": 2.4e-05, "loss": 0.4176, "step": 76000 }, { "epoch": 17.124831004957187, "eval_accuracy": 0.8995830389073786, "eval_loss": 0.4121379852294922, "eval_runtime": 222.2839, "eval_samples_per_second": 124.408, "eval_steps_per_second": 3.891, "step": 76000 }, { "epoch": 17.237494366831907, "grad_norm": 1.395093321800232, "learning_rate": 2.35e-05, "loss": 0.4172, "step": 76500 }, { "epoch": 17.237494366831907, "eval_accuracy": 0.8998577766066815, "eval_loss": 0.41285398602485657, "eval_runtime": 222.2267, "eval_samples_per_second": 124.44, "eval_steps_per_second": 3.892, "step": 76500 }, { "epoch": 17.350157728706623, "grad_norm": 1.5492697954177856, "learning_rate": 2.3000000000000003e-05, "loss": 0.4133, "step": 77000 }, { "epoch": 17.350157728706623, "eval_accuracy": 0.8992890854661668, "eval_loss": 0.41492369771003723, "eval_runtime": 219.0021, "eval_samples_per_second": 126.273, "eval_steps_per_second": 3.95, "step": 77000 }, { "epoch": 17.462821090581343, "grad_norm": 1.4863234758377075, "learning_rate": 2.25e-05, "loss": 0.4166, "step": 77500 }, { "epoch": 17.462821090581343, "eval_accuracy": 0.8995439142560963, "eval_loss": 0.41370296478271484, "eval_runtime": 220.4874, "eval_samples_per_second": 125.422, "eval_steps_per_second": 3.923, "step": 77500 }, { "epoch": 17.57548445245606, "grad_norm": 1.8134657144546509, "learning_rate": 2.2000000000000003e-05, "loss": 0.4167, "step": 78000 }, { "epoch": 17.57548445245606, "eval_accuracy": 0.8998953243247179, "eval_loss": 0.4118014872074127, "eval_runtime": 220.8002, "eval_samples_per_second": 125.244, "eval_steps_per_second": 3.918, "step": 78000 }, { "epoch": 17.68814781433078, "grad_norm": 1.7903392314910889, "learning_rate": 2.15e-05, "loss": 0.4164, "step": 78500 }, { "epoch": 17.68814781433078, "eval_accuracy": 0.9001628949311502, "eval_loss": 0.4123002886772156, "eval_runtime": 219.63, "eval_samples_per_second": 125.912, "eval_steps_per_second": 3.938, "step": 78500 }, { "epoch": 17.8008111762055, "grad_norm": 1.6216607093811035, "learning_rate": 2.1e-05, "loss": 0.4143, "step": 79000 }, { "epoch": 17.8008111762055, "eval_accuracy": 0.9001079811521843, "eval_loss": 0.40997758507728577, "eval_runtime": 219.8198, "eval_samples_per_second": 125.803, "eval_steps_per_second": 3.935, "step": 79000 }, { "epoch": 17.913474538080216, "grad_norm": 1.5128173828125, "learning_rate": 2.05e-05, "loss": 0.4136, "step": 79500 }, { "epoch": 17.913474538080216, "eval_accuracy": 0.9006287821890727, "eval_loss": 0.41052308678627014, "eval_runtime": 219.697, "eval_samples_per_second": 125.873, "eval_steps_per_second": 3.937, "step": 79500 }, { "epoch": 18.026137899954936, "grad_norm": 1.413712978363037, "learning_rate": 2e-05, "loss": 0.4132, "step": 80000 }, { "epoch": 18.026137899954936, "eval_accuracy": 0.9007660373895346, "eval_loss": 0.4081571400165558, "eval_runtime": 220.6703, "eval_samples_per_second": 125.318, "eval_steps_per_second": 3.92, "step": 80000 }, { "epoch": 18.138801261829652, "grad_norm": 1.7320311069488525, "learning_rate": 1.9500000000000003e-05, "loss": 0.4102, "step": 80500 }, { "epoch": 18.138801261829652, "eval_accuracy": 0.9009554825729237, "eval_loss": 0.407240092754364, "eval_runtime": 221.4224, "eval_samples_per_second": 124.893, "eval_steps_per_second": 3.907, "step": 80500 }, { "epoch": 18.251464623704372, "grad_norm": 1.8033103942871094, "learning_rate": 1.9e-05, "loss": 0.4097, "step": 81000 }, { "epoch": 18.251464623704372, "eval_accuracy": 0.9008985457774398, "eval_loss": 0.4109956920146942, "eval_runtime": 221.1014, "eval_samples_per_second": 125.074, "eval_steps_per_second": 3.912, "step": 81000 }, { "epoch": 18.36412798557909, "grad_norm": 1.8222883939743042, "learning_rate": 1.85e-05, "loss": 0.4085, "step": 81500 }, { "epoch": 18.36412798557909, "eval_accuracy": 0.9007539025464132, "eval_loss": 0.4095366299152374, "eval_runtime": 220.8203, "eval_samples_per_second": 125.233, "eval_steps_per_second": 3.917, "step": 81500 }, { "epoch": 18.47679134745381, "grad_norm": 1.4663125276565552, "learning_rate": 1.8e-05, "loss": 0.4105, "step": 82000 }, { "epoch": 18.47679134745381, "eval_accuracy": 0.9014532811520996, "eval_loss": 0.4047625958919525, "eval_runtime": 219.6263, "eval_samples_per_second": 125.914, "eval_steps_per_second": 3.939, "step": 82000 }, { "epoch": 18.589454709328525, "grad_norm": 1.8482975959777832, "learning_rate": 1.75e-05, "loss": 0.4096, "step": 82500 }, { "epoch": 18.589454709328525, "eval_accuracy": 0.9010233806097327, "eval_loss": 0.4072835445404053, "eval_runtime": 220.7586, "eval_samples_per_second": 125.268, "eval_steps_per_second": 3.918, "step": 82500 }, { "epoch": 18.702118071203245, "grad_norm": 1.4483723640441895, "learning_rate": 1.7000000000000003e-05, "loss": 0.4041, "step": 83000 }, { "epoch": 18.702118071203245, "eval_accuracy": 0.9015295597674521, "eval_loss": 0.4039141833782196, "eval_runtime": 220.7392, "eval_samples_per_second": 125.279, "eval_steps_per_second": 3.919, "step": 83000 }, { "epoch": 18.814781433077965, "grad_norm": 1.6040253639221191, "learning_rate": 1.65e-05, "loss": 0.4062, "step": 83500 }, { "epoch": 18.814781433077965, "eval_accuracy": 0.9016612318058135, "eval_loss": 0.40488725900650024, "eval_runtime": 221.3884, "eval_samples_per_second": 124.912, "eval_steps_per_second": 3.907, "step": 83500 }, { "epoch": 18.92744479495268, "grad_norm": 1.3560248613357544, "learning_rate": 1.6000000000000003e-05, "loss": 0.4045, "step": 84000 }, { "epoch": 18.92744479495268, "eval_accuracy": 0.9015874866980568, "eval_loss": 0.4032597243785858, "eval_runtime": 221.9037, "eval_samples_per_second": 124.622, "eval_steps_per_second": 3.898, "step": 84000 }, { "epoch": 19.0401081568274, "grad_norm": 1.6236895322799683, "learning_rate": 1.55e-05, "loss": 0.4038, "step": 84500 }, { "epoch": 19.0401081568274, "eval_accuracy": 0.901710217516976, "eval_loss": 0.4084183871746063, "eval_runtime": 220.8431, "eval_samples_per_second": 125.22, "eval_steps_per_second": 3.917, "step": 84500 }, { "epoch": 19.152771518702117, "grad_norm": 1.6514983177185059, "learning_rate": 1.5e-05, "loss": 0.4037, "step": 85000 }, { "epoch": 19.152771518702117, "eval_accuracy": 0.9016946022320732, "eval_loss": 0.4033704102039337, "eval_runtime": 221.6212, "eval_samples_per_second": 124.78, "eval_steps_per_second": 3.903, "step": 85000 }, { "epoch": 19.265434880576837, "grad_norm": 1.3684407472610474, "learning_rate": 1.45e-05, "loss": 0.4022, "step": 85500 }, { "epoch": 19.265434880576837, "eval_accuracy": 0.9021324676993308, "eval_loss": 0.40617531538009644, "eval_runtime": 221.8256, "eval_samples_per_second": 124.666, "eval_steps_per_second": 3.899, "step": 85500 }, { "epoch": 19.378098242451554, "grad_norm": 1.592301607131958, "learning_rate": 1.4000000000000001e-05, "loss": 0.4059, "step": 86000 }, { "epoch": 19.378098242451554, "eval_accuracy": 0.902363044454423, "eval_loss": 0.3991073668003082, "eval_runtime": 220.8011, "eval_samples_per_second": 125.244, "eval_steps_per_second": 3.918, "step": 86000 }, { "epoch": 19.490761604326273, "grad_norm": 1.5463926792144775, "learning_rate": 1.3500000000000001e-05, "loss": 0.4013, "step": 86500 }, { "epoch": 19.490761604326273, "eval_accuracy": 0.9023868906868481, "eval_loss": 0.39859089255332947, "eval_runtime": 220.6504, "eval_samples_per_second": 125.329, "eval_steps_per_second": 3.92, "step": 86500 }, { "epoch": 19.60342496620099, "grad_norm": 1.6952037811279297, "learning_rate": 1.3000000000000001e-05, "loss": 0.4004, "step": 87000 }, { "epoch": 19.60342496620099, "eval_accuracy": 0.9029012333672634, "eval_loss": 0.4017859995365143, "eval_runtime": 220.6857, "eval_samples_per_second": 125.309, "eval_steps_per_second": 3.92, "step": 87000 }, { "epoch": 19.71608832807571, "grad_norm": 1.5156389474868774, "learning_rate": 1.25e-05, "loss": 0.4023, "step": 87500 }, { "epoch": 19.71608832807571, "eval_accuracy": 0.9022691715502759, "eval_loss": 0.40082216262817383, "eval_runtime": 220.7786, "eval_samples_per_second": 125.257, "eval_steps_per_second": 3.918, "step": 87500 }, { "epoch": 19.82875168995043, "grad_norm": 1.5951709747314453, "learning_rate": 1.2e-05, "loss": 0.3987, "step": 88000 }, { "epoch": 19.82875168995043, "eval_accuracy": 0.9028266490406112, "eval_loss": 0.4010894000530243, "eval_runtime": 220.1664, "eval_samples_per_second": 125.605, "eval_steps_per_second": 3.929, "step": 88000 }, { "epoch": 19.941415051825146, "grad_norm": 1.4990533590316772, "learning_rate": 1.1500000000000002e-05, "loss": 0.3935, "step": 88500 }, { "epoch": 19.941415051825146, "eval_accuracy": 0.9027395900326748, "eval_loss": 0.401162326335907, "eval_runtime": 220.111, "eval_samples_per_second": 125.637, "eval_steps_per_second": 3.93, "step": 88500 }, { "epoch": 20.054078413699866, "grad_norm": 1.5961695909500122, "learning_rate": 1.1000000000000001e-05, "loss": 0.3978, "step": 89000 }, { "epoch": 20.054078413699866, "eval_accuracy": 0.902977115716753, "eval_loss": 0.3981638252735138, "eval_runtime": 219.6972, "eval_samples_per_second": 125.873, "eval_steps_per_second": 3.937, "step": 89000 }, { "epoch": 20.166741775574582, "grad_norm": 1.5186184644699097, "learning_rate": 1.05e-05, "loss": 0.4012, "step": 89500 }, { "epoch": 20.166741775574582, "eval_accuracy": 0.9029895131243953, "eval_loss": 0.39535069465637207, "eval_runtime": 220.4034, "eval_samples_per_second": 125.47, "eval_steps_per_second": 3.925, "step": 89500 }, { "epoch": 20.279405137449302, "grad_norm": 1.7340284585952759, "learning_rate": 1e-05, "loss": 0.3954, "step": 90000 }, { "epoch": 20.279405137449302, "eval_accuracy": 0.9031845731573412, "eval_loss": 0.3972371816635132, "eval_runtime": 220.9175, "eval_samples_per_second": 125.178, "eval_steps_per_second": 3.915, "step": 90000 }, { "epoch": 20.39206849932402, "grad_norm": 1.4601465463638306, "learning_rate": 9.5e-06, "loss": 0.3933, "step": 90500 }, { "epoch": 20.39206849932402, "eval_accuracy": 0.9031702530935091, "eval_loss": 0.39749225974082947, "eval_runtime": 220.5834, "eval_samples_per_second": 125.368, "eval_steps_per_second": 3.921, "step": 90500 }, { "epoch": 20.50473186119874, "grad_norm": 1.6822484731674194, "learning_rate": 9e-06, "loss": 0.3985, "step": 91000 }, { "epoch": 20.50473186119874, "eval_accuracy": 0.903283638473266, "eval_loss": 0.39412999153137207, "eval_runtime": 220.402, "eval_samples_per_second": 125.471, "eval_steps_per_second": 3.925, "step": 91000 }, { "epoch": 20.617395223073455, "grad_norm": 1.5493133068084717, "learning_rate": 8.500000000000002e-06, "loss": 0.3952, "step": 91500 }, { "epoch": 20.617395223073455, "eval_accuracy": 0.9031870870760611, "eval_loss": 0.39998504519462585, "eval_runtime": 219.1703, "eval_samples_per_second": 126.176, "eval_steps_per_second": 3.947, "step": 91500 }, { "epoch": 20.730058584948175, "grad_norm": 1.6142163276672363, "learning_rate": 8.000000000000001e-06, "loss": 0.395, "step": 92000 }, { "epoch": 20.730058584948175, "eval_accuracy": 0.9037042508521438, "eval_loss": 0.39454683661460876, "eval_runtime": 220.1482, "eval_samples_per_second": 125.615, "eval_steps_per_second": 3.929, "step": 92000 }, { "epoch": 20.842721946822895, "grad_norm": 1.3768945932388306, "learning_rate": 7.5e-06, "loss": 0.3925, "step": 92500 }, { "epoch": 20.842721946822895, "eval_accuracy": 0.9035520393735632, "eval_loss": 0.3969292640686035, "eval_runtime": 218.8787, "eval_samples_per_second": 126.344, "eval_steps_per_second": 3.952, "step": 92500 }, { "epoch": 20.95538530869761, "grad_norm": 1.8161870241165161, "learning_rate": 7.000000000000001e-06, "loss": 0.3911, "step": 93000 }, { "epoch": 20.95538530869761, "eval_accuracy": 0.9034115695768419, "eval_loss": 0.39153432846069336, "eval_runtime": 219.6974, "eval_samples_per_second": 125.873, "eval_steps_per_second": 3.937, "step": 93000 }, { "epoch": 21.06804867057233, "grad_norm": 1.7550774812698364, "learning_rate": 6.5000000000000004e-06, "loss": 0.3927, "step": 93500 }, { "epoch": 21.06804867057233, "eval_accuracy": 0.9035121668560334, "eval_loss": 0.39775171875953674, "eval_runtime": 221.0095, "eval_samples_per_second": 125.126, "eval_steps_per_second": 3.914, "step": 93500 }, { "epoch": 21.180712032447047, "grad_norm": 1.5582369565963745, "learning_rate": 6e-06, "loss": 0.3891, "step": 94000 }, { "epoch": 21.180712032447047, "eval_accuracy": 0.9037201879273532, "eval_loss": 0.3943246006965637, "eval_runtime": 220.3117, "eval_samples_per_second": 125.522, "eval_steps_per_second": 3.926, "step": 94000 }, { "epoch": 21.293375394321767, "grad_norm": 1.6729559898376465, "learning_rate": 5.500000000000001e-06, "loss": 0.3912, "step": 94500 }, { "epoch": 21.293375394321767, "eval_accuracy": 0.9036670141570837, "eval_loss": 0.39444249868392944, "eval_runtime": 219.9471, "eval_samples_per_second": 125.73, "eval_steps_per_second": 3.933, "step": 94500 }, { "epoch": 21.406038756196484, "grad_norm": 1.6871699094772339, "learning_rate": 5e-06, "loss": 0.3908, "step": 95000 }, { "epoch": 21.406038756196484, "eval_accuracy": 0.9037736297217607, "eval_loss": 0.39369192719459534, "eval_runtime": 219.9205, "eval_samples_per_second": 125.745, "eval_steps_per_second": 3.933, "step": 95000 }, { "epoch": 21.518702118071204, "grad_norm": 1.486741304397583, "learning_rate": 4.5e-06, "loss": 0.3902, "step": 95500 }, { "epoch": 21.518702118071204, "eval_accuracy": 0.9034302972672164, "eval_loss": 0.39573636651039124, "eval_runtime": 219.8759, "eval_samples_per_second": 125.771, "eval_steps_per_second": 3.934, "step": 95500 }, { "epoch": 21.63136547994592, "grad_norm": 1.8056081533432007, "learning_rate": 4.000000000000001e-06, "loss": 0.3891, "step": 96000 }, { "epoch": 21.63136547994592, "eval_accuracy": 0.9045647365783699, "eval_loss": 0.39023157954216003, "eval_runtime": 221.3034, "eval_samples_per_second": 124.96, "eval_steps_per_second": 3.909, "step": 96000 }, { "epoch": 21.74402884182064, "grad_norm": 1.552370309829712, "learning_rate": 3.5000000000000004e-06, "loss": 0.3894, "step": 96500 }, { "epoch": 21.74402884182064, "eval_accuracy": 0.9044615558398447, "eval_loss": 0.39400991797447205, "eval_runtime": 219.8746, "eval_samples_per_second": 125.772, "eval_steps_per_second": 3.934, "step": 96500 }, { "epoch": 21.85669220369536, "grad_norm": 1.506536841392517, "learning_rate": 3e-06, "loss": 0.3904, "step": 97000 }, { "epoch": 21.85669220369536, "eval_accuracy": 0.9044962394479266, "eval_loss": 0.390458881855011, "eval_runtime": 220.131, "eval_samples_per_second": 125.625, "eval_steps_per_second": 3.929, "step": 97000 }, { "epoch": 21.969355565570076, "grad_norm": 1.6080279350280762, "learning_rate": 2.5e-06, "loss": 0.3882, "step": 97500 }, { "epoch": 21.969355565570076, "eval_accuracy": 0.9043700852475594, "eval_loss": 0.39395132660865784, "eval_runtime": 220.1175, "eval_samples_per_second": 125.633, "eval_steps_per_second": 3.93, "step": 97500 }, { "epoch": 22.082018927444796, "grad_norm": 1.6551542282104492, "learning_rate": 2.0000000000000003e-06, "loss": 0.388, "step": 98000 }, { "epoch": 22.082018927444796, "eval_accuracy": 0.904642958920198, "eval_loss": 0.39477479457855225, "eval_runtime": 219.108, "eval_samples_per_second": 126.212, "eval_steps_per_second": 3.948, "step": 98000 }, { "epoch": 22.194682289319513, "grad_norm": 1.3376331329345703, "learning_rate": 1.5e-06, "loss": 0.3888, "step": 98500 }, { "epoch": 22.194682289319513, "eval_accuracy": 0.9042594879589607, "eval_loss": 0.39155128598213196, "eval_runtime": 221.2476, "eval_samples_per_second": 124.991, "eval_steps_per_second": 3.91, "step": 98500 }, { "epoch": 22.307345651194233, "grad_norm": 1.6391901969909668, "learning_rate": 1.0000000000000002e-06, "loss": 0.385, "step": 99000 }, { "epoch": 22.307345651194233, "eval_accuracy": 0.9047423169505552, "eval_loss": 0.3867943286895752, "eval_runtime": 220.9463, "eval_samples_per_second": 125.162, "eval_steps_per_second": 3.915, "step": 99000 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 23, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.346992290195046e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }