| { | |
| "best_metric": 0.3867943286895752, | |
| "best_model_checkpoint": "./model_fine-tune/glot/mbert/npi-Deva/checkpoint-99000", | |
| "epoch": 22.307345651194233, | |
| "eval_steps": 500, | |
| "global_step": 99000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.11266336187471834, | |
| "grad_norm": 2.965751886367798, | |
| "learning_rate": 9.95e-05, | |
| "loss": 1.2301, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.11266336187471834, | |
| "eval_accuracy": 0.773184921040326, | |
| "eval_loss": 1.0424206256866455, | |
| "eval_runtime": 220.277, | |
| "eval_samples_per_second": 125.542, | |
| "eval_steps_per_second": 3.927, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.22532672374943669, | |
| "grad_norm": 2.740410327911377, | |
| "learning_rate": 9.900000000000001e-05, | |
| "loss": 1.0543, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.22532672374943669, | |
| "eval_accuracy": 0.7909380298838988, | |
| "eval_loss": 0.9471855163574219, | |
| "eval_runtime": 220.5287, | |
| "eval_samples_per_second": 125.399, | |
| "eval_steps_per_second": 3.922, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.337990085624155, | |
| "grad_norm": 2.7863168716430664, | |
| "learning_rate": 9.850000000000001e-05, | |
| "loss": 0.9779, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.337990085624155, | |
| "eval_accuracy": 0.8025380313108481, | |
| "eval_loss": 0.8843335509300232, | |
| "eval_runtime": 220.1694, | |
| "eval_samples_per_second": 125.603, | |
| "eval_steps_per_second": 3.929, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45065344749887337, | |
| "grad_norm": 2.5414557456970215, | |
| "learning_rate": 9.8e-05, | |
| "loss": 0.926, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.45065344749887337, | |
| "eval_accuracy": 0.8101656739468518, | |
| "eval_loss": 0.8543083667755127, | |
| "eval_runtime": 220.7797, | |
| "eval_samples_per_second": 125.256, | |
| "eval_steps_per_second": 3.918, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5633168093735917, | |
| "grad_norm": 2.377737283706665, | |
| "learning_rate": 9.75e-05, | |
| "loss": 0.8865, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.5633168093735917, | |
| "eval_accuracy": 0.8173040702951884, | |
| "eval_loss": 0.810703456401825, | |
| "eval_runtime": 220.8366, | |
| "eval_samples_per_second": 125.224, | |
| "eval_steps_per_second": 3.917, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.67598017124831, | |
| "grad_norm": 2.3488988876342773, | |
| "learning_rate": 9.7e-05, | |
| "loss": 0.8609, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.67598017124831, | |
| "eval_accuracy": 0.821973743458109, | |
| "eval_loss": 0.7787520289421082, | |
| "eval_runtime": 221.2787, | |
| "eval_samples_per_second": 124.974, | |
| "eval_steps_per_second": 3.909, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7886435331230284, | |
| "grad_norm": 2.1220295429229736, | |
| "learning_rate": 9.65e-05, | |
| "loss": 0.8342, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.7886435331230284, | |
| "eval_accuracy": 0.8262216641689282, | |
| "eval_loss": 0.762144923210144, | |
| "eval_runtime": 220.8289, | |
| "eval_samples_per_second": 125.228, | |
| "eval_steps_per_second": 3.917, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9013068949977467, | |
| "grad_norm": 2.0968008041381836, | |
| "learning_rate": 9.6e-05, | |
| "loss": 0.819, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.9013068949977467, | |
| "eval_accuracy": 0.8298798163116309, | |
| "eval_loss": 0.7466955184936523, | |
| "eval_runtime": 220.013, | |
| "eval_samples_per_second": 125.693, | |
| "eval_steps_per_second": 3.932, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.0139702568724651, | |
| "grad_norm": 2.1498773097991943, | |
| "learning_rate": 9.55e-05, | |
| "loss": 0.7979, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.0139702568724651, | |
| "eval_accuracy": 0.8318434647099575, | |
| "eval_loss": 0.7348815202713013, | |
| "eval_runtime": 220.8229, | |
| "eval_samples_per_second": 125.232, | |
| "eval_steps_per_second": 3.917, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.1266336187471835, | |
| "grad_norm": 2.163381576538086, | |
| "learning_rate": 9.5e-05, | |
| "loss": 0.7814, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.1266336187471835, | |
| "eval_accuracy": 0.8349864178467281, | |
| "eval_loss": 0.7180664539337158, | |
| "eval_runtime": 219.7442, | |
| "eval_samples_per_second": 125.846, | |
| "eval_steps_per_second": 3.936, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.2392969806219019, | |
| "grad_norm": 2.431119680404663, | |
| "learning_rate": 9.449999999999999e-05, | |
| "loss": 0.7665, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.2392969806219019, | |
| "eval_accuracy": 0.8369151584278837, | |
| "eval_loss": 0.7159287333488464, | |
| "eval_runtime": 220.9495, | |
| "eval_samples_per_second": 125.16, | |
| "eval_steps_per_second": 3.915, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.35196034249662, | |
| "grad_norm": 2.2182135581970215, | |
| "learning_rate": 9.4e-05, | |
| "loss": 0.7555, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.35196034249662, | |
| "eval_accuracy": 0.8390386817390197, | |
| "eval_loss": 0.6999027132987976, | |
| "eval_runtime": 221.2257, | |
| "eval_samples_per_second": 125.004, | |
| "eval_steps_per_second": 3.91, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.4646237043713384, | |
| "grad_norm": 2.1569323539733887, | |
| "learning_rate": 9.350000000000001e-05, | |
| "loss": 0.7479, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.4646237043713384, | |
| "eval_accuracy": 0.8422681179825285, | |
| "eval_loss": 0.689199686050415, | |
| "eval_runtime": 222.1646, | |
| "eval_samples_per_second": 124.475, | |
| "eval_steps_per_second": 3.894, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.5772870662460567, | |
| "grad_norm": 2.1323976516723633, | |
| "learning_rate": 9.300000000000001e-05, | |
| "loss": 0.733, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.5772870662460567, | |
| "eval_accuracy": 0.8439326355101854, | |
| "eval_loss": 0.6770957112312317, | |
| "eval_runtime": 220.0047, | |
| "eval_samples_per_second": 125.697, | |
| "eval_steps_per_second": 3.932, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.6899504281207751, | |
| "grad_norm": 5.134857177734375, | |
| "learning_rate": 9.250000000000001e-05, | |
| "loss": 0.7254, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.6899504281207751, | |
| "eval_accuracy": 0.8453398865939418, | |
| "eval_loss": 0.6668263077735901, | |
| "eval_runtime": 220.3226, | |
| "eval_samples_per_second": 125.516, | |
| "eval_steps_per_second": 3.926, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.8026137899954935, | |
| "grad_norm": 2.0616443157196045, | |
| "learning_rate": 9.200000000000001e-05, | |
| "loss": 0.7155, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.8026137899954935, | |
| "eval_accuracy": 0.8463415002486427, | |
| "eval_loss": 0.6613638997077942, | |
| "eval_runtime": 221.4149, | |
| "eval_samples_per_second": 124.897, | |
| "eval_steps_per_second": 3.907, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.9152771518702119, | |
| "grad_norm": 1.8310041427612305, | |
| "learning_rate": 9.15e-05, | |
| "loss": 0.7057, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.9152771518702119, | |
| "eval_accuracy": 0.8482940895875467, | |
| "eval_loss": 0.6528915762901306, | |
| "eval_runtime": 222.056, | |
| "eval_samples_per_second": 124.536, | |
| "eval_steps_per_second": 3.895, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.0279405137449302, | |
| "grad_norm": 1.93686842918396, | |
| "learning_rate": 9.1e-05, | |
| "loss": 0.7005, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.0279405137449302, | |
| "eval_accuracy": 0.849507663539711, | |
| "eval_loss": 0.6522949934005737, | |
| "eval_runtime": 221.8006, | |
| "eval_samples_per_second": 124.68, | |
| "eval_steps_per_second": 3.9, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.1406038756196484, | |
| "grad_norm": 6.06415319442749, | |
| "learning_rate": 9.05e-05, | |
| "loss": 0.6884, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.1406038756196484, | |
| "eval_accuracy": 0.8502759566677828, | |
| "eval_loss": 0.6491975784301758, | |
| "eval_runtime": 221.9876, | |
| "eval_samples_per_second": 124.575, | |
| "eval_steps_per_second": 3.897, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.253267237494367, | |
| "grad_norm": 1.9235719442367554, | |
| "learning_rate": 9e-05, | |
| "loss": 0.6821, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.253267237494367, | |
| "eval_accuracy": 0.8524909295282275, | |
| "eval_loss": 0.6343050599098206, | |
| "eval_runtime": 222.2817, | |
| "eval_samples_per_second": 124.41, | |
| "eval_steps_per_second": 3.891, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.365930599369085, | |
| "grad_norm": 1.8421759605407715, | |
| "learning_rate": 8.950000000000001e-05, | |
| "loss": 0.6767, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.365930599369085, | |
| "eval_accuracy": 0.8534436334949597, | |
| "eval_loss": 0.623904287815094, | |
| "eval_runtime": 221.9784, | |
| "eval_samples_per_second": 124.58, | |
| "eval_steps_per_second": 3.897, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.4785939612438037, | |
| "grad_norm": 1.9507330656051636, | |
| "learning_rate": 8.900000000000001e-05, | |
| "loss": 0.6792, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.4785939612438037, | |
| "eval_accuracy": 0.8552298873542783, | |
| "eval_loss": 0.6220438480377197, | |
| "eval_runtime": 221.6861, | |
| "eval_samples_per_second": 124.744, | |
| "eval_steps_per_second": 3.902, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.591257323118522, | |
| "grad_norm": 2.11086106300354, | |
| "learning_rate": 8.850000000000001e-05, | |
| "loss": 0.668, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.591257323118522, | |
| "eval_accuracy": 0.8557371543230742, | |
| "eval_loss": 0.6222216486930847, | |
| "eval_runtime": 221.2659, | |
| "eval_samples_per_second": 124.981, | |
| "eval_steps_per_second": 3.909, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.70392068499324, | |
| "grad_norm": 2.1847715377807617, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 0.6636, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.70392068499324, | |
| "eval_accuracy": 0.8559949812795903, | |
| "eval_loss": 0.6197636127471924, | |
| "eval_runtime": 222.3204, | |
| "eval_samples_per_second": 124.388, | |
| "eval_steps_per_second": 3.891, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.8165840468679586, | |
| "grad_norm": 2.1351499557495117, | |
| "learning_rate": 8.75e-05, | |
| "loss": 0.6576, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.8165840468679586, | |
| "eval_accuracy": 0.8577423671742627, | |
| "eval_loss": 0.6103814840316772, | |
| "eval_runtime": 222.1201, | |
| "eval_samples_per_second": 124.5, | |
| "eval_steps_per_second": 3.894, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.9292474087426768, | |
| "grad_norm": 3.9510111808776855, | |
| "learning_rate": 8.7e-05, | |
| "loss": 0.6488, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.9292474087426768, | |
| "eval_accuracy": 0.858561338833408, | |
| "eval_loss": 0.6049174070358276, | |
| "eval_runtime": 221.9998, | |
| "eval_samples_per_second": 124.568, | |
| "eval_steps_per_second": 3.896, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.0419107706173953, | |
| "grad_norm": 1.8234397172927856, | |
| "learning_rate": 8.65e-05, | |
| "loss": 0.6438, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.0419107706173953, | |
| "eval_accuracy": 0.8591052959636053, | |
| "eval_loss": 0.6051846742630005, | |
| "eval_runtime": 220.9713, | |
| "eval_samples_per_second": 125.147, | |
| "eval_steps_per_second": 3.915, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.1545741324921135, | |
| "grad_norm": 1.9275134801864624, | |
| "learning_rate": 8.6e-05, | |
| "loss": 0.6369, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.1545741324921135, | |
| "eval_accuracy": 0.8599297280864646, | |
| "eval_loss": 0.6021236181259155, | |
| "eval_runtime": 222.1682, | |
| "eval_samples_per_second": 124.473, | |
| "eval_steps_per_second": 3.893, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.267237494366832, | |
| "grad_norm": 2.4342575073242188, | |
| "learning_rate": 8.55e-05, | |
| "loss": 0.6375, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 3.267237494366832, | |
| "eval_accuracy": 0.8612232782302242, | |
| "eval_loss": 0.5935059785842896, | |
| "eval_runtime": 221.4028, | |
| "eval_samples_per_second": 124.904, | |
| "eval_steps_per_second": 3.907, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 3.3799008562415502, | |
| "grad_norm": 1.8208547830581665, | |
| "learning_rate": 8.5e-05, | |
| "loss": 0.6327, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.3799008562415502, | |
| "eval_accuracy": 0.8619705169680766, | |
| "eval_loss": 0.5865727663040161, | |
| "eval_runtime": 221.4519, | |
| "eval_samples_per_second": 124.876, | |
| "eval_steps_per_second": 3.906, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.492564218116269, | |
| "grad_norm": 1.8497122526168823, | |
| "learning_rate": 8.450000000000001e-05, | |
| "loss": 0.6289, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 3.492564218116269, | |
| "eval_accuracy": 0.8624703434485368, | |
| "eval_loss": 0.5854940414428711, | |
| "eval_runtime": 222.074, | |
| "eval_samples_per_second": 124.526, | |
| "eval_steps_per_second": 3.895, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 3.605227579990987, | |
| "grad_norm": 1.7389825582504272, | |
| "learning_rate": 8.4e-05, | |
| "loss": 0.6231, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 3.605227579990987, | |
| "eval_accuracy": 0.8635307164001665, | |
| "eval_loss": 0.5809486508369446, | |
| "eval_runtime": 222.3436, | |
| "eval_samples_per_second": 124.375, | |
| "eval_steps_per_second": 3.89, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 3.717890941865705, | |
| "grad_norm": 1.7109190225601196, | |
| "learning_rate": 8.35e-05, | |
| "loss": 0.6193, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 3.717890941865705, | |
| "eval_accuracy": 0.8642588913962003, | |
| "eval_loss": 0.5757493376731873, | |
| "eval_runtime": 220.862, | |
| "eval_samples_per_second": 125.209, | |
| "eval_steps_per_second": 3.916, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 3.8305543037404237, | |
| "grad_norm": 2.09114408493042, | |
| "learning_rate": 8.3e-05, | |
| "loss": 0.619, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.8305543037404237, | |
| "eval_accuracy": 0.8644031427528578, | |
| "eval_loss": 0.5797725319862366, | |
| "eval_runtime": 220.9835, | |
| "eval_samples_per_second": 125.141, | |
| "eval_steps_per_second": 3.914, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.943217665615142, | |
| "grad_norm": 6.745112419128418, | |
| "learning_rate": 8.25e-05, | |
| "loss": 0.6127, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 3.943217665615142, | |
| "eval_accuracy": 0.8645245282957764, | |
| "eval_loss": 0.5759025812149048, | |
| "eval_runtime": 222.2291, | |
| "eval_samples_per_second": 124.439, | |
| "eval_steps_per_second": 3.892, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 4.0558810274898605, | |
| "grad_norm": 1.7710591554641724, | |
| "learning_rate": 8.2e-05, | |
| "loss": 0.6081, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.0558810274898605, | |
| "eval_accuracy": 0.8658915432042757, | |
| "eval_loss": 0.5714759230613708, | |
| "eval_runtime": 221.6135, | |
| "eval_samples_per_second": 124.785, | |
| "eval_steps_per_second": 3.903, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.168544389364579, | |
| "grad_norm": 1.8267593383789062, | |
| "learning_rate": 8.15e-05, | |
| "loss": 0.5988, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 4.168544389364579, | |
| "eval_accuracy": 0.8665697779685045, | |
| "eval_loss": 0.5671255588531494, | |
| "eval_runtime": 221.0373, | |
| "eval_samples_per_second": 125.11, | |
| "eval_steps_per_second": 3.913, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 4.281207751239297, | |
| "grad_norm": 1.6686463356018066, | |
| "learning_rate": 8.1e-05, | |
| "loss": 0.5981, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 4.281207751239297, | |
| "eval_accuracy": 0.8667210799508446, | |
| "eval_loss": 0.5654014348983765, | |
| "eval_runtime": 221.1716, | |
| "eval_samples_per_second": 125.034, | |
| "eval_steps_per_second": 3.911, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 4.393871113114015, | |
| "grad_norm": 1.6965349912643433, | |
| "learning_rate": 8.05e-05, | |
| "loss": 0.599, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 4.393871113114015, | |
| "eval_accuracy": 0.8677269725072129, | |
| "eval_loss": 0.5655470490455627, | |
| "eval_runtime": 221.4343, | |
| "eval_samples_per_second": 124.886, | |
| "eval_steps_per_second": 3.906, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 4.506534474988734, | |
| "grad_norm": 1.653952956199646, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5976, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 4.506534474988734, | |
| "eval_accuracy": 0.8685987876288259, | |
| "eval_loss": 0.5560412406921387, | |
| "eval_runtime": 220.5715, | |
| "eval_samples_per_second": 125.374, | |
| "eval_steps_per_second": 3.922, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 4.619197836863452, | |
| "grad_norm": 1.7568910121917725, | |
| "learning_rate": 7.950000000000001e-05, | |
| "loss": 0.5941, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 4.619197836863452, | |
| "eval_accuracy": 0.868412802308659, | |
| "eval_loss": 0.5624808669090271, | |
| "eval_runtime": 220.7945, | |
| "eval_samples_per_second": 125.248, | |
| "eval_steps_per_second": 3.918, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 4.73186119873817, | |
| "grad_norm": 1.7545663118362427, | |
| "learning_rate": 7.900000000000001e-05, | |
| "loss": 0.5871, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 4.73186119873817, | |
| "eval_accuracy": 0.8700149406874658, | |
| "eval_loss": 0.5546574592590332, | |
| "eval_runtime": 220.2428, | |
| "eval_samples_per_second": 125.561, | |
| "eval_steps_per_second": 3.927, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 4.844524560612888, | |
| "grad_norm": 1.9459997415542603, | |
| "learning_rate": 7.850000000000001e-05, | |
| "loss": 0.5891, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 4.844524560612888, | |
| "eval_accuracy": 0.8703311716376315, | |
| "eval_loss": 0.5456222295761108, | |
| "eval_runtime": 220.5867, | |
| "eval_samples_per_second": 125.366, | |
| "eval_steps_per_second": 3.921, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 4.957187922487607, | |
| "grad_norm": 1.9034132957458496, | |
| "learning_rate": 7.800000000000001e-05, | |
| "loss": 0.5828, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 4.957187922487607, | |
| "eval_accuracy": 0.8704027728365514, | |
| "eval_loss": 0.549776554107666, | |
| "eval_runtime": 221.4908, | |
| "eval_samples_per_second": 124.854, | |
| "eval_steps_per_second": 3.905, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 5.069851284362326, | |
| "grad_norm": 1.881596565246582, | |
| "learning_rate": 7.75e-05, | |
| "loss": 0.5767, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 5.069851284362326, | |
| "eval_accuracy": 0.8711589106147363, | |
| "eval_loss": 0.5461272597312927, | |
| "eval_runtime": 220.457, | |
| "eval_samples_per_second": 125.439, | |
| "eval_steps_per_second": 3.924, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 5.182514646237044, | |
| "grad_norm": 1.9157260656356812, | |
| "learning_rate": 7.7e-05, | |
| "loss": 0.5731, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 5.182514646237044, | |
| "eval_accuracy": 0.871975417070376, | |
| "eval_loss": 0.5400785207748413, | |
| "eval_runtime": 220.8692, | |
| "eval_samples_per_second": 125.205, | |
| "eval_steps_per_second": 3.916, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 5.295178008111762, | |
| "grad_norm": 1.9823201894760132, | |
| "learning_rate": 7.65e-05, | |
| "loss": 0.5736, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 5.295178008111762, | |
| "eval_accuracy": 0.8723751389743424, | |
| "eval_loss": 0.5401638746261597, | |
| "eval_runtime": 221.6042, | |
| "eval_samples_per_second": 124.79, | |
| "eval_steps_per_second": 3.903, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 5.40784136998648, | |
| "grad_norm": 1.905613660812378, | |
| "learning_rate": 7.6e-05, | |
| "loss": 0.5747, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 5.40784136998648, | |
| "eval_accuracy": 0.8724923660478054, | |
| "eval_loss": 0.5441656112670898, | |
| "eval_runtime": 221.3067, | |
| "eval_samples_per_second": 124.958, | |
| "eval_steps_per_second": 3.909, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 5.520504731861199, | |
| "grad_norm": 1.5278126001358032, | |
| "learning_rate": 7.55e-05, | |
| "loss": 0.5681, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 5.520504731861199, | |
| "eval_accuracy": 0.8728878650306285, | |
| "eval_loss": 0.538100004196167, | |
| "eval_runtime": 222.0369, | |
| "eval_samples_per_second": 124.547, | |
| "eval_steps_per_second": 3.896, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 5.633168093735917, | |
| "grad_norm": 1.6478660106658936, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.5658, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 5.633168093735917, | |
| "eval_accuracy": 0.8736624848239579, | |
| "eval_loss": 0.5357881784439087, | |
| "eval_runtime": 220.4147, | |
| "eval_samples_per_second": 125.463, | |
| "eval_steps_per_second": 3.924, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 5.745831455610635, | |
| "grad_norm": 3.0473523139953613, | |
| "learning_rate": 7.450000000000001e-05, | |
| "loss": 0.5644, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 5.745831455610635, | |
| "eval_accuracy": 0.8743903767129565, | |
| "eval_loss": 0.5344362854957581, | |
| "eval_runtime": 221.5481, | |
| "eval_samples_per_second": 124.822, | |
| "eval_steps_per_second": 3.904, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 5.8584948174853535, | |
| "grad_norm": 1.8053028583526611, | |
| "learning_rate": 7.4e-05, | |
| "loss": 0.5622, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 5.8584948174853535, | |
| "eval_accuracy": 0.874178054098396, | |
| "eval_loss": 0.5315510630607605, | |
| "eval_runtime": 221.4537, | |
| "eval_samples_per_second": 124.875, | |
| "eval_steps_per_second": 3.906, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 5.9711581793600725, | |
| "grad_norm": 1.5863131284713745, | |
| "learning_rate": 7.35e-05, | |
| "loss": 0.5578, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 5.9711581793600725, | |
| "eval_accuracy": 0.8753070050808498, | |
| "eval_loss": 0.5271232724189758, | |
| "eval_runtime": 221.5103, | |
| "eval_samples_per_second": 124.843, | |
| "eval_steps_per_second": 3.905, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 6.083821541234791, | |
| "grad_norm": 1.7924689054489136, | |
| "learning_rate": 7.3e-05, | |
| "loss": 0.5546, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 6.083821541234791, | |
| "eval_accuracy": 0.8749559789605048, | |
| "eval_loss": 0.5305372476577759, | |
| "eval_runtime": 220.6828, | |
| "eval_samples_per_second": 125.311, | |
| "eval_steps_per_second": 3.92, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 6.196484903109509, | |
| "grad_norm": 1.6176671981811523, | |
| "learning_rate": 7.25e-05, | |
| "loss": 0.5553, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 6.196484903109509, | |
| "eval_accuracy": 0.8752024294778373, | |
| "eval_loss": 0.5255776047706604, | |
| "eval_runtime": 220.8919, | |
| "eval_samples_per_second": 125.192, | |
| "eval_steps_per_second": 3.916, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 6.309148264984227, | |
| "grad_norm": 1.855047583580017, | |
| "learning_rate": 7.2e-05, | |
| "loss": 0.5506, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 6.309148264984227, | |
| "eval_accuracy": 0.8761331460452507, | |
| "eval_loss": 0.52358478307724, | |
| "eval_runtime": 220.7028, | |
| "eval_samples_per_second": 125.3, | |
| "eval_steps_per_second": 3.919, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 6.421811626858945, | |
| "grad_norm": 1.6553348302841187, | |
| "learning_rate": 7.15e-05, | |
| "loss": 0.5439, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 6.421811626858945, | |
| "eval_accuracy": 0.8768096662621753, | |
| "eval_loss": 0.5175614953041077, | |
| "eval_runtime": 221.3868, | |
| "eval_samples_per_second": 124.913, | |
| "eval_steps_per_second": 3.907, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 6.534474988733664, | |
| "grad_norm": 1.8099743127822876, | |
| "learning_rate": 7.1e-05, | |
| "loss": 0.5486, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 6.534474988733664, | |
| "eval_accuracy": 0.8767345488093528, | |
| "eval_loss": 0.5191013216972351, | |
| "eval_runtime": 221.0646, | |
| "eval_samples_per_second": 125.095, | |
| "eval_steps_per_second": 3.913, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 6.647138350608382, | |
| "grad_norm": 1.7723827362060547, | |
| "learning_rate": 7.05e-05, | |
| "loss": 0.5442, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 6.647138350608382, | |
| "eval_accuracy": 0.8777180592418201, | |
| "eval_loss": 0.5211535096168518, | |
| "eval_runtime": 222.034, | |
| "eval_samples_per_second": 124.548, | |
| "eval_steps_per_second": 3.896, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 6.7598017124831005, | |
| "grad_norm": 1.7134077548980713, | |
| "learning_rate": 7e-05, | |
| "loss": 0.5412, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 6.7598017124831005, | |
| "eval_accuracy": 0.8771853054768167, | |
| "eval_loss": 0.5161250829696655, | |
| "eval_runtime": 221.7362, | |
| "eval_samples_per_second": 124.716, | |
| "eval_steps_per_second": 3.901, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 6.872465074357819, | |
| "grad_norm": 1.7683045864105225, | |
| "learning_rate": 6.95e-05, | |
| "loss": 0.5402, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 6.872465074357819, | |
| "eval_accuracy": 0.8773839402820733, | |
| "eval_loss": 0.5139411687850952, | |
| "eval_runtime": 220.8209, | |
| "eval_samples_per_second": 125.233, | |
| "eval_steps_per_second": 3.917, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 6.985128436232538, | |
| "grad_norm": 1.8624660968780518, | |
| "learning_rate": 6.9e-05, | |
| "loss": 0.5395, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 6.985128436232538, | |
| "eval_accuracy": 0.8783624777319803, | |
| "eval_loss": 0.5147821307182312, | |
| "eval_runtime": 221.2924, | |
| "eval_samples_per_second": 124.966, | |
| "eval_steps_per_second": 3.909, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 7.097791798107256, | |
| "grad_norm": 1.6134588718414307, | |
| "learning_rate": 6.850000000000001e-05, | |
| "loss": 0.5323, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 7.097791798107256, | |
| "eval_accuracy": 0.8783227617479554, | |
| "eval_loss": 0.5111725330352783, | |
| "eval_runtime": 221.33, | |
| "eval_samples_per_second": 124.945, | |
| "eval_steps_per_second": 3.908, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 7.210455159981974, | |
| "grad_norm": 1.8190521001815796, | |
| "learning_rate": 6.800000000000001e-05, | |
| "loss": 0.5341, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 7.210455159981974, | |
| "eval_accuracy": 0.8789992011172492, | |
| "eval_loss": 0.5084385275840759, | |
| "eval_runtime": 221.6067, | |
| "eval_samples_per_second": 124.789, | |
| "eval_steps_per_second": 3.903, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 7.323118521856692, | |
| "grad_norm": 1.7497199773788452, | |
| "learning_rate": 6.750000000000001e-05, | |
| "loss": 0.5325, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 7.323118521856692, | |
| "eval_accuracy": 0.8800775404890228, | |
| "eval_loss": 0.5039363503456116, | |
| "eval_runtime": 222.1191, | |
| "eval_samples_per_second": 124.501, | |
| "eval_steps_per_second": 3.894, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 7.43578188373141, | |
| "grad_norm": 1.6325268745422363, | |
| "learning_rate": 6.7e-05, | |
| "loss": 0.5309, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 7.43578188373141, | |
| "eval_accuracy": 0.8801297464169966, | |
| "eval_loss": 0.505262017250061, | |
| "eval_runtime": 221.5433, | |
| "eval_samples_per_second": 124.824, | |
| "eval_steps_per_second": 3.904, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 7.548445245606128, | |
| "grad_norm": 1.7531828880310059, | |
| "learning_rate": 6.65e-05, | |
| "loss": 0.5283, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 7.548445245606128, | |
| "eval_accuracy": 0.8802065110814512, | |
| "eval_loss": 0.5030723810195923, | |
| "eval_runtime": 221.3827, | |
| "eval_samples_per_second": 124.915, | |
| "eval_steps_per_second": 3.907, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 7.661108607480847, | |
| "grad_norm": 1.7174723148345947, | |
| "learning_rate": 6.6e-05, | |
| "loss": 0.5254, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 7.661108607480847, | |
| "eval_accuracy": 0.8808179417817528, | |
| "eval_loss": 0.5008535385131836, | |
| "eval_runtime": 220.9595, | |
| "eval_samples_per_second": 125.154, | |
| "eval_steps_per_second": 3.915, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 7.773771969355566, | |
| "grad_norm": 1.4874796867370605, | |
| "learning_rate": 6.55e-05, | |
| "loss": 0.5217, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 7.773771969355566, | |
| "eval_accuracy": 0.8805426353661477, | |
| "eval_loss": 0.5023674368858337, | |
| "eval_runtime": 221.8005, | |
| "eval_samples_per_second": 124.68, | |
| "eval_steps_per_second": 3.9, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 7.886435331230284, | |
| "grad_norm": 1.721684455871582, | |
| "learning_rate": 6.500000000000001e-05, | |
| "loss": 0.5268, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 7.886435331230284, | |
| "eval_accuracy": 0.8809166125769631, | |
| "eval_loss": 0.49913424253463745, | |
| "eval_runtime": 221.9778, | |
| "eval_samples_per_second": 124.58, | |
| "eval_steps_per_second": 3.897, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 7.999098693105002, | |
| "grad_norm": 1.6311215162277222, | |
| "learning_rate": 6.450000000000001e-05, | |
| "loss": 0.5279, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 7.999098693105002, | |
| "eval_accuracy": 0.8812218745311298, | |
| "eval_loss": 0.49702906608581543, | |
| "eval_runtime": 221.8919, | |
| "eval_samples_per_second": 124.628, | |
| "eval_steps_per_second": 3.898, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 8.111762054979721, | |
| "grad_norm": 1.843680739402771, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 0.5113, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 8.111762054979721, | |
| "eval_accuracy": 0.8819420345077154, | |
| "eval_loss": 0.4924590289592743, | |
| "eval_runtime": 220.9686, | |
| "eval_samples_per_second": 125.149, | |
| "eval_steps_per_second": 3.915, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 8.224425416854439, | |
| "grad_norm": 2.186274290084839, | |
| "learning_rate": 6.35e-05, | |
| "loss": 0.519, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 8.224425416854439, | |
| "eval_accuracy": 0.8821893135016634, | |
| "eval_loss": 0.49259641766548157, | |
| "eval_runtime": 221.6028, | |
| "eval_samples_per_second": 124.791, | |
| "eval_steps_per_second": 3.903, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 8.337088778729157, | |
| "grad_norm": 1.6842992305755615, | |
| "learning_rate": 6.3e-05, | |
| "loss": 0.5142, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 8.337088778729157, | |
| "eval_accuracy": 0.8831766874688345, | |
| "eval_loss": 0.49090540409088135, | |
| "eval_runtime": 221.5262, | |
| "eval_samples_per_second": 124.834, | |
| "eval_steps_per_second": 3.905, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 8.449752140603875, | |
| "grad_norm": 1.69620943069458, | |
| "learning_rate": 6.25e-05, | |
| "loss": 0.5118, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 8.449752140603875, | |
| "eval_accuracy": 0.8829115546267233, | |
| "eval_loss": 0.48741188645362854, | |
| "eval_runtime": 220.7417, | |
| "eval_samples_per_second": 125.278, | |
| "eval_steps_per_second": 3.919, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 8.562415502478594, | |
| "grad_norm": 1.7428566217422485, | |
| "learning_rate": 6.2e-05, | |
| "loss": 0.513, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 8.562415502478594, | |
| "eval_accuracy": 0.883224147742304, | |
| "eval_loss": 0.4888308644294739, | |
| "eval_runtime": 221.978, | |
| "eval_samples_per_second": 124.58, | |
| "eval_steps_per_second": 3.897, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 8.675078864353312, | |
| "grad_norm": 1.7894220352172852, | |
| "learning_rate": 6.15e-05, | |
| "loss": 0.5158, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 8.675078864353312, | |
| "eval_accuracy": 0.8831726096386725, | |
| "eval_loss": 0.48892539739608765, | |
| "eval_runtime": 221.6541, | |
| "eval_samples_per_second": 124.762, | |
| "eval_steps_per_second": 3.902, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 8.78774222622803, | |
| "grad_norm": 1.8389184474945068, | |
| "learning_rate": 6.1e-05, | |
| "loss": 0.5074, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 8.78774222622803, | |
| "eval_accuracy": 0.8836604617832933, | |
| "eval_loss": 0.4904680550098419, | |
| "eval_runtime": 220.5534, | |
| "eval_samples_per_second": 125.385, | |
| "eval_steps_per_second": 3.922, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 8.90040558810275, | |
| "grad_norm": 1.6285669803619385, | |
| "learning_rate": 6.05e-05, | |
| "loss": 0.5115, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 8.90040558810275, | |
| "eval_accuracy": 0.8834682744822249, | |
| "eval_loss": 0.4873930513858795, | |
| "eval_runtime": 221.8348, | |
| "eval_samples_per_second": 124.66, | |
| "eval_steps_per_second": 3.899, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 9.013068949977468, | |
| "grad_norm": 1.892903447151184, | |
| "learning_rate": 6e-05, | |
| "loss": 0.506, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 9.013068949977468, | |
| "eval_accuracy": 0.8844121521679462, | |
| "eval_loss": 0.48553282022476196, | |
| "eval_runtime": 221.0439, | |
| "eval_samples_per_second": 125.106, | |
| "eval_steps_per_second": 3.913, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 9.125732311852186, | |
| "grad_norm": 1.5161460638046265, | |
| "learning_rate": 5.95e-05, | |
| "loss": 0.5006, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 9.125732311852186, | |
| "eval_accuracy": 0.8845845321702142, | |
| "eval_loss": 0.4854166805744171, | |
| "eval_runtime": 221.5376, | |
| "eval_samples_per_second": 124.828, | |
| "eval_steps_per_second": 3.905, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 9.238395673726904, | |
| "grad_norm": 1.6559338569641113, | |
| "learning_rate": 5.9e-05, | |
| "loss": 0.4998, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 9.238395673726904, | |
| "eval_accuracy": 0.8846773258713508, | |
| "eval_loss": 0.47937873005867004, | |
| "eval_runtime": 221.7432, | |
| "eval_samples_per_second": 124.712, | |
| "eval_steps_per_second": 3.901, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 9.351059035601622, | |
| "grad_norm": 1.5425843000411987, | |
| "learning_rate": 5.85e-05, | |
| "loss": 0.4993, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 9.351059035601622, | |
| "eval_accuracy": 0.8852505184740784, | |
| "eval_loss": 0.47994357347488403, | |
| "eval_runtime": 220.8861, | |
| "eval_samples_per_second": 125.196, | |
| "eval_steps_per_second": 3.916, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 9.46372239747634, | |
| "grad_norm": 1.6957345008850098, | |
| "learning_rate": 5.8e-05, | |
| "loss": 0.4978, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 9.46372239747634, | |
| "eval_accuracy": 0.8847172732012654, | |
| "eval_loss": 0.48131656646728516, | |
| "eval_runtime": 222.0591, | |
| "eval_samples_per_second": 124.534, | |
| "eval_steps_per_second": 3.895, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 9.576385759351059, | |
| "grad_norm": 1.9139741659164429, | |
| "learning_rate": 5.7499999999999995e-05, | |
| "loss": 0.4989, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 9.576385759351059, | |
| "eval_accuracy": 0.8862352978048973, | |
| "eval_loss": 0.4748667776584625, | |
| "eval_runtime": 221.9766, | |
| "eval_samples_per_second": 124.581, | |
| "eval_steps_per_second": 3.897, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 9.689049121225777, | |
| "grad_norm": 1.770585536956787, | |
| "learning_rate": 5.6999999999999996e-05, | |
| "loss": 0.4974, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 9.689049121225777, | |
| "eval_accuracy": 0.8855722252421147, | |
| "eval_loss": 0.4763648211956024, | |
| "eval_runtime": 220.554, | |
| "eval_samples_per_second": 125.384, | |
| "eval_steps_per_second": 3.922, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 9.801712483100495, | |
| "grad_norm": 1.6551371812820435, | |
| "learning_rate": 5.65e-05, | |
| "loss": 0.4978, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 9.801712483100495, | |
| "eval_accuracy": 0.8858765050235756, | |
| "eval_loss": 0.47770920395851135, | |
| "eval_runtime": 221.8932, | |
| "eval_samples_per_second": 124.628, | |
| "eval_steps_per_second": 3.898, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 9.914375844975215, | |
| "grad_norm": 1.6118969917297363, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 0.4942, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 9.914375844975215, | |
| "eval_accuracy": 0.8865845660569847, | |
| "eval_loss": 0.47676002979278564, | |
| "eval_runtime": 221.7004, | |
| "eval_samples_per_second": 124.736, | |
| "eval_steps_per_second": 3.902, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 10.027039206849933, | |
| "grad_norm": 1.8588035106658936, | |
| "learning_rate": 5.550000000000001e-05, | |
| "loss": 0.4955, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 10.027039206849933, | |
| "eval_accuracy": 0.8870998796760368, | |
| "eval_loss": 0.47594934701919556, | |
| "eval_runtime": 221.976, | |
| "eval_samples_per_second": 124.581, | |
| "eval_steps_per_second": 3.897, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 10.139702568724651, | |
| "grad_norm": 1.6966643333435059, | |
| "learning_rate": 5.500000000000001e-05, | |
| "loss": 0.489, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 10.139702568724651, | |
| "eval_accuracy": 0.8869448016018396, | |
| "eval_loss": 0.477344274520874, | |
| "eval_runtime": 221.5008, | |
| "eval_samples_per_second": 124.848, | |
| "eval_steps_per_second": 3.905, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 10.25236593059937, | |
| "grad_norm": 1.7615017890930176, | |
| "learning_rate": 5.45e-05, | |
| "loss": 0.4849, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 10.25236593059937, | |
| "eval_accuracy": 0.8868306586288885, | |
| "eval_loss": 0.4725435972213745, | |
| "eval_runtime": 221.7608, | |
| "eval_samples_per_second": 124.702, | |
| "eval_steps_per_second": 3.901, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 10.365029292474087, | |
| "grad_norm": 1.7889434099197388, | |
| "learning_rate": 5.4000000000000005e-05, | |
| "loss": 0.4818, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 10.365029292474087, | |
| "eval_accuracy": 0.887833900017014, | |
| "eval_loss": 0.4671822190284729, | |
| "eval_runtime": 222.1274, | |
| "eval_samples_per_second": 124.496, | |
| "eval_steps_per_second": 3.894, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 10.477692654348806, | |
| "grad_norm": 1.7761868238449097, | |
| "learning_rate": 5.3500000000000006e-05, | |
| "loss": 0.4864, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 10.477692654348806, | |
| "eval_accuracy": 0.887966177980069, | |
| "eval_loss": 0.46516725420951843, | |
| "eval_runtime": 221.4768, | |
| "eval_samples_per_second": 124.862, | |
| "eval_steps_per_second": 3.906, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 10.590356016223524, | |
| "grad_norm": 1.7193918228149414, | |
| "learning_rate": 5.300000000000001e-05, | |
| "loss": 0.4854, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 10.590356016223524, | |
| "eval_accuracy": 0.8878875431862944, | |
| "eval_loss": 0.4649243652820587, | |
| "eval_runtime": 221.9203, | |
| "eval_samples_per_second": 124.612, | |
| "eval_steps_per_second": 3.898, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 10.703019378098242, | |
| "grad_norm": 1.681303858757019, | |
| "learning_rate": 5.25e-05, | |
| "loss": 0.4842, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 10.703019378098242, | |
| "eval_accuracy": 0.8880860212733241, | |
| "eval_loss": 0.4627833366394043, | |
| "eval_runtime": 220.7325, | |
| "eval_samples_per_second": 125.283, | |
| "eval_steps_per_second": 3.919, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 10.81568273997296, | |
| "grad_norm": 1.689483642578125, | |
| "learning_rate": 5.2000000000000004e-05, | |
| "loss": 0.4853, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 10.81568273997296, | |
| "eval_accuracy": 0.8884850427627177, | |
| "eval_loss": 0.4670482873916626, | |
| "eval_runtime": 222.2087, | |
| "eval_samples_per_second": 124.451, | |
| "eval_steps_per_second": 3.893, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 10.92834610184768, | |
| "grad_norm": 1.6489872932434082, | |
| "learning_rate": 5.1500000000000005e-05, | |
| "loss": 0.4825, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 10.92834610184768, | |
| "eval_accuracy": 0.8886944679602043, | |
| "eval_loss": 0.4673362970352173, | |
| "eval_runtime": 221.8366, | |
| "eval_samples_per_second": 124.659, | |
| "eval_steps_per_second": 3.899, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 11.041009463722398, | |
| "grad_norm": 1.6207237243652344, | |
| "learning_rate": 5.1000000000000006e-05, | |
| "loss": 0.4783, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 11.041009463722398, | |
| "eval_accuracy": 0.8887642317859056, | |
| "eval_loss": 0.46382275223731995, | |
| "eval_runtime": 222.0203, | |
| "eval_samples_per_second": 124.556, | |
| "eval_steps_per_second": 3.896, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 11.153672825597116, | |
| "grad_norm": 1.7849069833755493, | |
| "learning_rate": 5.05e-05, | |
| "loss": 0.4755, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 11.153672825597116, | |
| "eval_accuracy": 0.8889988225245398, | |
| "eval_loss": 0.4611697793006897, | |
| "eval_runtime": 221.8086, | |
| "eval_samples_per_second": 124.675, | |
| "eval_steps_per_second": 3.9, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 11.266336187471834, | |
| "grad_norm": 1.7341585159301758, | |
| "learning_rate": 5e-05, | |
| "loss": 0.4766, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 11.266336187471834, | |
| "eval_accuracy": 0.8896719975387671, | |
| "eval_loss": 0.45947107672691345, | |
| "eval_runtime": 221.6153, | |
| "eval_samples_per_second": 124.784, | |
| "eval_steps_per_second": 3.903, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 11.378999549346553, | |
| "grad_norm": 1.6157374382019043, | |
| "learning_rate": 4.9500000000000004e-05, | |
| "loss": 0.4758, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 11.378999549346553, | |
| "eval_accuracy": 0.8899909483321304, | |
| "eval_loss": 0.4591013193130493, | |
| "eval_runtime": 221.8952, | |
| "eval_samples_per_second": 124.626, | |
| "eval_steps_per_second": 3.898, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 11.49166291122127, | |
| "grad_norm": 1.3931312561035156, | |
| "learning_rate": 4.9e-05, | |
| "loss": 0.4749, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 11.49166291122127, | |
| "eval_accuracy": 0.8898331334878133, | |
| "eval_loss": 0.4599143862724304, | |
| "eval_runtime": 221.5141, | |
| "eval_samples_per_second": 124.841, | |
| "eval_steps_per_second": 3.905, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 11.604326273095989, | |
| "grad_norm": 1.5027562379837036, | |
| "learning_rate": 4.85e-05, | |
| "loss": 0.4696, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 11.604326273095989, | |
| "eval_accuracy": 0.8903252192404275, | |
| "eval_loss": 0.4557996988296509, | |
| "eval_runtime": 221.4742, | |
| "eval_samples_per_second": 124.863, | |
| "eval_steps_per_second": 3.906, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 11.716989634970707, | |
| "grad_norm": 2.007624864578247, | |
| "learning_rate": 4.8e-05, | |
| "loss": 0.4731, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 11.716989634970707, | |
| "eval_accuracy": 0.8906047731898101, | |
| "eval_loss": 0.4601598381996155, | |
| "eval_runtime": 221.6161, | |
| "eval_samples_per_second": 124.783, | |
| "eval_steps_per_second": 3.903, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 11.829652996845425, | |
| "grad_norm": 1.623124361038208, | |
| "learning_rate": 4.75e-05, | |
| "loss": 0.4705, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 11.829652996845425, | |
| "eval_accuracy": 0.8907063641623542, | |
| "eval_loss": 0.4568343460559845, | |
| "eval_runtime": 221.7063, | |
| "eval_samples_per_second": 124.733, | |
| "eval_steps_per_second": 3.902, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 11.942316358720145, | |
| "grad_norm": 1.7550790309906006, | |
| "learning_rate": 4.7e-05, | |
| "loss": 0.4712, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 11.942316358720145, | |
| "eval_accuracy": 0.8906701808811146, | |
| "eval_loss": 0.4544416666030884, | |
| "eval_runtime": 221.7786, | |
| "eval_samples_per_second": 124.692, | |
| "eval_steps_per_second": 3.9, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 12.054979720594863, | |
| "grad_norm": 1.8783979415893555, | |
| "learning_rate": 4.6500000000000005e-05, | |
| "loss": 0.4672, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 12.054979720594863, | |
| "eval_accuracy": 0.8910758036453728, | |
| "eval_loss": 0.45520085096359253, | |
| "eval_runtime": 221.6441, | |
| "eval_samples_per_second": 124.768, | |
| "eval_steps_per_second": 3.903, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 12.167643082469581, | |
| "grad_norm": 1.7316193580627441, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 0.4643, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 12.167643082469581, | |
| "eval_accuracy": 0.8910305824271204, | |
| "eval_loss": 0.4555051028728485, | |
| "eval_runtime": 221.45, | |
| "eval_samples_per_second": 124.877, | |
| "eval_steps_per_second": 3.906, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 12.2803064443443, | |
| "grad_norm": 1.6475858688354492, | |
| "learning_rate": 4.55e-05, | |
| "loss": 0.4634, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 12.2803064443443, | |
| "eval_accuracy": 0.8916132904164534, | |
| "eval_loss": 0.450579971075058, | |
| "eval_runtime": 221.3289, | |
| "eval_samples_per_second": 124.945, | |
| "eval_steps_per_second": 3.908, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 12.392969806219018, | |
| "grad_norm": 1.6666234731674194, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.4629, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 12.392969806219018, | |
| "eval_accuracy": 0.8920182501631405, | |
| "eval_loss": 0.4492991268634796, | |
| "eval_runtime": 220.8234, | |
| "eval_samples_per_second": 125.231, | |
| "eval_steps_per_second": 3.917, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 12.505633168093736, | |
| "grad_norm": 2.040255308151245, | |
| "learning_rate": 4.4500000000000004e-05, | |
| "loss": 0.4577, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 12.505633168093736, | |
| "eval_accuracy": 0.8917890008025184, | |
| "eval_loss": 0.45352259278297424, | |
| "eval_runtime": 222.1131, | |
| "eval_samples_per_second": 124.504, | |
| "eval_steps_per_second": 3.894, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 12.618296529968454, | |
| "grad_norm": 1.6200906038284302, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 0.4597, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 12.618296529968454, | |
| "eval_accuracy": 0.8922171868098442, | |
| "eval_loss": 0.45517975091934204, | |
| "eval_runtime": 221.5067, | |
| "eval_samples_per_second": 124.845, | |
| "eval_steps_per_second": 3.905, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 12.730959891843172, | |
| "grad_norm": 1.8632248640060425, | |
| "learning_rate": 4.35e-05, | |
| "loss": 0.4624, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 12.730959891843172, | |
| "eval_accuracy": 0.8927531961352251, | |
| "eval_loss": 0.44637200236320496, | |
| "eval_runtime": 221.488, | |
| "eval_samples_per_second": 124.856, | |
| "eval_steps_per_second": 3.905, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 12.84362325371789, | |
| "grad_norm": 1.6908427476882935, | |
| "learning_rate": 4.3e-05, | |
| "loss": 0.46, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 12.84362325371789, | |
| "eval_accuracy": 0.8920658168036726, | |
| "eval_loss": 0.44909459352493286, | |
| "eval_runtime": 221.6242, | |
| "eval_samples_per_second": 124.779, | |
| "eval_steps_per_second": 3.903, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 12.95628661559261, | |
| "grad_norm": 1.7786799669265747, | |
| "learning_rate": 4.25e-05, | |
| "loss": 0.4586, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 12.95628661559261, | |
| "eval_accuracy": 0.8929494005257145, | |
| "eval_loss": 0.4447159469127655, | |
| "eval_runtime": 221.7737, | |
| "eval_samples_per_second": 124.695, | |
| "eval_steps_per_second": 3.9, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 13.068949977467328, | |
| "grad_norm": 1.7628467082977295, | |
| "learning_rate": 4.2e-05, | |
| "loss": 0.4558, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 13.068949977467328, | |
| "eval_accuracy": 0.8926064267317521, | |
| "eval_loss": 0.4458833336830139, | |
| "eval_runtime": 221.4227, | |
| "eval_samples_per_second": 124.892, | |
| "eval_steps_per_second": 3.907, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 13.181613339342046, | |
| "grad_norm": 1.5658234357833862, | |
| "learning_rate": 4.15e-05, | |
| "loss": 0.4542, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 13.181613339342046, | |
| "eval_accuracy": 0.8932430818063928, | |
| "eval_loss": 0.4461354613304138, | |
| "eval_runtime": 221.5061, | |
| "eval_samples_per_second": 124.845, | |
| "eval_steps_per_second": 3.905, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 13.294276701216765, | |
| "grad_norm": 1.5327554941177368, | |
| "learning_rate": 4.1e-05, | |
| "loss": 0.455, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 13.294276701216765, | |
| "eval_accuracy": 0.8931742171282963, | |
| "eval_loss": 0.4385415017604828, | |
| "eval_runtime": 220.6336, | |
| "eval_samples_per_second": 125.339, | |
| "eval_steps_per_second": 3.921, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 13.406940063091483, | |
| "grad_norm": 1.804396390914917, | |
| "learning_rate": 4.05e-05, | |
| "loss": 0.4506, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 13.406940063091483, | |
| "eval_accuracy": 0.8937421063264991, | |
| "eval_loss": 0.4429979622364044, | |
| "eval_runtime": 221.8018, | |
| "eval_samples_per_second": 124.679, | |
| "eval_steps_per_second": 3.9, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 13.519603424966201, | |
| "grad_norm": 1.8558369874954224, | |
| "learning_rate": 4e-05, | |
| "loss": 0.4542, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 13.519603424966201, | |
| "eval_accuracy": 0.8935920533223497, | |
| "eval_loss": 0.4469524025917053, | |
| "eval_runtime": 220.934, | |
| "eval_samples_per_second": 125.169, | |
| "eval_steps_per_second": 3.915, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 13.632266786840919, | |
| "grad_norm": 1.7201515436172485, | |
| "learning_rate": 3.9500000000000005e-05, | |
| "loss": 0.4535, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 13.632266786840919, | |
| "eval_accuracy": 0.8939343684906524, | |
| "eval_loss": 0.44035276770591736, | |
| "eval_runtime": 220.7553, | |
| "eval_samples_per_second": 125.27, | |
| "eval_steps_per_second": 3.918, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 13.744930148715637, | |
| "grad_norm": 1.5173367261886597, | |
| "learning_rate": 3.9000000000000006e-05, | |
| "loss": 0.4519, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 13.744930148715637, | |
| "eval_accuracy": 0.8938849251143673, | |
| "eval_loss": 0.44248726963996887, | |
| "eval_runtime": 220.2187, | |
| "eval_samples_per_second": 125.575, | |
| "eval_steps_per_second": 3.928, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 13.857593510590355, | |
| "grad_norm": 1.6886624097824097, | |
| "learning_rate": 3.85e-05, | |
| "loss": 0.4492, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 13.857593510590355, | |
| "eval_accuracy": 0.8941456344925675, | |
| "eval_loss": 0.44254130125045776, | |
| "eval_runtime": 220.9103, | |
| "eval_samples_per_second": 125.182, | |
| "eval_steps_per_second": 3.916, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 13.970256872465074, | |
| "grad_norm": 1.560421109199524, | |
| "learning_rate": 3.8e-05, | |
| "loss": 0.4495, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 13.970256872465074, | |
| "eval_accuracy": 0.8943100925877457, | |
| "eval_loss": 0.43967217206954956, | |
| "eval_runtime": 221.8474, | |
| "eval_samples_per_second": 124.653, | |
| "eval_steps_per_second": 3.899, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 14.082920234339793, | |
| "grad_norm": 2.146169662475586, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 0.4438, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 14.082920234339793, | |
| "eval_accuracy": 0.8951792684420733, | |
| "eval_loss": 0.43378108739852905, | |
| "eval_runtime": 221.3932, | |
| "eval_samples_per_second": 124.909, | |
| "eval_steps_per_second": 3.907, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 14.195583596214512, | |
| "grad_norm": 1.6352427005767822, | |
| "learning_rate": 3.7e-05, | |
| "loss": 0.4437, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 14.195583596214512, | |
| "eval_accuracy": 0.8944950022699799, | |
| "eval_loss": 0.4368970990180969, | |
| "eval_runtime": 221.6512, | |
| "eval_samples_per_second": 124.764, | |
| "eval_steps_per_second": 3.903, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 14.30824695808923, | |
| "grad_norm": 1.6570438146591187, | |
| "learning_rate": 3.65e-05, | |
| "loss": 0.4407, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 14.30824695808923, | |
| "eval_accuracy": 0.8954035156096067, | |
| "eval_loss": 0.43203291296958923, | |
| "eval_runtime": 220.9732, | |
| "eval_samples_per_second": 125.146, | |
| "eval_steps_per_second": 3.915, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 14.420910319963948, | |
| "grad_norm": 1.6666638851165771, | |
| "learning_rate": 3.6e-05, | |
| "loss": 0.4409, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 14.420910319963948, | |
| "eval_accuracy": 0.8951364676803809, | |
| "eval_loss": 0.43579936027526855, | |
| "eval_runtime": 221.9618, | |
| "eval_samples_per_second": 124.589, | |
| "eval_steps_per_second": 3.897, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 14.533573681838666, | |
| "grad_norm": 1.5540229082107544, | |
| "learning_rate": 3.55e-05, | |
| "loss": 0.4425, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 14.533573681838666, | |
| "eval_accuracy": 0.8954575425427099, | |
| "eval_loss": 0.432124525308609, | |
| "eval_runtime": 222.2861, | |
| "eval_samples_per_second": 124.407, | |
| "eval_steps_per_second": 3.891, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 14.646237043713384, | |
| "grad_norm": 1.6039586067199707, | |
| "learning_rate": 3.5e-05, | |
| "loss": 0.4375, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 14.646237043713384, | |
| "eval_accuracy": 0.8959293125901446, | |
| "eval_loss": 0.4307084083557129, | |
| "eval_runtime": 221.3855, | |
| "eval_samples_per_second": 124.913, | |
| "eval_steps_per_second": 3.907, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 14.758900405588102, | |
| "grad_norm": 1.4141193628311157, | |
| "learning_rate": 3.45e-05, | |
| "loss": 0.4412, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 14.758900405588102, | |
| "eval_accuracy": 0.8955789560165742, | |
| "eval_loss": 0.4335871934890747, | |
| "eval_runtime": 220.8948, | |
| "eval_samples_per_second": 125.191, | |
| "eval_steps_per_second": 3.916, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 14.87156376746282, | |
| "grad_norm": 1.479407548904419, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 0.4402, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 14.87156376746282, | |
| "eval_accuracy": 0.8960790351082009, | |
| "eval_loss": 0.428357869386673, | |
| "eval_runtime": 221.3572, | |
| "eval_samples_per_second": 124.929, | |
| "eval_steps_per_second": 3.908, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 14.984227129337539, | |
| "grad_norm": 1.6063992977142334, | |
| "learning_rate": 3.35e-05, | |
| "loss": 0.4386, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 14.984227129337539, | |
| "eval_accuracy": 0.8961887828028311, | |
| "eval_loss": 0.42679697275161743, | |
| "eval_runtime": 220.1773, | |
| "eval_samples_per_second": 125.599, | |
| "eval_steps_per_second": 3.929, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 15.096890491212259, | |
| "grad_norm": 1.7383469343185425, | |
| "learning_rate": 3.3e-05, | |
| "loss": 0.4342, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 15.096890491212259, | |
| "eval_accuracy": 0.8963356120392529, | |
| "eval_loss": 0.43058517575263977, | |
| "eval_runtime": 221.0267, | |
| "eval_samples_per_second": 125.116, | |
| "eval_steps_per_second": 3.914, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 15.209553853086977, | |
| "grad_norm": 1.4529184103012085, | |
| "learning_rate": 3.2500000000000004e-05, | |
| "loss": 0.4355, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 15.209553853086977, | |
| "eval_accuracy": 0.8963537523060265, | |
| "eval_loss": 0.4319207966327667, | |
| "eval_runtime": 221.0822, | |
| "eval_samples_per_second": 125.085, | |
| "eval_steps_per_second": 3.913, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 15.322217214961695, | |
| "grad_norm": 1.5925979614257812, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.434, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 15.322217214961695, | |
| "eval_accuracy": 0.8967275614111444, | |
| "eval_loss": 0.4326106905937195, | |
| "eval_runtime": 221.1171, | |
| "eval_samples_per_second": 125.065, | |
| "eval_steps_per_second": 3.912, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 15.434880576836413, | |
| "grad_norm": 1.5591844320297241, | |
| "learning_rate": 3.15e-05, | |
| "loss": 0.4299, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 15.434880576836413, | |
| "eval_accuracy": 0.8968720289079662, | |
| "eval_loss": 0.42554253339767456, | |
| "eval_runtime": 221.0599, | |
| "eval_samples_per_second": 125.097, | |
| "eval_steps_per_second": 3.913, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 15.547543938711131, | |
| "grad_norm": 1.6964800357818604, | |
| "learning_rate": 3.1e-05, | |
| "loss": 0.4302, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 15.547543938711131, | |
| "eval_accuracy": 0.8968983814075581, | |
| "eval_loss": 0.43178391456604004, | |
| "eval_runtime": 220.2723, | |
| "eval_samples_per_second": 125.545, | |
| "eval_steps_per_second": 3.927, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 15.66020730058585, | |
| "grad_norm": 1.7176204919815063, | |
| "learning_rate": 3.05e-05, | |
| "loss": 0.4317, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 15.66020730058585, | |
| "eval_accuracy": 0.8971937797880897, | |
| "eval_loss": 0.42581045627593994, | |
| "eval_runtime": 221.117, | |
| "eval_samples_per_second": 125.065, | |
| "eval_steps_per_second": 3.912, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 15.772870662460567, | |
| "grad_norm": 1.4802976846694946, | |
| "learning_rate": 3e-05, | |
| "loss": 0.4335, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 15.772870662460567, | |
| "eval_accuracy": 0.8972389358742884, | |
| "eval_loss": 0.4227333068847656, | |
| "eval_runtime": 220.2787, | |
| "eval_samples_per_second": 125.541, | |
| "eval_steps_per_second": 3.927, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 15.885534024335286, | |
| "grad_norm": 1.4625871181488037, | |
| "learning_rate": 2.95e-05, | |
| "loss": 0.4313, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 15.885534024335286, | |
| "eval_accuracy": 0.8974343061715017, | |
| "eval_loss": 0.420085072517395, | |
| "eval_runtime": 221.0709, | |
| "eval_samples_per_second": 125.091, | |
| "eval_steps_per_second": 3.913, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 15.998197386210004, | |
| "grad_norm": 1.4574440717697144, | |
| "learning_rate": 2.9e-05, | |
| "loss": 0.4288, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 15.998197386210004, | |
| "eval_accuracy": 0.8976216192291354, | |
| "eval_loss": 0.42089083790779114, | |
| "eval_runtime": 221.0359, | |
| "eval_samples_per_second": 125.111, | |
| "eval_steps_per_second": 3.913, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 16.110860748084722, | |
| "grad_norm": 1.415560245513916, | |
| "learning_rate": 2.8499999999999998e-05, | |
| "loss": 0.4245, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 16.110860748084722, | |
| "eval_accuracy": 0.8975563777935319, | |
| "eval_loss": 0.42667824029922485, | |
| "eval_runtime": 220.6503, | |
| "eval_samples_per_second": 125.33, | |
| "eval_steps_per_second": 3.92, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 16.223524109959442, | |
| "grad_norm": 1.6393336057662964, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 0.4271, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 16.223524109959442, | |
| "eval_accuracy": 0.8984646682572308, | |
| "eval_loss": 0.4213043749332428, | |
| "eval_runtime": 219.8857, | |
| "eval_samples_per_second": 125.765, | |
| "eval_steps_per_second": 3.934, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 16.336187471834158, | |
| "grad_norm": 1.6446831226348877, | |
| "learning_rate": 2.7500000000000004e-05, | |
| "loss": 0.4234, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 16.336187471834158, | |
| "eval_accuracy": 0.8985838129375973, | |
| "eval_loss": 0.42193278670310974, | |
| "eval_runtime": 221.0608, | |
| "eval_samples_per_second": 125.097, | |
| "eval_steps_per_second": 3.913, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 16.448850833708878, | |
| "grad_norm": 1.725674033164978, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 0.4251, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 16.448850833708878, | |
| "eval_accuracy": 0.8987038252593214, | |
| "eval_loss": 0.4171987771987915, | |
| "eval_runtime": 220.8886, | |
| "eval_samples_per_second": 125.194, | |
| "eval_steps_per_second": 3.916, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 16.561514195583594, | |
| "grad_norm": 1.5979257822036743, | |
| "learning_rate": 2.6500000000000004e-05, | |
| "loss": 0.4217, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 16.561514195583594, | |
| "eval_accuracy": 0.8987482542802462, | |
| "eval_loss": 0.418377161026001, | |
| "eval_runtime": 221.121, | |
| "eval_samples_per_second": 125.063, | |
| "eval_steps_per_second": 3.912, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 16.674177557458314, | |
| "grad_norm": 1.4892100095748901, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 0.4205, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 16.674177557458314, | |
| "eval_accuracy": 0.8989114915507669, | |
| "eval_loss": 0.4165091812610626, | |
| "eval_runtime": 222.5924, | |
| "eval_samples_per_second": 124.236, | |
| "eval_steps_per_second": 3.886, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 16.786840919333034, | |
| "grad_norm": 1.4461849927902222, | |
| "learning_rate": 2.5500000000000003e-05, | |
| "loss": 0.4228, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 16.786840919333034, | |
| "eval_accuracy": 0.8989125542022752, | |
| "eval_loss": 0.4175247848033905, | |
| "eval_runtime": 222.3981, | |
| "eval_samples_per_second": 124.345, | |
| "eval_steps_per_second": 3.889, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 16.89950428120775, | |
| "grad_norm": 1.768370509147644, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.421, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 16.89950428120775, | |
| "eval_accuracy": 0.8991065894891168, | |
| "eval_loss": 0.41619065403938293, | |
| "eval_runtime": 222.6024, | |
| "eval_samples_per_second": 124.23, | |
| "eval_steps_per_second": 3.886, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 17.01216764308247, | |
| "grad_norm": 1.4250850677490234, | |
| "learning_rate": 2.45e-05, | |
| "loss": 0.4178, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 17.01216764308247, | |
| "eval_accuracy": 0.8994014895612595, | |
| "eval_loss": 0.4117368161678314, | |
| "eval_runtime": 222.14, | |
| "eval_samples_per_second": 124.489, | |
| "eval_steps_per_second": 3.894, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 17.124831004957187, | |
| "grad_norm": 1.4036965370178223, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.4176, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 17.124831004957187, | |
| "eval_accuracy": 0.8995830389073786, | |
| "eval_loss": 0.4121379852294922, | |
| "eval_runtime": 222.2839, | |
| "eval_samples_per_second": 124.408, | |
| "eval_steps_per_second": 3.891, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 17.237494366831907, | |
| "grad_norm": 1.395093321800232, | |
| "learning_rate": 2.35e-05, | |
| "loss": 0.4172, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 17.237494366831907, | |
| "eval_accuracy": 0.8998577766066815, | |
| "eval_loss": 0.41285398602485657, | |
| "eval_runtime": 222.2267, | |
| "eval_samples_per_second": 124.44, | |
| "eval_steps_per_second": 3.892, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 17.350157728706623, | |
| "grad_norm": 1.5492697954177856, | |
| "learning_rate": 2.3000000000000003e-05, | |
| "loss": 0.4133, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 17.350157728706623, | |
| "eval_accuracy": 0.8992890854661668, | |
| "eval_loss": 0.41492369771003723, | |
| "eval_runtime": 219.0021, | |
| "eval_samples_per_second": 126.273, | |
| "eval_steps_per_second": 3.95, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 17.462821090581343, | |
| "grad_norm": 1.4863234758377075, | |
| "learning_rate": 2.25e-05, | |
| "loss": 0.4166, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 17.462821090581343, | |
| "eval_accuracy": 0.8995439142560963, | |
| "eval_loss": 0.41370296478271484, | |
| "eval_runtime": 220.4874, | |
| "eval_samples_per_second": 125.422, | |
| "eval_steps_per_second": 3.923, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 17.57548445245606, | |
| "grad_norm": 1.8134657144546509, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 0.4167, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 17.57548445245606, | |
| "eval_accuracy": 0.8998953243247179, | |
| "eval_loss": 0.4118014872074127, | |
| "eval_runtime": 220.8002, | |
| "eval_samples_per_second": 125.244, | |
| "eval_steps_per_second": 3.918, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 17.68814781433078, | |
| "grad_norm": 1.7903392314910889, | |
| "learning_rate": 2.15e-05, | |
| "loss": 0.4164, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 17.68814781433078, | |
| "eval_accuracy": 0.9001628949311502, | |
| "eval_loss": 0.4123002886772156, | |
| "eval_runtime": 219.63, | |
| "eval_samples_per_second": 125.912, | |
| "eval_steps_per_second": 3.938, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 17.8008111762055, | |
| "grad_norm": 1.6216607093811035, | |
| "learning_rate": 2.1e-05, | |
| "loss": 0.4143, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 17.8008111762055, | |
| "eval_accuracy": 0.9001079811521843, | |
| "eval_loss": 0.40997758507728577, | |
| "eval_runtime": 219.8198, | |
| "eval_samples_per_second": 125.803, | |
| "eval_steps_per_second": 3.935, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 17.913474538080216, | |
| "grad_norm": 1.5128173828125, | |
| "learning_rate": 2.05e-05, | |
| "loss": 0.4136, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 17.913474538080216, | |
| "eval_accuracy": 0.9006287821890727, | |
| "eval_loss": 0.41052308678627014, | |
| "eval_runtime": 219.697, | |
| "eval_samples_per_second": 125.873, | |
| "eval_steps_per_second": 3.937, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 18.026137899954936, | |
| "grad_norm": 1.413712978363037, | |
| "learning_rate": 2e-05, | |
| "loss": 0.4132, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 18.026137899954936, | |
| "eval_accuracy": 0.9007660373895346, | |
| "eval_loss": 0.4081571400165558, | |
| "eval_runtime": 220.6703, | |
| "eval_samples_per_second": 125.318, | |
| "eval_steps_per_second": 3.92, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 18.138801261829652, | |
| "grad_norm": 1.7320311069488525, | |
| "learning_rate": 1.9500000000000003e-05, | |
| "loss": 0.4102, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 18.138801261829652, | |
| "eval_accuracy": 0.9009554825729237, | |
| "eval_loss": 0.407240092754364, | |
| "eval_runtime": 221.4224, | |
| "eval_samples_per_second": 124.893, | |
| "eval_steps_per_second": 3.907, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 18.251464623704372, | |
| "grad_norm": 1.8033103942871094, | |
| "learning_rate": 1.9e-05, | |
| "loss": 0.4097, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 18.251464623704372, | |
| "eval_accuracy": 0.9008985457774398, | |
| "eval_loss": 0.4109956920146942, | |
| "eval_runtime": 221.1014, | |
| "eval_samples_per_second": 125.074, | |
| "eval_steps_per_second": 3.912, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 18.36412798557909, | |
| "grad_norm": 1.8222883939743042, | |
| "learning_rate": 1.85e-05, | |
| "loss": 0.4085, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 18.36412798557909, | |
| "eval_accuracy": 0.9007539025464132, | |
| "eval_loss": 0.4095366299152374, | |
| "eval_runtime": 220.8203, | |
| "eval_samples_per_second": 125.233, | |
| "eval_steps_per_second": 3.917, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 18.47679134745381, | |
| "grad_norm": 1.4663125276565552, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.4105, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 18.47679134745381, | |
| "eval_accuracy": 0.9014532811520996, | |
| "eval_loss": 0.4047625958919525, | |
| "eval_runtime": 219.6263, | |
| "eval_samples_per_second": 125.914, | |
| "eval_steps_per_second": 3.939, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 18.589454709328525, | |
| "grad_norm": 1.8482975959777832, | |
| "learning_rate": 1.75e-05, | |
| "loss": 0.4096, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 18.589454709328525, | |
| "eval_accuracy": 0.9010233806097327, | |
| "eval_loss": 0.4072835445404053, | |
| "eval_runtime": 220.7586, | |
| "eval_samples_per_second": 125.268, | |
| "eval_steps_per_second": 3.918, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 18.702118071203245, | |
| "grad_norm": 1.4483723640441895, | |
| "learning_rate": 1.7000000000000003e-05, | |
| "loss": 0.4041, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 18.702118071203245, | |
| "eval_accuracy": 0.9015295597674521, | |
| "eval_loss": 0.4039141833782196, | |
| "eval_runtime": 220.7392, | |
| "eval_samples_per_second": 125.279, | |
| "eval_steps_per_second": 3.919, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 18.814781433077965, | |
| "grad_norm": 1.6040253639221191, | |
| "learning_rate": 1.65e-05, | |
| "loss": 0.4062, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 18.814781433077965, | |
| "eval_accuracy": 0.9016612318058135, | |
| "eval_loss": 0.40488725900650024, | |
| "eval_runtime": 221.3884, | |
| "eval_samples_per_second": 124.912, | |
| "eval_steps_per_second": 3.907, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 18.92744479495268, | |
| "grad_norm": 1.3560248613357544, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.4045, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 18.92744479495268, | |
| "eval_accuracy": 0.9015874866980568, | |
| "eval_loss": 0.4032597243785858, | |
| "eval_runtime": 221.9037, | |
| "eval_samples_per_second": 124.622, | |
| "eval_steps_per_second": 3.898, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 19.0401081568274, | |
| "grad_norm": 1.6236895322799683, | |
| "learning_rate": 1.55e-05, | |
| "loss": 0.4038, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 19.0401081568274, | |
| "eval_accuracy": 0.901710217516976, | |
| "eval_loss": 0.4084183871746063, | |
| "eval_runtime": 220.8431, | |
| "eval_samples_per_second": 125.22, | |
| "eval_steps_per_second": 3.917, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 19.152771518702117, | |
| "grad_norm": 1.6514983177185059, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.4037, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 19.152771518702117, | |
| "eval_accuracy": 0.9016946022320732, | |
| "eval_loss": 0.4033704102039337, | |
| "eval_runtime": 221.6212, | |
| "eval_samples_per_second": 124.78, | |
| "eval_steps_per_second": 3.903, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 19.265434880576837, | |
| "grad_norm": 1.3684407472610474, | |
| "learning_rate": 1.45e-05, | |
| "loss": 0.4022, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 19.265434880576837, | |
| "eval_accuracy": 0.9021324676993308, | |
| "eval_loss": 0.40617531538009644, | |
| "eval_runtime": 221.8256, | |
| "eval_samples_per_second": 124.666, | |
| "eval_steps_per_second": 3.899, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 19.378098242451554, | |
| "grad_norm": 1.592301607131958, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 0.4059, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 19.378098242451554, | |
| "eval_accuracy": 0.902363044454423, | |
| "eval_loss": 0.3991073668003082, | |
| "eval_runtime": 220.8011, | |
| "eval_samples_per_second": 125.244, | |
| "eval_steps_per_second": 3.918, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 19.490761604326273, | |
| "grad_norm": 1.5463926792144775, | |
| "learning_rate": 1.3500000000000001e-05, | |
| "loss": 0.4013, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 19.490761604326273, | |
| "eval_accuracy": 0.9023868906868481, | |
| "eval_loss": 0.39859089255332947, | |
| "eval_runtime": 220.6504, | |
| "eval_samples_per_second": 125.329, | |
| "eval_steps_per_second": 3.92, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 19.60342496620099, | |
| "grad_norm": 1.6952037811279297, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 0.4004, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 19.60342496620099, | |
| "eval_accuracy": 0.9029012333672634, | |
| "eval_loss": 0.4017859995365143, | |
| "eval_runtime": 220.6857, | |
| "eval_samples_per_second": 125.309, | |
| "eval_steps_per_second": 3.92, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 19.71608832807571, | |
| "grad_norm": 1.5156389474868774, | |
| "learning_rate": 1.25e-05, | |
| "loss": 0.4023, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 19.71608832807571, | |
| "eval_accuracy": 0.9022691715502759, | |
| "eval_loss": 0.40082216262817383, | |
| "eval_runtime": 220.7786, | |
| "eval_samples_per_second": 125.257, | |
| "eval_steps_per_second": 3.918, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 19.82875168995043, | |
| "grad_norm": 1.5951709747314453, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.3987, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 19.82875168995043, | |
| "eval_accuracy": 0.9028266490406112, | |
| "eval_loss": 0.4010894000530243, | |
| "eval_runtime": 220.1664, | |
| "eval_samples_per_second": 125.605, | |
| "eval_steps_per_second": 3.929, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 19.941415051825146, | |
| "grad_norm": 1.4990533590316772, | |
| "learning_rate": 1.1500000000000002e-05, | |
| "loss": 0.3935, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 19.941415051825146, | |
| "eval_accuracy": 0.9027395900326748, | |
| "eval_loss": 0.401162326335907, | |
| "eval_runtime": 220.111, | |
| "eval_samples_per_second": 125.637, | |
| "eval_steps_per_second": 3.93, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 20.054078413699866, | |
| "grad_norm": 1.5961695909500122, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 0.3978, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 20.054078413699866, | |
| "eval_accuracy": 0.902977115716753, | |
| "eval_loss": 0.3981638252735138, | |
| "eval_runtime": 219.6972, | |
| "eval_samples_per_second": 125.873, | |
| "eval_steps_per_second": 3.937, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 20.166741775574582, | |
| "grad_norm": 1.5186184644699097, | |
| "learning_rate": 1.05e-05, | |
| "loss": 0.4012, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 20.166741775574582, | |
| "eval_accuracy": 0.9029895131243953, | |
| "eval_loss": 0.39535069465637207, | |
| "eval_runtime": 220.4034, | |
| "eval_samples_per_second": 125.47, | |
| "eval_steps_per_second": 3.925, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 20.279405137449302, | |
| "grad_norm": 1.7340284585952759, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3954, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 20.279405137449302, | |
| "eval_accuracy": 0.9031845731573412, | |
| "eval_loss": 0.3972371816635132, | |
| "eval_runtime": 220.9175, | |
| "eval_samples_per_second": 125.178, | |
| "eval_steps_per_second": 3.915, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 20.39206849932402, | |
| "grad_norm": 1.4601465463638306, | |
| "learning_rate": 9.5e-06, | |
| "loss": 0.3933, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 20.39206849932402, | |
| "eval_accuracy": 0.9031702530935091, | |
| "eval_loss": 0.39749225974082947, | |
| "eval_runtime": 220.5834, | |
| "eval_samples_per_second": 125.368, | |
| "eval_steps_per_second": 3.921, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 20.50473186119874, | |
| "grad_norm": 1.6822484731674194, | |
| "learning_rate": 9e-06, | |
| "loss": 0.3985, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 20.50473186119874, | |
| "eval_accuracy": 0.903283638473266, | |
| "eval_loss": 0.39412999153137207, | |
| "eval_runtime": 220.402, | |
| "eval_samples_per_second": 125.471, | |
| "eval_steps_per_second": 3.925, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 20.617395223073455, | |
| "grad_norm": 1.5493133068084717, | |
| "learning_rate": 8.500000000000002e-06, | |
| "loss": 0.3952, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 20.617395223073455, | |
| "eval_accuracy": 0.9031870870760611, | |
| "eval_loss": 0.39998504519462585, | |
| "eval_runtime": 219.1703, | |
| "eval_samples_per_second": 126.176, | |
| "eval_steps_per_second": 3.947, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 20.730058584948175, | |
| "grad_norm": 1.6142163276672363, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.395, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 20.730058584948175, | |
| "eval_accuracy": 0.9037042508521438, | |
| "eval_loss": 0.39454683661460876, | |
| "eval_runtime": 220.1482, | |
| "eval_samples_per_second": 125.615, | |
| "eval_steps_per_second": 3.929, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 20.842721946822895, | |
| "grad_norm": 1.3768945932388306, | |
| "learning_rate": 7.5e-06, | |
| "loss": 0.3925, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 20.842721946822895, | |
| "eval_accuracy": 0.9035520393735632, | |
| "eval_loss": 0.3969292640686035, | |
| "eval_runtime": 218.8787, | |
| "eval_samples_per_second": 126.344, | |
| "eval_steps_per_second": 3.952, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 20.95538530869761, | |
| "grad_norm": 1.8161870241165161, | |
| "learning_rate": 7.000000000000001e-06, | |
| "loss": 0.3911, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 20.95538530869761, | |
| "eval_accuracy": 0.9034115695768419, | |
| "eval_loss": 0.39153432846069336, | |
| "eval_runtime": 219.6974, | |
| "eval_samples_per_second": 125.873, | |
| "eval_steps_per_second": 3.937, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 21.06804867057233, | |
| "grad_norm": 1.7550774812698364, | |
| "learning_rate": 6.5000000000000004e-06, | |
| "loss": 0.3927, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 21.06804867057233, | |
| "eval_accuracy": 0.9035121668560334, | |
| "eval_loss": 0.39775171875953674, | |
| "eval_runtime": 221.0095, | |
| "eval_samples_per_second": 125.126, | |
| "eval_steps_per_second": 3.914, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 21.180712032447047, | |
| "grad_norm": 1.5582369565963745, | |
| "learning_rate": 6e-06, | |
| "loss": 0.3891, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 21.180712032447047, | |
| "eval_accuracy": 0.9037201879273532, | |
| "eval_loss": 0.3943246006965637, | |
| "eval_runtime": 220.3117, | |
| "eval_samples_per_second": 125.522, | |
| "eval_steps_per_second": 3.926, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 21.293375394321767, | |
| "grad_norm": 1.6729559898376465, | |
| "learning_rate": 5.500000000000001e-06, | |
| "loss": 0.3912, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 21.293375394321767, | |
| "eval_accuracy": 0.9036670141570837, | |
| "eval_loss": 0.39444249868392944, | |
| "eval_runtime": 219.9471, | |
| "eval_samples_per_second": 125.73, | |
| "eval_steps_per_second": 3.933, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 21.406038756196484, | |
| "grad_norm": 1.6871699094772339, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3908, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 21.406038756196484, | |
| "eval_accuracy": 0.9037736297217607, | |
| "eval_loss": 0.39369192719459534, | |
| "eval_runtime": 219.9205, | |
| "eval_samples_per_second": 125.745, | |
| "eval_steps_per_second": 3.933, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 21.518702118071204, | |
| "grad_norm": 1.486741304397583, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.3902, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 21.518702118071204, | |
| "eval_accuracy": 0.9034302972672164, | |
| "eval_loss": 0.39573636651039124, | |
| "eval_runtime": 219.8759, | |
| "eval_samples_per_second": 125.771, | |
| "eval_steps_per_second": 3.934, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 21.63136547994592, | |
| "grad_norm": 1.8056081533432007, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.3891, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 21.63136547994592, | |
| "eval_accuracy": 0.9045647365783699, | |
| "eval_loss": 0.39023157954216003, | |
| "eval_runtime": 221.3034, | |
| "eval_samples_per_second": 124.96, | |
| "eval_steps_per_second": 3.909, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 21.74402884182064, | |
| "grad_norm": 1.552370309829712, | |
| "learning_rate": 3.5000000000000004e-06, | |
| "loss": 0.3894, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 21.74402884182064, | |
| "eval_accuracy": 0.9044615558398447, | |
| "eval_loss": 0.39400991797447205, | |
| "eval_runtime": 219.8746, | |
| "eval_samples_per_second": 125.772, | |
| "eval_steps_per_second": 3.934, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 21.85669220369536, | |
| "grad_norm": 1.506536841392517, | |
| "learning_rate": 3e-06, | |
| "loss": 0.3904, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 21.85669220369536, | |
| "eval_accuracy": 0.9044962394479266, | |
| "eval_loss": 0.390458881855011, | |
| "eval_runtime": 220.131, | |
| "eval_samples_per_second": 125.625, | |
| "eval_steps_per_second": 3.929, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 21.969355565570076, | |
| "grad_norm": 1.6080279350280762, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.3882, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 21.969355565570076, | |
| "eval_accuracy": 0.9043700852475594, | |
| "eval_loss": 0.39395132660865784, | |
| "eval_runtime": 220.1175, | |
| "eval_samples_per_second": 125.633, | |
| "eval_steps_per_second": 3.93, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 22.082018927444796, | |
| "grad_norm": 1.6551542282104492, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.388, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 22.082018927444796, | |
| "eval_accuracy": 0.904642958920198, | |
| "eval_loss": 0.39477479457855225, | |
| "eval_runtime": 219.108, | |
| "eval_samples_per_second": 126.212, | |
| "eval_steps_per_second": 3.948, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 22.194682289319513, | |
| "grad_norm": 1.3376331329345703, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.3888, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 22.194682289319513, | |
| "eval_accuracy": 0.9042594879589607, | |
| "eval_loss": 0.39155128598213196, | |
| "eval_runtime": 221.2476, | |
| "eval_samples_per_second": 124.991, | |
| "eval_steps_per_second": 3.91, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 22.307345651194233, | |
| "grad_norm": 1.6391901969909668, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.385, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 22.307345651194233, | |
| "eval_accuracy": 0.9047423169505552, | |
| "eval_loss": 0.3867943286895752, | |
| "eval_runtime": 220.9463, | |
| "eval_samples_per_second": 125.162, | |
| "eval_steps_per_second": 3.915, | |
| "step": 99000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 100000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 23, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.346992290195046e+17, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |