| { |
| "best_global_step": 25600, |
| "best_metric": 0.5076445937156677, |
| "best_model_checkpoint": "/data/alamparan/mattext_ckpt_2/results/2026-02-13/00-23-20/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-14000", |
| "epoch": 50.0, |
| "eval_steps": 50, |
| "global_step": 25800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.09689922480620156, |
| "grad_norm": 4.480398654937744, |
| "learning_rate": 0.0001996201550387597, |
| "loss": 35.8864501953125, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09689922480620156, |
| "eval_loss": 24.066509246826172, |
| "eval_runtime": 197.2924, |
| "eval_samples_per_second": 96.324, |
| "eval_steps_per_second": 2.007, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1937984496124031, |
| "grad_norm": 3.3033525943756104, |
| "learning_rate": 0.0001992325581395349, |
| "loss": 22.6541064453125, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1937984496124031, |
| "eval_loss": 21.18689727783203, |
| "eval_runtime": 195.7213, |
| "eval_samples_per_second": 97.097, |
| "eval_steps_per_second": 2.023, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.29069767441860467, |
| "grad_norm": 6.851633071899414, |
| "learning_rate": 0.00019884496124031008, |
| "loss": 20.8401513671875, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.29069767441860467, |
| "eval_loss": 19.933605194091797, |
| "eval_runtime": 198.4052, |
| "eval_samples_per_second": 95.784, |
| "eval_steps_per_second": 1.996, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3875968992248062, |
| "grad_norm": 5.096057415008545, |
| "learning_rate": 0.00019845736434108527, |
| "loss": 19.824486083984375, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3875968992248062, |
| "eval_loss": 19.05492401123047, |
| "eval_runtime": 192.534, |
| "eval_samples_per_second": 98.705, |
| "eval_steps_per_second": 2.057, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4844961240310077, |
| "grad_norm": 6.72298526763916, |
| "learning_rate": 0.00019806976744186049, |
| "loss": 19.06490234375, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4844961240310077, |
| "eval_loss": 18.332548141479492, |
| "eval_runtime": 196.956, |
| "eval_samples_per_second": 96.489, |
| "eval_steps_per_second": 2.011, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5813953488372093, |
| "grad_norm": 7.238275527954102, |
| "learning_rate": 0.00019768217054263567, |
| "loss": 18.413529052734376, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5813953488372093, |
| "eval_loss": 17.510297775268555, |
| "eval_runtime": 192.236, |
| "eval_samples_per_second": 98.858, |
| "eval_steps_per_second": 2.06, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6782945736434108, |
| "grad_norm": 6.704541206359863, |
| "learning_rate": 0.00019729457364341086, |
| "loss": 17.664228515625, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6782945736434108, |
| "eval_loss": 16.745044708251953, |
| "eval_runtime": 192.5744, |
| "eval_samples_per_second": 98.684, |
| "eval_steps_per_second": 2.056, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7751937984496124, |
| "grad_norm": 8.426316261291504, |
| "learning_rate": 0.00019690697674418605, |
| "loss": 16.83594970703125, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7751937984496124, |
| "eval_loss": 15.575336456298828, |
| "eval_runtime": 195.6385, |
| "eval_samples_per_second": 97.138, |
| "eval_steps_per_second": 2.024, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.872093023255814, |
| "grad_norm": 8.754973411560059, |
| "learning_rate": 0.00019651937984496124, |
| "loss": 15.635130615234376, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.872093023255814, |
| "eval_loss": 13.746257781982422, |
| "eval_runtime": 196.8027, |
| "eval_samples_per_second": 96.564, |
| "eval_steps_per_second": 2.012, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9689922480620154, |
| "grad_norm": 10.155797004699707, |
| "learning_rate": 0.00019613178294573645, |
| "loss": 13.158505859375, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.9689922480620154, |
| "eval_loss": 9.434493064880371, |
| "eval_runtime": 215.6111, |
| "eval_samples_per_second": 88.14, |
| "eval_steps_per_second": 1.837, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.0658914728682172, |
| "grad_norm": 7.4197540283203125, |
| "learning_rate": 0.00019574418604651164, |
| "loss": 8.888753051757812, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.0658914728682172, |
| "eval_loss": 5.876725673675537, |
| "eval_runtime": 195.3015, |
| "eval_samples_per_second": 97.306, |
| "eval_steps_per_second": 2.028, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.1627906976744187, |
| "grad_norm": 5.196544647216797, |
| "learning_rate": 0.00019535658914728683, |
| "loss": 6.424893188476562, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.1627906976744187, |
| "eval_loss": 4.754482269287109, |
| "eval_runtime": 197.8093, |
| "eval_samples_per_second": 96.072, |
| "eval_steps_per_second": 2.002, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.2596899224806202, |
| "grad_norm": 4.070798397064209, |
| "learning_rate": 0.00019496899224806202, |
| "loss": 5.379293823242188, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.2596899224806202, |
| "eval_loss": 4.241163730621338, |
| "eval_runtime": 195.9777, |
| "eval_samples_per_second": 96.97, |
| "eval_steps_per_second": 2.021, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.3565891472868217, |
| "grad_norm": 4.9117655754089355, |
| "learning_rate": 0.0001945813953488372, |
| "loss": 4.92329345703125, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.3565891472868217, |
| "eval_loss": 3.873270034790039, |
| "eval_runtime": 194.9014, |
| "eval_samples_per_second": 97.506, |
| "eval_steps_per_second": 2.032, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.4534883720930232, |
| "grad_norm": 4.443520545959473, |
| "learning_rate": 0.00019419379844961242, |
| "loss": 4.538243713378907, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.4534883720930232, |
| "eval_loss": 3.622462749481201, |
| "eval_runtime": 190.9707, |
| "eval_samples_per_second": 99.513, |
| "eval_steps_per_second": 2.074, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.550387596899225, |
| "grad_norm": 4.078836441040039, |
| "learning_rate": 0.0001938062015503876, |
| "loss": 4.198520812988281, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.550387596899225, |
| "eval_loss": 3.3803393840789795, |
| "eval_runtime": 195.0941, |
| "eval_samples_per_second": 97.409, |
| "eval_steps_per_second": 2.03, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.6472868217054264, |
| "grad_norm": 4.003866195678711, |
| "learning_rate": 0.0001934186046511628, |
| "loss": 4.023372497558594, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.6472868217054264, |
| "eval_loss": 3.1622314453125, |
| "eval_runtime": 197.7245, |
| "eval_samples_per_second": 96.114, |
| "eval_steps_per_second": 2.003, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.744186046511628, |
| "grad_norm": 3.277304172515869, |
| "learning_rate": 0.00019303100775193798, |
| "loss": 3.669682922363281, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.744186046511628, |
| "eval_loss": 2.9944067001342773, |
| "eval_runtime": 196.1326, |
| "eval_samples_per_second": 96.894, |
| "eval_steps_per_second": 2.019, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.8410852713178296, |
| "grad_norm": 3.905837059020996, |
| "learning_rate": 0.00019264341085271317, |
| "loss": 3.536895751953125, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.8410852713178296, |
| "eval_loss": 2.87215518951416, |
| "eval_runtime": 194.2696, |
| "eval_samples_per_second": 97.823, |
| "eval_steps_per_second": 2.038, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.937984496124031, |
| "grad_norm": 2.900581121444702, |
| "learning_rate": 0.00019225581395348839, |
| "loss": 3.3392208862304686, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.937984496124031, |
| "eval_loss": 2.717120885848999, |
| "eval_runtime": 195.0025, |
| "eval_samples_per_second": 97.455, |
| "eval_steps_per_second": 2.031, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.0348837209302326, |
| "grad_norm": 2.5737144947052, |
| "learning_rate": 0.00019186821705426357, |
| "loss": 3.0850595092773436, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.0348837209302326, |
| "eval_loss": 2.564358711242676, |
| "eval_runtime": 193.706, |
| "eval_samples_per_second": 98.107, |
| "eval_steps_per_second": 2.044, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.1317829457364343, |
| "grad_norm": 3.0205888748168945, |
| "learning_rate": 0.00019148062015503876, |
| "loss": 3.0150396728515627, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.1317829457364343, |
| "eval_loss": 2.4571454524993896, |
| "eval_runtime": 194.1336, |
| "eval_samples_per_second": 97.891, |
| "eval_steps_per_second": 2.04, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.2286821705426356, |
| "grad_norm": 2.9379639625549316, |
| "learning_rate": 0.00019109302325581395, |
| "loss": 2.7774249267578126, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.2286821705426356, |
| "eval_loss": 2.341874122619629, |
| "eval_runtime": 196.8183, |
| "eval_samples_per_second": 96.556, |
| "eval_steps_per_second": 2.012, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.3255813953488373, |
| "grad_norm": 2.8902111053466797, |
| "learning_rate": 0.00019070542635658916, |
| "loss": 2.78979736328125, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.3255813953488373, |
| "eval_loss": 2.1844258308410645, |
| "eval_runtime": 196.3212, |
| "eval_samples_per_second": 96.801, |
| "eval_steps_per_second": 2.017, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.4224806201550386, |
| "grad_norm": 3.2294564247131348, |
| "learning_rate": 0.00019031782945736435, |
| "loss": 2.461886444091797, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.4224806201550386, |
| "eval_loss": 1.9487553834915161, |
| "eval_runtime": 194.5758, |
| "eval_samples_per_second": 97.669, |
| "eval_steps_per_second": 2.035, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.5193798449612403, |
| "grad_norm": 2.7845847606658936, |
| "learning_rate": 0.00018993023255813954, |
| "loss": 2.2011062622070314, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.5193798449612403, |
| "eval_loss": 1.752272129058838, |
| "eval_runtime": 194.8447, |
| "eval_samples_per_second": 97.534, |
| "eval_steps_per_second": 2.032, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.616279069767442, |
| "grad_norm": 2.7175710201263428, |
| "learning_rate": 0.00018954263565891476, |
| "loss": 1.9771908569335936, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.616279069767442, |
| "eval_loss": 1.6105732917785645, |
| "eval_runtime": 194.2477, |
| "eval_samples_per_second": 97.834, |
| "eval_steps_per_second": 2.039, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.7131782945736433, |
| "grad_norm": 2.9664433002471924, |
| "learning_rate": 0.00018915503875968994, |
| "loss": 1.8786553955078125, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.7131782945736433, |
| "eval_loss": 1.5160634517669678, |
| "eval_runtime": 193.1833, |
| "eval_samples_per_second": 98.373, |
| "eval_steps_per_second": 2.05, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.810077519379845, |
| "grad_norm": 2.208284616470337, |
| "learning_rate": 0.00018876744186046513, |
| "loss": 1.7303669738769532, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.810077519379845, |
| "eval_loss": 1.454694151878357, |
| "eval_runtime": 193.997, |
| "eval_samples_per_second": 97.96, |
| "eval_steps_per_second": 2.041, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.9069767441860463, |
| "grad_norm": 2.5443525314331055, |
| "learning_rate": 0.00018837984496124032, |
| "loss": 1.6828465270996094, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.9069767441860463, |
| "eval_loss": 1.3904365301132202, |
| "eval_runtime": 195.2492, |
| "eval_samples_per_second": 97.332, |
| "eval_steps_per_second": 2.028, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.003875968992248, |
| "grad_norm": 2.5396857261657715, |
| "learning_rate": 0.0001879922480620155, |
| "loss": 1.595220489501953, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.003875968992248, |
| "eval_loss": 1.366494059562683, |
| "eval_runtime": 195.4358, |
| "eval_samples_per_second": 97.239, |
| "eval_steps_per_second": 2.026, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.10077519379845, |
| "grad_norm": 2.160076379776001, |
| "learning_rate": 0.00018760465116279072, |
| "loss": 1.5685009765625, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.10077519379845, |
| "eval_loss": 1.3070204257965088, |
| "eval_runtime": 196.739, |
| "eval_samples_per_second": 96.595, |
| "eval_steps_per_second": 2.013, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.197674418604651, |
| "grad_norm": 2.343022346496582, |
| "learning_rate": 0.0001872170542635659, |
| "loss": 1.4889743041992187, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.197674418604651, |
| "eval_loss": 1.2801202535629272, |
| "eval_runtime": 198.8275, |
| "eval_samples_per_second": 95.58, |
| "eval_steps_per_second": 1.992, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.294573643410853, |
| "grad_norm": 2.2497730255126953, |
| "learning_rate": 0.0001868294573643411, |
| "loss": 1.4677432250976563, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.294573643410853, |
| "eval_loss": 1.2478386163711548, |
| "eval_runtime": 200.1485, |
| "eval_samples_per_second": 94.949, |
| "eval_steps_per_second": 1.979, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.391472868217054, |
| "grad_norm": 2.0219998359680176, |
| "learning_rate": 0.00018644186046511629, |
| "loss": 1.4308297729492188, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.391472868217054, |
| "eval_loss": 1.2120215892791748, |
| "eval_runtime": 199.2468, |
| "eval_samples_per_second": 95.379, |
| "eval_steps_per_second": 1.987, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.488372093023256, |
| "grad_norm": 2.2400925159454346, |
| "learning_rate": 0.00018605426356589147, |
| "loss": 1.3934197998046876, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.488372093023256, |
| "eval_loss": 1.1917102336883545, |
| "eval_runtime": 209.7445, |
| "eval_samples_per_second": 90.605, |
| "eval_steps_per_second": 1.888, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.5852713178294575, |
| "grad_norm": 2.072272539138794, |
| "learning_rate": 0.0001856666666666667, |
| "loss": 1.3590359497070312, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.5852713178294575, |
| "eval_loss": 1.1820892095565796, |
| "eval_runtime": 194.7483, |
| "eval_samples_per_second": 97.582, |
| "eval_steps_per_second": 2.033, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.682170542635659, |
| "grad_norm": 1.9475743770599365, |
| "learning_rate": 0.00018527906976744188, |
| "loss": 1.3555595397949218, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.682170542635659, |
| "eval_loss": 1.1485044956207275, |
| "eval_runtime": 198.2014, |
| "eval_samples_per_second": 95.882, |
| "eval_steps_per_second": 1.998, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.7790697674418605, |
| "grad_norm": 2.3375675678253174, |
| "learning_rate": 0.00018489147286821707, |
| "loss": 1.3280506896972657, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.7790697674418605, |
| "eval_loss": 1.1500495672225952, |
| "eval_runtime": 195.9158, |
| "eval_samples_per_second": 97.001, |
| "eval_steps_per_second": 2.021, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.875968992248062, |
| "grad_norm": 2.0993704795837402, |
| "learning_rate": 0.00018450387596899225, |
| "loss": 1.3038815307617186, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.875968992248062, |
| "eval_loss": 1.1234172582626343, |
| "eval_runtime": 202.1206, |
| "eval_samples_per_second": 94.023, |
| "eval_steps_per_second": 1.959, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.9728682170542635, |
| "grad_norm": 1.8678816556930542, |
| "learning_rate": 0.00018411627906976744, |
| "loss": 1.270460968017578, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.9728682170542635, |
| "eval_loss": 1.0995925664901733, |
| "eval_runtime": 201.1179, |
| "eval_samples_per_second": 94.492, |
| "eval_steps_per_second": 1.969, |
| "step": 2050 |
| }, |
| { |
| "epoch": 4.069767441860465, |
| "grad_norm": 1.9179855585098267, |
| "learning_rate": 0.00018372868217054266, |
| "loss": 1.2543260955810547, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.069767441860465, |
| "eval_loss": 1.0878671407699585, |
| "eval_runtime": 195.9153, |
| "eval_samples_per_second": 97.001, |
| "eval_steps_per_second": 2.021, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 1.8298465013504028, |
| "learning_rate": 0.00018334108527131784, |
| "loss": 1.2222324371337892, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "eval_loss": 1.0717836618423462, |
| "eval_runtime": 197.4785, |
| "eval_samples_per_second": 96.233, |
| "eval_steps_per_second": 2.005, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.263565891472869, |
| "grad_norm": 1.885225534439087, |
| "learning_rate": 0.00018295348837209303, |
| "loss": 1.1911175537109375, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.263565891472869, |
| "eval_loss": 1.0390491485595703, |
| "eval_runtime": 200.4332, |
| "eval_samples_per_second": 94.815, |
| "eval_steps_per_second": 1.976, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.3604651162790695, |
| "grad_norm": 1.9083372354507446, |
| "learning_rate": 0.00018256589147286822, |
| "loss": 1.1913112640380858, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.3604651162790695, |
| "eval_loss": 1.0383073091506958, |
| "eval_runtime": 192.9555, |
| "eval_samples_per_second": 98.489, |
| "eval_steps_per_second": 2.052, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.457364341085271, |
| "grad_norm": 2.0930843353271484, |
| "learning_rate": 0.0001821782945736434, |
| "loss": 1.1640525817871095, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.457364341085271, |
| "eval_loss": 1.0316708087921143, |
| "eval_runtime": 198.9764, |
| "eval_samples_per_second": 95.509, |
| "eval_steps_per_second": 1.99, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.554263565891473, |
| "grad_norm": 1.8796041011810303, |
| "learning_rate": 0.0001817906976744186, |
| "loss": 1.1836517333984375, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.554263565891473, |
| "eval_loss": 1.021742343902588, |
| "eval_runtime": 199.5658, |
| "eval_samples_per_second": 95.227, |
| "eval_steps_per_second": 1.984, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.651162790697675, |
| "grad_norm": 1.826181411743164, |
| "learning_rate": 0.0001814031007751938, |
| "loss": 1.1762975311279298, |
| "step": 2400 |
| }, |
| { |
| "epoch": 4.651162790697675, |
| "eval_loss": 1.011062502861023, |
| "eval_runtime": 198.4529, |
| "eval_samples_per_second": 95.761, |
| "eval_steps_per_second": 1.995, |
| "step": 2400 |
| }, |
| { |
| "epoch": 4.748062015503876, |
| "grad_norm": 1.852156400680542, |
| "learning_rate": 0.000181015503875969, |
| "loss": 1.1690707397460938, |
| "step": 2450 |
| }, |
| { |
| "epoch": 4.748062015503876, |
| "eval_loss": 0.9975070953369141, |
| "eval_runtime": 197.7981, |
| "eval_samples_per_second": 96.078, |
| "eval_steps_per_second": 2.002, |
| "step": 2450 |
| }, |
| { |
| "epoch": 4.844961240310077, |
| "grad_norm": 2.0098986625671387, |
| "learning_rate": 0.0001806279069767442, |
| "loss": 1.1409013366699219, |
| "step": 2500 |
| }, |
| { |
| "epoch": 4.844961240310077, |
| "eval_loss": 0.9758601188659668, |
| "eval_runtime": 199.2887, |
| "eval_samples_per_second": 95.359, |
| "eval_steps_per_second": 1.987, |
| "step": 2500 |
| }, |
| { |
| "epoch": 4.941860465116279, |
| "grad_norm": 1.8541340827941895, |
| "learning_rate": 0.00018024031007751937, |
| "loss": 1.1015814971923827, |
| "step": 2550 |
| }, |
| { |
| "epoch": 4.941860465116279, |
| "eval_loss": 0.9661723971366882, |
| "eval_runtime": 200.2897, |
| "eval_samples_per_second": 94.883, |
| "eval_steps_per_second": 1.977, |
| "step": 2550 |
| }, |
| { |
| "epoch": 5.038759689922481, |
| "grad_norm": 1.6043637990951538, |
| "learning_rate": 0.00017985271317829456, |
| "loss": 1.1134808349609375, |
| "step": 2600 |
| }, |
| { |
| "epoch": 5.038759689922481, |
| "eval_loss": 0.967205286026001, |
| "eval_runtime": 198.2009, |
| "eval_samples_per_second": 95.883, |
| "eval_steps_per_second": 1.998, |
| "step": 2600 |
| }, |
| { |
| "epoch": 5.135658914728682, |
| "grad_norm": 1.7434065341949463, |
| "learning_rate": 0.00017946511627906978, |
| "loss": 1.0963155364990234, |
| "step": 2650 |
| }, |
| { |
| "epoch": 5.135658914728682, |
| "eval_loss": 0.9636672735214233, |
| "eval_runtime": 195.4024, |
| "eval_samples_per_second": 97.256, |
| "eval_steps_per_second": 2.027, |
| "step": 2650 |
| }, |
| { |
| "epoch": 5.232558139534884, |
| "grad_norm": 2.160961866378784, |
| "learning_rate": 0.00017907751937984497, |
| "loss": 1.0813286590576172, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.232558139534884, |
| "eval_loss": 0.9522321820259094, |
| "eval_runtime": 195.1264, |
| "eval_samples_per_second": 97.393, |
| "eval_steps_per_second": 2.029, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.329457364341085, |
| "grad_norm": 1.9293419122695923, |
| "learning_rate": 0.00017868992248062015, |
| "loss": 1.0441783142089844, |
| "step": 2750 |
| }, |
| { |
| "epoch": 5.329457364341085, |
| "eval_loss": 0.9408562183380127, |
| "eval_runtime": 193.0048, |
| "eval_samples_per_second": 98.464, |
| "eval_steps_per_second": 2.052, |
| "step": 2750 |
| }, |
| { |
| "epoch": 5.426356589147287, |
| "grad_norm": 1.6187139749526978, |
| "learning_rate": 0.00017830232558139534, |
| "loss": 1.0531800079345703, |
| "step": 2800 |
| }, |
| { |
| "epoch": 5.426356589147287, |
| "eval_loss": 0.9365593791007996, |
| "eval_runtime": 199.8497, |
| "eval_samples_per_second": 95.091, |
| "eval_steps_per_second": 1.981, |
| "step": 2800 |
| }, |
| { |
| "epoch": 5.523255813953488, |
| "grad_norm": 1.7480401992797852, |
| "learning_rate": 0.00017791472868217056, |
| "loss": 1.0467662048339843, |
| "step": 2850 |
| }, |
| { |
| "epoch": 5.523255813953488, |
| "eval_loss": 0.9324782490730286, |
| "eval_runtime": 194.6714, |
| "eval_samples_per_second": 97.621, |
| "eval_steps_per_second": 2.034, |
| "step": 2850 |
| }, |
| { |
| "epoch": 5.62015503875969, |
| "grad_norm": 1.6348450183868408, |
| "learning_rate": 0.00017752713178294574, |
| "loss": 1.0491456604003906, |
| "step": 2900 |
| }, |
| { |
| "epoch": 5.62015503875969, |
| "eval_loss": 0.9339238405227661, |
| "eval_runtime": 193.6506, |
| "eval_samples_per_second": 98.135, |
| "eval_steps_per_second": 2.045, |
| "step": 2900 |
| }, |
| { |
| "epoch": 5.717054263565892, |
| "grad_norm": 1.7508739233016968, |
| "learning_rate": 0.00017713953488372096, |
| "loss": 1.0431405639648437, |
| "step": 2950 |
| }, |
| { |
| "epoch": 5.717054263565892, |
| "eval_loss": 0.9144666194915771, |
| "eval_runtime": 200.9709, |
| "eval_samples_per_second": 94.561, |
| "eval_steps_per_second": 1.97, |
| "step": 2950 |
| }, |
| { |
| "epoch": 5.813953488372093, |
| "grad_norm": 1.8117504119873047, |
| "learning_rate": 0.00017675193798449615, |
| "loss": 1.029040298461914, |
| "step": 3000 |
| }, |
| { |
| "epoch": 5.813953488372093, |
| "eval_loss": 0.910137951374054, |
| "eval_runtime": 198.6885, |
| "eval_samples_per_second": 95.647, |
| "eval_steps_per_second": 1.993, |
| "step": 3000 |
| }, |
| { |
| "epoch": 5.910852713178294, |
| "grad_norm": 1.9361231327056885, |
| "learning_rate": 0.00017636434108527134, |
| "loss": 1.0382290649414063, |
| "step": 3050 |
| }, |
| { |
| "epoch": 5.910852713178294, |
| "eval_loss": 0.8883566856384277, |
| "eval_runtime": 199.026, |
| "eval_samples_per_second": 95.485, |
| "eval_steps_per_second": 1.99, |
| "step": 3050 |
| }, |
| { |
| "epoch": 6.007751937984496, |
| "grad_norm": 1.7819427251815796, |
| "learning_rate": 0.00017597674418604652, |
| "loss": 1.0060308837890626, |
| "step": 3100 |
| }, |
| { |
| "epoch": 6.007751937984496, |
| "eval_loss": 0.9043192863464355, |
| "eval_runtime": 193.7551, |
| "eval_samples_per_second": 98.083, |
| "eval_steps_per_second": 2.044, |
| "step": 3100 |
| }, |
| { |
| "epoch": 6.104651162790698, |
| "grad_norm": 1.7325843572616577, |
| "learning_rate": 0.0001755891472868217, |
| "loss": 1.010067138671875, |
| "step": 3150 |
| }, |
| { |
| "epoch": 6.104651162790698, |
| "eval_loss": 0.8911006450653076, |
| "eval_runtime": 193.7454, |
| "eval_samples_per_second": 98.087, |
| "eval_steps_per_second": 2.044, |
| "step": 3150 |
| }, |
| { |
| "epoch": 6.2015503875969, |
| "grad_norm": 1.707995057106018, |
| "learning_rate": 0.00017520155038759693, |
| "loss": 0.9815747833251953, |
| "step": 3200 |
| }, |
| { |
| "epoch": 6.2015503875969, |
| "eval_loss": 0.8801227807998657, |
| "eval_runtime": 197.3882, |
| "eval_samples_per_second": 96.277, |
| "eval_steps_per_second": 2.006, |
| "step": 3200 |
| }, |
| { |
| "epoch": 6.2984496124031, |
| "grad_norm": 1.4234212636947632, |
| "learning_rate": 0.00017481395348837211, |
| "loss": 0.9903421020507812, |
| "step": 3250 |
| }, |
| { |
| "epoch": 6.2984496124031, |
| "eval_loss": 0.8660780787467957, |
| "eval_runtime": 199.7721, |
| "eval_samples_per_second": 95.128, |
| "eval_steps_per_second": 1.982, |
| "step": 3250 |
| }, |
| { |
| "epoch": 6.395348837209302, |
| "grad_norm": 1.743849277496338, |
| "learning_rate": 0.0001744263565891473, |
| "loss": 0.9762661743164063, |
| "step": 3300 |
| }, |
| { |
| "epoch": 6.395348837209302, |
| "eval_loss": 0.8824377655982971, |
| "eval_runtime": 192.4288, |
| "eval_samples_per_second": 98.759, |
| "eval_steps_per_second": 2.058, |
| "step": 3300 |
| }, |
| { |
| "epoch": 6.492248062015504, |
| "grad_norm": 1.758987545967102, |
| "learning_rate": 0.0001740387596899225, |
| "loss": 0.9555432891845703, |
| "step": 3350 |
| }, |
| { |
| "epoch": 6.492248062015504, |
| "eval_loss": 0.868446409702301, |
| "eval_runtime": 201.784, |
| "eval_samples_per_second": 94.18, |
| "eval_steps_per_second": 1.962, |
| "step": 3350 |
| }, |
| { |
| "epoch": 6.589147286821706, |
| "grad_norm": 1.546152949333191, |
| "learning_rate": 0.00017365116279069768, |
| "loss": 0.9780608367919922, |
| "step": 3400 |
| }, |
| { |
| "epoch": 6.589147286821706, |
| "eval_loss": 0.8624841570854187, |
| "eval_runtime": 200.0021, |
| "eval_samples_per_second": 95.019, |
| "eval_steps_per_second": 1.98, |
| "step": 3400 |
| }, |
| { |
| "epoch": 6.686046511627907, |
| "grad_norm": 1.8314995765686035, |
| "learning_rate": 0.00017326356589147287, |
| "loss": 0.9537903594970704, |
| "step": 3450 |
| }, |
| { |
| "epoch": 6.686046511627907, |
| "eval_loss": 0.8573926091194153, |
| "eval_runtime": 202.7677, |
| "eval_samples_per_second": 93.723, |
| "eval_steps_per_second": 1.953, |
| "step": 3450 |
| }, |
| { |
| "epoch": 6.782945736434108, |
| "grad_norm": 1.645276427268982, |
| "learning_rate": 0.00017287596899224808, |
| "loss": 0.9461723327636719, |
| "step": 3500 |
| }, |
| { |
| "epoch": 6.782945736434108, |
| "eval_loss": 0.8483996391296387, |
| "eval_runtime": 201.379, |
| "eval_samples_per_second": 94.369, |
| "eval_steps_per_second": 1.966, |
| "step": 3500 |
| }, |
| { |
| "epoch": 6.87984496124031, |
| "grad_norm": 1.6280810832977295, |
| "learning_rate": 0.00017248837209302327, |
| "loss": 0.951067886352539, |
| "step": 3550 |
| }, |
| { |
| "epoch": 6.87984496124031, |
| "eval_loss": 0.8427923321723938, |
| "eval_runtime": 212.6273, |
| "eval_samples_per_second": 89.377, |
| "eval_steps_per_second": 1.862, |
| "step": 3550 |
| }, |
| { |
| "epoch": 6.976744186046512, |
| "grad_norm": 1.6208834648132324, |
| "learning_rate": 0.00017210077519379846, |
| "loss": 0.9458073425292969, |
| "step": 3600 |
| }, |
| { |
| "epoch": 6.976744186046512, |
| "eval_loss": 0.8510859608650208, |
| "eval_runtime": 206.0969, |
| "eval_samples_per_second": 92.209, |
| "eval_steps_per_second": 1.921, |
| "step": 3600 |
| }, |
| { |
| "epoch": 7.073643410852713, |
| "grad_norm": 1.702273964881897, |
| "learning_rate": 0.00017171317829457365, |
| "loss": 0.9501612854003906, |
| "step": 3650 |
| }, |
| { |
| "epoch": 7.073643410852713, |
| "eval_loss": 0.8405491709709167, |
| "eval_runtime": 182.1875, |
| "eval_samples_per_second": 104.31, |
| "eval_steps_per_second": 2.174, |
| "step": 3650 |
| }, |
| { |
| "epoch": 7.170542635658915, |
| "grad_norm": 1.7668451070785522, |
| "learning_rate": 0.00017132558139534883, |
| "loss": 0.9099185180664062, |
| "step": 3700 |
| }, |
| { |
| "epoch": 7.170542635658915, |
| "eval_loss": 0.8351926803588867, |
| "eval_runtime": 208.1044, |
| "eval_samples_per_second": 91.32, |
| "eval_steps_per_second": 1.903, |
| "step": 3700 |
| }, |
| { |
| "epoch": 7.267441860465116, |
| "grad_norm": 1.3699573278427124, |
| "learning_rate": 0.00017093798449612405, |
| "loss": 0.8952218627929688, |
| "step": 3750 |
| }, |
| { |
| "epoch": 7.267441860465116, |
| "eval_loss": 0.8353874683380127, |
| "eval_runtime": 200.4099, |
| "eval_samples_per_second": 94.826, |
| "eval_steps_per_second": 1.976, |
| "step": 3750 |
| }, |
| { |
| "epoch": 7.364341085271318, |
| "grad_norm": 1.6185437440872192, |
| "learning_rate": 0.00017055038759689924, |
| "loss": 0.9168299865722657, |
| "step": 3800 |
| }, |
| { |
| "epoch": 7.364341085271318, |
| "eval_loss": 0.8311659693717957, |
| "eval_runtime": 203.7994, |
| "eval_samples_per_second": 93.249, |
| "eval_steps_per_second": 1.943, |
| "step": 3800 |
| }, |
| { |
| "epoch": 7.461240310077519, |
| "grad_norm": 1.5149507522583008, |
| "learning_rate": 0.00017016279069767442, |
| "loss": 0.9037581634521484, |
| "step": 3850 |
| }, |
| { |
| "epoch": 7.461240310077519, |
| "eval_loss": 0.8201795220375061, |
| "eval_runtime": 201.3115, |
| "eval_samples_per_second": 94.401, |
| "eval_steps_per_second": 1.967, |
| "step": 3850 |
| }, |
| { |
| "epoch": 7.558139534883721, |
| "grad_norm": 1.7122201919555664, |
| "learning_rate": 0.0001697751937984496, |
| "loss": 0.9189765930175782, |
| "step": 3900 |
| }, |
| { |
| "epoch": 7.558139534883721, |
| "eval_loss": 0.8164864182472229, |
| "eval_runtime": 206.2635, |
| "eval_samples_per_second": 92.135, |
| "eval_steps_per_second": 1.92, |
| "step": 3900 |
| }, |
| { |
| "epoch": 7.655038759689923, |
| "grad_norm": 1.7001962661743164, |
| "learning_rate": 0.0001693875968992248, |
| "loss": 0.8901979827880859, |
| "step": 3950 |
| }, |
| { |
| "epoch": 7.655038759689923, |
| "eval_loss": 0.8141375184059143, |
| "eval_runtime": 203.6025, |
| "eval_samples_per_second": 93.339, |
| "eval_steps_per_second": 1.945, |
| "step": 3950 |
| }, |
| { |
| "epoch": 7.751937984496124, |
| "grad_norm": 1.6305474042892456, |
| "learning_rate": 0.00016900000000000002, |
| "loss": 0.9295430755615235, |
| "step": 4000 |
| }, |
| { |
| "epoch": 7.751937984496124, |
| "eval_loss": 0.8144168257713318, |
| "eval_runtime": 202.7837, |
| "eval_samples_per_second": 93.716, |
| "eval_steps_per_second": 1.953, |
| "step": 4000 |
| }, |
| { |
| "epoch": 7.848837209302325, |
| "grad_norm": 1.7076892852783203, |
| "learning_rate": 0.0001686124031007752, |
| "loss": 0.9095289611816406, |
| "step": 4050 |
| }, |
| { |
| "epoch": 7.848837209302325, |
| "eval_loss": 0.8173167109489441, |
| "eval_runtime": 205.0403, |
| "eval_samples_per_second": 92.684, |
| "eval_steps_per_second": 1.931, |
| "step": 4050 |
| }, |
| { |
| "epoch": 7.945736434108527, |
| "grad_norm": 1.469099521636963, |
| "learning_rate": 0.0001682248062015504, |
| "loss": 0.8886381530761719, |
| "step": 4100 |
| }, |
| { |
| "epoch": 7.945736434108527, |
| "eval_loss": 0.8015902042388916, |
| "eval_runtime": 206.0488, |
| "eval_samples_per_second": 92.231, |
| "eval_steps_per_second": 1.922, |
| "step": 4100 |
| }, |
| { |
| "epoch": 8.042635658914728, |
| "grad_norm": 1.439172387123108, |
| "learning_rate": 0.00016783720930232558, |
| "loss": 0.8689357757568359, |
| "step": 4150 |
| }, |
| { |
| "epoch": 8.042635658914728, |
| "eval_loss": 0.8171545267105103, |
| "eval_runtime": 202.334, |
| "eval_samples_per_second": 93.924, |
| "eval_steps_per_second": 1.957, |
| "step": 4150 |
| }, |
| { |
| "epoch": 8.13953488372093, |
| "grad_norm": 1.7564213275909424, |
| "learning_rate": 0.00016744961240310077, |
| "loss": 0.8746170043945313, |
| "step": 4200 |
| }, |
| { |
| "epoch": 8.13953488372093, |
| "eval_loss": 0.8083029389381409, |
| "eval_runtime": 202.7276, |
| "eval_samples_per_second": 93.742, |
| "eval_steps_per_second": 1.953, |
| "step": 4200 |
| }, |
| { |
| "epoch": 8.236434108527131, |
| "grad_norm": 1.6364485025405884, |
| "learning_rate": 0.00016706201550387595, |
| "loss": 0.8728688812255859, |
| "step": 4250 |
| }, |
| { |
| "epoch": 8.236434108527131, |
| "eval_loss": 0.7914299368858337, |
| "eval_runtime": 205.9727, |
| "eval_samples_per_second": 92.265, |
| "eval_steps_per_second": 1.923, |
| "step": 4250 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 1.4534393548965454, |
| "learning_rate": 0.00016667441860465117, |
| "loss": 0.8740718841552735, |
| "step": 4300 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "eval_loss": 0.7857058048248291, |
| "eval_runtime": 205.9583, |
| "eval_samples_per_second": 92.271, |
| "eval_steps_per_second": 1.923, |
| "step": 4300 |
| }, |
| { |
| "epoch": 8.430232558139535, |
| "grad_norm": 1.2946027517318726, |
| "learning_rate": 0.00016628682170542636, |
| "loss": 0.8651500701904297, |
| "step": 4350 |
| }, |
| { |
| "epoch": 8.430232558139535, |
| "eval_loss": 0.7967393398284912, |
| "eval_runtime": 202.1907, |
| "eval_samples_per_second": 93.99, |
| "eval_steps_per_second": 1.959, |
| "step": 4350 |
| }, |
| { |
| "epoch": 8.527131782945737, |
| "grad_norm": 1.2329447269439697, |
| "learning_rate": 0.00016589922480620155, |
| "loss": 0.8730928802490234, |
| "step": 4400 |
| }, |
| { |
| "epoch": 8.527131782945737, |
| "eval_loss": 0.7840523719787598, |
| "eval_runtime": 200.6499, |
| "eval_samples_per_second": 94.712, |
| "eval_steps_per_second": 1.974, |
| "step": 4400 |
| }, |
| { |
| "epoch": 8.624031007751938, |
| "grad_norm": 1.441260814666748, |
| "learning_rate": 0.00016551162790697676, |
| "loss": 0.8639598083496094, |
| "step": 4450 |
| }, |
| { |
| "epoch": 8.624031007751938, |
| "eval_loss": 0.7866470217704773, |
| "eval_runtime": 201.2114, |
| "eval_samples_per_second": 94.448, |
| "eval_steps_per_second": 1.968, |
| "step": 4450 |
| }, |
| { |
| "epoch": 8.720930232558139, |
| "grad_norm": 1.4688923358917236, |
| "learning_rate": 0.00016512403100775195, |
| "loss": 0.8695674896240234, |
| "step": 4500 |
| }, |
| { |
| "epoch": 8.720930232558139, |
| "eval_loss": 0.7772266864776611, |
| "eval_runtime": 203.6283, |
| "eval_samples_per_second": 93.327, |
| "eval_steps_per_second": 1.945, |
| "step": 4500 |
| }, |
| { |
| "epoch": 8.817829457364342, |
| "grad_norm": 1.7634507417678833, |
| "learning_rate": 0.00016473643410852714, |
| "loss": 0.867518310546875, |
| "step": 4550 |
| }, |
| { |
| "epoch": 8.817829457364342, |
| "eval_loss": 0.7764750123023987, |
| "eval_runtime": 197.3536, |
| "eval_samples_per_second": 96.294, |
| "eval_steps_per_second": 2.007, |
| "step": 4550 |
| }, |
| { |
| "epoch": 8.914728682170542, |
| "grad_norm": 1.3822124004364014, |
| "learning_rate": 0.00016434883720930235, |
| "loss": 0.8379183959960937, |
| "step": 4600 |
| }, |
| { |
| "epoch": 8.914728682170542, |
| "eval_loss": 0.7751489877700806, |
| "eval_runtime": 204.9848, |
| "eval_samples_per_second": 92.709, |
| "eval_steps_per_second": 1.932, |
| "step": 4600 |
| }, |
| { |
| "epoch": 9.011627906976743, |
| "grad_norm": 1.485487461090088, |
| "learning_rate": 0.00016396124031007754, |
| "loss": 0.8471622467041016, |
| "step": 4650 |
| }, |
| { |
| "epoch": 9.011627906976743, |
| "eval_loss": 0.7833512425422668, |
| "eval_runtime": 204.7415, |
| "eval_samples_per_second": 92.819, |
| "eval_steps_per_second": 1.934, |
| "step": 4650 |
| }, |
| { |
| "epoch": 9.108527131782946, |
| "grad_norm": 1.704746961593628, |
| "learning_rate": 0.00016357364341085273, |
| "loss": 0.8221251678466797, |
| "step": 4700 |
| }, |
| { |
| "epoch": 9.108527131782946, |
| "eval_loss": 0.782015860080719, |
| "eval_runtime": 205.5077, |
| "eval_samples_per_second": 92.473, |
| "eval_steps_per_second": 1.927, |
| "step": 4700 |
| }, |
| { |
| "epoch": 9.205426356589147, |
| "grad_norm": 1.6045122146606445, |
| "learning_rate": 0.00016318604651162792, |
| "loss": 0.8225302124023437, |
| "step": 4750 |
| }, |
| { |
| "epoch": 9.205426356589147, |
| "eval_loss": 0.7847553491592407, |
| "eval_runtime": 203.1694, |
| "eval_samples_per_second": 93.538, |
| "eval_steps_per_second": 1.949, |
| "step": 4750 |
| }, |
| { |
| "epoch": 9.30232558139535, |
| "grad_norm": 1.482059121131897, |
| "learning_rate": 0.0001627984496124031, |
| "loss": 0.8403683471679687, |
| "step": 4800 |
| }, |
| { |
| "epoch": 9.30232558139535, |
| "eval_loss": 0.7648666501045227, |
| "eval_runtime": 207.7484, |
| "eval_samples_per_second": 91.476, |
| "eval_steps_per_second": 1.906, |
| "step": 4800 |
| }, |
| { |
| "epoch": 9.39922480620155, |
| "grad_norm": 1.5278195142745972, |
| "learning_rate": 0.00016241085271317832, |
| "loss": 0.8287559509277344, |
| "step": 4850 |
| }, |
| { |
| "epoch": 9.39922480620155, |
| "eval_loss": 0.7762807607650757, |
| "eval_runtime": 207.8748, |
| "eval_samples_per_second": 91.42, |
| "eval_steps_per_second": 1.905, |
| "step": 4850 |
| }, |
| { |
| "epoch": 9.496124031007753, |
| "grad_norm": 1.3437010049819946, |
| "learning_rate": 0.0001620232558139535, |
| "loss": 0.8490474700927735, |
| "step": 4900 |
| }, |
| { |
| "epoch": 9.496124031007753, |
| "eval_loss": 0.7723644375801086, |
| "eval_runtime": 202.5921, |
| "eval_samples_per_second": 93.804, |
| "eval_steps_per_second": 1.955, |
| "step": 4900 |
| }, |
| { |
| "epoch": 9.593023255813954, |
| "grad_norm": 1.41952645778656, |
| "learning_rate": 0.0001616356589147287, |
| "loss": 0.8415257263183594, |
| "step": 4950 |
| }, |
| { |
| "epoch": 9.593023255813954, |
| "eval_loss": 0.7634032964706421, |
| "eval_runtime": 204.2501, |
| "eval_samples_per_second": 93.043, |
| "eval_steps_per_second": 1.939, |
| "step": 4950 |
| }, |
| { |
| "epoch": 9.689922480620154, |
| "grad_norm": 1.5748244524002075, |
| "learning_rate": 0.00016124806201550388, |
| "loss": 0.8278125, |
| "step": 5000 |
| }, |
| { |
| "epoch": 9.689922480620154, |
| "eval_loss": 0.7638477087020874, |
| "eval_runtime": 206.8167, |
| "eval_samples_per_second": 91.888, |
| "eval_steps_per_second": 1.915, |
| "step": 5000 |
| }, |
| { |
| "epoch": 9.786821705426357, |
| "grad_norm": 1.1772520542144775, |
| "learning_rate": 0.00016086046511627907, |
| "loss": 0.8290666961669921, |
| "step": 5050 |
| }, |
| { |
| "epoch": 9.786821705426357, |
| "eval_loss": 0.7504697442054749, |
| "eval_runtime": 204.2184, |
| "eval_samples_per_second": 93.057, |
| "eval_steps_per_second": 1.939, |
| "step": 5050 |
| }, |
| { |
| "epoch": 9.883720930232558, |
| "grad_norm": 1.5144110918045044, |
| "learning_rate": 0.00016047286821705429, |
| "loss": 0.8253756713867187, |
| "step": 5100 |
| }, |
| { |
| "epoch": 9.883720930232558, |
| "eval_loss": 0.7516148090362549, |
| "eval_runtime": 203.6086, |
| "eval_samples_per_second": 93.336, |
| "eval_steps_per_second": 1.945, |
| "step": 5100 |
| }, |
| { |
| "epoch": 9.98062015503876, |
| "grad_norm": 1.3659805059432983, |
| "learning_rate": 0.00016008527131782947, |
| "loss": 0.8156401062011719, |
| "step": 5150 |
| }, |
| { |
| "epoch": 9.98062015503876, |
| "eval_loss": 0.7507323622703552, |
| "eval_runtime": 206.1146, |
| "eval_samples_per_second": 92.201, |
| "eval_steps_per_second": 1.921, |
| "step": 5150 |
| }, |
| { |
| "epoch": 10.077519379844961, |
| "grad_norm": 1.507645606994629, |
| "learning_rate": 0.00015969767441860466, |
| "loss": 0.8164395904541015, |
| "step": 5200 |
| }, |
| { |
| "epoch": 10.077519379844961, |
| "eval_loss": 0.7499141097068787, |
| "eval_runtime": 208.0903, |
| "eval_samples_per_second": 91.326, |
| "eval_steps_per_second": 1.903, |
| "step": 5200 |
| }, |
| { |
| "epoch": 10.174418604651162, |
| "grad_norm": 1.1920627355575562, |
| "learning_rate": 0.00015931007751937985, |
| "loss": 0.8137637329101562, |
| "step": 5250 |
| }, |
| { |
| "epoch": 10.174418604651162, |
| "eval_loss": 0.7570334672927856, |
| "eval_runtime": 210.8201, |
| "eval_samples_per_second": 90.143, |
| "eval_steps_per_second": 1.878, |
| "step": 5250 |
| }, |
| { |
| "epoch": 10.271317829457365, |
| "grad_norm": 1.5026272535324097, |
| "learning_rate": 0.00015892248062015504, |
| "loss": 0.8010871887207032, |
| "step": 5300 |
| }, |
| { |
| "epoch": 10.271317829457365, |
| "eval_loss": 0.7520400881767273, |
| "eval_runtime": 204.6587, |
| "eval_samples_per_second": 92.857, |
| "eval_steps_per_second": 1.935, |
| "step": 5300 |
| }, |
| { |
| "epoch": 10.368217054263566, |
| "grad_norm": 1.386483907699585, |
| "learning_rate": 0.00015853488372093023, |
| "loss": 0.808378677368164, |
| "step": 5350 |
| }, |
| { |
| "epoch": 10.368217054263566, |
| "eval_loss": 0.7365431785583496, |
| "eval_runtime": 208.4559, |
| "eval_samples_per_second": 91.166, |
| "eval_steps_per_second": 1.9, |
| "step": 5350 |
| }, |
| { |
| "epoch": 10.465116279069768, |
| "grad_norm": 1.3923448324203491, |
| "learning_rate": 0.00015814728682170544, |
| "loss": 0.7882123565673829, |
| "step": 5400 |
| }, |
| { |
| "epoch": 10.465116279069768, |
| "eval_loss": 0.7501969933509827, |
| "eval_runtime": 205.9772, |
| "eval_samples_per_second": 92.263, |
| "eval_steps_per_second": 1.923, |
| "step": 5400 |
| }, |
| { |
| "epoch": 10.562015503875969, |
| "grad_norm": 1.2909716367721558, |
| "learning_rate": 0.00015775968992248063, |
| "loss": 0.7820880126953125, |
| "step": 5450 |
| }, |
| { |
| "epoch": 10.562015503875969, |
| "eval_loss": 0.7447800636291504, |
| "eval_runtime": 209.0079, |
| "eval_samples_per_second": 90.925, |
| "eval_steps_per_second": 1.895, |
| "step": 5450 |
| }, |
| { |
| "epoch": 10.65891472868217, |
| "grad_norm": 1.2773196697235107, |
| "learning_rate": 0.00015737209302325582, |
| "loss": 0.788450698852539, |
| "step": 5500 |
| }, |
| { |
| "epoch": 10.65891472868217, |
| "eval_loss": 0.7404767274856567, |
| "eval_runtime": 213.2254, |
| "eval_samples_per_second": 89.126, |
| "eval_steps_per_second": 1.857, |
| "step": 5500 |
| }, |
| { |
| "epoch": 10.755813953488373, |
| "grad_norm": 1.4315084218978882, |
| "learning_rate": 0.000156984496124031, |
| "loss": 0.7946225738525391, |
| "step": 5550 |
| }, |
| { |
| "epoch": 10.755813953488373, |
| "eval_loss": 0.7304003238677979, |
| "eval_runtime": 205.986, |
| "eval_samples_per_second": 92.259, |
| "eval_steps_per_second": 1.922, |
| "step": 5550 |
| }, |
| { |
| "epoch": 10.852713178294573, |
| "grad_norm": 1.2794160842895508, |
| "learning_rate": 0.0001565968992248062, |
| "loss": 0.7854644012451172, |
| "step": 5600 |
| }, |
| { |
| "epoch": 10.852713178294573, |
| "eval_loss": 0.7165542244911194, |
| "eval_runtime": 211.2507, |
| "eval_samples_per_second": 89.959, |
| "eval_steps_per_second": 1.875, |
| "step": 5600 |
| }, |
| { |
| "epoch": 10.949612403100776, |
| "grad_norm": 1.4190521240234375, |
| "learning_rate": 0.0001562093023255814, |
| "loss": 0.7866236114501953, |
| "step": 5650 |
| }, |
| { |
| "epoch": 10.949612403100776, |
| "eval_loss": 0.7293540239334106, |
| "eval_runtime": 206.6809, |
| "eval_samples_per_second": 91.949, |
| "eval_steps_per_second": 1.916, |
| "step": 5650 |
| }, |
| { |
| "epoch": 11.046511627906977, |
| "grad_norm": 1.298693299293518, |
| "learning_rate": 0.0001558217054263566, |
| "loss": 0.7865670013427735, |
| "step": 5700 |
| }, |
| { |
| "epoch": 11.046511627906977, |
| "eval_loss": 0.7284151911735535, |
| "eval_runtime": 204.4335, |
| "eval_samples_per_second": 92.959, |
| "eval_steps_per_second": 1.937, |
| "step": 5700 |
| }, |
| { |
| "epoch": 11.143410852713178, |
| "grad_norm": 1.1174720525741577, |
| "learning_rate": 0.00015543410852713178, |
| "loss": 0.773541488647461, |
| "step": 5750 |
| }, |
| { |
| "epoch": 11.143410852713178, |
| "eval_loss": 0.7278522253036499, |
| "eval_runtime": 146.5948, |
| "eval_samples_per_second": 129.636, |
| "eval_steps_per_second": 2.701, |
| "step": 5750 |
| }, |
| { |
| "epoch": 11.24031007751938, |
| "grad_norm": 1.2272348403930664, |
| "learning_rate": 0.00015504651162790697, |
| "loss": 0.7666770935058593, |
| "step": 5800 |
| }, |
| { |
| "epoch": 11.24031007751938, |
| "eval_loss": 0.7202744483947754, |
| "eval_runtime": 193.7949, |
| "eval_samples_per_second": 98.062, |
| "eval_steps_per_second": 2.043, |
| "step": 5800 |
| }, |
| { |
| "epoch": 11.337209302325581, |
| "grad_norm": 1.373920202255249, |
| "learning_rate": 0.00015465891472868216, |
| "loss": 0.7730393981933594, |
| "step": 5850 |
| }, |
| { |
| "epoch": 11.337209302325581, |
| "eval_loss": 0.7175942063331604, |
| "eval_runtime": 197.7471, |
| "eval_samples_per_second": 96.103, |
| "eval_steps_per_second": 2.003, |
| "step": 5850 |
| }, |
| { |
| "epoch": 11.434108527131784, |
| "grad_norm": 1.4184571504592896, |
| "learning_rate": 0.00015427131782945737, |
| "loss": 0.7573123168945313, |
| "step": 5900 |
| }, |
| { |
| "epoch": 11.434108527131784, |
| "eval_loss": 0.7180309891700745, |
| "eval_runtime": 198.5097, |
| "eval_samples_per_second": 95.733, |
| "eval_steps_per_second": 1.995, |
| "step": 5900 |
| }, |
| { |
| "epoch": 11.531007751937985, |
| "grad_norm": 1.357619047164917, |
| "learning_rate": 0.00015388372093023256, |
| "loss": 0.7792025756835937, |
| "step": 5950 |
| }, |
| { |
| "epoch": 11.531007751937985, |
| "eval_loss": 0.7118659615516663, |
| "eval_runtime": 199.1368, |
| "eval_samples_per_second": 95.432, |
| "eval_steps_per_second": 1.989, |
| "step": 5950 |
| }, |
| { |
| "epoch": 11.627906976744185, |
| "grad_norm": 1.3384771347045898, |
| "learning_rate": 0.00015349612403100775, |
| "loss": 0.77959228515625, |
| "step": 6000 |
| }, |
| { |
| "epoch": 11.627906976744185, |
| "eval_loss": 0.7145297527313232, |
| "eval_runtime": 191.3254, |
| "eval_samples_per_second": 99.328, |
| "eval_steps_per_second": 2.07, |
| "step": 6000 |
| }, |
| { |
| "epoch": 11.724806201550388, |
| "grad_norm": 1.346449375152588, |
| "learning_rate": 0.00015310852713178296, |
| "loss": 0.760606918334961, |
| "step": 6050 |
| }, |
| { |
| "epoch": 11.724806201550388, |
| "eval_loss": 0.7076368927955627, |
| "eval_runtime": 195.1439, |
| "eval_samples_per_second": 97.385, |
| "eval_steps_per_second": 2.029, |
| "step": 6050 |
| }, |
| { |
| "epoch": 11.821705426356589, |
| "grad_norm": 1.3643659353256226, |
| "learning_rate": 0.00015272093023255815, |
| "loss": 0.7598040008544922, |
| "step": 6100 |
| }, |
| { |
| "epoch": 11.821705426356589, |
| "eval_loss": 0.7118851542472839, |
| "eval_runtime": 197.8301, |
| "eval_samples_per_second": 96.062, |
| "eval_steps_per_second": 2.002, |
| "step": 6100 |
| }, |
| { |
| "epoch": 11.918604651162791, |
| "grad_norm": 1.3271793127059937, |
| "learning_rate": 0.00015233333333333334, |
| "loss": 0.7641516876220703, |
| "step": 6150 |
| }, |
| { |
| "epoch": 11.918604651162791, |
| "eval_loss": 0.7121263146400452, |
| "eval_runtime": 193.0891, |
| "eval_samples_per_second": 98.421, |
| "eval_steps_per_second": 2.051, |
| "step": 6150 |
| }, |
| { |
| "epoch": 12.015503875968992, |
| "grad_norm": 1.4539422988891602, |
| "learning_rate": 0.00015194573643410856, |
| "loss": 0.7536121368408203, |
| "step": 6200 |
| }, |
| { |
| "epoch": 12.015503875968992, |
| "eval_loss": 0.6982870101928711, |
| "eval_runtime": 193.5336, |
| "eval_samples_per_second": 98.195, |
| "eval_steps_per_second": 2.046, |
| "step": 6200 |
| }, |
| { |
| "epoch": 12.112403100775193, |
| "grad_norm": 1.204176902770996, |
| "learning_rate": 0.00015155813953488374, |
| "loss": 0.7676261138916015, |
| "step": 6250 |
| }, |
| { |
| "epoch": 12.112403100775193, |
| "eval_loss": 0.7079237699508667, |
| "eval_runtime": 200.2318, |
| "eval_samples_per_second": 94.91, |
| "eval_steps_per_second": 1.978, |
| "step": 6250 |
| }, |
| { |
| "epoch": 12.209302325581396, |
| "grad_norm": 1.0595972537994385, |
| "learning_rate": 0.00015117054263565893, |
| "loss": 0.7566854858398437, |
| "step": 6300 |
| }, |
| { |
| "epoch": 12.209302325581396, |
| "eval_loss": 0.7162359356880188, |
| "eval_runtime": 194.1508, |
| "eval_samples_per_second": 97.883, |
| "eval_steps_per_second": 2.04, |
| "step": 6300 |
| }, |
| { |
| "epoch": 12.306201550387597, |
| "grad_norm": 1.2803763151168823, |
| "learning_rate": 0.00015078294573643412, |
| "loss": 0.7496888732910156, |
| "step": 6350 |
| }, |
| { |
| "epoch": 12.306201550387597, |
| "eval_loss": 0.7112685441970825, |
| "eval_runtime": 190.486, |
| "eval_samples_per_second": 99.766, |
| "eval_steps_per_second": 2.079, |
| "step": 6350 |
| }, |
| { |
| "epoch": 12.4031007751938, |
| "grad_norm": 1.1954985857009888, |
| "learning_rate": 0.0001503953488372093, |
| "loss": 0.7482534790039063, |
| "step": 6400 |
| }, |
| { |
| "epoch": 12.4031007751938, |
| "eval_loss": 0.702034056186676, |
| "eval_runtime": 196.6133, |
| "eval_samples_per_second": 96.657, |
| "eval_steps_per_second": 2.014, |
| "step": 6400 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 1.217117428779602, |
| "learning_rate": 0.0001500077519379845, |
| "loss": 0.7486822509765625, |
| "step": 6450 |
| }, |
| { |
| "epoch": 12.5, |
| "eval_loss": 0.7068222165107727, |
| "eval_runtime": 200.776, |
| "eval_samples_per_second": 94.653, |
| "eval_steps_per_second": 1.972, |
| "step": 6450 |
| }, |
| { |
| "epoch": 12.5968992248062, |
| "grad_norm": 1.3999775648117065, |
| "learning_rate": 0.0001496201550387597, |
| "loss": 0.7376171112060547, |
| "step": 6500 |
| }, |
| { |
| "epoch": 12.5968992248062, |
| "eval_loss": 0.7056812644004822, |
| "eval_runtime": 196.5, |
| "eval_samples_per_second": 96.712, |
| "eval_steps_per_second": 2.015, |
| "step": 6500 |
| }, |
| { |
| "epoch": 12.693798449612403, |
| "grad_norm": 1.5863757133483887, |
| "learning_rate": 0.0001492325581395349, |
| "loss": 0.7303102111816406, |
| "step": 6550 |
| }, |
| { |
| "epoch": 12.693798449612403, |
| "eval_loss": 0.7024106383323669, |
| "eval_runtime": 192.9575, |
| "eval_samples_per_second": 98.488, |
| "eval_steps_per_second": 2.052, |
| "step": 6550 |
| }, |
| { |
| "epoch": 12.790697674418604, |
| "grad_norm": 1.221718192100525, |
| "learning_rate": 0.0001488449612403101, |
| "loss": 0.7352449798583984, |
| "step": 6600 |
| }, |
| { |
| "epoch": 12.790697674418604, |
| "eval_loss": 0.6982800960540771, |
| "eval_runtime": 178.9122, |
| "eval_samples_per_second": 106.22, |
| "eval_steps_per_second": 2.213, |
| "step": 6600 |
| }, |
| { |
| "epoch": 12.887596899224807, |
| "grad_norm": 1.2759432792663574, |
| "learning_rate": 0.00014845736434108527, |
| "loss": 0.7509404754638672, |
| "step": 6650 |
| }, |
| { |
| "epoch": 12.887596899224807, |
| "eval_loss": 0.7055057883262634, |
| "eval_runtime": 180.4112, |
| "eval_samples_per_second": 105.337, |
| "eval_steps_per_second": 2.195, |
| "step": 6650 |
| }, |
| { |
| "epoch": 12.984496124031008, |
| "grad_norm": 1.3235024213790894, |
| "learning_rate": 0.00014806976744186046, |
| "loss": 0.7331123352050781, |
| "step": 6700 |
| }, |
| { |
| "epoch": 12.984496124031008, |
| "eval_loss": 0.6977774500846863, |
| "eval_runtime": 177.5576, |
| "eval_samples_per_second": 107.03, |
| "eval_steps_per_second": 2.23, |
| "step": 6700 |
| }, |
| { |
| "epoch": 13.081395348837209, |
| "grad_norm": 1.3682844638824463, |
| "learning_rate": 0.00014768217054263568, |
| "loss": 0.7287850189208984, |
| "step": 6750 |
| }, |
| { |
| "epoch": 13.081395348837209, |
| "eval_loss": 0.6928258538246155, |
| "eval_runtime": 168.0062, |
| "eval_samples_per_second": 113.115, |
| "eval_steps_per_second": 2.357, |
| "step": 6750 |
| }, |
| { |
| "epoch": 13.178294573643411, |
| "grad_norm": 1.3493455648422241, |
| "learning_rate": 0.00014729457364341087, |
| "loss": 0.7296273040771485, |
| "step": 6800 |
| }, |
| { |
| "epoch": 13.178294573643411, |
| "eval_loss": 0.6921527981758118, |
| "eval_runtime": 183.7563, |
| "eval_samples_per_second": 103.42, |
| "eval_steps_per_second": 2.155, |
| "step": 6800 |
| }, |
| { |
| "epoch": 13.275193798449612, |
| "grad_norm": 1.1569421291351318, |
| "learning_rate": 0.00014690697674418605, |
| "loss": 0.7132796478271485, |
| "step": 6850 |
| }, |
| { |
| "epoch": 13.275193798449612, |
| "eval_loss": 0.6823224425315857, |
| "eval_runtime": 184.3854, |
| "eval_samples_per_second": 103.067, |
| "eval_steps_per_second": 2.148, |
| "step": 6850 |
| }, |
| { |
| "epoch": 13.372093023255815, |
| "grad_norm": 1.392767310142517, |
| "learning_rate": 0.00014651937984496124, |
| "loss": 0.7273464965820312, |
| "step": 6900 |
| }, |
| { |
| "epoch": 13.372093023255815, |
| "eval_loss": 0.6904884576797485, |
| "eval_runtime": 167.1685, |
| "eval_samples_per_second": 113.682, |
| "eval_steps_per_second": 2.369, |
| "step": 6900 |
| }, |
| { |
| "epoch": 13.468992248062015, |
| "grad_norm": 1.1925963163375854, |
| "learning_rate": 0.00014613178294573643, |
| "loss": 0.7213536834716797, |
| "step": 6950 |
| }, |
| { |
| "epoch": 13.468992248062015, |
| "eval_loss": 0.6783022284507751, |
| "eval_runtime": 162.2209, |
| "eval_samples_per_second": 117.149, |
| "eval_steps_per_second": 2.441, |
| "step": 6950 |
| }, |
| { |
| "epoch": 13.565891472868216, |
| "grad_norm": 1.2016693353652954, |
| "learning_rate": 0.00014574418604651164, |
| "loss": 0.7257649230957032, |
| "step": 7000 |
| }, |
| { |
| "epoch": 13.565891472868216, |
| "eval_loss": 0.6885735988616943, |
| "eval_runtime": 154.1542, |
| "eval_samples_per_second": 123.279, |
| "eval_steps_per_second": 2.569, |
| "step": 7000 |
| }, |
| { |
| "epoch": 13.662790697674419, |
| "grad_norm": 1.0946542024612427, |
| "learning_rate": 0.00014535658914728683, |
| "loss": 0.7063812255859375, |
| "step": 7050 |
| }, |
| { |
| "epoch": 13.662790697674419, |
| "eval_loss": 0.6905943155288696, |
| "eval_runtime": 159.4294, |
| "eval_samples_per_second": 119.2, |
| "eval_steps_per_second": 2.484, |
| "step": 7050 |
| }, |
| { |
| "epoch": 13.75968992248062, |
| "grad_norm": 1.445551872253418, |
| "learning_rate": 0.00014496899224806202, |
| "loss": 0.7338412475585937, |
| "step": 7100 |
| }, |
| { |
| "epoch": 13.75968992248062, |
| "eval_loss": 0.6792259216308594, |
| "eval_runtime": 168.9572, |
| "eval_samples_per_second": 112.478, |
| "eval_steps_per_second": 2.344, |
| "step": 7100 |
| }, |
| { |
| "epoch": 13.856589147286822, |
| "grad_norm": 1.1020028591156006, |
| "learning_rate": 0.0001445813953488372, |
| "loss": 0.7323764801025391, |
| "step": 7150 |
| }, |
| { |
| "epoch": 13.856589147286822, |
| "eval_loss": 0.6802482604980469, |
| "eval_runtime": 177.1213, |
| "eval_samples_per_second": 107.294, |
| "eval_steps_per_second": 2.236, |
| "step": 7150 |
| }, |
| { |
| "epoch": 13.953488372093023, |
| "grad_norm": 1.3807705640792847, |
| "learning_rate": 0.0001441937984496124, |
| "loss": 0.711009521484375, |
| "step": 7200 |
| }, |
| { |
| "epoch": 13.953488372093023, |
| "eval_loss": 0.6847464442253113, |
| "eval_runtime": 186.7934, |
| "eval_samples_per_second": 101.738, |
| "eval_steps_per_second": 2.12, |
| "step": 7200 |
| }, |
| { |
| "epoch": 14.050387596899224, |
| "grad_norm": 1.2024147510528564, |
| "learning_rate": 0.0001438062015503876, |
| "loss": 0.7219676971435547, |
| "step": 7250 |
| }, |
| { |
| "epoch": 14.050387596899224, |
| "eval_loss": 0.6811977028846741, |
| "eval_runtime": 192.2788, |
| "eval_samples_per_second": 98.836, |
| "eval_steps_per_second": 2.06, |
| "step": 7250 |
| }, |
| { |
| "epoch": 14.147286821705427, |
| "grad_norm": 1.3249707221984863, |
| "learning_rate": 0.0001434186046511628, |
| "loss": 0.7023135375976562, |
| "step": 7300 |
| }, |
| { |
| "epoch": 14.147286821705427, |
| "eval_loss": 0.6774859428405762, |
| "eval_runtime": 188.9814, |
| "eval_samples_per_second": 100.56, |
| "eval_steps_per_second": 2.095, |
| "step": 7300 |
| }, |
| { |
| "epoch": 14.244186046511627, |
| "grad_norm": 1.185625433921814, |
| "learning_rate": 0.000143031007751938, |
| "loss": 0.7096491241455078, |
| "step": 7350 |
| }, |
| { |
| "epoch": 14.244186046511627, |
| "eval_loss": 0.6745359301567078, |
| "eval_runtime": 182.8908, |
| "eval_samples_per_second": 103.909, |
| "eval_steps_per_second": 2.165, |
| "step": 7350 |
| }, |
| { |
| "epoch": 14.34108527131783, |
| "grad_norm": 1.3041390180587769, |
| "learning_rate": 0.00014264341085271318, |
| "loss": 0.7081405639648437, |
| "step": 7400 |
| }, |
| { |
| "epoch": 14.34108527131783, |
| "eval_loss": 0.6794308423995972, |
| "eval_runtime": 179.4287, |
| "eval_samples_per_second": 105.914, |
| "eval_steps_per_second": 2.207, |
| "step": 7400 |
| }, |
| { |
| "epoch": 14.437984496124031, |
| "grad_norm": 1.2018780708312988, |
| "learning_rate": 0.00014225581395348836, |
| "loss": 0.7056974029541015, |
| "step": 7450 |
| }, |
| { |
| "epoch": 14.437984496124031, |
| "eval_loss": 0.6793537735939026, |
| "eval_runtime": 186.1901, |
| "eval_samples_per_second": 102.068, |
| "eval_steps_per_second": 2.127, |
| "step": 7450 |
| }, |
| { |
| "epoch": 14.534883720930232, |
| "grad_norm": 1.1330044269561768, |
| "learning_rate": 0.00014186821705426355, |
| "loss": 0.7149968719482422, |
| "step": 7500 |
| }, |
| { |
| "epoch": 14.534883720930232, |
| "eval_loss": 0.6830456256866455, |
| "eval_runtime": 183.7445, |
| "eval_samples_per_second": 103.426, |
| "eval_steps_per_second": 2.155, |
| "step": 7500 |
| }, |
| { |
| "epoch": 14.631782945736434, |
| "grad_norm": 1.2338323593139648, |
| "learning_rate": 0.00014148062015503877, |
| "loss": 0.7174818420410156, |
| "step": 7550 |
| }, |
| { |
| "epoch": 14.631782945736434, |
| "eval_loss": 0.6693219542503357, |
| "eval_runtime": 187.3627, |
| "eval_samples_per_second": 101.429, |
| "eval_steps_per_second": 2.114, |
| "step": 7550 |
| }, |
| { |
| "epoch": 14.728682170542635, |
| "grad_norm": 1.3604202270507812, |
| "learning_rate": 0.00014109302325581395, |
| "loss": 0.6990853118896484, |
| "step": 7600 |
| }, |
| { |
| "epoch": 14.728682170542635, |
| "eval_loss": 0.6684260368347168, |
| "eval_runtime": 188.918, |
| "eval_samples_per_second": 100.594, |
| "eval_steps_per_second": 2.096, |
| "step": 7600 |
| }, |
| { |
| "epoch": 14.825581395348838, |
| "grad_norm": 1.1873857975006104, |
| "learning_rate": 0.00014070542635658917, |
| "loss": 0.7052565765380859, |
| "step": 7650 |
| }, |
| { |
| "epoch": 14.825581395348838, |
| "eval_loss": 0.6708822846412659, |
| "eval_runtime": 215.4533, |
| "eval_samples_per_second": 88.205, |
| "eval_steps_per_second": 1.838, |
| "step": 7650 |
| }, |
| { |
| "epoch": 14.922480620155039, |
| "grad_norm": 1.3028804063796997, |
| "learning_rate": 0.00014031782945736436, |
| "loss": 0.7046041870117188, |
| "step": 7700 |
| }, |
| { |
| "epoch": 14.922480620155039, |
| "eval_loss": 0.6715724468231201, |
| "eval_runtime": 207.9457, |
| "eval_samples_per_second": 91.389, |
| "eval_steps_per_second": 1.904, |
| "step": 7700 |
| }, |
| { |
| "epoch": 15.01937984496124, |
| "grad_norm": 1.2554810047149658, |
| "learning_rate": 0.00013993023255813954, |
| "loss": 0.6970509338378906, |
| "step": 7750 |
| }, |
| { |
| "epoch": 15.01937984496124, |
| "eval_loss": 0.6706631779670715, |
| "eval_runtime": 208.1519, |
| "eval_samples_per_second": 91.299, |
| "eval_steps_per_second": 1.902, |
| "step": 7750 |
| }, |
| { |
| "epoch": 15.116279069767442, |
| "grad_norm": 1.1705738306045532, |
| "learning_rate": 0.00013954263565891473, |
| "loss": 0.7088002014160156, |
| "step": 7800 |
| }, |
| { |
| "epoch": 15.116279069767442, |
| "eval_loss": 0.6595144271850586, |
| "eval_runtime": 214.6398, |
| "eval_samples_per_second": 88.539, |
| "eval_steps_per_second": 1.845, |
| "step": 7800 |
| }, |
| { |
| "epoch": 15.213178294573643, |
| "grad_norm": 1.4359619617462158, |
| "learning_rate": 0.00013915503875968995, |
| "loss": 0.6887350463867188, |
| "step": 7850 |
| }, |
| { |
| "epoch": 15.213178294573643, |
| "eval_loss": 0.666128396987915, |
| "eval_runtime": 210.8256, |
| "eval_samples_per_second": 90.141, |
| "eval_steps_per_second": 1.878, |
| "step": 7850 |
| }, |
| { |
| "epoch": 15.310077519379846, |
| "grad_norm": 1.2694238424301147, |
| "learning_rate": 0.00013876744186046514, |
| "loss": 0.6924005889892578, |
| "step": 7900 |
| }, |
| { |
| "epoch": 15.310077519379846, |
| "eval_loss": 0.6729814410209656, |
| "eval_runtime": 198.8675, |
| "eval_samples_per_second": 95.561, |
| "eval_steps_per_second": 1.991, |
| "step": 7900 |
| }, |
| { |
| "epoch": 15.406976744186046, |
| "grad_norm": 1.285810947418213, |
| "learning_rate": 0.00013837984496124032, |
| "loss": 0.6942383575439454, |
| "step": 7950 |
| }, |
| { |
| "epoch": 15.406976744186046, |
| "eval_loss": 0.6664624214172363, |
| "eval_runtime": 212.0427, |
| "eval_samples_per_second": 89.623, |
| "eval_steps_per_second": 1.868, |
| "step": 7950 |
| }, |
| { |
| "epoch": 15.503875968992247, |
| "grad_norm": 1.1837239265441895, |
| "learning_rate": 0.0001379922480620155, |
| "loss": 0.6832563018798828, |
| "step": 8000 |
| }, |
| { |
| "epoch": 15.503875968992247, |
| "eval_loss": 0.6609703898429871, |
| "eval_runtime": 214.853, |
| "eval_samples_per_second": 88.451, |
| "eval_steps_per_second": 1.843, |
| "step": 8000 |
| }, |
| { |
| "epoch": 15.60077519379845, |
| "grad_norm": 1.1095117330551147, |
| "learning_rate": 0.0001376046511627907, |
| "loss": 0.6847735595703125, |
| "step": 8050 |
| }, |
| { |
| "epoch": 15.60077519379845, |
| "eval_loss": 0.6504544615745544, |
| "eval_runtime": 217.8637, |
| "eval_samples_per_second": 87.229, |
| "eval_steps_per_second": 1.818, |
| "step": 8050 |
| }, |
| { |
| "epoch": 15.69767441860465, |
| "grad_norm": 1.2389260530471802, |
| "learning_rate": 0.00013721705426356591, |
| "loss": 0.6890559387207031, |
| "step": 8100 |
| }, |
| { |
| "epoch": 15.69767441860465, |
| "eval_loss": 0.6517437696456909, |
| "eval_runtime": 210.9914, |
| "eval_samples_per_second": 90.07, |
| "eval_steps_per_second": 1.877, |
| "step": 8100 |
| }, |
| { |
| "epoch": 15.794573643410853, |
| "grad_norm": 1.157686471939087, |
| "learning_rate": 0.0001368294573643411, |
| "loss": 0.6964131164550781, |
| "step": 8150 |
| }, |
| { |
| "epoch": 15.794573643410853, |
| "eval_loss": 0.6610472202301025, |
| "eval_runtime": 201.0817, |
| "eval_samples_per_second": 94.509, |
| "eval_steps_per_second": 1.969, |
| "step": 8150 |
| }, |
| { |
| "epoch": 15.891472868217054, |
| "grad_norm": 1.1839828491210938, |
| "learning_rate": 0.0001364418604651163, |
| "loss": 0.683568115234375, |
| "step": 8200 |
| }, |
| { |
| "epoch": 15.891472868217054, |
| "eval_loss": 0.6657268404960632, |
| "eval_runtime": 209.1114, |
| "eval_samples_per_second": 90.88, |
| "eval_steps_per_second": 1.894, |
| "step": 8200 |
| }, |
| { |
| "epoch": 15.988372093023255, |
| "grad_norm": 1.1578187942504883, |
| "learning_rate": 0.00013605426356589148, |
| "loss": 0.6690034484863281, |
| "step": 8250 |
| }, |
| { |
| "epoch": 15.988372093023255, |
| "eval_loss": 0.6609968543052673, |
| "eval_runtime": 206.0582, |
| "eval_samples_per_second": 92.226, |
| "eval_steps_per_second": 1.922, |
| "step": 8250 |
| }, |
| { |
| "epoch": 16.085271317829456, |
| "grad_norm": 1.0880696773529053, |
| "learning_rate": 0.00013566666666666667, |
| "loss": 0.6861241912841797, |
| "step": 8300 |
| }, |
| { |
| "epoch": 16.085271317829456, |
| "eval_loss": 0.6592596173286438, |
| "eval_runtime": 210.5186, |
| "eval_samples_per_second": 90.272, |
| "eval_steps_per_second": 1.881, |
| "step": 8300 |
| }, |
| { |
| "epoch": 16.18217054263566, |
| "grad_norm": 1.3988213539123535, |
| "learning_rate": 0.00013527906976744188, |
| "loss": 0.6735115814208984, |
| "step": 8350 |
| }, |
| { |
| "epoch": 16.18217054263566, |
| "eval_loss": 0.6536418795585632, |
| "eval_runtime": 214.3127, |
| "eval_samples_per_second": 88.674, |
| "eval_steps_per_second": 1.848, |
| "step": 8350 |
| }, |
| { |
| "epoch": 16.27906976744186, |
| "grad_norm": 1.1758220195770264, |
| "learning_rate": 0.00013489147286821707, |
| "loss": 0.6864335632324219, |
| "step": 8400 |
| }, |
| { |
| "epoch": 16.27906976744186, |
| "eval_loss": 0.6612951159477234, |
| "eval_runtime": 207.0226, |
| "eval_samples_per_second": 91.797, |
| "eval_steps_per_second": 1.913, |
| "step": 8400 |
| }, |
| { |
| "epoch": 16.375968992248062, |
| "grad_norm": 0.9774469137191772, |
| "learning_rate": 0.00013450387596899226, |
| "loss": 0.6769390869140625, |
| "step": 8450 |
| }, |
| { |
| "epoch": 16.375968992248062, |
| "eval_loss": 0.6553090810775757, |
| "eval_runtime": 215.4301, |
| "eval_samples_per_second": 88.214, |
| "eval_steps_per_second": 1.838, |
| "step": 8450 |
| }, |
| { |
| "epoch": 16.472868217054263, |
| "grad_norm": 1.1165791749954224, |
| "learning_rate": 0.00013411627906976745, |
| "loss": 0.6800428009033204, |
| "step": 8500 |
| }, |
| { |
| "epoch": 16.472868217054263, |
| "eval_loss": 0.6524401307106018, |
| "eval_runtime": 213.4954, |
| "eval_samples_per_second": 89.014, |
| "eval_steps_per_second": 1.855, |
| "step": 8500 |
| }, |
| { |
| "epoch": 16.569767441860463, |
| "grad_norm": 1.130295753479004, |
| "learning_rate": 0.00013372868217054263, |
| "loss": 0.6797599792480469, |
| "step": 8550 |
| }, |
| { |
| "epoch": 16.569767441860463, |
| "eval_loss": 0.6471645832061768, |
| "eval_runtime": 212.8448, |
| "eval_samples_per_second": 89.286, |
| "eval_steps_per_second": 1.861, |
| "step": 8550 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 1.1940258741378784, |
| "learning_rate": 0.00013334108527131782, |
| "loss": 0.6560598754882813, |
| "step": 8600 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "eval_loss": 0.6543801426887512, |
| "eval_runtime": 215.4431, |
| "eval_samples_per_second": 88.209, |
| "eval_steps_per_second": 1.838, |
| "step": 8600 |
| }, |
| { |
| "epoch": 16.76356589147287, |
| "grad_norm": 1.2403243780136108, |
| "learning_rate": 0.00013295348837209304, |
| "loss": 0.6820440673828125, |
| "step": 8650 |
| }, |
| { |
| "epoch": 16.76356589147287, |
| "eval_loss": 0.657560408115387, |
| "eval_runtime": 222.4968, |
| "eval_samples_per_second": 85.412, |
| "eval_steps_per_second": 1.78, |
| "step": 8650 |
| }, |
| { |
| "epoch": 16.86046511627907, |
| "grad_norm": 1.1667098999023438, |
| "learning_rate": 0.00013256589147286822, |
| "loss": 0.661100082397461, |
| "step": 8700 |
| }, |
| { |
| "epoch": 16.86046511627907, |
| "eval_loss": 0.6547607183456421, |
| "eval_runtime": 225.7306, |
| "eval_samples_per_second": 84.189, |
| "eval_steps_per_second": 1.754, |
| "step": 8700 |
| }, |
| { |
| "epoch": 16.95736434108527, |
| "grad_norm": 1.4980967044830322, |
| "learning_rate": 0.0001321782945736434, |
| "loss": 0.6610430908203125, |
| "step": 8750 |
| }, |
| { |
| "epoch": 16.95736434108527, |
| "eval_loss": 0.6534817814826965, |
| "eval_runtime": 216.3877, |
| "eval_samples_per_second": 87.824, |
| "eval_steps_per_second": 1.83, |
| "step": 8750 |
| }, |
| { |
| "epoch": 17.05426356589147, |
| "grad_norm": 1.3227039575576782, |
| "learning_rate": 0.0001317906976744186, |
| "loss": 0.6713478088378906, |
| "step": 8800 |
| }, |
| { |
| "epoch": 17.05426356589147, |
| "eval_loss": 0.6427262425422668, |
| "eval_runtime": 218.6666, |
| "eval_samples_per_second": 86.909, |
| "eval_steps_per_second": 1.811, |
| "step": 8800 |
| }, |
| { |
| "epoch": 17.151162790697676, |
| "grad_norm": 1.0835665464401245, |
| "learning_rate": 0.0001314031007751938, |
| "loss": 0.6705303955078125, |
| "step": 8850 |
| }, |
| { |
| "epoch": 17.151162790697676, |
| "eval_loss": 0.6579347848892212, |
| "eval_runtime": 195.3941, |
| "eval_samples_per_second": 97.26, |
| "eval_steps_per_second": 2.027, |
| "step": 8850 |
| }, |
| { |
| "epoch": 17.248062015503876, |
| "grad_norm": 1.3096404075622559, |
| "learning_rate": 0.000131015503875969, |
| "loss": 0.6573382568359375, |
| "step": 8900 |
| }, |
| { |
| "epoch": 17.248062015503876, |
| "eval_loss": 0.6422920823097229, |
| "eval_runtime": 190.6185, |
| "eval_samples_per_second": 99.697, |
| "eval_steps_per_second": 2.077, |
| "step": 8900 |
| }, |
| { |
| "epoch": 17.344961240310077, |
| "grad_norm": 1.360499620437622, |
| "learning_rate": 0.0001306279069767442, |
| "loss": 0.6811130523681641, |
| "step": 8950 |
| }, |
| { |
| "epoch": 17.344961240310077, |
| "eval_loss": 0.6441925168037415, |
| "eval_runtime": 190.9697, |
| "eval_samples_per_second": 99.513, |
| "eval_steps_per_second": 2.074, |
| "step": 8950 |
| }, |
| { |
| "epoch": 17.441860465116278, |
| "grad_norm": 1.1536288261413574, |
| "learning_rate": 0.00013024031007751938, |
| "loss": 0.6666915893554688, |
| "step": 9000 |
| }, |
| { |
| "epoch": 17.441860465116278, |
| "eval_loss": 0.6471693515777588, |
| "eval_runtime": 200.6831, |
| "eval_samples_per_second": 94.697, |
| "eval_steps_per_second": 1.973, |
| "step": 9000 |
| }, |
| { |
| "epoch": 17.53875968992248, |
| "grad_norm": 1.135809302330017, |
| "learning_rate": 0.00012985271317829457, |
| "loss": 0.6658331298828125, |
| "step": 9050 |
| }, |
| { |
| "epoch": 17.53875968992248, |
| "eval_loss": 0.6478157043457031, |
| "eval_runtime": 195.5434, |
| "eval_samples_per_second": 97.186, |
| "eval_steps_per_second": 2.025, |
| "step": 9050 |
| }, |
| { |
| "epoch": 17.635658914728683, |
| "grad_norm": 1.2341054677963257, |
| "learning_rate": 0.00012946511627906976, |
| "loss": 0.6616094970703125, |
| "step": 9100 |
| }, |
| { |
| "epoch": 17.635658914728683, |
| "eval_loss": 0.6445872783660889, |
| "eval_runtime": 195.3659, |
| "eval_samples_per_second": 97.274, |
| "eval_steps_per_second": 2.027, |
| "step": 9100 |
| }, |
| { |
| "epoch": 17.732558139534884, |
| "grad_norm": 1.0323907136917114, |
| "learning_rate": 0.00012907751937984497, |
| "loss": 0.6661641693115234, |
| "step": 9150 |
| }, |
| { |
| "epoch": 17.732558139534884, |
| "eval_loss": 0.647544264793396, |
| "eval_runtime": 193.9072, |
| "eval_samples_per_second": 98.006, |
| "eval_steps_per_second": 2.042, |
| "step": 9150 |
| }, |
| { |
| "epoch": 17.829457364341085, |
| "grad_norm": 1.0194849967956543, |
| "learning_rate": 0.00012868992248062016, |
| "loss": 0.6610712432861328, |
| "step": 9200 |
| }, |
| { |
| "epoch": 17.829457364341085, |
| "eval_loss": 0.635890543460846, |
| "eval_runtime": 195.1292, |
| "eval_samples_per_second": 97.392, |
| "eval_steps_per_second": 2.029, |
| "step": 9200 |
| }, |
| { |
| "epoch": 17.926356589147286, |
| "grad_norm": 1.091950535774231, |
| "learning_rate": 0.00012830232558139535, |
| "loss": 0.6431932067871093, |
| "step": 9250 |
| }, |
| { |
| "epoch": 17.926356589147286, |
| "eval_loss": 0.6398835182189941, |
| "eval_runtime": 195.3898, |
| "eval_samples_per_second": 97.262, |
| "eval_steps_per_second": 2.027, |
| "step": 9250 |
| }, |
| { |
| "epoch": 18.023255813953487, |
| "grad_norm": 1.4130749702453613, |
| "learning_rate": 0.00012791472868217056, |
| "loss": 0.648166275024414, |
| "step": 9300 |
| }, |
| { |
| "epoch": 18.023255813953487, |
| "eval_loss": 0.6456313729286194, |
| "eval_runtime": 193.6438, |
| "eval_samples_per_second": 98.139, |
| "eval_steps_per_second": 2.045, |
| "step": 9300 |
| }, |
| { |
| "epoch": 18.12015503875969, |
| "grad_norm": 1.1974269151687622, |
| "learning_rate": 0.00012752713178294575, |
| "loss": 0.6472698211669922, |
| "step": 9350 |
| }, |
| { |
| "epoch": 18.12015503875969, |
| "eval_loss": 0.6428890824317932, |
| "eval_runtime": 197.0847, |
| "eval_samples_per_second": 96.426, |
| "eval_steps_per_second": 2.009, |
| "step": 9350 |
| }, |
| { |
| "epoch": 18.217054263565892, |
| "grad_norm": 1.071179747581482, |
| "learning_rate": 0.00012713953488372094, |
| "loss": 0.6578852844238281, |
| "step": 9400 |
| }, |
| { |
| "epoch": 18.217054263565892, |
| "eval_loss": 0.6396385431289673, |
| "eval_runtime": 226.4716, |
| "eval_samples_per_second": 83.913, |
| "eval_steps_per_second": 1.749, |
| "step": 9400 |
| }, |
| { |
| "epoch": 18.313953488372093, |
| "grad_norm": 1.1610106229782104, |
| "learning_rate": 0.00012675193798449615, |
| "loss": 0.645213851928711, |
| "step": 9450 |
| }, |
| { |
| "epoch": 18.313953488372093, |
| "eval_loss": 0.6382104158401489, |
| "eval_runtime": 196.6143, |
| "eval_samples_per_second": 96.656, |
| "eval_steps_per_second": 2.014, |
| "step": 9450 |
| }, |
| { |
| "epoch": 18.410852713178294, |
| "grad_norm": 0.9724190831184387, |
| "learning_rate": 0.00012636434108527134, |
| "loss": 0.637158203125, |
| "step": 9500 |
| }, |
| { |
| "epoch": 18.410852713178294, |
| "eval_loss": 0.6358678936958313, |
| "eval_runtime": 194.2795, |
| "eval_samples_per_second": 97.818, |
| "eval_steps_per_second": 2.038, |
| "step": 9500 |
| }, |
| { |
| "epoch": 18.507751937984494, |
| "grad_norm": 0.9816511273384094, |
| "learning_rate": 0.00012597674418604653, |
| "loss": 0.6615879821777344, |
| "step": 9550 |
| }, |
| { |
| "epoch": 18.507751937984494, |
| "eval_loss": 0.6309703588485718, |
| "eval_runtime": 197.0762, |
| "eval_samples_per_second": 96.43, |
| "eval_steps_per_second": 2.009, |
| "step": 9550 |
| }, |
| { |
| "epoch": 18.6046511627907, |
| "grad_norm": 1.0494227409362793, |
| "learning_rate": 0.00012558914728682172, |
| "loss": 0.6506745910644531, |
| "step": 9600 |
| }, |
| { |
| "epoch": 18.6046511627907, |
| "eval_loss": 0.636694610118866, |
| "eval_runtime": 201.6065, |
| "eval_samples_per_second": 94.263, |
| "eval_steps_per_second": 1.964, |
| "step": 9600 |
| }, |
| { |
| "epoch": 18.7015503875969, |
| "grad_norm": 1.328189730644226, |
| "learning_rate": 0.0001252015503875969, |
| "loss": 0.6442088317871094, |
| "step": 9650 |
| }, |
| { |
| "epoch": 18.7015503875969, |
| "eval_loss": 0.6276881098747253, |
| "eval_runtime": 186.5785, |
| "eval_samples_per_second": 101.855, |
| "eval_steps_per_second": 2.122, |
| "step": 9650 |
| }, |
| { |
| "epoch": 18.7984496124031, |
| "grad_norm": 1.1828275918960571, |
| "learning_rate": 0.0001248139534883721, |
| "loss": 0.6382374954223633, |
| "step": 9700 |
| }, |
| { |
| "epoch": 18.7984496124031, |
| "eval_loss": 0.636686384677887, |
| "eval_runtime": 180.6464, |
| "eval_samples_per_second": 105.2, |
| "eval_steps_per_second": 2.192, |
| "step": 9700 |
| }, |
| { |
| "epoch": 18.8953488372093, |
| "grad_norm": 1.2978507280349731, |
| "learning_rate": 0.0001244263565891473, |
| "loss": 0.6523281860351563, |
| "step": 9750 |
| }, |
| { |
| "epoch": 18.8953488372093, |
| "eval_loss": 0.6327991485595703, |
| "eval_runtime": 177.6563, |
| "eval_samples_per_second": 106.971, |
| "eval_steps_per_second": 2.229, |
| "step": 9750 |
| }, |
| { |
| "epoch": 18.992248062015506, |
| "grad_norm": 1.143589735031128, |
| "learning_rate": 0.0001240387596899225, |
| "loss": 0.6488991546630859, |
| "step": 9800 |
| }, |
| { |
| "epoch": 18.992248062015506, |
| "eval_loss": 0.6304866075515747, |
| "eval_runtime": 211.5352, |
| "eval_samples_per_second": 89.838, |
| "eval_steps_per_second": 1.872, |
| "step": 9800 |
| }, |
| { |
| "epoch": 19.089147286821706, |
| "grad_norm": 1.0623714923858643, |
| "learning_rate": 0.00012365116279069768, |
| "loss": 0.6333879852294921, |
| "step": 9850 |
| }, |
| { |
| "epoch": 19.089147286821706, |
| "eval_loss": 0.6215238571166992, |
| "eval_runtime": 202.8482, |
| "eval_samples_per_second": 93.686, |
| "eval_steps_per_second": 1.952, |
| "step": 9850 |
| }, |
| { |
| "epoch": 19.186046511627907, |
| "grad_norm": 1.1245452165603638, |
| "learning_rate": 0.00012326356589147287, |
| "loss": 0.642474136352539, |
| "step": 9900 |
| }, |
| { |
| "epoch": 19.186046511627907, |
| "eval_loss": 0.637874960899353, |
| "eval_runtime": 215.8143, |
| "eval_samples_per_second": 88.057, |
| "eval_steps_per_second": 1.835, |
| "step": 9900 |
| }, |
| { |
| "epoch": 19.282945736434108, |
| "grad_norm": 1.5039018392562866, |
| "learning_rate": 0.00012287596899224806, |
| "loss": 0.6468492126464844, |
| "step": 9950 |
| }, |
| { |
| "epoch": 19.282945736434108, |
| "eval_loss": 0.6239569187164307, |
| "eval_runtime": 212.329, |
| "eval_samples_per_second": 89.503, |
| "eval_steps_per_second": 1.865, |
| "step": 9950 |
| }, |
| { |
| "epoch": 19.37984496124031, |
| "grad_norm": 1.0403733253479004, |
| "learning_rate": 0.00012248837209302327, |
| "loss": 0.6217416381835937, |
| "step": 10000 |
| }, |
| { |
| "epoch": 19.37984496124031, |
| "eval_loss": 0.6286182999610901, |
| "eval_runtime": 205.5809, |
| "eval_samples_per_second": 92.44, |
| "eval_steps_per_second": 1.926, |
| "step": 10000 |
| }, |
| { |
| "epoch": 19.476744186046513, |
| "grad_norm": 1.1081093549728394, |
| "learning_rate": 0.00012210077519379846, |
| "loss": 0.6342117691040039, |
| "step": 10050 |
| }, |
| { |
| "epoch": 19.476744186046513, |
| "eval_loss": 0.6270238161087036, |
| "eval_runtime": 208.2038, |
| "eval_samples_per_second": 91.276, |
| "eval_steps_per_second": 1.902, |
| "step": 10050 |
| }, |
| { |
| "epoch": 19.573643410852714, |
| "grad_norm": 1.0839483737945557, |
| "learning_rate": 0.00012171317829457365, |
| "loss": 0.6427702331542968, |
| "step": 10100 |
| }, |
| { |
| "epoch": 19.573643410852714, |
| "eval_loss": 0.6364043354988098, |
| "eval_runtime": 196.6629, |
| "eval_samples_per_second": 96.632, |
| "eval_steps_per_second": 2.014, |
| "step": 10100 |
| }, |
| { |
| "epoch": 19.670542635658915, |
| "grad_norm": 0.998694658279419, |
| "learning_rate": 0.00012132558139534884, |
| "loss": 0.6340975570678711, |
| "step": 10150 |
| }, |
| { |
| "epoch": 19.670542635658915, |
| "eval_loss": 0.6303887963294983, |
| "eval_runtime": 204.4391, |
| "eval_samples_per_second": 92.957, |
| "eval_steps_per_second": 1.937, |
| "step": 10150 |
| }, |
| { |
| "epoch": 19.767441860465116, |
| "grad_norm": 1.228639006614685, |
| "learning_rate": 0.00012093798449612404, |
| "loss": 0.6313095474243164, |
| "step": 10200 |
| }, |
| { |
| "epoch": 19.767441860465116, |
| "eval_loss": 0.6240493655204773, |
| "eval_runtime": 204.6714, |
| "eval_samples_per_second": 92.851, |
| "eval_steps_per_second": 1.935, |
| "step": 10200 |
| }, |
| { |
| "epoch": 19.864341085271317, |
| "grad_norm": 1.3371620178222656, |
| "learning_rate": 0.00012055038759689923, |
| "loss": 0.6461412048339844, |
| "step": 10250 |
| }, |
| { |
| "epoch": 19.864341085271317, |
| "eval_loss": 0.6157258152961731, |
| "eval_runtime": 198.6411, |
| "eval_samples_per_second": 95.67, |
| "eval_steps_per_second": 1.994, |
| "step": 10250 |
| }, |
| { |
| "epoch": 19.96124031007752, |
| "grad_norm": 1.006132960319519, |
| "learning_rate": 0.00012016279069767441, |
| "loss": 0.6349713897705078, |
| "step": 10300 |
| }, |
| { |
| "epoch": 19.96124031007752, |
| "eval_loss": 0.6245470643043518, |
| "eval_runtime": 195.4527, |
| "eval_samples_per_second": 97.231, |
| "eval_steps_per_second": 2.026, |
| "step": 10300 |
| }, |
| { |
| "epoch": 20.058139534883722, |
| "grad_norm": 1.0197911262512207, |
| "learning_rate": 0.00011977519379844962, |
| "loss": 0.6510966491699218, |
| "step": 10350 |
| }, |
| { |
| "epoch": 20.058139534883722, |
| "eval_loss": 0.6246243715286255, |
| "eval_runtime": 191.3549, |
| "eval_samples_per_second": 99.313, |
| "eval_steps_per_second": 2.069, |
| "step": 10350 |
| }, |
| { |
| "epoch": 20.155038759689923, |
| "grad_norm": 1.1822741031646729, |
| "learning_rate": 0.0001193875968992248, |
| "loss": 0.6449718475341797, |
| "step": 10400 |
| }, |
| { |
| "epoch": 20.155038759689923, |
| "eval_loss": 0.6225494742393494, |
| "eval_runtime": 183.6561, |
| "eval_samples_per_second": 103.476, |
| "eval_steps_per_second": 2.156, |
| "step": 10400 |
| }, |
| { |
| "epoch": 20.251937984496124, |
| "grad_norm": 1.1490117311477661, |
| "learning_rate": 0.000119, |
| "loss": 0.6246649551391602, |
| "step": 10450 |
| }, |
| { |
| "epoch": 20.251937984496124, |
| "eval_loss": 0.6180397868156433, |
| "eval_runtime": 178.3831, |
| "eval_samples_per_second": 106.535, |
| "eval_steps_per_second": 2.22, |
| "step": 10450 |
| }, |
| { |
| "epoch": 20.348837209302324, |
| "grad_norm": 1.1532789468765259, |
| "learning_rate": 0.0001186124031007752, |
| "loss": 0.6352950668334961, |
| "step": 10500 |
| }, |
| { |
| "epoch": 20.348837209302324, |
| "eval_loss": 0.6122626662254333, |
| "eval_runtime": 191.7685, |
| "eval_samples_per_second": 99.099, |
| "eval_steps_per_second": 2.065, |
| "step": 10500 |
| }, |
| { |
| "epoch": 20.44573643410853, |
| "grad_norm": 1.1865143775939941, |
| "learning_rate": 0.00011822480620155038, |
| "loss": 0.6208504486083984, |
| "step": 10550 |
| }, |
| { |
| "epoch": 20.44573643410853, |
| "eval_loss": 0.6198189854621887, |
| "eval_runtime": 187.1718, |
| "eval_samples_per_second": 101.532, |
| "eval_steps_per_second": 2.116, |
| "step": 10550 |
| }, |
| { |
| "epoch": 20.54263565891473, |
| "grad_norm": 1.3015577793121338, |
| "learning_rate": 0.00011783720930232558, |
| "loss": 0.6392872619628907, |
| "step": 10600 |
| }, |
| { |
| "epoch": 20.54263565891473, |
| "eval_loss": 0.6212337017059326, |
| "eval_runtime": 189.8147, |
| "eval_samples_per_second": 100.119, |
| "eval_steps_per_second": 2.086, |
| "step": 10600 |
| }, |
| { |
| "epoch": 20.63953488372093, |
| "grad_norm": 1.0097442865371704, |
| "learning_rate": 0.00011744961240310077, |
| "loss": 0.630739974975586, |
| "step": 10650 |
| }, |
| { |
| "epoch": 20.63953488372093, |
| "eval_loss": 0.618180513381958, |
| "eval_runtime": 186.5814, |
| "eval_samples_per_second": 101.854, |
| "eval_steps_per_second": 2.122, |
| "step": 10650 |
| }, |
| { |
| "epoch": 20.73643410852713, |
| "grad_norm": 1.3883415460586548, |
| "learning_rate": 0.00011706201550387596, |
| "loss": 0.6176488494873047, |
| "step": 10700 |
| }, |
| { |
| "epoch": 20.73643410852713, |
| "eval_loss": 0.6110183596611023, |
| "eval_runtime": 184.6587, |
| "eval_samples_per_second": 102.914, |
| "eval_steps_per_second": 2.144, |
| "step": 10700 |
| }, |
| { |
| "epoch": 20.833333333333332, |
| "grad_norm": 1.1491754055023193, |
| "learning_rate": 0.00011667441860465116, |
| "loss": 0.6281963729858399, |
| "step": 10750 |
| }, |
| { |
| "epoch": 20.833333333333332, |
| "eval_loss": 0.6063302159309387, |
| "eval_runtime": 190.669, |
| "eval_samples_per_second": 99.67, |
| "eval_steps_per_second": 2.077, |
| "step": 10750 |
| }, |
| { |
| "epoch": 20.930232558139537, |
| "grad_norm": 1.07293701171875, |
| "learning_rate": 0.00011628682170542635, |
| "loss": 0.6193827438354492, |
| "step": 10800 |
| }, |
| { |
| "epoch": 20.930232558139537, |
| "eval_loss": 0.6157019734382629, |
| "eval_runtime": 181.2458, |
| "eval_samples_per_second": 104.852, |
| "eval_steps_per_second": 2.185, |
| "step": 10800 |
| }, |
| { |
| "epoch": 21.027131782945737, |
| "grad_norm": 1.0467058420181274, |
| "learning_rate": 0.00011589922480620155, |
| "loss": 0.6196688079833984, |
| "step": 10850 |
| }, |
| { |
| "epoch": 21.027131782945737, |
| "eval_loss": 0.6192191243171692, |
| "eval_runtime": 180.8537, |
| "eval_samples_per_second": 105.079, |
| "eval_steps_per_second": 2.19, |
| "step": 10850 |
| }, |
| { |
| "epoch": 21.124031007751938, |
| "grad_norm": 1.2847251892089844, |
| "learning_rate": 0.00011551162790697677, |
| "loss": 0.6209733581542969, |
| "step": 10900 |
| }, |
| { |
| "epoch": 21.124031007751938, |
| "eval_loss": 0.6143134832382202, |
| "eval_runtime": 186.2509, |
| "eval_samples_per_second": 102.034, |
| "eval_steps_per_second": 2.126, |
| "step": 10900 |
| }, |
| { |
| "epoch": 21.22093023255814, |
| "grad_norm": 1.1140937805175781, |
| "learning_rate": 0.00011512403100775195, |
| "loss": 0.6021680450439453, |
| "step": 10950 |
| }, |
| { |
| "epoch": 21.22093023255814, |
| "eval_loss": 0.6277331709861755, |
| "eval_runtime": 173.1683, |
| "eval_samples_per_second": 109.743, |
| "eval_steps_per_second": 2.287, |
| "step": 10950 |
| }, |
| { |
| "epoch": 21.31782945736434, |
| "grad_norm": 1.1287832260131836, |
| "learning_rate": 0.00011473643410852714, |
| "loss": 0.6102452087402344, |
| "step": 11000 |
| }, |
| { |
| "epoch": 21.31782945736434, |
| "eval_loss": 0.6136771440505981, |
| "eval_runtime": 176.7361, |
| "eval_samples_per_second": 107.528, |
| "eval_steps_per_second": 2.241, |
| "step": 11000 |
| }, |
| { |
| "epoch": 21.414728682170544, |
| "grad_norm": 1.0684071779251099, |
| "learning_rate": 0.00011434883720930234, |
| "loss": 0.6249099731445312, |
| "step": 11050 |
| }, |
| { |
| "epoch": 21.414728682170544, |
| "eval_loss": 0.6049215793609619, |
| "eval_runtime": 182.8852, |
| "eval_samples_per_second": 103.912, |
| "eval_steps_per_second": 2.165, |
| "step": 11050 |
| }, |
| { |
| "epoch": 21.511627906976745, |
| "grad_norm": 1.1844762563705444, |
| "learning_rate": 0.00011396124031007753, |
| "loss": 0.6158002853393555, |
| "step": 11100 |
| }, |
| { |
| "epoch": 21.511627906976745, |
| "eval_loss": 0.6154988408088684, |
| "eval_runtime": 181.5179, |
| "eval_samples_per_second": 104.695, |
| "eval_steps_per_second": 2.182, |
| "step": 11100 |
| }, |
| { |
| "epoch": 21.608527131782946, |
| "grad_norm": 0.994705855846405, |
| "learning_rate": 0.00011357364341085273, |
| "loss": 0.6291318130493164, |
| "step": 11150 |
| }, |
| { |
| "epoch": 21.608527131782946, |
| "eval_loss": 0.6036517024040222, |
| "eval_runtime": 183.5149, |
| "eval_samples_per_second": 103.556, |
| "eval_steps_per_second": 2.158, |
| "step": 11150 |
| }, |
| { |
| "epoch": 21.705426356589147, |
| "grad_norm": 1.1102783679962158, |
| "learning_rate": 0.00011318604651162792, |
| "loss": 0.6337633514404297, |
| "step": 11200 |
| }, |
| { |
| "epoch": 21.705426356589147, |
| "eval_loss": 0.6135731935501099, |
| "eval_runtime": 187.0592, |
| "eval_samples_per_second": 101.594, |
| "eval_steps_per_second": 2.117, |
| "step": 11200 |
| }, |
| { |
| "epoch": 21.802325581395348, |
| "grad_norm": 1.244672179222107, |
| "learning_rate": 0.00011279844961240311, |
| "loss": 0.6188518524169921, |
| "step": 11250 |
| }, |
| { |
| "epoch": 21.802325581395348, |
| "eval_loss": 0.6139986515045166, |
| "eval_runtime": 183.6873, |
| "eval_samples_per_second": 103.458, |
| "eval_steps_per_second": 2.156, |
| "step": 11250 |
| }, |
| { |
| "epoch": 21.899224806201552, |
| "grad_norm": 1.0099879503250122, |
| "learning_rate": 0.00011241085271317831, |
| "loss": 0.6150000381469727, |
| "step": 11300 |
| }, |
| { |
| "epoch": 21.899224806201552, |
| "eval_loss": 0.6092088222503662, |
| "eval_runtime": 183.1899, |
| "eval_samples_per_second": 103.739, |
| "eval_steps_per_second": 2.162, |
| "step": 11300 |
| }, |
| { |
| "epoch": 21.996124031007753, |
| "grad_norm": 1.1326417922973633, |
| "learning_rate": 0.0001120232558139535, |
| "loss": 0.6261586380004883, |
| "step": 11350 |
| }, |
| { |
| "epoch": 21.996124031007753, |
| "eval_loss": 0.6016091704368591, |
| "eval_runtime": 188.0838, |
| "eval_samples_per_second": 101.04, |
| "eval_steps_per_second": 2.105, |
| "step": 11350 |
| }, |
| { |
| "epoch": 22.093023255813954, |
| "grad_norm": 1.046021819114685, |
| "learning_rate": 0.00011163565891472869, |
| "loss": 0.6060951614379883, |
| "step": 11400 |
| }, |
| { |
| "epoch": 22.093023255813954, |
| "eval_loss": 0.6057671308517456, |
| "eval_runtime": 196.3362, |
| "eval_samples_per_second": 96.793, |
| "eval_steps_per_second": 2.017, |
| "step": 11400 |
| }, |
| { |
| "epoch": 22.189922480620154, |
| "grad_norm": 1.077669382095337, |
| "learning_rate": 0.00011124806201550389, |
| "loss": 0.6060348510742187, |
| "step": 11450 |
| }, |
| { |
| "epoch": 22.189922480620154, |
| "eval_loss": 0.6033645272254944, |
| "eval_runtime": 197.9875, |
| "eval_samples_per_second": 95.986, |
| "eval_steps_per_second": 2.0, |
| "step": 11450 |
| }, |
| { |
| "epoch": 22.286821705426355, |
| "grad_norm": 1.2075508832931519, |
| "learning_rate": 0.00011086046511627907, |
| "loss": 0.6069083786010743, |
| "step": 11500 |
| }, |
| { |
| "epoch": 22.286821705426355, |
| "eval_loss": 0.6047888994216919, |
| "eval_runtime": 199.6553, |
| "eval_samples_per_second": 95.184, |
| "eval_steps_per_second": 1.983, |
| "step": 11500 |
| }, |
| { |
| "epoch": 22.38372093023256, |
| "grad_norm": 1.1000014543533325, |
| "learning_rate": 0.00011047286821705428, |
| "loss": 0.6034501266479492, |
| "step": 11550 |
| }, |
| { |
| "epoch": 22.38372093023256, |
| "eval_loss": 0.6033228635787964, |
| "eval_runtime": 196.9171, |
| "eval_samples_per_second": 96.508, |
| "eval_steps_per_second": 2.011, |
| "step": 11550 |
| }, |
| { |
| "epoch": 22.48062015503876, |
| "grad_norm": 1.1755788326263428, |
| "learning_rate": 0.00011008527131782946, |
| "loss": 0.6222854232788086, |
| "step": 11600 |
| }, |
| { |
| "epoch": 22.48062015503876, |
| "eval_loss": 0.6062292456626892, |
| "eval_runtime": 198.4476, |
| "eval_samples_per_second": 95.763, |
| "eval_steps_per_second": 1.995, |
| "step": 11600 |
| }, |
| { |
| "epoch": 22.57751937984496, |
| "grad_norm": 1.0309172868728638, |
| "learning_rate": 0.00010969767441860465, |
| "loss": 0.6081157302856446, |
| "step": 11650 |
| }, |
| { |
| "epoch": 22.57751937984496, |
| "eval_loss": 0.6113541126251221, |
| "eval_runtime": 202.1539, |
| "eval_samples_per_second": 94.008, |
| "eval_steps_per_second": 1.959, |
| "step": 11650 |
| }, |
| { |
| "epoch": 22.674418604651162, |
| "grad_norm": 1.137413501739502, |
| "learning_rate": 0.00010931007751937985, |
| "loss": 0.6076561737060547, |
| "step": 11700 |
| }, |
| { |
| "epoch": 22.674418604651162, |
| "eval_loss": 0.6047736406326294, |
| "eval_runtime": 192.1483, |
| "eval_samples_per_second": 98.903, |
| "eval_steps_per_second": 2.061, |
| "step": 11700 |
| }, |
| { |
| "epoch": 22.771317829457363, |
| "grad_norm": 0.9180145263671875, |
| "learning_rate": 0.00010892248062015504, |
| "loss": 0.5997943878173828, |
| "step": 11750 |
| }, |
| { |
| "epoch": 22.771317829457363, |
| "eval_loss": 0.6039275527000427, |
| "eval_runtime": 193.6762, |
| "eval_samples_per_second": 98.123, |
| "eval_steps_per_second": 2.045, |
| "step": 11750 |
| }, |
| { |
| "epoch": 22.868217054263567, |
| "grad_norm": 0.9683696031570435, |
| "learning_rate": 0.00010853488372093023, |
| "loss": 0.6150859451293945, |
| "step": 11800 |
| }, |
| { |
| "epoch": 22.868217054263567, |
| "eval_loss": 0.5961532592773438, |
| "eval_runtime": 200.108, |
| "eval_samples_per_second": 94.969, |
| "eval_steps_per_second": 1.979, |
| "step": 11800 |
| }, |
| { |
| "epoch": 22.96511627906977, |
| "grad_norm": 1.114216685295105, |
| "learning_rate": 0.00010814728682170543, |
| "loss": 0.6122099304199219, |
| "step": 11850 |
| }, |
| { |
| "epoch": 22.96511627906977, |
| "eval_loss": 0.6024142503738403, |
| "eval_runtime": 206.7493, |
| "eval_samples_per_second": 91.918, |
| "eval_steps_per_second": 1.915, |
| "step": 11850 |
| }, |
| { |
| "epoch": 23.06201550387597, |
| "grad_norm": 1.209195613861084, |
| "learning_rate": 0.00010775968992248062, |
| "loss": 0.6041343688964844, |
| "step": 11900 |
| }, |
| { |
| "epoch": 23.06201550387597, |
| "eval_loss": 0.602457582950592, |
| "eval_runtime": 209.0607, |
| "eval_samples_per_second": 90.902, |
| "eval_steps_per_second": 1.894, |
| "step": 11900 |
| }, |
| { |
| "epoch": 23.15891472868217, |
| "grad_norm": 1.2101048231124878, |
| "learning_rate": 0.00010737209302325582, |
| "loss": 0.6141452026367188, |
| "step": 11950 |
| }, |
| { |
| "epoch": 23.15891472868217, |
| "eval_loss": 0.5966194868087769, |
| "eval_runtime": 218.8529, |
| "eval_samples_per_second": 86.835, |
| "eval_steps_per_second": 1.809, |
| "step": 11950 |
| }, |
| { |
| "epoch": 23.25581395348837, |
| "grad_norm": 1.0980631113052368, |
| "learning_rate": 0.00010698449612403101, |
| "loss": 0.6167293548583984, |
| "step": 12000 |
| }, |
| { |
| "epoch": 23.25581395348837, |
| "eval_loss": 0.5952345728874207, |
| "eval_runtime": 217.6315, |
| "eval_samples_per_second": 87.322, |
| "eval_steps_per_second": 1.82, |
| "step": 12000 |
| }, |
| { |
| "epoch": 23.352713178294575, |
| "grad_norm": 1.0111398696899414, |
| "learning_rate": 0.0001065968992248062, |
| "loss": 0.6044289779663086, |
| "step": 12050 |
| }, |
| { |
| "epoch": 23.352713178294575, |
| "eval_loss": 0.5976375341415405, |
| "eval_runtime": 213.6111, |
| "eval_samples_per_second": 88.965, |
| "eval_steps_per_second": 1.854, |
| "step": 12050 |
| }, |
| { |
| "epoch": 23.449612403100776, |
| "grad_norm": 0.9988157749176025, |
| "learning_rate": 0.0001062093023255814, |
| "loss": 0.5936355972290039, |
| "step": 12100 |
| }, |
| { |
| "epoch": 23.449612403100776, |
| "eval_loss": 0.5917235612869263, |
| "eval_runtime": 212.9936, |
| "eval_samples_per_second": 89.223, |
| "eval_steps_per_second": 1.859, |
| "step": 12100 |
| }, |
| { |
| "epoch": 23.546511627906977, |
| "grad_norm": 1.4022223949432373, |
| "learning_rate": 0.00010582170542635659, |
| "loss": 0.6073113250732421, |
| "step": 12150 |
| }, |
| { |
| "epoch": 23.546511627906977, |
| "eval_loss": 0.5900039672851562, |
| "eval_runtime": 213.2849, |
| "eval_samples_per_second": 89.101, |
| "eval_steps_per_second": 1.857, |
| "step": 12150 |
| }, |
| { |
| "epoch": 23.643410852713178, |
| "grad_norm": 1.0619962215423584, |
| "learning_rate": 0.00010543410852713179, |
| "loss": 0.5950005722045898, |
| "step": 12200 |
| }, |
| { |
| "epoch": 23.643410852713178, |
| "eval_loss": 0.6011573076248169, |
| "eval_runtime": 212.3832, |
| "eval_samples_per_second": 89.48, |
| "eval_steps_per_second": 1.865, |
| "step": 12200 |
| }, |
| { |
| "epoch": 23.74031007751938, |
| "grad_norm": 1.211785912513733, |
| "learning_rate": 0.00010504651162790698, |
| "loss": 0.5973508834838868, |
| "step": 12250 |
| }, |
| { |
| "epoch": 23.74031007751938, |
| "eval_loss": 0.5984842777252197, |
| "eval_runtime": 212.6764, |
| "eval_samples_per_second": 89.356, |
| "eval_steps_per_second": 1.862, |
| "step": 12250 |
| }, |
| { |
| "epoch": 23.837209302325583, |
| "grad_norm": 1.1594852209091187, |
| "learning_rate": 0.00010465891472868216, |
| "loss": 0.5971969223022461, |
| "step": 12300 |
| }, |
| { |
| "epoch": 23.837209302325583, |
| "eval_loss": 0.5884484052658081, |
| "eval_runtime": 211.7184, |
| "eval_samples_per_second": 89.761, |
| "eval_steps_per_second": 1.87, |
| "step": 12300 |
| }, |
| { |
| "epoch": 23.934108527131784, |
| "grad_norm": 0.8441356420516968, |
| "learning_rate": 0.00010427131782945736, |
| "loss": 0.5886388397216797, |
| "step": 12350 |
| }, |
| { |
| "epoch": 23.934108527131784, |
| "eval_loss": 0.595369815826416, |
| "eval_runtime": 202.9896, |
| "eval_samples_per_second": 93.621, |
| "eval_steps_per_second": 1.951, |
| "step": 12350 |
| }, |
| { |
| "epoch": 24.031007751937985, |
| "grad_norm": 0.9786841869354248, |
| "learning_rate": 0.00010388372093023255, |
| "loss": 0.6087466812133789, |
| "step": 12400 |
| }, |
| { |
| "epoch": 24.031007751937985, |
| "eval_loss": 0.5952097177505493, |
| "eval_runtime": 211.1496, |
| "eval_samples_per_second": 90.003, |
| "eval_steps_per_second": 1.875, |
| "step": 12400 |
| }, |
| { |
| "epoch": 24.127906976744185, |
| "grad_norm": 1.0507079362869263, |
| "learning_rate": 0.00010349612403100774, |
| "loss": 0.5952305221557617, |
| "step": 12450 |
| }, |
| { |
| "epoch": 24.127906976744185, |
| "eval_loss": 0.5925743579864502, |
| "eval_runtime": 211.9219, |
| "eval_samples_per_second": 89.675, |
| "eval_steps_per_second": 1.869, |
| "step": 12450 |
| }, |
| { |
| "epoch": 24.224806201550386, |
| "grad_norm": 1.152092456817627, |
| "learning_rate": 0.00010310852713178296, |
| "loss": 0.6006411743164063, |
| "step": 12500 |
| }, |
| { |
| "epoch": 24.224806201550386, |
| "eval_loss": 0.591440737247467, |
| "eval_runtime": 210.0005, |
| "eval_samples_per_second": 90.495, |
| "eval_steps_per_second": 1.886, |
| "step": 12500 |
| }, |
| { |
| "epoch": 24.32170542635659, |
| "grad_norm": 1.2130184173583984, |
| "learning_rate": 0.00010272093023255816, |
| "loss": 0.5954698944091796, |
| "step": 12550 |
| }, |
| { |
| "epoch": 24.32170542635659, |
| "eval_loss": 0.5851362943649292, |
| "eval_runtime": 214.348, |
| "eval_samples_per_second": 88.66, |
| "eval_steps_per_second": 1.847, |
| "step": 12550 |
| }, |
| { |
| "epoch": 24.41860465116279, |
| "grad_norm": 1.0356969833374023, |
| "learning_rate": 0.00010233333333333335, |
| "loss": 0.5853539657592773, |
| "step": 12600 |
| }, |
| { |
| "epoch": 24.41860465116279, |
| "eval_loss": 0.5944467186927795, |
| "eval_runtime": 215.7672, |
| "eval_samples_per_second": 88.076, |
| "eval_steps_per_second": 1.835, |
| "step": 12600 |
| }, |
| { |
| "epoch": 24.515503875968992, |
| "grad_norm": 0.9699698686599731, |
| "learning_rate": 0.00010194573643410855, |
| "loss": 0.5965142440795899, |
| "step": 12650 |
| }, |
| { |
| "epoch": 24.515503875968992, |
| "eval_loss": 0.5891871452331543, |
| "eval_runtime": 208.592, |
| "eval_samples_per_second": 91.106, |
| "eval_steps_per_second": 1.898, |
| "step": 12650 |
| }, |
| { |
| "epoch": 24.612403100775193, |
| "grad_norm": 1.301386833190918, |
| "learning_rate": 0.00010155813953488373, |
| "loss": 0.6103283309936524, |
| "step": 12700 |
| }, |
| { |
| "epoch": 24.612403100775193, |
| "eval_loss": 0.5880603194236755, |
| "eval_runtime": 211.0333, |
| "eval_samples_per_second": 90.052, |
| "eval_steps_per_second": 1.876, |
| "step": 12700 |
| }, |
| { |
| "epoch": 24.709302325581394, |
| "grad_norm": 1.274060606956482, |
| "learning_rate": 0.00010117054263565892, |
| "loss": 0.5870831298828125, |
| "step": 12750 |
| }, |
| { |
| "epoch": 24.709302325581394, |
| "eval_loss": 0.5992385149002075, |
| "eval_runtime": 209.409, |
| "eval_samples_per_second": 90.751, |
| "eval_steps_per_second": 1.891, |
| "step": 12750 |
| }, |
| { |
| "epoch": 24.8062015503876, |
| "grad_norm": 1.1430360078811646, |
| "learning_rate": 0.00010078294573643412, |
| "loss": 0.5938863754272461, |
| "step": 12800 |
| }, |
| { |
| "epoch": 24.8062015503876, |
| "eval_loss": 0.5917322039604187, |
| "eval_runtime": 219.3042, |
| "eval_samples_per_second": 86.656, |
| "eval_steps_per_second": 1.806, |
| "step": 12800 |
| }, |
| { |
| "epoch": 24.9031007751938, |
| "grad_norm": 0.9088590741157532, |
| "learning_rate": 0.00010039534883720931, |
| "loss": 0.5874351119995117, |
| "step": 12850 |
| }, |
| { |
| "epoch": 24.9031007751938, |
| "eval_loss": 0.5859392881393433, |
| "eval_runtime": 220.2194, |
| "eval_samples_per_second": 86.296, |
| "eval_steps_per_second": 1.798, |
| "step": 12850 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 1.2513511180877686, |
| "learning_rate": 0.0001000077519379845, |
| "loss": 0.5945474243164063, |
| "step": 12900 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_loss": 0.5825695395469666, |
| "eval_runtime": 216.9535, |
| "eval_samples_per_second": 87.595, |
| "eval_steps_per_second": 1.825, |
| "step": 12900 |
| }, |
| { |
| "epoch": 25.0968992248062, |
| "grad_norm": 1.0160269737243652, |
| "learning_rate": 9.96201550387597e-05, |
| "loss": 0.5680616760253906, |
| "step": 12950 |
| }, |
| { |
| "epoch": 25.0968992248062, |
| "eval_loss": 0.5853219032287598, |
| "eval_runtime": 222.4015, |
| "eval_samples_per_second": 85.449, |
| "eval_steps_per_second": 1.781, |
| "step": 12950 |
| }, |
| { |
| "epoch": 25.1937984496124, |
| "grad_norm": 1.0439985990524292, |
| "learning_rate": 9.923255813953489e-05, |
| "loss": 0.6003565979003906, |
| "step": 13000 |
| }, |
| { |
| "epoch": 25.1937984496124, |
| "eval_loss": 0.5902218818664551, |
| "eval_runtime": 226.1714, |
| "eval_samples_per_second": 84.025, |
| "eval_steps_per_second": 1.751, |
| "step": 13000 |
| }, |
| { |
| "epoch": 25.290697674418606, |
| "grad_norm": 1.161298394203186, |
| "learning_rate": 9.884496124031009e-05, |
| "loss": 0.59041748046875, |
| "step": 13050 |
| }, |
| { |
| "epoch": 25.290697674418606, |
| "eval_loss": 0.5912677645683289, |
| "eval_runtime": 227.5252, |
| "eval_samples_per_second": 83.525, |
| "eval_steps_per_second": 1.74, |
| "step": 13050 |
| }, |
| { |
| "epoch": 25.387596899224807, |
| "grad_norm": 1.0510302782058716, |
| "learning_rate": 9.845736434108528e-05, |
| "loss": 0.5883547973632812, |
| "step": 13100 |
| }, |
| { |
| "epoch": 25.387596899224807, |
| "eval_loss": 0.5912774801254272, |
| "eval_runtime": 230.7864, |
| "eval_samples_per_second": 82.345, |
| "eval_steps_per_second": 1.716, |
| "step": 13100 |
| }, |
| { |
| "epoch": 25.484496124031008, |
| "grad_norm": 1.0604901313781738, |
| "learning_rate": 9.806976744186047e-05, |
| "loss": 0.5883957290649414, |
| "step": 13150 |
| }, |
| { |
| "epoch": 25.484496124031008, |
| "eval_loss": 0.5866997838020325, |
| "eval_runtime": 215.6859, |
| "eval_samples_per_second": 88.11, |
| "eval_steps_per_second": 1.836, |
| "step": 13150 |
| }, |
| { |
| "epoch": 25.58139534883721, |
| "grad_norm": 0.931657075881958, |
| "learning_rate": 9.768217054263567e-05, |
| "loss": 0.5975632858276367, |
| "step": 13200 |
| }, |
| { |
| "epoch": 25.58139534883721, |
| "eval_loss": 0.5827761888504028, |
| "eval_runtime": 222.4474, |
| "eval_samples_per_second": 85.431, |
| "eval_steps_per_second": 1.78, |
| "step": 13200 |
| }, |
| { |
| "epoch": 25.67829457364341, |
| "grad_norm": 0.9771053194999695, |
| "learning_rate": 9.729457364341086e-05, |
| "loss": 0.5864889907836914, |
| "step": 13250 |
| }, |
| { |
| "epoch": 25.67829457364341, |
| "eval_loss": 0.5907730460166931, |
| "eval_runtime": 226.3262, |
| "eval_samples_per_second": 83.967, |
| "eval_steps_per_second": 1.75, |
| "step": 13250 |
| }, |
| { |
| "epoch": 25.775193798449614, |
| "grad_norm": 1.1834367513656616, |
| "learning_rate": 9.690697674418606e-05, |
| "loss": 0.5737375259399414, |
| "step": 13300 |
| }, |
| { |
| "epoch": 25.775193798449614, |
| "eval_loss": 0.5830610394477844, |
| "eval_runtime": 234.6307, |
| "eval_samples_per_second": 80.995, |
| "eval_steps_per_second": 1.688, |
| "step": 13300 |
| }, |
| { |
| "epoch": 25.872093023255815, |
| "grad_norm": 1.618025302886963, |
| "learning_rate": 9.651937984496125e-05, |
| "loss": 0.582273292541504, |
| "step": 13350 |
| }, |
| { |
| "epoch": 25.872093023255815, |
| "eval_loss": 0.5806219577789307, |
| "eval_runtime": 196.4825, |
| "eval_samples_per_second": 96.721, |
| "eval_steps_per_second": 2.015, |
| "step": 13350 |
| }, |
| { |
| "epoch": 25.968992248062015, |
| "grad_norm": 1.1636735200881958, |
| "learning_rate": 9.613178294573643e-05, |
| "loss": 0.5907667541503906, |
| "step": 13400 |
| }, |
| { |
| "epoch": 25.968992248062015, |
| "eval_loss": 0.5851462483406067, |
| "eval_runtime": 203.9634, |
| "eval_samples_per_second": 93.174, |
| "eval_steps_per_second": 1.942, |
| "step": 13400 |
| }, |
| { |
| "epoch": 26.065891472868216, |
| "grad_norm": 1.0781770944595337, |
| "learning_rate": 9.574418604651164e-05, |
| "loss": 0.5699609756469727, |
| "step": 13450 |
| }, |
| { |
| "epoch": 26.065891472868216, |
| "eval_loss": 0.586871862411499, |
| "eval_runtime": 233.0709, |
| "eval_samples_per_second": 81.537, |
| "eval_steps_per_second": 1.699, |
| "step": 13450 |
| }, |
| { |
| "epoch": 26.162790697674417, |
| "grad_norm": 1.005245327949524, |
| "learning_rate": 9.535658914728682e-05, |
| "loss": 0.5884146499633789, |
| "step": 13500 |
| }, |
| { |
| "epoch": 26.162790697674417, |
| "eval_loss": 0.5832611322402954, |
| "eval_runtime": 210.0296, |
| "eval_samples_per_second": 90.482, |
| "eval_steps_per_second": 1.885, |
| "step": 13500 |
| }, |
| { |
| "epoch": 26.25968992248062, |
| "grad_norm": 0.9500683546066284, |
| "learning_rate": 9.496899224806201e-05, |
| "loss": 0.5716347122192382, |
| "step": 13550 |
| }, |
| { |
| "epoch": 26.25968992248062, |
| "eval_loss": 0.5889795422554016, |
| "eval_runtime": 218.3363, |
| "eval_samples_per_second": 87.04, |
| "eval_steps_per_second": 1.814, |
| "step": 13550 |
| }, |
| { |
| "epoch": 26.356589147286822, |
| "grad_norm": 0.9411081075668335, |
| "learning_rate": 9.458139534883721e-05, |
| "loss": 0.574790267944336, |
| "step": 13600 |
| }, |
| { |
| "epoch": 26.356589147286822, |
| "eval_loss": 0.5878632068634033, |
| "eval_runtime": 203.8131, |
| "eval_samples_per_second": 93.242, |
| "eval_steps_per_second": 1.943, |
| "step": 13600 |
| }, |
| { |
| "epoch": 26.453488372093023, |
| "grad_norm": 1.0316822528839111, |
| "learning_rate": 9.41937984496124e-05, |
| "loss": 0.5760750198364257, |
| "step": 13650 |
| }, |
| { |
| "epoch": 26.453488372093023, |
| "eval_loss": 0.5763137340545654, |
| "eval_runtime": 179.5252, |
| "eval_samples_per_second": 105.857, |
| "eval_steps_per_second": 2.206, |
| "step": 13650 |
| }, |
| { |
| "epoch": 26.550387596899224, |
| "grad_norm": 1.0404475927352905, |
| "learning_rate": 9.38062015503876e-05, |
| "loss": 0.5735264587402343, |
| "step": 13700 |
| }, |
| { |
| "epoch": 26.550387596899224, |
| "eval_loss": 0.5846853852272034, |
| "eval_runtime": 189.6159, |
| "eval_samples_per_second": 100.224, |
| "eval_steps_per_second": 2.088, |
| "step": 13700 |
| }, |
| { |
| "epoch": 26.647286821705425, |
| "grad_norm": 0.9884878993034363, |
| "learning_rate": 9.34186046511628e-05, |
| "loss": 0.5820446014404297, |
| "step": 13750 |
| }, |
| { |
| "epoch": 26.647286821705425, |
| "eval_loss": 0.5772427916526794, |
| "eval_runtime": 202.492, |
| "eval_samples_per_second": 93.851, |
| "eval_steps_per_second": 1.956, |
| "step": 13750 |
| }, |
| { |
| "epoch": 26.74418604651163, |
| "grad_norm": 1.220083475112915, |
| "learning_rate": 9.303100775193799e-05, |
| "loss": 0.5717597579956055, |
| "step": 13800 |
| }, |
| { |
| "epoch": 26.74418604651163, |
| "eval_loss": 0.5792003273963928, |
| "eval_runtime": 233.5216, |
| "eval_samples_per_second": 81.38, |
| "eval_steps_per_second": 1.696, |
| "step": 13800 |
| }, |
| { |
| "epoch": 26.84108527131783, |
| "grad_norm": 1.0216156244277954, |
| "learning_rate": 9.264341085271318e-05, |
| "loss": 0.5711225509643555, |
| "step": 13850 |
| }, |
| { |
| "epoch": 26.84108527131783, |
| "eval_loss": 0.5813571810722351, |
| "eval_runtime": 216.8193, |
| "eval_samples_per_second": 87.649, |
| "eval_steps_per_second": 1.826, |
| "step": 13850 |
| }, |
| { |
| "epoch": 26.93798449612403, |
| "grad_norm": 0.9908558130264282, |
| "learning_rate": 9.225581395348838e-05, |
| "loss": 0.5861880493164062, |
| "step": 13900 |
| }, |
| { |
| "epoch": 26.93798449612403, |
| "eval_loss": 0.5844830274581909, |
| "eval_runtime": 206.9257, |
| "eval_samples_per_second": 91.84, |
| "eval_steps_per_second": 1.914, |
| "step": 13900 |
| }, |
| { |
| "epoch": 27.03488372093023, |
| "grad_norm": 0.9165576696395874, |
| "learning_rate": 9.186821705426357e-05, |
| "loss": 0.5591924285888672, |
| "step": 13950 |
| }, |
| { |
| "epoch": 27.03488372093023, |
| "eval_loss": 0.5829132199287415, |
| "eval_runtime": 211.5927, |
| "eval_samples_per_second": 89.814, |
| "eval_steps_per_second": 1.872, |
| "step": 13950 |
| }, |
| { |
| "epoch": 27.131782945736433, |
| "grad_norm": 1.1759886741638184, |
| "learning_rate": 9.148062015503877e-05, |
| "loss": 0.5728300476074218, |
| "step": 14000 |
| }, |
| { |
| "epoch": 27.131782945736433, |
| "eval_loss": 0.5667384266853333, |
| "eval_runtime": 190.6612, |
| "eval_samples_per_second": 99.674, |
| "eval_steps_per_second": 2.077, |
| "step": 14000 |
| }, |
| { |
| "epoch": 27.228682170542637, |
| "grad_norm": 1.1444036960601807, |
| "learning_rate": 9.109302325581396e-05, |
| "loss": 0.5587886810302735, |
| "step": 14050 |
| }, |
| { |
| "epoch": 27.228682170542637, |
| "eval_loss": 0.5758188962936401, |
| "eval_runtime": 190.6437, |
| "eval_samples_per_second": 99.683, |
| "eval_steps_per_second": 2.077, |
| "step": 14050 |
| }, |
| { |
| "epoch": 27.325581395348838, |
| "grad_norm": 1.031416893005371, |
| "learning_rate": 9.070542635658915e-05, |
| "loss": 0.5681009674072266, |
| "step": 14100 |
| }, |
| { |
| "epoch": 27.325581395348838, |
| "eval_loss": 0.5733669996261597, |
| "eval_runtime": 188.284, |
| "eval_samples_per_second": 100.933, |
| "eval_steps_per_second": 2.103, |
| "step": 14100 |
| }, |
| { |
| "epoch": 27.42248062015504, |
| "grad_norm": 1.0156781673431396, |
| "learning_rate": 9.031782945736435e-05, |
| "loss": 0.5718238830566407, |
| "step": 14150 |
| }, |
| { |
| "epoch": 27.42248062015504, |
| "eval_loss": 0.5743637681007385, |
| "eval_runtime": 188.8285, |
| "eval_samples_per_second": 100.642, |
| "eval_steps_per_second": 2.097, |
| "step": 14150 |
| }, |
| { |
| "epoch": 27.51937984496124, |
| "grad_norm": 1.0280048847198486, |
| "learning_rate": 8.993023255813954e-05, |
| "loss": 0.5754650497436523, |
| "step": 14200 |
| }, |
| { |
| "epoch": 27.51937984496124, |
| "eval_loss": 0.5855010151863098, |
| "eval_runtime": 182.9128, |
| "eval_samples_per_second": 103.897, |
| "eval_steps_per_second": 2.165, |
| "step": 14200 |
| }, |
| { |
| "epoch": 27.61627906976744, |
| "grad_norm": 1.0390145778656006, |
| "learning_rate": 8.954263565891474e-05, |
| "loss": 0.5721881103515625, |
| "step": 14250 |
| }, |
| { |
| "epoch": 27.61627906976744, |
| "eval_loss": 0.5711118578910828, |
| "eval_runtime": 184.1208, |
| "eval_samples_per_second": 103.215, |
| "eval_steps_per_second": 2.151, |
| "step": 14250 |
| }, |
| { |
| "epoch": 27.713178294573645, |
| "grad_norm": 1.0992286205291748, |
| "learning_rate": 8.915503875968993e-05, |
| "loss": 0.5777717208862305, |
| "step": 14300 |
| }, |
| { |
| "epoch": 27.713178294573645, |
| "eval_loss": 0.5845617651939392, |
| "eval_runtime": 183.5085, |
| "eval_samples_per_second": 103.559, |
| "eval_steps_per_second": 2.158, |
| "step": 14300 |
| }, |
| { |
| "epoch": 27.810077519379846, |
| "grad_norm": 1.0041630268096924, |
| "learning_rate": 8.876744186046511e-05, |
| "loss": 0.5623219680786132, |
| "step": 14350 |
| }, |
| { |
| "epoch": 27.810077519379846, |
| "eval_loss": 0.5710008144378662, |
| "eval_runtime": 184.7548, |
| "eval_samples_per_second": 102.861, |
| "eval_steps_per_second": 2.143, |
| "step": 14350 |
| }, |
| { |
| "epoch": 27.906976744186046, |
| "grad_norm": 0.8692350387573242, |
| "learning_rate": 8.837984496124031e-05, |
| "loss": 0.555379638671875, |
| "step": 14400 |
| }, |
| { |
| "epoch": 27.906976744186046, |
| "eval_loss": 0.5671586990356445, |
| "eval_runtime": 188.3383, |
| "eval_samples_per_second": 100.904, |
| "eval_steps_per_second": 2.103, |
| "step": 14400 |
| }, |
| { |
| "epoch": 28.003875968992247, |
| "grad_norm": 1.0534723997116089, |
| "learning_rate": 8.79922480620155e-05, |
| "loss": 0.5748031997680664, |
| "step": 14450 |
| }, |
| { |
| "epoch": 28.003875968992247, |
| "eval_loss": 0.5694092512130737, |
| "eval_runtime": 200.2532, |
| "eval_samples_per_second": 94.9, |
| "eval_steps_per_second": 1.977, |
| "step": 14450 |
| }, |
| { |
| "epoch": 28.100775193798448, |
| "grad_norm": 0.8484736680984497, |
| "learning_rate": 8.76046511627907e-05, |
| "loss": 0.5692879486083985, |
| "step": 14500 |
| }, |
| { |
| "epoch": 28.100775193798448, |
| "eval_loss": 0.5734642744064331, |
| "eval_runtime": 198.9989, |
| "eval_samples_per_second": 95.498, |
| "eval_steps_per_second": 1.99, |
| "step": 14500 |
| }, |
| { |
| "epoch": 28.197674418604652, |
| "grad_norm": 1.0161453485488892, |
| "learning_rate": 8.72170542635659e-05, |
| "loss": 0.5690762710571289, |
| "step": 14550 |
| }, |
| { |
| "epoch": 28.197674418604652, |
| "eval_loss": 0.5682703852653503, |
| "eval_runtime": 192.8935, |
| "eval_samples_per_second": 98.521, |
| "eval_steps_per_second": 2.053, |
| "step": 14550 |
| }, |
| { |
| "epoch": 28.294573643410853, |
| "grad_norm": 1.1991984844207764, |
| "learning_rate": 8.68294573643411e-05, |
| "loss": 0.571408805847168, |
| "step": 14600 |
| }, |
| { |
| "epoch": 28.294573643410853, |
| "eval_loss": 0.580035388469696, |
| "eval_runtime": 192.3228, |
| "eval_samples_per_second": 98.813, |
| "eval_steps_per_second": 2.059, |
| "step": 14600 |
| }, |
| { |
| "epoch": 28.391472868217054, |
| "grad_norm": 1.0290623903274536, |
| "learning_rate": 8.644186046511628e-05, |
| "loss": 0.5644237136840821, |
| "step": 14650 |
| }, |
| { |
| "epoch": 28.391472868217054, |
| "eval_loss": 0.5756661295890808, |
| "eval_runtime": 201.1784, |
| "eval_samples_per_second": 94.463, |
| "eval_steps_per_second": 1.968, |
| "step": 14650 |
| }, |
| { |
| "epoch": 28.488372093023255, |
| "grad_norm": 1.0187848806381226, |
| "learning_rate": 8.605426356589148e-05, |
| "loss": 0.5739951324462891, |
| "step": 14700 |
| }, |
| { |
| "epoch": 28.488372093023255, |
| "eval_loss": 0.5649608373641968, |
| "eval_runtime": 204.5004, |
| "eval_samples_per_second": 92.929, |
| "eval_steps_per_second": 1.936, |
| "step": 14700 |
| }, |
| { |
| "epoch": 28.585271317829456, |
| "grad_norm": 1.0154606103897095, |
| "learning_rate": 8.566666666666667e-05, |
| "loss": 0.5649435806274414, |
| "step": 14750 |
| }, |
| { |
| "epoch": 28.585271317829456, |
| "eval_loss": 0.574077308177948, |
| "eval_runtime": 199.9375, |
| "eval_samples_per_second": 95.05, |
| "eval_steps_per_second": 1.981, |
| "step": 14750 |
| }, |
| { |
| "epoch": 28.68217054263566, |
| "grad_norm": 0.9040335416793823, |
| "learning_rate": 8.527906976744187e-05, |
| "loss": 0.5609535980224609, |
| "step": 14800 |
| }, |
| { |
| "epoch": 28.68217054263566, |
| "eval_loss": 0.5767226219177246, |
| "eval_runtime": 197.5815, |
| "eval_samples_per_second": 96.183, |
| "eval_steps_per_second": 2.004, |
| "step": 14800 |
| }, |
| { |
| "epoch": 28.77906976744186, |
| "grad_norm": 1.085545301437378, |
| "learning_rate": 8.489147286821706e-05, |
| "loss": 0.5725581359863281, |
| "step": 14850 |
| }, |
| { |
| "epoch": 28.77906976744186, |
| "eval_loss": 0.5696209073066711, |
| "eval_runtime": 199.5241, |
| "eval_samples_per_second": 95.247, |
| "eval_steps_per_second": 1.985, |
| "step": 14850 |
| }, |
| { |
| "epoch": 28.875968992248062, |
| "grad_norm": 0.8943235278129578, |
| "learning_rate": 8.450387596899225e-05, |
| "loss": 0.555779151916504, |
| "step": 14900 |
| }, |
| { |
| "epoch": 28.875968992248062, |
| "eval_loss": 0.5655143857002258, |
| "eval_runtime": 196.7618, |
| "eval_samples_per_second": 96.584, |
| "eval_steps_per_second": 2.013, |
| "step": 14900 |
| }, |
| { |
| "epoch": 28.972868217054263, |
| "grad_norm": 1.4611607789993286, |
| "learning_rate": 8.411627906976745e-05, |
| "loss": 0.553552131652832, |
| "step": 14950 |
| }, |
| { |
| "epoch": 28.972868217054263, |
| "eval_loss": 0.5691696405410767, |
| "eval_runtime": 199.1398, |
| "eval_samples_per_second": 95.43, |
| "eval_steps_per_second": 1.989, |
| "step": 14950 |
| }, |
| { |
| "epoch": 29.069767441860463, |
| "grad_norm": 1.0959761142730713, |
| "learning_rate": 8.372868217054264e-05, |
| "loss": 0.5541238021850586, |
| "step": 15000 |
| }, |
| { |
| "epoch": 29.069767441860463, |
| "eval_loss": 0.5719203352928162, |
| "eval_runtime": 199.5507, |
| "eval_samples_per_second": 95.234, |
| "eval_steps_per_second": 1.984, |
| "step": 15000 |
| }, |
| { |
| "epoch": 29.166666666666668, |
| "grad_norm": 1.0363779067993164, |
| "learning_rate": 8.334108527131783e-05, |
| "loss": 0.5359441375732422, |
| "step": 15050 |
| }, |
| { |
| "epoch": 29.166666666666668, |
| "eval_loss": 0.5621792674064636, |
| "eval_runtime": 199.7624, |
| "eval_samples_per_second": 95.133, |
| "eval_steps_per_second": 1.982, |
| "step": 15050 |
| }, |
| { |
| "epoch": 29.26356589147287, |
| "grad_norm": 1.0751618146896362, |
| "learning_rate": 8.295348837209303e-05, |
| "loss": 0.5572662734985352, |
| "step": 15100 |
| }, |
| { |
| "epoch": 29.26356589147287, |
| "eval_loss": 0.5694544315338135, |
| "eval_runtime": 201.8006, |
| "eval_samples_per_second": 94.172, |
| "eval_steps_per_second": 1.962, |
| "step": 15100 |
| }, |
| { |
| "epoch": 29.36046511627907, |
| "grad_norm": 0.8606336116790771, |
| "learning_rate": 8.256589147286822e-05, |
| "loss": 0.565627555847168, |
| "step": 15150 |
| }, |
| { |
| "epoch": 29.36046511627907, |
| "eval_loss": 0.5655621886253357, |
| "eval_runtime": 202.5083, |
| "eval_samples_per_second": 93.843, |
| "eval_steps_per_second": 1.955, |
| "step": 15150 |
| }, |
| { |
| "epoch": 29.45736434108527, |
| "grad_norm": 1.1224424839019775, |
| "learning_rate": 8.217829457364342e-05, |
| "loss": 0.556108627319336, |
| "step": 15200 |
| }, |
| { |
| "epoch": 29.45736434108527, |
| "eval_loss": 0.5636035799980164, |
| "eval_runtime": 197.0694, |
| "eval_samples_per_second": 96.433, |
| "eval_steps_per_second": 2.009, |
| "step": 15200 |
| }, |
| { |
| "epoch": 29.55426356589147, |
| "grad_norm": 0.8912738561630249, |
| "learning_rate": 8.17906976744186e-05, |
| "loss": 0.5589240264892578, |
| "step": 15250 |
| }, |
| { |
| "epoch": 29.55426356589147, |
| "eval_loss": 0.5628049969673157, |
| "eval_runtime": 193.2666, |
| "eval_samples_per_second": 98.33, |
| "eval_steps_per_second": 2.049, |
| "step": 15250 |
| }, |
| { |
| "epoch": 29.651162790697676, |
| "grad_norm": 1.063519835472107, |
| "learning_rate": 8.140310077519379e-05, |
| "loss": 0.5502983474731445, |
| "step": 15300 |
| }, |
| { |
| "epoch": 29.651162790697676, |
| "eval_loss": 0.5709092617034912, |
| "eval_runtime": 188.55, |
| "eval_samples_per_second": 100.79, |
| "eval_steps_per_second": 2.1, |
| "step": 15300 |
| }, |
| { |
| "epoch": 29.748062015503876, |
| "grad_norm": 1.1637099981307983, |
| "learning_rate": 8.101550387596901e-05, |
| "loss": 0.5621489715576172, |
| "step": 15350 |
| }, |
| { |
| "epoch": 29.748062015503876, |
| "eval_loss": 0.5655869245529175, |
| "eval_runtime": 194.2628, |
| "eval_samples_per_second": 97.826, |
| "eval_steps_per_second": 2.038, |
| "step": 15350 |
| }, |
| { |
| "epoch": 29.844961240310077, |
| "grad_norm": 1.0051424503326416, |
| "learning_rate": 8.06279069767442e-05, |
| "loss": 0.5704508972167969, |
| "step": 15400 |
| }, |
| { |
| "epoch": 29.844961240310077, |
| "eval_loss": 0.5648623704910278, |
| "eval_runtime": 195.0452, |
| "eval_samples_per_second": 97.434, |
| "eval_steps_per_second": 2.03, |
| "step": 15400 |
| }, |
| { |
| "epoch": 29.941860465116278, |
| "grad_norm": 0.9984987378120422, |
| "learning_rate": 8.024031007751938e-05, |
| "loss": 0.5505734634399414, |
| "step": 15450 |
| }, |
| { |
| "epoch": 29.941860465116278, |
| "eval_loss": 0.5613234043121338, |
| "eval_runtime": 188.1798, |
| "eval_samples_per_second": 100.989, |
| "eval_steps_per_second": 2.104, |
| "step": 15450 |
| }, |
| { |
| "epoch": 30.03875968992248, |
| "grad_norm": 1.1080302000045776, |
| "learning_rate": 7.985271317829459e-05, |
| "loss": 0.5553411102294922, |
| "step": 15500 |
| }, |
| { |
| "epoch": 30.03875968992248, |
| "eval_loss": 0.5688788890838623, |
| "eval_runtime": 190.9499, |
| "eval_samples_per_second": 99.524, |
| "eval_steps_per_second": 2.074, |
| "step": 15500 |
| }, |
| { |
| "epoch": 30.135658914728683, |
| "grad_norm": 1.1303520202636719, |
| "learning_rate": 7.946511627906977e-05, |
| "loss": 0.5628437042236328, |
| "step": 15550 |
| }, |
| { |
| "epoch": 30.135658914728683, |
| "eval_loss": 0.569452166557312, |
| "eval_runtime": 194.8232, |
| "eval_samples_per_second": 97.545, |
| "eval_steps_per_second": 2.033, |
| "step": 15550 |
| }, |
| { |
| "epoch": 30.232558139534884, |
| "grad_norm": 0.8545628190040588, |
| "learning_rate": 7.907751937984496e-05, |
| "loss": 0.5434987258911133, |
| "step": 15600 |
| }, |
| { |
| "epoch": 30.232558139534884, |
| "eval_loss": 0.5680402517318726, |
| "eval_runtime": 192.2391, |
| "eval_samples_per_second": 98.856, |
| "eval_steps_per_second": 2.06, |
| "step": 15600 |
| }, |
| { |
| "epoch": 30.329457364341085, |
| "grad_norm": 0.9640957713127136, |
| "learning_rate": 7.868992248062016e-05, |
| "loss": 0.5405253219604492, |
| "step": 15650 |
| }, |
| { |
| "epoch": 30.329457364341085, |
| "eval_loss": 0.5530641674995422, |
| "eval_runtime": 199.6418, |
| "eval_samples_per_second": 95.191, |
| "eval_steps_per_second": 1.984, |
| "step": 15650 |
| }, |
| { |
| "epoch": 30.426356589147286, |
| "grad_norm": 1.1416120529174805, |
| "learning_rate": 7.830232558139535e-05, |
| "loss": 0.5548144912719727, |
| "step": 15700 |
| }, |
| { |
| "epoch": 30.426356589147286, |
| "eval_loss": 0.5669378638267517, |
| "eval_runtime": 198.882, |
| "eval_samples_per_second": 95.554, |
| "eval_steps_per_second": 1.991, |
| "step": 15700 |
| }, |
| { |
| "epoch": 30.52325581395349, |
| "grad_norm": 1.0198075771331787, |
| "learning_rate": 7.791472868217055e-05, |
| "loss": 0.5592168045043945, |
| "step": 15750 |
| }, |
| { |
| "epoch": 30.52325581395349, |
| "eval_loss": 0.5626012086868286, |
| "eval_runtime": 197.3119, |
| "eval_samples_per_second": 96.315, |
| "eval_steps_per_second": 2.007, |
| "step": 15750 |
| }, |
| { |
| "epoch": 30.62015503875969, |
| "grad_norm": 1.0227652788162231, |
| "learning_rate": 7.752713178294574e-05, |
| "loss": 0.5549029922485351, |
| "step": 15800 |
| }, |
| { |
| "epoch": 30.62015503875969, |
| "eval_loss": 0.5600801706314087, |
| "eval_runtime": 192.5035, |
| "eval_samples_per_second": 98.72, |
| "eval_steps_per_second": 2.057, |
| "step": 15800 |
| }, |
| { |
| "epoch": 30.717054263565892, |
| "grad_norm": 1.0571966171264648, |
| "learning_rate": 7.713953488372093e-05, |
| "loss": 0.5535205841064453, |
| "step": 15850 |
| }, |
| { |
| "epoch": 30.717054263565892, |
| "eval_loss": 0.554684579372406, |
| "eval_runtime": 201.6318, |
| "eval_samples_per_second": 94.251, |
| "eval_steps_per_second": 1.964, |
| "step": 15850 |
| }, |
| { |
| "epoch": 30.813953488372093, |
| "grad_norm": 1.0247442722320557, |
| "learning_rate": 7.675193798449613e-05, |
| "loss": 0.5452067184448243, |
| "step": 15900 |
| }, |
| { |
| "epoch": 30.813953488372093, |
| "eval_loss": 0.5608499646186829, |
| "eval_runtime": 203.6665, |
| "eval_samples_per_second": 93.309, |
| "eval_steps_per_second": 1.944, |
| "step": 15900 |
| }, |
| { |
| "epoch": 30.910852713178294, |
| "grad_norm": 1.0259183645248413, |
| "learning_rate": 7.636434108527132e-05, |
| "loss": 0.5513217544555664, |
| "step": 15950 |
| }, |
| { |
| "epoch": 30.910852713178294, |
| "eval_loss": 0.5694592595100403, |
| "eval_runtime": 204.0706, |
| "eval_samples_per_second": 93.125, |
| "eval_steps_per_second": 1.941, |
| "step": 15950 |
| }, |
| { |
| "epoch": 31.007751937984494, |
| "grad_norm": 0.9598883390426636, |
| "learning_rate": 7.59767441860465e-05, |
| "loss": 0.5515378952026367, |
| "step": 16000 |
| }, |
| { |
| "epoch": 31.007751937984494, |
| "eval_loss": 0.5637879967689514, |
| "eval_runtime": 206.5294, |
| "eval_samples_per_second": 92.016, |
| "eval_steps_per_second": 1.917, |
| "step": 16000 |
| }, |
| { |
| "epoch": 31.1046511627907, |
| "grad_norm": 1.090155005455017, |
| "learning_rate": 7.55891472868217e-05, |
| "loss": 0.5438331985473632, |
| "step": 16050 |
| }, |
| { |
| "epoch": 31.1046511627907, |
| "eval_loss": 0.56569504737854, |
| "eval_runtime": 204.4685, |
| "eval_samples_per_second": 92.943, |
| "eval_steps_per_second": 1.937, |
| "step": 16050 |
| }, |
| { |
| "epoch": 31.2015503875969, |
| "grad_norm": 1.0309280157089233, |
| "learning_rate": 7.52015503875969e-05, |
| "loss": 0.5518190383911132, |
| "step": 16100 |
| }, |
| { |
| "epoch": 31.2015503875969, |
| "eval_loss": 0.5640277862548828, |
| "eval_runtime": 207.3379, |
| "eval_samples_per_second": 91.657, |
| "eval_steps_per_second": 1.91, |
| "step": 16100 |
| }, |
| { |
| "epoch": 31.2984496124031, |
| "grad_norm": 1.0093274116516113, |
| "learning_rate": 7.48139534883721e-05, |
| "loss": 0.5497930145263672, |
| "step": 16150 |
| }, |
| { |
| "epoch": 31.2984496124031, |
| "eval_loss": 0.5667787790298462, |
| "eval_runtime": 205.5829, |
| "eval_samples_per_second": 92.44, |
| "eval_steps_per_second": 1.926, |
| "step": 16150 |
| }, |
| { |
| "epoch": 31.3953488372093, |
| "grad_norm": 0.9880069494247437, |
| "learning_rate": 7.44263565891473e-05, |
| "loss": 0.54766357421875, |
| "step": 16200 |
| }, |
| { |
| "epoch": 31.3953488372093, |
| "eval_loss": 0.5575990676879883, |
| "eval_runtime": 206.3994, |
| "eval_samples_per_second": 92.074, |
| "eval_steps_per_second": 1.919, |
| "step": 16200 |
| }, |
| { |
| "epoch": 31.492248062015506, |
| "grad_norm": 1.0106197595596313, |
| "learning_rate": 7.403875968992249e-05, |
| "loss": 0.5469408416748047, |
| "step": 16250 |
| }, |
| { |
| "epoch": 31.492248062015506, |
| "eval_loss": 0.562467098236084, |
| "eval_runtime": 204.9034, |
| "eval_samples_per_second": 92.746, |
| "eval_steps_per_second": 1.933, |
| "step": 16250 |
| }, |
| { |
| "epoch": 31.589147286821706, |
| "grad_norm": 1.035379409790039, |
| "learning_rate": 7.365116279069769e-05, |
| "loss": 0.562320556640625, |
| "step": 16300 |
| }, |
| { |
| "epoch": 31.589147286821706, |
| "eval_loss": 0.5606642365455627, |
| "eval_runtime": 202.8477, |
| "eval_samples_per_second": 93.686, |
| "eval_steps_per_second": 1.952, |
| "step": 16300 |
| }, |
| { |
| "epoch": 31.686046511627907, |
| "grad_norm": 1.2975516319274902, |
| "learning_rate": 7.326356589147288e-05, |
| "loss": 0.5471981430053711, |
| "step": 16350 |
| }, |
| { |
| "epoch": 31.686046511627907, |
| "eval_loss": 0.5522753000259399, |
| "eval_runtime": 203.8634, |
| "eval_samples_per_second": 93.219, |
| "eval_steps_per_second": 1.942, |
| "step": 16350 |
| }, |
| { |
| "epoch": 31.782945736434108, |
| "grad_norm": 0.9750486016273499, |
| "learning_rate": 7.287596899224806e-05, |
| "loss": 0.5367609024047851, |
| "step": 16400 |
| }, |
| { |
| "epoch": 31.782945736434108, |
| "eval_loss": 0.5535518527030945, |
| "eval_runtime": 196.989, |
| "eval_samples_per_second": 96.472, |
| "eval_steps_per_second": 2.01, |
| "step": 16400 |
| }, |
| { |
| "epoch": 31.87984496124031, |
| "grad_norm": 1.2616281509399414, |
| "learning_rate": 7.248837209302326e-05, |
| "loss": 0.5479264068603515, |
| "step": 16450 |
| }, |
| { |
| "epoch": 31.87984496124031, |
| "eval_loss": 0.5599262118339539, |
| "eval_runtime": 195.7368, |
| "eval_samples_per_second": 97.09, |
| "eval_steps_per_second": 2.023, |
| "step": 16450 |
| }, |
| { |
| "epoch": 31.97674418604651, |
| "grad_norm": 1.0327950716018677, |
| "learning_rate": 7.210077519379845e-05, |
| "loss": 0.5482368469238281, |
| "step": 16500 |
| }, |
| { |
| "epoch": 31.97674418604651, |
| "eval_loss": 0.5533180832862854, |
| "eval_runtime": 195.5573, |
| "eval_samples_per_second": 97.179, |
| "eval_steps_per_second": 2.025, |
| "step": 16500 |
| }, |
| { |
| "epoch": 32.07364341085271, |
| "grad_norm": 1.1047241687774658, |
| "learning_rate": 7.171317829457364e-05, |
| "loss": 0.5321396636962891, |
| "step": 16550 |
| }, |
| { |
| "epoch": 32.07364341085271, |
| "eval_loss": 0.5518382787704468, |
| "eval_runtime": 189.5095, |
| "eval_samples_per_second": 100.28, |
| "eval_steps_per_second": 2.09, |
| "step": 16550 |
| }, |
| { |
| "epoch": 32.17054263565891, |
| "grad_norm": 1.079679250717163, |
| "learning_rate": 7.132558139534884e-05, |
| "loss": 0.5443602752685547, |
| "step": 16600 |
| }, |
| { |
| "epoch": 32.17054263565891, |
| "eval_loss": 0.5624197125434875, |
| "eval_runtime": 184.8407, |
| "eval_samples_per_second": 102.813, |
| "eval_steps_per_second": 2.142, |
| "step": 16600 |
| }, |
| { |
| "epoch": 32.26744186046512, |
| "grad_norm": 1.32503080368042, |
| "learning_rate": 7.093798449612403e-05, |
| "loss": 0.5390607070922852, |
| "step": 16650 |
| }, |
| { |
| "epoch": 32.26744186046512, |
| "eval_loss": 0.5524035692214966, |
| "eval_runtime": 190.1126, |
| "eval_samples_per_second": 99.962, |
| "eval_steps_per_second": 2.083, |
| "step": 16650 |
| }, |
| { |
| "epoch": 32.36434108527132, |
| "grad_norm": 0.9187719821929932, |
| "learning_rate": 7.055038759689923e-05, |
| "loss": 0.5373795700073242, |
| "step": 16700 |
| }, |
| { |
| "epoch": 32.36434108527132, |
| "eval_loss": 0.557758629322052, |
| "eval_runtime": 189.2424, |
| "eval_samples_per_second": 100.421, |
| "eval_steps_per_second": 2.093, |
| "step": 16700 |
| }, |
| { |
| "epoch": 32.46124031007752, |
| "grad_norm": 0.8754042387008667, |
| "learning_rate": 7.016279069767442e-05, |
| "loss": 0.5403145217895508, |
| "step": 16750 |
| }, |
| { |
| "epoch": 32.46124031007752, |
| "eval_loss": 0.5573844313621521, |
| "eval_runtime": 185.4261, |
| "eval_samples_per_second": 102.488, |
| "eval_steps_per_second": 2.136, |
| "step": 16750 |
| }, |
| { |
| "epoch": 32.55813953488372, |
| "grad_norm": 0.896300196647644, |
| "learning_rate": 6.977519379844961e-05, |
| "loss": 0.5385972595214844, |
| "step": 16800 |
| }, |
| { |
| "epoch": 32.55813953488372, |
| "eval_loss": 0.5460941791534424, |
| "eval_runtime": 191.881, |
| "eval_samples_per_second": 99.041, |
| "eval_steps_per_second": 2.064, |
| "step": 16800 |
| }, |
| { |
| "epoch": 32.65503875968992, |
| "grad_norm": 0.8643621802330017, |
| "learning_rate": 6.938759689922481e-05, |
| "loss": 0.5400046157836914, |
| "step": 16850 |
| }, |
| { |
| "epoch": 32.65503875968992, |
| "eval_loss": 0.550368070602417, |
| "eval_runtime": 193.1616, |
| "eval_samples_per_second": 98.384, |
| "eval_steps_per_second": 2.05, |
| "step": 16850 |
| }, |
| { |
| "epoch": 32.751937984496124, |
| "grad_norm": 0.9831710457801819, |
| "learning_rate": 6.9e-05, |
| "loss": 0.5370166778564454, |
| "step": 16900 |
| }, |
| { |
| "epoch": 32.751937984496124, |
| "eval_loss": 0.5594847798347473, |
| "eval_runtime": 200.5934, |
| "eval_samples_per_second": 94.739, |
| "eval_steps_per_second": 1.974, |
| "step": 16900 |
| }, |
| { |
| "epoch": 32.848837209302324, |
| "grad_norm": 1.0108966827392578, |
| "learning_rate": 6.86124031007752e-05, |
| "loss": 0.5464648056030273, |
| "step": 16950 |
| }, |
| { |
| "epoch": 32.848837209302324, |
| "eval_loss": 0.5564253330230713, |
| "eval_runtime": 194.6839, |
| "eval_samples_per_second": 97.615, |
| "eval_steps_per_second": 2.034, |
| "step": 16950 |
| }, |
| { |
| "epoch": 32.945736434108525, |
| "grad_norm": 1.013802170753479, |
| "learning_rate": 6.82248062015504e-05, |
| "loss": 0.528292465209961, |
| "step": 17000 |
| }, |
| { |
| "epoch": 32.945736434108525, |
| "eval_loss": 0.554287314414978, |
| "eval_runtime": 196.927, |
| "eval_samples_per_second": 96.503, |
| "eval_steps_per_second": 2.011, |
| "step": 17000 |
| }, |
| { |
| "epoch": 33.042635658914726, |
| "grad_norm": 0.9317029118537903, |
| "learning_rate": 6.783720930232559e-05, |
| "loss": 0.5383699035644531, |
| "step": 17050 |
| }, |
| { |
| "epoch": 33.042635658914726, |
| "eval_loss": 0.5600277185440063, |
| "eval_runtime": 189.1325, |
| "eval_samples_per_second": 100.48, |
| "eval_steps_per_second": 2.094, |
| "step": 17050 |
| }, |
| { |
| "epoch": 33.13953488372093, |
| "grad_norm": 0.9573765397071838, |
| "learning_rate": 6.744961240310078e-05, |
| "loss": 0.525397605895996, |
| "step": 17100 |
| }, |
| { |
| "epoch": 33.13953488372093, |
| "eval_loss": 0.5511126518249512, |
| "eval_runtime": 190.6, |
| "eval_samples_per_second": 99.706, |
| "eval_steps_per_second": 2.078, |
| "step": 17100 |
| }, |
| { |
| "epoch": 33.236434108527135, |
| "grad_norm": 0.988450825214386, |
| "learning_rate": 6.706201550387598e-05, |
| "loss": 0.5410281753540039, |
| "step": 17150 |
| }, |
| { |
| "epoch": 33.236434108527135, |
| "eval_loss": 0.5519395470619202, |
| "eval_runtime": 177.0128, |
| "eval_samples_per_second": 107.359, |
| "eval_steps_per_second": 2.237, |
| "step": 17150 |
| }, |
| { |
| "epoch": 33.333333333333336, |
| "grad_norm": 1.0500494241714478, |
| "learning_rate": 6.667441860465117e-05, |
| "loss": 0.5410085678100586, |
| "step": 17200 |
| }, |
| { |
| "epoch": 33.333333333333336, |
| "eval_loss": 0.5577670335769653, |
| "eval_runtime": 191.3272, |
| "eval_samples_per_second": 99.327, |
| "eval_steps_per_second": 2.07, |
| "step": 17200 |
| }, |
| { |
| "epoch": 33.43023255813954, |
| "grad_norm": 1.2153853178024292, |
| "learning_rate": 6.628682170542637e-05, |
| "loss": 0.5333014678955078, |
| "step": 17250 |
| }, |
| { |
| "epoch": 33.43023255813954, |
| "eval_loss": 0.5504041314125061, |
| "eval_runtime": 192.5064, |
| "eval_samples_per_second": 98.719, |
| "eval_steps_per_second": 2.057, |
| "step": 17250 |
| }, |
| { |
| "epoch": 33.52713178294574, |
| "grad_norm": 1.2611991167068481, |
| "learning_rate": 6.589922480620155e-05, |
| "loss": 0.5398865509033203, |
| "step": 17300 |
| }, |
| { |
| "epoch": 33.52713178294574, |
| "eval_loss": 0.5494994521141052, |
| "eval_runtime": 194.486, |
| "eval_samples_per_second": 97.714, |
| "eval_steps_per_second": 2.036, |
| "step": 17300 |
| }, |
| { |
| "epoch": 33.62403100775194, |
| "grad_norm": 1.0226876735687256, |
| "learning_rate": 6.551162790697674e-05, |
| "loss": 0.5396590423583985, |
| "step": 17350 |
| }, |
| { |
| "epoch": 33.62403100775194, |
| "eval_loss": 0.5536758899688721, |
| "eval_runtime": 193.7265, |
| "eval_samples_per_second": 98.097, |
| "eval_steps_per_second": 2.044, |
| "step": 17350 |
| }, |
| { |
| "epoch": 33.72093023255814, |
| "grad_norm": 0.9030176401138306, |
| "learning_rate": 6.512403100775194e-05, |
| "loss": 0.5370613861083985, |
| "step": 17400 |
| }, |
| { |
| "epoch": 33.72093023255814, |
| "eval_loss": 0.549870491027832, |
| "eval_runtime": 195.2981, |
| "eval_samples_per_second": 97.308, |
| "eval_steps_per_second": 2.028, |
| "step": 17400 |
| }, |
| { |
| "epoch": 33.81782945736434, |
| "grad_norm": 0.8909381628036499, |
| "learning_rate": 6.473643410852713e-05, |
| "loss": 0.5347919464111328, |
| "step": 17450 |
| }, |
| { |
| "epoch": 33.81782945736434, |
| "eval_loss": 0.541422426700592, |
| "eval_runtime": 189.4035, |
| "eval_samples_per_second": 100.336, |
| "eval_steps_per_second": 2.091, |
| "step": 17450 |
| }, |
| { |
| "epoch": 33.91472868217054, |
| "grad_norm": 0.8646638989448547, |
| "learning_rate": 6.434883720930232e-05, |
| "loss": 0.5484010696411132, |
| "step": 17500 |
| }, |
| { |
| "epoch": 33.91472868217054, |
| "eval_loss": 0.5494884252548218, |
| "eval_runtime": 193.7113, |
| "eval_samples_per_second": 98.105, |
| "eval_steps_per_second": 2.044, |
| "step": 17500 |
| }, |
| { |
| "epoch": 34.01162790697674, |
| "grad_norm": 0.9828886985778809, |
| "learning_rate": 6.396124031007752e-05, |
| "loss": 0.5419563674926757, |
| "step": 17550 |
| }, |
| { |
| "epoch": 34.01162790697674, |
| "eval_loss": 0.5491079688072205, |
| "eval_runtime": 193.1117, |
| "eval_samples_per_second": 98.409, |
| "eval_steps_per_second": 2.051, |
| "step": 17550 |
| }, |
| { |
| "epoch": 34.10852713178294, |
| "grad_norm": 1.0629746913909912, |
| "learning_rate": 6.357364341085271e-05, |
| "loss": 0.5175531768798828, |
| "step": 17600 |
| }, |
| { |
| "epoch": 34.10852713178294, |
| "eval_loss": 0.5538118481636047, |
| "eval_runtime": 196.6121, |
| "eval_samples_per_second": 96.657, |
| "eval_steps_per_second": 2.014, |
| "step": 17600 |
| }, |
| { |
| "epoch": 34.20542635658915, |
| "grad_norm": 0.9160548448562622, |
| "learning_rate": 6.318604651162791e-05, |
| "loss": 0.539161491394043, |
| "step": 17650 |
| }, |
| { |
| "epoch": 34.20542635658915, |
| "eval_loss": 0.5461863279342651, |
| "eval_runtime": 198.549, |
| "eval_samples_per_second": 95.714, |
| "eval_steps_per_second": 1.994, |
| "step": 17650 |
| }, |
| { |
| "epoch": 34.30232558139535, |
| "grad_norm": 1.0820066928863525, |
| "learning_rate": 6.27984496124031e-05, |
| "loss": 0.5332397842407226, |
| "step": 17700 |
| }, |
| { |
| "epoch": 34.30232558139535, |
| "eval_loss": 0.5470810532569885, |
| "eval_runtime": 199.5692, |
| "eval_samples_per_second": 95.225, |
| "eval_steps_per_second": 1.984, |
| "step": 17700 |
| }, |
| { |
| "epoch": 34.39922480620155, |
| "grad_norm": 1.0704736709594727, |
| "learning_rate": 6.24108527131783e-05, |
| "loss": 0.5254277801513672, |
| "step": 17750 |
| }, |
| { |
| "epoch": 34.39922480620155, |
| "eval_loss": 0.542071521282196, |
| "eval_runtime": 192.8985, |
| "eval_samples_per_second": 98.518, |
| "eval_steps_per_second": 2.053, |
| "step": 17750 |
| }, |
| { |
| "epoch": 34.49612403100775, |
| "grad_norm": 1.0921117067337036, |
| "learning_rate": 6.20232558139535e-05, |
| "loss": 0.5333176040649414, |
| "step": 17800 |
| }, |
| { |
| "epoch": 34.49612403100775, |
| "eval_loss": 0.550654947757721, |
| "eval_runtime": 186.5933, |
| "eval_samples_per_second": 101.847, |
| "eval_steps_per_second": 2.122, |
| "step": 17800 |
| }, |
| { |
| "epoch": 34.593023255813954, |
| "grad_norm": 1.2105690240859985, |
| "learning_rate": 6.163565891472869e-05, |
| "loss": 0.5238530349731445, |
| "step": 17850 |
| }, |
| { |
| "epoch": 34.593023255813954, |
| "eval_loss": 0.54400235414505, |
| "eval_runtime": 174.1456, |
| "eval_samples_per_second": 109.127, |
| "eval_steps_per_second": 2.274, |
| "step": 17850 |
| }, |
| { |
| "epoch": 34.689922480620154, |
| "grad_norm": 1.0428931713104248, |
| "learning_rate": 6.124806201550388e-05, |
| "loss": 0.5392517852783203, |
| "step": 17900 |
| }, |
| { |
| "epoch": 34.689922480620154, |
| "eval_loss": 0.5388475060462952, |
| "eval_runtime": 175.4521, |
| "eval_samples_per_second": 108.314, |
| "eval_steps_per_second": 2.257, |
| "step": 17900 |
| }, |
| { |
| "epoch": 34.786821705426355, |
| "grad_norm": 0.8393483757972717, |
| "learning_rate": 6.086046511627907e-05, |
| "loss": 0.5320695877075196, |
| "step": 17950 |
| }, |
| { |
| "epoch": 34.786821705426355, |
| "eval_loss": 0.5461158752441406, |
| "eval_runtime": 175.228, |
| "eval_samples_per_second": 108.453, |
| "eval_steps_per_second": 2.26, |
| "step": 17950 |
| }, |
| { |
| "epoch": 34.883720930232556, |
| "grad_norm": 0.833777129650116, |
| "learning_rate": 6.047286821705427e-05, |
| "loss": 0.5283815383911132, |
| "step": 18000 |
| }, |
| { |
| "epoch": 34.883720930232556, |
| "eval_loss": 0.5449761152267456, |
| "eval_runtime": 179.826, |
| "eval_samples_per_second": 105.68, |
| "eval_steps_per_second": 2.202, |
| "step": 18000 |
| }, |
| { |
| "epoch": 34.98062015503876, |
| "grad_norm": 0.9303448796272278, |
| "learning_rate": 6.008527131782946e-05, |
| "loss": 0.524686622619629, |
| "step": 18050 |
| }, |
| { |
| "epoch": 34.98062015503876, |
| "eval_loss": 0.5538901686668396, |
| "eval_runtime": 184.6088, |
| "eval_samples_per_second": 102.942, |
| "eval_steps_per_second": 2.145, |
| "step": 18050 |
| }, |
| { |
| "epoch": 35.07751937984496, |
| "grad_norm": 1.0190098285675049, |
| "learning_rate": 5.9697674418604657e-05, |
| "loss": 0.5224573516845703, |
| "step": 18100 |
| }, |
| { |
| "epoch": 35.07751937984496, |
| "eval_loss": 0.5498805642127991, |
| "eval_runtime": 180.9123, |
| "eval_samples_per_second": 105.045, |
| "eval_steps_per_second": 2.189, |
| "step": 18100 |
| }, |
| { |
| "epoch": 35.174418604651166, |
| "grad_norm": 0.8532817959785461, |
| "learning_rate": 5.9310077519379844e-05, |
| "loss": 0.5115080642700195, |
| "step": 18150 |
| }, |
| { |
| "epoch": 35.174418604651166, |
| "eval_loss": 0.5354583263397217, |
| "eval_runtime": 181.0288, |
| "eval_samples_per_second": 104.978, |
| "eval_steps_per_second": 2.187, |
| "step": 18150 |
| }, |
| { |
| "epoch": 35.27131782945737, |
| "grad_norm": 1.2438424825668335, |
| "learning_rate": 5.892248062015504e-05, |
| "loss": 0.5154667663574218, |
| "step": 18200 |
| }, |
| { |
| "epoch": 35.27131782945737, |
| "eval_loss": 0.5381179451942444, |
| "eval_runtime": 186.0022, |
| "eval_samples_per_second": 102.171, |
| "eval_steps_per_second": 2.129, |
| "step": 18200 |
| }, |
| { |
| "epoch": 35.36821705426357, |
| "grad_norm": 0.9005379676818848, |
| "learning_rate": 5.8534883720930234e-05, |
| "loss": 0.5120582199096679, |
| "step": 18250 |
| }, |
| { |
| "epoch": 35.36821705426357, |
| "eval_loss": 0.5399240851402283, |
| "eval_runtime": 183.0389, |
| "eval_samples_per_second": 103.825, |
| "eval_steps_per_second": 2.163, |
| "step": 18250 |
| }, |
| { |
| "epoch": 35.46511627906977, |
| "grad_norm": 1.5690606832504272, |
| "learning_rate": 5.814728682170543e-05, |
| "loss": 0.5389008331298828, |
| "step": 18300 |
| }, |
| { |
| "epoch": 35.46511627906977, |
| "eval_loss": 0.551359236240387, |
| "eval_runtime": 186.3615, |
| "eval_samples_per_second": 101.974, |
| "eval_steps_per_second": 2.125, |
| "step": 18300 |
| }, |
| { |
| "epoch": 35.56201550387597, |
| "grad_norm": 1.0620508193969727, |
| "learning_rate": 5.7759689922480617e-05, |
| "loss": 0.5309218215942383, |
| "step": 18350 |
| }, |
| { |
| "epoch": 35.56201550387597, |
| "eval_loss": 0.5498600006103516, |
| "eval_runtime": 186.8148, |
| "eval_samples_per_second": 101.726, |
| "eval_steps_per_second": 2.12, |
| "step": 18350 |
| }, |
| { |
| "epoch": 35.65891472868217, |
| "grad_norm": 0.8293824195861816, |
| "learning_rate": 5.737209302325581e-05, |
| "loss": 0.5250505065917969, |
| "step": 18400 |
| }, |
| { |
| "epoch": 35.65891472868217, |
| "eval_loss": 0.5435429215431213, |
| "eval_runtime": 186.1113, |
| "eval_samples_per_second": 102.111, |
| "eval_steps_per_second": 2.128, |
| "step": 18400 |
| }, |
| { |
| "epoch": 35.75581395348837, |
| "grad_norm": 0.9203481674194336, |
| "learning_rate": 5.6984496124031006e-05, |
| "loss": 0.5103243637084961, |
| "step": 18450 |
| }, |
| { |
| "epoch": 35.75581395348837, |
| "eval_loss": 0.5460033416748047, |
| "eval_runtime": 183.5687, |
| "eval_samples_per_second": 103.525, |
| "eval_steps_per_second": 2.157, |
| "step": 18450 |
| }, |
| { |
| "epoch": 35.85271317829457, |
| "grad_norm": 0.9886574745178223, |
| "learning_rate": 5.65968992248062e-05, |
| "loss": 0.5369464111328125, |
| "step": 18500 |
| }, |
| { |
| "epoch": 35.85271317829457, |
| "eval_loss": 0.5451639890670776, |
| "eval_runtime": 184.8581, |
| "eval_samples_per_second": 102.803, |
| "eval_steps_per_second": 2.142, |
| "step": 18500 |
| }, |
| { |
| "epoch": 35.94961240310077, |
| "grad_norm": 0.9914956092834473, |
| "learning_rate": 5.62093023255814e-05, |
| "loss": 0.5096720504760742, |
| "step": 18550 |
| }, |
| { |
| "epoch": 35.94961240310077, |
| "eval_loss": 0.55196213722229, |
| "eval_runtime": 186.5685, |
| "eval_samples_per_second": 101.861, |
| "eval_steps_per_second": 2.123, |
| "step": 18550 |
| }, |
| { |
| "epoch": 36.04651162790697, |
| "grad_norm": 1.1980777978897095, |
| "learning_rate": 5.58217054263566e-05, |
| "loss": 0.52376220703125, |
| "step": 18600 |
| }, |
| { |
| "epoch": 36.04651162790697, |
| "eval_loss": 0.5417166948318481, |
| "eval_runtime": 189.0404, |
| "eval_samples_per_second": 100.529, |
| "eval_steps_per_second": 2.095, |
| "step": 18600 |
| }, |
| { |
| "epoch": 36.14341085271318, |
| "grad_norm": 1.029856562614441, |
| "learning_rate": 5.543410852713179e-05, |
| "loss": 0.5111849975585937, |
| "step": 18650 |
| }, |
| { |
| "epoch": 36.14341085271318, |
| "eval_loss": 0.5434406995773315, |
| "eval_runtime": 194.8649, |
| "eval_samples_per_second": 97.524, |
| "eval_steps_per_second": 2.032, |
| "step": 18650 |
| }, |
| { |
| "epoch": 36.24031007751938, |
| "grad_norm": 0.9188030958175659, |
| "learning_rate": 5.504651162790698e-05, |
| "loss": 0.5124307250976563, |
| "step": 18700 |
| }, |
| { |
| "epoch": 36.24031007751938, |
| "eval_loss": 0.5369866490364075, |
| "eval_runtime": 191.9625, |
| "eval_samples_per_second": 98.999, |
| "eval_steps_per_second": 2.063, |
| "step": 18700 |
| }, |
| { |
| "epoch": 36.33720930232558, |
| "grad_norm": 0.8747699856758118, |
| "learning_rate": 5.4658914728682174e-05, |
| "loss": 0.5114467239379883, |
| "step": 18750 |
| }, |
| { |
| "epoch": 36.33720930232558, |
| "eval_loss": 0.5418548583984375, |
| "eval_runtime": 191.4833, |
| "eval_samples_per_second": 99.246, |
| "eval_steps_per_second": 2.068, |
| "step": 18750 |
| }, |
| { |
| "epoch": 36.434108527131784, |
| "grad_norm": 1.1166157722473145, |
| "learning_rate": 5.427131782945737e-05, |
| "loss": 0.5248300170898438, |
| "step": 18800 |
| }, |
| { |
| "epoch": 36.434108527131784, |
| "eval_loss": 0.5400023460388184, |
| "eval_runtime": 194.0727, |
| "eval_samples_per_second": 97.922, |
| "eval_steps_per_second": 2.04, |
| "step": 18800 |
| }, |
| { |
| "epoch": 36.531007751937985, |
| "grad_norm": 1.0363603830337524, |
| "learning_rate": 5.3883720930232564e-05, |
| "loss": 0.5214292526245117, |
| "step": 18850 |
| }, |
| { |
| "epoch": 36.531007751937985, |
| "eval_loss": 0.5220096111297607, |
| "eval_runtime": 198.424, |
| "eval_samples_per_second": 95.775, |
| "eval_steps_per_second": 1.996, |
| "step": 18850 |
| }, |
| { |
| "epoch": 36.627906976744185, |
| "grad_norm": 0.8305726647377014, |
| "learning_rate": 5.349612403100775e-05, |
| "loss": 0.5413295364379883, |
| "step": 18900 |
| }, |
| { |
| "epoch": 36.627906976744185, |
| "eval_loss": 0.5427697896957397, |
| "eval_runtime": 197.5266, |
| "eval_samples_per_second": 96.21, |
| "eval_steps_per_second": 2.005, |
| "step": 18900 |
| }, |
| { |
| "epoch": 36.724806201550386, |
| "grad_norm": 0.9412527680397034, |
| "learning_rate": 5.3108527131782947e-05, |
| "loss": 0.5180264282226562, |
| "step": 18950 |
| }, |
| { |
| "epoch": 36.724806201550386, |
| "eval_loss": 0.5371807813644409, |
| "eval_runtime": 193.0091, |
| "eval_samples_per_second": 98.462, |
| "eval_steps_per_second": 2.052, |
| "step": 18950 |
| }, |
| { |
| "epoch": 36.82170542635659, |
| "grad_norm": 1.1160436868667603, |
| "learning_rate": 5.272093023255814e-05, |
| "loss": 0.5069771575927734, |
| "step": 19000 |
| }, |
| { |
| "epoch": 36.82170542635659, |
| "eval_loss": 0.5473487973213196, |
| "eval_runtime": 193.0131, |
| "eval_samples_per_second": 98.46, |
| "eval_steps_per_second": 2.052, |
| "step": 19000 |
| }, |
| { |
| "epoch": 36.91860465116279, |
| "grad_norm": 0.9080687761306763, |
| "learning_rate": 5.2333333333333336e-05, |
| "loss": 0.5186505508422852, |
| "step": 19050 |
| }, |
| { |
| "epoch": 36.91860465116279, |
| "eval_loss": 0.5444969534873962, |
| "eval_runtime": 197.7748, |
| "eval_samples_per_second": 96.089, |
| "eval_steps_per_second": 2.002, |
| "step": 19050 |
| }, |
| { |
| "epoch": 37.01550387596899, |
| "grad_norm": 1.1593964099884033, |
| "learning_rate": 5.1945736434108524e-05, |
| "loss": 0.5240575790405273, |
| "step": 19100 |
| }, |
| { |
| "epoch": 37.01550387596899, |
| "eval_loss": 0.5428500175476074, |
| "eval_runtime": 194.6691, |
| "eval_samples_per_second": 97.622, |
| "eval_steps_per_second": 2.034, |
| "step": 19100 |
| }, |
| { |
| "epoch": 37.1124031007752, |
| "grad_norm": 1.0456364154815674, |
| "learning_rate": 5.155813953488372e-05, |
| "loss": 0.5200264739990235, |
| "step": 19150 |
| }, |
| { |
| "epoch": 37.1124031007752, |
| "eval_loss": 0.5357416868209839, |
| "eval_runtime": 193.2375, |
| "eval_samples_per_second": 98.345, |
| "eval_steps_per_second": 2.049, |
| "step": 19150 |
| }, |
| { |
| "epoch": 37.2093023255814, |
| "grad_norm": 0.9927902817726135, |
| "learning_rate": 5.117054263565891e-05, |
| "loss": 0.5228353500366211, |
| "step": 19200 |
| }, |
| { |
| "epoch": 37.2093023255814, |
| "eval_loss": 0.5418040156364441, |
| "eval_runtime": 196.1849, |
| "eval_samples_per_second": 96.868, |
| "eval_steps_per_second": 2.019, |
| "step": 19200 |
| }, |
| { |
| "epoch": 37.3062015503876, |
| "grad_norm": 1.0007529258728027, |
| "learning_rate": 5.078294573643411e-05, |
| "loss": 0.5095104217529297, |
| "step": 19250 |
| }, |
| { |
| "epoch": 37.3062015503876, |
| "eval_loss": 0.5377764105796814, |
| "eval_runtime": 195.1001, |
| "eval_samples_per_second": 97.406, |
| "eval_steps_per_second": 2.03, |
| "step": 19250 |
| }, |
| { |
| "epoch": 37.4031007751938, |
| "grad_norm": 1.0300018787384033, |
| "learning_rate": 5.03953488372093e-05, |
| "loss": 0.5201596069335938, |
| "step": 19300 |
| }, |
| { |
| "epoch": 37.4031007751938, |
| "eval_loss": 0.5398209095001221, |
| "eval_runtime": 186.0194, |
| "eval_samples_per_second": 102.161, |
| "eval_steps_per_second": 2.129, |
| "step": 19300 |
| }, |
| { |
| "epoch": 37.5, |
| "grad_norm": 0.8903720378875732, |
| "learning_rate": 5.0007751937984504e-05, |
| "loss": 0.5176564025878906, |
| "step": 19350 |
| }, |
| { |
| "epoch": 37.5, |
| "eval_loss": 0.5390880107879639, |
| "eval_runtime": 182.7414, |
| "eval_samples_per_second": 103.994, |
| "eval_steps_per_second": 2.167, |
| "step": 19350 |
| }, |
| { |
| "epoch": 37.5968992248062, |
| "grad_norm": 1.1395453214645386, |
| "learning_rate": 4.962015503875969e-05, |
| "loss": 0.5085873413085937, |
| "step": 19400 |
| }, |
| { |
| "epoch": 37.5968992248062, |
| "eval_loss": 0.5383904576301575, |
| "eval_runtime": 184.2422, |
| "eval_samples_per_second": 103.147, |
| "eval_steps_per_second": 2.149, |
| "step": 19400 |
| }, |
| { |
| "epoch": 37.6937984496124, |
| "grad_norm": 1.0433534383773804, |
| "learning_rate": 4.923255813953489e-05, |
| "loss": 0.5213092803955078, |
| "step": 19450 |
| }, |
| { |
| "epoch": 37.6937984496124, |
| "eval_loss": 0.5324852466583252, |
| "eval_runtime": 184.2591, |
| "eval_samples_per_second": 103.137, |
| "eval_steps_per_second": 2.149, |
| "step": 19450 |
| }, |
| { |
| "epoch": 37.7906976744186, |
| "grad_norm": 1.049198031425476, |
| "learning_rate": 4.8844961240310075e-05, |
| "loss": 0.5182023620605469, |
| "step": 19500 |
| }, |
| { |
| "epoch": 37.7906976744186, |
| "eval_loss": 0.5335138440132141, |
| "eval_runtime": 186.3364, |
| "eval_samples_per_second": 101.988, |
| "eval_steps_per_second": 2.125, |
| "step": 19500 |
| }, |
| { |
| "epoch": 37.8875968992248, |
| "grad_norm": 0.9317577481269836, |
| "learning_rate": 4.8457364341085276e-05, |
| "loss": 0.5240017318725586, |
| "step": 19550 |
| }, |
| { |
| "epoch": 37.8875968992248, |
| "eval_loss": 0.5385186076164246, |
| "eval_runtime": 185.6261, |
| "eval_samples_per_second": 102.378, |
| "eval_steps_per_second": 2.133, |
| "step": 19550 |
| }, |
| { |
| "epoch": 37.98449612403101, |
| "grad_norm": 0.9193382263183594, |
| "learning_rate": 4.806976744186047e-05, |
| "loss": 0.5023036193847656, |
| "step": 19600 |
| }, |
| { |
| "epoch": 37.98449612403101, |
| "eval_loss": 0.5272864103317261, |
| "eval_runtime": 186.6742, |
| "eval_samples_per_second": 101.803, |
| "eval_steps_per_second": 2.121, |
| "step": 19600 |
| }, |
| { |
| "epoch": 38.08139534883721, |
| "grad_norm": 0.9680274724960327, |
| "learning_rate": 4.768217054263566e-05, |
| "loss": 0.5128543853759766, |
| "step": 19650 |
| }, |
| { |
| "epoch": 38.08139534883721, |
| "eval_loss": 0.5302436351776123, |
| "eval_runtime": 188.7195, |
| "eval_samples_per_second": 100.7, |
| "eval_steps_per_second": 2.098, |
| "step": 19650 |
| }, |
| { |
| "epoch": 38.17829457364341, |
| "grad_norm": 0.9249178767204285, |
| "learning_rate": 4.7294573643410854e-05, |
| "loss": 0.5085428619384765, |
| "step": 19700 |
| }, |
| { |
| "epoch": 38.17829457364341, |
| "eval_loss": 0.5326287150382996, |
| "eval_runtime": 184.8359, |
| "eval_samples_per_second": 102.816, |
| "eval_steps_per_second": 2.142, |
| "step": 19700 |
| }, |
| { |
| "epoch": 38.275193798449614, |
| "grad_norm": 1.0035618543624878, |
| "learning_rate": 4.690697674418605e-05, |
| "loss": 0.5140943908691407, |
| "step": 19750 |
| }, |
| { |
| "epoch": 38.275193798449614, |
| "eval_loss": 0.5443971753120422, |
| "eval_runtime": 183.6324, |
| "eval_samples_per_second": 103.489, |
| "eval_steps_per_second": 2.156, |
| "step": 19750 |
| }, |
| { |
| "epoch": 38.372093023255815, |
| "grad_norm": 1.1001235246658325, |
| "learning_rate": 4.651937984496124e-05, |
| "loss": 0.502054328918457, |
| "step": 19800 |
| }, |
| { |
| "epoch": 38.372093023255815, |
| "eval_loss": 0.5287387371063232, |
| "eval_runtime": 188.4424, |
| "eval_samples_per_second": 100.848, |
| "eval_steps_per_second": 2.101, |
| "step": 19800 |
| }, |
| { |
| "epoch": 38.468992248062015, |
| "grad_norm": 1.0103139877319336, |
| "learning_rate": 4.613178294573644e-05, |
| "loss": 0.5115917587280273, |
| "step": 19850 |
| }, |
| { |
| "epoch": 38.468992248062015, |
| "eval_loss": 0.5320472121238708, |
| "eval_runtime": 182.8854, |
| "eval_samples_per_second": 103.912, |
| "eval_steps_per_second": 2.165, |
| "step": 19850 |
| }, |
| { |
| "epoch": 38.565891472868216, |
| "grad_norm": 0.9626783728599548, |
| "learning_rate": 4.5744186046511626e-05, |
| "loss": 0.5209971618652344, |
| "step": 19900 |
| }, |
| { |
| "epoch": 38.565891472868216, |
| "eval_loss": 0.5395954251289368, |
| "eval_runtime": 183.9199, |
| "eval_samples_per_second": 103.328, |
| "eval_steps_per_second": 2.153, |
| "step": 19900 |
| }, |
| { |
| "epoch": 38.66279069767442, |
| "grad_norm": 1.2447129487991333, |
| "learning_rate": 4.535658914728683e-05, |
| "loss": 0.5177993774414062, |
| "step": 19950 |
| }, |
| { |
| "epoch": 38.66279069767442, |
| "eval_loss": 0.5312191843986511, |
| "eval_runtime": 183.2959, |
| "eval_samples_per_second": 103.679, |
| "eval_steps_per_second": 2.16, |
| "step": 19950 |
| }, |
| { |
| "epoch": 38.75968992248062, |
| "grad_norm": 0.8827547430992126, |
| "learning_rate": 4.496899224806202e-05, |
| "loss": 0.5207798767089844, |
| "step": 20000 |
| }, |
| { |
| "epoch": 38.75968992248062, |
| "eval_loss": 0.5287414789199829, |
| "eval_runtime": 182.6558, |
| "eval_samples_per_second": 104.043, |
| "eval_steps_per_second": 2.168, |
| "step": 20000 |
| }, |
| { |
| "epoch": 38.85658914728682, |
| "grad_norm": 0.8883045315742493, |
| "learning_rate": 4.458139534883721e-05, |
| "loss": 0.5186112213134766, |
| "step": 20050 |
| }, |
| { |
| "epoch": 38.85658914728682, |
| "eval_loss": 0.5344378352165222, |
| "eval_runtime": 180.0159, |
| "eval_samples_per_second": 105.568, |
| "eval_steps_per_second": 2.2, |
| "step": 20050 |
| }, |
| { |
| "epoch": 38.95348837209303, |
| "grad_norm": 0.9383369088172913, |
| "learning_rate": 4.4193798449612405e-05, |
| "loss": 0.5147262573242187, |
| "step": 20100 |
| }, |
| { |
| "epoch": 38.95348837209303, |
| "eval_loss": 0.5442497730255127, |
| "eval_runtime": 168.8857, |
| "eval_samples_per_second": 112.526, |
| "eval_steps_per_second": 2.345, |
| "step": 20100 |
| }, |
| { |
| "epoch": 39.05038759689923, |
| "grad_norm": 0.8504248857498169, |
| "learning_rate": 4.38062015503876e-05, |
| "loss": 0.5029075241088867, |
| "step": 20150 |
| }, |
| { |
| "epoch": 39.05038759689923, |
| "eval_loss": 0.5233710408210754, |
| "eval_runtime": 176.4514, |
| "eval_samples_per_second": 107.701, |
| "eval_steps_per_second": 2.244, |
| "step": 20150 |
| }, |
| { |
| "epoch": 39.14728682170543, |
| "grad_norm": 1.0626157522201538, |
| "learning_rate": 4.3418604651162794e-05, |
| "loss": 0.5219727325439453, |
| "step": 20200 |
| }, |
| { |
| "epoch": 39.14728682170543, |
| "eval_loss": 0.5346750617027283, |
| "eval_runtime": 177.159, |
| "eval_samples_per_second": 107.271, |
| "eval_steps_per_second": 2.235, |
| "step": 20200 |
| }, |
| { |
| "epoch": 39.24418604651163, |
| "grad_norm": 0.8731338381767273, |
| "learning_rate": 4.303100775193798e-05, |
| "loss": 0.5061806869506836, |
| "step": 20250 |
| }, |
| { |
| "epoch": 39.24418604651163, |
| "eval_loss": 0.5310733318328857, |
| "eval_runtime": 178.8862, |
| "eval_samples_per_second": 106.235, |
| "eval_steps_per_second": 2.214, |
| "step": 20250 |
| }, |
| { |
| "epoch": 39.34108527131783, |
| "grad_norm": 0.8953520059585571, |
| "learning_rate": 4.264341085271318e-05, |
| "loss": 0.5171949768066406, |
| "step": 20300 |
| }, |
| { |
| "epoch": 39.34108527131783, |
| "eval_loss": 0.5359470248222351, |
| "eval_runtime": 169.9833, |
| "eval_samples_per_second": 111.799, |
| "eval_steps_per_second": 2.33, |
| "step": 20300 |
| }, |
| { |
| "epoch": 39.43798449612403, |
| "grad_norm": 0.8778860569000244, |
| "learning_rate": 4.225581395348838e-05, |
| "loss": 0.5076210021972656, |
| "step": 20350 |
| }, |
| { |
| "epoch": 39.43798449612403, |
| "eval_loss": 0.5338460803031921, |
| "eval_runtime": 177.8436, |
| "eval_samples_per_second": 106.858, |
| "eval_steps_per_second": 2.227, |
| "step": 20350 |
| }, |
| { |
| "epoch": 39.53488372093023, |
| "grad_norm": 1.1184332370758057, |
| "learning_rate": 4.186821705426357e-05, |
| "loss": 0.528587989807129, |
| "step": 20400 |
| }, |
| { |
| "epoch": 39.53488372093023, |
| "eval_loss": 0.5384491682052612, |
| "eval_runtime": 173.4459, |
| "eval_samples_per_second": 109.567, |
| "eval_steps_per_second": 2.283, |
| "step": 20400 |
| }, |
| { |
| "epoch": 39.63178294573643, |
| "grad_norm": 0.9835333824157715, |
| "learning_rate": 4.148062015503876e-05, |
| "loss": 0.5009258651733398, |
| "step": 20450 |
| }, |
| { |
| "epoch": 39.63178294573643, |
| "eval_loss": 0.5286412835121155, |
| "eval_runtime": 175.8829, |
| "eval_samples_per_second": 108.049, |
| "eval_steps_per_second": 2.251, |
| "step": 20450 |
| }, |
| { |
| "epoch": 39.72868217054263, |
| "grad_norm": 0.8886466026306152, |
| "learning_rate": 4.1093023255813956e-05, |
| "loss": 0.499103889465332, |
| "step": 20500 |
| }, |
| { |
| "epoch": 39.72868217054263, |
| "eval_loss": 0.5248314142227173, |
| "eval_runtime": 177.0445, |
| "eval_samples_per_second": 107.34, |
| "eval_steps_per_second": 2.237, |
| "step": 20500 |
| }, |
| { |
| "epoch": 39.825581395348834, |
| "grad_norm": 0.9667196869850159, |
| "learning_rate": 4.070542635658915e-05, |
| "loss": 0.5149082946777344, |
| "step": 20550 |
| }, |
| { |
| "epoch": 39.825581395348834, |
| "eval_loss": 0.5282675623893738, |
| "eval_runtime": 176.0589, |
| "eval_samples_per_second": 107.941, |
| "eval_steps_per_second": 2.249, |
| "step": 20550 |
| }, |
| { |
| "epoch": 39.92248062015504, |
| "grad_norm": 1.0895967483520508, |
| "learning_rate": 4.0317829457364345e-05, |
| "loss": 0.5068093109130859, |
| "step": 20600 |
| }, |
| { |
| "epoch": 39.92248062015504, |
| "eval_loss": 0.5327328443527222, |
| "eval_runtime": 175.0237, |
| "eval_samples_per_second": 108.58, |
| "eval_steps_per_second": 2.263, |
| "step": 20600 |
| }, |
| { |
| "epoch": 40.01937984496124, |
| "grad_norm": 1.0282702445983887, |
| "learning_rate": 3.993023255813953e-05, |
| "loss": 0.5070622634887695, |
| "step": 20650 |
| }, |
| { |
| "epoch": 40.01937984496124, |
| "eval_loss": 0.5209603309631348, |
| "eval_runtime": 174.6222, |
| "eval_samples_per_second": 108.829, |
| "eval_steps_per_second": 2.268, |
| "step": 20650 |
| }, |
| { |
| "epoch": 40.116279069767444, |
| "grad_norm": 1.0411657094955444, |
| "learning_rate": 3.954263565891473e-05, |
| "loss": 0.5045206451416016, |
| "step": 20700 |
| }, |
| { |
| "epoch": 40.116279069767444, |
| "eval_loss": 0.5327551364898682, |
| "eval_runtime": 177.2652, |
| "eval_samples_per_second": 107.207, |
| "eval_steps_per_second": 2.234, |
| "step": 20700 |
| }, |
| { |
| "epoch": 40.213178294573645, |
| "grad_norm": 0.9641968607902527, |
| "learning_rate": 3.915503875968993e-05, |
| "loss": 0.5015212631225586, |
| "step": 20750 |
| }, |
| { |
| "epoch": 40.213178294573645, |
| "eval_loss": 0.5277599096298218, |
| "eval_runtime": 173.5229, |
| "eval_samples_per_second": 109.519, |
| "eval_steps_per_second": 2.282, |
| "step": 20750 |
| }, |
| { |
| "epoch": 40.310077519379846, |
| "grad_norm": 0.9383164644241333, |
| "learning_rate": 3.876744186046512e-05, |
| "loss": 0.5052926254272461, |
| "step": 20800 |
| }, |
| { |
| "epoch": 40.310077519379846, |
| "eval_loss": 0.5317120552062988, |
| "eval_runtime": 180.5338, |
| "eval_samples_per_second": 105.266, |
| "eval_steps_per_second": 2.193, |
| "step": 20800 |
| }, |
| { |
| "epoch": 40.406976744186046, |
| "grad_norm": 0.9518958926200867, |
| "learning_rate": 3.837984496124031e-05, |
| "loss": 0.504459228515625, |
| "step": 20850 |
| }, |
| { |
| "epoch": 40.406976744186046, |
| "eval_loss": 0.5327459573745728, |
| "eval_runtime": 178.4024, |
| "eval_samples_per_second": 106.523, |
| "eval_steps_per_second": 2.22, |
| "step": 20850 |
| }, |
| { |
| "epoch": 40.50387596899225, |
| "grad_norm": 0.929793119430542, |
| "learning_rate": 3.799224806201551e-05, |
| "loss": 0.50721923828125, |
| "step": 20900 |
| }, |
| { |
| "epoch": 40.50387596899225, |
| "eval_loss": 0.5232871174812317, |
| "eval_runtime": 176.0828, |
| "eval_samples_per_second": 107.927, |
| "eval_steps_per_second": 2.249, |
| "step": 20900 |
| }, |
| { |
| "epoch": 40.60077519379845, |
| "grad_norm": 1.008422613143921, |
| "learning_rate": 3.76046511627907e-05, |
| "loss": 0.5139606475830079, |
| "step": 20950 |
| }, |
| { |
| "epoch": 40.60077519379845, |
| "eval_loss": 0.5307023525238037, |
| "eval_runtime": 176.7655, |
| "eval_samples_per_second": 107.51, |
| "eval_steps_per_second": 2.24, |
| "step": 20950 |
| }, |
| { |
| "epoch": 40.69767441860465, |
| "grad_norm": 0.9010120630264282, |
| "learning_rate": 3.721705426356589e-05, |
| "loss": 0.5006965255737305, |
| "step": 21000 |
| }, |
| { |
| "epoch": 40.69767441860465, |
| "eval_loss": 0.5244865417480469, |
| "eval_runtime": 568.7253, |
| "eval_samples_per_second": 33.415, |
| "eval_steps_per_second": 0.696, |
| "step": 21000 |
| }, |
| { |
| "epoch": 40.79457364341085, |
| "grad_norm": 0.8767653703689575, |
| "learning_rate": 3.6829457364341084e-05, |
| "loss": 0.5031109619140625, |
| "step": 21050 |
| }, |
| { |
| "epoch": 40.79457364341085, |
| "eval_loss": 0.5274307727813721, |
| "eval_runtime": 180.3181, |
| "eval_samples_per_second": 105.392, |
| "eval_steps_per_second": 2.196, |
| "step": 21050 |
| }, |
| { |
| "epoch": 40.89147286821706, |
| "grad_norm": 0.9830530881881714, |
| "learning_rate": 3.644186046511628e-05, |
| "loss": 0.5075841903686523, |
| "step": 21100 |
| }, |
| { |
| "epoch": 40.89147286821706, |
| "eval_loss": 0.5370468497276306, |
| "eval_runtime": 181.9153, |
| "eval_samples_per_second": 104.466, |
| "eval_steps_per_second": 2.177, |
| "step": 21100 |
| }, |
| { |
| "epoch": 40.98837209302326, |
| "grad_norm": 1.019087791442871, |
| "learning_rate": 3.605426356589148e-05, |
| "loss": 0.5040792846679687, |
| "step": 21150 |
| }, |
| { |
| "epoch": 40.98837209302326, |
| "eval_loss": 0.5332101583480835, |
| "eval_runtime": 183.2927, |
| "eval_samples_per_second": 103.681, |
| "eval_steps_per_second": 2.16, |
| "step": 21150 |
| }, |
| { |
| "epoch": 41.08527131782946, |
| "grad_norm": 0.8988721370697021, |
| "learning_rate": 3.566666666666667e-05, |
| "loss": 0.5045541381835937, |
| "step": 21200 |
| }, |
| { |
| "epoch": 41.08527131782946, |
| "eval_loss": 0.5272098183631897, |
| "eval_runtime": 184.5631, |
| "eval_samples_per_second": 102.967, |
| "eval_steps_per_second": 2.146, |
| "step": 21200 |
| }, |
| { |
| "epoch": 41.18217054263566, |
| "grad_norm": 1.0443159341812134, |
| "learning_rate": 3.527906976744186e-05, |
| "loss": 0.5021038818359375, |
| "step": 21250 |
| }, |
| { |
| "epoch": 41.18217054263566, |
| "eval_loss": 0.5261159539222717, |
| "eval_runtime": 187.475, |
| "eval_samples_per_second": 101.368, |
| "eval_steps_per_second": 2.112, |
| "step": 21250 |
| }, |
| { |
| "epoch": 41.27906976744186, |
| "grad_norm": 0.7973293662071228, |
| "learning_rate": 3.489147286821706e-05, |
| "loss": 0.5001279449462891, |
| "step": 21300 |
| }, |
| { |
| "epoch": 41.27906976744186, |
| "eval_loss": 0.5278809070587158, |
| "eval_runtime": 189.9745, |
| "eval_samples_per_second": 100.034, |
| "eval_steps_per_second": 2.084, |
| "step": 21300 |
| }, |
| { |
| "epoch": 41.37596899224806, |
| "grad_norm": 0.8979710936546326, |
| "learning_rate": 3.450387596899225e-05, |
| "loss": 0.5035758972167969, |
| "step": 21350 |
| }, |
| { |
| "epoch": 41.37596899224806, |
| "eval_loss": 0.521926760673523, |
| "eval_runtime": 188.1978, |
| "eval_samples_per_second": 100.979, |
| "eval_steps_per_second": 2.104, |
| "step": 21350 |
| }, |
| { |
| "epoch": 41.47286821705426, |
| "grad_norm": 0.9254922866821289, |
| "learning_rate": 3.411627906976744e-05, |
| "loss": 0.5044261932373046, |
| "step": 21400 |
| }, |
| { |
| "epoch": 41.47286821705426, |
| "eval_loss": 0.5313370227813721, |
| "eval_runtime": 190.0945, |
| "eval_samples_per_second": 99.971, |
| "eval_steps_per_second": 2.083, |
| "step": 21400 |
| }, |
| { |
| "epoch": 41.56976744186046, |
| "grad_norm": 0.9278942346572876, |
| "learning_rate": 3.3728682170542635e-05, |
| "loss": 0.5007383728027344, |
| "step": 21450 |
| }, |
| { |
| "epoch": 41.56976744186046, |
| "eval_loss": 0.5329434275627136, |
| "eval_runtime": 186.2051, |
| "eval_samples_per_second": 102.059, |
| "eval_steps_per_second": 2.127, |
| "step": 21450 |
| }, |
| { |
| "epoch": 41.666666666666664, |
| "grad_norm": 1.0791382789611816, |
| "learning_rate": 3.334108527131783e-05, |
| "loss": 0.504736557006836, |
| "step": 21500 |
| }, |
| { |
| "epoch": 41.666666666666664, |
| "eval_loss": 0.529855489730835, |
| "eval_runtime": 186.3264, |
| "eval_samples_per_second": 101.993, |
| "eval_steps_per_second": 2.125, |
| "step": 21500 |
| }, |
| { |
| "epoch": 41.763565891472865, |
| "grad_norm": 0.9192059636116028, |
| "learning_rate": 3.2953488372093025e-05, |
| "loss": 0.501381721496582, |
| "step": 21550 |
| }, |
| { |
| "epoch": 41.763565891472865, |
| "eval_loss": 0.5253894925117493, |
| "eval_runtime": 186.6125, |
| "eval_samples_per_second": 101.837, |
| "eval_steps_per_second": 2.122, |
| "step": 21550 |
| }, |
| { |
| "epoch": 41.86046511627907, |
| "grad_norm": 0.9805233478546143, |
| "learning_rate": 3.256589147286822e-05, |
| "loss": 0.5100160217285157, |
| "step": 21600 |
| }, |
| { |
| "epoch": 41.86046511627907, |
| "eval_loss": 0.5194661021232605, |
| "eval_runtime": 188.6485, |
| "eval_samples_per_second": 100.738, |
| "eval_steps_per_second": 2.099, |
| "step": 21600 |
| }, |
| { |
| "epoch": 41.957364341085274, |
| "grad_norm": 0.846781313419342, |
| "learning_rate": 3.2178294573643414e-05, |
| "loss": 0.5082344818115234, |
| "step": 21650 |
| }, |
| { |
| "epoch": 41.957364341085274, |
| "eval_loss": 0.5229784846305847, |
| "eval_runtime": 189.4238, |
| "eval_samples_per_second": 100.325, |
| "eval_steps_per_second": 2.091, |
| "step": 21650 |
| }, |
| { |
| "epoch": 42.054263565891475, |
| "grad_norm": 0.8974575996398926, |
| "learning_rate": 3.179069767441861e-05, |
| "loss": 0.504454231262207, |
| "step": 21700 |
| }, |
| { |
| "epoch": 42.054263565891475, |
| "eval_loss": 0.5280088782310486, |
| "eval_runtime": 187.0803, |
| "eval_samples_per_second": 101.582, |
| "eval_steps_per_second": 2.117, |
| "step": 21700 |
| }, |
| { |
| "epoch": 42.151162790697676, |
| "grad_norm": 0.933649480342865, |
| "learning_rate": 3.14031007751938e-05, |
| "loss": 0.501411361694336, |
| "step": 21750 |
| }, |
| { |
| "epoch": 42.151162790697676, |
| "eval_loss": 0.5128213167190552, |
| "eval_runtime": 187.1399, |
| "eval_samples_per_second": 101.55, |
| "eval_steps_per_second": 2.116, |
| "step": 21750 |
| }, |
| { |
| "epoch": 42.248062015503876, |
| "grad_norm": 0.8940353393554688, |
| "learning_rate": 3.101550387596899e-05, |
| "loss": 0.4968502807617188, |
| "step": 21800 |
| }, |
| { |
| "epoch": 42.248062015503876, |
| "eval_loss": 0.5252892971038818, |
| "eval_runtime": 186.0325, |
| "eval_samples_per_second": 102.154, |
| "eval_steps_per_second": 2.129, |
| "step": 21800 |
| }, |
| { |
| "epoch": 42.34496124031008, |
| "grad_norm": 0.8218147158622742, |
| "learning_rate": 3.0627906976744186e-05, |
| "loss": 0.5118127059936524, |
| "step": 21850 |
| }, |
| { |
| "epoch": 42.34496124031008, |
| "eval_loss": 0.5292873382568359, |
| "eval_runtime": 184.4715, |
| "eval_samples_per_second": 103.019, |
| "eval_steps_per_second": 2.147, |
| "step": 21850 |
| }, |
| { |
| "epoch": 42.44186046511628, |
| "grad_norm": 0.8893631100654602, |
| "learning_rate": 3.0240310077519378e-05, |
| "loss": 0.5024843978881836, |
| "step": 21900 |
| }, |
| { |
| "epoch": 42.44186046511628, |
| "eval_loss": 0.524248480796814, |
| "eval_runtime": 184.4676, |
| "eval_samples_per_second": 103.021, |
| "eval_steps_per_second": 2.147, |
| "step": 21900 |
| }, |
| { |
| "epoch": 42.53875968992248, |
| "grad_norm": 1.0226677656173706, |
| "learning_rate": 2.9852713178294572e-05, |
| "loss": 0.4969608688354492, |
| "step": 21950 |
| }, |
| { |
| "epoch": 42.53875968992248, |
| "eval_loss": 0.5290153622627258, |
| "eval_runtime": 185.7762, |
| "eval_samples_per_second": 102.295, |
| "eval_steps_per_second": 2.132, |
| "step": 21950 |
| }, |
| { |
| "epoch": 42.63565891472868, |
| "grad_norm": 0.8559462428092957, |
| "learning_rate": 2.946511627906977e-05, |
| "loss": 0.5093366622924804, |
| "step": 22000 |
| }, |
| { |
| "epoch": 42.63565891472868, |
| "eval_loss": 0.516994059085846, |
| "eval_runtime": 186.7511, |
| "eval_samples_per_second": 101.761, |
| "eval_steps_per_second": 2.12, |
| "step": 22000 |
| }, |
| { |
| "epoch": 42.73255813953488, |
| "grad_norm": 0.8686088919639587, |
| "learning_rate": 2.9077519379844965e-05, |
| "loss": 0.49697071075439453, |
| "step": 22050 |
| }, |
| { |
| "epoch": 42.73255813953488, |
| "eval_loss": 0.5155017971992493, |
| "eval_runtime": 195.3357, |
| "eval_samples_per_second": 97.289, |
| "eval_steps_per_second": 2.027, |
| "step": 22050 |
| }, |
| { |
| "epoch": 42.82945736434109, |
| "grad_norm": 0.8473958969116211, |
| "learning_rate": 2.8689922480620157e-05, |
| "loss": 0.5003182220458985, |
| "step": 22100 |
| }, |
| { |
| "epoch": 42.82945736434109, |
| "eval_loss": 0.5242047309875488, |
| "eval_runtime": 194.0238, |
| "eval_samples_per_second": 97.947, |
| "eval_steps_per_second": 2.041, |
| "step": 22100 |
| }, |
| { |
| "epoch": 42.92635658914729, |
| "grad_norm": 0.9314485788345337, |
| "learning_rate": 2.830232558139535e-05, |
| "loss": 0.5022520065307617, |
| "step": 22150 |
| }, |
| { |
| "epoch": 42.92635658914729, |
| "eval_loss": 0.5250500440597534, |
| "eval_runtime": 233.2841, |
| "eval_samples_per_second": 81.463, |
| "eval_steps_per_second": 1.698, |
| "step": 22150 |
| }, |
| { |
| "epoch": 43.02325581395349, |
| "grad_norm": 0.9403958320617676, |
| "learning_rate": 2.7914728682170543e-05, |
| "loss": 0.5011024856567383, |
| "step": 22200 |
| }, |
| { |
| "epoch": 43.02325581395349, |
| "eval_loss": 0.5284552574157715, |
| "eval_runtime": 224.6625, |
| "eval_samples_per_second": 84.589, |
| "eval_steps_per_second": 1.763, |
| "step": 22200 |
| }, |
| { |
| "epoch": 43.12015503875969, |
| "grad_norm": 0.8705305457115173, |
| "learning_rate": 2.7527131782945737e-05, |
| "loss": 0.4972893524169922, |
| "step": 22250 |
| }, |
| { |
| "epoch": 43.12015503875969, |
| "eval_loss": 0.5284101366996765, |
| "eval_runtime": 194.01, |
| "eval_samples_per_second": 97.954, |
| "eval_steps_per_second": 2.041, |
| "step": 22250 |
| }, |
| { |
| "epoch": 43.21705426356589, |
| "grad_norm": 0.9412527084350586, |
| "learning_rate": 2.713953488372093e-05, |
| "loss": 0.4995740509033203, |
| "step": 22300 |
| }, |
| { |
| "epoch": 43.21705426356589, |
| "eval_loss": 0.516463041305542, |
| "eval_runtime": 208.129, |
| "eval_samples_per_second": 91.309, |
| "eval_steps_per_second": 1.903, |
| "step": 22300 |
| }, |
| { |
| "epoch": 43.31395348837209, |
| "grad_norm": 1.1200988292694092, |
| "learning_rate": 2.6751937984496123e-05, |
| "loss": 0.49355110168457034, |
| "step": 22350 |
| }, |
| { |
| "epoch": 43.31395348837209, |
| "eval_loss": 0.5287572741508484, |
| "eval_runtime": 193.966, |
| "eval_samples_per_second": 97.976, |
| "eval_steps_per_second": 2.042, |
| "step": 22350 |
| }, |
| { |
| "epoch": 43.41085271317829, |
| "grad_norm": 0.8845555186271667, |
| "learning_rate": 2.636434108527132e-05, |
| "loss": 0.48946212768554687, |
| "step": 22400 |
| }, |
| { |
| "epoch": 43.41085271317829, |
| "eval_loss": 0.5179551243782043, |
| "eval_runtime": 191.2172, |
| "eval_samples_per_second": 99.384, |
| "eval_steps_per_second": 2.071, |
| "step": 22400 |
| }, |
| { |
| "epoch": 43.507751937984494, |
| "grad_norm": 1.2325315475463867, |
| "learning_rate": 2.5976744186046513e-05, |
| "loss": 0.48715110778808596, |
| "step": 22450 |
| }, |
| { |
| "epoch": 43.507751937984494, |
| "eval_loss": 0.5255248546600342, |
| "eval_runtime": 214.106, |
| "eval_samples_per_second": 88.76, |
| "eval_steps_per_second": 1.85, |
| "step": 22450 |
| }, |
| { |
| "epoch": 43.604651162790695, |
| "grad_norm": 0.8257450461387634, |
| "learning_rate": 2.5589147286821708e-05, |
| "loss": 0.5038369369506835, |
| "step": 22500 |
| }, |
| { |
| "epoch": 43.604651162790695, |
| "eval_loss": 0.5318561792373657, |
| "eval_runtime": 237.7016, |
| "eval_samples_per_second": 79.949, |
| "eval_steps_per_second": 1.666, |
| "step": 22500 |
| }, |
| { |
| "epoch": 43.701550387596896, |
| "grad_norm": 0.9131925702095032, |
| "learning_rate": 2.52015503875969e-05, |
| "loss": 0.48653671264648435, |
| "step": 22550 |
| }, |
| { |
| "epoch": 43.701550387596896, |
| "eval_loss": 0.5257322192192078, |
| "eval_runtime": 197.9576, |
| "eval_samples_per_second": 96.0, |
| "eval_steps_per_second": 2.0, |
| "step": 22550 |
| }, |
| { |
| "epoch": 43.798449612403104, |
| "grad_norm": 0.9946778416633606, |
| "learning_rate": 2.4813953488372094e-05, |
| "loss": 0.5035680389404297, |
| "step": 22600 |
| }, |
| { |
| "epoch": 43.798449612403104, |
| "eval_loss": 0.5150420665740967, |
| "eval_runtime": 206.912, |
| "eval_samples_per_second": 91.846, |
| "eval_steps_per_second": 1.914, |
| "step": 22600 |
| }, |
| { |
| "epoch": 43.895348837209305, |
| "grad_norm": 1.0072699785232544, |
| "learning_rate": 2.442635658914729e-05, |
| "loss": 0.49596702575683593, |
| "step": 22650 |
| }, |
| { |
| "epoch": 43.895348837209305, |
| "eval_loss": 0.5257139205932617, |
| "eval_runtime": 199.7615, |
| "eval_samples_per_second": 95.133, |
| "eval_steps_per_second": 1.982, |
| "step": 22650 |
| }, |
| { |
| "epoch": 43.992248062015506, |
| "grad_norm": 0.9366716742515564, |
| "learning_rate": 2.4038759689922483e-05, |
| "loss": 0.49716136932373045, |
| "step": 22700 |
| }, |
| { |
| "epoch": 43.992248062015506, |
| "eval_loss": 0.5198754668235779, |
| "eval_runtime": 193.3347, |
| "eval_samples_per_second": 98.296, |
| "eval_steps_per_second": 2.048, |
| "step": 22700 |
| }, |
| { |
| "epoch": 44.08914728682171, |
| "grad_norm": 0.90858393907547, |
| "learning_rate": 2.3651162790697675e-05, |
| "loss": 0.5002625656127929, |
| "step": 22750 |
| }, |
| { |
| "epoch": 44.08914728682171, |
| "eval_loss": 0.527286171913147, |
| "eval_runtime": 196.2426, |
| "eval_samples_per_second": 96.839, |
| "eval_steps_per_second": 2.018, |
| "step": 22750 |
| }, |
| { |
| "epoch": 44.18604651162791, |
| "grad_norm": 1.0151848793029785, |
| "learning_rate": 2.326356589147287e-05, |
| "loss": 0.4959259033203125, |
| "step": 22800 |
| }, |
| { |
| "epoch": 44.18604651162791, |
| "eval_loss": 0.5242442488670349, |
| "eval_runtime": 194.4345, |
| "eval_samples_per_second": 97.74, |
| "eval_steps_per_second": 2.037, |
| "step": 22800 |
| }, |
| { |
| "epoch": 44.28294573643411, |
| "grad_norm": 0.8512209057807922, |
| "learning_rate": 2.287596899224806e-05, |
| "loss": 0.500455093383789, |
| "step": 22850 |
| }, |
| { |
| "epoch": 44.28294573643411, |
| "eval_loss": 0.5242491960525513, |
| "eval_runtime": 193.4476, |
| "eval_samples_per_second": 98.238, |
| "eval_steps_per_second": 2.047, |
| "step": 22850 |
| }, |
| { |
| "epoch": 44.37984496124031, |
| "grad_norm": 0.8968107104301453, |
| "learning_rate": 2.248837209302326e-05, |
| "loss": 0.4897348022460937, |
| "step": 22900 |
| }, |
| { |
| "epoch": 44.37984496124031, |
| "eval_loss": 0.5159934759140015, |
| "eval_runtime": 189.2217, |
| "eval_samples_per_second": 100.432, |
| "eval_steps_per_second": 2.093, |
| "step": 22900 |
| }, |
| { |
| "epoch": 44.47674418604651, |
| "grad_norm": 0.8487868309020996, |
| "learning_rate": 2.210077519379845e-05, |
| "loss": 0.48243858337402346, |
| "step": 22950 |
| }, |
| { |
| "epoch": 44.47674418604651, |
| "eval_loss": 0.5207856893539429, |
| "eval_runtime": 191.921, |
| "eval_samples_per_second": 99.02, |
| "eval_steps_per_second": 2.063, |
| "step": 22950 |
| }, |
| { |
| "epoch": 44.57364341085271, |
| "grad_norm": 0.9809431433677673, |
| "learning_rate": 2.1713178294573645e-05, |
| "loss": 0.48758525848388673, |
| "step": 23000 |
| }, |
| { |
| "epoch": 44.57364341085271, |
| "eval_loss": 0.5163289308547974, |
| "eval_runtime": 196.6885, |
| "eval_samples_per_second": 96.62, |
| "eval_steps_per_second": 2.013, |
| "step": 23000 |
| }, |
| { |
| "epoch": 44.67054263565891, |
| "grad_norm": 0.9498407244682312, |
| "learning_rate": 2.1325581395348836e-05, |
| "loss": 0.48854473114013675, |
| "step": 23050 |
| }, |
| { |
| "epoch": 44.67054263565891, |
| "eval_loss": 0.5189388394355774, |
| "eval_runtime": 196.4657, |
| "eval_samples_per_second": 96.729, |
| "eval_steps_per_second": 2.016, |
| "step": 23050 |
| }, |
| { |
| "epoch": 44.76744186046512, |
| "grad_norm": 0.9875770807266235, |
| "learning_rate": 2.0937984496124034e-05, |
| "loss": 0.5109722900390625, |
| "step": 23100 |
| }, |
| { |
| "epoch": 44.76744186046512, |
| "eval_loss": 0.5207294821739197, |
| "eval_runtime": 194.5411, |
| "eval_samples_per_second": 97.686, |
| "eval_steps_per_second": 2.036, |
| "step": 23100 |
| }, |
| { |
| "epoch": 44.86434108527132, |
| "grad_norm": 0.8651441335678101, |
| "learning_rate": 2.0550387596899226e-05, |
| "loss": 0.4815971755981445, |
| "step": 23150 |
| }, |
| { |
| "epoch": 44.86434108527132, |
| "eval_loss": 0.5165086388587952, |
| "eval_runtime": 194.6518, |
| "eval_samples_per_second": 97.631, |
| "eval_steps_per_second": 2.034, |
| "step": 23150 |
| }, |
| { |
| "epoch": 44.96124031007752, |
| "grad_norm": 1.0780807733535767, |
| "learning_rate": 2.016279069767442e-05, |
| "loss": 0.49749275207519533, |
| "step": 23200 |
| }, |
| { |
| "epoch": 44.96124031007752, |
| "eval_loss": 0.5201721787452698, |
| "eval_runtime": 195.2209, |
| "eval_samples_per_second": 97.346, |
| "eval_steps_per_second": 2.028, |
| "step": 23200 |
| }, |
| { |
| "epoch": 45.05813953488372, |
| "grad_norm": 0.9796660542488098, |
| "learning_rate": 1.977519379844961e-05, |
| "loss": 0.4934458541870117, |
| "step": 23250 |
| }, |
| { |
| "epoch": 45.05813953488372, |
| "eval_loss": 0.5206090211868286, |
| "eval_runtime": 194.3101, |
| "eval_samples_per_second": 97.802, |
| "eval_steps_per_second": 2.038, |
| "step": 23250 |
| }, |
| { |
| "epoch": 45.15503875968992, |
| "grad_norm": 0.8599798679351807, |
| "learning_rate": 1.938759689922481e-05, |
| "loss": 0.5016697692871094, |
| "step": 23300 |
| }, |
| { |
| "epoch": 45.15503875968992, |
| "eval_loss": 0.5147153735160828, |
| "eval_runtime": 192.9359, |
| "eval_samples_per_second": 98.499, |
| "eval_steps_per_second": 2.052, |
| "step": 23300 |
| }, |
| { |
| "epoch": 45.251937984496124, |
| "grad_norm": 0.7895607352256775, |
| "learning_rate": 1.9e-05, |
| "loss": 0.49334972381591796, |
| "step": 23350 |
| }, |
| { |
| "epoch": 45.251937984496124, |
| "eval_loss": 0.515512228012085, |
| "eval_runtime": 193.0203, |
| "eval_samples_per_second": 98.456, |
| "eval_steps_per_second": 2.052, |
| "step": 23350 |
| }, |
| { |
| "epoch": 45.348837209302324, |
| "grad_norm": 1.0045746564865112, |
| "learning_rate": 1.8612403100775196e-05, |
| "loss": 0.49473106384277343, |
| "step": 23400 |
| }, |
| { |
| "epoch": 45.348837209302324, |
| "eval_loss": 0.5082274675369263, |
| "eval_runtime": 197.0156, |
| "eval_samples_per_second": 96.459, |
| "eval_steps_per_second": 2.01, |
| "step": 23400 |
| }, |
| { |
| "epoch": 45.445736434108525, |
| "grad_norm": 0.9885164499282837, |
| "learning_rate": 1.8224806201550387e-05, |
| "loss": 0.49016048431396486, |
| "step": 23450 |
| }, |
| { |
| "epoch": 45.445736434108525, |
| "eval_loss": 0.5208576917648315, |
| "eval_runtime": 196.5842, |
| "eval_samples_per_second": 96.671, |
| "eval_steps_per_second": 2.014, |
| "step": 23450 |
| }, |
| { |
| "epoch": 45.542635658914726, |
| "grad_norm": 0.9519304633140564, |
| "learning_rate": 1.7837209302325582e-05, |
| "loss": 0.501864242553711, |
| "step": 23500 |
| }, |
| { |
| "epoch": 45.542635658914726, |
| "eval_loss": 0.5219539403915405, |
| "eval_runtime": 196.4914, |
| "eval_samples_per_second": 96.717, |
| "eval_steps_per_second": 2.015, |
| "step": 23500 |
| }, |
| { |
| "epoch": 45.63953488372093, |
| "grad_norm": 0.8717612624168396, |
| "learning_rate": 1.7449612403100777e-05, |
| "loss": 0.4909215545654297, |
| "step": 23550 |
| }, |
| { |
| "epoch": 45.63953488372093, |
| "eval_loss": 0.5142674446105957, |
| "eval_runtime": 197.216, |
| "eval_samples_per_second": 96.361, |
| "eval_steps_per_second": 2.008, |
| "step": 23550 |
| }, |
| { |
| "epoch": 45.736434108527135, |
| "grad_norm": 0.986541748046875, |
| "learning_rate": 1.7062015503875968e-05, |
| "loss": 0.4898907470703125, |
| "step": 23600 |
| }, |
| { |
| "epoch": 45.736434108527135, |
| "eval_loss": 0.5174142122268677, |
| "eval_runtime": 194.9761, |
| "eval_samples_per_second": 97.468, |
| "eval_steps_per_second": 2.031, |
| "step": 23600 |
| }, |
| { |
| "epoch": 45.833333333333336, |
| "grad_norm": 1.0406668186187744, |
| "learning_rate": 1.6674418604651163e-05, |
| "loss": 0.48606651306152343, |
| "step": 23650 |
| }, |
| { |
| "epoch": 45.833333333333336, |
| "eval_loss": 0.5181837677955627, |
| "eval_runtime": 195.1438, |
| "eval_samples_per_second": 97.385, |
| "eval_steps_per_second": 2.029, |
| "step": 23650 |
| }, |
| { |
| "epoch": 45.93023255813954, |
| "grad_norm": 1.2656255960464478, |
| "learning_rate": 1.6286821705426357e-05, |
| "loss": 0.4883313751220703, |
| "step": 23700 |
| }, |
| { |
| "epoch": 45.93023255813954, |
| "eval_loss": 0.5129756927490234, |
| "eval_runtime": 191.7296, |
| "eval_samples_per_second": 99.119, |
| "eval_steps_per_second": 2.065, |
| "step": 23700 |
| }, |
| { |
| "epoch": 46.02713178294574, |
| "grad_norm": 0.9951698184013367, |
| "learning_rate": 1.5899224806201552e-05, |
| "loss": 0.4959673690795898, |
| "step": 23750 |
| }, |
| { |
| "epoch": 46.02713178294574, |
| "eval_loss": 0.5178924798965454, |
| "eval_runtime": 193.978, |
| "eval_samples_per_second": 97.97, |
| "eval_steps_per_second": 2.041, |
| "step": 23750 |
| }, |
| { |
| "epoch": 46.12403100775194, |
| "grad_norm": 0.8214923739433289, |
| "learning_rate": 1.5511627906976743e-05, |
| "loss": 0.47073410034179686, |
| "step": 23800 |
| }, |
| { |
| "epoch": 46.12403100775194, |
| "eval_loss": 0.5200338959693909, |
| "eval_runtime": 194.4058, |
| "eval_samples_per_second": 97.754, |
| "eval_steps_per_second": 2.037, |
| "step": 23800 |
| }, |
| { |
| "epoch": 46.22093023255814, |
| "grad_norm": 1.0531407594680786, |
| "learning_rate": 1.5124031007751938e-05, |
| "loss": 0.4762581253051758, |
| "step": 23850 |
| }, |
| { |
| "epoch": 46.22093023255814, |
| "eval_loss": 0.5189253687858582, |
| "eval_runtime": 192.1836, |
| "eval_samples_per_second": 98.885, |
| "eval_steps_per_second": 2.061, |
| "step": 23850 |
| }, |
| { |
| "epoch": 46.31782945736434, |
| "grad_norm": 0.8493297696113586, |
| "learning_rate": 1.4736434108527133e-05, |
| "loss": 0.47398033142089846, |
| "step": 23900 |
| }, |
| { |
| "epoch": 46.31782945736434, |
| "eval_loss": 0.5211531519889832, |
| "eval_runtime": 196.6, |
| "eval_samples_per_second": 96.663, |
| "eval_steps_per_second": 2.014, |
| "step": 23900 |
| }, |
| { |
| "epoch": 46.41472868217054, |
| "grad_norm": 0.9804657697677612, |
| "learning_rate": 1.4348837209302326e-05, |
| "loss": 0.4727859878540039, |
| "step": 23950 |
| }, |
| { |
| "epoch": 46.41472868217054, |
| "eval_loss": 0.5094043612480164, |
| "eval_runtime": 193.37, |
| "eval_samples_per_second": 98.278, |
| "eval_steps_per_second": 2.048, |
| "step": 23950 |
| }, |
| { |
| "epoch": 46.51162790697674, |
| "grad_norm": 0.9358029961585999, |
| "learning_rate": 1.3961240310077519e-05, |
| "loss": 0.4944017028808594, |
| "step": 24000 |
| }, |
| { |
| "epoch": 46.51162790697674, |
| "eval_loss": 0.5203186273574829, |
| "eval_runtime": 192.6269, |
| "eval_samples_per_second": 98.657, |
| "eval_steps_per_second": 2.056, |
| "step": 24000 |
| }, |
| { |
| "epoch": 46.60852713178294, |
| "grad_norm": 1.053912878036499, |
| "learning_rate": 1.3573643410852712e-05, |
| "loss": 0.48282398223876954, |
| "step": 24050 |
| }, |
| { |
| "epoch": 46.60852713178294, |
| "eval_loss": 0.5168351531028748, |
| "eval_runtime": 193.3422, |
| "eval_samples_per_second": 98.292, |
| "eval_steps_per_second": 2.048, |
| "step": 24050 |
| }, |
| { |
| "epoch": 46.70542635658915, |
| "grad_norm": 0.9707403182983398, |
| "learning_rate": 1.3186046511627908e-05, |
| "loss": 0.49057979583740235, |
| "step": 24100 |
| }, |
| { |
| "epoch": 46.70542635658915, |
| "eval_loss": 0.5090420842170715, |
| "eval_runtime": 194.8113, |
| "eval_samples_per_second": 97.551, |
| "eval_steps_per_second": 2.033, |
| "step": 24100 |
| }, |
| { |
| "epoch": 46.80232558139535, |
| "grad_norm": 0.8060054183006287, |
| "learning_rate": 1.2798449612403101e-05, |
| "loss": 0.4925830841064453, |
| "step": 24150 |
| }, |
| { |
| "epoch": 46.80232558139535, |
| "eval_loss": 0.5142449736595154, |
| "eval_runtime": 193.6387, |
| "eval_samples_per_second": 98.142, |
| "eval_steps_per_second": 2.045, |
| "step": 24150 |
| }, |
| { |
| "epoch": 46.89922480620155, |
| "grad_norm": 1.0535565614700317, |
| "learning_rate": 1.2410852713178294e-05, |
| "loss": 0.48466400146484373, |
| "step": 24200 |
| }, |
| { |
| "epoch": 46.89922480620155, |
| "eval_loss": 0.5140147805213928, |
| "eval_runtime": 194.8466, |
| "eval_samples_per_second": 97.533, |
| "eval_steps_per_second": 2.032, |
| "step": 24200 |
| }, |
| { |
| "epoch": 46.99612403100775, |
| "grad_norm": 0.8662691712379456, |
| "learning_rate": 1.202325581395349e-05, |
| "loss": 0.4897040939331055, |
| "step": 24250 |
| }, |
| { |
| "epoch": 46.99612403100775, |
| "eval_loss": 0.5235028862953186, |
| "eval_runtime": 193.8048, |
| "eval_samples_per_second": 98.057, |
| "eval_steps_per_second": 2.043, |
| "step": 24250 |
| }, |
| { |
| "epoch": 47.093023255813954, |
| "grad_norm": 1.1001732349395752, |
| "learning_rate": 1.1635658914728682e-05, |
| "loss": 0.49039371490478517, |
| "step": 24300 |
| }, |
| { |
| "epoch": 47.093023255813954, |
| "eval_loss": 0.5225379467010498, |
| "eval_runtime": 194.172, |
| "eval_samples_per_second": 97.872, |
| "eval_steps_per_second": 2.039, |
| "step": 24300 |
| }, |
| { |
| "epoch": 47.189922480620154, |
| "grad_norm": 0.8969790935516357, |
| "learning_rate": 1.1248062015503877e-05, |
| "loss": 0.47529258728027346, |
| "step": 24350 |
| }, |
| { |
| "epoch": 47.189922480620154, |
| "eval_loss": 0.5167431831359863, |
| "eval_runtime": 191.575, |
| "eval_samples_per_second": 99.199, |
| "eval_steps_per_second": 2.067, |
| "step": 24350 |
| }, |
| { |
| "epoch": 47.286821705426355, |
| "grad_norm": 0.8796032071113586, |
| "learning_rate": 1.086046511627907e-05, |
| "loss": 0.48536407470703125, |
| "step": 24400 |
| }, |
| { |
| "epoch": 47.286821705426355, |
| "eval_loss": 0.5132911205291748, |
| "eval_runtime": 192.842, |
| "eval_samples_per_second": 98.547, |
| "eval_steps_per_second": 2.053, |
| "step": 24400 |
| }, |
| { |
| "epoch": 47.383720930232556, |
| "grad_norm": 0.7977496981620789, |
| "learning_rate": 1.0472868217054265e-05, |
| "loss": 0.4799433517456055, |
| "step": 24450 |
| }, |
| { |
| "epoch": 47.383720930232556, |
| "eval_loss": 0.5126928687095642, |
| "eval_runtime": 192.6571, |
| "eval_samples_per_second": 98.642, |
| "eval_steps_per_second": 2.055, |
| "step": 24450 |
| }, |
| { |
| "epoch": 47.48062015503876, |
| "grad_norm": 1.0533097982406616, |
| "learning_rate": 1.0085271317829458e-05, |
| "loss": 0.49268310546875, |
| "step": 24500 |
| }, |
| { |
| "epoch": 47.48062015503876, |
| "eval_loss": 0.5129761695861816, |
| "eval_runtime": 193.297, |
| "eval_samples_per_second": 98.315, |
| "eval_steps_per_second": 2.049, |
| "step": 24500 |
| }, |
| { |
| "epoch": 47.57751937984496, |
| "grad_norm": 0.9760648608207703, |
| "learning_rate": 9.697674418604652e-06, |
| "loss": 0.49476985931396483, |
| "step": 24550 |
| }, |
| { |
| "epoch": 47.57751937984496, |
| "eval_loss": 0.5181257128715515, |
| "eval_runtime": 187.2145, |
| "eval_samples_per_second": 101.509, |
| "eval_steps_per_second": 2.115, |
| "step": 24550 |
| }, |
| { |
| "epoch": 47.674418604651166, |
| "grad_norm": 1.0375988483428955, |
| "learning_rate": 9.310077519379845e-06, |
| "loss": 0.4825825881958008, |
| "step": 24600 |
| }, |
| { |
| "epoch": 47.674418604651166, |
| "eval_loss": 0.5161689519882202, |
| "eval_runtime": 195.4323, |
| "eval_samples_per_second": 97.241, |
| "eval_steps_per_second": 2.026, |
| "step": 24600 |
| }, |
| { |
| "epoch": 47.77131782945737, |
| "grad_norm": 1.0778166055679321, |
| "learning_rate": 8.92248062015504e-06, |
| "loss": 0.47775421142578123, |
| "step": 24650 |
| }, |
| { |
| "epoch": 47.77131782945737, |
| "eval_loss": 0.5174992084503174, |
| "eval_runtime": 193.574, |
| "eval_samples_per_second": 98.174, |
| "eval_steps_per_second": 2.046, |
| "step": 24650 |
| }, |
| { |
| "epoch": 47.86821705426357, |
| "grad_norm": 0.8025913238525391, |
| "learning_rate": 8.534883720930233e-06, |
| "loss": 0.4860971450805664, |
| "step": 24700 |
| }, |
| { |
| "epoch": 47.86821705426357, |
| "eval_loss": 0.5220958590507507, |
| "eval_runtime": 187.9385, |
| "eval_samples_per_second": 101.118, |
| "eval_steps_per_second": 2.107, |
| "step": 24700 |
| }, |
| { |
| "epoch": 47.96511627906977, |
| "grad_norm": 0.9379816651344299, |
| "learning_rate": 8.147286821705428e-06, |
| "loss": 0.4851155471801758, |
| "step": 24750 |
| }, |
| { |
| "epoch": 47.96511627906977, |
| "eval_loss": 0.5218383073806763, |
| "eval_runtime": 191.0396, |
| "eval_samples_per_second": 99.477, |
| "eval_steps_per_second": 2.073, |
| "step": 24750 |
| }, |
| { |
| "epoch": 48.06201550387597, |
| "grad_norm": 0.8389429450035095, |
| "learning_rate": 7.759689922480621e-06, |
| "loss": 0.4862751770019531, |
| "step": 24800 |
| }, |
| { |
| "epoch": 48.06201550387597, |
| "eval_loss": 0.5244157314300537, |
| "eval_runtime": 194.1611, |
| "eval_samples_per_second": 97.878, |
| "eval_steps_per_second": 2.04, |
| "step": 24800 |
| }, |
| { |
| "epoch": 48.15891472868217, |
| "grad_norm": 1.1113231182098389, |
| "learning_rate": 7.372093023255815e-06, |
| "loss": 0.4865293502807617, |
| "step": 24850 |
| }, |
| { |
| "epoch": 48.15891472868217, |
| "eval_loss": 0.5153717398643494, |
| "eval_runtime": 188.1418, |
| "eval_samples_per_second": 101.009, |
| "eval_steps_per_second": 2.105, |
| "step": 24850 |
| }, |
| { |
| "epoch": 48.25581395348837, |
| "grad_norm": 0.9999150633811951, |
| "learning_rate": 6.984496124031008e-06, |
| "loss": 0.48201698303222656, |
| "step": 24900 |
| }, |
| { |
| "epoch": 48.25581395348837, |
| "eval_loss": 0.5217995047569275, |
| "eval_runtime": 202.7563, |
| "eval_samples_per_second": 93.728, |
| "eval_steps_per_second": 1.953, |
| "step": 24900 |
| }, |
| { |
| "epoch": 48.35271317829457, |
| "grad_norm": 0.9121875166893005, |
| "learning_rate": 6.596899224806203e-06, |
| "loss": 0.4848302459716797, |
| "step": 24950 |
| }, |
| { |
| "epoch": 48.35271317829457, |
| "eval_loss": 0.5181519389152527, |
| "eval_runtime": 203.5881, |
| "eval_samples_per_second": 93.345, |
| "eval_steps_per_second": 1.945, |
| "step": 24950 |
| }, |
| { |
| "epoch": 48.44961240310077, |
| "grad_norm": 0.9109277129173279, |
| "learning_rate": 6.209302325581396e-06, |
| "loss": 0.4896494293212891, |
| "step": 25000 |
| }, |
| { |
| "epoch": 48.44961240310077, |
| "eval_loss": 0.5163356065750122, |
| "eval_runtime": 205.6732, |
| "eval_samples_per_second": 92.399, |
| "eval_steps_per_second": 1.925, |
| "step": 25000 |
| }, |
| { |
| "epoch": 48.54651162790697, |
| "grad_norm": 0.8780000805854797, |
| "learning_rate": 5.8217054263565895e-06, |
| "loss": 0.4852371597290039, |
| "step": 25050 |
| }, |
| { |
| "epoch": 48.54651162790697, |
| "eval_loss": 0.5103786587715149, |
| "eval_runtime": 203.7202, |
| "eval_samples_per_second": 93.285, |
| "eval_steps_per_second": 1.944, |
| "step": 25050 |
| }, |
| { |
| "epoch": 48.64341085271318, |
| "grad_norm": 1.0753669738769531, |
| "learning_rate": 5.4341085271317826e-06, |
| "loss": 0.48256893157958985, |
| "step": 25100 |
| }, |
| { |
| "epoch": 48.64341085271318, |
| "eval_loss": 0.5188427567481995, |
| "eval_runtime": 206.588, |
| "eval_samples_per_second": 91.99, |
| "eval_steps_per_second": 1.917, |
| "step": 25100 |
| }, |
| { |
| "epoch": 48.74031007751938, |
| "grad_norm": 0.9151817560195923, |
| "learning_rate": 5.0465116279069764e-06, |
| "loss": 0.4795745849609375, |
| "step": 25150 |
| }, |
| { |
| "epoch": 48.74031007751938, |
| "eval_loss": 0.5129519104957581, |
| "eval_runtime": 217.8088, |
| "eval_samples_per_second": 87.251, |
| "eval_steps_per_second": 1.818, |
| "step": 25150 |
| }, |
| { |
| "epoch": 48.83720930232558, |
| "grad_norm": 0.8677839636802673, |
| "learning_rate": 4.65891472868217e-06, |
| "loss": 0.4724645233154297, |
| "step": 25200 |
| }, |
| { |
| "epoch": 48.83720930232558, |
| "eval_loss": 0.5128815174102783, |
| "eval_runtime": 222.6438, |
| "eval_samples_per_second": 85.356, |
| "eval_steps_per_second": 1.779, |
| "step": 25200 |
| }, |
| { |
| "epoch": 48.934108527131784, |
| "grad_norm": 1.0911849737167358, |
| "learning_rate": 4.271317829457364e-06, |
| "loss": 0.4840513610839844, |
| "step": 25250 |
| }, |
| { |
| "epoch": 48.934108527131784, |
| "eval_loss": 0.5184940695762634, |
| "eval_runtime": 212.3902, |
| "eval_samples_per_second": 89.477, |
| "eval_steps_per_second": 1.864, |
| "step": 25250 |
| }, |
| { |
| "epoch": 49.031007751937985, |
| "grad_norm": 1.049880027770996, |
| "learning_rate": 3.883720930232558e-06, |
| "loss": 0.4812747955322266, |
| "step": 25300 |
| }, |
| { |
| "epoch": 49.031007751937985, |
| "eval_loss": 0.5137081742286682, |
| "eval_runtime": 207.0601, |
| "eval_samples_per_second": 91.78, |
| "eval_steps_per_second": 1.912, |
| "step": 25300 |
| }, |
| { |
| "epoch": 49.127906976744185, |
| "grad_norm": 0.7971972823143005, |
| "learning_rate": 3.496124031007752e-06, |
| "loss": 0.4881282424926758, |
| "step": 25350 |
| }, |
| { |
| "epoch": 49.127906976744185, |
| "eval_loss": 0.5165645480155945, |
| "eval_runtime": 202.7666, |
| "eval_samples_per_second": 93.724, |
| "eval_steps_per_second": 1.953, |
| "step": 25350 |
| }, |
| { |
| "epoch": 49.224806201550386, |
| "grad_norm": 0.8528485298156738, |
| "learning_rate": 3.108527131782946e-06, |
| "loss": 0.48426227569580077, |
| "step": 25400 |
| }, |
| { |
| "epoch": 49.224806201550386, |
| "eval_loss": 0.5217786431312561, |
| "eval_runtime": 201.3146, |
| "eval_samples_per_second": 94.4, |
| "eval_steps_per_second": 1.967, |
| "step": 25400 |
| }, |
| { |
| "epoch": 49.32170542635659, |
| "grad_norm": 0.9365469813346863, |
| "learning_rate": 2.7209302325581397e-06, |
| "loss": 0.4818797302246094, |
| "step": 25450 |
| }, |
| { |
| "epoch": 49.32170542635659, |
| "eval_loss": 0.5201877951622009, |
| "eval_runtime": 204.7457, |
| "eval_samples_per_second": 92.818, |
| "eval_steps_per_second": 1.934, |
| "step": 25450 |
| }, |
| { |
| "epoch": 49.41860465116279, |
| "grad_norm": 0.787333071231842, |
| "learning_rate": 2.3333333333333336e-06, |
| "loss": 0.48442951202392576, |
| "step": 25500 |
| }, |
| { |
| "epoch": 49.41860465116279, |
| "eval_loss": 0.5099871158599854, |
| "eval_runtime": 200.4033, |
| "eval_samples_per_second": 94.829, |
| "eval_steps_per_second": 1.976, |
| "step": 25500 |
| }, |
| { |
| "epoch": 49.51550387596899, |
| "grad_norm": 1.1736680269241333, |
| "learning_rate": 1.9457364341085275e-06, |
| "loss": 0.49098060607910154, |
| "step": 25550 |
| }, |
| { |
| "epoch": 49.51550387596899, |
| "eval_loss": 0.5097952485084534, |
| "eval_runtime": 202.1301, |
| "eval_samples_per_second": 94.019, |
| "eval_steps_per_second": 1.959, |
| "step": 25550 |
| }, |
| { |
| "epoch": 49.6124031007752, |
| "grad_norm": 0.8908654451370239, |
| "learning_rate": 1.558139534883721e-06, |
| "loss": 0.4801137924194336, |
| "step": 25600 |
| }, |
| { |
| "epoch": 49.6124031007752, |
| "eval_loss": 0.5076445937156677, |
| "eval_runtime": 205.4214, |
| "eval_samples_per_second": 92.512, |
| "eval_steps_per_second": 1.928, |
| "step": 25600 |
| }, |
| { |
| "epoch": 49.7093023255814, |
| "grad_norm": 0.9247883558273315, |
| "learning_rate": 1.1705426356589148e-06, |
| "loss": 0.47291782379150393, |
| "step": 25650 |
| }, |
| { |
| "epoch": 49.7093023255814, |
| "eval_loss": 0.5182952880859375, |
| "eval_runtime": 215.4281, |
| "eval_samples_per_second": 88.215, |
| "eval_steps_per_second": 1.838, |
| "step": 25650 |
| }, |
| { |
| "epoch": 49.8062015503876, |
| "grad_norm": 1.0541952848434448, |
| "learning_rate": 7.829457364341086e-07, |
| "loss": 0.4783976745605469, |
| "step": 25700 |
| }, |
| { |
| "epoch": 49.8062015503876, |
| "eval_loss": 0.5079155564308167, |
| "eval_runtime": 203.5265, |
| "eval_samples_per_second": 93.374, |
| "eval_steps_per_second": 1.946, |
| "step": 25700 |
| }, |
| { |
| "epoch": 49.9031007751938, |
| "grad_norm": 1.1467986106872559, |
| "learning_rate": 3.953488372093023e-07, |
| "loss": 0.48399200439453127, |
| "step": 25750 |
| }, |
| { |
| "epoch": 49.9031007751938, |
| "eval_loss": 0.5200158357620239, |
| "eval_runtime": 203.7997, |
| "eval_samples_per_second": 93.248, |
| "eval_steps_per_second": 1.943, |
| "step": 25750 |
| }, |
| { |
| "epoch": 50.0, |
| "grad_norm": 1.033553123474121, |
| "learning_rate": 7.751937984496125e-09, |
| "loss": 0.48281837463378907, |
| "step": 25800 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_loss": 0.5126314163208008, |
| "eval_runtime": 203.2326, |
| "eval_samples_per_second": 93.509, |
| "eval_steps_per_second": 1.949, |
| "step": 25800 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 25800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.204755448469504e+17, |
| "train_batch_size": 192, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|