| { |
| "best_metric": 2.6091578006744385, |
| "best_model_checkpoint": "learning_source_20260316/protein_sequence/bert-output/protein_sequence-small/checkpoint-44000", |
| "epoch": 3505.9075391180654, |
| "eval_steps": 100, |
| "global_step": 60000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 5.689900426742532, |
| "grad_norm": 0.5827894806861877, |
| "learning_rate": 3e-06, |
| "loss": 3.1299, |
| "step": 100 |
| }, |
| { |
| "epoch": 5.689900426742532, |
| "eval_loss": 2.7554705142974854, |
| "eval_runtime": 13.2804, |
| "eval_samples_per_second": 104.967, |
| "eval_steps_per_second": 104.967, |
| "step": 100 |
| }, |
| { |
| "epoch": 11.379800853485063, |
| "grad_norm": 0.47490769624710083, |
| "learning_rate": 6e-06, |
| "loss": 2.7301, |
| "step": 200 |
| }, |
| { |
| "epoch": 11.379800853485063, |
| "eval_loss": 2.695699453353882, |
| "eval_runtime": 13.2496, |
| "eval_samples_per_second": 105.211, |
| "eval_steps_per_second": 105.211, |
| "step": 200 |
| }, |
| { |
| "epoch": 17.069701280227594, |
| "grad_norm": 0.27892372012138367, |
| "learning_rate": 5.998999666555519e-06, |
| "loss": 2.6967, |
| "step": 300 |
| }, |
| { |
| "epoch": 17.069701280227594, |
| "eval_loss": 2.6834542751312256, |
| "eval_runtime": 13.2537, |
| "eval_samples_per_second": 105.178, |
| "eval_steps_per_second": 105.178, |
| "step": 300 |
| }, |
| { |
| "epoch": 22.759601706970127, |
| "grad_norm": 0.26072826981544495, |
| "learning_rate": 5.997999333111037e-06, |
| "loss": 2.6876, |
| "step": 400 |
| }, |
| { |
| "epoch": 22.759601706970127, |
| "eval_loss": 2.6789743900299072, |
| "eval_runtime": 13.6654, |
| "eval_samples_per_second": 102.01, |
| "eval_steps_per_second": 102.01, |
| "step": 400 |
| }, |
| { |
| "epoch": 28.44950213371266, |
| "grad_norm": 0.29876092076301575, |
| "learning_rate": 5.9969989996665554e-06, |
| "loss": 2.6822, |
| "step": 500 |
| }, |
| { |
| "epoch": 28.44950213371266, |
| "eval_loss": 2.6772079467773438, |
| "eval_runtime": 13.27, |
| "eval_samples_per_second": 105.049, |
| "eval_steps_per_second": 105.049, |
| "step": 500 |
| }, |
| { |
| "epoch": 34.13940256045519, |
| "grad_norm": 0.3491186499595642, |
| "learning_rate": 5.995998666222074e-06, |
| "loss": 2.6793, |
| "step": 600 |
| }, |
| { |
| "epoch": 34.13940256045519, |
| "eval_loss": 2.6701366901397705, |
| "eval_runtime": 13.2631, |
| "eval_samples_per_second": 105.104, |
| "eval_steps_per_second": 105.104, |
| "step": 600 |
| }, |
| { |
| "epoch": 39.82930298719772, |
| "grad_norm": 0.38635119795799255, |
| "learning_rate": 5.994998332777593e-06, |
| "loss": 2.6775, |
| "step": 700 |
| }, |
| { |
| "epoch": 39.82930298719772, |
| "eval_loss": 2.675004243850708, |
| "eval_runtime": 13.7168, |
| "eval_samples_per_second": 101.627, |
| "eval_steps_per_second": 101.627, |
| "step": 700 |
| }, |
| { |
| "epoch": 45.519203413940254, |
| "grad_norm": 0.34463900327682495, |
| "learning_rate": 5.9939979993331115e-06, |
| "loss": 2.6754, |
| "step": 800 |
| }, |
| { |
| "epoch": 45.519203413940254, |
| "eval_loss": 2.673832416534424, |
| "eval_runtime": 13.2859, |
| "eval_samples_per_second": 104.923, |
| "eval_steps_per_second": 104.923, |
| "step": 800 |
| }, |
| { |
| "epoch": 51.209103840682786, |
| "grad_norm": 0.35753050446510315, |
| "learning_rate": 5.992997665888629e-06, |
| "loss": 2.6743, |
| "step": 900 |
| }, |
| { |
| "epoch": 51.209103840682786, |
| "eval_loss": 2.670893907546997, |
| "eval_runtime": 13.2644, |
| "eval_samples_per_second": 105.093, |
| "eval_steps_per_second": 105.093, |
| "step": 900 |
| }, |
| { |
| "epoch": 56.89900426742532, |
| "grad_norm": 0.30704179406166077, |
| "learning_rate": 5.991997332444148e-06, |
| "loss": 2.6733, |
| "step": 1000 |
| }, |
| { |
| "epoch": 56.89900426742532, |
| "eval_loss": 2.6703639030456543, |
| "eval_runtime": 13.274, |
| "eval_samples_per_second": 105.017, |
| "eval_steps_per_second": 105.017, |
| "step": 1000 |
| }, |
| { |
| "epoch": 62.58890469416785, |
| "grad_norm": 0.20322857797145844, |
| "learning_rate": 5.990996998999667e-06, |
| "loss": 2.6718, |
| "step": 1100 |
| }, |
| { |
| "epoch": 62.58890469416785, |
| "eval_loss": 2.6725869178771973, |
| "eval_runtime": 13.6525, |
| "eval_samples_per_second": 102.106, |
| "eval_steps_per_second": 102.106, |
| "step": 1100 |
| }, |
| { |
| "epoch": 68.27880512091038, |
| "grad_norm": 0.29705750942230225, |
| "learning_rate": 5.989996665555185e-06, |
| "loss": 2.6712, |
| "step": 1200 |
| }, |
| { |
| "epoch": 68.27880512091038, |
| "eval_loss": 2.6705477237701416, |
| "eval_runtime": 13.7099, |
| "eval_samples_per_second": 101.678, |
| "eval_steps_per_second": 101.678, |
| "step": 1200 |
| }, |
| { |
| "epoch": 73.96870554765292, |
| "grad_norm": 0.2920830249786377, |
| "learning_rate": 5.988996332110703e-06, |
| "loss": 2.671, |
| "step": 1300 |
| }, |
| { |
| "epoch": 73.96870554765292, |
| "eval_loss": 2.6691176891326904, |
| "eval_runtime": 13.276, |
| "eval_samples_per_second": 105.002, |
| "eval_steps_per_second": 105.002, |
| "step": 1300 |
| }, |
| { |
| "epoch": 79.65860597439544, |
| "grad_norm": 0.38358381390571594, |
| "learning_rate": 5.987995998666222e-06, |
| "loss": 2.6703, |
| "step": 1400 |
| }, |
| { |
| "epoch": 79.65860597439544, |
| "eval_loss": 2.6659200191497803, |
| "eval_runtime": 13.2877, |
| "eval_samples_per_second": 104.909, |
| "eval_steps_per_second": 104.909, |
| "step": 1400 |
| }, |
| { |
| "epoch": 85.34850640113798, |
| "grad_norm": 0.23219753801822662, |
| "learning_rate": 5.986995665221741e-06, |
| "loss": 2.6704, |
| "step": 1500 |
| }, |
| { |
| "epoch": 85.34850640113798, |
| "eval_loss": 2.6674177646636963, |
| "eval_runtime": 13.666, |
| "eval_samples_per_second": 102.005, |
| "eval_steps_per_second": 102.005, |
| "step": 1500 |
| }, |
| { |
| "epoch": 91.03840682788051, |
| "grad_norm": 0.23956173658370972, |
| "learning_rate": 5.9859953317772595e-06, |
| "loss": 2.6704, |
| "step": 1600 |
| }, |
| { |
| "epoch": 91.03840682788051, |
| "eval_loss": 2.667738199234009, |
| "eval_runtime": 13.2689, |
| "eval_samples_per_second": 105.058, |
| "eval_steps_per_second": 105.058, |
| "step": 1600 |
| }, |
| { |
| "epoch": 96.72830725462305, |
| "grad_norm": 0.22576624155044556, |
| "learning_rate": 5.984994998332777e-06, |
| "loss": 2.6696, |
| "step": 1700 |
| }, |
| { |
| "epoch": 96.72830725462305, |
| "eval_loss": 2.666335344314575, |
| "eval_runtime": 13.2674, |
| "eval_samples_per_second": 105.069, |
| "eval_steps_per_second": 105.069, |
| "step": 1700 |
| }, |
| { |
| "epoch": 102.41820768136557, |
| "grad_norm": 0.2869977653026581, |
| "learning_rate": 5.983994664888296e-06, |
| "loss": 2.6692, |
| "step": 1800 |
| }, |
| { |
| "epoch": 102.41820768136557, |
| "eval_loss": 2.667121171951294, |
| "eval_runtime": 13.2883, |
| "eval_samples_per_second": 104.905, |
| "eval_steps_per_second": 104.905, |
| "step": 1800 |
| }, |
| { |
| "epoch": 108.10810810810811, |
| "grad_norm": 0.24629302322864532, |
| "learning_rate": 5.982994331443815e-06, |
| "loss": 2.6685, |
| "step": 1900 |
| }, |
| { |
| "epoch": 108.10810810810811, |
| "eval_loss": 2.6667563915252686, |
| "eval_runtime": 13.6825, |
| "eval_samples_per_second": 101.882, |
| "eval_steps_per_second": 101.882, |
| "step": 1900 |
| }, |
| { |
| "epoch": 113.79800853485064, |
| "grad_norm": 0.23221346735954285, |
| "learning_rate": 5.981993997999333e-06, |
| "loss": 2.6683, |
| "step": 2000 |
| }, |
| { |
| "epoch": 113.79800853485064, |
| "eval_loss": 2.6664071083068848, |
| "eval_runtime": 13.2695, |
| "eval_samples_per_second": 105.053, |
| "eval_steps_per_second": 105.053, |
| "step": 2000 |
| }, |
| { |
| "epoch": 119.48790896159318, |
| "grad_norm": 0.24480201303958893, |
| "learning_rate": 5.980993664554851e-06, |
| "loss": 2.668, |
| "step": 2100 |
| }, |
| { |
| "epoch": 119.48790896159318, |
| "eval_loss": 2.6675596237182617, |
| "eval_runtime": 13.6602, |
| "eval_samples_per_second": 102.048, |
| "eval_steps_per_second": 102.048, |
| "step": 2100 |
| }, |
| { |
| "epoch": 125.1778093883357, |
| "grad_norm": 0.2695687413215637, |
| "learning_rate": 5.979993331110371e-06, |
| "loss": 2.6683, |
| "step": 2200 |
| }, |
| { |
| "epoch": 125.1778093883357, |
| "eval_loss": 2.6677987575531006, |
| "eval_runtime": 13.2773, |
| "eval_samples_per_second": 104.991, |
| "eval_steps_per_second": 104.991, |
| "step": 2200 |
| }, |
| { |
| "epoch": 130.86770981507823, |
| "grad_norm": 0.2357303947210312, |
| "learning_rate": 5.978992997665889e-06, |
| "loss": 2.6678, |
| "step": 2300 |
| }, |
| { |
| "epoch": 130.86770981507823, |
| "eval_loss": 2.6650021076202393, |
| "eval_runtime": 13.256, |
| "eval_samples_per_second": 105.16, |
| "eval_steps_per_second": 105.16, |
| "step": 2300 |
| }, |
| { |
| "epoch": 136.55761024182075, |
| "grad_norm": 0.23957480490207672, |
| "learning_rate": 5.9779926642214075e-06, |
| "loss": 2.6679, |
| "step": 2400 |
| }, |
| { |
| "epoch": 136.55761024182075, |
| "eval_loss": 2.6645851135253906, |
| "eval_runtime": 13.7249, |
| "eval_samples_per_second": 101.567, |
| "eval_steps_per_second": 101.567, |
| "step": 2400 |
| }, |
| { |
| "epoch": 146.86059743954482, |
| "grad_norm": 0.19333045184612274, |
| "learning_rate": 5.976992330776926e-06, |
| "loss": 2.6671, |
| "step": 2500 |
| }, |
| { |
| "epoch": 146.86059743954482, |
| "eval_loss": 2.666929244995117, |
| "eval_runtime": 8.8729, |
| "eval_samples_per_second": 157.107, |
| "eval_steps_per_second": 19.723, |
| "step": 2500 |
| }, |
| { |
| "epoch": 152.55049786628734, |
| "grad_norm": 0.3093737065792084, |
| "learning_rate": 5.975991997332444e-06, |
| "loss": 2.6674, |
| "step": 2600 |
| }, |
| { |
| "epoch": 152.55049786628734, |
| "eval_loss": 2.6641287803649902, |
| "eval_runtime": 8.544, |
| "eval_samples_per_second": 163.155, |
| "eval_steps_per_second": 20.482, |
| "step": 2600 |
| }, |
| { |
| "epoch": 158.24039829302987, |
| "grad_norm": 0.2492215484380722, |
| "learning_rate": 5.974991663887963e-06, |
| "loss": 2.6675, |
| "step": 2700 |
| }, |
| { |
| "epoch": 158.24039829302987, |
| "eval_loss": 2.6672415733337402, |
| "eval_runtime": 8.5402, |
| "eval_samples_per_second": 163.229, |
| "eval_steps_per_second": 20.491, |
| "step": 2700 |
| }, |
| { |
| "epoch": 163.9302987197724, |
| "grad_norm": 0.3064326047897339, |
| "learning_rate": 5.973991330443481e-06, |
| "loss": 2.6674, |
| "step": 2800 |
| }, |
| { |
| "epoch": 163.9302987197724, |
| "eval_loss": 2.667715072631836, |
| "eval_runtime": 8.861, |
| "eval_samples_per_second": 157.319, |
| "eval_steps_per_second": 19.75, |
| "step": 2800 |
| }, |
| { |
| "epoch": 169.62019914651495, |
| "grad_norm": 0.2401367574930191, |
| "learning_rate": 5.972990996999e-06, |
| "loss": 2.6668, |
| "step": 2900 |
| }, |
| { |
| "epoch": 169.62019914651495, |
| "eval_loss": 2.663231611251831, |
| "eval_runtime": 8.5263, |
| "eval_samples_per_second": 163.495, |
| "eval_steps_per_second": 20.525, |
| "step": 2900 |
| }, |
| { |
| "epoch": 175.31009957325747, |
| "grad_norm": 0.26518478989601135, |
| "learning_rate": 5.971990663554519e-06, |
| "loss": 2.6664, |
| "step": 3000 |
| }, |
| { |
| "epoch": 175.31009957325747, |
| "eval_loss": 2.66806960105896, |
| "eval_runtime": 8.5256, |
| "eval_samples_per_second": 163.508, |
| "eval_steps_per_second": 20.526, |
| "step": 3000 |
| }, |
| { |
| "epoch": 181.0, |
| "grad_norm": 0.21279650926589966, |
| "learning_rate": 5.970990330110037e-06, |
| "loss": 2.6662, |
| "step": 3100 |
| }, |
| { |
| "epoch": 181.0, |
| "eval_loss": 2.66540789604187, |
| "eval_runtime": 8.9087, |
| "eval_samples_per_second": 156.477, |
| "eval_steps_per_second": 19.644, |
| "step": 3100 |
| }, |
| { |
| "epoch": 186.68990042674253, |
| "grad_norm": 0.20601896941661835, |
| "learning_rate": 5.9699899966655554e-06, |
| "loss": 2.6662, |
| "step": 3200 |
| }, |
| { |
| "epoch": 186.68990042674253, |
| "eval_loss": 2.661759614944458, |
| "eval_runtime": 8.5361, |
| "eval_samples_per_second": 163.306, |
| "eval_steps_per_second": 20.501, |
| "step": 3200 |
| }, |
| { |
| "epoch": 192.37980085348505, |
| "grad_norm": 0.30063194036483765, |
| "learning_rate": 5.968989663221074e-06, |
| "loss": 2.666, |
| "step": 3300 |
| }, |
| { |
| "epoch": 192.37980085348505, |
| "eval_loss": 2.6638128757476807, |
| "eval_runtime": 8.5253, |
| "eval_samples_per_second": 163.514, |
| "eval_steps_per_second": 20.527, |
| "step": 3300 |
| }, |
| { |
| "epoch": 198.0697012802276, |
| "grad_norm": 0.17756374180316925, |
| "learning_rate": 5.967989329776592e-06, |
| "loss": 2.6652, |
| "step": 3400 |
| }, |
| { |
| "epoch": 198.0697012802276, |
| "eval_loss": 2.6624886989593506, |
| "eval_runtime": 8.9634, |
| "eval_samples_per_second": 155.521, |
| "eval_steps_per_second": 19.524, |
| "step": 3400 |
| }, |
| { |
| "epoch": 203.75960170697013, |
| "grad_norm": 0.3183553218841553, |
| "learning_rate": 5.966988996332111e-06, |
| "loss": 2.6656, |
| "step": 3500 |
| }, |
| { |
| "epoch": 203.75960170697013, |
| "eval_loss": 2.666609764099121, |
| "eval_runtime": 8.5304, |
| "eval_samples_per_second": 163.416, |
| "eval_steps_per_second": 20.515, |
| "step": 3500 |
| }, |
| { |
| "epoch": 209.44950213371266, |
| "grad_norm": 0.23746278882026672, |
| "learning_rate": 5.965988662887629e-06, |
| "loss": 2.6656, |
| "step": 3600 |
| }, |
| { |
| "epoch": 209.44950213371266, |
| "eval_loss": 2.664607048034668, |
| "eval_runtime": 8.5297, |
| "eval_samples_per_second": 163.429, |
| "eval_steps_per_second": 20.516, |
| "step": 3600 |
| }, |
| { |
| "epoch": 215.13940256045518, |
| "grad_norm": 0.2566852271556854, |
| "learning_rate": 5.964988329443148e-06, |
| "loss": 2.6652, |
| "step": 3700 |
| }, |
| { |
| "epoch": 215.13940256045518, |
| "eval_loss": 2.663752794265747, |
| "eval_runtime": 8.5306, |
| "eval_samples_per_second": 163.412, |
| "eval_steps_per_second": 20.514, |
| "step": 3700 |
| }, |
| { |
| "epoch": 220.82930298719774, |
| "grad_norm": 0.19710654020309448, |
| "learning_rate": 5.963987995998667e-06, |
| "loss": 2.6657, |
| "step": 3800 |
| }, |
| { |
| "epoch": 220.82930298719774, |
| "eval_loss": 2.66432785987854, |
| "eval_runtime": 8.9192, |
| "eval_samples_per_second": 156.293, |
| "eval_steps_per_second": 19.621, |
| "step": 3800 |
| }, |
| { |
| "epoch": 226.51920341394026, |
| "grad_norm": 0.20113052427768707, |
| "learning_rate": 5.962987662554185e-06, |
| "loss": 2.6655, |
| "step": 3900 |
| }, |
| { |
| "epoch": 226.51920341394026, |
| "eval_loss": 2.662318706512451, |
| "eval_runtime": 8.5269, |
| "eval_samples_per_second": 163.483, |
| "eval_steps_per_second": 20.523, |
| "step": 3900 |
| }, |
| { |
| "epoch": 232.2091038406828, |
| "grad_norm": 0.24698683619499207, |
| "learning_rate": 5.961987329109703e-06, |
| "loss": 2.6652, |
| "step": 4000 |
| }, |
| { |
| "epoch": 232.2091038406828, |
| "eval_loss": 2.6657159328460693, |
| "eval_runtime": 8.5292, |
| "eval_samples_per_second": 163.438, |
| "eval_steps_per_second": 20.518, |
| "step": 4000 |
| }, |
| { |
| "epoch": 237.8990042674253, |
| "grad_norm": 0.24947816133499146, |
| "learning_rate": 5.960986995665222e-06, |
| "loss": 2.6652, |
| "step": 4100 |
| }, |
| { |
| "epoch": 237.8990042674253, |
| "eval_loss": 2.6663217544555664, |
| "eval_runtime": 8.5354, |
| "eval_samples_per_second": 163.32, |
| "eval_steps_per_second": 20.503, |
| "step": 4100 |
| }, |
| { |
| "epoch": 243.58890469416787, |
| "grad_norm": 0.2810859680175781, |
| "learning_rate": 5.95998666222074e-06, |
| "loss": 2.6649, |
| "step": 4200 |
| }, |
| { |
| "epoch": 243.58890469416787, |
| "eval_loss": 2.6661739349365234, |
| "eval_runtime": 8.8623, |
| "eval_samples_per_second": 157.295, |
| "eval_steps_per_second": 19.747, |
| "step": 4200 |
| }, |
| { |
| "epoch": 252.74679943100995, |
| "grad_norm": 0.18688435852527618, |
| "learning_rate": 5.588628762541806e-06, |
| "loss": 2.6646, |
| "step": 4300 |
| }, |
| { |
| "epoch": 252.74679943100995, |
| "eval_loss": 2.664121389389038, |
| "eval_runtime": 9.162, |
| "eval_samples_per_second": 152.149, |
| "eval_steps_per_second": 19.101, |
| "step": 4300 |
| }, |
| { |
| "epoch": 258.4366998577525, |
| "grad_norm": 0.19968199729919434, |
| "learning_rate": 5.578595317725753e-06, |
| "loss": 2.6649, |
| "step": 4400 |
| }, |
| { |
| "epoch": 258.4366998577525, |
| "eval_loss": 2.666635274887085, |
| "eval_runtime": 11.7784, |
| "eval_samples_per_second": 118.352, |
| "eval_steps_per_second": 14.858, |
| "step": 4400 |
| }, |
| { |
| "epoch": 264.126600284495, |
| "grad_norm": 0.18012067675590515, |
| "learning_rate": 5.568561872909699e-06, |
| "loss": 2.6653, |
| "step": 4500 |
| }, |
| { |
| "epoch": 264.126600284495, |
| "eval_loss": 2.662325143814087, |
| "eval_runtime": 11.7766, |
| "eval_samples_per_second": 118.371, |
| "eval_steps_per_second": 14.86, |
| "step": 4500 |
| }, |
| { |
| "epoch": 269.81650071123755, |
| "grad_norm": 0.18739238381385803, |
| "learning_rate": 5.558528428093646e-06, |
| "loss": 2.6652, |
| "step": 4600 |
| }, |
| { |
| "epoch": 269.81650071123755, |
| "eval_loss": 2.664424419403076, |
| "eval_runtime": 9.739, |
| "eval_samples_per_second": 143.137, |
| "eval_steps_per_second": 17.969, |
| "step": 4600 |
| }, |
| { |
| "epoch": 275.5064011379801, |
| "grad_norm": 0.2488318383693695, |
| "learning_rate": 5.548494983277593e-06, |
| "loss": 2.6648, |
| "step": 4700 |
| }, |
| { |
| "epoch": 275.5064011379801, |
| "eval_loss": 2.6640284061431885, |
| "eval_runtime": 8.9011, |
| "eval_samples_per_second": 156.609, |
| "eval_steps_per_second": 19.66, |
| "step": 4700 |
| }, |
| { |
| "epoch": 281.1963015647226, |
| "grad_norm": 0.22808881103992462, |
| "learning_rate": 5.5384615384615385e-06, |
| "loss": 2.6651, |
| "step": 4800 |
| }, |
| { |
| "epoch": 281.1963015647226, |
| "eval_loss": 2.6617281436920166, |
| "eval_runtime": 8.5632, |
| "eval_samples_per_second": 162.79, |
| "eval_steps_per_second": 20.436, |
| "step": 4800 |
| }, |
| { |
| "epoch": 286.88620199146516, |
| "grad_norm": 0.1917983591556549, |
| "learning_rate": 5.528428093645485e-06, |
| "loss": 2.6647, |
| "step": 4900 |
| }, |
| { |
| "epoch": 286.88620199146516, |
| "eval_loss": 2.6639668941497803, |
| "eval_runtime": 8.5741, |
| "eval_samples_per_second": 162.583, |
| "eval_steps_per_second": 20.41, |
| "step": 4900 |
| }, |
| { |
| "epoch": 292.57610241820765, |
| "grad_norm": 0.247116819024086, |
| "learning_rate": 5.518394648829432e-06, |
| "loss": 2.6648, |
| "step": 5000 |
| }, |
| { |
| "epoch": 292.57610241820765, |
| "eval_loss": 2.660776376724243, |
| "eval_runtime": 8.9014, |
| "eval_samples_per_second": 156.605, |
| "eval_steps_per_second": 19.66, |
| "step": 5000 |
| }, |
| { |
| "epoch": 298.2660028449502, |
| "grad_norm": 0.18090835213661194, |
| "learning_rate": 5.508361204013378e-06, |
| "loss": 2.6643, |
| "step": 5100 |
| }, |
| { |
| "epoch": 298.2660028449502, |
| "eval_loss": 2.6607048511505127, |
| "eval_runtime": 8.5599, |
| "eval_samples_per_second": 162.853, |
| "eval_steps_per_second": 20.444, |
| "step": 5100 |
| }, |
| { |
| "epoch": 303.95590327169276, |
| "grad_norm": 0.1796797215938568, |
| "learning_rate": 5.498327759197324e-06, |
| "loss": 2.6645, |
| "step": 5200 |
| }, |
| { |
| "epoch": 303.95590327169276, |
| "eval_loss": 2.6626744270324707, |
| "eval_runtime": 8.9139, |
| "eval_samples_per_second": 156.385, |
| "eval_steps_per_second": 19.632, |
| "step": 5200 |
| }, |
| { |
| "epoch": 309.64580369843526, |
| "grad_norm": 0.19111952185630798, |
| "learning_rate": 5.488294314381271e-06, |
| "loss": 2.6647, |
| "step": 5300 |
| }, |
| { |
| "epoch": 309.64580369843526, |
| "eval_loss": 2.6617257595062256, |
| "eval_runtime": 8.5801, |
| "eval_samples_per_second": 162.47, |
| "eval_steps_per_second": 20.396, |
| "step": 5300 |
| }, |
| { |
| "epoch": 315.3357041251778, |
| "grad_norm": 0.17278283834457397, |
| "learning_rate": 5.478260869565217e-06, |
| "loss": 2.6645, |
| "step": 5400 |
| }, |
| { |
| "epoch": 315.3357041251778, |
| "eval_loss": 2.6651811599731445, |
| "eval_runtime": 8.8919, |
| "eval_samples_per_second": 156.771, |
| "eval_steps_per_second": 19.681, |
| "step": 5400 |
| }, |
| { |
| "epoch": 321.02560455192037, |
| "grad_norm": 0.24506501853466034, |
| "learning_rate": 5.468227424749163e-06, |
| "loss": 2.6644, |
| "step": 5500 |
| }, |
| { |
| "epoch": 321.02560455192037, |
| "eval_loss": 2.6612024307250977, |
| "eval_runtime": 8.5651, |
| "eval_samples_per_second": 162.754, |
| "eval_steps_per_second": 20.432, |
| "step": 5500 |
| }, |
| { |
| "epoch": 326.71550497866286, |
| "grad_norm": 0.17717023193836212, |
| "learning_rate": 5.45819397993311e-06, |
| "loss": 2.6644, |
| "step": 5600 |
| }, |
| { |
| "epoch": 326.71550497866286, |
| "eval_loss": 2.661200523376465, |
| "eval_runtime": 8.5626, |
| "eval_samples_per_second": 162.801, |
| "eval_steps_per_second": 20.438, |
| "step": 5600 |
| }, |
| { |
| "epoch": 332.4054054054054, |
| "grad_norm": 0.12661577761173248, |
| "learning_rate": 5.448160535117057e-06, |
| "loss": 2.6641, |
| "step": 5700 |
| }, |
| { |
| "epoch": 332.4054054054054, |
| "eval_loss": 2.6609702110290527, |
| "eval_runtime": 8.883, |
| "eval_samples_per_second": 156.929, |
| "eval_steps_per_second": 19.701, |
| "step": 5700 |
| }, |
| { |
| "epoch": 338.0953058321479, |
| "grad_norm": 0.199785977602005, |
| "learning_rate": 5.438127090301003e-06, |
| "loss": 2.6643, |
| "step": 5800 |
| }, |
| { |
| "epoch": 338.0953058321479, |
| "eval_loss": 2.660168409347534, |
| "eval_runtime": 8.5745, |
| "eval_samples_per_second": 162.574, |
| "eval_steps_per_second": 20.409, |
| "step": 5800 |
| }, |
| { |
| "epoch": 343.78520625889047, |
| "grad_norm": 0.2726210057735443, |
| "learning_rate": 5.4280936454849495e-06, |
| "loss": 2.6646, |
| "step": 5900 |
| }, |
| { |
| "epoch": 343.78520625889047, |
| "eval_loss": 2.664670944213867, |
| "eval_runtime": 8.5589, |
| "eval_samples_per_second": 162.871, |
| "eval_steps_per_second": 20.447, |
| "step": 5900 |
| }, |
| { |
| "epoch": 349.475106685633, |
| "grad_norm": 0.3512348234653473, |
| "learning_rate": 5.418060200668896e-06, |
| "loss": 2.664, |
| "step": 6000 |
| }, |
| { |
| "epoch": 349.475106685633, |
| "eval_loss": 2.665173292160034, |
| "eval_runtime": 8.5571, |
| "eval_samples_per_second": 162.905, |
| "eval_steps_per_second": 20.451, |
| "step": 6000 |
| }, |
| { |
| "epoch": 355.1650071123755, |
| "grad_norm": 0.20835170149803162, |
| "learning_rate": 5.408026755852843e-06, |
| "loss": 2.6641, |
| "step": 6100 |
| }, |
| { |
| "epoch": 355.1650071123755, |
| "eval_loss": 2.662048101425171, |
| "eval_runtime": 8.9087, |
| "eval_samples_per_second": 156.476, |
| "eval_steps_per_second": 19.644, |
| "step": 6100 |
| }, |
| { |
| "epoch": 360.8549075391181, |
| "grad_norm": 0.11575555801391602, |
| "learning_rate": 5.397993311036789e-06, |
| "loss": 2.6645, |
| "step": 6200 |
| }, |
| { |
| "epoch": 360.8549075391181, |
| "eval_loss": 2.6617300510406494, |
| "eval_runtime": 8.5709, |
| "eval_samples_per_second": 162.643, |
| "eval_steps_per_second": 20.418, |
| "step": 6200 |
| }, |
| { |
| "epoch": 366.54480796586057, |
| "grad_norm": 0.18948699533939362, |
| "learning_rate": 5.387959866220736e-06, |
| "loss": 2.6639, |
| "step": 6300 |
| }, |
| { |
| "epoch": 366.54480796586057, |
| "eval_loss": 2.6628897190093994, |
| "eval_runtime": 8.9034, |
| "eval_samples_per_second": 156.569, |
| "eval_steps_per_second": 19.655, |
| "step": 6300 |
| }, |
| { |
| "epoch": 372.2347083926031, |
| "grad_norm": 0.12320856750011444, |
| "learning_rate": 5.3779264214046825e-06, |
| "loss": 2.6647, |
| "step": 6400 |
| }, |
| { |
| "epoch": 372.2347083926031, |
| "eval_loss": 2.663992166519165, |
| "eval_runtime": 8.5612, |
| "eval_samples_per_second": 162.828, |
| "eval_steps_per_second": 20.441, |
| "step": 6400 |
| }, |
| { |
| "epoch": 377.9246088193457, |
| "grad_norm": 0.26067054271698, |
| "learning_rate": 5.367892976588628e-06, |
| "loss": 2.6643, |
| "step": 6500 |
| }, |
| { |
| "epoch": 377.9246088193457, |
| "eval_loss": 2.6624581813812256, |
| "eval_runtime": 8.572, |
| "eval_samples_per_second": 162.623, |
| "eval_steps_per_second": 20.415, |
| "step": 6500 |
| }, |
| { |
| "epoch": 383.6145092460882, |
| "grad_norm": 0.18116046488285065, |
| "learning_rate": 5.357859531772575e-06, |
| "loss": 2.664, |
| "step": 6600 |
| }, |
| { |
| "epoch": 383.6145092460882, |
| "eval_loss": 2.662827491760254, |
| "eval_runtime": 8.8871, |
| "eval_samples_per_second": 156.857, |
| "eval_steps_per_second": 19.692, |
| "step": 6600 |
| }, |
| { |
| "epoch": 389.30440967283073, |
| "grad_norm": 0.21489782631397247, |
| "learning_rate": 5.347826086956522e-06, |
| "loss": 2.6635, |
| "step": 6700 |
| }, |
| { |
| "epoch": 389.30440967283073, |
| "eval_loss": 2.6603777408599854, |
| "eval_runtime": 8.5641, |
| "eval_samples_per_second": 162.772, |
| "eval_steps_per_second": 20.434, |
| "step": 6700 |
| }, |
| { |
| "epoch": 394.9943100995733, |
| "grad_norm": 0.1781698316335678, |
| "learning_rate": 5.337792642140468e-06, |
| "loss": 2.6645, |
| "step": 6800 |
| }, |
| { |
| "epoch": 394.9943100995733, |
| "eval_loss": 2.661527156829834, |
| "eval_runtime": 8.5625, |
| "eval_samples_per_second": 162.802, |
| "eval_steps_per_second": 20.438, |
| "step": 6800 |
| }, |
| { |
| "epoch": 400.6842105263158, |
| "grad_norm": 0.18622642755508423, |
| "learning_rate": 5.327759197324415e-06, |
| "loss": 2.6647, |
| "step": 6900 |
| }, |
| { |
| "epoch": 400.6842105263158, |
| "eval_loss": 2.661090135574341, |
| "eval_runtime": 8.5509, |
| "eval_samples_per_second": 163.023, |
| "eval_steps_per_second": 20.466, |
| "step": 6900 |
| }, |
| { |
| "epoch": 406.37411095305833, |
| "grad_norm": 0.15774820744991302, |
| "learning_rate": 5.317725752508361e-06, |
| "loss": 2.6636, |
| "step": 7000 |
| }, |
| { |
| "epoch": 406.37411095305833, |
| "eval_loss": 2.66558575630188, |
| "eval_runtime": 8.898, |
| "eval_samples_per_second": 156.665, |
| "eval_steps_per_second": 19.667, |
| "step": 7000 |
| }, |
| { |
| "epoch": 412.06401137980083, |
| "grad_norm": 0.18330508470535278, |
| "learning_rate": 5.307692307692307e-06, |
| "loss": 2.6645, |
| "step": 7100 |
| }, |
| { |
| "epoch": 412.06401137980083, |
| "eval_loss": 2.6627676486968994, |
| "eval_runtime": 8.5744, |
| "eval_samples_per_second": 162.576, |
| "eval_steps_per_second": 20.409, |
| "step": 7100 |
| }, |
| { |
| "epoch": 417.7539118065434, |
| "grad_norm": 0.23223190009593964, |
| "learning_rate": 5.297658862876254e-06, |
| "loss": 2.6636, |
| "step": 7200 |
| }, |
| { |
| "epoch": 417.7539118065434, |
| "eval_loss": 2.661203145980835, |
| "eval_runtime": 8.5675, |
| "eval_samples_per_second": 162.708, |
| "eval_steps_per_second": 20.426, |
| "step": 7200 |
| }, |
| { |
| "epoch": 423.44381223328594, |
| "grad_norm": 0.15261903405189514, |
| "learning_rate": 5.287625418060201e-06, |
| "loss": 2.6641, |
| "step": 7300 |
| }, |
| { |
| "epoch": 423.44381223328594, |
| "eval_loss": 2.6625847816467285, |
| "eval_runtime": 8.9059, |
| "eval_samples_per_second": 156.525, |
| "eval_steps_per_second": 19.65, |
| "step": 7300 |
| }, |
| { |
| "epoch": 429.13371266002844, |
| "grad_norm": 0.1654181033372879, |
| "learning_rate": 5.277591973244147e-06, |
| "loss": 2.6641, |
| "step": 7400 |
| }, |
| { |
| "epoch": 429.13371266002844, |
| "eval_loss": 2.6628565788269043, |
| "eval_runtime": 8.555, |
| "eval_samples_per_second": 162.946, |
| "eval_steps_per_second": 20.456, |
| "step": 7400 |
| }, |
| { |
| "epoch": 434.823613086771, |
| "grad_norm": 0.2062557488679886, |
| "learning_rate": 5.2675585284280935e-06, |
| "loss": 2.6634, |
| "step": 7500 |
| }, |
| { |
| "epoch": 434.823613086771, |
| "eval_loss": 2.6651501655578613, |
| "eval_runtime": 8.9081, |
| "eval_samples_per_second": 156.487, |
| "eval_steps_per_second": 19.645, |
| "step": 7500 |
| }, |
| { |
| "epoch": 440.5135135135135, |
| "grad_norm": 0.21824122965335846, |
| "learning_rate": 5.25752508361204e-06, |
| "loss": 2.6637, |
| "step": 7600 |
| }, |
| { |
| "epoch": 440.5135135135135, |
| "eval_loss": 2.6624720096588135, |
| "eval_runtime": 8.5608, |
| "eval_samples_per_second": 162.836, |
| "eval_steps_per_second": 20.442, |
| "step": 7600 |
| }, |
| { |
| "epoch": 446.20341394025604, |
| "grad_norm": 0.2458944469690323, |
| "learning_rate": 5.247491638795986e-06, |
| "loss": 2.6637, |
| "step": 7700 |
| }, |
| { |
| "epoch": 446.20341394025604, |
| "eval_loss": 2.661086082458496, |
| "eval_runtime": 8.8963, |
| "eval_samples_per_second": 156.694, |
| "eval_steps_per_second": 19.671, |
| "step": 7700 |
| }, |
| { |
| "epoch": 451.8933143669986, |
| "grad_norm": 0.1574467271566391, |
| "learning_rate": 5.237458193979933e-06, |
| "loss": 2.6639, |
| "step": 7800 |
| }, |
| { |
| "epoch": 451.8933143669986, |
| "eval_loss": 2.6646134853363037, |
| "eval_runtime": 8.5514, |
| "eval_samples_per_second": 163.014, |
| "eval_steps_per_second": 20.464, |
| "step": 7800 |
| }, |
| { |
| "epoch": 457.5832147937411, |
| "grad_norm": 0.1982835829257965, |
| "learning_rate": 5.22742474916388e-06, |
| "loss": 2.664, |
| "step": 7900 |
| }, |
| { |
| "epoch": 457.5832147937411, |
| "eval_loss": 2.6606264114379883, |
| "eval_runtime": 8.5483, |
| "eval_samples_per_second": 163.073, |
| "eval_steps_per_second": 20.472, |
| "step": 7900 |
| }, |
| { |
| "epoch": 463.27311522048365, |
| "grad_norm": 0.19593903422355652, |
| "learning_rate": 5.2173913043478265e-06, |
| "loss": 2.6632, |
| "step": 8000 |
| }, |
| { |
| "epoch": 463.27311522048365, |
| "eval_loss": 2.664707899093628, |
| "eval_runtime": 8.5595, |
| "eval_samples_per_second": 162.861, |
| "eval_steps_per_second": 20.445, |
| "step": 8000 |
| }, |
| { |
| "epoch": 468.9630156472262, |
| "grad_norm": 0.22343507409095764, |
| "learning_rate": 5.207357859531772e-06, |
| "loss": 2.6634, |
| "step": 8100 |
| }, |
| { |
| "epoch": 468.9630156472262, |
| "eval_loss": 2.6642260551452637, |
| "eval_runtime": 8.9066, |
| "eval_samples_per_second": 156.513, |
| "eval_steps_per_second": 19.648, |
| "step": 8100 |
| }, |
| { |
| "epoch": 474.6529160739687, |
| "grad_norm": 0.16728109121322632, |
| "learning_rate": 5.197324414715719e-06, |
| "loss": 2.6633, |
| "step": 8200 |
| }, |
| { |
| "epoch": 474.6529160739687, |
| "eval_loss": 2.6625194549560547, |
| "eval_runtime": 8.5593, |
| "eval_samples_per_second": 162.863, |
| "eval_steps_per_second": 20.445, |
| "step": 8200 |
| }, |
| { |
| "epoch": 480.34281650071125, |
| "grad_norm": 0.23255111277103424, |
| "learning_rate": 5.187290969899666e-06, |
| "loss": 2.6634, |
| "step": 8300 |
| }, |
| { |
| "epoch": 480.34281650071125, |
| "eval_loss": 2.6606099605560303, |
| "eval_runtime": 8.559, |
| "eval_samples_per_second": 162.87, |
| "eval_steps_per_second": 20.446, |
| "step": 8300 |
| }, |
| { |
| "epoch": 486.03271692745375, |
| "grad_norm": 0.118553027510643, |
| "learning_rate": 5.177257525083612e-06, |
| "loss": 2.6632, |
| "step": 8400 |
| }, |
| { |
| "epoch": 486.03271692745375, |
| "eval_loss": 2.663628101348877, |
| "eval_runtime": 8.912, |
| "eval_samples_per_second": 156.418, |
| "eval_steps_per_second": 19.636, |
| "step": 8400 |
| }, |
| { |
| "epoch": 491.7226173541963, |
| "grad_norm": 0.23464259505271912, |
| "learning_rate": 5.167224080267559e-06, |
| "loss": 2.6636, |
| "step": 8500 |
| }, |
| { |
| "epoch": 491.7226173541963, |
| "eval_loss": 2.6618993282318115, |
| "eval_runtime": 8.5674, |
| "eval_samples_per_second": 162.711, |
| "eval_steps_per_second": 20.426, |
| "step": 8500 |
| }, |
| { |
| "epoch": 497.41251778093886, |
| "grad_norm": 0.14757351577281952, |
| "learning_rate": 5.157190635451505e-06, |
| "loss": 2.6634, |
| "step": 8600 |
| }, |
| { |
| "epoch": 497.41251778093886, |
| "eval_loss": 2.6627461910247803, |
| "eval_runtime": 8.553, |
| "eval_samples_per_second": 162.984, |
| "eval_steps_per_second": 20.461, |
| "step": 8600 |
| }, |
| { |
| "epoch": 503.10241820768135, |
| "grad_norm": 0.16491751372814178, |
| "learning_rate": 5.147157190635451e-06, |
| "loss": 2.6634, |
| "step": 8700 |
| }, |
| { |
| "epoch": 503.10241820768135, |
| "eval_loss": 2.6589674949645996, |
| "eval_runtime": 8.8861, |
| "eval_samples_per_second": 156.874, |
| "eval_steps_per_second": 19.694, |
| "step": 8700 |
| }, |
| { |
| "epoch": 508.7923186344239, |
| "grad_norm": 0.17845740914344788, |
| "learning_rate": 5.137123745819398e-06, |
| "loss": 2.6634, |
| "step": 8800 |
| }, |
| { |
| "epoch": 508.7923186344239, |
| "eval_loss": 2.6661596298217773, |
| "eval_runtime": 8.5694, |
| "eval_samples_per_second": 162.672, |
| "eval_steps_per_second": 20.422, |
| "step": 8800 |
| }, |
| { |
| "epoch": 514.4822190611665, |
| "grad_norm": 0.11282111704349518, |
| "learning_rate": 5.127090301003345e-06, |
| "loss": 2.6635, |
| "step": 8900 |
| }, |
| { |
| "epoch": 514.4822190611665, |
| "eval_loss": 2.6612164974212646, |
| "eval_runtime": 8.9016, |
| "eval_samples_per_second": 156.6, |
| "eval_steps_per_second": 19.659, |
| "step": 8900 |
| }, |
| { |
| "epoch": 520.172119487909, |
| "grad_norm": 0.11933238804340363, |
| "learning_rate": 5.117056856187291e-06, |
| "loss": 2.6629, |
| "step": 9000 |
| }, |
| { |
| "epoch": 520.172119487909, |
| "eval_loss": 2.663548707962036, |
| "eval_runtime": 8.5756, |
| "eval_samples_per_second": 162.553, |
| "eval_steps_per_second": 20.407, |
| "step": 9000 |
| }, |
| { |
| "epoch": 525.8620199146515, |
| "grad_norm": 0.16832073032855988, |
| "learning_rate": 5.1070234113712375e-06, |
| "loss": 2.6632, |
| "step": 9100 |
| }, |
| { |
| "epoch": 525.8620199146515, |
| "eval_loss": 2.665459156036377, |
| "eval_runtime": 8.8932, |
| "eval_samples_per_second": 156.75, |
| "eval_steps_per_second": 19.678, |
| "step": 9100 |
| }, |
| { |
| "epoch": 531.5519203413941, |
| "grad_norm": 0.1491301953792572, |
| "learning_rate": 5.096989966555184e-06, |
| "loss": 2.6633, |
| "step": 9200 |
| }, |
| { |
| "epoch": 531.5519203413941, |
| "eval_loss": 2.6649389266967773, |
| "eval_runtime": 8.5727, |
| "eval_samples_per_second": 162.609, |
| "eval_steps_per_second": 20.414, |
| "step": 9200 |
| }, |
| { |
| "epoch": 537.2418207681366, |
| "grad_norm": 0.20299378037452698, |
| "learning_rate": 5.08695652173913e-06, |
| "loss": 2.663, |
| "step": 9300 |
| }, |
| { |
| "epoch": 537.2418207681366, |
| "eval_loss": 2.662057638168335, |
| "eval_runtime": 8.5522, |
| "eval_samples_per_second": 162.999, |
| "eval_steps_per_second": 20.463, |
| "step": 9300 |
| }, |
| { |
| "epoch": 542.9317211948791, |
| "grad_norm": 0.1609990894794464, |
| "learning_rate": 5.076923076923077e-06, |
| "loss": 2.6631, |
| "step": 9400 |
| }, |
| { |
| "epoch": 542.9317211948791, |
| "eval_loss": 2.6604907512664795, |
| "eval_runtime": 8.8909, |
| "eval_samples_per_second": 156.79, |
| "eval_steps_per_second": 19.683, |
| "step": 9400 |
| }, |
| { |
| "epoch": 548.6216216216217, |
| "grad_norm": 0.18364398181438446, |
| "learning_rate": 5.066889632107024e-06, |
| "loss": 2.663, |
| "step": 9500 |
| }, |
| { |
| "epoch": 548.6216216216217, |
| "eval_loss": 2.660076856613159, |
| "eval_runtime": 8.5556, |
| "eval_samples_per_second": 162.934, |
| "eval_steps_per_second": 20.454, |
| "step": 9500 |
| }, |
| { |
| "epoch": 554.3115220483642, |
| "grad_norm": 0.15186648070812225, |
| "learning_rate": 5.05685618729097e-06, |
| "loss": 2.6631, |
| "step": 9600 |
| }, |
| { |
| "epoch": 554.3115220483642, |
| "eval_loss": 2.6639227867126465, |
| "eval_runtime": 8.551, |
| "eval_samples_per_second": 163.021, |
| "eval_steps_per_second": 20.465, |
| "step": 9600 |
| }, |
| { |
| "epoch": 560.0014224751067, |
| "grad_norm": 0.14984333515167236, |
| "learning_rate": 5.046822742474916e-06, |
| "loss": 2.6632, |
| "step": 9700 |
| }, |
| { |
| "epoch": 560.0014224751067, |
| "eval_loss": 2.6611454486846924, |
| "eval_runtime": 8.9892, |
| "eval_samples_per_second": 155.075, |
| "eval_steps_per_second": 19.468, |
| "step": 9700 |
| }, |
| { |
| "epoch": 565.6913229018492, |
| "grad_norm": 0.1124359741806984, |
| "learning_rate": 5.036789297658863e-06, |
| "loss": 2.663, |
| "step": 9800 |
| }, |
| { |
| "epoch": 565.6913229018492, |
| "eval_loss": 2.661329746246338, |
| "eval_runtime": 8.5548, |
| "eval_samples_per_second": 162.95, |
| "eval_steps_per_second": 20.456, |
| "step": 9800 |
| }, |
| { |
| "epoch": 571.3812233285918, |
| "grad_norm": 0.230003222823143, |
| "learning_rate": 5.02675585284281e-06, |
| "loss": 2.6631, |
| "step": 9900 |
| }, |
| { |
| "epoch": 571.3812233285918, |
| "eval_loss": 2.6644487380981445, |
| "eval_runtime": 8.563, |
| "eval_samples_per_second": 162.793, |
| "eval_steps_per_second": 20.437, |
| "step": 9900 |
| }, |
| { |
| "epoch": 577.0711237553343, |
| "grad_norm": 0.172781303524971, |
| "learning_rate": 5.016722408026756e-06, |
| "loss": 2.6626, |
| "step": 10000 |
| }, |
| { |
| "epoch": 577.0711237553343, |
| "eval_loss": 2.662069082260132, |
| "eval_runtime": 8.892, |
| "eval_samples_per_second": 156.769, |
| "eval_steps_per_second": 19.681, |
| "step": 10000 |
| }, |
| { |
| "epoch": 582.7610241820768, |
| "grad_norm": 0.15369383990764618, |
| "learning_rate": 5.0066889632107026e-06, |
| "loss": 2.663, |
| "step": 10100 |
| }, |
| { |
| "epoch": 582.7610241820768, |
| "eval_loss": 2.6648526191711426, |
| "eval_runtime": 8.5714, |
| "eval_samples_per_second": 162.633, |
| "eval_steps_per_second": 20.417, |
| "step": 10100 |
| }, |
| { |
| "epoch": 588.4509246088194, |
| "grad_norm": 0.1935221403837204, |
| "learning_rate": 4.996655518394649e-06, |
| "loss": 2.6632, |
| "step": 10200 |
| }, |
| { |
| "epoch": 588.4509246088194, |
| "eval_loss": 2.6587936878204346, |
| "eval_runtime": 8.5618, |
| "eval_samples_per_second": 162.816, |
| "eval_steps_per_second": 20.44, |
| "step": 10200 |
| }, |
| { |
| "epoch": 594.1408250355619, |
| "grad_norm": 0.14302797615528107, |
| "learning_rate": 4.986622073578595e-06, |
| "loss": 2.6626, |
| "step": 10300 |
| }, |
| { |
| "epoch": 594.1408250355619, |
| "eval_loss": 2.662747383117676, |
| "eval_runtime": 8.8896, |
| "eval_samples_per_second": 156.812, |
| "eval_steps_per_second": 19.686, |
| "step": 10300 |
| }, |
| { |
| "epoch": 599.8307254623044, |
| "grad_norm": 0.18007439374923706, |
| "learning_rate": 4.976588628762542e-06, |
| "loss": 2.6631, |
| "step": 10400 |
| }, |
| { |
| "epoch": 599.8307254623044, |
| "eval_loss": 2.6642062664031982, |
| "eval_runtime": 8.5619, |
| "eval_samples_per_second": 162.814, |
| "eval_steps_per_second": 20.439, |
| "step": 10400 |
| }, |
| { |
| "epoch": 605.520625889047, |
| "grad_norm": 0.2200157195329666, |
| "learning_rate": 4.966555183946489e-06, |
| "loss": 2.6625, |
| "step": 10500 |
| }, |
| { |
| "epoch": 605.520625889047, |
| "eval_loss": 2.6608235836029053, |
| "eval_runtime": 8.5591, |
| "eval_samples_per_second": 162.868, |
| "eval_steps_per_second": 20.446, |
| "step": 10500 |
| }, |
| { |
| "epoch": 611.2105263157895, |
| "grad_norm": 0.1693902462720871, |
| "learning_rate": 4.956521739130435e-06, |
| "loss": 2.6629, |
| "step": 10600 |
| }, |
| { |
| "epoch": 611.2105263157895, |
| "eval_loss": 2.6646671295166016, |
| "eval_runtime": 8.5548, |
| "eval_samples_per_second": 162.95, |
| "eval_steps_per_second": 20.456, |
| "step": 10600 |
| }, |
| { |
| "epoch": 616.900426742532, |
| "grad_norm": 0.17042887210845947, |
| "learning_rate": 4.9464882943143815e-06, |
| "loss": 2.663, |
| "step": 10700 |
| }, |
| { |
| "epoch": 616.900426742532, |
| "eval_loss": 2.6628501415252686, |
| "eval_runtime": 8.9184, |
| "eval_samples_per_second": 156.306, |
| "eval_steps_per_second": 19.622, |
| "step": 10700 |
| }, |
| { |
| "epoch": 622.5903271692746, |
| "grad_norm": 0.15105395019054413, |
| "learning_rate": 4.936454849498328e-06, |
| "loss": 2.6622, |
| "step": 10800 |
| }, |
| { |
| "epoch": 622.5903271692746, |
| "eval_loss": 2.663177251815796, |
| "eval_runtime": 8.5551, |
| "eval_samples_per_second": 162.943, |
| "eval_steps_per_second": 20.456, |
| "step": 10800 |
| }, |
| { |
| "epoch": 628.2802275960171, |
| "grad_norm": 0.16232497990131378, |
| "learning_rate": 4.926421404682274e-06, |
| "loss": 2.662, |
| "step": 10900 |
| }, |
| { |
| "epoch": 628.2802275960171, |
| "eval_loss": 2.662868022918701, |
| "eval_runtime": 8.5562, |
| "eval_samples_per_second": 162.923, |
| "eval_steps_per_second": 20.453, |
| "step": 10900 |
| }, |
| { |
| "epoch": 633.9701280227596, |
| "grad_norm": 0.19268840551376343, |
| "learning_rate": 4.916387959866221e-06, |
| "loss": 2.6616, |
| "step": 11000 |
| }, |
| { |
| "epoch": 633.9701280227596, |
| "eval_loss": 2.664335250854492, |
| "eval_runtime": 8.5889, |
| "eval_samples_per_second": 162.302, |
| "eval_steps_per_second": 20.375, |
| "step": 11000 |
| }, |
| { |
| "epoch": 639.6600284495021, |
| "grad_norm": 0.11089065670967102, |
| "learning_rate": 4.906354515050168e-06, |
| "loss": 2.6604, |
| "step": 11100 |
| }, |
| { |
| "epoch": 639.6600284495021, |
| "eval_loss": 2.656398057937622, |
| "eval_runtime": 8.9031, |
| "eval_samples_per_second": 156.575, |
| "eval_steps_per_second": 19.656, |
| "step": 11100 |
| }, |
| { |
| "epoch": 645.3499288762447, |
| "grad_norm": 0.1336248517036438, |
| "learning_rate": 4.8963210702341136e-06, |
| "loss": 2.6599, |
| "step": 11200 |
| }, |
| { |
| "epoch": 645.3499288762447, |
| "eval_loss": 2.6563735008239746, |
| "eval_runtime": 8.5539, |
| "eval_samples_per_second": 162.966, |
| "eval_steps_per_second": 20.458, |
| "step": 11200 |
| }, |
| { |
| "epoch": 651.0398293029872, |
| "grad_norm": 0.12397616356611252, |
| "learning_rate": 4.88628762541806e-06, |
| "loss": 2.6581, |
| "step": 11300 |
| }, |
| { |
| "epoch": 651.0398293029872, |
| "eval_loss": 2.65476131439209, |
| "eval_runtime": 8.5563, |
| "eval_samples_per_second": 162.92, |
| "eval_steps_per_second": 20.453, |
| "step": 11300 |
| }, |
| { |
| "epoch": 656.7297297297297, |
| "grad_norm": 0.2090333253145218, |
| "learning_rate": 4.876254180602007e-06, |
| "loss": 2.6553, |
| "step": 11400 |
| }, |
| { |
| "epoch": 656.7297297297297, |
| "eval_loss": 2.6521565914154053, |
| "eval_runtime": 8.892, |
| "eval_samples_per_second": 156.77, |
| "eval_steps_per_second": 19.681, |
| "step": 11400 |
| }, |
| { |
| "epoch": 662.4196301564723, |
| "grad_norm": 0.22825314104557037, |
| "learning_rate": 4.866220735785953e-06, |
| "loss": 2.654, |
| "step": 11500 |
| }, |
| { |
| "epoch": 662.4196301564723, |
| "eval_loss": 2.649017095565796, |
| "eval_runtime": 8.5668, |
| "eval_samples_per_second": 162.721, |
| "eval_steps_per_second": 20.428, |
| "step": 11500 |
| }, |
| { |
| "epoch": 668.1095305832148, |
| "grad_norm": 0.19265511631965637, |
| "learning_rate": 4.8561872909699e-06, |
| "loss": 2.6522, |
| "step": 11600 |
| }, |
| { |
| "epoch": 668.1095305832148, |
| "eval_loss": 2.650679588317871, |
| "eval_runtime": 8.5489, |
| "eval_samples_per_second": 163.061, |
| "eval_steps_per_second": 20.47, |
| "step": 11600 |
| }, |
| { |
| "epoch": 673.7994310099573, |
| "grad_norm": 0.1772225797176361, |
| "learning_rate": 4.8461538461538465e-06, |
| "loss": 2.6506, |
| "step": 11700 |
| }, |
| { |
| "epoch": 673.7994310099573, |
| "eval_loss": 2.644835948944092, |
| "eval_runtime": 8.8953, |
| "eval_samples_per_second": 156.711, |
| "eval_steps_per_second": 19.673, |
| "step": 11700 |
| }, |
| { |
| "epoch": 679.4893314366999, |
| "grad_norm": 0.21952596306800842, |
| "learning_rate": 4.8361204013377925e-06, |
| "loss": 2.6495, |
| "step": 11800 |
| }, |
| { |
| "epoch": 679.4893314366999, |
| "eval_loss": 2.6477487087249756, |
| "eval_runtime": 8.5582, |
| "eval_samples_per_second": 162.885, |
| "eval_steps_per_second": 20.448, |
| "step": 11800 |
| }, |
| { |
| "epoch": 685.1792318634424, |
| "grad_norm": 0.15563735365867615, |
| "learning_rate": 4.826086956521739e-06, |
| "loss": 2.6488, |
| "step": 11900 |
| }, |
| { |
| "epoch": 685.1792318634424, |
| "eval_loss": 2.6446661949157715, |
| "eval_runtime": 8.5731, |
| "eval_samples_per_second": 162.602, |
| "eval_steps_per_second": 20.413, |
| "step": 11900 |
| }, |
| { |
| "epoch": 690.8691322901849, |
| "grad_norm": 0.19501689076423645, |
| "learning_rate": 4.816053511705686e-06, |
| "loss": 2.6477, |
| "step": 12000 |
| }, |
| { |
| "epoch": 690.8691322901849, |
| "eval_loss": 2.644357442855835, |
| "eval_runtime": 8.9024, |
| "eval_samples_per_second": 156.587, |
| "eval_steps_per_second": 19.658, |
| "step": 12000 |
| }, |
| { |
| "epoch": 696.5590327169275, |
| "grad_norm": 0.18384377658367157, |
| "learning_rate": 4.806020066889633e-06, |
| "loss": 2.6469, |
| "step": 12100 |
| }, |
| { |
| "epoch": 696.5590327169275, |
| "eval_loss": 2.6418209075927734, |
| "eval_runtime": 8.5626, |
| "eval_samples_per_second": 162.801, |
| "eval_steps_per_second": 20.438, |
| "step": 12100 |
| }, |
| { |
| "epoch": 702.24893314367, |
| "grad_norm": 0.1915460228919983, |
| "learning_rate": 4.795986622073579e-06, |
| "loss": 2.6454, |
| "step": 12200 |
| }, |
| { |
| "epoch": 702.24893314367, |
| "eval_loss": 2.641967535018921, |
| "eval_runtime": 8.5587, |
| "eval_samples_per_second": 162.875, |
| "eval_steps_per_second": 20.447, |
| "step": 12200 |
| }, |
| { |
| "epoch": 707.9388335704125, |
| "grad_norm": 0.18700934946537018, |
| "learning_rate": 4.785953177257525e-06, |
| "loss": 2.6448, |
| "step": 12300 |
| }, |
| { |
| "epoch": 707.9388335704125, |
| "eval_loss": 2.638808012008667, |
| "eval_runtime": 8.5628, |
| "eval_samples_per_second": 162.798, |
| "eval_steps_per_second": 20.437, |
| "step": 12300 |
| }, |
| { |
| "epoch": 713.628733997155, |
| "grad_norm": 0.17106923460960388, |
| "learning_rate": 4.775919732441472e-06, |
| "loss": 2.6446, |
| "step": 12400 |
| }, |
| { |
| "epoch": 713.628733997155, |
| "eval_loss": 2.6404778957366943, |
| "eval_runtime": 8.8985, |
| "eval_samples_per_second": 156.655, |
| "eval_steps_per_second": 19.666, |
| "step": 12400 |
| }, |
| { |
| "epoch": 719.3186344238976, |
| "grad_norm": 0.17941860854625702, |
| "learning_rate": 4.765886287625418e-06, |
| "loss": 2.6436, |
| "step": 12500 |
| }, |
| { |
| "epoch": 719.3186344238976, |
| "eval_loss": 2.6373584270477295, |
| "eval_runtime": 8.557, |
| "eval_samples_per_second": 162.907, |
| "eval_steps_per_second": 20.451, |
| "step": 12500 |
| }, |
| { |
| "epoch": 725.0085348506401, |
| "grad_norm": 0.17565137147903442, |
| "learning_rate": 4.755852842809365e-06, |
| "loss": 2.6434, |
| "step": 12600 |
| }, |
| { |
| "epoch": 725.0085348506401, |
| "eval_loss": 2.639042377471924, |
| "eval_runtime": 8.5557, |
| "eval_samples_per_second": 162.932, |
| "eval_steps_per_second": 20.454, |
| "step": 12600 |
| }, |
| { |
| "epoch": 730.6984352773826, |
| "grad_norm": 0.18980301916599274, |
| "learning_rate": 4.745819397993312e-06, |
| "loss": 2.6428, |
| "step": 12700 |
| }, |
| { |
| "epoch": 730.6984352773826, |
| "eval_loss": 2.6368398666381836, |
| "eval_runtime": 8.9007, |
| "eval_samples_per_second": 156.617, |
| "eval_steps_per_second": 19.661, |
| "step": 12700 |
| }, |
| { |
| "epoch": 736.3883357041252, |
| "grad_norm": 0.1572832465171814, |
| "learning_rate": 4.7357859531772575e-06, |
| "loss": 2.6423, |
| "step": 12800 |
| }, |
| { |
| "epoch": 736.3883357041252, |
| "eval_loss": 2.6357386112213135, |
| "eval_runtime": 8.5632, |
| "eval_samples_per_second": 162.79, |
| "eval_steps_per_second": 20.436, |
| "step": 12800 |
| }, |
| { |
| "epoch": 742.0782361308677, |
| "grad_norm": 0.17804701626300812, |
| "learning_rate": 4.725752508361204e-06, |
| "loss": 2.6415, |
| "step": 12900 |
| }, |
| { |
| "epoch": 742.0782361308677, |
| "eval_loss": 2.636728525161743, |
| "eval_runtime": 8.5558, |
| "eval_samples_per_second": 162.931, |
| "eval_steps_per_second": 20.454, |
| "step": 12900 |
| }, |
| { |
| "epoch": 747.7681365576102, |
| "grad_norm": 0.14196521043777466, |
| "learning_rate": 4.715719063545151e-06, |
| "loss": 2.6415, |
| "step": 13000 |
| }, |
| { |
| "epoch": 747.7681365576102, |
| "eval_loss": 2.6351287364959717, |
| "eval_runtime": 8.5495, |
| "eval_samples_per_second": 163.05, |
| "eval_steps_per_second": 20.469, |
| "step": 13000 |
| }, |
| { |
| "epoch": 753.4580369843528, |
| "grad_norm": 0.16282819211483002, |
| "learning_rate": 4.705685618729097e-06, |
| "loss": 2.6409, |
| "step": 13100 |
| }, |
| { |
| "epoch": 753.4580369843528, |
| "eval_loss": 2.6369380950927734, |
| "eval_runtime": 8.8961, |
| "eval_samples_per_second": 156.698, |
| "eval_steps_per_second": 19.672, |
| "step": 13100 |
| }, |
| { |
| "epoch": 759.1479374110953, |
| "grad_norm": 0.1580921709537506, |
| "learning_rate": 4.695652173913044e-06, |
| "loss": 2.6404, |
| "step": 13200 |
| }, |
| { |
| "epoch": 759.1479374110953, |
| "eval_loss": 2.6370317935943604, |
| "eval_runtime": 8.5558, |
| "eval_samples_per_second": 162.93, |
| "eval_steps_per_second": 20.454, |
| "step": 13200 |
| }, |
| { |
| "epoch": 764.8378378378378, |
| "grad_norm": 0.23563043773174286, |
| "learning_rate": 4.6856187290969905e-06, |
| "loss": 2.6394, |
| "step": 13300 |
| }, |
| { |
| "epoch": 764.8378378378378, |
| "eval_loss": 2.6321442127227783, |
| "eval_runtime": 8.5496, |
| "eval_samples_per_second": 163.048, |
| "eval_steps_per_second": 20.469, |
| "step": 13300 |
| }, |
| { |
| "epoch": 770.5277382645804, |
| "grad_norm": 0.16354724764823914, |
| "learning_rate": 4.675585284280936e-06, |
| "loss": 2.639, |
| "step": 13400 |
| }, |
| { |
| "epoch": 770.5277382645804, |
| "eval_loss": 2.6348910331726074, |
| "eval_runtime": 8.9264, |
| "eval_samples_per_second": 156.166, |
| "eval_steps_per_second": 19.605, |
| "step": 13400 |
| }, |
| { |
| "epoch": 776.2176386913229, |
| "grad_norm": 0.1707228273153305, |
| "learning_rate": 4.665551839464883e-06, |
| "loss": 2.639, |
| "step": 13500 |
| }, |
| { |
| "epoch": 776.2176386913229, |
| "eval_loss": 2.635204792022705, |
| "eval_runtime": 8.5691, |
| "eval_samples_per_second": 162.678, |
| "eval_steps_per_second": 20.422, |
| "step": 13500 |
| }, |
| { |
| "epoch": 781.9075391180654, |
| "grad_norm": 0.16934677958488464, |
| "learning_rate": 4.65551839464883e-06, |
| "loss": 2.6385, |
| "step": 13600 |
| }, |
| { |
| "epoch": 781.9075391180654, |
| "eval_loss": 2.633455276489258, |
| "eval_runtime": 8.557, |
| "eval_samples_per_second": 162.907, |
| "eval_steps_per_second": 20.451, |
| "step": 13600 |
| }, |
| { |
| "epoch": 787.5974395448079, |
| "grad_norm": 0.1871781051158905, |
| "learning_rate": 4.645484949832776e-06, |
| "loss": 2.6379, |
| "step": 13700 |
| }, |
| { |
| "epoch": 787.5974395448079, |
| "eval_loss": 2.633129119873047, |
| "eval_runtime": 8.9094, |
| "eval_samples_per_second": 156.463, |
| "eval_steps_per_second": 19.642, |
| "step": 13700 |
| }, |
| { |
| "epoch": 793.2873399715505, |
| "grad_norm": 0.20615407824516296, |
| "learning_rate": 4.635451505016723e-06, |
| "loss": 2.6376, |
| "step": 13800 |
| }, |
| { |
| "epoch": 793.2873399715505, |
| "eval_loss": 2.634012222290039, |
| "eval_runtime": 8.551, |
| "eval_samples_per_second": 163.021, |
| "eval_steps_per_second": 20.465, |
| "step": 13800 |
| }, |
| { |
| "epoch": 798.977240398293, |
| "grad_norm": 0.21352247893810272, |
| "learning_rate": 4.625418060200669e-06, |
| "loss": 2.6374, |
| "step": 13900 |
| }, |
| { |
| "epoch": 798.977240398293, |
| "eval_loss": 2.6326115131378174, |
| "eval_runtime": 8.5835, |
| "eval_samples_per_second": 162.405, |
| "eval_steps_per_second": 20.388, |
| "step": 13900 |
| }, |
| { |
| "epoch": 804.6671408250355, |
| "grad_norm": 0.21041567623615265, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 2.6373, |
| "step": 14000 |
| }, |
| { |
| "epoch": 804.6671408250355, |
| "eval_loss": 2.632585287094116, |
| "eval_runtime": 8.9033, |
| "eval_samples_per_second": 156.571, |
| "eval_steps_per_second": 19.656, |
| "step": 14000 |
| }, |
| { |
| "epoch": 810.3570412517781, |
| "grad_norm": 0.16558390855789185, |
| "learning_rate": 4.605351170568562e-06, |
| "loss": 2.637, |
| "step": 14100 |
| }, |
| { |
| "epoch": 810.3570412517781, |
| "eval_loss": 2.6330647468566895, |
| "eval_runtime": 8.5657, |
| "eval_samples_per_second": 162.743, |
| "eval_steps_per_second": 20.43, |
| "step": 14100 |
| }, |
| { |
| "epoch": 816.0469416785206, |
| "grad_norm": 0.14121714234352112, |
| "learning_rate": 4.595317725752509e-06, |
| "loss": 2.6369, |
| "step": 14200 |
| }, |
| { |
| "epoch": 816.0469416785206, |
| "eval_loss": 2.633366823196411, |
| "eval_runtime": 8.5562, |
| "eval_samples_per_second": 162.924, |
| "eval_steps_per_second": 20.453, |
| "step": 14200 |
| }, |
| { |
| "epoch": 821.7368421052631, |
| "grad_norm": 0.18725652992725372, |
| "learning_rate": 4.585284280936456e-06, |
| "loss": 2.6366, |
| "step": 14300 |
| }, |
| { |
| "epoch": 821.7368421052631, |
| "eval_loss": 2.633021116256714, |
| "eval_runtime": 8.547, |
| "eval_samples_per_second": 163.099, |
| "eval_steps_per_second": 20.475, |
| "step": 14300 |
| }, |
| { |
| "epoch": 827.4267425320057, |
| "grad_norm": 0.17320464551448822, |
| "learning_rate": 4.5752508361204015e-06, |
| "loss": 2.6362, |
| "step": 14400 |
| }, |
| { |
| "epoch": 827.4267425320057, |
| "eval_loss": 2.6336045265197754, |
| "eval_runtime": 8.9019, |
| "eval_samples_per_second": 156.595, |
| "eval_steps_per_second": 19.659, |
| "step": 14400 |
| }, |
| { |
| "epoch": 833.1166429587482, |
| "grad_norm": 0.25663965940475464, |
| "learning_rate": 4.565217391304348e-06, |
| "loss": 2.6358, |
| "step": 14500 |
| }, |
| { |
| "epoch": 833.1166429587482, |
| "eval_loss": 2.629626750946045, |
| "eval_runtime": 8.555, |
| "eval_samples_per_second": 162.946, |
| "eval_steps_per_second": 20.456, |
| "step": 14500 |
| }, |
| { |
| "epoch": 838.8065433854907, |
| "grad_norm": 0.19742050766944885, |
| "learning_rate": 4.555183946488295e-06, |
| "loss": 2.6359, |
| "step": 14600 |
| }, |
| { |
| "epoch": 838.8065433854907, |
| "eval_loss": 2.6323554515838623, |
| "eval_runtime": 8.5537, |
| "eval_samples_per_second": 162.97, |
| "eval_steps_per_second": 20.459, |
| "step": 14600 |
| }, |
| { |
| "epoch": 844.4964438122333, |
| "grad_norm": 0.17131681740283966, |
| "learning_rate": 4.545150501672241e-06, |
| "loss": 2.636, |
| "step": 14700 |
| }, |
| { |
| "epoch": 844.4964438122333, |
| "eval_loss": 2.628143787384033, |
| "eval_runtime": 8.8882, |
| "eval_samples_per_second": 156.837, |
| "eval_steps_per_second": 19.689, |
| "step": 14700 |
| }, |
| { |
| "epoch": 850.1863442389758, |
| "grad_norm": 0.12929615378379822, |
| "learning_rate": 4.535117056856188e-06, |
| "loss": 2.6355, |
| "step": 14800 |
| }, |
| { |
| "epoch": 850.1863442389758, |
| "eval_loss": 2.62906813621521, |
| "eval_runtime": 8.5719, |
| "eval_samples_per_second": 162.624, |
| "eval_steps_per_second": 20.415, |
| "step": 14800 |
| }, |
| { |
| "epoch": 855.8762446657183, |
| "grad_norm": 0.1839623749256134, |
| "learning_rate": 4.5250836120401345e-06, |
| "loss": 2.6357, |
| "step": 14900 |
| }, |
| { |
| "epoch": 855.8762446657183, |
| "eval_loss": 2.6294586658477783, |
| "eval_runtime": 8.556, |
| "eval_samples_per_second": 162.926, |
| "eval_steps_per_second": 20.453, |
| "step": 14900 |
| }, |
| { |
| "epoch": 861.5661450924608, |
| "grad_norm": 0.16717371344566345, |
| "learning_rate": 4.51505016722408e-06, |
| "loss": 2.6344, |
| "step": 15000 |
| }, |
| { |
| "epoch": 861.5661450924608, |
| "eval_loss": 2.63043475151062, |
| "eval_runtime": 8.8856, |
| "eval_samples_per_second": 156.883, |
| "eval_steps_per_second": 19.695, |
| "step": 15000 |
| }, |
| { |
| "epoch": 867.2560455192034, |
| "grad_norm": 0.1823185533285141, |
| "learning_rate": 4.505016722408027e-06, |
| "loss": 2.6348, |
| "step": 15100 |
| }, |
| { |
| "epoch": 867.2560455192034, |
| "eval_loss": 2.6305038928985596, |
| "eval_runtime": 8.5615, |
| "eval_samples_per_second": 162.822, |
| "eval_steps_per_second": 20.44, |
| "step": 15100 |
| }, |
| { |
| "epoch": 872.9459459459459, |
| "grad_norm": 0.1823842078447342, |
| "learning_rate": 4.494983277591973e-06, |
| "loss": 2.6348, |
| "step": 15200 |
| }, |
| { |
| "epoch": 872.9459459459459, |
| "eval_loss": 2.6309924125671387, |
| "eval_runtime": 8.5581, |
| "eval_samples_per_second": 162.887, |
| "eval_steps_per_second": 20.449, |
| "step": 15200 |
| }, |
| { |
| "epoch": 878.6358463726884, |
| "grad_norm": 0.20153598487377167, |
| "learning_rate": 4.48494983277592e-06, |
| "loss": 2.6342, |
| "step": 15300 |
| }, |
| { |
| "epoch": 878.6358463726884, |
| "eval_loss": 2.6329071521759033, |
| "eval_runtime": 8.8917, |
| "eval_samples_per_second": 156.776, |
| "eval_steps_per_second": 19.681, |
| "step": 15300 |
| }, |
| { |
| "epoch": 884.325746799431, |
| "grad_norm": 0.18218009173870087, |
| "learning_rate": 4.474916387959866e-06, |
| "loss": 2.6344, |
| "step": 15400 |
| }, |
| { |
| "epoch": 884.325746799431, |
| "eval_loss": 2.6302568912506104, |
| "eval_runtime": 8.5652, |
| "eval_samples_per_second": 162.752, |
| "eval_steps_per_second": 20.432, |
| "step": 15400 |
| }, |
| { |
| "epoch": 890.0156472261735, |
| "grad_norm": 0.16739265620708466, |
| "learning_rate": 4.4648829431438125e-06, |
| "loss": 2.6343, |
| "step": 15500 |
| }, |
| { |
| "epoch": 890.0156472261735, |
| "eval_loss": 2.6310319900512695, |
| "eval_runtime": 8.5584, |
| "eval_samples_per_second": 162.88, |
| "eval_steps_per_second": 20.448, |
| "step": 15500 |
| }, |
| { |
| "epoch": 895.705547652916, |
| "grad_norm": 0.1390063315629959, |
| "learning_rate": 4.454849498327759e-06, |
| "loss": 2.6339, |
| "step": 15600 |
| }, |
| { |
| "epoch": 895.705547652916, |
| "eval_loss": 2.6301069259643555, |
| "eval_runtime": 8.9049, |
| "eval_samples_per_second": 156.544, |
| "eval_steps_per_second": 19.652, |
| "step": 15600 |
| }, |
| { |
| "epoch": 901.3954480796586, |
| "grad_norm": 0.18924345076084137, |
| "learning_rate": 4.444816053511705e-06, |
| "loss": 2.6339, |
| "step": 15700 |
| }, |
| { |
| "epoch": 901.3954480796586, |
| "eval_loss": 2.6323258876800537, |
| "eval_runtime": 8.5547, |
| "eval_samples_per_second": 162.952, |
| "eval_steps_per_second": 20.457, |
| "step": 15700 |
| }, |
| { |
| "epoch": 907.0853485064011, |
| "grad_norm": 0.18514582514762878, |
| "learning_rate": 4.434782608695652e-06, |
| "loss": 2.6338, |
| "step": 15800 |
| }, |
| { |
| "epoch": 907.0853485064011, |
| "eval_loss": 2.629317045211792, |
| "eval_runtime": 8.5557, |
| "eval_samples_per_second": 162.933, |
| "eval_steps_per_second": 20.454, |
| "step": 15800 |
| }, |
| { |
| "epoch": 912.7752489331436, |
| "grad_norm": 0.16134916245937347, |
| "learning_rate": 4.424749163879599e-06, |
| "loss": 2.6332, |
| "step": 15900 |
| }, |
| { |
| "epoch": 912.7752489331436, |
| "eval_loss": 2.6283786296844482, |
| "eval_runtime": 8.8916, |
| "eval_samples_per_second": 156.778, |
| "eval_steps_per_second": 19.682, |
| "step": 15900 |
| }, |
| { |
| "epoch": 918.4651493598863, |
| "grad_norm": 0.15325242280960083, |
| "learning_rate": 4.414715719063545e-06, |
| "loss": 2.6327, |
| "step": 16000 |
| }, |
| { |
| "epoch": 918.4651493598863, |
| "eval_loss": 2.628596305847168, |
| "eval_runtime": 8.5648, |
| "eval_samples_per_second": 162.759, |
| "eval_steps_per_second": 20.432, |
| "step": 16000 |
| }, |
| { |
| "epoch": 924.1550497866288, |
| "grad_norm": 0.16646109521389008, |
| "learning_rate": 4.404682274247491e-06, |
| "loss": 2.6334, |
| "step": 16100 |
| }, |
| { |
| "epoch": 924.1550497866288, |
| "eval_loss": 2.6277356147766113, |
| "eval_runtime": 8.5578, |
| "eval_samples_per_second": 162.891, |
| "eval_steps_per_second": 20.449, |
| "step": 16100 |
| }, |
| { |
| "epoch": 929.8449502133712, |
| "grad_norm": 0.190487802028656, |
| "learning_rate": 4.394648829431438e-06, |
| "loss": 2.6324, |
| "step": 16200 |
| }, |
| { |
| "epoch": 929.8449502133712, |
| "eval_loss": 2.632991075515747, |
| "eval_runtime": 8.8984, |
| "eval_samples_per_second": 156.657, |
| "eval_steps_per_second": 19.666, |
| "step": 16200 |
| }, |
| { |
| "epoch": 935.5348506401137, |
| "grad_norm": 0.12819956243038177, |
| "learning_rate": 4.384615384615384e-06, |
| "loss": 2.6329, |
| "step": 16300 |
| }, |
| { |
| "epoch": 935.5348506401137, |
| "eval_loss": 2.6287131309509277, |
| "eval_runtime": 8.5512, |
| "eval_samples_per_second": 163.019, |
| "eval_steps_per_second": 20.465, |
| "step": 16300 |
| }, |
| { |
| "epoch": 941.2247510668564, |
| "grad_norm": 0.1414095014333725, |
| "learning_rate": 4.374581939799331e-06, |
| "loss": 2.6329, |
| "step": 16400 |
| }, |
| { |
| "epoch": 941.2247510668564, |
| "eval_loss": 2.6280200481414795, |
| "eval_runtime": 8.5543, |
| "eval_samples_per_second": 162.96, |
| "eval_steps_per_second": 20.458, |
| "step": 16400 |
| }, |
| { |
| "epoch": 946.9146514935989, |
| "grad_norm": 0.1598784625530243, |
| "learning_rate": 4.364548494983278e-06, |
| "loss": 2.6321, |
| "step": 16500 |
| }, |
| { |
| "epoch": 946.9146514935989, |
| "eval_loss": 2.627798080444336, |
| "eval_runtime": 8.5617, |
| "eval_samples_per_second": 162.818, |
| "eval_steps_per_second": 20.44, |
| "step": 16500 |
| }, |
| { |
| "epoch": 952.6045519203414, |
| "grad_norm": 0.16925720870494843, |
| "learning_rate": 4.354515050167224e-06, |
| "loss": 2.6323, |
| "step": 16600 |
| }, |
| { |
| "epoch": 952.6045519203414, |
| "eval_loss": 2.627779722213745, |
| "eval_runtime": 8.8962, |
| "eval_samples_per_second": 156.697, |
| "eval_steps_per_second": 19.671, |
| "step": 16600 |
| }, |
| { |
| "epoch": 958.294452347084, |
| "grad_norm": 0.14368008077144623, |
| "learning_rate": 4.34448160535117e-06, |
| "loss": 2.6325, |
| "step": 16700 |
| }, |
| { |
| "epoch": 958.294452347084, |
| "eval_loss": 2.6313493251800537, |
| "eval_runtime": 8.5564, |
| "eval_samples_per_second": 162.919, |
| "eval_steps_per_second": 20.452, |
| "step": 16700 |
| }, |
| { |
| "epoch": 963.9843527738265, |
| "grad_norm": 0.17267128825187683, |
| "learning_rate": 4.334448160535117e-06, |
| "loss": 2.6323, |
| "step": 16800 |
| }, |
| { |
| "epoch": 963.9843527738265, |
| "eval_loss": 2.628115653991699, |
| "eval_runtime": 8.9, |
| "eval_samples_per_second": 156.628, |
| "eval_steps_per_second": 19.663, |
| "step": 16800 |
| }, |
| { |
| "epoch": 969.674253200569, |
| "grad_norm": 0.19119863212108612, |
| "learning_rate": 4.324414715719064e-06, |
| "loss": 2.6318, |
| "step": 16900 |
| }, |
| { |
| "epoch": 969.674253200569, |
| "eval_loss": 2.627437114715576, |
| "eval_runtime": 8.5548, |
| "eval_samples_per_second": 162.95, |
| "eval_steps_per_second": 20.456, |
| "step": 16900 |
| }, |
| { |
| "epoch": 975.3641536273116, |
| "grad_norm": 0.13695764541625977, |
| "learning_rate": 4.31438127090301e-06, |
| "loss": 2.6315, |
| "step": 17000 |
| }, |
| { |
| "epoch": 975.3641536273116, |
| "eval_loss": 2.6262221336364746, |
| "eval_runtime": 8.8997, |
| "eval_samples_per_second": 156.634, |
| "eval_steps_per_second": 19.664, |
| "step": 17000 |
| }, |
| { |
| "epoch": 981.0540540540541, |
| "grad_norm": 0.14241984486579895, |
| "learning_rate": 4.3043478260869565e-06, |
| "loss": 2.6318, |
| "step": 17100 |
| }, |
| { |
| "epoch": 981.0540540540541, |
| "eval_loss": 2.6269607543945312, |
| "eval_runtime": 8.5513, |
| "eval_samples_per_second": 163.017, |
| "eval_steps_per_second": 20.465, |
| "step": 17100 |
| }, |
| { |
| "epoch": 986.7439544807966, |
| "grad_norm": 0.15792237222194672, |
| "learning_rate": 4.294314381270903e-06, |
| "loss": 2.6315, |
| "step": 17200 |
| }, |
| { |
| "epoch": 986.7439544807966, |
| "eval_loss": 2.6278719902038574, |
| "eval_runtime": 8.6117, |
| "eval_samples_per_second": 161.873, |
| "eval_steps_per_second": 20.321, |
| "step": 17200 |
| }, |
| { |
| "epoch": 992.4338549075392, |
| "grad_norm": 0.17118434607982635, |
| "learning_rate": 4.284280936454849e-06, |
| "loss": 2.6316, |
| "step": 17300 |
| }, |
| { |
| "epoch": 992.4338549075392, |
| "eval_loss": 2.6280527114868164, |
| "eval_runtime": 8.9035, |
| "eval_samples_per_second": 156.568, |
| "eval_steps_per_second": 19.655, |
| "step": 17300 |
| }, |
| { |
| "epoch": 998.1237553342817, |
| "grad_norm": 0.15846611559391022, |
| "learning_rate": 4.274247491638796e-06, |
| "loss": 2.6313, |
| "step": 17400 |
| }, |
| { |
| "epoch": 998.1237553342817, |
| "eval_loss": 2.6250662803649902, |
| "eval_runtime": 8.6057, |
| "eval_samples_per_second": 161.986, |
| "eval_steps_per_second": 20.335, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1003.8136557610242, |
| "grad_norm": 0.17078837752342224, |
| "learning_rate": 4.264214046822743e-06, |
| "loss": 2.6312, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1003.8136557610242, |
| "eval_loss": 2.6274046897888184, |
| "eval_runtime": 8.564, |
| "eval_samples_per_second": 162.775, |
| "eval_steps_per_second": 20.434, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1009.5035561877667, |
| "grad_norm": 0.1965128779411316, |
| "learning_rate": 4.254180602006689e-06, |
| "loss": 2.6312, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1009.5035561877667, |
| "eval_loss": 2.6278066635131836, |
| "eval_runtime": 8.9009, |
| "eval_samples_per_second": 156.614, |
| "eval_steps_per_second": 19.661, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1015.1934566145093, |
| "grad_norm": 0.19483456015586853, |
| "learning_rate": 4.244147157190635e-06, |
| "loss": 2.6311, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1015.1934566145093, |
| "eval_loss": 2.623715400695801, |
| "eval_runtime": 8.5597, |
| "eval_samples_per_second": 162.857, |
| "eval_steps_per_second": 20.445, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1020.8833570412518, |
| "grad_norm": 0.14647985994815826, |
| "learning_rate": 4.234113712374582e-06, |
| "loss": 2.6309, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1020.8833570412518, |
| "eval_loss": 2.625011920928955, |
| "eval_runtime": 8.8958, |
| "eval_samples_per_second": 156.704, |
| "eval_steps_per_second": 19.672, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1026.5732574679944, |
| "grad_norm": 0.1495138704776764, |
| "learning_rate": 4.224080267558528e-06, |
| "loss": 2.6303, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1026.5732574679944, |
| "eval_loss": 2.6249139308929443, |
| "eval_runtime": 8.5668, |
| "eval_samples_per_second": 162.72, |
| "eval_steps_per_second": 20.428, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1032.2631578947369, |
| "grad_norm": 0.1665605753660202, |
| "learning_rate": 4.214046822742475e-06, |
| "loss": 2.6305, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1032.2631578947369, |
| "eval_loss": 2.6271395683288574, |
| "eval_runtime": 8.8963, |
| "eval_samples_per_second": 156.694, |
| "eval_steps_per_second": 19.671, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1037.9530583214794, |
| "grad_norm": 0.1886260211467743, |
| "learning_rate": 4.2040133779264216e-06, |
| "loss": 2.6307, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1037.9530583214794, |
| "eval_loss": 2.6232030391693115, |
| "eval_runtime": 8.5585, |
| "eval_samples_per_second": 162.879, |
| "eval_steps_per_second": 20.447, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1043.6429587482219, |
| "grad_norm": 0.1451101154088974, |
| "learning_rate": 4.1939799331103675e-06, |
| "loss": 2.6304, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1043.6429587482219, |
| "eval_loss": 2.624784469604492, |
| "eval_runtime": 8.5566, |
| "eval_samples_per_second": 162.916, |
| "eval_steps_per_second": 20.452, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1049.3328591749644, |
| "grad_norm": 0.13841372728347778, |
| "learning_rate": 4.183946488294314e-06, |
| "loss": 2.6305, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1049.3328591749644, |
| "eval_loss": 2.626993179321289, |
| "eval_runtime": 8.9131, |
| "eval_samples_per_second": 156.398, |
| "eval_steps_per_second": 19.634, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1055.0227596017069, |
| "grad_norm": 0.1455683559179306, |
| "learning_rate": 4.173913043478261e-06, |
| "loss": 2.6301, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1055.0227596017069, |
| "eval_loss": 2.6284282207489014, |
| "eval_runtime": 8.555, |
| "eval_samples_per_second": 162.946, |
| "eval_steps_per_second": 20.456, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1060.7126600284496, |
| "grad_norm": 0.14764897525310516, |
| "learning_rate": 4.163879598662208e-06, |
| "loss": 2.6305, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1060.7126600284496, |
| "eval_loss": 2.626128673553467, |
| "eval_runtime": 8.5552, |
| "eval_samples_per_second": 162.942, |
| "eval_steps_per_second": 20.455, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1066.402560455192, |
| "grad_norm": 0.15602290630340576, |
| "learning_rate": 4.153846153846154e-06, |
| "loss": 2.6303, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1066.402560455192, |
| "eval_loss": 2.6236324310302734, |
| "eval_runtime": 8.5623, |
| "eval_samples_per_second": 162.806, |
| "eval_steps_per_second": 20.438, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1099.8036984352773, |
| "grad_norm": 0.1434181034564972, |
| "learning_rate": 4.1438127090301005e-06, |
| "loss": 2.6299, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1099.8036984352773, |
| "eval_loss": 2.6240837574005127, |
| "eval_runtime": 8.8817, |
| "eval_samples_per_second": 156.952, |
| "eval_steps_per_second": 19.703, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1105.49359886202, |
| "grad_norm": 0.17055080831050873, |
| "learning_rate": 4.133779264214047e-06, |
| "loss": 2.6296, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1105.49359886202, |
| "eval_loss": 2.627481698989868, |
| "eval_runtime": 8.5387, |
| "eval_samples_per_second": 163.257, |
| "eval_steps_per_second": 20.495, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1111.1834992887625, |
| "grad_norm": 0.15118207037448883, |
| "learning_rate": 4.123745819397993e-06, |
| "loss": 2.6295, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1111.1834992887625, |
| "eval_loss": 2.6243932247161865, |
| "eval_runtime": 8.5529, |
| "eval_samples_per_second": 162.985, |
| "eval_steps_per_second": 20.461, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1116.873399715505, |
| "grad_norm": 0.14333444833755493, |
| "learning_rate": 4.11371237458194e-06, |
| "loss": 2.6294, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1116.873399715505, |
| "eval_loss": 2.6264147758483887, |
| "eval_runtime": 8.8805, |
| "eval_samples_per_second": 156.973, |
| "eval_steps_per_second": 19.706, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1122.5633001422475, |
| "grad_norm": 0.13676032423973083, |
| "learning_rate": 4.103678929765887e-06, |
| "loss": 2.6292, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1122.5633001422475, |
| "eval_loss": 2.6256096363067627, |
| "eval_runtime": 8.5396, |
| "eval_samples_per_second": 163.24, |
| "eval_steps_per_second": 20.493, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1128.25320056899, |
| "grad_norm": 0.13608410954475403, |
| "learning_rate": 4.0936454849498326e-06, |
| "loss": 2.6292, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1128.25320056899, |
| "eval_loss": 2.6272470951080322, |
| "eval_runtime": 8.5412, |
| "eval_samples_per_second": 163.209, |
| "eval_steps_per_second": 20.489, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1133.9431009957325, |
| "grad_norm": 0.16941364109516144, |
| "learning_rate": 4.083612040133779e-06, |
| "loss": 2.6294, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1133.9431009957325, |
| "eval_loss": 2.6245925426483154, |
| "eval_runtime": 8.8711, |
| "eval_samples_per_second": 157.14, |
| "eval_steps_per_second": 19.727, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1139.6330014224752, |
| "grad_norm": 0.17961208522319794, |
| "learning_rate": 4.073578595317726e-06, |
| "loss": 2.6291, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1139.6330014224752, |
| "eval_loss": 2.6260921955108643, |
| "eval_runtime": 8.5481, |
| "eval_samples_per_second": 163.078, |
| "eval_steps_per_second": 20.472, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1145.3229018492177, |
| "grad_norm": 0.15234056115150452, |
| "learning_rate": 4.063545150501672e-06, |
| "loss": 2.6288, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1145.3229018492177, |
| "eval_loss": 2.624178647994995, |
| "eval_runtime": 8.5458, |
| "eval_samples_per_second": 163.121, |
| "eval_steps_per_second": 20.478, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1151.0128022759602, |
| "grad_norm": 0.1660071462392807, |
| "learning_rate": 4.053511705685619e-06, |
| "loss": 2.6289, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1151.0128022759602, |
| "eval_loss": 2.625214099884033, |
| "eval_runtime": 8.8562, |
| "eval_samples_per_second": 157.404, |
| "eval_steps_per_second": 19.76, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1156.7027027027027, |
| "grad_norm": 0.1432279646396637, |
| "learning_rate": 4.0434782608695655e-06, |
| "loss": 2.6288, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1156.7027027027027, |
| "eval_loss": 2.6248371601104736, |
| "eval_runtime": 8.5388, |
| "eval_samples_per_second": 163.255, |
| "eval_steps_per_second": 20.495, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1162.3926031294452, |
| "grad_norm": 0.13359645009040833, |
| "learning_rate": 4.0334448160535115e-06, |
| "loss": 2.6291, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1162.3926031294452, |
| "eval_loss": 2.6228439807891846, |
| "eval_runtime": 8.5401, |
| "eval_samples_per_second": 163.23, |
| "eval_steps_per_second": 20.492, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1168.0825035561877, |
| "grad_norm": 0.18464621901512146, |
| "learning_rate": 4.023411371237458e-06, |
| "loss": 2.6286, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1168.0825035561877, |
| "eval_loss": 2.624844789505005, |
| "eval_runtime": 8.5435, |
| "eval_samples_per_second": 163.165, |
| "eval_steps_per_second": 20.483, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1173.7724039829302, |
| "grad_norm": 0.14693519473075867, |
| "learning_rate": 4.013377926421405e-06, |
| "loss": 2.6282, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1173.7724039829302, |
| "eval_loss": 2.625211238861084, |
| "eval_runtime": 8.876, |
| "eval_samples_per_second": 157.053, |
| "eval_steps_per_second": 19.716, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1179.462304409673, |
| "grad_norm": 0.14849957823753357, |
| "learning_rate": 4.003344481605351e-06, |
| "loss": 2.6281, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1179.462304409673, |
| "eval_loss": 2.6256697177886963, |
| "eval_runtime": 8.8789, |
| "eval_samples_per_second": 157.002, |
| "eval_steps_per_second": 19.71, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1185.1522048364154, |
| "grad_norm": 0.1465172916650772, |
| "learning_rate": 3.993311036789298e-06, |
| "loss": 2.6279, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1185.1522048364154, |
| "eval_loss": 2.6242611408233643, |
| "eval_runtime": 8.5384, |
| "eval_samples_per_second": 163.263, |
| "eval_steps_per_second": 20.496, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1190.842105263158, |
| "grad_norm": 0.15794384479522705, |
| "learning_rate": 3.9832775919732444e-06, |
| "loss": 2.6281, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1190.842105263158, |
| "eval_loss": 2.623426914215088, |
| "eval_runtime": 8.5314, |
| "eval_samples_per_second": 163.397, |
| "eval_steps_per_second": 20.512, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1196.5320056899004, |
| "grad_norm": 0.1284749060869217, |
| "learning_rate": 3.97324414715719e-06, |
| "loss": 2.6281, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1196.5320056899004, |
| "eval_loss": 2.622859239578247, |
| "eval_runtime": 8.863, |
| "eval_samples_per_second": 157.283, |
| "eval_steps_per_second": 19.745, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1202.221906116643, |
| "grad_norm": 0.1669575572013855, |
| "learning_rate": 3.963210702341137e-06, |
| "loss": 2.6281, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1202.221906116643, |
| "eval_loss": 2.6228220462799072, |
| "eval_runtime": 8.5362, |
| "eval_samples_per_second": 163.305, |
| "eval_steps_per_second": 20.501, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1207.9118065433854, |
| "grad_norm": 0.12002875655889511, |
| "learning_rate": 3.953177257525084e-06, |
| "loss": 2.6284, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1207.9118065433854, |
| "eval_loss": 2.6229088306427, |
| "eval_runtime": 8.5208, |
| "eval_samples_per_second": 163.599, |
| "eval_steps_per_second": 20.538, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1213.6017069701281, |
| "grad_norm": 0.14911407232284546, |
| "learning_rate": 3.943143812709031e-06, |
| "loss": 2.6278, |
| "step": 20700 |
| }, |
| { |
| "epoch": 1213.6017069701281, |
| "eval_loss": 2.6207728385925293, |
| "eval_runtime": 8.5412, |
| "eval_samples_per_second": 163.208, |
| "eval_steps_per_second": 20.489, |
| "step": 20700 |
| }, |
| { |
| "epoch": 1219.2916073968706, |
| "grad_norm": 0.1687910258769989, |
| "learning_rate": 3.9331103678929765e-06, |
| "loss": 2.6277, |
| "step": 20800 |
| }, |
| { |
| "epoch": 1219.2916073968706, |
| "eval_loss": 2.623382806777954, |
| "eval_runtime": 8.8763, |
| "eval_samples_per_second": 157.047, |
| "eval_steps_per_second": 19.715, |
| "step": 20800 |
| }, |
| { |
| "epoch": 1224.9815078236131, |
| "grad_norm": 0.1914646476507187, |
| "learning_rate": 3.923076923076923e-06, |
| "loss": 2.6271, |
| "step": 20900 |
| }, |
| { |
| "epoch": 1224.9815078236131, |
| "eval_loss": 2.6222121715545654, |
| "eval_runtime": 8.5242, |
| "eval_samples_per_second": 163.535, |
| "eval_steps_per_second": 20.53, |
| "step": 20900 |
| }, |
| { |
| "epoch": 1230.6714082503556, |
| "grad_norm": 0.15010875463485718, |
| "learning_rate": 3.91304347826087e-06, |
| "loss": 2.6276, |
| "step": 21000 |
| }, |
| { |
| "epoch": 1230.6714082503556, |
| "eval_loss": 2.6212801933288574, |
| "eval_runtime": 8.5399, |
| "eval_samples_per_second": 163.234, |
| "eval_steps_per_second": 20.492, |
| "step": 21000 |
| }, |
| { |
| "epoch": 1236.3613086770981, |
| "grad_norm": 0.1383567601442337, |
| "learning_rate": 3.903010033444816e-06, |
| "loss": 2.6275, |
| "step": 21100 |
| }, |
| { |
| "epoch": 1236.3613086770981, |
| "eval_loss": 2.6240437030792236, |
| "eval_runtime": 8.5381, |
| "eval_samples_per_second": 163.269, |
| "eval_steps_per_second": 20.496, |
| "step": 21100 |
| }, |
| { |
| "epoch": 1242.0512091038406, |
| "grad_norm": 0.15790875256061554, |
| "learning_rate": 3.892976588628763e-06, |
| "loss": 2.6267, |
| "step": 21200 |
| }, |
| { |
| "epoch": 1242.0512091038406, |
| "eval_loss": 2.623500108718872, |
| "eval_runtime": 8.8811, |
| "eval_samples_per_second": 156.963, |
| "eval_steps_per_second": 19.705, |
| "step": 21200 |
| }, |
| { |
| "epoch": 1247.7411095305831, |
| "grad_norm": 0.15240466594696045, |
| "learning_rate": 3.8829431438127095e-06, |
| "loss": 2.6269, |
| "step": 21300 |
| }, |
| { |
| "epoch": 1247.7411095305831, |
| "eval_loss": 2.6207492351531982, |
| "eval_runtime": 8.5211, |
| "eval_samples_per_second": 163.594, |
| "eval_steps_per_second": 20.537, |
| "step": 21300 |
| }, |
| { |
| "epoch": 1253.4310099573258, |
| "grad_norm": 0.1933618187904358, |
| "learning_rate": 3.8729096989966554e-06, |
| "loss": 2.627, |
| "step": 21400 |
| }, |
| { |
| "epoch": 1253.4310099573258, |
| "eval_loss": 2.62373948097229, |
| "eval_runtime": 8.5399, |
| "eval_samples_per_second": 163.235, |
| "eval_steps_per_second": 20.492, |
| "step": 21400 |
| }, |
| { |
| "epoch": 1259.1209103840683, |
| "grad_norm": 0.17298194766044617, |
| "learning_rate": 3.862876254180602e-06, |
| "loss": 2.6273, |
| "step": 21500 |
| }, |
| { |
| "epoch": 1259.1209103840683, |
| "eval_loss": 2.626997232437134, |
| "eval_runtime": 8.88, |
| "eval_samples_per_second": 156.981, |
| "eval_steps_per_second": 19.707, |
| "step": 21500 |
| }, |
| { |
| "epoch": 1264.8108108108108, |
| "grad_norm": 0.15336528420448303, |
| "learning_rate": 3.852842809364549e-06, |
| "loss": 2.6276, |
| "step": 21600 |
| }, |
| { |
| "epoch": 1264.8108108108108, |
| "eval_loss": 2.6227035522460938, |
| "eval_runtime": 8.5395, |
| "eval_samples_per_second": 163.241, |
| "eval_steps_per_second": 20.493, |
| "step": 21600 |
| }, |
| { |
| "epoch": 1270.5007112375533, |
| "grad_norm": 0.1456770896911621, |
| "learning_rate": 3.842809364548495e-06, |
| "loss": 2.6264, |
| "step": 21700 |
| }, |
| { |
| "epoch": 1270.5007112375533, |
| "eval_loss": 2.6244804859161377, |
| "eval_runtime": 8.5371, |
| "eval_samples_per_second": 163.287, |
| "eval_steps_per_second": 20.499, |
| "step": 21700 |
| }, |
| { |
| "epoch": 1276.1906116642958, |
| "grad_norm": 0.14131468534469604, |
| "learning_rate": 3.832775919732442e-06, |
| "loss": 2.6261, |
| "step": 21800 |
| }, |
| { |
| "epoch": 1276.1906116642958, |
| "eval_loss": 2.6235532760620117, |
| "eval_runtime": 8.8848, |
| "eval_samples_per_second": 156.898, |
| "eval_steps_per_second": 19.697, |
| "step": 21800 |
| }, |
| { |
| "epoch": 1281.8805120910383, |
| "grad_norm": 0.16801823675632477, |
| "learning_rate": 3.822742474916388e-06, |
| "loss": 2.6266, |
| "step": 21900 |
| }, |
| { |
| "epoch": 1281.8805120910383, |
| "eval_loss": 2.6234781742095947, |
| "eval_runtime": 8.5394, |
| "eval_samples_per_second": 163.243, |
| "eval_steps_per_second": 20.493, |
| "step": 21900 |
| }, |
| { |
| "epoch": 1287.570412517781, |
| "grad_norm": 0.13501711189746857, |
| "learning_rate": 3.8127090301003347e-06, |
| "loss": 2.6261, |
| "step": 22000 |
| }, |
| { |
| "epoch": 1287.570412517781, |
| "eval_loss": 2.621072292327881, |
| "eval_runtime": 8.5307, |
| "eval_samples_per_second": 163.41, |
| "eval_steps_per_second": 20.514, |
| "step": 22000 |
| }, |
| { |
| "epoch": 1293.2603129445235, |
| "grad_norm": 0.14802291989326477, |
| "learning_rate": 3.802675585284281e-06, |
| "loss": 2.6267, |
| "step": 22100 |
| }, |
| { |
| "epoch": 1293.2603129445235, |
| "eval_loss": 2.625509023666382, |
| "eval_runtime": 8.876, |
| "eval_samples_per_second": 157.052, |
| "eval_steps_per_second": 19.716, |
| "step": 22100 |
| }, |
| { |
| "epoch": 1298.950213371266, |
| "grad_norm": 0.149693563580513, |
| "learning_rate": 3.792642140468228e-06, |
| "loss": 2.6266, |
| "step": 22200 |
| }, |
| { |
| "epoch": 1298.950213371266, |
| "eval_loss": 2.6230156421661377, |
| "eval_runtime": 8.5523, |
| "eval_samples_per_second": 162.996, |
| "eval_steps_per_second": 20.462, |
| "step": 22200 |
| }, |
| { |
| "epoch": 1304.6401137980085, |
| "grad_norm": 0.16010881960391998, |
| "learning_rate": 3.782608695652174e-06, |
| "loss": 2.6263, |
| "step": 22300 |
| }, |
| { |
| "epoch": 1304.6401137980085, |
| "eval_loss": 2.623608112335205, |
| "eval_runtime": 8.5405, |
| "eval_samples_per_second": 163.222, |
| "eval_steps_per_second": 20.491, |
| "step": 22300 |
| }, |
| { |
| "epoch": 1310.330014224751, |
| "grad_norm": 0.1507118195295334, |
| "learning_rate": 3.7725752508361205e-06, |
| "loss": 2.6262, |
| "step": 22400 |
| }, |
| { |
| "epoch": 1310.330014224751, |
| "eval_loss": 2.6196281909942627, |
| "eval_runtime": 8.8776, |
| "eval_samples_per_second": 157.024, |
| "eval_steps_per_second": 19.713, |
| "step": 22400 |
| }, |
| { |
| "epoch": 1316.0199146514935, |
| "grad_norm": 0.12015032023191452, |
| "learning_rate": 3.7625418060200673e-06, |
| "loss": 2.6261, |
| "step": 22500 |
| }, |
| { |
| "epoch": 1316.0199146514935, |
| "eval_loss": 2.6222877502441406, |
| "eval_runtime": 8.5476, |
| "eval_samples_per_second": 163.086, |
| "eval_steps_per_second": 20.474, |
| "step": 22500 |
| }, |
| { |
| "epoch": 1321.709815078236, |
| "grad_norm": 0.14796671271324158, |
| "learning_rate": 3.7525083612040136e-06, |
| "loss": 2.6261, |
| "step": 22600 |
| }, |
| { |
| "epoch": 1321.709815078236, |
| "eval_loss": 2.623142957687378, |
| "eval_runtime": 8.8638, |
| "eval_samples_per_second": 157.268, |
| "eval_steps_per_second": 19.743, |
| "step": 22600 |
| }, |
| { |
| "epoch": 1327.3997155049788, |
| "grad_norm": 0.14206399023532867, |
| "learning_rate": 3.74247491638796e-06, |
| "loss": 2.6261, |
| "step": 22700 |
| }, |
| { |
| "epoch": 1327.3997155049788, |
| "eval_loss": 2.620297431945801, |
| "eval_runtime": 8.5358, |
| "eval_samples_per_second": 163.313, |
| "eval_steps_per_second": 20.502, |
| "step": 22700 |
| }, |
| { |
| "epoch": 1333.0896159317213, |
| "grad_norm": 0.1448485553264618, |
| "learning_rate": 3.7324414715719067e-06, |
| "loss": 2.6258, |
| "step": 22800 |
| }, |
| { |
| "epoch": 1333.0896159317213, |
| "eval_loss": 2.6241817474365234, |
| "eval_runtime": 8.5523, |
| "eval_samples_per_second": 162.998, |
| "eval_steps_per_second": 20.462, |
| "step": 22800 |
| }, |
| { |
| "epoch": 1338.7795163584638, |
| "grad_norm": 0.14887595176696777, |
| "learning_rate": 3.722408026755853e-06, |
| "loss": 2.6255, |
| "step": 22900 |
| }, |
| { |
| "epoch": 1338.7795163584638, |
| "eval_loss": 2.622042179107666, |
| "eval_runtime": 8.902, |
| "eval_samples_per_second": 156.594, |
| "eval_steps_per_second": 19.658, |
| "step": 22900 |
| }, |
| { |
| "epoch": 1344.4694167852062, |
| "grad_norm": 0.16686739027500153, |
| "learning_rate": 3.7123745819398e-06, |
| "loss": 2.6258, |
| "step": 23000 |
| }, |
| { |
| "epoch": 1344.4694167852062, |
| "eval_loss": 2.6229121685028076, |
| "eval_runtime": 8.5332, |
| "eval_samples_per_second": 163.362, |
| "eval_steps_per_second": 20.508, |
| "step": 23000 |
| }, |
| { |
| "epoch": 1350.1593172119487, |
| "grad_norm": 0.16153846681118011, |
| "learning_rate": 3.702341137123746e-06, |
| "loss": 2.6257, |
| "step": 23100 |
| }, |
| { |
| "epoch": 1350.1593172119487, |
| "eval_loss": 2.6239538192749023, |
| "eval_runtime": 8.8767, |
| "eval_samples_per_second": 157.04, |
| "eval_steps_per_second": 19.715, |
| "step": 23100 |
| }, |
| { |
| "epoch": 1355.8492176386912, |
| "grad_norm": 0.1725204735994339, |
| "learning_rate": 3.6923076923076925e-06, |
| "loss": 2.6258, |
| "step": 23200 |
| }, |
| { |
| "epoch": 1355.8492176386912, |
| "eval_loss": 2.6215097904205322, |
| "eval_runtime": 8.5286, |
| "eval_samples_per_second": 163.45, |
| "eval_steps_per_second": 20.519, |
| "step": 23200 |
| }, |
| { |
| "epoch": 1361.539118065434, |
| "grad_norm": 0.12999078631401062, |
| "learning_rate": 3.6822742474916393e-06, |
| "loss": 2.6253, |
| "step": 23300 |
| }, |
| { |
| "epoch": 1361.539118065434, |
| "eval_loss": 2.6233925819396973, |
| "eval_runtime": 8.8885, |
| "eval_samples_per_second": 156.832, |
| "eval_steps_per_second": 19.688, |
| "step": 23300 |
| }, |
| { |
| "epoch": 1367.2290184921765, |
| "grad_norm": 0.1744973212480545, |
| "learning_rate": 3.6722408026755856e-06, |
| "loss": 2.6257, |
| "step": 23400 |
| }, |
| { |
| "epoch": 1367.2290184921765, |
| "eval_loss": 2.623767614364624, |
| "eval_runtime": 8.5312, |
| "eval_samples_per_second": 163.401, |
| "eval_steps_per_second": 20.513, |
| "step": 23400 |
| }, |
| { |
| "epoch": 1372.918918918919, |
| "grad_norm": 0.13030101358890533, |
| "learning_rate": 3.662207357859532e-06, |
| "loss": 2.6254, |
| "step": 23500 |
| }, |
| { |
| "epoch": 1372.918918918919, |
| "eval_loss": 2.622628927230835, |
| "eval_runtime": 8.8974, |
| "eval_samples_per_second": 156.675, |
| "eval_steps_per_second": 19.669, |
| "step": 23500 |
| }, |
| { |
| "epoch": 1378.6088193456615, |
| "grad_norm": 0.15082061290740967, |
| "learning_rate": 3.6521739130434787e-06, |
| "loss": 2.6258, |
| "step": 23600 |
| }, |
| { |
| "epoch": 1378.6088193456615, |
| "eval_loss": 2.62248158454895, |
| "eval_runtime": 8.5269, |
| "eval_samples_per_second": 163.482, |
| "eval_steps_per_second": 20.523, |
| "step": 23600 |
| }, |
| { |
| "epoch": 1384.298719772404, |
| "grad_norm": 0.1196790486574173, |
| "learning_rate": 3.642140468227425e-06, |
| "loss": 2.6254, |
| "step": 23700 |
| }, |
| { |
| "epoch": 1384.298719772404, |
| "eval_loss": 2.618326187133789, |
| "eval_runtime": 8.5494, |
| "eval_samples_per_second": 163.052, |
| "eval_steps_per_second": 20.469, |
| "step": 23700 |
| }, |
| { |
| "epoch": 1389.9886201991465, |
| "grad_norm": 0.168843612074852, |
| "learning_rate": 3.6321070234113714e-06, |
| "loss": 2.6249, |
| "step": 23800 |
| }, |
| { |
| "epoch": 1389.9886201991465, |
| "eval_loss": 2.621375799179077, |
| "eval_runtime": 8.859, |
| "eval_samples_per_second": 157.355, |
| "eval_steps_per_second": 19.754, |
| "step": 23800 |
| }, |
| { |
| "epoch": 1395.678520625889, |
| "grad_norm": 0.1318158209323883, |
| "learning_rate": 3.622073578595318e-06, |
| "loss": 2.6248, |
| "step": 23900 |
| }, |
| { |
| "epoch": 1395.678520625889, |
| "eval_loss": 2.6230545043945312, |
| "eval_runtime": 8.5184, |
| "eval_samples_per_second": 163.646, |
| "eval_steps_per_second": 20.544, |
| "step": 23900 |
| }, |
| { |
| "epoch": 1401.3684210526317, |
| "grad_norm": 0.14110194146633148, |
| "learning_rate": 3.6120401337792645e-06, |
| "loss": 2.6252, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1401.3684210526317, |
| "eval_loss": 2.6207733154296875, |
| "eval_runtime": 8.5346, |
| "eval_samples_per_second": 163.335, |
| "eval_steps_per_second": 20.505, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1407.0583214793742, |
| "grad_norm": 0.14449109137058258, |
| "learning_rate": 3.6020066889632112e-06, |
| "loss": 2.6245, |
| "step": 24100 |
| }, |
| { |
| "epoch": 1407.0583214793742, |
| "eval_loss": 2.6209616661071777, |
| "eval_runtime": 8.5416, |
| "eval_samples_per_second": 163.201, |
| "eval_steps_per_second": 20.488, |
| "step": 24100 |
| }, |
| { |
| "epoch": 1412.7482219061167, |
| "grad_norm": 0.12893743813037872, |
| "learning_rate": 3.5919732441471576e-06, |
| "loss": 2.6247, |
| "step": 24200 |
| }, |
| { |
| "epoch": 1412.7482219061167, |
| "eval_loss": 2.6214792728424072, |
| "eval_runtime": 8.8839, |
| "eval_samples_per_second": 156.913, |
| "eval_steps_per_second": 19.699, |
| "step": 24200 |
| }, |
| { |
| "epoch": 1418.4381223328592, |
| "grad_norm": 0.15788990259170532, |
| "learning_rate": 3.581939799331104e-06, |
| "loss": 2.6249, |
| "step": 24300 |
| }, |
| { |
| "epoch": 1418.4381223328592, |
| "eval_loss": 2.6239373683929443, |
| "eval_runtime": 8.5329, |
| "eval_samples_per_second": 163.368, |
| "eval_steps_per_second": 20.509, |
| "step": 24300 |
| }, |
| { |
| "epoch": 1424.1280227596017, |
| "grad_norm": 0.14352256059646606, |
| "learning_rate": 3.5719063545150507e-06, |
| "loss": 2.6244, |
| "step": 24400 |
| }, |
| { |
| "epoch": 1424.1280227596017, |
| "eval_loss": 2.621476888656616, |
| "eval_runtime": 8.8748, |
| "eval_samples_per_second": 157.073, |
| "eval_steps_per_second": 19.719, |
| "step": 24400 |
| }, |
| { |
| "epoch": 1429.8179231863442, |
| "grad_norm": 0.1311691254377365, |
| "learning_rate": 3.561872909698997e-06, |
| "loss": 2.6243, |
| "step": 24500 |
| }, |
| { |
| "epoch": 1429.8179231863442, |
| "eval_loss": 2.6242871284484863, |
| "eval_runtime": 8.5283, |
| "eval_samples_per_second": 163.456, |
| "eval_steps_per_second": 20.52, |
| "step": 24500 |
| }, |
| { |
| "epoch": 1435.5078236130869, |
| "grad_norm": 0.15464642643928528, |
| "learning_rate": 3.5518394648829434e-06, |
| "loss": 2.624, |
| "step": 24600 |
| }, |
| { |
| "epoch": 1435.5078236130869, |
| "eval_loss": 2.6201913356781006, |
| "eval_runtime": 8.8606, |
| "eval_samples_per_second": 157.326, |
| "eval_steps_per_second": 19.75, |
| "step": 24600 |
| }, |
| { |
| "epoch": 1441.1977240398294, |
| "grad_norm": 0.19396920502185822, |
| "learning_rate": 3.54180602006689e-06, |
| "loss": 2.625, |
| "step": 24700 |
| }, |
| { |
| "epoch": 1441.1977240398294, |
| "eval_loss": 2.619835138320923, |
| "eval_runtime": 8.5378, |
| "eval_samples_per_second": 163.275, |
| "eval_steps_per_second": 20.497, |
| "step": 24700 |
| }, |
| { |
| "epoch": 1446.8876244665719, |
| "grad_norm": 0.16594748198986053, |
| "learning_rate": 3.5317725752508365e-06, |
| "loss": 2.6238, |
| "step": 24800 |
| }, |
| { |
| "epoch": 1446.8876244665719, |
| "eval_loss": 2.620967388153076, |
| "eval_runtime": 8.5405, |
| "eval_samples_per_second": 163.221, |
| "eval_steps_per_second": 20.49, |
| "step": 24800 |
| }, |
| { |
| "epoch": 1452.5775248933144, |
| "grad_norm": 0.12998247146606445, |
| "learning_rate": 3.521739130434783e-06, |
| "loss": 2.6237, |
| "step": 24900 |
| }, |
| { |
| "epoch": 1452.5775248933144, |
| "eval_loss": 2.622404098510742, |
| "eval_runtime": 8.5397, |
| "eval_samples_per_second": 163.237, |
| "eval_steps_per_second": 20.492, |
| "step": 24900 |
| }, |
| { |
| "epoch": 1458.2674253200569, |
| "grad_norm": 0.15071412920951843, |
| "learning_rate": 3.5117056856187296e-06, |
| "loss": 2.6245, |
| "step": 25000 |
| }, |
| { |
| "epoch": 1458.2674253200569, |
| "eval_loss": 2.6213462352752686, |
| "eval_runtime": 8.8844, |
| "eval_samples_per_second": 156.905, |
| "eval_steps_per_second": 19.698, |
| "step": 25000 |
| }, |
| { |
| "epoch": 1463.9573257467994, |
| "grad_norm": 0.1532295048236847, |
| "learning_rate": 3.501672240802676e-06, |
| "loss": 2.6245, |
| "step": 25100 |
| }, |
| { |
| "epoch": 1463.9573257467994, |
| "eval_loss": 2.6207022666931152, |
| "eval_runtime": 8.5372, |
| "eval_samples_per_second": 163.285, |
| "eval_steps_per_second": 20.498, |
| "step": 25100 |
| }, |
| { |
| "epoch": 1469.6472261735419, |
| "grad_norm": 0.13699106872081757, |
| "learning_rate": 3.491638795986622e-06, |
| "loss": 2.6239, |
| "step": 25200 |
| }, |
| { |
| "epoch": 1469.6472261735419, |
| "eval_loss": 2.6193158626556396, |
| "eval_runtime": 8.8816, |
| "eval_samples_per_second": 156.954, |
| "eval_steps_per_second": 19.704, |
| "step": 25200 |
| }, |
| { |
| "epoch": 1475.3371266002846, |
| "grad_norm": 0.14744792878627777, |
| "learning_rate": 3.481605351170568e-06, |
| "loss": 2.624, |
| "step": 25300 |
| }, |
| { |
| "epoch": 1475.3371266002846, |
| "eval_loss": 2.6224136352539062, |
| "eval_runtime": 8.5419, |
| "eval_samples_per_second": 163.196, |
| "eval_steps_per_second": 20.487, |
| "step": 25300 |
| }, |
| { |
| "epoch": 1481.027027027027, |
| "grad_norm": 0.1340937465429306, |
| "learning_rate": 3.471571906354515e-06, |
| "loss": 2.624, |
| "step": 25400 |
| }, |
| { |
| "epoch": 1481.027027027027, |
| "eval_loss": 2.620910882949829, |
| "eval_runtime": 8.551, |
| "eval_samples_per_second": 163.022, |
| "eval_steps_per_second": 20.465, |
| "step": 25400 |
| }, |
| { |
| "epoch": 1486.7169274537696, |
| "grad_norm": 0.16349473595619202, |
| "learning_rate": 3.4615384615384613e-06, |
| "loss": 2.6236, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1486.7169274537696, |
| "eval_loss": 2.6188619136810303, |
| "eval_runtime": 8.8975, |
| "eval_samples_per_second": 156.673, |
| "eval_steps_per_second": 19.668, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1492.406827880512, |
| "grad_norm": 0.16049961745738983, |
| "learning_rate": 3.4515050167224076e-06, |
| "loss": 2.6236, |
| "step": 25600 |
| }, |
| { |
| "epoch": 1492.406827880512, |
| "eval_loss": 2.6201858520507812, |
| "eval_runtime": 8.5265, |
| "eval_samples_per_second": 163.49, |
| "eval_steps_per_second": 20.524, |
| "step": 25600 |
| }, |
| { |
| "epoch": 1498.0967283072546, |
| "grad_norm": 0.1545686572790146, |
| "learning_rate": 3.4414715719063544e-06, |
| "loss": 2.6237, |
| "step": 25700 |
| }, |
| { |
| "epoch": 1498.0967283072546, |
| "eval_loss": 2.6238884925842285, |
| "eval_runtime": 8.5206, |
| "eval_samples_per_second": 163.603, |
| "eval_steps_per_second": 20.538, |
| "step": 25700 |
| }, |
| { |
| "epoch": 1503.786628733997, |
| "grad_norm": 0.11945275217294693, |
| "learning_rate": 3.4314381270903007e-06, |
| "loss": 2.6237, |
| "step": 25800 |
| }, |
| { |
| "epoch": 1503.786628733997, |
| "eval_loss": 2.617251396179199, |
| "eval_runtime": 8.5374, |
| "eval_samples_per_second": 163.282, |
| "eval_steps_per_second": 20.498, |
| "step": 25800 |
| }, |
| { |
| "epoch": 1509.4765291607396, |
| "grad_norm": 0.16417285799980164, |
| "learning_rate": 3.4214046822742475e-06, |
| "loss": 2.6234, |
| "step": 25900 |
| }, |
| { |
| "epoch": 1509.4765291607396, |
| "eval_loss": 2.61983060836792, |
| "eval_runtime": 8.8777, |
| "eval_samples_per_second": 157.022, |
| "eval_steps_per_second": 19.712, |
| "step": 25900 |
| }, |
| { |
| "epoch": 1515.1664295874823, |
| "grad_norm": 0.1562732458114624, |
| "learning_rate": 3.411371237458194e-06, |
| "loss": 2.6234, |
| "step": 26000 |
| }, |
| { |
| "epoch": 1515.1664295874823, |
| "eval_loss": 2.6201515197753906, |
| "eval_runtime": 8.5309, |
| "eval_samples_per_second": 163.407, |
| "eval_steps_per_second": 20.514, |
| "step": 26000 |
| }, |
| { |
| "epoch": 1520.8563300142248, |
| "grad_norm": 0.1490921974182129, |
| "learning_rate": 3.40133779264214e-06, |
| "loss": 2.6233, |
| "step": 26100 |
| }, |
| { |
| "epoch": 1520.8563300142248, |
| "eval_loss": 2.6187028884887695, |
| "eval_runtime": 8.5278, |
| "eval_samples_per_second": 163.466, |
| "eval_steps_per_second": 20.521, |
| "step": 26100 |
| }, |
| { |
| "epoch": 1526.5462304409673, |
| "grad_norm": 0.13493777811527252, |
| "learning_rate": 3.391304347826087e-06, |
| "loss": 2.6232, |
| "step": 26200 |
| }, |
| { |
| "epoch": 1526.5462304409673, |
| "eval_loss": 2.6198177337646484, |
| "eval_runtime": 8.5368, |
| "eval_samples_per_second": 163.293, |
| "eval_steps_per_second": 20.5, |
| "step": 26200 |
| }, |
| { |
| "epoch": 1532.2361308677098, |
| "grad_norm": 0.16828219592571259, |
| "learning_rate": 3.3812709030100333e-06, |
| "loss": 2.6235, |
| "step": 26300 |
| }, |
| { |
| "epoch": 1532.2361308677098, |
| "eval_loss": 2.620209217071533, |
| "eval_runtime": 8.8832, |
| "eval_samples_per_second": 156.925, |
| "eval_steps_per_second": 19.7, |
| "step": 26300 |
| }, |
| { |
| "epoch": 1537.9260312944523, |
| "grad_norm": 0.13606858253479004, |
| "learning_rate": 3.3712374581939796e-06, |
| "loss": 2.6236, |
| "step": 26400 |
| }, |
| { |
| "epoch": 1537.9260312944523, |
| "eval_loss": 2.620745897293091, |
| "eval_runtime": 8.5351, |
| "eval_samples_per_second": 163.325, |
| "eval_steps_per_second": 20.504, |
| "step": 26400 |
| }, |
| { |
| "epoch": 1543.6159317211948, |
| "grad_norm": 0.15643203258514404, |
| "learning_rate": 3.3612040133779264e-06, |
| "loss": 2.6233, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1543.6159317211948, |
| "eval_loss": 2.6203880310058594, |
| "eval_runtime": 8.5194, |
| "eval_samples_per_second": 163.627, |
| "eval_steps_per_second": 20.541, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1549.3058321479375, |
| "grad_norm": 0.15990637242794037, |
| "learning_rate": 3.3511705685618727e-06, |
| "loss": 2.6235, |
| "step": 26600 |
| }, |
| { |
| "epoch": 1549.3058321479375, |
| "eval_loss": 2.618859052658081, |
| "eval_runtime": 8.8671, |
| "eval_samples_per_second": 157.21, |
| "eval_steps_per_second": 19.736, |
| "step": 26600 |
| }, |
| { |
| "epoch": 1554.99573257468, |
| "grad_norm": 0.1532638967037201, |
| "learning_rate": 3.3411371237458195e-06, |
| "loss": 2.6227, |
| "step": 26700 |
| }, |
| { |
| "epoch": 1554.99573257468, |
| "eval_loss": 2.621203660964966, |
| "eval_runtime": 8.5393, |
| "eval_samples_per_second": 163.246, |
| "eval_steps_per_second": 20.494, |
| "step": 26700 |
| }, |
| { |
| "epoch": 1560.6856330014225, |
| "grad_norm": 0.14362338185310364, |
| "learning_rate": 3.331103678929766e-06, |
| "loss": 2.6233, |
| "step": 26800 |
| }, |
| { |
| "epoch": 1560.6856330014225, |
| "eval_loss": 2.6196324825286865, |
| "eval_runtime": 8.5352, |
| "eval_samples_per_second": 163.323, |
| "eval_steps_per_second": 20.503, |
| "step": 26800 |
| }, |
| { |
| "epoch": 1566.375533428165, |
| "grad_norm": 0.15064574778079987, |
| "learning_rate": 3.321070234113712e-06, |
| "loss": 2.6231, |
| "step": 26900 |
| }, |
| { |
| "epoch": 1566.375533428165, |
| "eval_loss": 2.621459722518921, |
| "eval_runtime": 8.8713, |
| "eval_samples_per_second": 157.136, |
| "eval_steps_per_second": 19.726, |
| "step": 26900 |
| }, |
| { |
| "epoch": 1572.0654338549075, |
| "grad_norm": 0.14329403638839722, |
| "learning_rate": 3.311036789297659e-06, |
| "loss": 2.623, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1572.0654338549075, |
| "eval_loss": 2.619920253753662, |
| "eval_runtime": 8.5347, |
| "eval_samples_per_second": 163.333, |
| "eval_steps_per_second": 20.505, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1577.75533428165, |
| "grad_norm": 0.14685587584972382, |
| "learning_rate": 3.3010033444816052e-06, |
| "loss": 2.6233, |
| "step": 27100 |
| }, |
| { |
| "epoch": 1577.75533428165, |
| "eval_loss": 2.620281934738159, |
| "eval_runtime": 8.5331, |
| "eval_samples_per_second": 163.363, |
| "eval_steps_per_second": 20.508, |
| "step": 27100 |
| }, |
| { |
| "epoch": 1583.4452347083925, |
| "grad_norm": 0.14042943716049194, |
| "learning_rate": 3.2909698996655516e-06, |
| "loss": 2.6227, |
| "step": 27200 |
| }, |
| { |
| "epoch": 1583.4452347083925, |
| "eval_loss": 2.6232104301452637, |
| "eval_runtime": 8.5205, |
| "eval_samples_per_second": 163.606, |
| "eval_steps_per_second": 20.539, |
| "step": 27200 |
| }, |
| { |
| "epoch": 1589.1351351351352, |
| "grad_norm": 0.15437842905521393, |
| "learning_rate": 3.2809364548494983e-06, |
| "loss": 2.6228, |
| "step": 27300 |
| }, |
| { |
| "epoch": 1589.1351351351352, |
| "eval_loss": 2.6217334270477295, |
| "eval_runtime": 8.887, |
| "eval_samples_per_second": 156.858, |
| "eval_steps_per_second": 19.692, |
| "step": 27300 |
| }, |
| { |
| "epoch": 1594.8250355618777, |
| "grad_norm": 0.13956615328788757, |
| "learning_rate": 3.2709030100334447e-06, |
| "loss": 2.6227, |
| "step": 27400 |
| }, |
| { |
| "epoch": 1594.8250355618777, |
| "eval_loss": 2.619623899459839, |
| "eval_runtime": 8.5391, |
| "eval_samples_per_second": 163.248, |
| "eval_steps_per_second": 20.494, |
| "step": 27400 |
| }, |
| { |
| "epoch": 1600.5149359886202, |
| "grad_norm": 0.1520717293024063, |
| "learning_rate": 3.260869565217391e-06, |
| "loss": 2.6224, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1600.5149359886202, |
| "eval_loss": 2.61783766746521, |
| "eval_runtime": 8.8816, |
| "eval_samples_per_second": 156.954, |
| "eval_steps_per_second": 19.704, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1606.2048364153627, |
| "grad_norm": 0.12460660189390182, |
| "learning_rate": 3.2508361204013378e-06, |
| "loss": 2.6228, |
| "step": 27600 |
| }, |
| { |
| "epoch": 1606.2048364153627, |
| "eval_loss": 2.6196727752685547, |
| "eval_runtime": 8.5446, |
| "eval_samples_per_second": 163.145, |
| "eval_steps_per_second": 20.481, |
| "step": 27600 |
| }, |
| { |
| "epoch": 1611.8947368421052, |
| "grad_norm": 0.14338594675064087, |
| "learning_rate": 3.240802675585284e-06, |
| "loss": 2.6219, |
| "step": 27700 |
| }, |
| { |
| "epoch": 1611.8947368421052, |
| "eval_loss": 2.6190030574798584, |
| "eval_runtime": 8.5528, |
| "eval_samples_per_second": 162.987, |
| "eval_steps_per_second": 20.461, |
| "step": 27700 |
| }, |
| { |
| "epoch": 1617.5846372688477, |
| "grad_norm": 0.1541885882616043, |
| "learning_rate": 3.230769230769231e-06, |
| "loss": 2.6223, |
| "step": 27800 |
| }, |
| { |
| "epoch": 1617.5846372688477, |
| "eval_loss": 2.6206130981445312, |
| "eval_runtime": 8.5355, |
| "eval_samples_per_second": 163.319, |
| "eval_steps_per_second": 20.503, |
| "step": 27800 |
| }, |
| { |
| "epoch": 1623.2745376955904, |
| "grad_norm": 0.14063502848148346, |
| "learning_rate": 3.2207357859531772e-06, |
| "loss": 2.6226, |
| "step": 27900 |
| }, |
| { |
| "epoch": 1623.2745376955904, |
| "eval_loss": 2.6189980506896973, |
| "eval_runtime": 8.8925, |
| "eval_samples_per_second": 156.76, |
| "eval_steps_per_second": 19.679, |
| "step": 27900 |
| }, |
| { |
| "epoch": 1628.964438122333, |
| "grad_norm": 0.1286516785621643, |
| "learning_rate": 3.2107023411371236e-06, |
| "loss": 2.6221, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1628.964438122333, |
| "eval_loss": 2.620689630508423, |
| "eval_runtime": 8.542, |
| "eval_samples_per_second": 163.193, |
| "eval_steps_per_second": 20.487, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1634.6543385490754, |
| "grad_norm": 0.1280793398618698, |
| "learning_rate": 3.2006688963210703e-06, |
| "loss": 2.6224, |
| "step": 28100 |
| }, |
| { |
| "epoch": 1634.6543385490754, |
| "eval_loss": 2.6217143535614014, |
| "eval_runtime": 8.8756, |
| "eval_samples_per_second": 157.06, |
| "eval_steps_per_second": 19.717, |
| "step": 28100 |
| }, |
| { |
| "epoch": 1640.344238975818, |
| "grad_norm": 0.15803121030330658, |
| "learning_rate": 3.1906354515050167e-06, |
| "loss": 2.6219, |
| "step": 28200 |
| }, |
| { |
| "epoch": 1640.344238975818, |
| "eval_loss": 2.6206395626068115, |
| "eval_runtime": 8.8837, |
| "eval_samples_per_second": 156.917, |
| "eval_steps_per_second": 19.699, |
| "step": 28200 |
| }, |
| { |
| "epoch": 1646.0341394025604, |
| "grad_norm": 0.1751488745212555, |
| "learning_rate": 3.180602006688963e-06, |
| "loss": 2.6224, |
| "step": 28300 |
| }, |
| { |
| "epoch": 1646.0341394025604, |
| "eval_loss": 2.6214957237243652, |
| "eval_runtime": 8.5404, |
| "eval_samples_per_second": 163.224, |
| "eval_steps_per_second": 20.491, |
| "step": 28300 |
| }, |
| { |
| "epoch": 1651.724039829303, |
| "grad_norm": 0.15003472566604614, |
| "learning_rate": 3.1705685618729098e-06, |
| "loss": 2.6223, |
| "step": 28400 |
| }, |
| { |
| "epoch": 1651.724039829303, |
| "eval_loss": 2.619629144668579, |
| "eval_runtime": 8.5381, |
| "eval_samples_per_second": 163.268, |
| "eval_steps_per_second": 20.496, |
| "step": 28400 |
| }, |
| { |
| "epoch": 1657.4139402560454, |
| "grad_norm": 0.13195043802261353, |
| "learning_rate": 3.160535117056856e-06, |
| "loss": 2.6213, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1657.4139402560454, |
| "eval_loss": 2.623068332672119, |
| "eval_runtime": 8.5234, |
| "eval_samples_per_second": 163.55, |
| "eval_steps_per_second": 20.532, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1663.1038406827881, |
| "grad_norm": 0.12435358017683029, |
| "learning_rate": 3.1505016722408024e-06, |
| "loss": 2.6218, |
| "step": 28600 |
| }, |
| { |
| "epoch": 1663.1038406827881, |
| "eval_loss": 2.6203911304473877, |
| "eval_runtime": 8.8568, |
| "eval_samples_per_second": 157.394, |
| "eval_steps_per_second": 19.759, |
| "step": 28600 |
| }, |
| { |
| "epoch": 1668.7937411095306, |
| "grad_norm": 0.12473925203084946, |
| "learning_rate": 3.140468227424749e-06, |
| "loss": 2.6219, |
| "step": 28700 |
| }, |
| { |
| "epoch": 1668.7937411095306, |
| "eval_loss": 2.620685338973999, |
| "eval_runtime": 8.5395, |
| "eval_samples_per_second": 163.241, |
| "eval_steps_per_second": 20.493, |
| "step": 28700 |
| }, |
| { |
| "epoch": 1674.4836415362731, |
| "grad_norm": 0.14964550733566284, |
| "learning_rate": 3.1304347826086955e-06, |
| "loss": 2.6216, |
| "step": 28800 |
| }, |
| { |
| "epoch": 1674.4836415362731, |
| "eval_loss": 2.619400978088379, |
| "eval_runtime": 8.5365, |
| "eval_samples_per_second": 163.298, |
| "eval_steps_per_second": 20.5, |
| "step": 28800 |
| }, |
| { |
| "epoch": 1680.1735419630156, |
| "grad_norm": 0.12900976836681366, |
| "learning_rate": 3.1204013377926423e-06, |
| "loss": 2.6218, |
| "step": 28900 |
| }, |
| { |
| "epoch": 1680.1735419630156, |
| "eval_loss": 2.621912717819214, |
| "eval_runtime": 8.5373, |
| "eval_samples_per_second": 163.284, |
| "eval_steps_per_second": 20.498, |
| "step": 28900 |
| }, |
| { |
| "epoch": 1685.8634423897581, |
| "grad_norm": 0.1679168939590454, |
| "learning_rate": 3.1103678929765886e-06, |
| "loss": 2.622, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1685.8634423897581, |
| "eval_loss": 2.6172022819519043, |
| "eval_runtime": 8.8725, |
| "eval_samples_per_second": 157.114, |
| "eval_steps_per_second": 19.724, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1691.5533428165006, |
| "grad_norm": 0.14349579811096191, |
| "learning_rate": 3.100334448160535e-06, |
| "loss": 2.6214, |
| "step": 29100 |
| }, |
| { |
| "epoch": 1691.5533428165006, |
| "eval_loss": 2.6180310249328613, |
| "eval_runtime": 8.5353, |
| "eval_samples_per_second": 163.321, |
| "eval_steps_per_second": 20.503, |
| "step": 29100 |
| }, |
| { |
| "epoch": 1697.2432432432433, |
| "grad_norm": 0.11367882043123245, |
| "learning_rate": 3.0903010033444818e-06, |
| "loss": 2.6216, |
| "step": 29200 |
| }, |
| { |
| "epoch": 1697.2432432432433, |
| "eval_loss": 2.6190216541290283, |
| "eval_runtime": 8.874, |
| "eval_samples_per_second": 157.088, |
| "eval_steps_per_second": 19.72, |
| "step": 29200 |
| }, |
| { |
| "epoch": 1702.9331436699858, |
| "grad_norm": 0.1360355168581009, |
| "learning_rate": 3.080267558528428e-06, |
| "loss": 2.6209, |
| "step": 29300 |
| }, |
| { |
| "epoch": 1702.9331436699858, |
| "eval_loss": 2.618488311767578, |
| "eval_runtime": 8.544, |
| "eval_samples_per_second": 163.156, |
| "eval_steps_per_second": 20.482, |
| "step": 29300 |
| }, |
| { |
| "epoch": 1708.6230440967283, |
| "grad_norm": 0.15486325323581696, |
| "learning_rate": 3.0702341137123744e-06, |
| "loss": 2.6213, |
| "step": 29400 |
| }, |
| { |
| "epoch": 1708.6230440967283, |
| "eval_loss": 2.6200103759765625, |
| "eval_runtime": 8.88, |
| "eval_samples_per_second": 156.982, |
| "eval_steps_per_second": 19.707, |
| "step": 29400 |
| }, |
| { |
| "epoch": 1714.3129445234708, |
| "grad_norm": 0.16179534792900085, |
| "learning_rate": 3.060200668896321e-06, |
| "loss": 2.6216, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1714.3129445234708, |
| "eval_loss": 2.619476795196533, |
| "eval_runtime": 8.5238, |
| "eval_samples_per_second": 163.542, |
| "eval_steps_per_second": 20.531, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1720.0028449502133, |
| "grad_norm": 0.12888365983963013, |
| "learning_rate": 3.0501672240802675e-06, |
| "loss": 2.621, |
| "step": 29600 |
| }, |
| { |
| "epoch": 1720.0028449502133, |
| "eval_loss": 2.6209278106689453, |
| "eval_runtime": 8.8646, |
| "eval_samples_per_second": 157.255, |
| "eval_steps_per_second": 19.742, |
| "step": 29600 |
| }, |
| { |
| "epoch": 1725.6927453769558, |
| "grad_norm": 0.1323317587375641, |
| "learning_rate": 3.0401337792642143e-06, |
| "loss": 2.6217, |
| "step": 29700 |
| }, |
| { |
| "epoch": 1725.6927453769558, |
| "eval_loss": 2.6187312602996826, |
| "eval_runtime": 8.5424, |
| "eval_samples_per_second": 163.186, |
| "eval_steps_per_second": 20.486, |
| "step": 29700 |
| }, |
| { |
| "epoch": 1731.3826458036983, |
| "grad_norm": 0.13297787308692932, |
| "learning_rate": 3.0301003344481606e-06, |
| "loss": 2.6212, |
| "step": 29800 |
| }, |
| { |
| "epoch": 1731.3826458036983, |
| "eval_loss": 2.6216437816619873, |
| "eval_runtime": 8.5329, |
| "eval_samples_per_second": 163.368, |
| "eval_steps_per_second": 20.509, |
| "step": 29800 |
| }, |
| { |
| "epoch": 1737.072546230441, |
| "grad_norm": 0.11761217564344406, |
| "learning_rate": 3.020066889632107e-06, |
| "loss": 2.6211, |
| "step": 29900 |
| }, |
| { |
| "epoch": 1737.072546230441, |
| "eval_loss": 2.621067523956299, |
| "eval_runtime": 8.8814, |
| "eval_samples_per_second": 156.957, |
| "eval_steps_per_second": 19.704, |
| "step": 29900 |
| }, |
| { |
| "epoch": 1742.7624466571835, |
| "grad_norm": 0.13174152374267578, |
| "learning_rate": 3.0100334448160537e-06, |
| "loss": 2.621, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1742.7624466571835, |
| "eval_loss": 2.619697093963623, |
| "eval_runtime": 8.5326, |
| "eval_samples_per_second": 163.373, |
| "eval_steps_per_second": 20.509, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1748.452347083926, |
| "grad_norm": 0.13943453133106232, |
| "learning_rate": 3e-06, |
| "loss": 2.6208, |
| "step": 30100 |
| }, |
| { |
| "epoch": 1748.452347083926, |
| "eval_loss": 2.6210110187530518, |
| "eval_runtime": 8.5178, |
| "eval_samples_per_second": 163.658, |
| "eval_steps_per_second": 20.545, |
| "step": 30100 |
| }, |
| { |
| "epoch": 1754.1422475106685, |
| "grad_norm": 0.13520394265651703, |
| "learning_rate": 2.9899665551839464e-06, |
| "loss": 2.6213, |
| "step": 30200 |
| }, |
| { |
| "epoch": 1754.1422475106685, |
| "eval_loss": 2.616352081298828, |
| "eval_runtime": 8.8621, |
| "eval_samples_per_second": 157.3, |
| "eval_steps_per_second": 19.747, |
| "step": 30200 |
| }, |
| { |
| "epoch": 1759.832147937411, |
| "grad_norm": 0.1447754055261612, |
| "learning_rate": 2.979933110367893e-06, |
| "loss": 2.6212, |
| "step": 30300 |
| }, |
| { |
| "epoch": 1759.832147937411, |
| "eval_loss": 2.6177382469177246, |
| "eval_runtime": 8.5425, |
| "eval_samples_per_second": 163.184, |
| "eval_steps_per_second": 20.486, |
| "step": 30300 |
| }, |
| { |
| "epoch": 1765.5220483641535, |
| "grad_norm": 0.1305381804704666, |
| "learning_rate": 2.9698996655518395e-06, |
| "loss": 2.6207, |
| "step": 30400 |
| }, |
| { |
| "epoch": 1765.5220483641535, |
| "eval_loss": 2.6181886196136475, |
| "eval_runtime": 8.5323, |
| "eval_samples_per_second": 163.379, |
| "eval_steps_per_second": 20.51, |
| "step": 30400 |
| }, |
| { |
| "epoch": 1771.2119487908963, |
| "grad_norm": 0.13752570748329163, |
| "learning_rate": 2.959866220735786e-06, |
| "loss": 2.6211, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1771.2119487908963, |
| "eval_loss": 2.6209466457366943, |
| "eval_runtime": 8.8636, |
| "eval_samples_per_second": 157.273, |
| "eval_steps_per_second": 19.744, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1776.9018492176388, |
| "grad_norm": 0.15597382187843323, |
| "learning_rate": 2.9498327759197326e-06, |
| "loss": 2.6209, |
| "step": 30600 |
| }, |
| { |
| "epoch": 1776.9018492176388, |
| "eval_loss": 2.6217684745788574, |
| "eval_runtime": 8.535, |
| "eval_samples_per_second": 163.328, |
| "eval_steps_per_second": 20.504, |
| "step": 30600 |
| }, |
| { |
| "epoch": 1782.5917496443813, |
| "grad_norm": 0.13857756555080414, |
| "learning_rate": 2.939799331103679e-06, |
| "loss": 2.6203, |
| "step": 30700 |
| }, |
| { |
| "epoch": 1782.5917496443813, |
| "eval_loss": 2.6178483963012695, |
| "eval_runtime": 8.5228, |
| "eval_samples_per_second": 163.56, |
| "eval_steps_per_second": 20.533, |
| "step": 30700 |
| }, |
| { |
| "epoch": 1788.2816500711237, |
| "grad_norm": 0.12845158576965332, |
| "learning_rate": 2.9297658862876257e-06, |
| "loss": 2.6207, |
| "step": 30800 |
| }, |
| { |
| "epoch": 1788.2816500711237, |
| "eval_loss": 2.615445137023926, |
| "eval_runtime": 7.9709, |
| "eval_samples_per_second": 174.886, |
| "eval_steps_per_second": 21.955, |
| "step": 30800 |
| }, |
| { |
| "epoch": 1793.9715504978662, |
| "grad_norm": 0.12672263383865356, |
| "learning_rate": 2.919732441471572e-06, |
| "loss": 2.6207, |
| "step": 30900 |
| }, |
| { |
| "epoch": 1793.9715504978662, |
| "eval_loss": 2.621990919113159, |
| "eval_runtime": 8.6765, |
| "eval_samples_per_second": 160.663, |
| "eval_steps_per_second": 20.169, |
| "step": 30900 |
| }, |
| { |
| "epoch": 1799.6614509246087, |
| "grad_norm": 0.15212363004684448, |
| "learning_rate": 2.9096989966555184e-06, |
| "loss": 2.6205, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1799.6614509246087, |
| "eval_loss": 2.617125988006592, |
| "eval_runtime": 8.5348, |
| "eval_samples_per_second": 163.33, |
| "eval_steps_per_second": 20.504, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1805.3513513513512, |
| "grad_norm": 0.14816269278526306, |
| "learning_rate": 2.899665551839465e-06, |
| "loss": 2.6204, |
| "step": 31100 |
| }, |
| { |
| "epoch": 1805.3513513513512, |
| "eval_loss": 2.619084358215332, |
| "eval_runtime": 8.5316, |
| "eval_samples_per_second": 163.393, |
| "eval_steps_per_second": 20.512, |
| "step": 31100 |
| }, |
| { |
| "epoch": 1811.041251778094, |
| "grad_norm": 0.12133249640464783, |
| "learning_rate": 2.8896321070234115e-06, |
| "loss": 2.6202, |
| "step": 31200 |
| }, |
| { |
| "epoch": 1811.041251778094, |
| "eval_loss": 2.6200191974639893, |
| "eval_runtime": 8.8693, |
| "eval_samples_per_second": 157.171, |
| "eval_steps_per_second": 19.731, |
| "step": 31200 |
| }, |
| { |
| "epoch": 1816.7311522048365, |
| "grad_norm": 0.122464619576931, |
| "learning_rate": 2.879598662207358e-06, |
| "loss": 2.6206, |
| "step": 31300 |
| }, |
| { |
| "epoch": 1816.7311522048365, |
| "eval_loss": 2.6166939735412598, |
| "eval_runtime": 8.5416, |
| "eval_samples_per_second": 163.202, |
| "eval_steps_per_second": 20.488, |
| "step": 31300 |
| }, |
| { |
| "epoch": 1822.421052631579, |
| "grad_norm": 0.12631458044052124, |
| "learning_rate": 2.8695652173913046e-06, |
| "loss": 2.6207, |
| "step": 31400 |
| }, |
| { |
| "epoch": 1822.421052631579, |
| "eval_loss": 2.619025468826294, |
| "eval_runtime": 8.8769, |
| "eval_samples_per_second": 157.038, |
| "eval_steps_per_second": 19.714, |
| "step": 31400 |
| }, |
| { |
| "epoch": 1828.1109530583215, |
| "grad_norm": 0.13460245728492737, |
| "learning_rate": 2.859531772575251e-06, |
| "loss": 2.6205, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1828.1109530583215, |
| "eval_loss": 2.6193785667419434, |
| "eval_runtime": 8.5355, |
| "eval_samples_per_second": 163.318, |
| "eval_steps_per_second": 20.503, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1833.800853485064, |
| "grad_norm": 0.1309368908405304, |
| "learning_rate": 2.8494983277591977e-06, |
| "loss": 2.6202, |
| "step": 31600 |
| }, |
| { |
| "epoch": 1833.800853485064, |
| "eval_loss": 2.6178736686706543, |
| "eval_runtime": 8.5249, |
| "eval_samples_per_second": 163.52, |
| "eval_steps_per_second": 20.528, |
| "step": 31600 |
| }, |
| { |
| "epoch": 1839.4907539118065, |
| "grad_norm": 0.13755999505519867, |
| "learning_rate": 2.839464882943144e-06, |
| "loss": 2.6202, |
| "step": 31700 |
| }, |
| { |
| "epoch": 1839.4907539118065, |
| "eval_loss": 2.6197922229766846, |
| "eval_runtime": 8.8802, |
| "eval_samples_per_second": 156.978, |
| "eval_steps_per_second": 19.707, |
| "step": 31700 |
| }, |
| { |
| "epoch": 1845.1806543385492, |
| "grad_norm": 0.15736857056617737, |
| "learning_rate": 2.8294314381270904e-06, |
| "loss": 2.6205, |
| "step": 31800 |
| }, |
| { |
| "epoch": 1845.1806543385492, |
| "eval_loss": 2.617283344268799, |
| "eval_runtime": 8.5421, |
| "eval_samples_per_second": 163.192, |
| "eval_steps_per_second": 20.487, |
| "step": 31800 |
| }, |
| { |
| "epoch": 1850.8705547652917, |
| "grad_norm": 0.13804545998573303, |
| "learning_rate": 2.819397993311037e-06, |
| "loss": 2.6202, |
| "step": 31900 |
| }, |
| { |
| "epoch": 1850.8705547652917, |
| "eval_loss": 2.619748592376709, |
| "eval_runtime": 8.5172, |
| "eval_samples_per_second": 163.669, |
| "eval_steps_per_second": 20.547, |
| "step": 31900 |
| }, |
| { |
| "epoch": 1856.5604551920342, |
| "grad_norm": 0.14598102867603302, |
| "learning_rate": 2.8093645484949835e-06, |
| "loss": 2.6198, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1856.5604551920342, |
| "eval_loss": 2.6171820163726807, |
| "eval_runtime": 8.5287, |
| "eval_samples_per_second": 163.449, |
| "eval_steps_per_second": 20.519, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1862.2503556187767, |
| "grad_norm": 0.1180824562907219, |
| "learning_rate": 2.79933110367893e-06, |
| "loss": 2.62, |
| "step": 32100 |
| }, |
| { |
| "epoch": 1862.2503556187767, |
| "eval_loss": 2.6209444999694824, |
| "eval_runtime": 8.8778, |
| "eval_samples_per_second": 157.021, |
| "eval_steps_per_second": 19.712, |
| "step": 32100 |
| }, |
| { |
| "epoch": 1867.9402560455192, |
| "grad_norm": 0.13339059054851532, |
| "learning_rate": 2.7892976588628766e-06, |
| "loss": 2.6199, |
| "step": 32200 |
| }, |
| { |
| "epoch": 1867.9402560455192, |
| "eval_loss": 2.6180646419525146, |
| "eval_runtime": 8.5333, |
| "eval_samples_per_second": 163.36, |
| "eval_steps_per_second": 20.508, |
| "step": 32200 |
| }, |
| { |
| "epoch": 1873.6301564722617, |
| "grad_norm": 0.13802410662174225, |
| "learning_rate": 2.779264214046823e-06, |
| "loss": 2.6199, |
| "step": 32300 |
| }, |
| { |
| "epoch": 1873.6301564722617, |
| "eval_loss": 2.6181840896606445, |
| "eval_runtime": 8.5319, |
| "eval_samples_per_second": 163.386, |
| "eval_steps_per_second": 20.511, |
| "step": 32300 |
| }, |
| { |
| "epoch": 1879.3200568990042, |
| "grad_norm": 0.1414729207754135, |
| "learning_rate": 2.7692307692307693e-06, |
| "loss": 2.6204, |
| "step": 32400 |
| }, |
| { |
| "epoch": 1879.3200568990042, |
| "eval_loss": 2.6203253269195557, |
| "eval_runtime": 8.8692, |
| "eval_samples_per_second": 157.174, |
| "eval_steps_per_second": 19.731, |
| "step": 32400 |
| }, |
| { |
| "epoch": 1885.0099573257469, |
| "grad_norm": 0.14050759375095367, |
| "learning_rate": 2.759197324414716e-06, |
| "loss": 2.6198, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1885.0099573257469, |
| "eval_loss": 2.6230709552764893, |
| "eval_runtime": 8.5278, |
| "eval_samples_per_second": 163.466, |
| "eval_steps_per_second": 20.521, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1890.6998577524894, |
| "grad_norm": 0.12877824902534485, |
| "learning_rate": 2.749163879598662e-06, |
| "loss": 2.6196, |
| "step": 32600 |
| }, |
| { |
| "epoch": 1890.6998577524894, |
| "eval_loss": 2.617913246154785, |
| "eval_runtime": 8.8665, |
| "eval_samples_per_second": 157.221, |
| "eval_steps_per_second": 19.737, |
| "step": 32600 |
| }, |
| { |
| "epoch": 1896.3897581792319, |
| "grad_norm": 0.14117339253425598, |
| "learning_rate": 2.7391304347826087e-06, |
| "loss": 2.6201, |
| "step": 32700 |
| }, |
| { |
| "epoch": 1896.3897581792319, |
| "eval_loss": 2.6172854900360107, |
| "eval_runtime": 8.5203, |
| "eval_samples_per_second": 163.609, |
| "eval_steps_per_second": 20.539, |
| "step": 32700 |
| }, |
| { |
| "epoch": 1902.0796586059744, |
| "grad_norm": 0.13245785236358643, |
| "learning_rate": 2.729096989966555e-06, |
| "loss": 2.6195, |
| "step": 32800 |
| }, |
| { |
| "epoch": 1902.0796586059744, |
| "eval_loss": 2.618534564971924, |
| "eval_runtime": 8.5363, |
| "eval_samples_per_second": 163.303, |
| "eval_steps_per_second": 20.501, |
| "step": 32800 |
| }, |
| { |
| "epoch": 1907.7695590327169, |
| "grad_norm": 0.1612655520439148, |
| "learning_rate": 2.7190635451505014e-06, |
| "loss": 2.6196, |
| "step": 32900 |
| }, |
| { |
| "epoch": 1907.7695590327169, |
| "eval_loss": 2.619488477706909, |
| "eval_runtime": 8.5526, |
| "eval_samples_per_second": 162.992, |
| "eval_steps_per_second": 20.462, |
| "step": 32900 |
| }, |
| { |
| "epoch": 1913.4594594594594, |
| "grad_norm": 0.12668026983737946, |
| "learning_rate": 2.709030100334448e-06, |
| "loss": 2.6193, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1913.4594594594594, |
| "eval_loss": 2.617690086364746, |
| "eval_runtime": 8.8781, |
| "eval_samples_per_second": 157.015, |
| "eval_steps_per_second": 19.711, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1919.149359886202, |
| "grad_norm": 0.1749388575553894, |
| "learning_rate": 2.6989966555183945e-06, |
| "loss": 2.6198, |
| "step": 33100 |
| }, |
| { |
| "epoch": 1919.149359886202, |
| "eval_loss": 2.6185007095336914, |
| "eval_runtime": 8.5203, |
| "eval_samples_per_second": 163.61, |
| "eval_steps_per_second": 20.539, |
| "step": 33100 |
| }, |
| { |
| "epoch": 1924.8392603129446, |
| "grad_norm": 0.14269417524337769, |
| "learning_rate": 2.6889632107023413e-06, |
| "loss": 2.6195, |
| "step": 33200 |
| }, |
| { |
| "epoch": 1924.8392603129446, |
| "eval_loss": 2.6211161613464355, |
| "eval_runtime": 8.869, |
| "eval_samples_per_second": 157.176, |
| "eval_steps_per_second": 19.732, |
| "step": 33200 |
| }, |
| { |
| "epoch": 1958.6002844950212, |
| "grad_norm": 0.13085490465164185, |
| "learning_rate": 2.6789297658862876e-06, |
| "loss": 2.619, |
| "step": 33300 |
| }, |
| { |
| "epoch": 1958.6002844950212, |
| "eval_loss": 2.6180379390716553, |
| "eval_runtime": 9.0762, |
| "eval_samples_per_second": 153.589, |
| "eval_steps_per_second": 19.281, |
| "step": 33300 |
| }, |
| { |
| "epoch": 1964.290184921764, |
| "grad_norm": 0.14272978901863098, |
| "learning_rate": 2.668896321070234e-06, |
| "loss": 2.6189, |
| "step": 33400 |
| }, |
| { |
| "epoch": 1964.290184921764, |
| "eval_loss": 2.617365598678589, |
| "eval_runtime": 8.6925, |
| "eval_samples_per_second": 160.369, |
| "eval_steps_per_second": 20.132, |
| "step": 33400 |
| }, |
| { |
| "epoch": 1969.9800853485065, |
| "grad_norm": 0.1391880363225937, |
| "learning_rate": 2.6588628762541807e-06, |
| "loss": 2.6194, |
| "step": 33500 |
| }, |
| { |
| "epoch": 1969.9800853485065, |
| "eval_loss": 2.620480537414551, |
| "eval_runtime": 8.6935, |
| "eval_samples_per_second": 160.349, |
| "eval_steps_per_second": 20.13, |
| "step": 33500 |
| }, |
| { |
| "epoch": 1975.669985775249, |
| "grad_norm": 0.13617493212223053, |
| "learning_rate": 2.648829431438127e-06, |
| "loss": 2.619, |
| "step": 33600 |
| }, |
| { |
| "epoch": 1975.669985775249, |
| "eval_loss": 2.6200404167175293, |
| "eval_runtime": 8.5474, |
| "eval_samples_per_second": 163.091, |
| "eval_steps_per_second": 20.474, |
| "step": 33600 |
| }, |
| { |
| "epoch": 1981.3598862019915, |
| "grad_norm": 0.14002011716365814, |
| "learning_rate": 2.6387959866220734e-06, |
| "loss": 2.6193, |
| "step": 33700 |
| }, |
| { |
| "epoch": 1981.3598862019915, |
| "eval_loss": 2.619243621826172, |
| "eval_runtime": 8.852, |
| "eval_samples_per_second": 157.478, |
| "eval_steps_per_second": 19.77, |
| "step": 33700 |
| }, |
| { |
| "epoch": 1987.049786628734, |
| "grad_norm": 0.12899306416511536, |
| "learning_rate": 2.62876254180602e-06, |
| "loss": 2.6187, |
| "step": 33800 |
| }, |
| { |
| "epoch": 1987.049786628734, |
| "eval_loss": 2.6176578998565674, |
| "eval_runtime": 8.5349, |
| "eval_samples_per_second": 163.329, |
| "eval_steps_per_second": 20.504, |
| "step": 33800 |
| }, |
| { |
| "epoch": 1992.7396870554765, |
| "grad_norm": 0.13901114463806152, |
| "learning_rate": 2.6187290969899665e-06, |
| "loss": 2.619, |
| "step": 33900 |
| }, |
| { |
| "epoch": 1992.7396870554765, |
| "eval_loss": 2.6168863773345947, |
| "eval_runtime": 8.5326, |
| "eval_samples_per_second": 163.373, |
| "eval_steps_per_second": 20.509, |
| "step": 33900 |
| }, |
| { |
| "epoch": 1998.429587482219, |
| "grad_norm": 0.15283076465129852, |
| "learning_rate": 2.6086956521739132e-06, |
| "loss": 2.6185, |
| "step": 34000 |
| }, |
| { |
| "epoch": 1998.429587482219, |
| "eval_loss": 2.6182827949523926, |
| "eval_runtime": 8.8501, |
| "eval_samples_per_second": 157.512, |
| "eval_steps_per_second": 19.774, |
| "step": 34000 |
| }, |
| { |
| "epoch": 2004.1194879089617, |
| "grad_norm": 0.1270897537469864, |
| "learning_rate": 2.5986622073578596e-06, |
| "loss": 2.6191, |
| "step": 34100 |
| }, |
| { |
| "epoch": 2004.1194879089617, |
| "eval_loss": 2.616523265838623, |
| "eval_runtime": 8.545, |
| "eval_samples_per_second": 163.136, |
| "eval_steps_per_second": 20.48, |
| "step": 34100 |
| }, |
| { |
| "epoch": 2009.8093883357042, |
| "grad_norm": 0.11230363696813583, |
| "learning_rate": 2.588628762541806e-06, |
| "loss": 2.6187, |
| "step": 34200 |
| }, |
| { |
| "epoch": 2009.8093883357042, |
| "eval_loss": 2.619399309158325, |
| "eval_runtime": 8.8553, |
| "eval_samples_per_second": 157.42, |
| "eval_steps_per_second": 19.762, |
| "step": 34200 |
| }, |
| { |
| "epoch": 2015.4992887624467, |
| "grad_norm": 0.14034995436668396, |
| "learning_rate": 2.5785953177257527e-06, |
| "loss": 2.6187, |
| "step": 34300 |
| }, |
| { |
| "epoch": 2015.4992887624467, |
| "eval_loss": 2.6191158294677734, |
| "eval_runtime": 8.5383, |
| "eval_samples_per_second": 163.263, |
| "eval_steps_per_second": 20.496, |
| "step": 34300 |
| }, |
| { |
| "epoch": 2021.1891891891892, |
| "grad_norm": 0.14701803028583527, |
| "learning_rate": 2.568561872909699e-06, |
| "loss": 2.6183, |
| "step": 34400 |
| }, |
| { |
| "epoch": 2021.1891891891892, |
| "eval_loss": 2.620706796646118, |
| "eval_runtime": 8.5271, |
| "eval_samples_per_second": 163.479, |
| "eval_steps_per_second": 20.523, |
| "step": 34400 |
| }, |
| { |
| "epoch": 2026.8790896159317, |
| "grad_norm": 0.15207096934318542, |
| "learning_rate": 2.5585284280936454e-06, |
| "loss": 2.6189, |
| "step": 34500 |
| }, |
| { |
| "epoch": 2026.8790896159317, |
| "eval_loss": 2.619361400604248, |
| "eval_runtime": 8.5356, |
| "eval_samples_per_second": 163.316, |
| "eval_steps_per_second": 20.502, |
| "step": 34500 |
| }, |
| { |
| "epoch": 2032.5689900426742, |
| "grad_norm": 0.1416121870279312, |
| "learning_rate": 2.548494983277592e-06, |
| "loss": 2.6182, |
| "step": 34600 |
| }, |
| { |
| "epoch": 2032.5689900426742, |
| "eval_loss": 2.6184678077697754, |
| "eval_runtime": 8.8549, |
| "eval_samples_per_second": 157.428, |
| "eval_steps_per_second": 19.763, |
| "step": 34600 |
| }, |
| { |
| "epoch": 2038.2588904694169, |
| "grad_norm": 0.1294640153646469, |
| "learning_rate": 2.5384615384615385e-06, |
| "loss": 2.6185, |
| "step": 34700 |
| }, |
| { |
| "epoch": 2038.2588904694169, |
| "eval_loss": 2.618467330932617, |
| "eval_runtime": 8.5393, |
| "eval_samples_per_second": 163.246, |
| "eval_steps_per_second": 20.494, |
| "step": 34700 |
| }, |
| { |
| "epoch": 2043.9487908961594, |
| "grad_norm": 0.1140933409333229, |
| "learning_rate": 2.528428093645485e-06, |
| "loss": 2.618, |
| "step": 34800 |
| }, |
| { |
| "epoch": 2043.9487908961594, |
| "eval_loss": 2.6173062324523926, |
| "eval_runtime": 8.8708, |
| "eval_samples_per_second": 157.146, |
| "eval_steps_per_second": 19.728, |
| "step": 34800 |
| }, |
| { |
| "epoch": 2049.6386913229016, |
| "grad_norm": 0.15031367540359497, |
| "learning_rate": 2.5183946488294316e-06, |
| "loss": 2.6185, |
| "step": 34900 |
| }, |
| { |
| "epoch": 2049.6386913229016, |
| "eval_loss": 2.6177406311035156, |
| "eval_runtime": 8.5417, |
| "eval_samples_per_second": 163.2, |
| "eval_steps_per_second": 20.488, |
| "step": 34900 |
| }, |
| { |
| "epoch": 2055.3285917496446, |
| "grad_norm": 0.11632242053747177, |
| "learning_rate": 2.508361204013378e-06, |
| "loss": 2.6181, |
| "step": 35000 |
| }, |
| { |
| "epoch": 2055.3285917496446, |
| "eval_loss": 2.6203091144561768, |
| "eval_runtime": 8.8703, |
| "eval_samples_per_second": 157.154, |
| "eval_steps_per_second": 19.729, |
| "step": 35000 |
| }, |
| { |
| "epoch": 2061.018492176387, |
| "grad_norm": 0.14546014368534088, |
| "learning_rate": 2.4983277591973247e-06, |
| "loss": 2.6182, |
| "step": 35100 |
| }, |
| { |
| "epoch": 2061.018492176387, |
| "eval_loss": 2.6166837215423584, |
| "eval_runtime": 8.5363, |
| "eval_samples_per_second": 163.303, |
| "eval_steps_per_second": 20.501, |
| "step": 35100 |
| }, |
| { |
| "epoch": 2066.7083926031296, |
| "grad_norm": 0.15595249831676483, |
| "learning_rate": 2.488294314381271e-06, |
| "loss": 2.618, |
| "step": 35200 |
| }, |
| { |
| "epoch": 2066.7083926031296, |
| "eval_loss": 2.620990514755249, |
| "eval_runtime": 8.8529, |
| "eval_samples_per_second": 157.463, |
| "eval_steps_per_second": 19.768, |
| "step": 35200 |
| }, |
| { |
| "epoch": 2072.398293029872, |
| "grad_norm": 0.15020006895065308, |
| "learning_rate": 2.4782608695652173e-06, |
| "loss": 2.6181, |
| "step": 35300 |
| }, |
| { |
| "epoch": 2072.398293029872, |
| "eval_loss": 2.617979049682617, |
| "eval_runtime": 8.8415, |
| "eval_samples_per_second": 157.666, |
| "eval_steps_per_second": 19.793, |
| "step": 35300 |
| }, |
| { |
| "epoch": 2078.0881934566146, |
| "grad_norm": 0.12532344460487366, |
| "learning_rate": 2.468227424749164e-06, |
| "loss": 2.618, |
| "step": 35400 |
| }, |
| { |
| "epoch": 2078.0881934566146, |
| "eval_loss": 2.6182172298431396, |
| "eval_runtime": 8.8375, |
| "eval_samples_per_second": 157.737, |
| "eval_steps_per_second": 19.802, |
| "step": 35400 |
| }, |
| { |
| "epoch": 2083.778093883357, |
| "grad_norm": 0.13622809946537018, |
| "learning_rate": 2.4581939799331104e-06, |
| "loss": 2.6178, |
| "step": 35500 |
| }, |
| { |
| "epoch": 2083.778093883357, |
| "eval_loss": 2.6147806644439697, |
| "eval_runtime": 8.8411, |
| "eval_samples_per_second": 157.674, |
| "eval_steps_per_second": 19.794, |
| "step": 35500 |
| }, |
| { |
| "epoch": 2089.4679943100996, |
| "grad_norm": 0.11295317858457565, |
| "learning_rate": 2.4481605351170568e-06, |
| "loss": 2.6185, |
| "step": 35600 |
| }, |
| { |
| "epoch": 2089.4679943100996, |
| "eval_loss": 2.617539167404175, |
| "eval_runtime": 8.8516, |
| "eval_samples_per_second": 157.485, |
| "eval_steps_per_second": 19.77, |
| "step": 35600 |
| }, |
| { |
| "epoch": 2095.157894736842, |
| "grad_norm": 0.12703397870063782, |
| "learning_rate": 2.4381270903010035e-06, |
| "loss": 2.6178, |
| "step": 35700 |
| }, |
| { |
| "epoch": 2095.157894736842, |
| "eval_loss": 2.6192715167999268, |
| "eval_runtime": 8.8438, |
| "eval_samples_per_second": 157.624, |
| "eval_steps_per_second": 19.788, |
| "step": 35700 |
| }, |
| { |
| "epoch": 2100.8477951635846, |
| "grad_norm": 0.13047580420970917, |
| "learning_rate": 2.42809364548495e-06, |
| "loss": 2.6175, |
| "step": 35800 |
| }, |
| { |
| "epoch": 2100.8477951635846, |
| "eval_loss": 2.6149110794067383, |
| "eval_runtime": 8.5422, |
| "eval_samples_per_second": 163.189, |
| "eval_steps_per_second": 20.486, |
| "step": 35800 |
| }, |
| { |
| "epoch": 2106.537695590327, |
| "grad_norm": 0.1494310200214386, |
| "learning_rate": 2.4180602006688962e-06, |
| "loss": 2.6183, |
| "step": 35900 |
| }, |
| { |
| "epoch": 2106.537695590327, |
| "eval_loss": 2.617572069168091, |
| "eval_runtime": 8.5269, |
| "eval_samples_per_second": 163.483, |
| "eval_steps_per_second": 20.523, |
| "step": 35900 |
| }, |
| { |
| "epoch": 2112.2275960170696, |
| "grad_norm": 0.14913226664066315, |
| "learning_rate": 2.408026755852843e-06, |
| "loss": 2.6175, |
| "step": 36000 |
| }, |
| { |
| "epoch": 2112.2275960170696, |
| "eval_loss": 2.6157870292663574, |
| "eval_runtime": 8.5322, |
| "eval_samples_per_second": 163.381, |
| "eval_steps_per_second": 20.511, |
| "step": 36000 |
| }, |
| { |
| "epoch": 2117.917496443812, |
| "grad_norm": 0.12804996967315674, |
| "learning_rate": 2.3979933110367893e-06, |
| "loss": 2.6175, |
| "step": 36100 |
| }, |
| { |
| "epoch": 2117.917496443812, |
| "eval_loss": 2.6161787509918213, |
| "eval_runtime": 8.8464, |
| "eval_samples_per_second": 157.578, |
| "eval_steps_per_second": 19.782, |
| "step": 36100 |
| }, |
| { |
| "epoch": 2123.6073968705546, |
| "grad_norm": 0.1311938613653183, |
| "learning_rate": 2.387959866220736e-06, |
| "loss": 2.6177, |
| "step": 36200 |
| }, |
| { |
| "epoch": 2123.6073968705546, |
| "eval_loss": 2.6184916496276855, |
| "eval_runtime": 8.5276, |
| "eval_samples_per_second": 163.47, |
| "eval_steps_per_second": 20.522, |
| "step": 36200 |
| }, |
| { |
| "epoch": 2129.2972972972975, |
| "grad_norm": 0.14833857119083405, |
| "learning_rate": 2.3779264214046824e-06, |
| "loss": 2.618, |
| "step": 36300 |
| }, |
| { |
| "epoch": 2129.2972972972975, |
| "eval_loss": 2.616685628890991, |
| "eval_runtime": 8.5313, |
| "eval_samples_per_second": 163.399, |
| "eval_steps_per_second": 20.513, |
| "step": 36300 |
| }, |
| { |
| "epoch": 2134.98719772404, |
| "grad_norm": 0.14459851384162903, |
| "learning_rate": 2.3678929765886288e-06, |
| "loss": 2.6173, |
| "step": 36400 |
| }, |
| { |
| "epoch": 2134.98719772404, |
| "eval_loss": 2.6192727088928223, |
| "eval_runtime": 8.5314, |
| "eval_samples_per_second": 163.397, |
| "eval_steps_per_second": 20.513, |
| "step": 36400 |
| }, |
| { |
| "epoch": 2140.6770981507825, |
| "grad_norm": 0.12654992938041687, |
| "learning_rate": 2.3578595317725755e-06, |
| "loss": 2.6174, |
| "step": 36500 |
| }, |
| { |
| "epoch": 2140.6770981507825, |
| "eval_loss": 2.614757537841797, |
| "eval_runtime": 8.8498, |
| "eval_samples_per_second": 157.517, |
| "eval_steps_per_second": 19.774, |
| "step": 36500 |
| }, |
| { |
| "epoch": 2146.366998577525, |
| "grad_norm": 0.16258764266967773, |
| "learning_rate": 2.347826086956522e-06, |
| "loss": 2.618, |
| "step": 36600 |
| }, |
| { |
| "epoch": 2146.366998577525, |
| "eval_loss": 2.61818528175354, |
| "eval_runtime": 8.5339, |
| "eval_samples_per_second": 163.349, |
| "eval_steps_per_second": 20.507, |
| "step": 36600 |
| }, |
| { |
| "epoch": 2152.0568990042675, |
| "grad_norm": 0.1515471637248993, |
| "learning_rate": 2.337792642140468e-06, |
| "loss": 2.6177, |
| "step": 36700 |
| }, |
| { |
| "epoch": 2152.0568990042675, |
| "eval_loss": 2.6178441047668457, |
| "eval_runtime": 8.8414, |
| "eval_samples_per_second": 157.668, |
| "eval_steps_per_second": 19.793, |
| "step": 36700 |
| }, |
| { |
| "epoch": 2157.74679943101, |
| "grad_norm": 0.1283411979675293, |
| "learning_rate": 2.327759197324415e-06, |
| "loss": 2.6173, |
| "step": 36800 |
| }, |
| { |
| "epoch": 2157.74679943101, |
| "eval_loss": 2.6143412590026855, |
| "eval_runtime": 8.5345, |
| "eval_samples_per_second": 163.338, |
| "eval_steps_per_second": 20.505, |
| "step": 36800 |
| }, |
| { |
| "epoch": 2163.4366998577525, |
| "grad_norm": 0.13093768060207367, |
| "learning_rate": 2.3177257525083613e-06, |
| "loss": 2.6175, |
| "step": 36900 |
| }, |
| { |
| "epoch": 2163.4366998577525, |
| "eval_loss": 2.6168148517608643, |
| "eval_runtime": 8.5431, |
| "eval_samples_per_second": 163.172, |
| "eval_steps_per_second": 20.484, |
| "step": 36900 |
| }, |
| { |
| "epoch": 2169.126600284495, |
| "grad_norm": 0.12476625293493271, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 2.6174, |
| "step": 37000 |
| }, |
| { |
| "epoch": 2169.126600284495, |
| "eval_loss": 2.6173489093780518, |
| "eval_runtime": 8.8493, |
| "eval_samples_per_second": 157.526, |
| "eval_steps_per_second": 19.776, |
| "step": 37000 |
| }, |
| { |
| "epoch": 2174.8165007112375, |
| "grad_norm": 0.11948033422231674, |
| "learning_rate": 2.2976588628762544e-06, |
| "loss": 2.617, |
| "step": 37100 |
| }, |
| { |
| "epoch": 2174.8165007112375, |
| "eval_loss": 2.616060972213745, |
| "eval_runtime": 8.8543, |
| "eval_samples_per_second": 157.438, |
| "eval_steps_per_second": 19.764, |
| "step": 37100 |
| }, |
| { |
| "epoch": 2180.50640113798, |
| "grad_norm": 0.12949152290821075, |
| "learning_rate": 2.2876254180602008e-06, |
| "loss": 2.6175, |
| "step": 37200 |
| }, |
| { |
| "epoch": 2180.50640113798, |
| "eval_loss": 2.619767427444458, |
| "eval_runtime": 8.5433, |
| "eval_samples_per_second": 163.169, |
| "eval_steps_per_second": 20.484, |
| "step": 37200 |
| }, |
| { |
| "epoch": 2186.1963015647225, |
| "grad_norm": 0.14393049478530884, |
| "learning_rate": 2.2775919732441475e-06, |
| "loss": 2.6173, |
| "step": 37300 |
| }, |
| { |
| "epoch": 2186.1963015647225, |
| "eval_loss": 2.614513635635376, |
| "eval_runtime": 8.5464, |
| "eval_samples_per_second": 163.11, |
| "eval_steps_per_second": 20.477, |
| "step": 37300 |
| }, |
| { |
| "epoch": 2191.886201991465, |
| "grad_norm": 0.12848299741744995, |
| "learning_rate": 2.267558528428094e-06, |
| "loss": 2.6171, |
| "step": 37400 |
| }, |
| { |
| "epoch": 2191.886201991465, |
| "eval_loss": 2.615206241607666, |
| "eval_runtime": 8.8388, |
| "eval_samples_per_second": 157.714, |
| "eval_steps_per_second": 19.799, |
| "step": 37400 |
| }, |
| { |
| "epoch": 2197.5761024182075, |
| "grad_norm": 0.13800281286239624, |
| "learning_rate": 2.25752508361204e-06, |
| "loss": 2.6171, |
| "step": 37500 |
| }, |
| { |
| "epoch": 2197.5761024182075, |
| "eval_loss": 2.616520404815674, |
| "eval_runtime": 8.5307, |
| "eval_samples_per_second": 163.41, |
| "eval_steps_per_second": 20.514, |
| "step": 37500 |
| }, |
| { |
| "epoch": 2203.2660028449504, |
| "grad_norm": 0.1414160281419754, |
| "learning_rate": 2.2474916387959865e-06, |
| "loss": 2.617, |
| "step": 37600 |
| }, |
| { |
| "epoch": 2203.2660028449504, |
| "eval_loss": 2.617866039276123, |
| "eval_runtime": 8.8509, |
| "eval_samples_per_second": 157.499, |
| "eval_steps_per_second": 19.772, |
| "step": 37600 |
| }, |
| { |
| "epoch": 2208.955903271693, |
| "grad_norm": 0.129195898771286, |
| "learning_rate": 2.237458193979933e-06, |
| "loss": 2.617, |
| "step": 37700 |
| }, |
| { |
| "epoch": 2208.955903271693, |
| "eval_loss": 2.616370677947998, |
| "eval_runtime": 8.5271, |
| "eval_samples_per_second": 163.479, |
| "eval_steps_per_second": 20.523, |
| "step": 37700 |
| }, |
| { |
| "epoch": 2214.6458036984354, |
| "grad_norm": 0.12701831758022308, |
| "learning_rate": 2.2274247491638796e-06, |
| "loss": 2.6172, |
| "step": 37800 |
| }, |
| { |
| "epoch": 2214.6458036984354, |
| "eval_loss": 2.619422197341919, |
| "eval_runtime": 8.5388, |
| "eval_samples_per_second": 163.255, |
| "eval_steps_per_second": 20.495, |
| "step": 37800 |
| }, |
| { |
| "epoch": 2220.335704125178, |
| "grad_norm": 0.1434861570596695, |
| "learning_rate": 2.217391304347826e-06, |
| "loss": 2.6168, |
| "step": 37900 |
| }, |
| { |
| "epoch": 2220.335704125178, |
| "eval_loss": 2.6175920963287354, |
| "eval_runtime": 8.528, |
| "eval_samples_per_second": 163.461, |
| "eval_steps_per_second": 20.521, |
| "step": 37900 |
| }, |
| { |
| "epoch": 2226.0256045519204, |
| "grad_norm": 0.1319652646780014, |
| "learning_rate": 2.2073578595317723e-06, |
| "loss": 2.6169, |
| "step": 38000 |
| }, |
| { |
| "epoch": 2226.0256045519204, |
| "eval_loss": 2.6176187992095947, |
| "eval_runtime": 8.8424, |
| "eval_samples_per_second": 157.65, |
| "eval_steps_per_second": 19.791, |
| "step": 38000 |
| }, |
| { |
| "epoch": 2231.715504978663, |
| "grad_norm": 0.13358598947525024, |
| "learning_rate": 2.197324414715719e-06, |
| "loss": 2.6167, |
| "step": 38100 |
| }, |
| { |
| "epoch": 2231.715504978663, |
| "eval_loss": 2.616727828979492, |
| "eval_runtime": 8.5384, |
| "eval_samples_per_second": 163.263, |
| "eval_steps_per_second": 20.496, |
| "step": 38100 |
| }, |
| { |
| "epoch": 2237.4054054054054, |
| "grad_norm": 0.12551608681678772, |
| "learning_rate": 2.1872909698996654e-06, |
| "loss": 2.617, |
| "step": 38200 |
| }, |
| { |
| "epoch": 2237.4054054054054, |
| "eval_loss": 2.616206645965576, |
| "eval_runtime": 8.8419, |
| "eval_samples_per_second": 157.658, |
| "eval_steps_per_second": 19.792, |
| "step": 38200 |
| }, |
| { |
| "epoch": 2243.095305832148, |
| "grad_norm": 0.1412065029144287, |
| "learning_rate": 2.177257525083612e-06, |
| "loss": 2.6172, |
| "step": 38300 |
| }, |
| { |
| "epoch": 2243.095305832148, |
| "eval_loss": 2.618215799331665, |
| "eval_runtime": 8.54, |
| "eval_samples_per_second": 163.232, |
| "eval_steps_per_second": 20.492, |
| "step": 38300 |
| }, |
| { |
| "epoch": 2248.7852062588904, |
| "grad_norm": 0.16305094957351685, |
| "learning_rate": 2.1672240802675585e-06, |
| "loss": 2.6166, |
| "step": 38400 |
| }, |
| { |
| "epoch": 2248.7852062588904, |
| "eval_loss": 2.618960380554199, |
| "eval_runtime": 8.5327, |
| "eval_samples_per_second": 163.371, |
| "eval_steps_per_second": 20.509, |
| "step": 38400 |
| }, |
| { |
| "epoch": 2254.475106685633, |
| "grad_norm": 0.14737871289253235, |
| "learning_rate": 2.157190635451505e-06, |
| "loss": 2.6165, |
| "step": 38500 |
| }, |
| { |
| "epoch": 2254.475106685633, |
| "eval_loss": 2.618856906890869, |
| "eval_runtime": 8.8618, |
| "eval_samples_per_second": 157.305, |
| "eval_steps_per_second": 19.748, |
| "step": 38500 |
| }, |
| { |
| "epoch": 2260.1650071123754, |
| "grad_norm": 0.11627591401338577, |
| "learning_rate": 2.1471571906354516e-06, |
| "loss": 2.6169, |
| "step": 38600 |
| }, |
| { |
| "epoch": 2260.1650071123754, |
| "eval_loss": 2.6156229972839355, |
| "eval_runtime": 8.5295, |
| "eval_samples_per_second": 163.432, |
| "eval_steps_per_second": 20.517, |
| "step": 38600 |
| }, |
| { |
| "epoch": 2265.854907539118, |
| "grad_norm": 0.1361280232667923, |
| "learning_rate": 2.137123745819398e-06, |
| "loss": 2.6168, |
| "step": 38700 |
| }, |
| { |
| "epoch": 2265.854907539118, |
| "eval_loss": 2.6178250312805176, |
| "eval_runtime": 8.8561, |
| "eval_samples_per_second": 157.405, |
| "eval_steps_per_second": 19.76, |
| "step": 38700 |
| }, |
| { |
| "epoch": 2271.5448079658604, |
| "grad_norm": 0.13634426891803741, |
| "learning_rate": 2.1270903010033443e-06, |
| "loss": 2.6168, |
| "step": 38800 |
| }, |
| { |
| "epoch": 2271.5448079658604, |
| "eval_loss": 2.620987892150879, |
| "eval_runtime": 8.542, |
| "eval_samples_per_second": 163.194, |
| "eval_steps_per_second": 20.487, |
| "step": 38800 |
| }, |
| { |
| "epoch": 2277.2347083926034, |
| "grad_norm": 0.11851690709590912, |
| "learning_rate": 2.117056856187291e-06, |
| "loss": 2.6169, |
| "step": 38900 |
| }, |
| { |
| "epoch": 2277.2347083926034, |
| "eval_loss": 2.6175451278686523, |
| "eval_runtime": 8.5298, |
| "eval_samples_per_second": 163.428, |
| "eval_steps_per_second": 20.516, |
| "step": 38900 |
| }, |
| { |
| "epoch": 2282.924608819346, |
| "grad_norm": 0.15516361594200134, |
| "learning_rate": 2.1070234113712374e-06, |
| "loss": 2.6164, |
| "step": 39000 |
| }, |
| { |
| "epoch": 2282.924608819346, |
| "eval_loss": 2.6171224117279053, |
| "eval_runtime": 8.5317, |
| "eval_samples_per_second": 163.391, |
| "eval_steps_per_second": 20.512, |
| "step": 39000 |
| }, |
| { |
| "epoch": 2288.6145092460883, |
| "grad_norm": 0.14551801979541779, |
| "learning_rate": 2.0969899665551837e-06, |
| "loss": 2.6166, |
| "step": 39100 |
| }, |
| { |
| "epoch": 2288.6145092460883, |
| "eval_loss": 2.618269920349121, |
| "eval_runtime": 8.8607, |
| "eval_samples_per_second": 157.324, |
| "eval_steps_per_second": 19.75, |
| "step": 39100 |
| }, |
| { |
| "epoch": 2294.304409672831, |
| "grad_norm": 0.13568130135536194, |
| "learning_rate": 2.0869565217391305e-06, |
| "loss": 2.6162, |
| "step": 39200 |
| }, |
| { |
| "epoch": 2294.304409672831, |
| "eval_loss": 2.6169166564941406, |
| "eval_runtime": 8.5331, |
| "eval_samples_per_second": 163.363, |
| "eval_steps_per_second": 20.508, |
| "step": 39200 |
| }, |
| { |
| "epoch": 2299.9943100995733, |
| "grad_norm": 0.1397295743227005, |
| "learning_rate": 2.076923076923077e-06, |
| "loss": 2.6165, |
| "step": 39300 |
| }, |
| { |
| "epoch": 2299.9943100995733, |
| "eval_loss": 2.617077589035034, |
| "eval_runtime": 8.8589, |
| "eval_samples_per_second": 157.356, |
| "eval_steps_per_second": 19.754, |
| "step": 39300 |
| }, |
| { |
| "epoch": 2305.684210526316, |
| "grad_norm": 0.1272270530462265, |
| "learning_rate": 2.0668896321070236e-06, |
| "loss": 2.6167, |
| "step": 39400 |
| }, |
| { |
| "epoch": 2305.684210526316, |
| "eval_loss": 2.6153969764709473, |
| "eval_runtime": 8.5514, |
| "eval_samples_per_second": 163.015, |
| "eval_steps_per_second": 20.465, |
| "step": 39400 |
| }, |
| { |
| "epoch": 2311.3741109530583, |
| "grad_norm": 0.13360774517059326, |
| "learning_rate": 2.05685618729097e-06, |
| "loss": 2.616, |
| "step": 39500 |
| }, |
| { |
| "epoch": 2311.3741109530583, |
| "eval_loss": 2.6174352169036865, |
| "eval_runtime": 8.851, |
| "eval_samples_per_second": 157.496, |
| "eval_steps_per_second": 19.772, |
| "step": 39500 |
| }, |
| { |
| "epoch": 2317.064011379801, |
| "grad_norm": 0.14483892917633057, |
| "learning_rate": 2.0468227424749163e-06, |
| "loss": 2.6158, |
| "step": 39600 |
| }, |
| { |
| "epoch": 2317.064011379801, |
| "eval_loss": 2.617931842803955, |
| "eval_runtime": 8.5303, |
| "eval_samples_per_second": 163.418, |
| "eval_steps_per_second": 20.515, |
| "step": 39600 |
| }, |
| { |
| "epoch": 2322.7539118065433, |
| "grad_norm": 0.12557685375213623, |
| "learning_rate": 2.036789297658863e-06, |
| "loss": 2.6163, |
| "step": 39700 |
| }, |
| { |
| "epoch": 2322.7539118065433, |
| "eval_loss": 2.616457462310791, |
| "eval_runtime": 8.8454, |
| "eval_samples_per_second": 157.596, |
| "eval_steps_per_second": 19.784, |
| "step": 39700 |
| }, |
| { |
| "epoch": 2328.443812233286, |
| "grad_norm": 0.14481040835380554, |
| "learning_rate": 2.0267558528428094e-06, |
| "loss": 2.6161, |
| "step": 39800 |
| }, |
| { |
| "epoch": 2328.443812233286, |
| "eval_loss": 2.6148922443389893, |
| "eval_runtime": 8.5419, |
| "eval_samples_per_second": 163.195, |
| "eval_steps_per_second": 20.487, |
| "step": 39800 |
| }, |
| { |
| "epoch": 2334.1337126600283, |
| "grad_norm": 0.1371890753507614, |
| "learning_rate": 2.0167224080267557e-06, |
| "loss": 2.6156, |
| "step": 39900 |
| }, |
| { |
| "epoch": 2334.1337126600283, |
| "eval_loss": 2.6165366172790527, |
| "eval_runtime": 8.5312, |
| "eval_samples_per_second": 163.401, |
| "eval_steps_per_second": 20.513, |
| "step": 39900 |
| }, |
| { |
| "epoch": 2339.823613086771, |
| "grad_norm": 0.11908498406410217, |
| "learning_rate": 2.0066889632107025e-06, |
| "loss": 2.6161, |
| "step": 40000 |
| }, |
| { |
| "epoch": 2339.823613086771, |
| "eval_loss": 2.6168572902679443, |
| "eval_runtime": 8.5312, |
| "eval_samples_per_second": 163.4, |
| "eval_steps_per_second": 20.513, |
| "step": 40000 |
| }, |
| { |
| "epoch": 2345.5135135135133, |
| "grad_norm": 0.15776848793029785, |
| "learning_rate": 1.996655518394649e-06, |
| "loss": 2.6161, |
| "step": 40100 |
| }, |
| { |
| "epoch": 2345.5135135135133, |
| "eval_loss": 2.61922550201416, |
| "eval_runtime": 8.8387, |
| "eval_samples_per_second": 157.716, |
| "eval_steps_per_second": 19.799, |
| "step": 40100 |
| }, |
| { |
| "epoch": 2351.2034139402563, |
| "grad_norm": 0.13650420308113098, |
| "learning_rate": 1.986622073578595e-06, |
| "loss": 2.6157, |
| "step": 40200 |
| }, |
| { |
| "epoch": 2351.2034139402563, |
| "eval_loss": 2.6171460151672363, |
| "eval_runtime": 8.8588, |
| "eval_samples_per_second": 157.357, |
| "eval_steps_per_second": 19.754, |
| "step": 40200 |
| }, |
| { |
| "epoch": 2356.8933143669988, |
| "grad_norm": 0.14394904673099518, |
| "learning_rate": 1.976588628762542e-06, |
| "loss": 2.6156, |
| "step": 40300 |
| }, |
| { |
| "epoch": 2356.8933143669988, |
| "eval_loss": 2.617033004760742, |
| "eval_runtime": 8.5275, |
| "eval_samples_per_second": 163.472, |
| "eval_steps_per_second": 20.522, |
| "step": 40300 |
| }, |
| { |
| "epoch": 2362.5832147937413, |
| "grad_norm": 0.14980724453926086, |
| "learning_rate": 1.9665551839464883e-06, |
| "loss": 2.6163, |
| "step": 40400 |
| }, |
| { |
| "epoch": 2362.5832147937413, |
| "eval_loss": 2.614140510559082, |
| "eval_runtime": 8.5229, |
| "eval_samples_per_second": 163.56, |
| "eval_steps_per_second": 20.533, |
| "step": 40400 |
| }, |
| { |
| "epoch": 2368.2731152204838, |
| "grad_norm": 0.13233982026576996, |
| "learning_rate": 1.956521739130435e-06, |
| "loss": 2.6156, |
| "step": 40500 |
| }, |
| { |
| "epoch": 2368.2731152204838, |
| "eval_loss": 2.615586042404175, |
| "eval_runtime": 8.847, |
| "eval_samples_per_second": 157.568, |
| "eval_steps_per_second": 19.781, |
| "step": 40500 |
| }, |
| { |
| "epoch": 2373.9630156472263, |
| "grad_norm": 0.13586369156837463, |
| "learning_rate": 1.9464882943143814e-06, |
| "loss": 2.6159, |
| "step": 40600 |
| }, |
| { |
| "epoch": 2373.9630156472263, |
| "eval_loss": 2.61820650100708, |
| "eval_runtime": 8.8392, |
| "eval_samples_per_second": 157.707, |
| "eval_steps_per_second": 19.798, |
| "step": 40600 |
| }, |
| { |
| "epoch": 2379.6529160739688, |
| "grad_norm": 0.13869047164916992, |
| "learning_rate": 1.9364548494983277e-06, |
| "loss": 2.6152, |
| "step": 40700 |
| }, |
| { |
| "epoch": 2379.6529160739688, |
| "eval_loss": 2.614039897918701, |
| "eval_runtime": 8.5304, |
| "eval_samples_per_second": 163.415, |
| "eval_steps_per_second": 20.515, |
| "step": 40700 |
| }, |
| { |
| "epoch": 2385.3428165007113, |
| "grad_norm": 0.1269962042570114, |
| "learning_rate": 1.9264214046822745e-06, |
| "loss": 2.6152, |
| "step": 40800 |
| }, |
| { |
| "epoch": 2385.3428165007113, |
| "eval_loss": 2.614192247390747, |
| "eval_runtime": 8.5419, |
| "eval_samples_per_second": 163.195, |
| "eval_steps_per_second": 20.487, |
| "step": 40800 |
| }, |
| { |
| "epoch": 2391.0327169274537, |
| "grad_norm": 0.14708365499973297, |
| "learning_rate": 1.916387959866221e-06, |
| "loss": 2.6155, |
| "step": 40900 |
| }, |
| { |
| "epoch": 2391.0327169274537, |
| "eval_loss": 2.615812301635742, |
| "eval_runtime": 8.8721, |
| "eval_samples_per_second": 157.122, |
| "eval_steps_per_second": 19.725, |
| "step": 40900 |
| }, |
| { |
| "epoch": 2396.7226173541962, |
| "grad_norm": 0.11788502335548401, |
| "learning_rate": 1.9063545150501674e-06, |
| "loss": 2.6158, |
| "step": 41000 |
| }, |
| { |
| "epoch": 2396.7226173541962, |
| "eval_loss": 2.615337610244751, |
| "eval_runtime": 8.8548, |
| "eval_samples_per_second": 157.429, |
| "eval_steps_per_second": 19.763, |
| "step": 41000 |
| }, |
| { |
| "epoch": 2402.4125177809387, |
| "grad_norm": 0.14130190014839172, |
| "learning_rate": 1.896321070234114e-06, |
| "loss": 2.6153, |
| "step": 41100 |
| }, |
| { |
| "epoch": 2402.4125177809387, |
| "eval_loss": 2.6168124675750732, |
| "eval_runtime": 8.5314, |
| "eval_samples_per_second": 163.396, |
| "eval_steps_per_second": 20.512, |
| "step": 41100 |
| }, |
| { |
| "epoch": 2408.1024182076812, |
| "grad_norm": 0.14463502168655396, |
| "learning_rate": 1.8862876254180603e-06, |
| "loss": 2.6155, |
| "step": 41200 |
| }, |
| { |
| "epoch": 2408.1024182076812, |
| "eval_loss": 2.6179542541503906, |
| "eval_runtime": 8.5339, |
| "eval_samples_per_second": 163.348, |
| "eval_steps_per_second": 20.506, |
| "step": 41200 |
| }, |
| { |
| "epoch": 2413.7923186344237, |
| "grad_norm": 0.12708818912506104, |
| "learning_rate": 1.8762541806020068e-06, |
| "loss": 2.6155, |
| "step": 41300 |
| }, |
| { |
| "epoch": 2413.7923186344237, |
| "eval_loss": 2.616238832473755, |
| "eval_runtime": 8.5448, |
| "eval_samples_per_second": 163.139, |
| "eval_steps_per_second": 20.48, |
| "step": 41300 |
| }, |
| { |
| "epoch": 2419.4822190611662, |
| "grad_norm": 0.1303997039794922, |
| "learning_rate": 1.8662207357859534e-06, |
| "loss": 2.616, |
| "step": 41400 |
| }, |
| { |
| "epoch": 2419.4822190611662, |
| "eval_loss": 2.615610122680664, |
| "eval_runtime": 8.8552, |
| "eval_samples_per_second": 157.421, |
| "eval_steps_per_second": 19.762, |
| "step": 41400 |
| }, |
| { |
| "epoch": 2425.172119487909, |
| "grad_norm": 0.14887328445911407, |
| "learning_rate": 1.8561872909699e-06, |
| "loss": 2.6158, |
| "step": 41500 |
| }, |
| { |
| "epoch": 2425.172119487909, |
| "eval_loss": 2.6136879920959473, |
| "eval_runtime": 8.5262, |
| "eval_samples_per_second": 163.496, |
| "eval_steps_per_second": 20.525, |
| "step": 41500 |
| }, |
| { |
| "epoch": 2430.8620199146517, |
| "grad_norm": 0.12649740278720856, |
| "learning_rate": 1.8461538461538462e-06, |
| "loss": 2.6156, |
| "step": 41600 |
| }, |
| { |
| "epoch": 2430.8620199146517, |
| "eval_loss": 2.6171023845672607, |
| "eval_runtime": 8.8681, |
| "eval_samples_per_second": 157.192, |
| "eval_steps_per_second": 19.734, |
| "step": 41600 |
| }, |
| { |
| "epoch": 2436.551920341394, |
| "grad_norm": 0.14125467836856842, |
| "learning_rate": 1.8361204013377928e-06, |
| "loss": 2.6152, |
| "step": 41700 |
| }, |
| { |
| "epoch": 2436.551920341394, |
| "eval_loss": 2.6158018112182617, |
| "eval_runtime": 8.5206, |
| "eval_samples_per_second": 163.604, |
| "eval_steps_per_second": 20.538, |
| "step": 41700 |
| }, |
| { |
| "epoch": 2442.2418207681367, |
| "grad_norm": 0.12283240258693695, |
| "learning_rate": 1.8260869565217394e-06, |
| "loss": 2.6159, |
| "step": 41800 |
| }, |
| { |
| "epoch": 2442.2418207681367, |
| "eval_loss": 2.6169424057006836, |
| "eval_runtime": 8.5518, |
| "eval_samples_per_second": 163.007, |
| "eval_steps_per_second": 20.464, |
| "step": 41800 |
| }, |
| { |
| "epoch": 2447.931721194879, |
| "grad_norm": 0.15379033982753754, |
| "learning_rate": 1.8160535117056857e-06, |
| "loss": 2.6152, |
| "step": 41900 |
| }, |
| { |
| "epoch": 2447.931721194879, |
| "eval_loss": 2.6155290603637695, |
| "eval_runtime": 8.8698, |
| "eval_samples_per_second": 157.162, |
| "eval_steps_per_second": 19.73, |
| "step": 41900 |
| }, |
| { |
| "epoch": 2453.6216216216217, |
| "grad_norm": 0.15148812532424927, |
| "learning_rate": 1.8060200668896322e-06, |
| "loss": 2.6152, |
| "step": 42000 |
| }, |
| { |
| "epoch": 2453.6216216216217, |
| "eval_loss": 2.6144700050354004, |
| "eval_runtime": 8.5457, |
| "eval_samples_per_second": 163.122, |
| "eval_steps_per_second": 20.478, |
| "step": 42000 |
| }, |
| { |
| "epoch": 2459.311522048364, |
| "grad_norm": 0.1490088701248169, |
| "learning_rate": 1.7959866220735788e-06, |
| "loss": 2.615, |
| "step": 42100 |
| }, |
| { |
| "epoch": 2459.311522048364, |
| "eval_loss": 2.6168768405914307, |
| "eval_runtime": 8.5451, |
| "eval_samples_per_second": 163.135, |
| "eval_steps_per_second": 20.48, |
| "step": 42100 |
| }, |
| { |
| "epoch": 2465.0014224751067, |
| "grad_norm": 0.11491715162992477, |
| "learning_rate": 1.7859531772575253e-06, |
| "loss": 2.6157, |
| "step": 42200 |
| }, |
| { |
| "epoch": 2465.0014224751067, |
| "eval_loss": 2.6155242919921875, |
| "eval_runtime": 8.5538, |
| "eval_samples_per_second": 162.969, |
| "eval_steps_per_second": 20.459, |
| "step": 42200 |
| }, |
| { |
| "epoch": 2470.691322901849, |
| "grad_norm": 0.15772178769111633, |
| "learning_rate": 1.7759197324414717e-06, |
| "loss": 2.6153, |
| "step": 42300 |
| }, |
| { |
| "epoch": 2470.691322901849, |
| "eval_loss": 2.613830804824829, |
| "eval_runtime": 8.8766, |
| "eval_samples_per_second": 157.041, |
| "eval_steps_per_second": 19.715, |
| "step": 42300 |
| }, |
| { |
| "epoch": 2476.3812233285917, |
| "grad_norm": 0.13534432649612427, |
| "learning_rate": 1.7658862876254182e-06, |
| "loss": 2.6149, |
| "step": 42400 |
| }, |
| { |
| "epoch": 2476.3812233285917, |
| "eval_loss": 2.6143946647644043, |
| "eval_runtime": 8.5381, |
| "eval_samples_per_second": 163.268, |
| "eval_steps_per_second": 20.496, |
| "step": 42400 |
| }, |
| { |
| "epoch": 2482.071123755334, |
| "grad_norm": 0.11993639171123505, |
| "learning_rate": 1.7558528428093648e-06, |
| "loss": 2.6154, |
| "step": 42500 |
| }, |
| { |
| "epoch": 2482.071123755334, |
| "eval_loss": 2.6130027770996094, |
| "eval_runtime": 8.5295, |
| "eval_samples_per_second": 163.434, |
| "eval_steps_per_second": 20.517, |
| "step": 42500 |
| }, |
| { |
| "epoch": 2487.7610241820767, |
| "grad_norm": 0.12379685789346695, |
| "learning_rate": 1.745819397993311e-06, |
| "loss": 2.6152, |
| "step": 42600 |
| }, |
| { |
| "epoch": 2487.7610241820767, |
| "eval_loss": 2.616774082183838, |
| "eval_runtime": 8.854, |
| "eval_samples_per_second": 157.444, |
| "eval_steps_per_second": 19.765, |
| "step": 42600 |
| }, |
| { |
| "epoch": 2493.450924608819, |
| "grad_norm": 0.11662384122610092, |
| "learning_rate": 1.7357859531772575e-06, |
| "loss": 2.6152, |
| "step": 42700 |
| }, |
| { |
| "epoch": 2493.450924608819, |
| "eval_loss": 2.6169705390930176, |
| "eval_runtime": 8.5399, |
| "eval_samples_per_second": 163.234, |
| "eval_steps_per_second": 20.492, |
| "step": 42700 |
| }, |
| { |
| "epoch": 2499.140825035562, |
| "grad_norm": 0.13475127518177032, |
| "learning_rate": 1.7257525083612038e-06, |
| "loss": 2.6153, |
| "step": 42800 |
| }, |
| { |
| "epoch": 2499.140825035562, |
| "eval_loss": 2.6149654388427734, |
| "eval_runtime": 8.5276, |
| "eval_samples_per_second": 163.468, |
| "eval_steps_per_second": 20.522, |
| "step": 42800 |
| }, |
| { |
| "epoch": 2504.8307254623046, |
| "grad_norm": 0.12163935601711273, |
| "learning_rate": 1.7157190635451504e-06, |
| "loss": 2.6146, |
| "step": 42900 |
| }, |
| { |
| "epoch": 2504.8307254623046, |
| "eval_loss": 2.616426467895508, |
| "eval_runtime": 8.5341, |
| "eval_samples_per_second": 163.345, |
| "eval_steps_per_second": 20.506, |
| "step": 42900 |
| }, |
| { |
| "epoch": 2510.520625889047, |
| "grad_norm": 0.12904202938079834, |
| "learning_rate": 1.705685618729097e-06, |
| "loss": 2.615, |
| "step": 43000 |
| }, |
| { |
| "epoch": 2510.520625889047, |
| "eval_loss": 2.6142711639404297, |
| "eval_runtime": 8.8632, |
| "eval_samples_per_second": 157.279, |
| "eval_steps_per_second": 19.744, |
| "step": 43000 |
| }, |
| { |
| "epoch": 2516.2105263157896, |
| "grad_norm": 0.14409850537776947, |
| "learning_rate": 1.6956521739130435e-06, |
| "loss": 2.615, |
| "step": 43100 |
| }, |
| { |
| "epoch": 2516.2105263157896, |
| "eval_loss": 2.6173369884490967, |
| "eval_runtime": 8.517, |
| "eval_samples_per_second": 163.673, |
| "eval_steps_per_second": 20.547, |
| "step": 43100 |
| }, |
| { |
| "epoch": 2521.900426742532, |
| "grad_norm": 0.12942758202552795, |
| "learning_rate": 1.6856187290969898e-06, |
| "loss": 2.6147, |
| "step": 43200 |
| }, |
| { |
| "epoch": 2521.900426742532, |
| "eval_loss": 2.6177051067352295, |
| "eval_runtime": 8.8403, |
| "eval_samples_per_second": 157.688, |
| "eval_steps_per_second": 19.796, |
| "step": 43200 |
| }, |
| { |
| "epoch": 2527.5903271692746, |
| "grad_norm": 0.14761574566364288, |
| "learning_rate": 1.6755852842809363e-06, |
| "loss": 2.6143, |
| "step": 43300 |
| }, |
| { |
| "epoch": 2527.5903271692746, |
| "eval_loss": 2.6154208183288574, |
| "eval_runtime": 8.5203, |
| "eval_samples_per_second": 163.609, |
| "eval_steps_per_second": 20.539, |
| "step": 43300 |
| }, |
| { |
| "epoch": 2533.280227596017, |
| "grad_norm": 0.1361926943063736, |
| "learning_rate": 1.665551839464883e-06, |
| "loss": 2.615, |
| "step": 43400 |
| }, |
| { |
| "epoch": 2533.280227596017, |
| "eval_loss": 2.617976188659668, |
| "eval_runtime": 8.5489, |
| "eval_samples_per_second": 163.062, |
| "eval_steps_per_second": 20.47, |
| "step": 43400 |
| }, |
| { |
| "epoch": 2538.9701280227596, |
| "grad_norm": 0.1490316092967987, |
| "learning_rate": 1.6555183946488294e-06, |
| "loss": 2.6146, |
| "step": 43500 |
| }, |
| { |
| "epoch": 2538.9701280227596, |
| "eval_loss": 2.616652250289917, |
| "eval_runtime": 8.5376, |
| "eval_samples_per_second": 163.279, |
| "eval_steps_per_second": 20.498, |
| "step": 43500 |
| }, |
| { |
| "epoch": 2544.660028449502, |
| "grad_norm": 0.13588373363018036, |
| "learning_rate": 1.6454849498327758e-06, |
| "loss": 2.6152, |
| "step": 43600 |
| }, |
| { |
| "epoch": 2544.660028449502, |
| "eval_loss": 2.6177802085876465, |
| "eval_runtime": 8.8667, |
| "eval_samples_per_second": 157.217, |
| "eval_steps_per_second": 19.737, |
| "step": 43600 |
| }, |
| { |
| "epoch": 2550.3499288762446, |
| "grad_norm": 0.12654942274093628, |
| "learning_rate": 1.6354515050167223e-06, |
| "loss": 2.6146, |
| "step": 43700 |
| }, |
| { |
| "epoch": 2550.3499288762446, |
| "eval_loss": 2.615847110748291, |
| "eval_runtime": 8.5335, |
| "eval_samples_per_second": 163.356, |
| "eval_steps_per_second": 20.507, |
| "step": 43700 |
| }, |
| { |
| "epoch": 2556.039829302987, |
| "grad_norm": 0.15947924554347992, |
| "learning_rate": 1.6254180602006689e-06, |
| "loss": 2.6149, |
| "step": 43800 |
| }, |
| { |
| "epoch": 2556.039829302987, |
| "eval_loss": 2.613116502761841, |
| "eval_runtime": 8.5279, |
| "eval_samples_per_second": 163.464, |
| "eval_steps_per_second": 20.521, |
| "step": 43800 |
| }, |
| { |
| "epoch": 2561.7297297297296, |
| "grad_norm": 0.11915856599807739, |
| "learning_rate": 1.6153846153846154e-06, |
| "loss": 2.6146, |
| "step": 43900 |
| }, |
| { |
| "epoch": 2561.7297297297296, |
| "eval_loss": 2.614288568496704, |
| "eval_runtime": 8.8597, |
| "eval_samples_per_second": 157.342, |
| "eval_steps_per_second": 19.752, |
| "step": 43900 |
| }, |
| { |
| "epoch": 2567.419630156472, |
| "grad_norm": 0.1312067210674286, |
| "learning_rate": 1.6053511705685618e-06, |
| "loss": 2.6147, |
| "step": 44000 |
| }, |
| { |
| "epoch": 2567.419630156472, |
| "eval_loss": 2.6091578006744385, |
| "eval_runtime": 8.5457, |
| "eval_samples_per_second": 163.122, |
| "eval_steps_per_second": 20.478, |
| "step": 44000 |
| }, |
| { |
| "epoch": 2573.109530583215, |
| "grad_norm": 0.14233353734016418, |
| "learning_rate": 1.5953177257525083e-06, |
| "loss": 2.6148, |
| "step": 44100 |
| }, |
| { |
| "epoch": 2573.109530583215, |
| "eval_loss": 2.612126111984253, |
| "eval_runtime": 8.5355, |
| "eval_samples_per_second": 163.318, |
| "eval_steps_per_second": 20.503, |
| "step": 44100 |
| }, |
| { |
| "epoch": 2578.7994310099575, |
| "grad_norm": 0.1357184797525406, |
| "learning_rate": 1.5852842809364549e-06, |
| "loss": 2.6149, |
| "step": 44200 |
| }, |
| { |
| "epoch": 2578.7994310099575, |
| "eval_loss": 2.618696928024292, |
| "eval_runtime": 8.8741, |
| "eval_samples_per_second": 157.087, |
| "eval_steps_per_second": 19.72, |
| "step": 44200 |
| }, |
| { |
| "epoch": 2584.4893314367, |
| "grad_norm": 0.14556884765625, |
| "learning_rate": 1.5752508361204012e-06, |
| "loss": 2.6142, |
| "step": 44300 |
| }, |
| { |
| "epoch": 2584.4893314367, |
| "eval_loss": 2.616926908493042, |
| "eval_runtime": 8.5359, |
| "eval_samples_per_second": 163.311, |
| "eval_steps_per_second": 20.502, |
| "step": 44300 |
| }, |
| { |
| "epoch": 2590.1792318634425, |
| "grad_norm": 0.12908801436424255, |
| "learning_rate": 1.5652173913043478e-06, |
| "loss": 2.6145, |
| "step": 44400 |
| }, |
| { |
| "epoch": 2590.1792318634425, |
| "eval_loss": 2.6157069206237793, |
| "eval_runtime": 8.8877, |
| "eval_samples_per_second": 156.846, |
| "eval_steps_per_second": 19.69, |
| "step": 44400 |
| }, |
| { |
| "epoch": 2595.869132290185, |
| "grad_norm": 0.14168845117092133, |
| "learning_rate": 1.5551839464882943e-06, |
| "loss": 2.6146, |
| "step": 44500 |
| }, |
| { |
| "epoch": 2595.869132290185, |
| "eval_loss": 2.615161657333374, |
| "eval_runtime": 8.5305, |
| "eval_samples_per_second": 163.413, |
| "eval_steps_per_second": 20.515, |
| "step": 44500 |
| }, |
| { |
| "epoch": 2601.5590327169275, |
| "grad_norm": 0.13634611666202545, |
| "learning_rate": 1.5451505016722409e-06, |
| "loss": 2.6146, |
| "step": 44600 |
| }, |
| { |
| "epoch": 2601.5590327169275, |
| "eval_loss": 2.6135544776916504, |
| "eval_runtime": 8.54, |
| "eval_samples_per_second": 163.233, |
| "eval_steps_per_second": 20.492, |
| "step": 44600 |
| }, |
| { |
| "epoch": 2607.24893314367, |
| "grad_norm": 0.14684821665287018, |
| "learning_rate": 1.5351170568561872e-06, |
| "loss": 2.6149, |
| "step": 44700 |
| }, |
| { |
| "epoch": 2607.24893314367, |
| "eval_loss": 2.616076707839966, |
| "eval_runtime": 8.517, |
| "eval_samples_per_second": 163.672, |
| "eval_steps_per_second": 20.547, |
| "step": 44700 |
| }, |
| { |
| "epoch": 2612.9388335704125, |
| "grad_norm": 0.14135567843914032, |
| "learning_rate": 1.5250836120401338e-06, |
| "loss": 2.6142, |
| "step": 44800 |
| }, |
| { |
| "epoch": 2612.9388335704125, |
| "eval_loss": 2.6159145832061768, |
| "eval_runtime": 8.8492, |
| "eval_samples_per_second": 157.529, |
| "eval_steps_per_second": 19.776, |
| "step": 44800 |
| }, |
| { |
| "epoch": 2618.628733997155, |
| "grad_norm": 0.1256554275751114, |
| "learning_rate": 1.5150501672240803e-06, |
| "loss": 2.6142, |
| "step": 44900 |
| }, |
| { |
| "epoch": 2618.628733997155, |
| "eval_loss": 2.6102206707000732, |
| "eval_runtime": 8.5351, |
| "eval_samples_per_second": 163.325, |
| "eval_steps_per_second": 20.503, |
| "step": 44900 |
| }, |
| { |
| "epoch": 2624.3186344238975, |
| "grad_norm": 0.12723155319690704, |
| "learning_rate": 1.5050167224080269e-06, |
| "loss": 2.614, |
| "step": 45000 |
| }, |
| { |
| "epoch": 2624.3186344238975, |
| "eval_loss": 2.6176397800445557, |
| "eval_runtime": 8.5392, |
| "eval_samples_per_second": 163.248, |
| "eval_steps_per_second": 20.494, |
| "step": 45000 |
| }, |
| { |
| "epoch": 2630.00853485064, |
| "grad_norm": 0.1423732191324234, |
| "learning_rate": 1.4949832775919732e-06, |
| "loss": 2.6145, |
| "step": 45100 |
| }, |
| { |
| "epoch": 2630.00853485064, |
| "eval_loss": 2.613284111022949, |
| "eval_runtime": 8.5394, |
| "eval_samples_per_second": 163.243, |
| "eval_steps_per_second": 20.493, |
| "step": 45100 |
| }, |
| { |
| "epoch": 2635.6984352773825, |
| "grad_norm": 0.127468079328537, |
| "learning_rate": 1.4849498327759198e-06, |
| "loss": 2.6143, |
| "step": 45200 |
| }, |
| { |
| "epoch": 2635.6984352773825, |
| "eval_loss": 2.614154100418091, |
| "eval_runtime": 8.842, |
| "eval_samples_per_second": 157.657, |
| "eval_steps_per_second": 19.792, |
| "step": 45200 |
| }, |
| { |
| "epoch": 2641.388335704125, |
| "grad_norm": 0.13406263291835785, |
| "learning_rate": 1.4749163879598663e-06, |
| "loss": 2.6141, |
| "step": 45300 |
| }, |
| { |
| "epoch": 2641.388335704125, |
| "eval_loss": 2.614849328994751, |
| "eval_runtime": 8.53, |
| "eval_samples_per_second": 163.424, |
| "eval_steps_per_second": 20.516, |
| "step": 45300 |
| }, |
| { |
| "epoch": 2647.078236130868, |
| "grad_norm": 0.14327415823936462, |
| "learning_rate": 1.4648829431438129e-06, |
| "loss": 2.6142, |
| "step": 45400 |
| }, |
| { |
| "epoch": 2647.078236130868, |
| "eval_loss": 2.6156599521636963, |
| "eval_runtime": 8.5293, |
| "eval_samples_per_second": 163.437, |
| "eval_steps_per_second": 20.518, |
| "step": 45400 |
| }, |
| { |
| "epoch": 2652.7681365576104, |
| "grad_norm": 0.13055378198623657, |
| "learning_rate": 1.4548494983277592e-06, |
| "loss": 2.6141, |
| "step": 45500 |
| }, |
| { |
| "epoch": 2652.7681365576104, |
| "eval_loss": 2.6118390560150146, |
| "eval_runtime": 8.5382, |
| "eval_samples_per_second": 163.267, |
| "eval_steps_per_second": 20.496, |
| "step": 45500 |
| }, |
| { |
| "epoch": 2658.458036984353, |
| "grad_norm": 0.14269088208675385, |
| "learning_rate": 1.4448160535117058e-06, |
| "loss": 2.6149, |
| "step": 45600 |
| }, |
| { |
| "epoch": 2658.458036984353, |
| "eval_loss": 2.616089105606079, |
| "eval_runtime": 8.8625, |
| "eval_samples_per_second": 157.291, |
| "eval_steps_per_second": 19.746, |
| "step": 45600 |
| }, |
| { |
| "epoch": 2664.1479374110954, |
| "grad_norm": 0.13923226296901703, |
| "learning_rate": 1.4347826086956523e-06, |
| "loss": 2.6141, |
| "step": 45700 |
| }, |
| { |
| "epoch": 2664.1479374110954, |
| "eval_loss": 2.615753412246704, |
| "eval_runtime": 8.5429, |
| "eval_samples_per_second": 163.175, |
| "eval_steps_per_second": 20.485, |
| "step": 45700 |
| }, |
| { |
| "epoch": 2669.837837837838, |
| "grad_norm": 0.11520116031169891, |
| "learning_rate": 1.4247491638795989e-06, |
| "loss": 2.614, |
| "step": 45800 |
| }, |
| { |
| "epoch": 2669.837837837838, |
| "eval_loss": 2.614213228225708, |
| "eval_runtime": 8.8633, |
| "eval_samples_per_second": 157.278, |
| "eval_steps_per_second": 19.744, |
| "step": 45800 |
| }, |
| { |
| "epoch": 2675.5277382645804, |
| "grad_norm": 0.13826854526996613, |
| "learning_rate": 1.4147157190635452e-06, |
| "loss": 2.6141, |
| "step": 45900 |
| }, |
| { |
| "epoch": 2675.5277382645804, |
| "eval_loss": 2.6141257286071777, |
| "eval_runtime": 8.531, |
| "eval_samples_per_second": 163.403, |
| "eval_steps_per_second": 20.513, |
| "step": 45900 |
| }, |
| { |
| "epoch": 2681.217638691323, |
| "grad_norm": 0.1388641595840454, |
| "learning_rate": 1.4046822742474917e-06, |
| "loss": 2.614, |
| "step": 46000 |
| }, |
| { |
| "epoch": 2681.217638691323, |
| "eval_loss": 2.6166601181030273, |
| "eval_runtime": 8.8588, |
| "eval_samples_per_second": 157.357, |
| "eval_steps_per_second": 19.754, |
| "step": 46000 |
| }, |
| { |
| "epoch": 2686.9075391180654, |
| "grad_norm": 0.1250719428062439, |
| "learning_rate": 1.3946488294314383e-06, |
| "loss": 2.6134, |
| "step": 46100 |
| }, |
| { |
| "epoch": 2686.9075391180654, |
| "eval_loss": 2.6177427768707275, |
| "eval_runtime": 8.53, |
| "eval_samples_per_second": 163.424, |
| "eval_steps_per_second": 20.516, |
| "step": 46100 |
| }, |
| { |
| "epoch": 2692.597439544808, |
| "grad_norm": 0.1312686949968338, |
| "learning_rate": 1.3846153846153846e-06, |
| "loss": 2.614, |
| "step": 46200 |
| }, |
| { |
| "epoch": 2692.597439544808, |
| "eval_loss": 2.61600399017334, |
| "eval_runtime": 8.518, |
| "eval_samples_per_second": 163.653, |
| "eval_steps_per_second": 20.545, |
| "step": 46200 |
| }, |
| { |
| "epoch": 2698.2873399715504, |
| "grad_norm": 0.1418214589357376, |
| "learning_rate": 1.374581939799331e-06, |
| "loss": 2.6142, |
| "step": 46300 |
| }, |
| { |
| "epoch": 2698.2873399715504, |
| "eval_loss": 2.6169888973236084, |
| "eval_runtime": 8.5439, |
| "eval_samples_per_second": 163.157, |
| "eval_steps_per_second": 20.482, |
| "step": 46300 |
| }, |
| { |
| "epoch": 2703.977240398293, |
| "grad_norm": 0.13503268361091614, |
| "learning_rate": 1.3645484949832775e-06, |
| "loss": 2.6141, |
| "step": 46400 |
| }, |
| { |
| "epoch": 2703.977240398293, |
| "eval_loss": 2.617550849914551, |
| "eval_runtime": 8.8621, |
| "eval_samples_per_second": 157.299, |
| "eval_steps_per_second": 19.747, |
| "step": 46400 |
| }, |
| { |
| "epoch": 2709.6671408250354, |
| "grad_norm": 0.13151606917381287, |
| "learning_rate": 1.354515050167224e-06, |
| "loss": 2.6138, |
| "step": 46500 |
| }, |
| { |
| "epoch": 2709.6671408250354, |
| "eval_loss": 2.614605665206909, |
| "eval_runtime": 8.5338, |
| "eval_samples_per_second": 163.351, |
| "eval_steps_per_second": 20.507, |
| "step": 46500 |
| }, |
| { |
| "epoch": 2715.357041251778, |
| "grad_norm": 0.12771758437156677, |
| "learning_rate": 1.3444816053511706e-06, |
| "loss": 2.6141, |
| "step": 46600 |
| }, |
| { |
| "epoch": 2715.357041251778, |
| "eval_loss": 2.6184401512145996, |
| "eval_runtime": 8.8574, |
| "eval_samples_per_second": 157.383, |
| "eval_steps_per_second": 19.758, |
| "step": 46600 |
| }, |
| { |
| "epoch": 2721.046941678521, |
| "grad_norm": 0.13841165602207184, |
| "learning_rate": 1.334448160535117e-06, |
| "loss": 2.6138, |
| "step": 46700 |
| }, |
| { |
| "epoch": 2721.046941678521, |
| "eval_loss": 2.617668867111206, |
| "eval_runtime": 8.5339, |
| "eval_samples_per_second": 163.348, |
| "eval_steps_per_second": 20.506, |
| "step": 46700 |
| }, |
| { |
| "epoch": 2726.7368421052633, |
| "grad_norm": 0.12478631734848022, |
| "learning_rate": 1.3244147157190635e-06, |
| "loss": 2.6141, |
| "step": 46800 |
| }, |
| { |
| "epoch": 2726.7368421052633, |
| "eval_loss": 2.61787748336792, |
| "eval_runtime": 8.8804, |
| "eval_samples_per_second": 156.974, |
| "eval_steps_per_second": 19.706, |
| "step": 46800 |
| }, |
| { |
| "epoch": 2732.426742532006, |
| "grad_norm": 0.13361801207065582, |
| "learning_rate": 1.31438127090301e-06, |
| "loss": 2.6136, |
| "step": 46900 |
| }, |
| { |
| "epoch": 2732.426742532006, |
| "eval_loss": 2.6143569946289062, |
| "eval_runtime": 8.5467, |
| "eval_samples_per_second": 163.105, |
| "eval_steps_per_second": 20.476, |
| "step": 46900 |
| }, |
| { |
| "epoch": 2738.1166429587483, |
| "grad_norm": 0.1362065225839615, |
| "learning_rate": 1.3043478260869566e-06, |
| "loss": 2.6135, |
| "step": 47000 |
| }, |
| { |
| "epoch": 2738.1166429587483, |
| "eval_loss": 2.613162040710449, |
| "eval_runtime": 8.8749, |
| "eval_samples_per_second": 157.071, |
| "eval_steps_per_second": 19.718, |
| "step": 47000 |
| }, |
| { |
| "epoch": 2743.806543385491, |
| "grad_norm": 0.14401383697986603, |
| "learning_rate": 1.294314381270903e-06, |
| "loss": 2.6142, |
| "step": 47100 |
| }, |
| { |
| "epoch": 2743.806543385491, |
| "eval_loss": 2.6157407760620117, |
| "eval_runtime": 8.5464, |
| "eval_samples_per_second": 163.11, |
| "eval_steps_per_second": 20.476, |
| "step": 47100 |
| }, |
| { |
| "epoch": 2749.4964438122333, |
| "grad_norm": 0.14595820009708405, |
| "learning_rate": 1.2842809364548495e-06, |
| "loss": 2.6136, |
| "step": 47200 |
| }, |
| { |
| "epoch": 2749.4964438122333, |
| "eval_loss": 2.6133205890655518, |
| "eval_runtime": 8.8738, |
| "eval_samples_per_second": 157.092, |
| "eval_steps_per_second": 19.721, |
| "step": 47200 |
| }, |
| { |
| "epoch": 2755.186344238976, |
| "grad_norm": 0.14186260104179382, |
| "learning_rate": 1.274247491638796e-06, |
| "loss": 2.6134, |
| "step": 47300 |
| }, |
| { |
| "epoch": 2755.186344238976, |
| "eval_loss": 2.617734670639038, |
| "eval_runtime": 8.5319, |
| "eval_samples_per_second": 163.387, |
| "eval_steps_per_second": 20.511, |
| "step": 47300 |
| }, |
| { |
| "epoch": 2760.8762446657183, |
| "grad_norm": 0.13552911579608917, |
| "learning_rate": 1.2642140468227424e-06, |
| "loss": 2.6135, |
| "step": 47400 |
| }, |
| { |
| "epoch": 2760.8762446657183, |
| "eval_loss": 2.6164298057556152, |
| "eval_runtime": 8.5373, |
| "eval_samples_per_second": 163.284, |
| "eval_steps_per_second": 20.498, |
| "step": 47400 |
| }, |
| { |
| "epoch": 2766.566145092461, |
| "grad_norm": 0.12871357798576355, |
| "learning_rate": 1.254180602006689e-06, |
| "loss": 2.6138, |
| "step": 47500 |
| }, |
| { |
| "epoch": 2766.566145092461, |
| "eval_loss": 2.6128602027893066, |
| "eval_runtime": 8.5494, |
| "eval_samples_per_second": 163.052, |
| "eval_steps_per_second": 20.469, |
| "step": 47500 |
| }, |
| { |
| "epoch": 2772.2560455192033, |
| "grad_norm": 0.12483840435743332, |
| "learning_rate": 1.2441471571906355e-06, |
| "loss": 2.6138, |
| "step": 47600 |
| }, |
| { |
| "epoch": 2772.2560455192033, |
| "eval_loss": 2.6143155097961426, |
| "eval_runtime": 8.8607, |
| "eval_samples_per_second": 157.325, |
| "eval_steps_per_second": 19.75, |
| "step": 47600 |
| }, |
| { |
| "epoch": 2777.945945945946, |
| "grad_norm": 0.13678689301013947, |
| "learning_rate": 1.234113712374582e-06, |
| "loss": 2.6131, |
| "step": 47700 |
| }, |
| { |
| "epoch": 2777.945945945946, |
| "eval_loss": 2.617353916168213, |
| "eval_runtime": 8.5357, |
| "eval_samples_per_second": 163.314, |
| "eval_steps_per_second": 20.502, |
| "step": 47700 |
| }, |
| { |
| "epoch": 2783.6358463726883, |
| "grad_norm": 0.12394748628139496, |
| "learning_rate": 1.2240802675585284e-06, |
| "loss": 2.6136, |
| "step": 47800 |
| }, |
| { |
| "epoch": 2783.6358463726883, |
| "eval_loss": 2.6153721809387207, |
| "eval_runtime": 8.548, |
| "eval_samples_per_second": 163.08, |
| "eval_steps_per_second": 20.473, |
| "step": 47800 |
| }, |
| { |
| "epoch": 2817.429587482219, |
| "grad_norm": 0.15023942291736603, |
| "learning_rate": 1.214046822742475e-06, |
| "loss": 2.613, |
| "step": 47900 |
| }, |
| { |
| "epoch": 2817.429587482219, |
| "eval_loss": 2.6161534786224365, |
| "eval_runtime": 8.8571, |
| "eval_samples_per_second": 157.388, |
| "eval_steps_per_second": 19.758, |
| "step": 47900 |
| }, |
| { |
| "epoch": 2823.1194879089617, |
| "grad_norm": 0.140534445643425, |
| "learning_rate": 1.2040133779264215e-06, |
| "loss": 2.6139, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2823.1194879089617, |
| "eval_loss": 2.614151954650879, |
| "eval_runtime": 8.5312, |
| "eval_samples_per_second": 163.4, |
| "eval_steps_per_second": 20.513, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2828.809388335704, |
| "grad_norm": 0.1297474205493927, |
| "learning_rate": 1.193979933110368e-06, |
| "loss": 2.6131, |
| "step": 48100 |
| }, |
| { |
| "epoch": 2828.809388335704, |
| "eval_loss": 2.612234354019165, |
| "eval_runtime": 8.5265, |
| "eval_samples_per_second": 163.489, |
| "eval_steps_per_second": 20.524, |
| "step": 48100 |
| }, |
| { |
| "epoch": 2834.4992887624467, |
| "grad_norm": 0.1272091567516327, |
| "learning_rate": 1.1839464882943144e-06, |
| "loss": 2.613, |
| "step": 48200 |
| }, |
| { |
| "epoch": 2834.4992887624467, |
| "eval_loss": 2.617521047592163, |
| "eval_runtime": 8.839, |
| "eval_samples_per_second": 157.711, |
| "eval_steps_per_second": 19.799, |
| "step": 48200 |
| }, |
| { |
| "epoch": 2840.189189189189, |
| "grad_norm": 0.16200745105743408, |
| "learning_rate": 1.173913043478261e-06, |
| "loss": 2.6134, |
| "step": 48300 |
| }, |
| { |
| "epoch": 2840.189189189189, |
| "eval_loss": 2.614422559738159, |
| "eval_runtime": 8.8546, |
| "eval_samples_per_second": 157.433, |
| "eval_steps_per_second": 19.764, |
| "step": 48300 |
| }, |
| { |
| "epoch": 2845.8790896159317, |
| "grad_norm": 0.13503460586071014, |
| "learning_rate": 1.1638795986622075e-06, |
| "loss": 2.6137, |
| "step": 48400 |
| }, |
| { |
| "epoch": 2845.8790896159317, |
| "eval_loss": 2.612483263015747, |
| "eval_runtime": 8.8623, |
| "eval_samples_per_second": 157.295, |
| "eval_steps_per_second": 19.747, |
| "step": 48400 |
| }, |
| { |
| "epoch": 2851.568990042674, |
| "grad_norm": 0.1506689339876175, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 2.6139, |
| "step": 48500 |
| }, |
| { |
| "epoch": 2851.568990042674, |
| "eval_loss": 2.614689588546753, |
| "eval_runtime": 8.539, |
| "eval_samples_per_second": 163.251, |
| "eval_steps_per_second": 20.494, |
| "step": 48500 |
| }, |
| { |
| "epoch": 2857.2588904694167, |
| "grad_norm": 0.13846616446971893, |
| "learning_rate": 1.1438127090301004e-06, |
| "loss": 2.6135, |
| "step": 48600 |
| }, |
| { |
| "epoch": 2857.2588904694167, |
| "eval_loss": 2.612041711807251, |
| "eval_runtime": 8.5482, |
| "eval_samples_per_second": 163.075, |
| "eval_steps_per_second": 20.472, |
| "step": 48600 |
| }, |
| { |
| "epoch": 2862.948790896159, |
| "grad_norm": 0.12145441025495529, |
| "learning_rate": 1.133779264214047e-06, |
| "loss": 2.6134, |
| "step": 48700 |
| }, |
| { |
| "epoch": 2862.948790896159, |
| "eval_loss": 2.614562749862671, |
| "eval_runtime": 8.5346, |
| "eval_samples_per_second": 163.336, |
| "eval_steps_per_second": 20.505, |
| "step": 48700 |
| }, |
| { |
| "epoch": 2868.6386913229016, |
| "grad_norm": 0.1398162990808487, |
| "learning_rate": 1.1237458193979933e-06, |
| "loss": 2.6135, |
| "step": 48800 |
| }, |
| { |
| "epoch": 2868.6386913229016, |
| "eval_loss": 2.6151633262634277, |
| "eval_runtime": 8.845, |
| "eval_samples_per_second": 157.603, |
| "eval_steps_per_second": 19.785, |
| "step": 48800 |
| }, |
| { |
| "epoch": 2874.3285917496446, |
| "grad_norm": 0.13078400492668152, |
| "learning_rate": 1.1137123745819398e-06, |
| "loss": 2.6135, |
| "step": 48900 |
| }, |
| { |
| "epoch": 2874.3285917496446, |
| "eval_loss": 2.612288236618042, |
| "eval_runtime": 8.5278, |
| "eval_samples_per_second": 163.466, |
| "eval_steps_per_second": 20.521, |
| "step": 48900 |
| }, |
| { |
| "epoch": 2880.018492176387, |
| "grad_norm": 0.14920541644096375, |
| "learning_rate": 1.1036789297658862e-06, |
| "loss": 2.6135, |
| "step": 49000 |
| }, |
| { |
| "epoch": 2880.018492176387, |
| "eval_loss": 2.615710496902466, |
| "eval_runtime": 8.5324, |
| "eval_samples_per_second": 163.376, |
| "eval_steps_per_second": 20.51, |
| "step": 49000 |
| }, |
| { |
| "epoch": 2885.7083926031296, |
| "grad_norm": 0.12429507821798325, |
| "learning_rate": 1.0936454849498327e-06, |
| "loss": 2.6132, |
| "step": 49100 |
| }, |
| { |
| "epoch": 2885.7083926031296, |
| "eval_loss": 2.6171202659606934, |
| "eval_runtime": 8.526, |
| "eval_samples_per_second": 163.5, |
| "eval_steps_per_second": 20.526, |
| "step": 49100 |
| }, |
| { |
| "epoch": 2891.398293029872, |
| "grad_norm": 0.14503461122512817, |
| "learning_rate": 1.0836120401337793e-06, |
| "loss": 2.6126, |
| "step": 49200 |
| }, |
| { |
| "epoch": 2891.398293029872, |
| "eval_loss": 2.617884874343872, |
| "eval_runtime": 8.8466, |
| "eval_samples_per_second": 157.575, |
| "eval_steps_per_second": 19.782, |
| "step": 49200 |
| }, |
| { |
| "epoch": 2897.0881934566146, |
| "grad_norm": 0.1336805522441864, |
| "learning_rate": 1.0735785953177258e-06, |
| "loss": 2.6132, |
| "step": 49300 |
| }, |
| { |
| "epoch": 2897.0881934566146, |
| "eval_loss": 2.6135501861572266, |
| "eval_runtime": 8.5221, |
| "eval_samples_per_second": 163.574, |
| "eval_steps_per_second": 20.535, |
| "step": 49300 |
| }, |
| { |
| "epoch": 2902.778093883357, |
| "grad_norm": 0.13340643048286438, |
| "learning_rate": 1.0635451505016722e-06, |
| "loss": 2.6132, |
| "step": 49400 |
| }, |
| { |
| "epoch": 2902.778093883357, |
| "eval_loss": 2.6155478954315186, |
| "eval_runtime": 8.5297, |
| "eval_samples_per_second": 163.428, |
| "eval_steps_per_second": 20.516, |
| "step": 49400 |
| }, |
| { |
| "epoch": 2908.4679943100996, |
| "grad_norm": 0.12103159725666046, |
| "learning_rate": 1.0535117056856187e-06, |
| "loss": 2.6128, |
| "step": 49500 |
| }, |
| { |
| "epoch": 2908.4679943100996, |
| "eval_loss": 2.614527702331543, |
| "eval_runtime": 8.5244, |
| "eval_samples_per_second": 163.531, |
| "eval_steps_per_second": 20.529, |
| "step": 49500 |
| }, |
| { |
| "epoch": 2914.157894736842, |
| "grad_norm": 0.13566209375858307, |
| "learning_rate": 1.0434782608695653e-06, |
| "loss": 2.6131, |
| "step": 49600 |
| }, |
| { |
| "epoch": 2914.157894736842, |
| "eval_loss": 2.6156363487243652, |
| "eval_runtime": 8.8443, |
| "eval_samples_per_second": 157.615, |
| "eval_steps_per_second": 19.787, |
| "step": 49600 |
| }, |
| { |
| "epoch": 2919.8477951635846, |
| "grad_norm": 0.14300596714019775, |
| "learning_rate": 1.0334448160535118e-06, |
| "loss": 2.6133, |
| "step": 49700 |
| }, |
| { |
| "epoch": 2919.8477951635846, |
| "eval_loss": 2.614346742630005, |
| "eval_runtime": 8.5557, |
| "eval_samples_per_second": 162.933, |
| "eval_steps_per_second": 20.454, |
| "step": 49700 |
| }, |
| { |
| "epoch": 2925.537695590327, |
| "grad_norm": 0.1305309683084488, |
| "learning_rate": 1.0234113712374581e-06, |
| "loss": 2.6127, |
| "step": 49800 |
| }, |
| { |
| "epoch": 2925.537695590327, |
| "eval_loss": 2.617161512374878, |
| "eval_runtime": 8.5486, |
| "eval_samples_per_second": 163.068, |
| "eval_steps_per_second": 20.471, |
| "step": 49800 |
| }, |
| { |
| "epoch": 2931.2275960170696, |
| "grad_norm": 0.12761159241199493, |
| "learning_rate": 1.0133779264214047e-06, |
| "loss": 2.6131, |
| "step": 49900 |
| }, |
| { |
| "epoch": 2931.2275960170696, |
| "eval_loss": 2.609426259994507, |
| "eval_runtime": 8.5282, |
| "eval_samples_per_second": 163.459, |
| "eval_steps_per_second": 20.52, |
| "step": 49900 |
| }, |
| { |
| "epoch": 2936.917496443812, |
| "grad_norm": 0.14436037838459015, |
| "learning_rate": 1.0033444816053512e-06, |
| "loss": 2.6129, |
| "step": 50000 |
| }, |
| { |
| "epoch": 2936.917496443812, |
| "eval_loss": 2.6124675273895264, |
| "eval_runtime": 8.8553, |
| "eval_samples_per_second": 157.419, |
| "eval_steps_per_second": 19.762, |
| "step": 50000 |
| }, |
| { |
| "epoch": 2942.6073968705546, |
| "grad_norm": 0.14199206233024597, |
| "learning_rate": 9.933110367892976e-07, |
| "loss": 2.6135, |
| "step": 50100 |
| }, |
| { |
| "epoch": 2942.6073968705546, |
| "eval_loss": 2.6135404109954834, |
| "eval_runtime": 8.5286, |
| "eval_samples_per_second": 163.449, |
| "eval_steps_per_second": 20.519, |
| "step": 50100 |
| }, |
| { |
| "epoch": 2948.2972972972975, |
| "grad_norm": 0.13962940871715546, |
| "learning_rate": 9.832775919732441e-07, |
| "loss": 2.6124, |
| "step": 50200 |
| }, |
| { |
| "epoch": 2948.2972972972975, |
| "eval_loss": 2.6167821884155273, |
| "eval_runtime": 8.5303, |
| "eval_samples_per_second": 163.418, |
| "eval_steps_per_second": 20.515, |
| "step": 50200 |
| }, |
| { |
| "epoch": 2953.98719772404, |
| "grad_norm": 0.13427576422691345, |
| "learning_rate": 9.732441471571907e-07, |
| "loss": 2.6131, |
| "step": 50300 |
| }, |
| { |
| "epoch": 2953.98719772404, |
| "eval_loss": 2.614187240600586, |
| "eval_runtime": 8.5272, |
| "eval_samples_per_second": 163.476, |
| "eval_steps_per_second": 20.522, |
| "step": 50300 |
| }, |
| { |
| "epoch": 2959.6770981507825, |
| "grad_norm": 0.14102576673030853, |
| "learning_rate": 9.632107023411372e-07, |
| "loss": 2.613, |
| "step": 50400 |
| }, |
| { |
| "epoch": 2959.6770981507825, |
| "eval_loss": 2.6135125160217285, |
| "eval_runtime": 8.8525, |
| "eval_samples_per_second": 157.47, |
| "eval_steps_per_second": 19.769, |
| "step": 50400 |
| }, |
| { |
| "epoch": 2965.366998577525, |
| "grad_norm": 0.13503779470920563, |
| "learning_rate": 9.531772575250837e-07, |
| "loss": 2.6128, |
| "step": 50500 |
| }, |
| { |
| "epoch": 2965.366998577525, |
| "eval_loss": 2.616763114929199, |
| "eval_runtime": 8.5222, |
| "eval_samples_per_second": 163.573, |
| "eval_steps_per_second": 20.535, |
| "step": 50500 |
| }, |
| { |
| "epoch": 2971.0568990042675, |
| "grad_norm": 0.11660658568143845, |
| "learning_rate": 9.431438127090301e-07, |
| "loss": 2.6132, |
| "step": 50600 |
| }, |
| { |
| "epoch": 2971.0568990042675, |
| "eval_loss": 2.6096742153167725, |
| "eval_runtime": 8.5204, |
| "eval_samples_per_second": 163.606, |
| "eval_steps_per_second": 20.539, |
| "step": 50600 |
| }, |
| { |
| "epoch": 2976.74679943101, |
| "grad_norm": 0.11942931264638901, |
| "learning_rate": 9.331103678929767e-07, |
| "loss": 2.6127, |
| "step": 50700 |
| }, |
| { |
| "epoch": 2976.74679943101, |
| "eval_loss": 2.6175696849823, |
| "eval_runtime": 8.5208, |
| "eval_samples_per_second": 163.599, |
| "eval_steps_per_second": 20.538, |
| "step": 50700 |
| }, |
| { |
| "epoch": 2982.4366998577525, |
| "grad_norm": 0.13427217304706573, |
| "learning_rate": 9.230769230769231e-07, |
| "loss": 2.6129, |
| "step": 50800 |
| }, |
| { |
| "epoch": 2982.4366998577525, |
| "eval_loss": 2.617108106613159, |
| "eval_runtime": 8.8462, |
| "eval_samples_per_second": 157.582, |
| "eval_steps_per_second": 19.783, |
| "step": 50800 |
| }, |
| { |
| "epoch": 2988.126600284495, |
| "grad_norm": 0.13947026431560516, |
| "learning_rate": 9.130434782608697e-07, |
| "loss": 2.6128, |
| "step": 50900 |
| }, |
| { |
| "epoch": 2988.126600284495, |
| "eval_loss": 2.614734411239624, |
| "eval_runtime": 8.5215, |
| "eval_samples_per_second": 163.586, |
| "eval_steps_per_second": 20.536, |
| "step": 50900 |
| }, |
| { |
| "epoch": 2993.8165007112375, |
| "grad_norm": 0.12719608843326569, |
| "learning_rate": 9.030100334448161e-07, |
| "loss": 2.6132, |
| "step": 51000 |
| }, |
| { |
| "epoch": 2993.8165007112375, |
| "eval_loss": 2.6145222187042236, |
| "eval_runtime": 8.5246, |
| "eval_samples_per_second": 163.526, |
| "eval_steps_per_second": 20.529, |
| "step": 51000 |
| }, |
| { |
| "epoch": 2999.50640113798, |
| "grad_norm": 0.13431696593761444, |
| "learning_rate": 8.929765886287627e-07, |
| "loss": 2.613, |
| "step": 51100 |
| }, |
| { |
| "epoch": 2999.50640113798, |
| "eval_loss": 2.615795850753784, |
| "eval_runtime": 8.5247, |
| "eval_samples_per_second": 163.525, |
| "eval_steps_per_second": 20.529, |
| "step": 51100 |
| }, |
| { |
| "epoch": 3005.1963015647225, |
| "grad_norm": 0.122039295732975, |
| "learning_rate": 8.829431438127091e-07, |
| "loss": 2.6133, |
| "step": 51200 |
| }, |
| { |
| "epoch": 3005.1963015647225, |
| "eval_loss": 2.613452434539795, |
| "eval_runtime": 8.8592, |
| "eval_samples_per_second": 157.35, |
| "eval_steps_per_second": 19.753, |
| "step": 51200 |
| }, |
| { |
| "epoch": 3010.886201991465, |
| "grad_norm": 0.12794600427150726, |
| "learning_rate": 8.729096989966555e-07, |
| "loss": 2.6125, |
| "step": 51300 |
| }, |
| { |
| "epoch": 3010.886201991465, |
| "eval_loss": 2.613462209701538, |
| "eval_runtime": 8.5292, |
| "eval_samples_per_second": 163.439, |
| "eval_steps_per_second": 20.518, |
| "step": 51300 |
| }, |
| { |
| "epoch": 3016.5761024182075, |
| "grad_norm": 0.15235668420791626, |
| "learning_rate": 8.628762541806019e-07, |
| "loss": 2.6128, |
| "step": 51400 |
| }, |
| { |
| "epoch": 3016.5761024182075, |
| "eval_loss": 2.6159415245056152, |
| "eval_runtime": 8.5195, |
| "eval_samples_per_second": 163.624, |
| "eval_steps_per_second": 20.541, |
| "step": 51400 |
| }, |
| { |
| "epoch": 3022.2660028449504, |
| "grad_norm": 0.1353672742843628, |
| "learning_rate": 8.528428093645485e-07, |
| "loss": 2.6126, |
| "step": 51500 |
| }, |
| { |
| "epoch": 3022.2660028449504, |
| "eval_loss": 2.6131489276885986, |
| "eval_runtime": 8.5212, |
| "eval_samples_per_second": 163.593, |
| "eval_steps_per_second": 20.537, |
| "step": 51500 |
| }, |
| { |
| "epoch": 3027.955903271693, |
| "grad_norm": 0.1265411078929901, |
| "learning_rate": 8.428093645484949e-07, |
| "loss": 2.6127, |
| "step": 51600 |
| }, |
| { |
| "epoch": 3027.955903271693, |
| "eval_loss": 2.6140012741088867, |
| "eval_runtime": 8.8546, |
| "eval_samples_per_second": 157.432, |
| "eval_steps_per_second": 19.764, |
| "step": 51600 |
| }, |
| { |
| "epoch": 3033.6458036984354, |
| "grad_norm": 0.12123577296733856, |
| "learning_rate": 8.327759197324414e-07, |
| "loss": 2.6123, |
| "step": 51700 |
| }, |
| { |
| "epoch": 3033.6458036984354, |
| "eval_loss": 2.617744207382202, |
| "eval_runtime": 8.5337, |
| "eval_samples_per_second": 163.352, |
| "eval_steps_per_second": 20.507, |
| "step": 51700 |
| }, |
| { |
| "epoch": 3039.335704125178, |
| "grad_norm": 0.13425582647323608, |
| "learning_rate": 8.227424749163879e-07, |
| "loss": 2.6128, |
| "step": 51800 |
| }, |
| { |
| "epoch": 3039.335704125178, |
| "eval_loss": 2.6144909858703613, |
| "eval_runtime": 8.5251, |
| "eval_samples_per_second": 163.518, |
| "eval_steps_per_second": 20.528, |
| "step": 51800 |
| }, |
| { |
| "epoch": 3045.0256045519204, |
| "grad_norm": 0.12807567417621613, |
| "learning_rate": 8.127090301003344e-07, |
| "loss": 2.6127, |
| "step": 51900 |
| }, |
| { |
| "epoch": 3045.0256045519204, |
| "eval_loss": 2.6116931438446045, |
| "eval_runtime": 8.5237, |
| "eval_samples_per_second": 163.545, |
| "eval_steps_per_second": 20.531, |
| "step": 51900 |
| }, |
| { |
| "epoch": 3050.715504978663, |
| "grad_norm": 0.13802653551101685, |
| "learning_rate": 8.026755852842809e-07, |
| "loss": 2.6127, |
| "step": 52000 |
| }, |
| { |
| "epoch": 3050.715504978663, |
| "eval_loss": 2.613800287246704, |
| "eval_runtime": 8.8567, |
| "eval_samples_per_second": 157.395, |
| "eval_steps_per_second": 19.759, |
| "step": 52000 |
| }, |
| { |
| "epoch": 3056.4054054054054, |
| "grad_norm": 0.13295966386795044, |
| "learning_rate": 7.926421404682274e-07, |
| "loss": 2.6129, |
| "step": 52100 |
| }, |
| { |
| "epoch": 3056.4054054054054, |
| "eval_loss": 2.614452600479126, |
| "eval_runtime": 8.5253, |
| "eval_samples_per_second": 163.513, |
| "eval_steps_per_second": 20.527, |
| "step": 52100 |
| }, |
| { |
| "epoch": 3062.095305832148, |
| "grad_norm": 0.1325223296880722, |
| "learning_rate": 7.826086956521739e-07, |
| "loss": 2.6128, |
| "step": 52200 |
| }, |
| { |
| "epoch": 3062.095305832148, |
| "eval_loss": 2.6134650707244873, |
| "eval_runtime": 8.5247, |
| "eval_samples_per_second": 163.525, |
| "eval_steps_per_second": 20.529, |
| "step": 52200 |
| }, |
| { |
| "epoch": 3067.7852062588904, |
| "grad_norm": 0.13717898726463318, |
| "learning_rate": 7.725752508361204e-07, |
| "loss": 2.613, |
| "step": 52300 |
| }, |
| { |
| "epoch": 3067.7852062588904, |
| "eval_loss": 2.614713430404663, |
| "eval_runtime": 8.5281, |
| "eval_samples_per_second": 163.459, |
| "eval_steps_per_second": 20.52, |
| "step": 52300 |
| }, |
| { |
| "epoch": 3073.475106685633, |
| "grad_norm": 0.1319703459739685, |
| "learning_rate": 7.625418060200669e-07, |
| "loss": 2.6121, |
| "step": 52400 |
| }, |
| { |
| "epoch": 3073.475106685633, |
| "eval_loss": 2.615602970123291, |
| "eval_runtime": 8.8484, |
| "eval_samples_per_second": 157.542, |
| "eval_steps_per_second": 19.777, |
| "step": 52400 |
| }, |
| { |
| "epoch": 3079.1650071123754, |
| "grad_norm": 0.14499501883983612, |
| "learning_rate": 7.525083612040134e-07, |
| "loss": 2.6127, |
| "step": 52500 |
| }, |
| { |
| "epoch": 3079.1650071123754, |
| "eval_loss": 2.61905574798584, |
| "eval_runtime": 8.5283, |
| "eval_samples_per_second": 163.455, |
| "eval_steps_per_second": 20.52, |
| "step": 52500 |
| }, |
| { |
| "epoch": 3084.854907539118, |
| "grad_norm": 0.12991563975811005, |
| "learning_rate": 7.424749163879599e-07, |
| "loss": 2.6125, |
| "step": 52600 |
| }, |
| { |
| "epoch": 3084.854907539118, |
| "eval_loss": 2.616900682449341, |
| "eval_runtime": 8.5227, |
| "eval_samples_per_second": 163.563, |
| "eval_steps_per_second": 20.533, |
| "step": 52600 |
| }, |
| { |
| "epoch": 3090.5448079658604, |
| "grad_norm": 0.13655343651771545, |
| "learning_rate": 7.324414715719064e-07, |
| "loss": 2.6125, |
| "step": 52700 |
| }, |
| { |
| "epoch": 3090.5448079658604, |
| "eval_loss": 2.6136441230773926, |
| "eval_runtime": 8.8771, |
| "eval_samples_per_second": 157.033, |
| "eval_steps_per_second": 19.714, |
| "step": 52700 |
| }, |
| { |
| "epoch": 3096.2347083926034, |
| "grad_norm": 0.1371728628873825, |
| "learning_rate": 7.224080267558529e-07, |
| "loss": 2.6125, |
| "step": 52800 |
| }, |
| { |
| "epoch": 3096.2347083926034, |
| "eval_loss": 2.6113550662994385, |
| "eval_runtime": 8.5351, |
| "eval_samples_per_second": 163.325, |
| "eval_steps_per_second": 20.504, |
| "step": 52800 |
| }, |
| { |
| "epoch": 3101.924608819346, |
| "grad_norm": 0.12911546230316162, |
| "learning_rate": 7.123745819397994e-07, |
| "loss": 2.6125, |
| "step": 52900 |
| }, |
| { |
| "epoch": 3101.924608819346, |
| "eval_loss": 2.61665940284729, |
| "eval_runtime": 8.8673, |
| "eval_samples_per_second": 157.206, |
| "eval_steps_per_second": 19.735, |
| "step": 52900 |
| }, |
| { |
| "epoch": 3107.6145092460883, |
| "grad_norm": 0.13488008081912994, |
| "learning_rate": 7.023411371237459e-07, |
| "loss": 2.6125, |
| "step": 53000 |
| }, |
| { |
| "epoch": 3107.6145092460883, |
| "eval_loss": 2.616480827331543, |
| "eval_runtime": 8.5263, |
| "eval_samples_per_second": 163.495, |
| "eval_steps_per_second": 20.525, |
| "step": 53000 |
| }, |
| { |
| "epoch": 3113.304409672831, |
| "grad_norm": 0.1279713660478592, |
| "learning_rate": 6.923076923076923e-07, |
| "loss": 2.6126, |
| "step": 53100 |
| }, |
| { |
| "epoch": 3113.304409672831, |
| "eval_loss": 2.612837553024292, |
| "eval_runtime": 8.8593, |
| "eval_samples_per_second": 157.348, |
| "eval_steps_per_second": 19.753, |
| "step": 53100 |
| }, |
| { |
| "epoch": 3118.9943100995733, |
| "grad_norm": 0.11780209094285965, |
| "learning_rate": 6.822742474916388e-07, |
| "loss": 2.6129, |
| "step": 53200 |
| }, |
| { |
| "epoch": 3118.9943100995733, |
| "eval_loss": 2.6156790256500244, |
| "eval_runtime": 8.5184, |
| "eval_samples_per_second": 163.645, |
| "eval_steps_per_second": 20.544, |
| "step": 53200 |
| }, |
| { |
| "epoch": 3124.684210526316, |
| "grad_norm": 0.1243632510304451, |
| "learning_rate": 6.722408026755853e-07, |
| "loss": 2.6131, |
| "step": 53300 |
| }, |
| { |
| "epoch": 3124.684210526316, |
| "eval_loss": 2.6163458824157715, |
| "eval_runtime": 8.8667, |
| "eval_samples_per_second": 157.217, |
| "eval_steps_per_second": 19.737, |
| "step": 53300 |
| }, |
| { |
| "epoch": 3130.3741109530583, |
| "grad_norm": 0.13544081151485443, |
| "learning_rate": 6.622073578595318e-07, |
| "loss": 2.6124, |
| "step": 53400 |
| }, |
| { |
| "epoch": 3130.3741109530583, |
| "eval_loss": 2.6154003143310547, |
| "eval_runtime": 8.521, |
| "eval_samples_per_second": 163.596, |
| "eval_steps_per_second": 20.538, |
| "step": 53400 |
| }, |
| { |
| "epoch": 3136.064011379801, |
| "grad_norm": 0.14009779691696167, |
| "learning_rate": 6.521739130434783e-07, |
| "loss": 2.6129, |
| "step": 53500 |
| }, |
| { |
| "epoch": 3136.064011379801, |
| "eval_loss": 2.6123223304748535, |
| "eval_runtime": 8.5157, |
| "eval_samples_per_second": 163.698, |
| "eval_steps_per_second": 20.55, |
| "step": 53500 |
| }, |
| { |
| "epoch": 3141.7539118065433, |
| "grad_norm": 0.12656356394290924, |
| "learning_rate": 6.421404682274248e-07, |
| "loss": 2.6122, |
| "step": 53600 |
| }, |
| { |
| "epoch": 3141.7539118065433, |
| "eval_loss": 2.6149022579193115, |
| "eval_runtime": 8.5183, |
| "eval_samples_per_second": 163.648, |
| "eval_steps_per_second": 20.544, |
| "step": 53600 |
| }, |
| { |
| "epoch": 3147.443812233286, |
| "grad_norm": 0.1256483644247055, |
| "learning_rate": 6.321070234113712e-07, |
| "loss": 2.6121, |
| "step": 53700 |
| }, |
| { |
| "epoch": 3147.443812233286, |
| "eval_loss": 2.613800048828125, |
| "eval_runtime": 8.8545, |
| "eval_samples_per_second": 157.434, |
| "eval_steps_per_second": 19.764, |
| "step": 53700 |
| }, |
| { |
| "epoch": 3153.1337126600283, |
| "grad_norm": 0.11175887286663055, |
| "learning_rate": 6.220735785953178e-07, |
| "loss": 2.6123, |
| "step": 53800 |
| }, |
| { |
| "epoch": 3153.1337126600283, |
| "eval_loss": 2.6164095401763916, |
| "eval_runtime": 8.8596, |
| "eval_samples_per_second": 157.343, |
| "eval_steps_per_second": 19.753, |
| "step": 53800 |
| }, |
| { |
| "epoch": 3158.823613086771, |
| "grad_norm": 0.12376561760902405, |
| "learning_rate": 6.120401337792642e-07, |
| "loss": 2.6125, |
| "step": 53900 |
| }, |
| { |
| "epoch": 3158.823613086771, |
| "eval_loss": 2.612550973892212, |
| "eval_runtime": 8.5335, |
| "eval_samples_per_second": 163.356, |
| "eval_steps_per_second": 20.507, |
| "step": 53900 |
| }, |
| { |
| "epoch": 3164.5135135135133, |
| "grad_norm": 0.12542764842510223, |
| "learning_rate": 6.020066889632107e-07, |
| "loss": 2.612, |
| "step": 54000 |
| }, |
| { |
| "epoch": 3164.5135135135133, |
| "eval_loss": 2.614248037338257, |
| "eval_runtime": 8.5367, |
| "eval_samples_per_second": 163.296, |
| "eval_steps_per_second": 20.5, |
| "step": 54000 |
| }, |
| { |
| "epoch": 3170.2034139402563, |
| "grad_norm": 0.12020324170589447, |
| "learning_rate": 5.919732441471572e-07, |
| "loss": 2.6123, |
| "step": 54100 |
| }, |
| { |
| "epoch": 3170.2034139402563, |
| "eval_loss": 2.615945339202881, |
| "eval_runtime": 8.8835, |
| "eval_samples_per_second": 156.92, |
| "eval_steps_per_second": 19.699, |
| "step": 54100 |
| }, |
| { |
| "epoch": 3175.8933143669988, |
| "grad_norm": 0.13160724937915802, |
| "learning_rate": 5.819397993311037e-07, |
| "loss": 2.6125, |
| "step": 54200 |
| }, |
| { |
| "epoch": 3175.8933143669988, |
| "eval_loss": 2.612717628479004, |
| "eval_runtime": 8.533, |
| "eval_samples_per_second": 163.365, |
| "eval_steps_per_second": 20.509, |
| "step": 54200 |
| }, |
| { |
| "epoch": 3181.5832147937413, |
| "grad_norm": 0.11064854264259338, |
| "learning_rate": 5.719063545150502e-07, |
| "loss": 2.6127, |
| "step": 54300 |
| }, |
| { |
| "epoch": 3181.5832147937413, |
| "eval_loss": 2.6137535572052, |
| "eval_runtime": 8.5351, |
| "eval_samples_per_second": 163.326, |
| "eval_steps_per_second": 20.504, |
| "step": 54300 |
| }, |
| { |
| "epoch": 3187.2731152204838, |
| "grad_norm": 0.13410420715808868, |
| "learning_rate": 5.618729096989966e-07, |
| "loss": 2.6125, |
| "step": 54400 |
| }, |
| { |
| "epoch": 3187.2731152204838, |
| "eval_loss": 2.6163814067840576, |
| "eval_runtime": 8.869, |
| "eval_samples_per_second": 157.176, |
| "eval_steps_per_second": 19.732, |
| "step": 54400 |
| }, |
| { |
| "epoch": 3192.9630156472263, |
| "grad_norm": 0.13291259109973907, |
| "learning_rate": 5.518394648829431e-07, |
| "loss": 2.6125, |
| "step": 54500 |
| }, |
| { |
| "epoch": 3192.9630156472263, |
| "eval_loss": 2.6127355098724365, |
| "eval_runtime": 8.5356, |
| "eval_samples_per_second": 163.315, |
| "eval_steps_per_second": 20.502, |
| "step": 54500 |
| }, |
| { |
| "epoch": 3198.6529160739688, |
| "grad_norm": 0.1289217323064804, |
| "learning_rate": 5.418060200668896e-07, |
| "loss": 2.6122, |
| "step": 54600 |
| }, |
| { |
| "epoch": 3198.6529160739688, |
| "eval_loss": 2.613924503326416, |
| "eval_runtime": 8.869, |
| "eval_samples_per_second": 157.176, |
| "eval_steps_per_second": 19.732, |
| "step": 54600 |
| }, |
| { |
| "epoch": 3204.3428165007113, |
| "grad_norm": 0.12402568757534027, |
| "learning_rate": 5.317725752508361e-07, |
| "loss": 2.6125, |
| "step": 54700 |
| }, |
| { |
| "epoch": 3204.3428165007113, |
| "eval_loss": 2.6189987659454346, |
| "eval_runtime": 8.5185, |
| "eval_samples_per_second": 163.643, |
| "eval_steps_per_second": 20.543, |
| "step": 54700 |
| }, |
| { |
| "epoch": 3210.0327169274537, |
| "grad_norm": 0.11996253579854965, |
| "learning_rate": 5.217391304347826e-07, |
| "loss": 2.6128, |
| "step": 54800 |
| }, |
| { |
| "epoch": 3210.0327169274537, |
| "eval_loss": 2.6130542755126953, |
| "eval_runtime": 8.5256, |
| "eval_samples_per_second": 163.508, |
| "eval_steps_per_second": 20.526, |
| "step": 54800 |
| }, |
| { |
| "epoch": 3215.7226173541962, |
| "grad_norm": 0.1303112506866455, |
| "learning_rate": 5.117056856187291e-07, |
| "loss": 2.6125, |
| "step": 54900 |
| }, |
| { |
| "epoch": 3215.7226173541962, |
| "eval_loss": 2.614591598510742, |
| "eval_runtime": 9.1311, |
| "eval_samples_per_second": 152.664, |
| "eval_steps_per_second": 19.165, |
| "step": 54900 |
| }, |
| { |
| "epoch": 3221.4125177809387, |
| "grad_norm": 0.13109813630580902, |
| "learning_rate": 5.016722408026756e-07, |
| "loss": 2.612, |
| "step": 55000 |
| }, |
| { |
| "epoch": 3221.4125177809387, |
| "eval_loss": 2.6141510009765625, |
| "eval_runtime": 8.5232, |
| "eval_samples_per_second": 163.554, |
| "eval_steps_per_second": 20.532, |
| "step": 55000 |
| }, |
| { |
| "epoch": 3227.1024182076812, |
| "grad_norm": 0.11694065481424332, |
| "learning_rate": 4.916387959866221e-07, |
| "loss": 2.6118, |
| "step": 55100 |
| }, |
| { |
| "epoch": 3227.1024182076812, |
| "eval_loss": 2.613607406616211, |
| "eval_runtime": 8.5214, |
| "eval_samples_per_second": 163.588, |
| "eval_steps_per_second": 20.537, |
| "step": 55100 |
| }, |
| { |
| "epoch": 3232.7923186344237, |
| "grad_norm": 0.126685231924057, |
| "learning_rate": 4.816053511705686e-07, |
| "loss": 2.6121, |
| "step": 55200 |
| }, |
| { |
| "epoch": 3232.7923186344237, |
| "eval_loss": 2.617375373840332, |
| "eval_runtime": 8.5254, |
| "eval_samples_per_second": 163.511, |
| "eval_steps_per_second": 20.527, |
| "step": 55200 |
| }, |
| { |
| "epoch": 3238.4822190611662, |
| "grad_norm": 0.1280149221420288, |
| "learning_rate": 4.7157190635451506e-07, |
| "loss": 2.6126, |
| "step": 55300 |
| }, |
| { |
| "epoch": 3238.4822190611662, |
| "eval_loss": 2.6150975227355957, |
| "eval_runtime": 8.8573, |
| "eval_samples_per_second": 157.383, |
| "eval_steps_per_second": 19.758, |
| "step": 55300 |
| }, |
| { |
| "epoch": 3244.172119487909, |
| "grad_norm": 0.13586066663265228, |
| "learning_rate": 4.6153846153846156e-07, |
| "loss": 2.6121, |
| "step": 55400 |
| }, |
| { |
| "epoch": 3244.172119487909, |
| "eval_loss": 2.613374710083008, |
| "eval_runtime": 8.5339, |
| "eval_samples_per_second": 163.349, |
| "eval_steps_per_second": 20.506, |
| "step": 55400 |
| }, |
| { |
| "epoch": 3249.8620199146517, |
| "grad_norm": 0.13014060258865356, |
| "learning_rate": 4.5150501672240806e-07, |
| "loss": 2.6122, |
| "step": 55500 |
| }, |
| { |
| "epoch": 3249.8620199146517, |
| "eval_loss": 2.6121749877929688, |
| "eval_runtime": 8.8553, |
| "eval_samples_per_second": 157.419, |
| "eval_steps_per_second": 19.762, |
| "step": 55500 |
| }, |
| { |
| "epoch": 3255.551920341394, |
| "grad_norm": 0.1337248831987381, |
| "learning_rate": 4.4147157190635456e-07, |
| "loss": 2.6115, |
| "step": 55600 |
| }, |
| { |
| "epoch": 3255.551920341394, |
| "eval_loss": 2.6143338680267334, |
| "eval_runtime": 8.532, |
| "eval_samples_per_second": 163.385, |
| "eval_steps_per_second": 20.511, |
| "step": 55600 |
| }, |
| { |
| "epoch": 3261.2418207681367, |
| "grad_norm": 0.12295526266098022, |
| "learning_rate": 4.3143812709030095e-07, |
| "loss": 2.6128, |
| "step": 55700 |
| }, |
| { |
| "epoch": 3261.2418207681367, |
| "eval_loss": 2.6155290603637695, |
| "eval_runtime": 8.5401, |
| "eval_samples_per_second": 163.23, |
| "eval_steps_per_second": 20.492, |
| "step": 55700 |
| }, |
| { |
| "epoch": 3266.931721194879, |
| "grad_norm": 0.13388285040855408, |
| "learning_rate": 4.2140468227424745e-07, |
| "loss": 2.6121, |
| "step": 55800 |
| }, |
| { |
| "epoch": 3266.931721194879, |
| "eval_loss": 2.6157045364379883, |
| "eval_runtime": 8.8687, |
| "eval_samples_per_second": 157.182, |
| "eval_steps_per_second": 19.732, |
| "step": 55800 |
| }, |
| { |
| "epoch": 3272.6216216216217, |
| "grad_norm": 0.1304856538772583, |
| "learning_rate": 4.1137123745819395e-07, |
| "loss": 2.6119, |
| "step": 55900 |
| }, |
| { |
| "epoch": 3272.6216216216217, |
| "eval_loss": 2.614722490310669, |
| "eval_runtime": 8.5288, |
| "eval_samples_per_second": 163.446, |
| "eval_steps_per_second": 20.519, |
| "step": 55900 |
| }, |
| { |
| "epoch": 3278.311522048364, |
| "grad_norm": 0.13436032831668854, |
| "learning_rate": 4.0133779264214045e-07, |
| "loss": 2.6123, |
| "step": 56000 |
| }, |
| { |
| "epoch": 3278.311522048364, |
| "eval_loss": 2.613041639328003, |
| "eval_runtime": 8.5278, |
| "eval_samples_per_second": 163.466, |
| "eval_steps_per_second": 20.521, |
| "step": 56000 |
| }, |
| { |
| "epoch": 3284.0014224751067, |
| "grad_norm": 0.14674031734466553, |
| "learning_rate": 3.9130434782608694e-07, |
| "loss": 2.6122, |
| "step": 56100 |
| }, |
| { |
| "epoch": 3284.0014224751067, |
| "eval_loss": 2.611990213394165, |
| "eval_runtime": 8.7786, |
| "eval_samples_per_second": 158.795, |
| "eval_steps_per_second": 19.935, |
| "step": 56100 |
| }, |
| { |
| "epoch": 3289.691322901849, |
| "grad_norm": 0.1269470900297165, |
| "learning_rate": 3.8127090301003344e-07, |
| "loss": 2.6119, |
| "step": 56200 |
| }, |
| { |
| "epoch": 3289.691322901849, |
| "eval_loss": 2.614060878753662, |
| "eval_runtime": 8.5989, |
| "eval_samples_per_second": 162.113, |
| "eval_steps_per_second": 20.351, |
| "step": 56200 |
| }, |
| { |
| "epoch": 3295.3812233285917, |
| "grad_norm": 0.13767366111278534, |
| "learning_rate": 3.7123745819397994e-07, |
| "loss": 2.6121, |
| "step": 56300 |
| }, |
| { |
| "epoch": 3295.3812233285917, |
| "eval_loss": 2.616547107696533, |
| "eval_runtime": 8.5426, |
| "eval_samples_per_second": 163.181, |
| "eval_steps_per_second": 20.485, |
| "step": 56300 |
| }, |
| { |
| "epoch": 3301.071123755334, |
| "grad_norm": 0.13906554877758026, |
| "learning_rate": 3.6120401337792644e-07, |
| "loss": 2.6121, |
| "step": 56400 |
| }, |
| { |
| "epoch": 3301.071123755334, |
| "eval_loss": 2.6139421463012695, |
| "eval_runtime": 8.5234, |
| "eval_samples_per_second": 163.55, |
| "eval_steps_per_second": 20.532, |
| "step": 56400 |
| }, |
| { |
| "epoch": 3306.7610241820767, |
| "grad_norm": 0.13832303881645203, |
| "learning_rate": 3.5117056856187294e-07, |
| "loss": 2.612, |
| "step": 56500 |
| }, |
| { |
| "epoch": 3306.7610241820767, |
| "eval_loss": 2.6141018867492676, |
| "eval_runtime": 8.8522, |
| "eval_samples_per_second": 157.475, |
| "eval_steps_per_second": 19.769, |
| "step": 56500 |
| }, |
| { |
| "epoch": 3312.450924608819, |
| "grad_norm": 0.13443072140216827, |
| "learning_rate": 3.411371237458194e-07, |
| "loss": 2.6127, |
| "step": 56600 |
| }, |
| { |
| "epoch": 3312.450924608819, |
| "eval_loss": 2.6130294799804688, |
| "eval_runtime": 8.5356, |
| "eval_samples_per_second": 163.317, |
| "eval_steps_per_second": 20.502, |
| "step": 56600 |
| }, |
| { |
| "epoch": 3318.140825035562, |
| "grad_norm": 0.1384400725364685, |
| "learning_rate": 3.311036789297659e-07, |
| "loss": 2.6125, |
| "step": 56700 |
| }, |
| { |
| "epoch": 3318.140825035562, |
| "eval_loss": 2.614971876144409, |
| "eval_runtime": 8.5276, |
| "eval_samples_per_second": 163.468, |
| "eval_steps_per_second": 20.521, |
| "step": 56700 |
| }, |
| { |
| "epoch": 3323.8307254623046, |
| "grad_norm": 0.12781038880348206, |
| "learning_rate": 3.210702341137124e-07, |
| "loss": 2.6119, |
| "step": 56800 |
| }, |
| { |
| "epoch": 3323.8307254623046, |
| "eval_loss": 2.6149492263793945, |
| "eval_runtime": 8.5244, |
| "eval_samples_per_second": 163.531, |
| "eval_steps_per_second": 20.529, |
| "step": 56800 |
| }, |
| { |
| "epoch": 3329.520625889047, |
| "grad_norm": 0.13229794800281525, |
| "learning_rate": 3.110367892976589e-07, |
| "loss": 2.6114, |
| "step": 56900 |
| }, |
| { |
| "epoch": 3329.520625889047, |
| "eval_loss": 2.620450019836426, |
| "eval_runtime": 8.8571, |
| "eval_samples_per_second": 157.388, |
| "eval_steps_per_second": 19.758, |
| "step": 56900 |
| }, |
| { |
| "epoch": 3335.2105263157896, |
| "grad_norm": 0.13062149286270142, |
| "learning_rate": 3.010033444816054e-07, |
| "loss": 2.6123, |
| "step": 57000 |
| }, |
| { |
| "epoch": 3335.2105263157896, |
| "eval_loss": 2.6148345470428467, |
| "eval_runtime": 8.5245, |
| "eval_samples_per_second": 163.528, |
| "eval_steps_per_second": 20.529, |
| "step": 57000 |
| }, |
| { |
| "epoch": 3340.900426742532, |
| "grad_norm": 0.1294122189283371, |
| "learning_rate": 2.9096989966555187e-07, |
| "loss": 2.6121, |
| "step": 57100 |
| }, |
| { |
| "epoch": 3340.900426742532, |
| "eval_loss": 2.6161153316497803, |
| "eval_runtime": 8.5288, |
| "eval_samples_per_second": 163.446, |
| "eval_steps_per_second": 20.519, |
| "step": 57100 |
| }, |
| { |
| "epoch": 3346.5903271692746, |
| "grad_norm": 0.1416897028684616, |
| "learning_rate": 2.809364548494983e-07, |
| "loss": 2.6121, |
| "step": 57200 |
| }, |
| { |
| "epoch": 3346.5903271692746, |
| "eval_loss": 2.610884428024292, |
| "eval_runtime": 8.8659, |
| "eval_samples_per_second": 157.232, |
| "eval_steps_per_second": 19.739, |
| "step": 57200 |
| }, |
| { |
| "epoch": 3352.280227596017, |
| "grad_norm": 0.13414239883422852, |
| "learning_rate": 2.709030100334448e-07, |
| "loss": 2.6117, |
| "step": 57300 |
| }, |
| { |
| "epoch": 3352.280227596017, |
| "eval_loss": 2.613905906677246, |
| "eval_runtime": 8.523, |
| "eval_samples_per_second": 163.557, |
| "eval_steps_per_second": 20.533, |
| "step": 57300 |
| }, |
| { |
| "epoch": 3357.9701280227596, |
| "grad_norm": 0.11113996803760529, |
| "learning_rate": 2.608695652173913e-07, |
| "loss": 2.6123, |
| "step": 57400 |
| }, |
| { |
| "epoch": 3357.9701280227596, |
| "eval_loss": 2.6124770641326904, |
| "eval_runtime": 8.5319, |
| "eval_samples_per_second": 163.387, |
| "eval_steps_per_second": 20.511, |
| "step": 57400 |
| }, |
| { |
| "epoch": 3363.660028449502, |
| "grad_norm": 0.12642131745815277, |
| "learning_rate": 2.508361204013378e-07, |
| "loss": 2.6121, |
| "step": 57500 |
| }, |
| { |
| "epoch": 3363.660028449502, |
| "eval_loss": 2.6125006675720215, |
| "eval_runtime": 8.5321, |
| "eval_samples_per_second": 163.382, |
| "eval_steps_per_second": 20.511, |
| "step": 57500 |
| }, |
| { |
| "epoch": 3369.3499288762446, |
| "grad_norm": 0.12002536654472351, |
| "learning_rate": 2.408026755852843e-07, |
| "loss": 2.6119, |
| "step": 57600 |
| }, |
| { |
| "epoch": 3369.3499288762446, |
| "eval_loss": 2.6137232780456543, |
| "eval_runtime": 8.8563, |
| "eval_samples_per_second": 157.402, |
| "eval_steps_per_second": 19.76, |
| "step": 57600 |
| }, |
| { |
| "epoch": 3375.039829302987, |
| "grad_norm": 0.12281110137701035, |
| "learning_rate": 2.3076923076923078e-07, |
| "loss": 2.6117, |
| "step": 57700 |
| }, |
| { |
| "epoch": 3375.039829302987, |
| "eval_loss": 2.6145715713500977, |
| "eval_runtime": 8.5313, |
| "eval_samples_per_second": 163.399, |
| "eval_steps_per_second": 20.513, |
| "step": 57700 |
| }, |
| { |
| "epoch": 3380.7297297297296, |
| "grad_norm": 0.14482566714286804, |
| "learning_rate": 2.2073578595317728e-07, |
| "loss": 2.6118, |
| "step": 57800 |
| }, |
| { |
| "epoch": 3380.7297297297296, |
| "eval_loss": 2.613739490509033, |
| "eval_runtime": 8.5223, |
| "eval_samples_per_second": 163.572, |
| "eval_steps_per_second": 20.534, |
| "step": 57800 |
| }, |
| { |
| "epoch": 3386.419630156472, |
| "grad_norm": 0.1368427276611328, |
| "learning_rate": 2.1070234113712372e-07, |
| "loss": 2.6119, |
| "step": 57900 |
| }, |
| { |
| "epoch": 3386.419630156472, |
| "eval_loss": 2.6152491569519043, |
| "eval_runtime": 8.8583, |
| "eval_samples_per_second": 157.366, |
| "eval_steps_per_second": 19.755, |
| "step": 57900 |
| }, |
| { |
| "epoch": 3392.109530583215, |
| "grad_norm": 0.13695128262043, |
| "learning_rate": 2.0066889632107022e-07, |
| "loss": 2.6116, |
| "step": 58000 |
| }, |
| { |
| "epoch": 3392.109530583215, |
| "eval_loss": 2.6153528690338135, |
| "eval_runtime": 8.5368, |
| "eval_samples_per_second": 163.292, |
| "eval_steps_per_second": 20.499, |
| "step": 58000 |
| }, |
| { |
| "epoch": 3397.7994310099575, |
| "grad_norm": 0.11453160643577576, |
| "learning_rate": 1.9063545150501672e-07, |
| "loss": 2.612, |
| "step": 58100 |
| }, |
| { |
| "epoch": 3397.7994310099575, |
| "eval_loss": 2.615257740020752, |
| "eval_runtime": 8.5272, |
| "eval_samples_per_second": 163.477, |
| "eval_steps_per_second": 20.523, |
| "step": 58100 |
| }, |
| { |
| "epoch": 3403.4893314367, |
| "grad_norm": 0.13847880065441132, |
| "learning_rate": 1.8060200668896322e-07, |
| "loss": 2.6125, |
| "step": 58200 |
| }, |
| { |
| "epoch": 3403.4893314367, |
| "eval_loss": 2.6181981563568115, |
| "eval_runtime": 8.8493, |
| "eval_samples_per_second": 157.527, |
| "eval_steps_per_second": 19.776, |
| "step": 58200 |
| }, |
| { |
| "epoch": 3409.1792318634425, |
| "grad_norm": 0.13308827579021454, |
| "learning_rate": 1.705685618729097e-07, |
| "loss": 2.6124, |
| "step": 58300 |
| }, |
| { |
| "epoch": 3409.1792318634425, |
| "eval_loss": 2.6120095252990723, |
| "eval_runtime": 8.5246, |
| "eval_samples_per_second": 163.526, |
| "eval_steps_per_second": 20.529, |
| "step": 58300 |
| }, |
| { |
| "epoch": 3414.869132290185, |
| "grad_norm": 0.13217765092849731, |
| "learning_rate": 1.605351170568562e-07, |
| "loss": 2.6117, |
| "step": 58400 |
| }, |
| { |
| "epoch": 3414.869132290185, |
| "eval_loss": 2.6182873249053955, |
| "eval_runtime": 8.5321, |
| "eval_samples_per_second": 163.384, |
| "eval_steps_per_second": 20.511, |
| "step": 58400 |
| }, |
| { |
| "epoch": 3420.5590327169275, |
| "grad_norm": 0.13483327627182007, |
| "learning_rate": 1.505016722408027e-07, |
| "loss": 2.6122, |
| "step": 58500 |
| }, |
| { |
| "epoch": 3420.5590327169275, |
| "eval_loss": 2.614924430847168, |
| "eval_runtime": 8.8582, |
| "eval_samples_per_second": 157.368, |
| "eval_steps_per_second": 19.756, |
| "step": 58500 |
| }, |
| { |
| "epoch": 3426.24893314367, |
| "grad_norm": 0.13368582725524902, |
| "learning_rate": 1.4046822742474916e-07, |
| "loss": 2.6123, |
| "step": 58600 |
| }, |
| { |
| "epoch": 3426.24893314367, |
| "eval_loss": 2.613650321960449, |
| "eval_runtime": 8.5412, |
| "eval_samples_per_second": 163.209, |
| "eval_steps_per_second": 20.489, |
| "step": 58600 |
| }, |
| { |
| "epoch": 3431.9388335704125, |
| "grad_norm": 0.13080868124961853, |
| "learning_rate": 1.3043478260869566e-07, |
| "loss": 2.612, |
| "step": 58700 |
| }, |
| { |
| "epoch": 3431.9388335704125, |
| "eval_loss": 2.61566162109375, |
| "eval_runtime": 8.8538, |
| "eval_samples_per_second": 157.447, |
| "eval_steps_per_second": 19.766, |
| "step": 58700 |
| }, |
| { |
| "epoch": 3437.628733997155, |
| "grad_norm": 0.1235753670334816, |
| "learning_rate": 1.2040133779264215e-07, |
| "loss": 2.6127, |
| "step": 58800 |
| }, |
| { |
| "epoch": 3437.628733997155, |
| "eval_loss": 2.612259864807129, |
| "eval_runtime": 8.5258, |
| "eval_samples_per_second": 163.503, |
| "eval_steps_per_second": 20.526, |
| "step": 58800 |
| }, |
| { |
| "epoch": 3443.3186344238975, |
| "grad_norm": 0.13656386733055115, |
| "learning_rate": 1.1036789297658864e-07, |
| "loss": 2.6119, |
| "step": 58900 |
| }, |
| { |
| "epoch": 3443.3186344238975, |
| "eval_loss": 2.615288257598877, |
| "eval_runtime": 8.8549, |
| "eval_samples_per_second": 157.428, |
| "eval_steps_per_second": 19.763, |
| "step": 58900 |
| }, |
| { |
| "epoch": 3449.00853485064, |
| "grad_norm": 0.13021564483642578, |
| "learning_rate": 1.0033444816053511e-07, |
| "loss": 2.6123, |
| "step": 59000 |
| }, |
| { |
| "epoch": 3449.00853485064, |
| "eval_loss": 2.6138880252838135, |
| "eval_runtime": 8.6443, |
| "eval_samples_per_second": 161.263, |
| "eval_steps_per_second": 20.245, |
| "step": 59000 |
| }, |
| { |
| "epoch": 3454.6984352773825, |
| "grad_norm": 0.12730829417705536, |
| "learning_rate": 9.030100334448161e-08, |
| "loss": 2.6119, |
| "step": 59100 |
| }, |
| { |
| "epoch": 3454.6984352773825, |
| "eval_loss": 2.6168129444122314, |
| "eval_runtime": 8.8546, |
| "eval_samples_per_second": 157.432, |
| "eval_steps_per_second": 19.764, |
| "step": 59100 |
| }, |
| { |
| "epoch": 3460.388335704125, |
| "grad_norm": 0.13064709305763245, |
| "learning_rate": 8.02675585284281e-08, |
| "loss": 2.6119, |
| "step": 59200 |
| }, |
| { |
| "epoch": 3460.388335704125, |
| "eval_loss": 2.616276502609253, |
| "eval_runtime": 8.5216, |
| "eval_samples_per_second": 163.585, |
| "eval_steps_per_second": 20.536, |
| "step": 59200 |
| }, |
| { |
| "epoch": 3466.078236130868, |
| "grad_norm": 0.13055041432380676, |
| "learning_rate": 7.023411371237458e-08, |
| "loss": 2.6118, |
| "step": 59300 |
| }, |
| { |
| "epoch": 3466.078236130868, |
| "eval_loss": 2.614774227142334, |
| "eval_runtime": 8.8545, |
| "eval_samples_per_second": 157.433, |
| "eval_steps_per_second": 19.764, |
| "step": 59300 |
| }, |
| { |
| "epoch": 3471.7681365576104, |
| "grad_norm": 0.12495147436857224, |
| "learning_rate": 6.020066889632108e-08, |
| "loss": 2.6122, |
| "step": 59400 |
| }, |
| { |
| "epoch": 3471.7681365576104, |
| "eval_loss": 2.6170403957366943, |
| "eval_runtime": 8.5346, |
| "eval_samples_per_second": 163.335, |
| "eval_steps_per_second": 20.505, |
| "step": 59400 |
| }, |
| { |
| "epoch": 3477.458036984353, |
| "grad_norm": 0.1302523910999298, |
| "learning_rate": 5.0167224080267556e-08, |
| "loss": 2.6119, |
| "step": 59500 |
| }, |
| { |
| "epoch": 3477.458036984353, |
| "eval_loss": 2.609950065612793, |
| "eval_runtime": 8.8493, |
| "eval_samples_per_second": 157.526, |
| "eval_steps_per_second": 19.775, |
| "step": 59500 |
| }, |
| { |
| "epoch": 3483.1479374110954, |
| "grad_norm": 0.13452781736850739, |
| "learning_rate": 4.013377926421405e-08, |
| "loss": 2.612, |
| "step": 59600 |
| }, |
| { |
| "epoch": 3483.1479374110954, |
| "eval_loss": 2.614889144897461, |
| "eval_runtime": 8.5246, |
| "eval_samples_per_second": 163.526, |
| "eval_steps_per_second": 20.529, |
| "step": 59600 |
| }, |
| { |
| "epoch": 3488.837837837838, |
| "grad_norm": 0.1290915459394455, |
| "learning_rate": 3.010033444816054e-08, |
| "loss": 2.6118, |
| "step": 59700 |
| }, |
| { |
| "epoch": 3488.837837837838, |
| "eval_loss": 2.616943120956421, |
| "eval_runtime": 8.87, |
| "eval_samples_per_second": 157.16, |
| "eval_steps_per_second": 19.73, |
| "step": 59700 |
| }, |
| { |
| "epoch": 3494.5277382645804, |
| "grad_norm": 0.12313296645879745, |
| "learning_rate": 2.0066889632107024e-08, |
| "loss": 2.6119, |
| "step": 59800 |
| }, |
| { |
| "epoch": 3494.5277382645804, |
| "eval_loss": 2.6143088340759277, |
| "eval_runtime": 8.5338, |
| "eval_samples_per_second": 163.35, |
| "eval_steps_per_second": 20.507, |
| "step": 59800 |
| }, |
| { |
| "epoch": 3500.217638691323, |
| "grad_norm": 0.11486466974020004, |
| "learning_rate": 1.0033444816053512e-08, |
| "loss": 2.6118, |
| "step": 59900 |
| }, |
| { |
| "epoch": 3500.217638691323, |
| "eval_loss": 2.6164870262145996, |
| "eval_runtime": 8.5248, |
| "eval_samples_per_second": 163.523, |
| "eval_steps_per_second": 20.528, |
| "step": 59900 |
| }, |
| { |
| "epoch": 3505.9075391180654, |
| "grad_norm": 0.11500786989927292, |
| "learning_rate": 0.0, |
| "loss": 2.6119, |
| "step": 60000 |
| }, |
| { |
| "epoch": 3505.9075391180654, |
| "eval_loss": 2.616939067840576, |
| "eval_runtime": 8.8637, |
| "eval_samples_per_second": 157.27, |
| "eval_steps_per_second": 19.743, |
| "step": 60000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 60000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3530, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 10, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 10 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.02069363654656e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|