{ "best_metric": 0.229187473654747, "best_model_checkpoint": "learning_source_20260318/compounds/bert-output/compounds-small/checkpoint-61000", "epoch": 3.7895791085917425, "eval_steps": 100, "global_step": 63000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006015204934272608, "grad_norm": 4.738959312438965, "learning_rate": 3e-06, "loss": 4.8794, "step": 100 }, { "epoch": 0.006015204934272608, "eval_loss": 3.196549892425537, "eval_runtime": 21.7167, "eval_samples_per_second": 460.474, "eval_steps_per_second": 57.559, "step": 100 }, { "epoch": 0.012030409868545216, "grad_norm": 3.5901083946228027, "learning_rate": 6e-06, "loss": 2.8953, "step": 200 }, { "epoch": 0.012030409868545216, "eval_loss": 2.467365264892578, "eval_runtime": 21.7318, "eval_samples_per_second": 460.156, "eval_steps_per_second": 57.52, "step": 200 }, { "epoch": 0.018045614802817824, "grad_norm": 1.398197889328003, "learning_rate": 5.998999666555519e-06, "loss": 2.4113, "step": 300 }, { "epoch": 0.018045614802817824, "eval_loss": 2.267258644104004, "eval_runtime": 21.7416, "eval_samples_per_second": 459.948, "eval_steps_per_second": 57.493, "step": 300 }, { "epoch": 0.02406081973709043, "grad_norm": 1.1230988502502441, "learning_rate": 5.997999333111037e-06, "loss": 2.2393, "step": 400 }, { "epoch": 0.02406081973709043, "eval_loss": 2.1407406330108643, "eval_runtime": 21.7301, "eval_samples_per_second": 460.192, "eval_steps_per_second": 57.524, "step": 400 }, { "epoch": 0.030076024671363038, "grad_norm": 1.1243526935577393, "learning_rate": 5.9969989996665554e-06, "loss": 2.129, "step": 500 }, { "epoch": 0.030076024671363038, "eval_loss": 2.0753207206726074, "eval_runtime": 21.7547, "eval_samples_per_second": 459.671, "eval_steps_per_second": 57.459, "step": 500 }, { "epoch": 0.03609122960563565, "grad_norm": 2.403114080429077, "learning_rate": 5.995998666222074e-06, "loss": 2.0746, "step": 600 }, { "epoch": 0.03609122960563565, "eval_loss": 2.029200315475464, "eval_runtime": 21.746, "eval_samples_per_second": 459.855, "eval_steps_per_second": 57.482, "step": 600 }, { "epoch": 0.042106434539908255, "grad_norm": 1.4393417835235596, "learning_rate": 5.994998332777593e-06, "loss": 2.0389, "step": 700 }, { "epoch": 0.042106434539908255, "eval_loss": 1.995573878288269, "eval_runtime": 21.7364, "eval_samples_per_second": 460.057, "eval_steps_per_second": 57.507, "step": 700 }, { "epoch": 0.04812163947418086, "grad_norm": 1.392498254776001, "learning_rate": 5.9939979993331115e-06, "loss": 1.9989, "step": 800 }, { "epoch": 0.04812163947418086, "eval_loss": 1.9491331577301025, "eval_runtime": 21.7276, "eval_samples_per_second": 460.245, "eval_steps_per_second": 57.531, "step": 800 }, { "epoch": 0.05413684440845347, "grad_norm": 1.9008598327636719, "learning_rate": 5.992997665888629e-06, "loss": 1.9332, "step": 900 }, { "epoch": 0.05413684440845347, "eval_loss": 1.8610961437225342, "eval_runtime": 21.7345, "eval_samples_per_second": 460.098, "eval_steps_per_second": 57.512, "step": 900 }, { "epoch": 0.060152049342726076, "grad_norm": 2.0319344997406006, "learning_rate": 5.991997332444148e-06, "loss": 1.8607, "step": 1000 }, { "epoch": 0.060152049342726076, "eval_loss": 1.7824585437774658, "eval_runtime": 21.7313, "eval_samples_per_second": 460.167, "eval_steps_per_second": 57.521, "step": 1000 }, { "epoch": 0.06616725427699868, "grad_norm": 1.9651939868927002, "learning_rate": 5.990996998999667e-06, "loss": 1.8064, "step": 1100 }, { "epoch": 0.06616725427699868, "eval_loss": 1.7280884981155396, "eval_runtime": 21.7428, "eval_samples_per_second": 459.923, "eval_steps_per_second": 57.49, "step": 1100 }, { "epoch": 0.0721824592112713, "grad_norm": 1.2767350673675537, "learning_rate": 5.989996665555185e-06, "loss": 1.7432, "step": 1200 }, { "epoch": 0.0721824592112713, "eval_loss": 1.644949197769165, "eval_runtime": 21.7259, "eval_samples_per_second": 460.279, "eval_steps_per_second": 57.535, "step": 1200 }, { "epoch": 0.0781976641455439, "grad_norm": 1.3338353633880615, "learning_rate": 5.988996332110703e-06, "loss": 1.6816, "step": 1300 }, { "epoch": 0.0781976641455439, "eval_loss": 1.5758517980575562, "eval_runtime": 21.7331, "eval_samples_per_second": 460.127, "eval_steps_per_second": 57.516, "step": 1300 }, { "epoch": 0.08421286907981651, "grad_norm": 1.5716562271118164, "learning_rate": 5.987995998666222e-06, "loss": 1.6209, "step": 1400 }, { "epoch": 0.08421286907981651, "eval_loss": 1.506639003753662, "eval_runtime": 21.74, "eval_samples_per_second": 459.982, "eval_steps_per_second": 57.498, "step": 1400 }, { "epoch": 0.09022807401408911, "grad_norm": 1.4891563653945923, "learning_rate": 5.986995665221741e-06, "loss": 1.5562, "step": 1500 }, { "epoch": 0.09022807401408911, "eval_loss": 1.4430310726165771, "eval_runtime": 21.7254, "eval_samples_per_second": 460.29, "eval_steps_per_second": 57.536, "step": 1500 }, { "epoch": 0.09624327894836172, "grad_norm": 1.6210014820098877, "learning_rate": 5.9859953317772595e-06, "loss": 1.5081, "step": 1600 }, { "epoch": 0.09624327894836172, "eval_loss": 1.394563913345337, "eval_runtime": 21.7318, "eval_samples_per_second": 460.155, "eval_steps_per_second": 57.519, "step": 1600 }, { "epoch": 0.10225848388263432, "grad_norm": 2.3340542316436768, "learning_rate": 5.984994998332777e-06, "loss": 1.4674, "step": 1700 }, { "epoch": 0.10225848388263432, "eval_loss": 1.3446385860443115, "eval_runtime": 21.735, "eval_samples_per_second": 460.088, "eval_steps_per_second": 57.511, "step": 1700 }, { "epoch": 0.10827368881690694, "grad_norm": 1.6647675037384033, "learning_rate": 5.983994664888296e-06, "loss": 1.424, "step": 1800 }, { "epoch": 0.10827368881690694, "eval_loss": 1.3100253343582153, "eval_runtime": 21.725, "eval_samples_per_second": 460.299, "eval_steps_per_second": 57.537, "step": 1800 }, { "epoch": 0.11428889375117954, "grad_norm": 1.46592116355896, "learning_rate": 5.982994331443815e-06, "loss": 1.3892, "step": 1900 }, { "epoch": 0.11428889375117954, "eval_loss": 1.2686667442321777, "eval_runtime": 21.7512, "eval_samples_per_second": 459.744, "eval_steps_per_second": 57.468, "step": 1900 }, { "epoch": 0.12030409868545215, "grad_norm": 1.8340036869049072, "learning_rate": 5.981993997999333e-06, "loss": 1.3564, "step": 2000 }, { "epoch": 0.12030409868545215, "eval_loss": 1.2294234037399292, "eval_runtime": 21.7966, "eval_samples_per_second": 458.788, "eval_steps_per_second": 57.348, "step": 2000 }, { "epoch": 0.12631930361972477, "grad_norm": 1.5960652828216553, "learning_rate": 5.980993664554851e-06, "loss": 1.3285, "step": 2100 }, { "epoch": 0.12631930361972477, "eval_loss": 1.2100887298583984, "eval_runtime": 21.7436, "eval_samples_per_second": 459.905, "eval_steps_per_second": 57.488, "step": 2100 }, { "epoch": 0.13233450855399737, "grad_norm": 1.8335785865783691, "learning_rate": 5.979993331110371e-06, "loss": 1.3001, "step": 2200 }, { "epoch": 0.13233450855399737, "eval_loss": 1.1752792596817017, "eval_runtime": 21.7453, "eval_samples_per_second": 459.87, "eval_steps_per_second": 57.484, "step": 2200 }, { "epoch": 0.13834971348826997, "grad_norm": 1.612433671951294, "learning_rate": 5.978992997665889e-06, "loss": 1.2695, "step": 2300 }, { "epoch": 0.13834971348826997, "eval_loss": 1.147255778312683, "eval_runtime": 21.6924, "eval_samples_per_second": 460.992, "eval_steps_per_second": 57.624, "step": 2300 }, { "epoch": 0.1443649184225426, "grad_norm": 1.5603346824645996, "learning_rate": 5.9779926642214075e-06, "loss": 1.2412, "step": 2400 }, { "epoch": 0.1443649184225426, "eval_loss": 1.1130963563919067, "eval_runtime": 21.7087, "eval_samples_per_second": 460.645, "eval_steps_per_second": 57.581, "step": 2400 }, { "epoch": 0.1503801233568152, "grad_norm": 1.6393444538116455, "learning_rate": 5.976992330776926e-06, "loss": 1.2159, "step": 2500 }, { "epoch": 0.1503801233568152, "eval_loss": 1.0844037532806396, "eval_runtime": 21.7041, "eval_samples_per_second": 460.743, "eval_steps_per_second": 57.593, "step": 2500 }, { "epoch": 0.1563953282910878, "grad_norm": 1.638340950012207, "learning_rate": 5.975991997332444e-06, "loss": 1.1898, "step": 2600 }, { "epoch": 0.1563953282910878, "eval_loss": 1.0646270513534546, "eval_runtime": 21.7166, "eval_samples_per_second": 460.476, "eval_steps_per_second": 57.56, "step": 2600 }, { "epoch": 0.1624105332253604, "grad_norm": 1.745104432106018, "learning_rate": 5.974991663887963e-06, "loss": 1.1708, "step": 2700 }, { "epoch": 0.1624105332253604, "eval_loss": 1.0485780239105225, "eval_runtime": 21.7198, "eval_samples_per_second": 460.41, "eval_steps_per_second": 57.551, "step": 2700 }, { "epoch": 0.16842573815963302, "grad_norm": 1.759570837020874, "learning_rate": 5.973991330443481e-06, "loss": 1.1522, "step": 2800 }, { "epoch": 0.16842573815963302, "eval_loss": 1.0218431949615479, "eval_runtime": 21.7241, "eval_samples_per_second": 460.318, "eval_steps_per_second": 57.54, "step": 2800 }, { "epoch": 0.17444094309390562, "grad_norm": 1.76418936252594, "learning_rate": 5.972990996999e-06, "loss": 1.1218, "step": 2900 }, { "epoch": 0.17444094309390562, "eval_loss": 1.0075299739837646, "eval_runtime": 21.7441, "eval_samples_per_second": 459.894, "eval_steps_per_second": 57.487, "step": 2900 }, { "epoch": 0.18045614802817822, "grad_norm": 1.7186238765716553, "learning_rate": 5.971990663554519e-06, "loss": 1.1074, "step": 3000 }, { "epoch": 0.18045614802817822, "eval_loss": 0.9909061789512634, "eval_runtime": 21.7393, "eval_samples_per_second": 459.997, "eval_steps_per_second": 57.5, "step": 3000 }, { "epoch": 0.18647135296245085, "grad_norm": 1.6869324445724487, "learning_rate": 5.970990330110037e-06, "loss": 1.0871, "step": 3100 }, { "epoch": 0.18647135296245085, "eval_loss": 0.965461254119873, "eval_runtime": 21.7525, "eval_samples_per_second": 459.717, "eval_steps_per_second": 57.465, "step": 3100 }, { "epoch": 0.19248655789672345, "grad_norm": 1.590827465057373, "learning_rate": 5.9699899966655554e-06, "loss": 1.0678, "step": 3200 }, { "epoch": 0.19248655789672345, "eval_loss": 0.9502421617507935, "eval_runtime": 21.7257, "eval_samples_per_second": 460.284, "eval_steps_per_second": 57.536, "step": 3200 }, { "epoch": 0.19850176283099605, "grad_norm": 1.3480803966522217, "learning_rate": 5.968989663221074e-06, "loss": 1.05, "step": 3300 }, { "epoch": 0.19850176283099605, "eval_loss": 0.9217738509178162, "eval_runtime": 21.7126, "eval_samples_per_second": 460.562, "eval_steps_per_second": 57.57, "step": 3300 }, { "epoch": 0.20451696776526865, "grad_norm": 1.611717700958252, "learning_rate": 5.967989329776592e-06, "loss": 1.0308, "step": 3400 }, { "epoch": 0.20451696776526865, "eval_loss": 0.9114508628845215, "eval_runtime": 21.727, "eval_samples_per_second": 460.257, "eval_steps_per_second": 57.532, "step": 3400 }, { "epoch": 0.21053217269954128, "grad_norm": 1.424517035484314, "learning_rate": 5.966988996332111e-06, "loss": 1.0161, "step": 3500 }, { "epoch": 0.21053217269954128, "eval_loss": 0.8955187797546387, "eval_runtime": 21.7397, "eval_samples_per_second": 459.988, "eval_steps_per_second": 57.498, "step": 3500 }, { "epoch": 0.21654737763381388, "grad_norm": 1.8415201902389526, "learning_rate": 5.965988662887629e-06, "loss": 0.9983, "step": 3600 }, { "epoch": 0.21654737763381388, "eval_loss": 0.8804967999458313, "eval_runtime": 21.7608, "eval_samples_per_second": 459.541, "eval_steps_per_second": 57.443, "step": 3600 }, { "epoch": 0.22256258256808648, "grad_norm": 1.5056076049804688, "learning_rate": 5.964988329443148e-06, "loss": 0.9849, "step": 3700 }, { "epoch": 0.22256258256808648, "eval_loss": 0.8613883852958679, "eval_runtime": 21.7348, "eval_samples_per_second": 460.091, "eval_steps_per_second": 57.511, "step": 3700 }, { "epoch": 0.22857778750235908, "grad_norm": 1.6334686279296875, "learning_rate": 5.963987995998667e-06, "loss": 0.9689, "step": 3800 }, { "epoch": 0.22857778750235908, "eval_loss": 0.8555884957313538, "eval_runtime": 21.7044, "eval_samples_per_second": 460.736, "eval_steps_per_second": 57.592, "step": 3800 }, { "epoch": 0.2345929924366317, "grad_norm": 1.7393226623535156, "learning_rate": 5.962987662554185e-06, "loss": 0.9564, "step": 3900 }, { "epoch": 0.2345929924366317, "eval_loss": 0.8427873849868774, "eval_runtime": 21.7376, "eval_samples_per_second": 460.033, "eval_steps_per_second": 57.504, "step": 3900 }, { "epoch": 0.2406081973709043, "grad_norm": 1.5030866861343384, "learning_rate": 5.961987329109703e-06, "loss": 0.9417, "step": 4000 }, { "epoch": 0.2406081973709043, "eval_loss": 0.8300994038581848, "eval_runtime": 21.7438, "eval_samples_per_second": 459.902, "eval_steps_per_second": 57.488, "step": 4000 }, { "epoch": 0.2466234023051769, "grad_norm": 1.8627735376358032, "learning_rate": 5.960986995665222e-06, "loss": 0.9277, "step": 4100 }, { "epoch": 0.2466234023051769, "eval_loss": 0.8120391368865967, "eval_runtime": 21.755, "eval_samples_per_second": 459.664, "eval_steps_per_second": 57.458, "step": 4100 }, { "epoch": 0.25263860723944953, "grad_norm": 1.5174646377563477, "learning_rate": 5.95998666222074e-06, "loss": 0.9123, "step": 4200 }, { "epoch": 0.25263860723944953, "eval_loss": 0.7974905371665955, "eval_runtime": 21.7219, "eval_samples_per_second": 460.366, "eval_steps_per_second": 57.546, "step": 4200 }, { "epoch": 0.25865381217372213, "grad_norm": 1.354490041732788, "learning_rate": 5.958986328776259e-06, "loss": 0.9028, "step": 4300 }, { "epoch": 0.25865381217372213, "eval_loss": 0.7938092947006226, "eval_runtime": 21.7182, "eval_samples_per_second": 460.444, "eval_steps_per_second": 57.555, "step": 4300 }, { "epoch": 0.26466901710799473, "grad_norm": 1.6153218746185303, "learning_rate": 5.957985995331777e-06, "loss": 0.8954, "step": 4400 }, { "epoch": 0.26466901710799473, "eval_loss": 0.7785645723342896, "eval_runtime": 21.7451, "eval_samples_per_second": 459.873, "eval_steps_per_second": 57.484, "step": 4400 }, { "epoch": 0.27068422204226733, "grad_norm": 1.9774231910705566, "learning_rate": 5.956985661887296e-06, "loss": 0.8819, "step": 4500 }, { "epoch": 0.27068422204226733, "eval_loss": 0.7742797136306763, "eval_runtime": 21.7358, "eval_samples_per_second": 460.07, "eval_steps_per_second": 57.509, "step": 4500 }, { "epoch": 0.27669942697653993, "grad_norm": 1.6561676263809204, "learning_rate": 5.955985328442815e-06, "loss": 0.8729, "step": 4600 }, { "epoch": 0.27669942697653993, "eval_loss": 0.7637073397636414, "eval_runtime": 21.7296, "eval_samples_per_second": 460.202, "eval_steps_per_second": 57.525, "step": 4600 }, { "epoch": 0.28271463191081253, "grad_norm": 1.5622860193252563, "learning_rate": 5.954984994998333e-06, "loss": 0.8608, "step": 4700 }, { "epoch": 0.28271463191081253, "eval_loss": 0.7628427743911743, "eval_runtime": 21.7339, "eval_samples_per_second": 460.111, "eval_steps_per_second": 57.514, "step": 4700 }, { "epoch": 0.2887298368450852, "grad_norm": 1.6501961946487427, "learning_rate": 5.953984661553851e-06, "loss": 0.8489, "step": 4800 }, { "epoch": 0.2887298368450852, "eval_loss": 0.7505598068237305, "eval_runtime": 21.7165, "eval_samples_per_second": 460.479, "eval_steps_per_second": 57.56, "step": 4800 }, { "epoch": 0.2947450417793578, "grad_norm": 1.7538303136825562, "learning_rate": 5.95298432810937e-06, "loss": 0.8401, "step": 4900 }, { "epoch": 0.2947450417793578, "eval_loss": 0.7426216006278992, "eval_runtime": 21.7216, "eval_samples_per_second": 460.37, "eval_steps_per_second": 57.546, "step": 4900 }, { "epoch": 0.3007602467136304, "grad_norm": 1.5520670413970947, "learning_rate": 5.951983994664888e-06, "loss": 0.8361, "step": 5000 }, { "epoch": 0.3007602467136304, "eval_loss": 0.7258592247962952, "eval_runtime": 21.7177, "eval_samples_per_second": 460.454, "eval_steps_per_second": 57.557, "step": 5000 }, { "epoch": 0.306775451647903, "grad_norm": 2.0393898487091064, "learning_rate": 5.950983661220407e-06, "loss": 0.8273, "step": 5100 }, { "epoch": 0.306775451647903, "eval_loss": 0.7185364365577698, "eval_runtime": 21.6894, "eval_samples_per_second": 461.054, "eval_steps_per_second": 57.632, "step": 5100 }, { "epoch": 0.3127906565821756, "grad_norm": 1.9601730108261108, "learning_rate": 5.949983327775925e-06, "loss": 0.8135, "step": 5200 }, { "epoch": 0.3127906565821756, "eval_loss": 0.7162497639656067, "eval_runtime": 21.7614, "eval_samples_per_second": 459.53, "eval_steps_per_second": 57.441, "step": 5200 }, { "epoch": 0.3188058615164482, "grad_norm": 1.4966851472854614, "learning_rate": 5.948982994331444e-06, "loss": 0.8037, "step": 5300 }, { "epoch": 0.3188058615164482, "eval_loss": 0.7116673588752747, "eval_runtime": 21.8409, "eval_samples_per_second": 457.857, "eval_steps_per_second": 57.232, "step": 5300 }, { "epoch": 0.3248210664507208, "grad_norm": 1.4574569463729858, "learning_rate": 5.947982660886963e-06, "loss": 0.8027, "step": 5400 }, { "epoch": 0.3248210664507208, "eval_loss": 0.6981866359710693, "eval_runtime": 21.7832, "eval_samples_per_second": 459.07, "eval_steps_per_second": 57.384, "step": 5400 }, { "epoch": 0.33083627138499344, "grad_norm": 1.5823230743408203, "learning_rate": 5.9469823274424815e-06, "loss": 0.7898, "step": 5500 }, { "epoch": 0.33083627138499344, "eval_loss": 0.6950494050979614, "eval_runtime": 21.754, "eval_samples_per_second": 459.685, "eval_steps_per_second": 57.461, "step": 5500 }, { "epoch": 0.33685147631926604, "grad_norm": 1.5350251197814941, "learning_rate": 5.945981993997999e-06, "loss": 0.7829, "step": 5600 }, { "epoch": 0.33685147631926604, "eval_loss": 0.6908562183380127, "eval_runtime": 21.6939, "eval_samples_per_second": 460.958, "eval_steps_per_second": 57.62, "step": 5600 }, { "epoch": 0.34286668125353864, "grad_norm": 1.5343948602676392, "learning_rate": 5.944981660553518e-06, "loss": 0.7778, "step": 5700 }, { "epoch": 0.34286668125353864, "eval_loss": 0.6897854208946228, "eval_runtime": 21.687, "eval_samples_per_second": 461.107, "eval_steps_per_second": 57.638, "step": 5700 }, { "epoch": 0.34888188618781124, "grad_norm": 1.6000343561172485, "learning_rate": 5.943981327109036e-06, "loss": 0.7672, "step": 5800 }, { "epoch": 0.34888188618781124, "eval_loss": 0.6832409501075745, "eval_runtime": 21.7047, "eval_samples_per_second": 460.73, "eval_steps_per_second": 57.591, "step": 5800 }, { "epoch": 0.35489709112208384, "grad_norm": 1.3873372077941895, "learning_rate": 5.942980993664555e-06, "loss": 0.7645, "step": 5900 }, { "epoch": 0.35489709112208384, "eval_loss": 0.6712300777435303, "eval_runtime": 21.707, "eval_samples_per_second": 460.681, "eval_steps_per_second": 57.585, "step": 5900 }, { "epoch": 0.36091229605635644, "grad_norm": 1.5178308486938477, "learning_rate": 5.941980660220073e-06, "loss": 0.756, "step": 6000 }, { "epoch": 0.36091229605635644, "eval_loss": 0.6661484241485596, "eval_runtime": 21.7032, "eval_samples_per_second": 460.761, "eval_steps_per_second": 57.595, "step": 6000 }, { "epoch": 0.36692750099062904, "grad_norm": 1.4745811223983765, "learning_rate": 5.940980326775592e-06, "loss": 0.753, "step": 6100 }, { "epoch": 0.36692750099062904, "eval_loss": 0.664915144443512, "eval_runtime": 21.7252, "eval_samples_per_second": 460.294, "eval_steps_per_second": 57.537, "step": 6100 }, { "epoch": 0.3729427059249017, "grad_norm": 1.6472891569137573, "learning_rate": 5.939979993331111e-06, "loss": 0.743, "step": 6200 }, { "epoch": 0.3729427059249017, "eval_loss": 0.6596666574478149, "eval_runtime": 21.6717, "eval_samples_per_second": 461.432, "eval_steps_per_second": 57.679, "step": 6200 }, { "epoch": 0.3789579108591743, "grad_norm": 1.4315409660339355, "learning_rate": 5.9389796598866294e-06, "loss": 0.737, "step": 6300 }, { "epoch": 0.3789579108591743, "eval_loss": 0.6593905091285706, "eval_runtime": 21.8558, "eval_samples_per_second": 457.545, "eval_steps_per_second": 57.193, "step": 6300 }, { "epoch": 0.3849731157934469, "grad_norm": 1.553122639656067, "learning_rate": 5.937979326442147e-06, "loss": 0.7284, "step": 6400 }, { "epoch": 0.3849731157934469, "eval_loss": 0.6500257253646851, "eval_runtime": 21.725, "eval_samples_per_second": 460.298, "eval_steps_per_second": 57.537, "step": 6400 }, { "epoch": 0.3909883207277195, "grad_norm": 1.4755713939666748, "learning_rate": 5.936978992997666e-06, "loss": 0.7253, "step": 6500 }, { "epoch": 0.3909883207277195, "eval_loss": 0.6457264423370361, "eval_runtime": 21.6694, "eval_samples_per_second": 461.48, "eval_steps_per_second": 57.685, "step": 6500 }, { "epoch": 0.3970035256619921, "grad_norm": 1.3153866529464722, "learning_rate": 5.935978659553185e-06, "loss": 0.7227, "step": 6600 }, { "epoch": 0.3970035256619921, "eval_loss": 0.6387376189231873, "eval_runtime": 21.6735, "eval_samples_per_second": 461.393, "eval_steps_per_second": 57.674, "step": 6600 }, { "epoch": 0.4030187305962647, "grad_norm": 1.3349621295928955, "learning_rate": 5.9349783261087026e-06, "loss": 0.7161, "step": 6700 }, { "epoch": 0.4030187305962647, "eval_loss": 0.6228384971618652, "eval_runtime": 21.6839, "eval_samples_per_second": 461.172, "eval_steps_per_second": 57.646, "step": 6700 }, { "epoch": 0.4090339355305373, "grad_norm": 1.4209269285202026, "learning_rate": 5.933977992664221e-06, "loss": 0.7101, "step": 6800 }, { "epoch": 0.4090339355305373, "eval_loss": 0.6393507719039917, "eval_runtime": 21.6842, "eval_samples_per_second": 461.165, "eval_steps_per_second": 57.646, "step": 6800 }, { "epoch": 0.4150491404648099, "grad_norm": 1.3392629623413086, "learning_rate": 5.93297765921974e-06, "loss": 0.7043, "step": 6900 }, { "epoch": 0.4150491404648099, "eval_loss": 0.6370413303375244, "eval_runtime": 21.6802, "eval_samples_per_second": 461.251, "eval_steps_per_second": 57.656, "step": 6900 }, { "epoch": 0.42106434539908255, "grad_norm": 1.420782446861267, "learning_rate": 5.931977325775259e-06, "loss": 0.6976, "step": 7000 }, { "epoch": 0.42106434539908255, "eval_loss": 0.6197584867477417, "eval_runtime": 21.6736, "eval_samples_per_second": 461.391, "eval_steps_per_second": 57.674, "step": 7000 }, { "epoch": 0.42707955033335515, "grad_norm": 1.3362140655517578, "learning_rate": 5.930976992330777e-06, "loss": 0.6938, "step": 7100 }, { "epoch": 0.42707955033335515, "eval_loss": 0.6171865463256836, "eval_runtime": 21.6908, "eval_samples_per_second": 461.024, "eval_steps_per_second": 57.628, "step": 7100 }, { "epoch": 0.43309475526762775, "grad_norm": 1.2855477333068848, "learning_rate": 5.929976658886295e-06, "loss": 0.6897, "step": 7200 }, { "epoch": 0.43309475526762775, "eval_loss": 0.6011925935745239, "eval_runtime": 21.6697, "eval_samples_per_second": 461.474, "eval_steps_per_second": 57.684, "step": 7200 }, { "epoch": 0.43910996020190035, "grad_norm": 1.6744885444641113, "learning_rate": 5.928976325441814e-06, "loss": 0.6815, "step": 7300 }, { "epoch": 0.43910996020190035, "eval_loss": 0.606606662273407, "eval_runtime": 21.7361, "eval_samples_per_second": 460.064, "eval_steps_per_second": 57.508, "step": 7300 }, { "epoch": 0.44512516513617295, "grad_norm": 1.4268521070480347, "learning_rate": 5.927975991997333e-06, "loss": 0.6785, "step": 7400 }, { "epoch": 0.44512516513617295, "eval_loss": 0.6065685749053955, "eval_runtime": 21.7924, "eval_samples_per_second": 458.876, "eval_steps_per_second": 57.359, "step": 7400 }, { "epoch": 0.45114037007044555, "grad_norm": 1.248145341873169, "learning_rate": 5.9269756585528505e-06, "loss": 0.6734, "step": 7500 }, { "epoch": 0.45114037007044555, "eval_loss": 0.5927532911300659, "eval_runtime": 21.7131, "eval_samples_per_second": 460.551, "eval_steps_per_second": 57.569, "step": 7500 }, { "epoch": 0.45715557500471815, "grad_norm": 1.3543365001678467, "learning_rate": 5.92597532510837e-06, "loss": 0.6692, "step": 7600 }, { "epoch": 0.45715557500471815, "eval_loss": 0.584913432598114, "eval_runtime": 21.6765, "eval_samples_per_second": 461.329, "eval_steps_per_second": 57.666, "step": 7600 }, { "epoch": 0.4631707799389908, "grad_norm": 1.519895315170288, "learning_rate": 5.924974991663888e-06, "loss": 0.6683, "step": 7700 }, { "epoch": 0.4631707799389908, "eval_loss": 0.5899286270141602, "eval_runtime": 21.7078, "eval_samples_per_second": 460.664, "eval_steps_per_second": 57.583, "step": 7700 }, { "epoch": 0.4691859848732634, "grad_norm": 1.3677542209625244, "learning_rate": 5.923974658219407e-06, "loss": 0.6612, "step": 7800 }, { "epoch": 0.4691859848732634, "eval_loss": 0.5877178907394409, "eval_runtime": 21.699, "eval_samples_per_second": 460.851, "eval_steps_per_second": 57.606, "step": 7800 }, { "epoch": 0.475201189807536, "grad_norm": 1.3020201921463013, "learning_rate": 5.922974324774925e-06, "loss": 0.6593, "step": 7900 }, { "epoch": 0.475201189807536, "eval_loss": 0.5901273488998413, "eval_runtime": 21.6975, "eval_samples_per_second": 460.883, "eval_steps_per_second": 57.61, "step": 7900 }, { "epoch": 0.4812163947418086, "grad_norm": 1.2522666454315186, "learning_rate": 5.921973991330443e-06, "loss": 0.6515, "step": 8000 }, { "epoch": 0.4812163947418086, "eval_loss": 0.5791921019554138, "eval_runtime": 21.6482, "eval_samples_per_second": 461.932, "eval_steps_per_second": 57.741, "step": 8000 }, { "epoch": 0.4872315996760812, "grad_norm": 1.7226676940917969, "learning_rate": 5.920973657885962e-06, "loss": 0.6497, "step": 8100 }, { "epoch": 0.4872315996760812, "eval_loss": 0.5783876776695251, "eval_runtime": 21.8009, "eval_samples_per_second": 458.696, "eval_steps_per_second": 57.337, "step": 8100 }, { "epoch": 0.4932468046103538, "grad_norm": 1.4653980731964111, "learning_rate": 5.919973324441481e-06, "loss": 0.6463, "step": 8200 }, { "epoch": 0.4932468046103538, "eval_loss": 0.5752367973327637, "eval_runtime": 21.7179, "eval_samples_per_second": 460.45, "eval_steps_per_second": 57.556, "step": 8200 }, { "epoch": 0.4992620095446264, "grad_norm": 1.3331021070480347, "learning_rate": 5.918972990996999e-06, "loss": 0.6412, "step": 8300 }, { "epoch": 0.4992620095446264, "eval_loss": 0.5725879669189453, "eval_runtime": 21.7719, "eval_samples_per_second": 459.308, "eval_steps_per_second": 57.414, "step": 8300 }, { "epoch": 0.5052772144788991, "grad_norm": 1.245968222618103, "learning_rate": 5.917972657552518e-06, "loss": 0.64, "step": 8400 }, { "epoch": 0.5052772144788991, "eval_loss": 0.5639936923980713, "eval_runtime": 21.7448, "eval_samples_per_second": 459.88, "eval_steps_per_second": 57.485, "step": 8400 }, { "epoch": 0.5112924194131716, "grad_norm": 1.269049882888794, "learning_rate": 5.916972324108037e-06, "loss": 0.6341, "step": 8500 }, { "epoch": 0.5112924194131716, "eval_loss": 0.5605804324150085, "eval_runtime": 21.7116, "eval_samples_per_second": 460.582, "eval_steps_per_second": 57.573, "step": 8500 }, { "epoch": 0.5173076243474443, "grad_norm": 1.2048168182373047, "learning_rate": 5.915971990663555e-06, "loss": 0.6327, "step": 8600 }, { "epoch": 0.5173076243474443, "eval_loss": 0.5681275129318237, "eval_runtime": 21.7037, "eval_samples_per_second": 460.751, "eval_steps_per_second": 57.594, "step": 8600 }, { "epoch": 0.5233228292817168, "grad_norm": 1.269063949584961, "learning_rate": 5.914971657219073e-06, "loss": 0.6251, "step": 8700 }, { "epoch": 0.5233228292817168, "eval_loss": 0.5644165277481079, "eval_runtime": 21.6949, "eval_samples_per_second": 460.937, "eval_steps_per_second": 57.617, "step": 8700 }, { "epoch": 0.5293380342159895, "grad_norm": 1.3928773403167725, "learning_rate": 5.913971323774591e-06, "loss": 0.6268, "step": 8800 }, { "epoch": 0.5293380342159895, "eval_loss": 0.5452607870101929, "eval_runtime": 21.7013, "eval_samples_per_second": 460.803, "eval_steps_per_second": 57.6, "step": 8800 }, { "epoch": 0.5353532391502621, "grad_norm": 1.6263777017593384, "learning_rate": 5.91297099033011e-06, "loss": 0.6198, "step": 8900 }, { "epoch": 0.5353532391502621, "eval_loss": 0.5565773248672485, "eval_runtime": 21.7101, "eval_samples_per_second": 460.615, "eval_steps_per_second": 57.577, "step": 8900 }, { "epoch": 0.5413684440845347, "grad_norm": 1.312068223953247, "learning_rate": 5.911970656885629e-06, "loss": 0.6168, "step": 9000 }, { "epoch": 0.5413684440845347, "eval_loss": 0.544517457485199, "eval_runtime": 21.6689, "eval_samples_per_second": 461.49, "eval_steps_per_second": 57.686, "step": 9000 }, { "epoch": 0.5473836490188073, "grad_norm": 1.4878406524658203, "learning_rate": 5.910970323441147e-06, "loss": 0.6168, "step": 9100 }, { "epoch": 0.5473836490188073, "eval_loss": 0.5467077493667603, "eval_runtime": 21.7585, "eval_samples_per_second": 459.591, "eval_steps_per_second": 57.449, "step": 9100 }, { "epoch": 0.5533988539530799, "grad_norm": 1.4762675762176514, "learning_rate": 5.909969989996666e-06, "loss": 0.6062, "step": 9200 }, { "epoch": 0.5533988539530799, "eval_loss": 0.5416296720504761, "eval_runtime": 21.7398, "eval_samples_per_second": 459.985, "eval_steps_per_second": 57.498, "step": 9200 }, { "epoch": 0.5594140588873525, "grad_norm": 1.3053025007247925, "learning_rate": 5.908969656552185e-06, "loss": 0.6106, "step": 9300 }, { "epoch": 0.5594140588873525, "eval_loss": 0.5386621356010437, "eval_runtime": 21.7444, "eval_samples_per_second": 459.888, "eval_steps_per_second": 57.486, "step": 9300 }, { "epoch": 0.5654292638216251, "grad_norm": 1.5423814058303833, "learning_rate": 5.907969323107703e-06, "loss": 0.6019, "step": 9400 }, { "epoch": 0.5654292638216251, "eval_loss": 0.5405033230781555, "eval_runtime": 21.726, "eval_samples_per_second": 460.277, "eval_steps_per_second": 57.535, "step": 9400 }, { "epoch": 0.5714444687558977, "grad_norm": 1.4696613550186157, "learning_rate": 5.906968989663221e-06, "loss": 0.6011, "step": 9500 }, { "epoch": 0.5714444687558977, "eval_loss": 0.5457667708396912, "eval_runtime": 21.7773, "eval_samples_per_second": 459.193, "eval_steps_per_second": 57.399, "step": 9500 }, { "epoch": 0.5774596736901704, "grad_norm": 1.5349172353744507, "learning_rate": 5.90596865621874e-06, "loss": 0.5961, "step": 9600 }, { "epoch": 0.5774596736901704, "eval_loss": 0.533613920211792, "eval_runtime": 21.9838, "eval_samples_per_second": 454.88, "eval_steps_per_second": 56.86, "step": 9600 }, { "epoch": 0.5834748786244429, "grad_norm": 1.2024816274642944, "learning_rate": 5.904968322774258e-06, "loss": 0.593, "step": 9700 }, { "epoch": 0.5834748786244429, "eval_loss": 0.5246294140815735, "eval_runtime": 22.5017, "eval_samples_per_second": 444.411, "eval_steps_per_second": 55.551, "step": 9700 }, { "epoch": 0.5894900835587156, "grad_norm": 1.2983571290969849, "learning_rate": 5.9039679893297766e-06, "loss": 0.5925, "step": 9800 }, { "epoch": 0.5894900835587156, "eval_loss": 0.5254473686218262, "eval_runtime": 23.0942, "eval_samples_per_second": 433.009, "eval_steps_per_second": 54.126, "step": 9800 }, { "epoch": 0.5955052884929881, "grad_norm": 1.2889515161514282, "learning_rate": 5.902967655885295e-06, "loss": 0.5911, "step": 9900 }, { "epoch": 0.5955052884929881, "eval_loss": 0.5365324020385742, "eval_runtime": 23.3271, "eval_samples_per_second": 428.686, "eval_steps_per_second": 53.586, "step": 9900 }, { "epoch": 0.6015204934272608, "grad_norm": 1.3131366968154907, "learning_rate": 5.901967322440814e-06, "loss": 0.5843, "step": 10000 }, { "epoch": 0.6015204934272608, "eval_loss": 0.5123865008354187, "eval_runtime": 23.435, "eval_samples_per_second": 426.712, "eval_steps_per_second": 53.339, "step": 10000 }, { "epoch": 0.6075356983615333, "grad_norm": 1.3315032720565796, "learning_rate": 5.900966988996333e-06, "loss": 0.5832, "step": 10100 }, { "epoch": 0.6075356983615333, "eval_loss": 0.5256994962692261, "eval_runtime": 23.4061, "eval_samples_per_second": 427.24, "eval_steps_per_second": 53.405, "step": 10100 }, { "epoch": 0.613550903295806, "grad_norm": 1.3008897304534912, "learning_rate": 5.8999666555518505e-06, "loss": 0.582, "step": 10200 }, { "epoch": 0.613550903295806, "eval_loss": 0.5148985981941223, "eval_runtime": 23.4451, "eval_samples_per_second": 426.528, "eval_steps_per_second": 53.316, "step": 10200 }, { "epoch": 0.6195661082300786, "grad_norm": 1.272538423538208, "learning_rate": 5.898966322107369e-06, "loss": 0.5789, "step": 10300 }, { "epoch": 0.6195661082300786, "eval_loss": 0.5160868763923645, "eval_runtime": 23.3699, "eval_samples_per_second": 427.901, "eval_steps_per_second": 53.488, "step": 10300 }, { "epoch": 0.6255813131643512, "grad_norm": 1.38733971118927, "learning_rate": 5.897965988662888e-06, "loss": 0.5768, "step": 10400 }, { "epoch": 0.6255813131643512, "eval_loss": 0.5101234912872314, "eval_runtime": 23.5052, "eval_samples_per_second": 425.437, "eval_steps_per_second": 53.18, "step": 10400 }, { "epoch": 0.6315965180986238, "grad_norm": 1.3414686918258667, "learning_rate": 5.896965655218406e-06, "loss": 0.5728, "step": 10500 }, { "epoch": 0.6315965180986238, "eval_loss": 0.5151140689849854, "eval_runtime": 23.1483, "eval_samples_per_second": 431.997, "eval_steps_per_second": 54.0, "step": 10500 }, { "epoch": 0.6376117230328964, "grad_norm": 1.2821862697601318, "learning_rate": 5.8959653217739245e-06, "loss": 0.5732, "step": 10600 }, { "epoch": 0.6376117230328964, "eval_loss": 0.5067505240440369, "eval_runtime": 23.3046, "eval_samples_per_second": 429.099, "eval_steps_per_second": 53.637, "step": 10600 }, { "epoch": 0.643626927967169, "grad_norm": 1.4687350988388062, "learning_rate": 5.894964988329443e-06, "loss": 0.568, "step": 10700 }, { "epoch": 0.643626927967169, "eval_loss": 0.5038474798202515, "eval_runtime": 48.8496, "eval_samples_per_second": 204.71, "eval_steps_per_second": 25.589, "step": 10700 }, { "epoch": 0.6496421329014416, "grad_norm": 1.1854100227355957, "learning_rate": 5.893964654884962e-06, "loss": 0.5665, "step": 10800 }, { "epoch": 0.6496421329014416, "eval_loss": 0.5092170834541321, "eval_runtime": 51.2918, "eval_samples_per_second": 194.963, "eval_steps_per_second": 24.37, "step": 10800 }, { "epoch": 0.6556573378357142, "grad_norm": 1.2117469310760498, "learning_rate": 5.892964321440481e-06, "loss": 0.5641, "step": 10900 }, { "epoch": 0.6556573378357142, "eval_loss": 0.4948270618915558, "eval_runtime": 51.7341, "eval_samples_per_second": 193.296, "eval_steps_per_second": 24.162, "step": 10900 }, { "epoch": 0.6616725427699869, "grad_norm": 1.1809200048446655, "learning_rate": 5.8919639879959985e-06, "loss": 0.559, "step": 11000 }, { "epoch": 0.6616725427699869, "eval_loss": 0.49759823083877563, "eval_runtime": 50.8828, "eval_samples_per_second": 196.53, "eval_steps_per_second": 24.566, "step": 11000 }, { "epoch": 0.6676877477042594, "grad_norm": 1.4321728944778442, "learning_rate": 5.890963654551517e-06, "loss": 0.5597, "step": 11100 }, { "epoch": 0.6676877477042594, "eval_loss": 0.49609047174453735, "eval_runtime": 51.278, "eval_samples_per_second": 195.015, "eval_steps_per_second": 24.377, "step": 11100 }, { "epoch": 0.6737029526385321, "grad_norm": 1.3043360710144043, "learning_rate": 5.889963321107036e-06, "loss": 0.5574, "step": 11200 }, { "epoch": 0.6737029526385321, "eval_loss": 0.5004040002822876, "eval_runtime": 50.7636, "eval_samples_per_second": 196.992, "eval_steps_per_second": 24.624, "step": 11200 }, { "epoch": 0.6797181575728046, "grad_norm": 1.2415975332260132, "learning_rate": 5.888962987662554e-06, "loss": 0.5555, "step": 11300 }, { "epoch": 0.6797181575728046, "eval_loss": 0.5004035234451294, "eval_runtime": 51.3686, "eval_samples_per_second": 194.672, "eval_steps_per_second": 24.334, "step": 11300 }, { "epoch": 0.6857333625070773, "grad_norm": 1.1731830835342407, "learning_rate": 5.8879626542180725e-06, "loss": 0.5541, "step": 11400 }, { "epoch": 0.6857333625070773, "eval_loss": 0.4998365342617035, "eval_runtime": 50.9083, "eval_samples_per_second": 196.432, "eval_steps_per_second": 24.554, "step": 11400 }, { "epoch": 0.6917485674413498, "grad_norm": 1.2296881675720215, "learning_rate": 5.886962320773592e-06, "loss": 0.5487, "step": 11500 }, { "epoch": 0.6917485674413498, "eval_loss": 0.4932882785797119, "eval_runtime": 50.9764, "eval_samples_per_second": 196.169, "eval_steps_per_second": 24.521, "step": 11500 }, { "epoch": 0.6977637723756225, "grad_norm": 1.4027659893035889, "learning_rate": 5.88596198732911e-06, "loss": 0.5488, "step": 11600 }, { "epoch": 0.6977637723756225, "eval_loss": 0.48723334074020386, "eval_runtime": 51.3087, "eval_samples_per_second": 194.899, "eval_steps_per_second": 24.362, "step": 11600 }, { "epoch": 0.7037789773098951, "grad_norm": 1.345869541168213, "learning_rate": 5.884961653884629e-06, "loss": 0.5464, "step": 11700 }, { "epoch": 0.7037789773098951, "eval_loss": 0.48902279138565063, "eval_runtime": 51.5761, "eval_samples_per_second": 193.888, "eval_steps_per_second": 24.236, "step": 11700 }, { "epoch": 0.7097941822441677, "grad_norm": 1.3029801845550537, "learning_rate": 5.8839613204401465e-06, "loss": 0.545, "step": 11800 }, { "epoch": 0.7097941822441677, "eval_loss": 0.4815163016319275, "eval_runtime": 51.0467, "eval_samples_per_second": 195.899, "eval_steps_per_second": 24.487, "step": 11800 }, { "epoch": 0.7158093871784403, "grad_norm": 1.3300397396087646, "learning_rate": 5.882960986995665e-06, "loss": 0.5406, "step": 11900 }, { "epoch": 0.7158093871784403, "eval_loss": 0.4828699231147766, "eval_runtime": 50.6859, "eval_samples_per_second": 197.294, "eval_steps_per_second": 24.662, "step": 11900 }, { "epoch": 0.7218245921127129, "grad_norm": 1.3354322910308838, "learning_rate": 5.881960653551184e-06, "loss": 0.5412, "step": 12000 }, { "epoch": 0.7218245921127129, "eval_loss": 0.4760846197605133, "eval_runtime": 51.0095, "eval_samples_per_second": 196.042, "eval_steps_per_second": 24.505, "step": 12000 }, { "epoch": 0.7278397970469855, "grad_norm": 1.2316620349884033, "learning_rate": 5.880960320106702e-06, "loss": 0.5354, "step": 12100 }, { "epoch": 0.7278397970469855, "eval_loss": 0.49535489082336426, "eval_runtime": 51.064, "eval_samples_per_second": 195.833, "eval_steps_per_second": 24.479, "step": 12100 }, { "epoch": 0.7338550019812581, "grad_norm": 1.2033593654632568, "learning_rate": 5.879959986662221e-06, "loss": 0.5343, "step": 12200 }, { "epoch": 0.7338550019812581, "eval_loss": 0.4705411195755005, "eval_runtime": 50.9982, "eval_samples_per_second": 196.085, "eval_steps_per_second": 24.511, "step": 12200 }, { "epoch": 0.7398702069155307, "grad_norm": 1.2634704113006592, "learning_rate": 5.87895965321774e-06, "loss": 0.5337, "step": 12300 }, { "epoch": 0.7398702069155307, "eval_loss": 0.47791826725006104, "eval_runtime": 51.1718, "eval_samples_per_second": 195.42, "eval_steps_per_second": 24.428, "step": 12300 }, { "epoch": 0.7458854118498034, "grad_norm": 1.2546501159667969, "learning_rate": 5.877959319773258e-06, "loss": 0.5324, "step": 12400 }, { "epoch": 0.7458854118498034, "eval_loss": 0.4756995737552643, "eval_runtime": 51.0651, "eval_samples_per_second": 195.828, "eval_steps_per_second": 24.479, "step": 12400 }, { "epoch": 0.7519006167840759, "grad_norm": 1.1833654642105103, "learning_rate": 5.876958986328777e-06, "loss": 0.5299, "step": 12500 }, { "epoch": 0.7519006167840759, "eval_loss": 0.47130194306373596, "eval_runtime": 51.0775, "eval_samples_per_second": 195.781, "eval_steps_per_second": 24.473, "step": 12500 }, { "epoch": 0.7579158217183486, "grad_norm": 1.0535800457000732, "learning_rate": 5.875958652884295e-06, "loss": 0.5288, "step": 12600 }, { "epoch": 0.7579158217183486, "eval_loss": 0.46586230397224426, "eval_runtime": 51.3884, "eval_samples_per_second": 194.596, "eval_steps_per_second": 24.325, "step": 12600 }, { "epoch": 0.7639310266526211, "grad_norm": 1.2561872005462646, "learning_rate": 5.874958319439813e-06, "loss": 0.5297, "step": 12700 }, { "epoch": 0.7639310266526211, "eval_loss": 0.4665389657020569, "eval_runtime": 51.1355, "eval_samples_per_second": 195.559, "eval_steps_per_second": 24.445, "step": 12700 }, { "epoch": 0.7699462315868938, "grad_norm": 1.177007794380188, "learning_rate": 5.873957985995332e-06, "loss": 0.5326, "step": 12800 }, { "epoch": 0.7699462315868938, "eval_loss": 0.4671100676059723, "eval_runtime": 51.3263, "eval_samples_per_second": 194.832, "eval_steps_per_second": 24.354, "step": 12800 }, { "epoch": 0.7759614365211663, "grad_norm": 1.181401252746582, "learning_rate": 5.8729576525508506e-06, "loss": 0.5222, "step": 12900 }, { "epoch": 0.7759614365211663, "eval_loss": 0.4585270583629608, "eval_runtime": 51.1292, "eval_samples_per_second": 195.583, "eval_steps_per_second": 24.448, "step": 12900 }, { "epoch": 0.781976641455439, "grad_norm": 1.108788013458252, "learning_rate": 5.871957319106369e-06, "loss": 0.5202, "step": 13000 }, { "epoch": 0.781976641455439, "eval_loss": 0.46135467290878296, "eval_runtime": 51.1302, "eval_samples_per_second": 195.579, "eval_steps_per_second": 24.447, "step": 13000 }, { "epoch": 0.7879918463897116, "grad_norm": 1.152575969696045, "learning_rate": 5.870956985661888e-06, "loss": 0.5157, "step": 13100 }, { "epoch": 0.7879918463897116, "eval_loss": 0.46781352162361145, "eval_runtime": 51.1065, "eval_samples_per_second": 195.67, "eval_steps_per_second": 24.459, "step": 13100 }, { "epoch": 0.7940070513239842, "grad_norm": 1.1765929460525513, "learning_rate": 5.869956652217406e-06, "loss": 0.5177, "step": 13200 }, { "epoch": 0.7940070513239842, "eval_loss": 0.4588942527770996, "eval_runtime": 51.1353, "eval_samples_per_second": 195.56, "eval_steps_per_second": 24.445, "step": 13200 }, { "epoch": 0.8000222562582568, "grad_norm": 1.1165159940719604, "learning_rate": 5.8689563187729245e-06, "loss": 0.5141, "step": 13300 }, { "epoch": 0.8000222562582568, "eval_loss": 0.4517599046230316, "eval_runtime": 51.1096, "eval_samples_per_second": 195.658, "eval_steps_per_second": 24.457, "step": 13300 }, { "epoch": 0.8060374611925294, "grad_norm": 1.0414021015167236, "learning_rate": 5.867955985328443e-06, "loss": 0.5135, "step": 13400 }, { "epoch": 0.8060374611925294, "eval_loss": 0.46558651328086853, "eval_runtime": 51.1277, "eval_samples_per_second": 195.589, "eval_steps_per_second": 24.449, "step": 13400 }, { "epoch": 0.812052666126802, "grad_norm": 1.3002249002456665, "learning_rate": 5.866955651883961e-06, "loss": 0.5124, "step": 13500 }, { "epoch": 0.812052666126802, "eval_loss": 0.4563812017440796, "eval_runtime": 51.132, "eval_samples_per_second": 195.572, "eval_steps_per_second": 24.447, "step": 13500 }, { "epoch": 0.8180678710610746, "grad_norm": 1.5342046022415161, "learning_rate": 5.86595531843948e-06, "loss": 0.5101, "step": 13600 }, { "epoch": 0.8180678710610746, "eval_loss": 0.44918256998062134, "eval_runtime": 51.2205, "eval_samples_per_second": 195.234, "eval_steps_per_second": 24.404, "step": 13600 }, { "epoch": 0.8240830759953472, "grad_norm": 1.312056064605713, "learning_rate": 5.8649549849949985e-06, "loss": 0.5087, "step": 13700 }, { "epoch": 0.8240830759953472, "eval_loss": 0.45463162660598755, "eval_runtime": 50.988, "eval_samples_per_second": 196.125, "eval_steps_per_second": 24.516, "step": 13700 }, { "epoch": 0.8300982809296198, "grad_norm": 1.4413928985595703, "learning_rate": 5.863954651550517e-06, "loss": 0.5079, "step": 13800 }, { "epoch": 0.8300982809296198, "eval_loss": 0.4562767446041107, "eval_runtime": 51.212, "eval_samples_per_second": 195.267, "eval_steps_per_second": 24.408, "step": 13800 }, { "epoch": 0.8361134858638924, "grad_norm": 1.3391541242599487, "learning_rate": 5.862954318106036e-06, "loss": 0.5077, "step": 13900 }, { "epoch": 0.8361134858638924, "eval_loss": 0.44607582688331604, "eval_runtime": 51.1173, "eval_samples_per_second": 195.628, "eval_steps_per_second": 24.454, "step": 13900 }, { "epoch": 0.8421286907981651, "grad_norm": 1.2158905267715454, "learning_rate": 5.861953984661554e-06, "loss": 0.5032, "step": 14000 }, { "epoch": 0.8421286907981651, "eval_loss": 0.4587889611721039, "eval_runtime": 51.1702, "eval_samples_per_second": 195.426, "eval_steps_per_second": 24.428, "step": 14000 }, { "epoch": 0.8481438957324376, "grad_norm": 1.1938725709915161, "learning_rate": 5.8609536512170725e-06, "loss": 0.4996, "step": 14100 }, { "epoch": 0.8481438957324376, "eval_loss": 0.4515674412250519, "eval_runtime": 51.1351, "eval_samples_per_second": 195.56, "eval_steps_per_second": 24.445, "step": 14100 }, { "epoch": 0.8541591006667103, "grad_norm": 1.1953227519989014, "learning_rate": 5.859953317772591e-06, "loss": 0.5014, "step": 14200 }, { "epoch": 0.8541591006667103, "eval_loss": 0.44719940423965454, "eval_runtime": 51.0487, "eval_samples_per_second": 195.891, "eval_steps_per_second": 24.486, "step": 14200 }, { "epoch": 0.8601743056009828, "grad_norm": 1.2699577808380127, "learning_rate": 5.858952984328109e-06, "loss": 0.499, "step": 14300 }, { "epoch": 0.8601743056009828, "eval_loss": 0.4444737732410431, "eval_runtime": 51.2894, "eval_samples_per_second": 194.972, "eval_steps_per_second": 24.372, "step": 14300 }, { "epoch": 0.8661895105352555, "grad_norm": 1.0982294082641602, "learning_rate": 5.857952650883628e-06, "loss": 0.5024, "step": 14400 }, { "epoch": 0.8661895105352555, "eval_loss": 0.4426032602787018, "eval_runtime": 51.0622, "eval_samples_per_second": 195.84, "eval_steps_per_second": 24.48, "step": 14400 }, { "epoch": 0.872204715469528, "grad_norm": 1.1881742477416992, "learning_rate": 5.8569523174391465e-06, "loss": 0.4971, "step": 14500 }, { "epoch": 0.872204715469528, "eval_loss": 0.4500812590122223, "eval_runtime": 51.0676, "eval_samples_per_second": 195.819, "eval_steps_per_second": 24.477, "step": 14500 }, { "epoch": 0.8782199204038007, "grad_norm": 1.2892823219299316, "learning_rate": 5.855951983994665e-06, "loss": 0.4947, "step": 14600 }, { "epoch": 0.8782199204038007, "eval_loss": 0.45143038034439087, "eval_runtime": 51.2218, "eval_samples_per_second": 195.229, "eval_steps_per_second": 24.404, "step": 14600 }, { "epoch": 0.8842351253380734, "grad_norm": 1.1228898763656616, "learning_rate": 5.854951650550184e-06, "loss": 0.4912, "step": 14700 }, { "epoch": 0.8842351253380734, "eval_loss": 0.443864107131958, "eval_runtime": 51.1005, "eval_samples_per_second": 195.693, "eval_steps_per_second": 24.462, "step": 14700 }, { "epoch": 0.8902503302723459, "grad_norm": 1.2021640539169312, "learning_rate": 5.853951317105702e-06, "loss": 0.4911, "step": 14800 }, { "epoch": 0.8902503302723459, "eval_loss": 0.44539061188697815, "eval_runtime": 51.3647, "eval_samples_per_second": 194.686, "eval_steps_per_second": 24.336, "step": 14800 }, { "epoch": 0.8962655352066186, "grad_norm": 1.226335883140564, "learning_rate": 5.8529509836612205e-06, "loss": 0.488, "step": 14900 }, { "epoch": 0.8962655352066186, "eval_loss": 0.43708336353302, "eval_runtime": 51.0878, "eval_samples_per_second": 195.741, "eval_steps_per_second": 24.468, "step": 14900 }, { "epoch": 0.9022807401408911, "grad_norm": 1.1519514322280884, "learning_rate": 5.851950650216739e-06, "loss": 0.4879, "step": 15000 }, { "epoch": 0.9022807401408911, "eval_loss": 0.43572157621383667, "eval_runtime": 51.0673, "eval_samples_per_second": 195.82, "eval_steps_per_second": 24.477, "step": 15000 }, { "epoch": 0.9082959450751638, "grad_norm": 1.0578216314315796, "learning_rate": 5.850950316772257e-06, "loss": 0.491, "step": 15100 }, { "epoch": 0.9082959450751638, "eval_loss": 0.43306058645248413, "eval_runtime": 51.2921, "eval_samples_per_second": 194.962, "eval_steps_per_second": 24.37, "step": 15100 }, { "epoch": 0.9143111500094363, "grad_norm": 1.292629599571228, "learning_rate": 5.849949983327776e-06, "loss": 0.4852, "step": 15200 }, { "epoch": 0.9143111500094363, "eval_loss": 0.43448084592819214, "eval_runtime": 51.0849, "eval_samples_per_second": 195.752, "eval_steps_per_second": 24.469, "step": 15200 }, { "epoch": 0.920326354943709, "grad_norm": 1.2115490436553955, "learning_rate": 5.8489496498832945e-06, "loss": 0.4879, "step": 15300 }, { "epoch": 0.920326354943709, "eval_loss": 0.4403839409351349, "eval_runtime": 51.0866, "eval_samples_per_second": 195.746, "eval_steps_per_second": 24.468, "step": 15300 }, { "epoch": 0.9263415598779816, "grad_norm": 1.2206310033798218, "learning_rate": 5.847949316438813e-06, "loss": 0.4771, "step": 15400 }, { "epoch": 0.9263415598779816, "eval_loss": 0.43060389161109924, "eval_runtime": 51.0659, "eval_samples_per_second": 195.825, "eval_steps_per_second": 24.478, "step": 15400 }, { "epoch": 0.9323567648122542, "grad_norm": 1.0853536128997803, "learning_rate": 5.846948982994332e-06, "loss": 0.4821, "step": 15500 }, { "epoch": 0.9323567648122542, "eval_loss": 0.42842620611190796, "eval_runtime": 51.036, "eval_samples_per_second": 195.94, "eval_steps_per_second": 24.493, "step": 15500 }, { "epoch": 0.9383719697465268, "grad_norm": 1.0656437873840332, "learning_rate": 5.8459486495498506e-06, "loss": 0.4796, "step": 15600 }, { "epoch": 0.9383719697465268, "eval_loss": 0.4259638786315918, "eval_runtime": 51.0811, "eval_samples_per_second": 195.767, "eval_steps_per_second": 24.471, "step": 15600 }, { "epoch": 0.9443871746807994, "grad_norm": 1.2496039867401123, "learning_rate": 5.8449483161053684e-06, "loss": 0.4783, "step": 15700 }, { "epoch": 0.9443871746807994, "eval_loss": 0.42784813046455383, "eval_runtime": 51.0862, "eval_samples_per_second": 195.748, "eval_steps_per_second": 24.468, "step": 15700 }, { "epoch": 0.950402379615072, "grad_norm": 1.0478885173797607, "learning_rate": 5.843947982660887e-06, "loss": 0.4736, "step": 15800 }, { "epoch": 0.950402379615072, "eval_loss": 0.42105141282081604, "eval_runtime": 51.0949, "eval_samples_per_second": 195.714, "eval_steps_per_second": 24.464, "step": 15800 }, { "epoch": 0.9564175845493446, "grad_norm": 1.1973545551300049, "learning_rate": 5.842947649216405e-06, "loss": 0.4765, "step": 15900 }, { "epoch": 0.9564175845493446, "eval_loss": 0.41922861337661743, "eval_runtime": 51.0499, "eval_samples_per_second": 195.887, "eval_steps_per_second": 24.486, "step": 15900 }, { "epoch": 0.9624327894836172, "grad_norm": 1.0738471746444702, "learning_rate": 5.841947315771924e-06, "loss": 0.4713, "step": 16000 }, { "epoch": 0.9624327894836172, "eval_loss": 0.4311535060405731, "eval_runtime": 51.0775, "eval_samples_per_second": 195.781, "eval_steps_per_second": 24.473, "step": 16000 }, { "epoch": 0.9684479944178899, "grad_norm": 1.14482581615448, "learning_rate": 5.840946982327443e-06, "loss": 0.4732, "step": 16100 }, { "epoch": 0.9684479944178899, "eval_loss": 0.41709282994270325, "eval_runtime": 39.7116, "eval_samples_per_second": 251.815, "eval_steps_per_second": 31.477, "step": 16100 }, { "epoch": 0.9744631993521624, "grad_norm": 1.1577385663986206, "learning_rate": 5.839946648882961e-06, "loss": 0.4704, "step": 16200 }, { "epoch": 0.9744631993521624, "eval_loss": 0.4273630976676941, "eval_runtime": 51.0906, "eval_samples_per_second": 195.731, "eval_steps_per_second": 24.466, "step": 16200 }, { "epoch": 0.9804784042864351, "grad_norm": 1.125328779220581, "learning_rate": 5.83894631543848e-06, "loss": 0.4697, "step": 16300 }, { "epoch": 0.9804784042864351, "eval_loss": 0.42490535974502563, "eval_runtime": 51.0751, "eval_samples_per_second": 195.79, "eval_steps_per_second": 24.474, "step": 16300 }, { "epoch": 0.9864936092207076, "grad_norm": 1.2619575262069702, "learning_rate": 5.8379459819939985e-06, "loss": 0.4721, "step": 16400 }, { "epoch": 0.9864936092207076, "eval_loss": 0.42143183946609497, "eval_runtime": 51.2808, "eval_samples_per_second": 195.005, "eval_steps_per_second": 24.376, "step": 16400 }, { "epoch": 0.9925088141549803, "grad_norm": 1.0622971057891846, "learning_rate": 5.836945648549516e-06, "loss": 0.4672, "step": 16500 }, { "epoch": 0.9925088141549803, "eval_loss": 0.4140073359012604, "eval_runtime": 51.137, "eval_samples_per_second": 195.553, "eval_steps_per_second": 24.444, "step": 16500 }, { "epoch": 0.9985240190892528, "grad_norm": 1.1675751209259033, "learning_rate": 5.835945315105035e-06, "loss": 0.469, "step": 16600 }, { "epoch": 0.9985240190892528, "eval_loss": 0.413769394159317, "eval_runtime": 51.1298, "eval_samples_per_second": 195.581, "eval_steps_per_second": 24.448, "step": 16600 }, { "epoch": 1.0045392240235254, "grad_norm": 1.1390060186386108, "learning_rate": 5.834944981660553e-06, "loss": 0.4668, "step": 16700 }, { "epoch": 1.0045392240235254, "eval_loss": 0.41630059480667114, "eval_runtime": 51.1382, "eval_samples_per_second": 195.548, "eval_steps_per_second": 24.444, "step": 16700 }, { "epoch": 1.0105544289577981, "grad_norm": 1.2013533115386963, "learning_rate": 5.8339446482160725e-06, "loss": 0.4636, "step": 16800 }, { "epoch": 1.0105544289577981, "eval_loss": 0.4128175675868988, "eval_runtime": 51.0766, "eval_samples_per_second": 195.784, "eval_steps_per_second": 24.473, "step": 16800 }, { "epoch": 1.0165696338920707, "grad_norm": 1.1893339157104492, "learning_rate": 5.832944314771591e-06, "loss": 0.4628, "step": 16900 }, { "epoch": 1.0165696338920707, "eval_loss": 0.4195719361305237, "eval_runtime": 51.0932, "eval_samples_per_second": 195.721, "eval_steps_per_second": 24.465, "step": 16900 }, { "epoch": 1.0225848388263432, "grad_norm": 1.1112314462661743, "learning_rate": 5.831943981327109e-06, "loss": 0.4631, "step": 17000 }, { "epoch": 1.0225848388263432, "eval_loss": 0.41490069031715393, "eval_runtime": 51.0962, "eval_samples_per_second": 195.709, "eval_steps_per_second": 24.464, "step": 17000 }, { "epoch": 1.028600043760616, "grad_norm": 1.0246236324310303, "learning_rate": 5.830943647882628e-06, "loss": 0.4634, "step": 17100 }, { "epoch": 1.028600043760616, "eval_loss": 0.4150553345680237, "eval_runtime": 51.0756, "eval_samples_per_second": 195.788, "eval_steps_per_second": 24.474, "step": 17100 }, { "epoch": 1.0346152486948885, "grad_norm": 1.09652578830719, "learning_rate": 5.8299433144381465e-06, "loss": 0.4618, "step": 17200 }, { "epoch": 1.0346152486948885, "eval_loss": 0.41938120126724243, "eval_runtime": 51.0832, "eval_samples_per_second": 195.759, "eval_steps_per_second": 24.47, "step": 17200 }, { "epoch": 1.040630453629161, "grad_norm": 1.123412013053894, "learning_rate": 5.828942980993664e-06, "loss": 0.4598, "step": 17300 }, { "epoch": 1.040630453629161, "eval_loss": 0.4131644666194916, "eval_runtime": 51.0626, "eval_samples_per_second": 195.838, "eval_steps_per_second": 24.48, "step": 17300 }, { "epoch": 1.0466456585634338, "grad_norm": 1.195304274559021, "learning_rate": 5.827942647549183e-06, "loss": 0.455, "step": 17400 }, { "epoch": 1.0466456585634338, "eval_loss": 0.40582725405693054, "eval_runtime": 51.2954, "eval_samples_per_second": 194.949, "eval_steps_per_second": 24.369, "step": 17400 }, { "epoch": 1.0526608634977064, "grad_norm": 1.149339199066162, "learning_rate": 5.826942314104702e-06, "loss": 0.4547, "step": 17500 }, { "epoch": 1.0526608634977064, "eval_loss": 0.4130345582962036, "eval_runtime": 51.0931, "eval_samples_per_second": 195.721, "eval_steps_per_second": 24.465, "step": 17500 }, { "epoch": 1.058676068431979, "grad_norm": 1.1289178133010864, "learning_rate": 5.8259419806602205e-06, "loss": 0.4551, "step": 17600 }, { "epoch": 1.058676068431979, "eval_loss": 0.4048755466938019, "eval_runtime": 51.0261, "eval_samples_per_second": 195.978, "eval_steps_per_second": 24.497, "step": 17600 }, { "epoch": 1.0646912733662515, "grad_norm": 1.1146255731582642, "learning_rate": 5.824941647215739e-06, "loss": 0.4509, "step": 17700 }, { "epoch": 1.0646912733662515, "eval_loss": 0.401869535446167, "eval_runtime": 51.168, "eval_samples_per_second": 195.435, "eval_steps_per_second": 24.429, "step": 17700 }, { "epoch": 1.0707064783005242, "grad_norm": 1.2300053834915161, "learning_rate": 5.823941313771257e-06, "loss": 0.4505, "step": 17800 }, { "epoch": 1.0707064783005242, "eval_loss": 0.4011248052120209, "eval_runtime": 51.0381, "eval_samples_per_second": 195.932, "eval_steps_per_second": 24.491, "step": 17800 }, { "epoch": 1.0767216832347968, "grad_norm": 1.1278949975967407, "learning_rate": 5.822940980326776e-06, "loss": 0.4499, "step": 17900 }, { "epoch": 1.0767216832347968, "eval_loss": 0.4098372459411621, "eval_runtime": 51.1549, "eval_samples_per_second": 195.485, "eval_steps_per_second": 24.436, "step": 17900 }, { "epoch": 1.0827368881690693, "grad_norm": 1.1039050817489624, "learning_rate": 5.8219406468822945e-06, "loss": 0.4479, "step": 18000 }, { "epoch": 1.0827368881690693, "eval_loss": 0.4014202356338501, "eval_runtime": 51.282, "eval_samples_per_second": 195.0, "eval_steps_per_second": 24.375, "step": 18000 }, { "epoch": 1.0887520931033419, "grad_norm": 1.0981614589691162, "learning_rate": 5.820940313437812e-06, "loss": 0.4505, "step": 18100 }, { "epoch": 1.0887520931033419, "eval_loss": 0.40326839685440063, "eval_runtime": 51.0953, "eval_samples_per_second": 195.713, "eval_steps_per_second": 24.464, "step": 18100 }, { "epoch": 1.0947672980376146, "grad_norm": 1.1146022081375122, "learning_rate": 5.819939979993331e-06, "loss": 0.4485, "step": 18200 }, { "epoch": 1.0947672980376146, "eval_loss": 0.4028699994087219, "eval_runtime": 51.095, "eval_samples_per_second": 195.714, "eval_steps_per_second": 24.464, "step": 18200 }, { "epoch": 1.1007825029718872, "grad_norm": 1.0906445980072021, "learning_rate": 5.81893964654885e-06, "loss": 0.4441, "step": 18300 }, { "epoch": 1.1007825029718872, "eval_loss": 0.39843133091926575, "eval_runtime": 51.2428, "eval_samples_per_second": 195.149, "eval_steps_per_second": 24.394, "step": 18300 }, { "epoch": 1.1067977079061597, "grad_norm": 1.0257636308670044, "learning_rate": 5.8179393131043684e-06, "loss": 0.4456, "step": 18400 }, { "epoch": 1.1067977079061597, "eval_loss": 0.3976500630378723, "eval_runtime": 51.0817, "eval_samples_per_second": 195.765, "eval_steps_per_second": 24.471, "step": 18400 }, { "epoch": 1.1128129128404325, "grad_norm": 1.1339443922042847, "learning_rate": 5.816938979659887e-06, "loss": 0.4441, "step": 18500 }, { "epoch": 1.1128129128404325, "eval_loss": 0.403137743473053, "eval_runtime": 51.196, "eval_samples_per_second": 195.328, "eval_steps_per_second": 24.416, "step": 18500 }, { "epoch": 1.118828117774705, "grad_norm": 1.146203637123108, "learning_rate": 5.815938646215406e-06, "loss": 0.4431, "step": 18600 }, { "epoch": 1.118828117774705, "eval_loss": 0.40482422709465027, "eval_runtime": 51.0834, "eval_samples_per_second": 195.758, "eval_steps_per_second": 24.47, "step": 18600 }, { "epoch": 1.1248433227089776, "grad_norm": 1.1327886581420898, "learning_rate": 5.814938312770924e-06, "loss": 0.4446, "step": 18700 }, { "epoch": 1.1248433227089776, "eval_loss": 0.39922335743904114, "eval_runtime": 51.1856, "eval_samples_per_second": 195.367, "eval_steps_per_second": 24.421, "step": 18700 }, { "epoch": 1.1308585276432503, "grad_norm": 1.1702196598052979, "learning_rate": 5.8139379793264424e-06, "loss": 0.4412, "step": 18800 }, { "epoch": 1.1308585276432503, "eval_loss": 0.39871400594711304, "eval_runtime": 51.1987, "eval_samples_per_second": 195.317, "eval_steps_per_second": 24.415, "step": 18800 }, { "epoch": 1.1368737325775229, "grad_norm": 1.0438004732131958, "learning_rate": 5.81293764588196e-06, "loss": 0.44, "step": 18900 }, { "epoch": 1.1368737325775229, "eval_loss": 0.3967694044113159, "eval_runtime": 51.0919, "eval_samples_per_second": 195.726, "eval_steps_per_second": 24.466, "step": 18900 }, { "epoch": 1.1428889375117954, "grad_norm": 1.0050268173217773, "learning_rate": 5.811937312437479e-06, "loss": 0.4395, "step": 19000 }, { "epoch": 1.1428889375117954, "eval_loss": 0.3952539563179016, "eval_runtime": 51.3885, "eval_samples_per_second": 194.596, "eval_steps_per_second": 24.325, "step": 19000 }, { "epoch": 1.148904142446068, "grad_norm": 1.0875275135040283, "learning_rate": 5.810936978992998e-06, "loss": 0.4346, "step": 19100 }, { "epoch": 1.148904142446068, "eval_loss": 0.3918244242668152, "eval_runtime": 51.0342, "eval_samples_per_second": 195.947, "eval_steps_per_second": 24.493, "step": 19100 }, { "epoch": 1.1549193473803407, "grad_norm": 1.0449281930923462, "learning_rate": 5.809936645548516e-06, "loss": 0.4391, "step": 19200 }, { "epoch": 1.1549193473803407, "eval_loss": 0.3855830729007721, "eval_runtime": 51.1568, "eval_samples_per_second": 195.478, "eval_steps_per_second": 24.435, "step": 19200 }, { "epoch": 1.1609345523146133, "grad_norm": 0.9773437976837158, "learning_rate": 5.808936312104035e-06, "loss": 0.4355, "step": 19300 }, { "epoch": 1.1609345523146133, "eval_loss": 0.3886500597000122, "eval_runtime": 51.1956, "eval_samples_per_second": 195.329, "eval_steps_per_second": 24.416, "step": 19300 }, { "epoch": 1.1669497572488858, "grad_norm": 1.091601014137268, "learning_rate": 5.807935978659554e-06, "loss": 0.4344, "step": 19400 }, { "epoch": 1.1669497572488858, "eval_loss": 0.3868565857410431, "eval_runtime": 51.1098, "eval_samples_per_second": 195.657, "eval_steps_per_second": 24.457, "step": 19400 }, { "epoch": 1.1729649621831584, "grad_norm": 1.1882948875427246, "learning_rate": 5.806935645215072e-06, "loss": 0.434, "step": 19500 }, { "epoch": 1.1729649621831584, "eval_loss": 0.38946595788002014, "eval_runtime": 51.2843, "eval_samples_per_second": 194.991, "eval_steps_per_second": 24.374, "step": 19500 }, { "epoch": 1.1789801671174311, "grad_norm": 1.0534999370574951, "learning_rate": 5.80593531177059e-06, "loss": 0.4329, "step": 19600 }, { "epoch": 1.1789801671174311, "eval_loss": 0.3830993175506592, "eval_runtime": 50.9094, "eval_samples_per_second": 196.428, "eval_steps_per_second": 24.553, "step": 19600 }, { "epoch": 1.1849953720517037, "grad_norm": 1.0696886777877808, "learning_rate": 5.804934978326108e-06, "loss": 0.4311, "step": 19700 }, { "epoch": 1.1849953720517037, "eval_loss": 0.39124995470046997, "eval_runtime": 51.1273, "eval_samples_per_second": 195.59, "eval_steps_per_second": 24.449, "step": 19700 }, { "epoch": 1.1910105769859762, "grad_norm": 1.0171489715576172, "learning_rate": 5.803934644881627e-06, "loss": 0.4332, "step": 19800 }, { "epoch": 1.1910105769859762, "eval_loss": 0.384937584400177, "eval_runtime": 51.3256, "eval_samples_per_second": 194.834, "eval_steps_per_second": 24.354, "step": 19800 }, { "epoch": 1.197025781920249, "grad_norm": 1.1686575412750244, "learning_rate": 5.802934311437146e-06, "loss": 0.4289, "step": 19900 }, { "epoch": 1.197025781920249, "eval_loss": 0.38561180233955383, "eval_runtime": 51.072, "eval_samples_per_second": 195.802, "eval_steps_per_second": 24.475, "step": 19900 }, { "epoch": 1.2030409868545215, "grad_norm": 1.0748465061187744, "learning_rate": 5.801933977992664e-06, "loss": 0.4334, "step": 20000 }, { "epoch": 1.2030409868545215, "eval_loss": 0.382721871137619, "eval_runtime": 51.3966, "eval_samples_per_second": 194.565, "eval_steps_per_second": 24.321, "step": 20000 }, { "epoch": 1.209056191788794, "grad_norm": 1.100787878036499, "learning_rate": 5.800933644548183e-06, "loss": 0.4239, "step": 20100 }, { "epoch": 1.209056191788794, "eval_loss": 0.3841208517551422, "eval_runtime": 51.057, "eval_samples_per_second": 195.859, "eval_steps_per_second": 24.482, "step": 20100 }, { "epoch": 1.2150713967230669, "grad_norm": 1.04718017578125, "learning_rate": 5.799933311103702e-06, "loss": 0.4271, "step": 20200 }, { "epoch": 1.2150713967230669, "eval_loss": 0.3771766424179077, "eval_runtime": 51.2777, "eval_samples_per_second": 195.017, "eval_steps_per_second": 24.377, "step": 20200 }, { "epoch": 1.2210866016573394, "grad_norm": 1.1533209085464478, "learning_rate": 5.79893297765922e-06, "loss": 0.4254, "step": 20300 }, { "epoch": 1.2210866016573394, "eval_loss": 0.38013017177581787, "eval_runtime": 51.0118, "eval_samples_per_second": 196.033, "eval_steps_per_second": 24.504, "step": 20300 }, { "epoch": 1.227101806591612, "grad_norm": 1.2025070190429688, "learning_rate": 5.797932644214738e-06, "loss": 0.4263, "step": 20400 }, { "epoch": 1.227101806591612, "eval_loss": 0.37795642018318176, "eval_runtime": 51.132, "eval_samples_per_second": 195.572, "eval_steps_per_second": 24.447, "step": 20400 }, { "epoch": 1.2331170115258845, "grad_norm": 1.1051814556121826, "learning_rate": 5.796932310770257e-06, "loss": 0.4256, "step": 20500 }, { "epoch": 1.2331170115258845, "eval_loss": 0.37627479434013367, "eval_runtime": 50.9072, "eval_samples_per_second": 196.436, "eval_steps_per_second": 24.554, "step": 20500 }, { "epoch": 1.2391322164601573, "grad_norm": 1.0987049341201782, "learning_rate": 5.795931977325775e-06, "loss": 0.4239, "step": 20600 }, { "epoch": 1.2391322164601573, "eval_loss": 0.3853623569011688, "eval_runtime": 51.0608, "eval_samples_per_second": 195.845, "eval_steps_per_second": 24.481, "step": 20600 }, { "epoch": 1.2451474213944298, "grad_norm": 1.0989750623703003, "learning_rate": 5.794931643881294e-06, "loss": 0.4197, "step": 20700 }, { "epoch": 1.2451474213944298, "eval_loss": 0.3807806670665741, "eval_runtime": 51.3594, "eval_samples_per_second": 194.706, "eval_steps_per_second": 24.338, "step": 20700 }, { "epoch": 1.2511626263287023, "grad_norm": 1.0866729021072388, "learning_rate": 5.793931310436812e-06, "loss": 0.4234, "step": 20800 }, { "epoch": 1.2511626263287023, "eval_loss": 0.3777351379394531, "eval_runtime": 51.0621, "eval_samples_per_second": 195.84, "eval_steps_per_second": 24.48, "step": 20800 }, { "epoch": 1.2571778312629749, "grad_norm": 1.1387032270431519, "learning_rate": 5.792930976992331e-06, "loss": 0.4197, "step": 20900 }, { "epoch": 1.2571778312629749, "eval_loss": 0.3739318549633026, "eval_runtime": 51.1648, "eval_samples_per_second": 195.447, "eval_steps_per_second": 24.431, "step": 20900 }, { "epoch": 1.2631930361972477, "grad_norm": 0.9848424792289734, "learning_rate": 5.79193064354785e-06, "loss": 0.4225, "step": 21000 }, { "epoch": 1.2631930361972477, "eval_loss": 0.3804405629634857, "eval_runtime": 51.1688, "eval_samples_per_second": 195.431, "eval_steps_per_second": 24.429, "step": 21000 }, { "epoch": 1.2692082411315202, "grad_norm": 1.0492684841156006, "learning_rate": 5.790930310103368e-06, "loss": 0.4179, "step": 21100 }, { "epoch": 1.2692082411315202, "eval_loss": 0.37157440185546875, "eval_runtime": 51.0428, "eval_samples_per_second": 195.914, "eval_steps_per_second": 24.489, "step": 21100 }, { "epoch": 1.2752234460657927, "grad_norm": 1.2355892658233643, "learning_rate": 5.789929976658886e-06, "loss": 0.4177, "step": 21200 }, { "epoch": 1.2752234460657927, "eval_loss": 0.3794465661048889, "eval_runtime": 51.1116, "eval_samples_per_second": 195.65, "eval_steps_per_second": 24.456, "step": 21200 }, { "epoch": 1.2812386510000655, "grad_norm": 1.1180801391601562, "learning_rate": 5.788929643214405e-06, "loss": 0.4192, "step": 21300 }, { "epoch": 1.2812386510000655, "eval_loss": 0.3741929829120636, "eval_runtime": 51.043, "eval_samples_per_second": 195.913, "eval_steps_per_second": 24.489, "step": 21300 }, { "epoch": 1.287253855934338, "grad_norm": 1.1260274648666382, "learning_rate": 5.787929309769923e-06, "loss": 0.4165, "step": 21400 }, { "epoch": 1.287253855934338, "eval_loss": 0.37511906027793884, "eval_runtime": 51.1867, "eval_samples_per_second": 195.363, "eval_steps_per_second": 24.42, "step": 21400 }, { "epoch": 1.2932690608686106, "grad_norm": 1.0729244947433472, "learning_rate": 5.7869289763254424e-06, "loss": 0.4148, "step": 21500 }, { "epoch": 1.2932690608686106, "eval_loss": 0.3755778670310974, "eval_runtime": 50.9919, "eval_samples_per_second": 196.11, "eval_steps_per_second": 24.514, "step": 21500 }, { "epoch": 1.2992842658028834, "grad_norm": 1.5396491289138794, "learning_rate": 5.785928642880961e-06, "loss": 0.4128, "step": 21600 }, { "epoch": 1.2992842658028834, "eval_loss": 0.3713712990283966, "eval_runtime": 51.0389, "eval_samples_per_second": 195.929, "eval_steps_per_second": 24.491, "step": 21600 }, { "epoch": 1.305299470737156, "grad_norm": 0.9880481362342834, "learning_rate": 5.784928309436479e-06, "loss": 0.4138, "step": 21700 }, { "epoch": 1.305299470737156, "eval_loss": 0.3710058033466339, "eval_runtime": 51.3224, "eval_samples_per_second": 194.847, "eval_steps_per_second": 24.356, "step": 21700 }, { "epoch": 1.3113146756714285, "grad_norm": 0.9788950085639954, "learning_rate": 5.783927975991998e-06, "loss": 0.4108, "step": 21800 }, { "epoch": 1.3113146756714285, "eval_loss": 0.3687758147716522, "eval_runtime": 51.0044, "eval_samples_per_second": 196.062, "eval_steps_per_second": 24.508, "step": 21800 }, { "epoch": 1.317329880605701, "grad_norm": 1.0298100709915161, "learning_rate": 5.782927642547516e-06, "loss": 0.4129, "step": 21900 }, { "epoch": 1.317329880605701, "eval_loss": 0.365496426820755, "eval_runtime": 51.065, "eval_samples_per_second": 195.829, "eval_steps_per_second": 24.479, "step": 21900 }, { "epoch": 1.3233450855399735, "grad_norm": 1.0753816366195679, "learning_rate": 5.781927309103034e-06, "loss": 0.413, "step": 22000 }, { "epoch": 1.3233450855399735, "eval_loss": 0.3655156195163727, "eval_runtime": 51.117, "eval_samples_per_second": 195.63, "eval_steps_per_second": 24.454, "step": 22000 }, { "epoch": 1.3293602904742463, "grad_norm": 1.1379014253616333, "learning_rate": 5.780926975658553e-06, "loss": 0.4101, "step": 22100 }, { "epoch": 1.3293602904742463, "eval_loss": 0.37188926339149475, "eval_runtime": 51.0999, "eval_samples_per_second": 195.695, "eval_steps_per_second": 24.462, "step": 22100 }, { "epoch": 1.3353754954085189, "grad_norm": 0.9869519472122192, "learning_rate": 5.779926642214072e-06, "loss": 0.4113, "step": 22200 }, { "epoch": 1.3353754954085189, "eval_loss": 0.36685308814048767, "eval_runtime": 50.9524, "eval_samples_per_second": 196.262, "eval_steps_per_second": 24.533, "step": 22200 }, { "epoch": 1.3413907003427914, "grad_norm": 1.1977757215499878, "learning_rate": 5.77892630876959e-06, "loss": 0.4106, "step": 22300 }, { "epoch": 1.3413907003427914, "eval_loss": 0.3694215714931488, "eval_runtime": 50.8823, "eval_samples_per_second": 196.532, "eval_steps_per_second": 24.566, "step": 22300 }, { "epoch": 1.3474059052770642, "grad_norm": 1.0620633363723755, "learning_rate": 5.777925975325109e-06, "loss": 0.407, "step": 22400 }, { "epoch": 1.3474059052770642, "eval_loss": 0.36941900849342346, "eval_runtime": 51.0452, "eval_samples_per_second": 195.905, "eval_steps_per_second": 24.488, "step": 22400 }, { "epoch": 1.3534211102113367, "grad_norm": 1.0130232572555542, "learning_rate": 5.776925641880627e-06, "loss": 0.4076, "step": 22500 }, { "epoch": 1.3534211102113367, "eval_loss": 0.3688518702983856, "eval_runtime": 51.2935, "eval_samples_per_second": 194.956, "eval_steps_per_second": 24.37, "step": 22500 }, { "epoch": 1.3594363151456093, "grad_norm": 1.1370288133621216, "learning_rate": 5.775925308436146e-06, "loss": 0.4058, "step": 22600 }, { "epoch": 1.3594363151456093, "eval_loss": 0.35986149311065674, "eval_runtime": 50.94, "eval_samples_per_second": 196.309, "eval_steps_per_second": 24.539, "step": 22600 }, { "epoch": 1.365451520079882, "grad_norm": 1.0753254890441895, "learning_rate": 5.7749249749916635e-06, "loss": 0.404, "step": 22700 }, { "epoch": 1.365451520079882, "eval_loss": 0.36281687021255493, "eval_runtime": 51.0705, "eval_samples_per_second": 195.808, "eval_steps_per_second": 24.476, "step": 22700 }, { "epoch": 1.3714667250141546, "grad_norm": 1.0779234170913696, "learning_rate": 5.773924641547182e-06, "loss": 0.4055, "step": 22800 }, { "epoch": 1.3714667250141546, "eval_loss": 0.3607022762298584, "eval_runtime": 51.2843, "eval_samples_per_second": 194.992, "eval_steps_per_second": 24.374, "step": 22800 }, { "epoch": 1.377481929948427, "grad_norm": 1.0071178674697876, "learning_rate": 5.772924308102701e-06, "loss": 0.4038, "step": 22900 }, { "epoch": 1.377481929948427, "eval_loss": 0.36346524953842163, "eval_runtime": 50.9712, "eval_samples_per_second": 196.189, "eval_steps_per_second": 24.524, "step": 22900 }, { "epoch": 1.3834971348826999, "grad_norm": 1.0683503150939941, "learning_rate": 5.77192397465822e-06, "loss": 0.4047, "step": 23000 }, { "epoch": 1.3834971348826999, "eval_loss": 0.36117979884147644, "eval_runtime": 51.0395, "eval_samples_per_second": 195.927, "eval_steps_per_second": 24.491, "step": 23000 }, { "epoch": 1.3895123398169724, "grad_norm": 1.1770708560943604, "learning_rate": 5.770923641213738e-06, "loss": 0.4043, "step": 23100 }, { "epoch": 1.3895123398169724, "eval_loss": 0.36106517910957336, "eval_runtime": 51.0648, "eval_samples_per_second": 195.83, "eval_steps_per_second": 24.479, "step": 23100 }, { "epoch": 1.395527544751245, "grad_norm": 0.9239141941070557, "learning_rate": 5.769923307769257e-06, "loss": 0.4011, "step": 23200 }, { "epoch": 1.395527544751245, "eval_loss": 0.3578794598579407, "eval_runtime": 51.0531, "eval_samples_per_second": 195.875, "eval_steps_per_second": 24.484, "step": 23200 }, { "epoch": 1.4015427496855175, "grad_norm": 1.2712723016738892, "learning_rate": 5.768922974324775e-06, "loss": 0.4008, "step": 23300 }, { "epoch": 1.4015427496855175, "eval_loss": 0.3636392652988434, "eval_runtime": 51.1514, "eval_samples_per_second": 195.498, "eval_steps_per_second": 24.437, "step": 23300 }, { "epoch": 1.40755795461979, "grad_norm": 1.040955901145935, "learning_rate": 5.767922640880294e-06, "loss": 0.3974, "step": 23400 }, { "epoch": 1.40755795461979, "eval_loss": 0.3629893660545349, "eval_runtime": 51.021, "eval_samples_per_second": 195.998, "eval_steps_per_second": 24.5, "step": 23400 }, { "epoch": 1.4135731595540628, "grad_norm": 0.9896743893623352, "learning_rate": 5.766922307435812e-06, "loss": 0.3991, "step": 23500 }, { "epoch": 1.4135731595540628, "eval_loss": 0.35531342029571533, "eval_runtime": 51.17, "eval_samples_per_second": 195.427, "eval_steps_per_second": 24.428, "step": 23500 }, { "epoch": 1.4195883644883354, "grad_norm": 1.088028073310852, "learning_rate": 5.76592197399133e-06, "loss": 0.3972, "step": 23600 }, { "epoch": 1.4195883644883354, "eval_loss": 0.35938191413879395, "eval_runtime": 51.2648, "eval_samples_per_second": 195.066, "eval_steps_per_second": 24.383, "step": 23600 }, { "epoch": 1.425603569422608, "grad_norm": 1.0598886013031006, "learning_rate": 5.764921640546849e-06, "loss": 0.4021, "step": 23700 }, { "epoch": 1.425603569422608, "eval_loss": 0.35533782839775085, "eval_runtime": 51.0234, "eval_samples_per_second": 195.989, "eval_steps_per_second": 24.499, "step": 23700 }, { "epoch": 1.4316187743568807, "grad_norm": 1.1906119585037231, "learning_rate": 5.763921307102368e-06, "loss": 0.3977, "step": 23800 }, { "epoch": 1.4316187743568807, "eval_loss": 0.3564583361148834, "eval_runtime": 51.0223, "eval_samples_per_second": 195.993, "eval_steps_per_second": 24.499, "step": 23800 }, { "epoch": 1.4376339792911532, "grad_norm": 1.1549937725067139, "learning_rate": 5.762920973657886e-06, "loss": 0.3942, "step": 23900 }, { "epoch": 1.4376339792911532, "eval_loss": 0.3534764051437378, "eval_runtime": 51.1427, "eval_samples_per_second": 195.531, "eval_steps_per_second": 24.441, "step": 23900 }, { "epoch": 1.4436491842254258, "grad_norm": 1.0571911334991455, "learning_rate": 5.761920640213405e-06, "loss": 0.3953, "step": 24000 }, { "epoch": 1.4436491842254258, "eval_loss": 0.3564269542694092, "eval_runtime": 51.0367, "eval_samples_per_second": 195.938, "eval_steps_per_second": 24.492, "step": 24000 }, { "epoch": 1.4496643891596985, "grad_norm": 1.058688998222351, "learning_rate": 5.760920306768923e-06, "loss": 0.3957, "step": 24100 }, { "epoch": 1.4496643891596985, "eval_loss": 0.3465494215488434, "eval_runtime": 51.0338, "eval_samples_per_second": 195.949, "eval_steps_per_second": 24.494, "step": 24100 }, { "epoch": 1.455679594093971, "grad_norm": 1.0260639190673828, "learning_rate": 5.759919973324442e-06, "loss": 0.3954, "step": 24200 }, { "epoch": 1.455679594093971, "eval_loss": 0.34943073987960815, "eval_runtime": 50.8891, "eval_samples_per_second": 196.506, "eval_steps_per_second": 24.563, "step": 24200 }, { "epoch": 1.4616947990282436, "grad_norm": 0.9939345717430115, "learning_rate": 5.75891963987996e-06, "loss": 0.3944, "step": 24300 }, { "epoch": 1.4616947990282436, "eval_loss": 0.35242801904678345, "eval_runtime": 51.0489, "eval_samples_per_second": 195.891, "eval_steps_per_second": 24.486, "step": 24300 }, { "epoch": 1.4677100039625164, "grad_norm": 1.0830129384994507, "learning_rate": 5.757919306435478e-06, "loss": 0.3894, "step": 24400 }, { "epoch": 1.4677100039625164, "eval_loss": 0.34800294041633606, "eval_runtime": 51.3057, "eval_samples_per_second": 194.91, "eval_steps_per_second": 24.364, "step": 24400 }, { "epoch": 1.473725208896789, "grad_norm": 1.0526846647262573, "learning_rate": 5.756918972990997e-06, "loss": 0.39, "step": 24500 }, { "epoch": 1.473725208896789, "eval_loss": 0.3510083556175232, "eval_runtime": 50.9026, "eval_samples_per_second": 196.454, "eval_steps_per_second": 24.557, "step": 24500 }, { "epoch": 1.4797404138310615, "grad_norm": 1.1267868280410767, "learning_rate": 5.755918639546516e-06, "loss": 0.3902, "step": 24600 }, { "epoch": 1.4797404138310615, "eval_loss": 0.3532961308956146, "eval_runtime": 51.0797, "eval_samples_per_second": 195.773, "eval_steps_per_second": 24.472, "step": 24600 }, { "epoch": 1.485755618765334, "grad_norm": 1.1018403768539429, "learning_rate": 5.754918306102034e-06, "loss": 0.3908, "step": 24700 }, { "epoch": 1.485755618765334, "eval_loss": 0.3456381559371948, "eval_runtime": 51.3247, "eval_samples_per_second": 194.838, "eval_steps_per_second": 24.355, "step": 24700 }, { "epoch": 1.4917708236996066, "grad_norm": 1.0022377967834473, "learning_rate": 5.753917972657553e-06, "loss": 0.3869, "step": 24800 }, { "epoch": 1.4917708236996066, "eval_loss": 0.3509150445461273, "eval_runtime": 51.0426, "eval_samples_per_second": 195.915, "eval_steps_per_second": 24.489, "step": 24800 }, { "epoch": 1.4977860286338793, "grad_norm": 1.02973210811615, "learning_rate": 5.752917639213071e-06, "loss": 0.3885, "step": 24900 }, { "epoch": 1.4977860286338793, "eval_loss": 0.3488512635231018, "eval_runtime": 50.9719, "eval_samples_per_second": 196.187, "eval_steps_per_second": 24.523, "step": 24900 }, { "epoch": 1.5038012335681519, "grad_norm": 1.0170624256134033, "learning_rate": 5.7519173057685896e-06, "loss": 0.386, "step": 25000 }, { "epoch": 1.5038012335681519, "eval_loss": 0.344295859336853, "eval_runtime": 51.2301, "eval_samples_per_second": 195.198, "eval_steps_per_second": 24.4, "step": 25000 }, { "epoch": 1.5098164385024244, "grad_norm": 1.0053726434707642, "learning_rate": 5.750916972324108e-06, "loss": 0.3885, "step": 25100 }, { "epoch": 1.5098164385024244, "eval_loss": 0.34295952320098877, "eval_runtime": 51.2643, "eval_samples_per_second": 195.068, "eval_steps_per_second": 24.383, "step": 25100 }, { "epoch": 1.5158316434366972, "grad_norm": 0.9546186327934265, "learning_rate": 5.749916638879626e-06, "loss": 0.3902, "step": 25200 }, { "epoch": 1.5158316434366972, "eval_loss": 0.3494739234447479, "eval_runtime": 51.1243, "eval_samples_per_second": 195.602, "eval_steps_per_second": 24.45, "step": 25200 }, { "epoch": 1.5218468483709697, "grad_norm": 1.0184184312820435, "learning_rate": 5.748916305435145e-06, "loss": 0.3853, "step": 25300 }, { "epoch": 1.5218468483709697, "eval_loss": 0.34722205996513367, "eval_runtime": 51.0304, "eval_samples_per_second": 195.961, "eval_steps_per_second": 24.495, "step": 25300 }, { "epoch": 1.5278620533052423, "grad_norm": 1.0732802152633667, "learning_rate": 5.747915971990664e-06, "loss": 0.3868, "step": 25400 }, { "epoch": 1.5278620533052423, "eval_loss": 0.34737443923950195, "eval_runtime": 51.1073, "eval_samples_per_second": 195.667, "eval_steps_per_second": 24.458, "step": 25400 }, { "epoch": 1.533877258239515, "grad_norm": 1.023866891860962, "learning_rate": 5.746915638546182e-06, "loss": 0.3846, "step": 25500 }, { "epoch": 1.533877258239515, "eval_loss": 0.34227558970451355, "eval_runtime": 51.0647, "eval_samples_per_second": 195.83, "eval_steps_per_second": 24.479, "step": 25500 }, { "epoch": 1.5398924631737876, "grad_norm": 0.9621095657348633, "learning_rate": 5.745915305101701e-06, "loss": 0.3853, "step": 25600 }, { "epoch": 1.5398924631737876, "eval_loss": 0.33890464901924133, "eval_runtime": 37.4533, "eval_samples_per_second": 266.999, "eval_steps_per_second": 33.375, "step": 25600 }, { "epoch": 1.5459076681080601, "grad_norm": 1.0459903478622437, "learning_rate": 5.744914971657219e-06, "loss": 0.3867, "step": 25700 }, { "epoch": 1.5459076681080601, "eval_loss": 0.3423731327056885, "eval_runtime": 51.0943, "eval_samples_per_second": 195.717, "eval_steps_per_second": 24.465, "step": 25700 }, { "epoch": 1.551922873042333, "grad_norm": 1.0103187561035156, "learning_rate": 5.7439146382127375e-06, "loss": 0.3846, "step": 25800 }, { "epoch": 1.551922873042333, "eval_loss": 0.3495667576789856, "eval_runtime": 51.0619, "eval_samples_per_second": 195.841, "eval_steps_per_second": 24.48, "step": 25800 }, { "epoch": 1.5579380779766052, "grad_norm": 1.1959409713745117, "learning_rate": 5.742914304768256e-06, "loss": 0.3836, "step": 25900 }, { "epoch": 1.5579380779766052, "eval_loss": 0.34345749020576477, "eval_runtime": 50.9931, "eval_samples_per_second": 196.105, "eval_steps_per_second": 24.513, "step": 25900 }, { "epoch": 1.563953282910878, "grad_norm": 1.0257697105407715, "learning_rate": 5.741913971323774e-06, "loss": 0.3832, "step": 26000 }, { "epoch": 1.563953282910878, "eval_loss": 0.3426493704319, "eval_runtime": 51.1309, "eval_samples_per_second": 195.577, "eval_steps_per_second": 24.447, "step": 26000 }, { "epoch": 1.5699684878451505, "grad_norm": 1.1140973567962646, "learning_rate": 5.740913637879294e-06, "loss": 0.3797, "step": 26100 }, { "epoch": 1.5699684878451505, "eval_loss": 0.34580498933792114, "eval_runtime": 51.1787, "eval_samples_per_second": 195.394, "eval_steps_per_second": 24.424, "step": 26100 }, { "epoch": 1.575983692779423, "grad_norm": 1.0050679445266724, "learning_rate": 5.739913304434812e-06, "loss": 0.3749, "step": 26200 }, { "epoch": 1.575983692779423, "eval_loss": 0.3454411029815674, "eval_runtime": 51.1577, "eval_samples_per_second": 195.474, "eval_steps_per_second": 24.434, "step": 26200 }, { "epoch": 1.5819988977136958, "grad_norm": 1.0191149711608887, "learning_rate": 5.73891297099033e-06, "loss": 0.3772, "step": 26300 }, { "epoch": 1.5819988977136958, "eval_loss": 0.3403486907482147, "eval_runtime": 51.0929, "eval_samples_per_second": 195.722, "eval_steps_per_second": 24.465, "step": 26300 }, { "epoch": 1.5880141026479684, "grad_norm": 1.1277610063552856, "learning_rate": 5.737912637545849e-06, "loss": 0.3783, "step": 26400 }, { "epoch": 1.5880141026479684, "eval_loss": 0.3426676392555237, "eval_runtime": 51.3622, "eval_samples_per_second": 194.696, "eval_steps_per_second": 24.337, "step": 26400 }, { "epoch": 1.594029307582241, "grad_norm": 1.12416672706604, "learning_rate": 5.736912304101368e-06, "loss": 0.3765, "step": 26500 }, { "epoch": 1.594029307582241, "eval_loss": 0.3407214879989624, "eval_runtime": 51.185, "eval_samples_per_second": 195.37, "eval_steps_per_second": 24.421, "step": 26500 }, { "epoch": 1.6000445125165137, "grad_norm": 0.9676984548568726, "learning_rate": 5.7359119706568855e-06, "loss": 0.377, "step": 26600 }, { "epoch": 1.6000445125165137, "eval_loss": 0.3347455859184265, "eval_runtime": 50.9838, "eval_samples_per_second": 196.141, "eval_steps_per_second": 24.518, "step": 26600 }, { "epoch": 1.6060597174507862, "grad_norm": 1.0561347007751465, "learning_rate": 5.734911637212404e-06, "loss": 0.3768, "step": 26700 }, { "epoch": 1.6060597174507862, "eval_loss": 0.3399183452129364, "eval_runtime": 51.075, "eval_samples_per_second": 195.79, "eval_steps_per_second": 24.474, "step": 26700 }, { "epoch": 1.6120749223850588, "grad_norm": 1.2122465372085571, "learning_rate": 5.733911303767923e-06, "loss": 0.3763, "step": 26800 }, { "epoch": 1.6120749223850588, "eval_loss": 0.33461084961891174, "eval_runtime": 51.0463, "eval_samples_per_second": 195.901, "eval_steps_per_second": 24.488, "step": 26800 }, { "epoch": 1.6180901273193316, "grad_norm": 1.0054854154586792, "learning_rate": 5.732910970323442e-06, "loss": 0.3786, "step": 26900 }, { "epoch": 1.6180901273193316, "eval_loss": 0.3318628668785095, "eval_runtime": 51.0826, "eval_samples_per_second": 195.761, "eval_steps_per_second": 24.47, "step": 26900 }, { "epoch": 1.624105332253604, "grad_norm": 1.072472333908081, "learning_rate": 5.73191063687896e-06, "loss": 0.3762, "step": 27000 }, { "epoch": 1.624105332253604, "eval_loss": 0.3293687403202057, "eval_runtime": 51.072, "eval_samples_per_second": 195.802, "eval_steps_per_second": 24.475, "step": 27000 }, { "epoch": 1.6301205371878766, "grad_norm": 1.0058602094650269, "learning_rate": 5.730910303434478e-06, "loss": 0.3716, "step": 27100 }, { "epoch": 1.6301205371878766, "eval_loss": 0.33610230684280396, "eval_runtime": 51.0651, "eval_samples_per_second": 195.828, "eval_steps_per_second": 24.479, "step": 27100 }, { "epoch": 1.6361357421221494, "grad_norm": 1.0208802223205566, "learning_rate": 5.729909969989997e-06, "loss": 0.3724, "step": 27200 }, { "epoch": 1.6361357421221494, "eval_loss": 0.3361985981464386, "eval_runtime": 51.1569, "eval_samples_per_second": 195.477, "eval_steps_per_second": 24.435, "step": 27200 }, { "epoch": 1.6421509470564217, "grad_norm": 1.0464400053024292, "learning_rate": 5.728909636545516e-06, "loss": 0.3732, "step": 27300 }, { "epoch": 1.6421509470564217, "eval_loss": 0.3356834053993225, "eval_runtime": 21.647, "eval_samples_per_second": 461.957, "eval_steps_per_second": 57.745, "step": 27300 }, { "epoch": 1.6481661519906945, "grad_norm": 1.1063635349273682, "learning_rate": 5.7279093031010335e-06, "loss": 0.3725, "step": 27400 }, { "epoch": 1.6481661519906945, "eval_loss": 0.3378269374370575, "eval_runtime": 48.6948, "eval_samples_per_second": 205.361, "eval_steps_per_second": 25.67, "step": 27400 }, { "epoch": 1.654181356924967, "grad_norm": 0.8910077214241028, "learning_rate": 5.726908969656552e-06, "loss": 0.3707, "step": 27500 }, { "epoch": 1.654181356924967, "eval_loss": 0.3300679624080658, "eval_runtime": 48.819, "eval_samples_per_second": 204.838, "eval_steps_per_second": 25.605, "step": 27500 }, { "epoch": 1.6601965618592396, "grad_norm": 0.9904689192771912, "learning_rate": 5.725908636212071e-06, "loss": 0.3722, "step": 27600 }, { "epoch": 1.6601965618592396, "eval_loss": 0.33077552914619446, "eval_runtime": 45.4305, "eval_samples_per_second": 220.116, "eval_steps_per_second": 27.515, "step": 27600 }, { "epoch": 1.6662117667935123, "grad_norm": 1.0377715826034546, "learning_rate": 5.72490830276759e-06, "loss": 0.3693, "step": 27700 }, { "epoch": 1.6662117667935123, "eval_loss": 0.3365156948566437, "eval_runtime": 46.8492, "eval_samples_per_second": 213.451, "eval_steps_per_second": 26.681, "step": 27700 }, { "epoch": 1.672226971727785, "grad_norm": 0.9838355183601379, "learning_rate": 5.723907969323108e-06, "loss": 0.373, "step": 27800 }, { "epoch": 1.672226971727785, "eval_loss": 0.33353880047798157, "eval_runtime": 47.6968, "eval_samples_per_second": 209.658, "eval_steps_per_second": 26.207, "step": 27800 }, { "epoch": 1.6782421766620574, "grad_norm": 1.0050548315048218, "learning_rate": 5.722907635878626e-06, "loss": 0.3707, "step": 27900 }, { "epoch": 1.6782421766620574, "eval_loss": 0.3265502154827118, "eval_runtime": 48.1571, "eval_samples_per_second": 207.654, "eval_steps_per_second": 25.957, "step": 27900 }, { "epoch": 1.6842573815963302, "grad_norm": 1.0083630084991455, "learning_rate": 5.721907302434145e-06, "loss": 0.3687, "step": 28000 }, { "epoch": 1.6842573815963302, "eval_loss": 0.33139145374298096, "eval_runtime": 48.694, "eval_samples_per_second": 205.364, "eval_steps_per_second": 25.671, "step": 28000 }, { "epoch": 1.6902725865306027, "grad_norm": 0.9649508595466614, "learning_rate": 5.7209069689896636e-06, "loss": 0.3661, "step": 28100 }, { "epoch": 1.6902725865306027, "eval_loss": 0.3332207202911377, "eval_runtime": 40.0334, "eval_samples_per_second": 249.792, "eval_steps_per_second": 31.224, "step": 28100 }, { "epoch": 1.6962877914648753, "grad_norm": 1.042528748512268, "learning_rate": 5.7199066355451814e-06, "loss": 0.3702, "step": 28200 }, { "epoch": 1.6962877914648753, "eval_loss": 0.32571831345558167, "eval_runtime": 49.2797, "eval_samples_per_second": 202.923, "eval_steps_per_second": 25.365, "step": 28200 }, { "epoch": 1.702302996399148, "grad_norm": 0.9756554365158081, "learning_rate": 5.7189063021007e-06, "loss": 0.3647, "step": 28300 }, { "epoch": 1.702302996399148, "eval_loss": 0.3234156668186188, "eval_runtime": 49.7079, "eval_samples_per_second": 201.175, "eval_steps_per_second": 25.147, "step": 28300 }, { "epoch": 1.7083182013334206, "grad_norm": 1.0613596439361572, "learning_rate": 5.717905968656219e-06, "loss": 0.3649, "step": 28400 }, { "epoch": 1.7083182013334206, "eval_loss": 0.32939964532852173, "eval_runtime": 50.06, "eval_samples_per_second": 199.76, "eval_steps_per_second": 24.97, "step": 28400 }, { "epoch": 1.7143334062676931, "grad_norm": 1.0461217164993286, "learning_rate": 5.7169056352117375e-06, "loss": 0.3677, "step": 28500 }, { "epoch": 1.7143334062676931, "eval_loss": 0.32745957374572754, "eval_runtime": 50.0541, "eval_samples_per_second": 199.784, "eval_steps_per_second": 24.973, "step": 28500 }, { "epoch": 1.720348611201966, "grad_norm": 1.0226540565490723, "learning_rate": 5.715905301767256e-06, "loss": 0.3642, "step": 28600 }, { "epoch": 1.720348611201966, "eval_loss": 0.3290911316871643, "eval_runtime": 50.4387, "eval_samples_per_second": 198.26, "eval_steps_per_second": 24.783, "step": 28600 }, { "epoch": 1.7263638161362382, "grad_norm": 1.0498120784759521, "learning_rate": 5.714904968322774e-06, "loss": 0.3626, "step": 28700 }, { "epoch": 1.7263638161362382, "eval_loss": 0.33111146092414856, "eval_runtime": 50.7317, "eval_samples_per_second": 197.115, "eval_steps_per_second": 24.639, "step": 28700 }, { "epoch": 1.732379021070511, "grad_norm": 1.0179612636566162, "learning_rate": 5.713904634878293e-06, "loss": 0.3611, "step": 28800 }, { "epoch": 1.732379021070511, "eval_loss": 0.31966713070869446, "eval_runtime": 35.8874, "eval_samples_per_second": 278.65, "eval_steps_per_second": 34.831, "step": 28800 }, { "epoch": 1.7383942260047835, "grad_norm": 0.9876866340637207, "learning_rate": 5.7129043014338115e-06, "loss": 0.3609, "step": 28900 }, { "epoch": 1.7383942260047835, "eval_loss": 0.3232952356338501, "eval_runtime": 50.8899, "eval_samples_per_second": 196.503, "eval_steps_per_second": 24.563, "step": 28900 }, { "epoch": 1.744409430939056, "grad_norm": 1.08419668674469, "learning_rate": 5.711903967989329e-06, "loss": 0.3621, "step": 29000 }, { "epoch": 1.744409430939056, "eval_loss": 0.32880115509033203, "eval_runtime": 50.9007, "eval_samples_per_second": 196.461, "eval_steps_per_second": 24.558, "step": 29000 }, { "epoch": 1.7504246358733289, "grad_norm": 1.0506683588027954, "learning_rate": 5.710903634544848e-06, "loss": 0.3612, "step": 29100 }, { "epoch": 1.7504246358733289, "eval_loss": 0.32626426219940186, "eval_runtime": 51.3181, "eval_samples_per_second": 194.863, "eval_steps_per_second": 24.358, "step": 29100 }, { "epoch": 1.7564398408076014, "grad_norm": 1.0610612630844116, "learning_rate": 5.709903301100367e-06, "loss": 0.3604, "step": 29200 }, { "epoch": 1.7564398408076014, "eval_loss": 0.32427623867988586, "eval_runtime": 51.1109, "eval_samples_per_second": 195.653, "eval_steps_per_second": 24.457, "step": 29200 }, { "epoch": 1.762455045741874, "grad_norm": 1.0237441062927246, "learning_rate": 5.7089029676558855e-06, "loss": 0.3576, "step": 29300 }, { "epoch": 1.762455045741874, "eval_loss": 0.325724720954895, "eval_runtime": 51.0538, "eval_samples_per_second": 195.872, "eval_steps_per_second": 24.484, "step": 29300 }, { "epoch": 1.7684702506761467, "grad_norm": 1.0518171787261963, "learning_rate": 5.707902634211404e-06, "loss": 0.3623, "step": 29400 }, { "epoch": 1.7684702506761467, "eval_loss": 0.3236755430698395, "eval_runtime": 51.279, "eval_samples_per_second": 195.012, "eval_steps_per_second": 24.376, "step": 29400 }, { "epoch": 1.7744854556104193, "grad_norm": 1.008692741394043, "learning_rate": 5.706902300766923e-06, "loss": 0.3594, "step": 29500 }, { "epoch": 1.7744854556104193, "eval_loss": 0.322955846786499, "eval_runtime": 50.9674, "eval_samples_per_second": 196.204, "eval_steps_per_second": 24.525, "step": 29500 }, { "epoch": 1.7805006605446918, "grad_norm": 1.0272122621536255, "learning_rate": 5.705901967322441e-06, "loss": 0.3589, "step": 29600 }, { "epoch": 1.7805006605446918, "eval_loss": 0.32889479398727417, "eval_runtime": 51.0901, "eval_samples_per_second": 195.733, "eval_steps_per_second": 24.467, "step": 29600 }, { "epoch": 1.7865158654789646, "grad_norm": 0.9986202120780945, "learning_rate": 5.7049016338779595e-06, "loss": 0.3583, "step": 29700 }, { "epoch": 1.7865158654789646, "eval_loss": 0.32579848170280457, "eval_runtime": 51.3308, "eval_samples_per_second": 194.815, "eval_steps_per_second": 24.352, "step": 29700 }, { "epoch": 1.7925310704132371, "grad_norm": 1.1426304578781128, "learning_rate": 5.703901300433477e-06, "loss": 0.3578, "step": 29800 }, { "epoch": 1.7925310704132371, "eval_loss": 0.3219316303730011, "eval_runtime": 51.0488, "eval_samples_per_second": 195.891, "eval_steps_per_second": 24.486, "step": 29800 }, { "epoch": 1.7985462753475097, "grad_norm": 1.0315282344818115, "learning_rate": 5.702900966988996e-06, "loss": 0.3554, "step": 29900 }, { "epoch": 1.7985462753475097, "eval_loss": 0.3245343267917633, "eval_runtime": 51.1337, "eval_samples_per_second": 195.566, "eval_steps_per_second": 24.446, "step": 29900 }, { "epoch": 1.8045614802817824, "grad_norm": 0.9708550572395325, "learning_rate": 5.701900633544515e-06, "loss": 0.3576, "step": 30000 }, { "epoch": 1.8045614802817824, "eval_loss": 0.3180968761444092, "eval_runtime": 51.0446, "eval_samples_per_second": 195.907, "eval_steps_per_second": 24.488, "step": 30000 }, { "epoch": 1.8105766852160547, "grad_norm": 0.9034538865089417, "learning_rate": 5.7009003001000335e-06, "loss": 0.3537, "step": 30100 }, { "epoch": 1.8105766852160547, "eval_loss": 0.3229399621486664, "eval_runtime": 51.0689, "eval_samples_per_second": 195.814, "eval_steps_per_second": 24.477, "step": 30100 }, { "epoch": 1.8165918901503275, "grad_norm": 1.0373872518539429, "learning_rate": 5.699899966655552e-06, "loss": 0.356, "step": 30200 }, { "epoch": 1.8165918901503275, "eval_loss": 0.3164275288581848, "eval_runtime": 51.4888, "eval_samples_per_second": 194.217, "eval_steps_per_second": 24.277, "step": 30200 }, { "epoch": 1.8226070950846, "grad_norm": 1.073961615562439, "learning_rate": 5.698899633211071e-06, "loss": 0.3574, "step": 30300 }, { "epoch": 1.8226070950846, "eval_loss": 0.3165951669216156, "eval_runtime": 51.0637, "eval_samples_per_second": 195.834, "eval_steps_per_second": 24.479, "step": 30300 }, { "epoch": 1.8286223000188726, "grad_norm": 0.9891506433486938, "learning_rate": 5.697899299766589e-06, "loss": 0.3548, "step": 30400 }, { "epoch": 1.8286223000188726, "eval_loss": 0.3134399354457855, "eval_runtime": 51.2735, "eval_samples_per_second": 195.032, "eval_steps_per_second": 24.379, "step": 30400 }, { "epoch": 1.8346375049531454, "grad_norm": 0.9468514919281006, "learning_rate": 5.6968989663221075e-06, "loss": 0.3534, "step": 30500 }, { "epoch": 1.8346375049531454, "eval_loss": 0.3175615966320038, "eval_runtime": 51.0054, "eval_samples_per_second": 196.058, "eval_steps_per_second": 24.507, "step": 30500 }, { "epoch": 1.840652709887418, "grad_norm": 1.0942094326019287, "learning_rate": 5.695898632877625e-06, "loss": 0.3551, "step": 30600 }, { "epoch": 1.840652709887418, "eval_loss": 0.31934764981269836, "eval_runtime": 50.744, "eval_samples_per_second": 197.068, "eval_steps_per_second": 24.633, "step": 30600 }, { "epoch": 1.8466679148216905, "grad_norm": 1.0087659358978271, "learning_rate": 5.694898299433144e-06, "loss": 0.3534, "step": 30700 }, { "epoch": 1.8466679148216905, "eval_loss": 0.3216070532798767, "eval_runtime": 51.2443, "eval_samples_per_second": 195.144, "eval_steps_per_second": 24.393, "step": 30700 }, { "epoch": 1.8526831197559632, "grad_norm": 0.973987340927124, "learning_rate": 5.693897965988664e-06, "loss": 0.3551, "step": 30800 }, { "epoch": 1.8526831197559632, "eval_loss": 0.3222227990627289, "eval_runtime": 51.317, "eval_samples_per_second": 194.867, "eval_steps_per_second": 24.358, "step": 30800 }, { "epoch": 1.8586983246902358, "grad_norm": 1.0220999717712402, "learning_rate": 5.6928976325441814e-06, "loss": 0.3512, "step": 30900 }, { "epoch": 1.8586983246902358, "eval_loss": 0.3149110972881317, "eval_runtime": 50.9851, "eval_samples_per_second": 196.136, "eval_steps_per_second": 24.517, "step": 30900 }, { "epoch": 1.8647135296245083, "grad_norm": 0.9891929626464844, "learning_rate": 5.6918972990997e-06, "loss": 0.3494, "step": 31000 }, { "epoch": 1.8647135296245083, "eval_loss": 0.3158430755138397, "eval_runtime": 51.0404, "eval_samples_per_second": 195.923, "eval_steps_per_second": 24.49, "step": 31000 }, { "epoch": 1.870728734558781, "grad_norm": 1.0088871717453003, "learning_rate": 5.690896965655219e-06, "loss": 0.3554, "step": 31100 }, { "epoch": 1.870728734558781, "eval_loss": 0.3154695928096771, "eval_runtime": 51.3526, "eval_samples_per_second": 194.732, "eval_steps_per_second": 24.342, "step": 31100 }, { "epoch": 1.8767439394930534, "grad_norm": 1.050904393196106, "learning_rate": 5.689896632210737e-06, "loss": 0.348, "step": 31200 }, { "epoch": 1.8767439394930534, "eval_loss": 0.3176015019416809, "eval_runtime": 50.968, "eval_samples_per_second": 196.202, "eval_steps_per_second": 24.525, "step": 31200 }, { "epoch": 1.8827591444273262, "grad_norm": 0.9467193484306335, "learning_rate": 5.688896298766255e-06, "loss": 0.3495, "step": 31300 }, { "epoch": 1.8827591444273262, "eval_loss": 0.31329813599586487, "eval_runtime": 51.0441, "eval_samples_per_second": 195.909, "eval_steps_per_second": 24.489, "step": 31300 }, { "epoch": 1.888774349361599, "grad_norm": 0.9775587916374207, "learning_rate": 5.687895965321774e-06, "loss": 0.348, "step": 31400 }, { "epoch": 1.888774349361599, "eval_loss": 0.3119243383407593, "eval_runtime": 51.4209, "eval_samples_per_second": 194.474, "eval_steps_per_second": 24.309, "step": 31400 }, { "epoch": 1.8947895542958713, "grad_norm": 0.9961014986038208, "learning_rate": 5.686895631877293e-06, "loss": 0.3481, "step": 31500 }, { "epoch": 1.8947895542958713, "eval_loss": 0.3146650791168213, "eval_runtime": 51.0401, "eval_samples_per_second": 195.924, "eval_steps_per_second": 24.491, "step": 31500 }, { "epoch": 1.900804759230144, "grad_norm": 0.9647944569587708, "learning_rate": 5.6858952984328115e-06, "loss": 0.3485, "step": 31600 }, { "epoch": 1.900804759230144, "eval_loss": 0.3082703948020935, "eval_runtime": 51.0736, "eval_samples_per_second": 195.796, "eval_steps_per_second": 24.474, "step": 31600 }, { "epoch": 1.9068199641644166, "grad_norm": 0.977745532989502, "learning_rate": 5.684894964988329e-06, "loss": 0.346, "step": 31700 }, { "epoch": 1.9068199641644166, "eval_loss": 0.31021973490715027, "eval_runtime": 51.3893, "eval_samples_per_second": 194.593, "eval_steps_per_second": 24.324, "step": 31700 }, { "epoch": 1.9128351690986891, "grad_norm": 1.007712960243225, "learning_rate": 5.683894631543848e-06, "loss": 0.3439, "step": 31800 }, { "epoch": 1.9128351690986891, "eval_loss": 0.3149736225605011, "eval_runtime": 50.9919, "eval_samples_per_second": 196.109, "eval_steps_per_second": 24.514, "step": 31800 }, { "epoch": 1.9188503740329619, "grad_norm": 0.9901500940322876, "learning_rate": 5.682894298099367e-06, "loss": 0.3465, "step": 31900 }, { "epoch": 1.9188503740329619, "eval_loss": 0.3099238872528076, "eval_runtime": 49.9711, "eval_samples_per_second": 200.116, "eval_steps_per_second": 25.014, "step": 31900 }, { "epoch": 1.9248655789672344, "grad_norm": 1.0771408081054688, "learning_rate": 5.681893964654885e-06, "loss": 0.3469, "step": 32000 }, { "epoch": 1.9248655789672344, "eval_loss": 0.3117373585700989, "eval_runtime": 51.3413, "eval_samples_per_second": 194.775, "eval_steps_per_second": 24.347, "step": 32000 }, { "epoch": 1.930880783901507, "grad_norm": 0.9278393983840942, "learning_rate": 5.680893631210403e-06, "loss": 0.3449, "step": 32100 }, { "epoch": 1.930880783901507, "eval_loss": 0.3087506890296936, "eval_runtime": 50.9985, "eval_samples_per_second": 196.084, "eval_steps_per_second": 24.511, "step": 32100 }, { "epoch": 1.9368959888357797, "grad_norm": 0.9451966285705566, "learning_rate": 5.679893297765922e-06, "loss": 0.3481, "step": 32200 }, { "epoch": 1.9368959888357797, "eval_loss": 0.30677124857902527, "eval_runtime": 51.0702, "eval_samples_per_second": 195.809, "eval_steps_per_second": 24.476, "step": 32200 }, { "epoch": 1.9429111937700523, "grad_norm": 1.0483254194259644, "learning_rate": 5.678892964321441e-06, "loss": 0.3445, "step": 32300 }, { "epoch": 1.9429111937700523, "eval_loss": 0.30840355157852173, "eval_runtime": 51.0518, "eval_samples_per_second": 195.879, "eval_steps_per_second": 24.485, "step": 32300 }, { "epoch": 1.9489263987043248, "grad_norm": 1.0422637462615967, "learning_rate": 5.6778926308769595e-06, "loss": 0.3441, "step": 32400 }, { "epoch": 1.9489263987043248, "eval_loss": 0.3115750849246979, "eval_runtime": 51.1153, "eval_samples_per_second": 195.636, "eval_steps_per_second": 24.455, "step": 32400 }, { "epoch": 1.9549416036385976, "grad_norm": 0.9909389019012451, "learning_rate": 5.676892297432478e-06, "loss": 0.344, "step": 32500 }, { "epoch": 1.9549416036385976, "eval_loss": 0.30596745014190674, "eval_runtime": 51.3225, "eval_samples_per_second": 194.846, "eval_steps_per_second": 24.356, "step": 32500 }, { "epoch": 1.96095680857287, "grad_norm": 0.9379361271858215, "learning_rate": 5.675891963987996e-06, "loss": 0.3451, "step": 32600 }, { "epoch": 1.96095680857287, "eval_loss": 0.3045947253704071, "eval_runtime": 48.1799, "eval_samples_per_second": 207.555, "eval_steps_per_second": 25.944, "step": 32600 }, { "epoch": 1.9669720135071427, "grad_norm": 0.9916946887969971, "learning_rate": 5.674891630543515e-06, "loss": 0.3435, "step": 32700 }, { "epoch": 1.9669720135071427, "eval_loss": 0.3098689019680023, "eval_runtime": 51.0219, "eval_samples_per_second": 195.994, "eval_steps_per_second": 24.499, "step": 32700 }, { "epoch": 1.9729872184414154, "grad_norm": 1.0491201877593994, "learning_rate": 5.673891297099033e-06, "loss": 0.3451, "step": 32800 }, { "epoch": 1.9729872184414154, "eval_loss": 0.307062566280365, "eval_runtime": 51.2447, "eval_samples_per_second": 195.142, "eval_steps_per_second": 24.393, "step": 32800 }, { "epoch": 1.9790024233756878, "grad_norm": 1.0011417865753174, "learning_rate": 5.672890963654551e-06, "loss": 0.3438, "step": 32900 }, { "epoch": 1.9790024233756878, "eval_loss": 0.30759868025779724, "eval_runtime": 51.2551, "eval_samples_per_second": 195.103, "eval_steps_per_second": 24.388, "step": 32900 }, { "epoch": 1.9850176283099605, "grad_norm": 0.997515082359314, "learning_rate": 5.67189063021007e-06, "loss": 0.3401, "step": 33000 }, { "epoch": 1.9850176283099605, "eval_loss": 0.30724722146987915, "eval_runtime": 51.0456, "eval_samples_per_second": 195.903, "eval_steps_per_second": 24.488, "step": 33000 }, { "epoch": 1.991032833244233, "grad_norm": 1.00389564037323, "learning_rate": 5.670890296765589e-06, "loss": 0.3435, "step": 33100 }, { "epoch": 1.991032833244233, "eval_loss": 0.30223432183265686, "eval_runtime": 51.0634, "eval_samples_per_second": 195.835, "eval_steps_per_second": 24.479, "step": 33100 }, { "epoch": 1.9970480381785056, "grad_norm": 1.0292458534240723, "learning_rate": 5.6698899633211075e-06, "loss": 0.342, "step": 33200 }, { "epoch": 1.9970480381785056, "eval_loss": 0.3018937110900879, "eval_runtime": 51.3884, "eval_samples_per_second": 194.597, "eval_steps_per_second": 24.325, "step": 33200 }, { "epoch": 2.0030632431127784, "grad_norm": 0.9542250037193298, "learning_rate": 5.668889629876626e-06, "loss": 0.3437, "step": 33300 }, { "epoch": 2.0030632431127784, "eval_loss": 0.3050287961959839, "eval_runtime": 48.1087, "eval_samples_per_second": 207.863, "eval_steps_per_second": 25.983, "step": 33300 }, { "epoch": 2.0090784480470507, "grad_norm": 0.9858297109603882, "learning_rate": 5.667889296432144e-06, "loss": 0.3376, "step": 33400 }, { "epoch": 2.0090784480470507, "eval_loss": 0.3004157543182373, "eval_runtime": 50.8704, "eval_samples_per_second": 196.578, "eval_steps_per_second": 24.572, "step": 33400 }, { "epoch": 2.0150936529813235, "grad_norm": 0.9825339317321777, "learning_rate": 5.666888962987663e-06, "loss": 0.3387, "step": 33500 }, { "epoch": 2.0150936529813235, "eval_loss": 0.3035270869731903, "eval_runtime": 51.1972, "eval_samples_per_second": 195.323, "eval_steps_per_second": 24.415, "step": 33500 }, { "epoch": 2.0211088579155962, "grad_norm": 0.9198622703552246, "learning_rate": 5.665888629543181e-06, "loss": 0.336, "step": 33600 }, { "epoch": 2.0211088579155962, "eval_loss": 0.30675825476646423, "eval_runtime": 50.9963, "eval_samples_per_second": 196.093, "eval_steps_per_second": 24.512, "step": 33600 }, { "epoch": 2.0271240628498686, "grad_norm": 0.9473734498023987, "learning_rate": 5.664888296098699e-06, "loss": 0.336, "step": 33700 }, { "epoch": 2.0271240628498686, "eval_loss": 0.3050824701786041, "eval_runtime": 51.1058, "eval_samples_per_second": 195.673, "eval_steps_per_second": 24.459, "step": 33700 }, { "epoch": 2.0331392677841413, "grad_norm": 0.9824632406234741, "learning_rate": 5.663887962654218e-06, "loss": 0.3366, "step": 33800 }, { "epoch": 2.0331392677841413, "eval_loss": 0.3059363067150116, "eval_runtime": 51.3136, "eval_samples_per_second": 194.88, "eval_steps_per_second": 24.36, "step": 33800 }, { "epoch": 2.039154472718414, "grad_norm": 0.8891803622245789, "learning_rate": 5.662887629209737e-06, "loss": 0.3373, "step": 33900 }, { "epoch": 2.039154472718414, "eval_loss": 0.2996893525123596, "eval_runtime": 51.0027, "eval_samples_per_second": 196.068, "eval_steps_per_second": 24.509, "step": 33900 }, { "epoch": 2.0451696776526864, "grad_norm": 1.0512337684631348, "learning_rate": 5.6618872957652554e-06, "loss": 0.3367, "step": 34000 }, { "epoch": 2.0451696776526864, "eval_loss": 0.3059813976287842, "eval_runtime": 48.247, "eval_samples_per_second": 207.267, "eval_steps_per_second": 25.908, "step": 34000 }, { "epoch": 2.051184882586959, "grad_norm": 0.9054902791976929, "learning_rate": 5.660886962320774e-06, "loss": 0.3371, "step": 34100 }, { "epoch": 2.051184882586959, "eval_loss": 0.3016323745250702, "eval_runtime": 51.1014, "eval_samples_per_second": 195.69, "eval_steps_per_second": 24.461, "step": 34100 }, { "epoch": 2.057200087521232, "grad_norm": 0.9262953400611877, "learning_rate": 5.659886628876292e-06, "loss": 0.3367, "step": 34200 }, { "epoch": 2.057200087521232, "eval_loss": 0.29450055956840515, "eval_runtime": 51.0335, "eval_samples_per_second": 195.95, "eval_steps_per_second": 24.494, "step": 34200 }, { "epoch": 2.0632152924555043, "grad_norm": 0.9734236001968384, "learning_rate": 5.658886295431811e-06, "loss": 0.3343, "step": 34300 }, { "epoch": 2.0632152924555043, "eval_loss": 0.3005402684211731, "eval_runtime": 51.0508, "eval_samples_per_second": 195.883, "eval_steps_per_second": 24.485, "step": 34300 }, { "epoch": 2.069230497389777, "grad_norm": 1.0002549886703491, "learning_rate": 5.657885961987329e-06, "loss": 0.3322, "step": 34400 }, { "epoch": 2.069230497389777, "eval_loss": 0.2977810204029083, "eval_runtime": 51.3717, "eval_samples_per_second": 194.66, "eval_steps_per_second": 24.332, "step": 34400 }, { "epoch": 2.07524570232405, "grad_norm": 1.0582560300827026, "learning_rate": 5.656885628542847e-06, "loss": 0.3335, "step": 34500 }, { "epoch": 2.07524570232405, "eval_loss": 0.30631959438323975, "eval_runtime": 51.4392, "eval_samples_per_second": 194.404, "eval_steps_per_second": 24.301, "step": 34500 }, { "epoch": 2.081260907258322, "grad_norm": 0.9257709383964539, "learning_rate": 5.655885295098366e-06, "loss": 0.3348, "step": 34600 }, { "epoch": 2.081260907258322, "eval_loss": 0.296891450881958, "eval_runtime": 51.1063, "eval_samples_per_second": 195.671, "eval_steps_per_second": 24.459, "step": 34600 }, { "epoch": 2.087276112192595, "grad_norm": 0.9784733653068542, "learning_rate": 5.654884961653885e-06, "loss": 0.3351, "step": 34700 }, { "epoch": 2.087276112192595, "eval_loss": 0.30041709542274475, "eval_runtime": 36.3799, "eval_samples_per_second": 274.877, "eval_steps_per_second": 34.36, "step": 34700 }, { "epoch": 2.0932913171268677, "grad_norm": 0.9119441509246826, "learning_rate": 5.653884628209403e-06, "loss": 0.3331, "step": 34800 }, { "epoch": 2.0932913171268677, "eval_loss": 0.2985159754753113, "eval_runtime": 51.0698, "eval_samples_per_second": 195.811, "eval_steps_per_second": 24.476, "step": 34800 }, { "epoch": 2.09930652206114, "grad_norm": 0.8888152837753296, "learning_rate": 5.652884294764922e-06, "loss": 0.3329, "step": 34900 }, { "epoch": 2.09930652206114, "eval_loss": 0.2997465431690216, "eval_runtime": 51.2789, "eval_samples_per_second": 195.012, "eval_steps_per_second": 24.377, "step": 34900 }, { "epoch": 2.1053217269954128, "grad_norm": 0.9288111329078674, "learning_rate": 5.65188396132044e-06, "loss": 0.3293, "step": 35000 }, { "epoch": 2.1053217269954128, "eval_loss": 0.30220091342926025, "eval_runtime": 51.0672, "eval_samples_per_second": 195.82, "eval_steps_per_second": 24.478, "step": 35000 }, { "epoch": 2.111336931929685, "grad_norm": 0.9979832172393799, "learning_rate": 5.650883627875959e-06, "loss": 0.3335, "step": 35100 }, { "epoch": 2.111336931929685, "eval_loss": 0.2983012795448303, "eval_runtime": 51.1125, "eval_samples_per_second": 195.647, "eval_steps_per_second": 24.456, "step": 35100 }, { "epoch": 2.117352136863958, "grad_norm": 0.9908544421195984, "learning_rate": 5.649883294431477e-06, "loss": 0.3308, "step": 35200 }, { "epoch": 2.117352136863958, "eval_loss": 0.294648677110672, "eval_runtime": 51.0363, "eval_samples_per_second": 195.939, "eval_steps_per_second": 24.492, "step": 35200 }, { "epoch": 2.1233673417982306, "grad_norm": 0.9367330074310303, "learning_rate": 5.648882960986995e-06, "loss": 0.3308, "step": 35300 }, { "epoch": 2.1233673417982306, "eval_loss": 0.2953595817089081, "eval_runtime": 51.3129, "eval_samples_per_second": 194.883, "eval_steps_per_second": 24.36, "step": 35300 }, { "epoch": 2.129382546732503, "grad_norm": 0.923230767250061, "learning_rate": 5.647882627542515e-06, "loss": 0.3305, "step": 35400 }, { "epoch": 2.129382546732503, "eval_loss": 0.2954292893409729, "eval_runtime": 51.1146, "eval_samples_per_second": 195.639, "eval_steps_per_second": 24.455, "step": 35400 }, { "epoch": 2.1353977516667757, "grad_norm": 0.9737799167633057, "learning_rate": 5.6468822940980335e-06, "loss": 0.3321, "step": 35500 }, { "epoch": 2.1353977516667757, "eval_loss": 0.2911643981933594, "eval_runtime": 51.5291, "eval_samples_per_second": 194.065, "eval_steps_per_second": 24.258, "step": 35500 }, { "epoch": 2.1414129566010485, "grad_norm": 0.957861602306366, "learning_rate": 5.645881960653551e-06, "loss": 0.3304, "step": 35600 }, { "epoch": 2.1414129566010485, "eval_loss": 0.29846978187561035, "eval_runtime": 50.954, "eval_samples_per_second": 196.255, "eval_steps_per_second": 24.532, "step": 35600 }, { "epoch": 2.147428161535321, "grad_norm": 0.9183242321014404, "learning_rate": 5.64488162720907e-06, "loss": 0.3271, "step": 35700 }, { "epoch": 2.147428161535321, "eval_loss": 0.2944715619087219, "eval_runtime": 51.2205, "eval_samples_per_second": 195.234, "eval_steps_per_second": 24.404, "step": 35700 }, { "epoch": 2.1534433664695936, "grad_norm": 0.9701703190803528, "learning_rate": 5.643881293764588e-06, "loss": 0.3293, "step": 35800 }, { "epoch": 2.1534433664695936, "eval_loss": 0.29417359828948975, "eval_runtime": 51.0579, "eval_samples_per_second": 195.856, "eval_steps_per_second": 24.482, "step": 35800 }, { "epoch": 2.1594585714038663, "grad_norm": 0.992079496383667, "learning_rate": 5.642880960320107e-06, "loss": 0.3263, "step": 35900 }, { "epoch": 2.1594585714038663, "eval_loss": 0.29444122314453125, "eval_runtime": 51.0557, "eval_samples_per_second": 195.864, "eval_steps_per_second": 24.483, "step": 35900 }, { "epoch": 2.1654737763381386, "grad_norm": 0.9776268005371094, "learning_rate": 5.641880626875625e-06, "loss": 0.3266, "step": 36000 }, { "epoch": 2.1654737763381386, "eval_loss": 0.29786214232444763, "eval_runtime": 44.4576, "eval_samples_per_second": 224.934, "eval_steps_per_second": 28.117, "step": 36000 }, { "epoch": 2.1714889812724114, "grad_norm": 1.0352015495300293, "learning_rate": 5.640880293431144e-06, "loss": 0.3279, "step": 36100 }, { "epoch": 2.1714889812724114, "eval_loss": 0.2935112416744232, "eval_runtime": 51.0332, "eval_samples_per_second": 195.951, "eval_steps_per_second": 24.494, "step": 36100 }, { "epoch": 2.1775041862066837, "grad_norm": 0.9267537593841553, "learning_rate": 5.639879959986663e-06, "loss": 0.3252, "step": 36200 }, { "epoch": 2.1775041862066837, "eval_loss": 0.2946629822254181, "eval_runtime": 51.0517, "eval_samples_per_second": 195.88, "eval_steps_per_second": 24.485, "step": 36200 }, { "epoch": 2.1835193911409565, "grad_norm": 0.8838132619857788, "learning_rate": 5.6388796265421815e-06, "loss": 0.3273, "step": 36300 }, { "epoch": 2.1835193911409565, "eval_loss": 0.28932899236679077, "eval_runtime": 50.4286, "eval_samples_per_second": 198.3, "eval_steps_per_second": 24.788, "step": 36300 }, { "epoch": 2.1895345960752293, "grad_norm": 0.9279465079307556, "learning_rate": 5.637879293097699e-06, "loss": 0.3282, "step": 36400 }, { "epoch": 2.1895345960752293, "eval_loss": 0.2960895895957947, "eval_runtime": 51.1104, "eval_samples_per_second": 195.655, "eval_steps_per_second": 24.457, "step": 36400 }, { "epoch": 2.1955498010095016, "grad_norm": 1.0713165998458862, "learning_rate": 5.636878959653218e-06, "loss": 0.3269, "step": 36500 }, { "epoch": 2.1955498010095016, "eval_loss": 0.29087430238723755, "eval_runtime": 51.0616, "eval_samples_per_second": 195.842, "eval_steps_per_second": 24.48, "step": 36500 }, { "epoch": 2.2015650059437744, "grad_norm": 0.966033935546875, "learning_rate": 5.635878626208736e-06, "loss": 0.3258, "step": 36600 }, { "epoch": 2.2015650059437744, "eval_loss": 0.2945682108402252, "eval_runtime": 51.2162, "eval_samples_per_second": 195.251, "eval_steps_per_second": 24.406, "step": 36600 }, { "epoch": 2.207580210878047, "grad_norm": 1.0510607957839966, "learning_rate": 5.634878292764255e-06, "loss": 0.3239, "step": 36700 }, { "epoch": 2.207580210878047, "eval_loss": 0.29083874821662903, "eval_runtime": 51.0865, "eval_samples_per_second": 195.746, "eval_steps_per_second": 24.468, "step": 36700 }, { "epoch": 2.2135954158123194, "grad_norm": 0.9516984224319458, "learning_rate": 5.633877959319773e-06, "loss": 0.3242, "step": 36800 }, { "epoch": 2.2135954158123194, "eval_loss": 0.287597119808197, "eval_runtime": 51.2859, "eval_samples_per_second": 194.985, "eval_steps_per_second": 24.373, "step": 36800 }, { "epoch": 2.219610620746592, "grad_norm": 0.9704160094261169, "learning_rate": 5.632877625875292e-06, "loss": 0.3229, "step": 36900 }, { "epoch": 2.219610620746592, "eval_loss": 0.28357696533203125, "eval_runtime": 51.0184, "eval_samples_per_second": 196.008, "eval_steps_per_second": 24.501, "step": 36900 }, { "epoch": 2.225625825680865, "grad_norm": 0.9318411350250244, "learning_rate": 5.631877292430811e-06, "loss": 0.3244, "step": 37000 }, { "epoch": 2.225625825680865, "eval_loss": 0.2926484942436218, "eval_runtime": 51.0515, "eval_samples_per_second": 195.88, "eval_steps_per_second": 24.485, "step": 37000 }, { "epoch": 2.2316410306151373, "grad_norm": 0.9745403528213501, "learning_rate": 5.6308769589863294e-06, "loss": 0.3238, "step": 37100 }, { "epoch": 2.2316410306151373, "eval_loss": 0.29221734404563904, "eval_runtime": 51.0519, "eval_samples_per_second": 195.879, "eval_steps_per_second": 24.485, "step": 37100 }, { "epoch": 2.23765623554941, "grad_norm": 1.0162553787231445, "learning_rate": 5.629876625541847e-06, "loss": 0.3209, "step": 37200 }, { "epoch": 2.23765623554941, "eval_loss": 0.2900753319263458, "eval_runtime": 51.0188, "eval_samples_per_second": 196.006, "eval_steps_per_second": 24.501, "step": 37200 }, { "epoch": 2.243671440483683, "grad_norm": 0.9270024299621582, "learning_rate": 5.628876292097366e-06, "loss": 0.3218, "step": 37300 }, { "epoch": 2.243671440483683, "eval_loss": 0.29185083508491516, "eval_runtime": 49.1324, "eval_samples_per_second": 203.532, "eval_steps_per_second": 25.441, "step": 37300 }, { "epoch": 2.249686645417955, "grad_norm": 1.0156973600387573, "learning_rate": 5.627875958652885e-06, "loss": 0.3221, "step": 37400 }, { "epoch": 2.249686645417955, "eval_loss": 0.2883216440677643, "eval_runtime": 51.0198, "eval_samples_per_second": 196.002, "eval_steps_per_second": 24.5, "step": 37400 }, { "epoch": 2.255701850352228, "grad_norm": 0.884667694568634, "learning_rate": 5.6268756252084026e-06, "loss": 0.3231, "step": 37500 }, { "epoch": 2.255701850352228, "eval_loss": 0.2843243181705475, "eval_runtime": 51.199, "eval_samples_per_second": 195.316, "eval_steps_per_second": 24.415, "step": 37500 }, { "epoch": 2.2617170552865007, "grad_norm": 1.0025333166122437, "learning_rate": 5.625875291763921e-06, "loss": 0.32, "step": 37600 }, { "epoch": 2.2617170552865007, "eval_loss": 0.28985723853111267, "eval_runtime": 51.0474, "eval_samples_per_second": 195.896, "eval_steps_per_second": 24.487, "step": 37600 }, { "epoch": 2.267732260220773, "grad_norm": 0.9673831462860107, "learning_rate": 5.62487495831944e-06, "loss": 0.322, "step": 37700 }, { "epoch": 2.267732260220773, "eval_loss": 0.2844723165035248, "eval_runtime": 51.066, "eval_samples_per_second": 195.825, "eval_steps_per_second": 24.478, "step": 37700 }, { "epoch": 2.2737474651550458, "grad_norm": 0.9513309597969055, "learning_rate": 5.623874624874959e-06, "loss": 0.3202, "step": 37800 }, { "epoch": 2.2737474651550458, "eval_loss": 0.28764039278030396, "eval_runtime": 51.061, "eval_samples_per_second": 195.844, "eval_steps_per_second": 24.481, "step": 37800 }, { "epoch": 2.279762670089318, "grad_norm": 0.9131941795349121, "learning_rate": 5.622874291430477e-06, "loss": 0.3226, "step": 37900 }, { "epoch": 2.279762670089318, "eval_loss": 0.28673484921455383, "eval_runtime": 51.0581, "eval_samples_per_second": 195.855, "eval_steps_per_second": 24.482, "step": 37900 }, { "epoch": 2.285777875023591, "grad_norm": 0.9458931684494019, "learning_rate": 5.621873957985995e-06, "loss": 0.3206, "step": 38000 }, { "epoch": 2.285777875023591, "eval_loss": 0.2862774133682251, "eval_runtime": 36.7081, "eval_samples_per_second": 272.419, "eval_steps_per_second": 34.052, "step": 38000 }, { "epoch": 2.2917930799578636, "grad_norm": 0.997297465801239, "learning_rate": 5.620873624541514e-06, "loss": 0.3191, "step": 38100 }, { "epoch": 2.2917930799578636, "eval_loss": 0.2823648750782013, "eval_runtime": 51.0962, "eval_samples_per_second": 195.709, "eval_steps_per_second": 24.464, "step": 38100 }, { "epoch": 2.297808284892136, "grad_norm": 0.9200996160507202, "learning_rate": 5.619873291097033e-06, "loss": 0.3187, "step": 38200 }, { "epoch": 2.297808284892136, "eval_loss": 0.2872503995895386, "eval_runtime": 51.0809, "eval_samples_per_second": 195.768, "eval_steps_per_second": 24.471, "step": 38200 }, { "epoch": 2.3038234898264087, "grad_norm": 0.9441711902618408, "learning_rate": 5.6188729576525505e-06, "loss": 0.3209, "step": 38300 }, { "epoch": 2.3038234898264087, "eval_loss": 0.28855210542678833, "eval_runtime": 51.0269, "eval_samples_per_second": 195.975, "eval_steps_per_second": 24.497, "step": 38300 }, { "epoch": 2.3098386947606815, "grad_norm": 1.0377998352050781, "learning_rate": 5.617872624208069e-06, "loss": 0.3189, "step": 38400 }, { "epoch": 2.3098386947606815, "eval_loss": 0.2817797362804413, "eval_runtime": 51.0556, "eval_samples_per_second": 195.865, "eval_steps_per_second": 24.483, "step": 38400 }, { "epoch": 2.315853899694954, "grad_norm": 0.9088771939277649, "learning_rate": 5.616872290763588e-06, "loss": 0.3183, "step": 38500 }, { "epoch": 2.315853899694954, "eval_loss": 0.28079554438591003, "eval_runtime": 51.0907, "eval_samples_per_second": 195.73, "eval_steps_per_second": 24.466, "step": 38500 }, { "epoch": 2.3218691046292266, "grad_norm": 0.8959800004959106, "learning_rate": 5.615871957319107e-06, "loss": 0.3174, "step": 38600 }, { "epoch": 2.3218691046292266, "eval_loss": 0.28803524374961853, "eval_runtime": 50.9133, "eval_samples_per_second": 196.412, "eval_steps_per_second": 24.552, "step": 38600 }, { "epoch": 2.3278843095634993, "grad_norm": 0.9056723713874817, "learning_rate": 5.614871623874625e-06, "loss": 0.3167, "step": 38700 }, { "epoch": 2.3278843095634993, "eval_loss": 0.2826622426509857, "eval_runtime": 50.7905, "eval_samples_per_second": 196.887, "eval_steps_per_second": 24.611, "step": 38700 }, { "epoch": 2.3338995144977717, "grad_norm": 0.9248780608177185, "learning_rate": 5.613871290430143e-06, "loss": 0.3176, "step": 38800 }, { "epoch": 2.3338995144977717, "eval_loss": 0.2767186462879181, "eval_runtime": 50.6115, "eval_samples_per_second": 197.583, "eval_steps_per_second": 24.698, "step": 38800 }, { "epoch": 2.3399147194320444, "grad_norm": 0.9541249871253967, "learning_rate": 5.612870956985662e-06, "loss": 0.3187, "step": 38900 }, { "epoch": 2.3399147194320444, "eval_loss": 0.28110334277153015, "eval_runtime": 49.7615, "eval_samples_per_second": 200.959, "eval_steps_per_second": 25.12, "step": 38900 }, { "epoch": 2.3459299243663168, "grad_norm": 0.9116654396057129, "learning_rate": 5.611870623541181e-06, "loss": 0.3147, "step": 39000 }, { "epoch": 2.3459299243663168, "eval_loss": 0.2833644449710846, "eval_runtime": 50.9711, "eval_samples_per_second": 196.19, "eval_steps_per_second": 24.524, "step": 39000 }, { "epoch": 2.3519451293005895, "grad_norm": 0.9693782329559326, "learning_rate": 5.6108702900966985e-06, "loss": 0.3187, "step": 39100 }, { "epoch": 2.3519451293005895, "eval_loss": 0.2744785249233246, "eval_runtime": 51.0233, "eval_samples_per_second": 195.989, "eval_steps_per_second": 24.499, "step": 39100 }, { "epoch": 2.3579603342348623, "grad_norm": 0.911391019821167, "learning_rate": 5.609869956652217e-06, "loss": 0.3144, "step": 39200 }, { "epoch": 2.3579603342348623, "eval_loss": 0.27756959199905396, "eval_runtime": 50.862, "eval_samples_per_second": 196.61, "eval_steps_per_second": 24.576, "step": 39200 }, { "epoch": 2.3639755391691346, "grad_norm": 0.9383348822593689, "learning_rate": 5.608869623207736e-06, "loss": 0.3167, "step": 39300 }, { "epoch": 2.3639755391691346, "eval_loss": 0.2751516103744507, "eval_runtime": 51.0203, "eval_samples_per_second": 196.001, "eval_steps_per_second": 24.5, "step": 39300 }, { "epoch": 2.3699907441034074, "grad_norm": 0.8825791478157043, "learning_rate": 5.607869289763255e-06, "loss": 0.3133, "step": 39400 }, { "epoch": 2.3699907441034074, "eval_loss": 0.27583110332489014, "eval_runtime": 51.0047, "eval_samples_per_second": 196.06, "eval_steps_per_second": 24.508, "step": 39400 }, { "epoch": 2.37600594903768, "grad_norm": 0.9765325784683228, "learning_rate": 5.606868956318773e-06, "loss": 0.314, "step": 39500 }, { "epoch": 2.37600594903768, "eval_loss": 0.2750406563282013, "eval_runtime": 51.0333, "eval_samples_per_second": 195.95, "eval_steps_per_second": 24.494, "step": 39500 }, { "epoch": 2.3820211539719525, "grad_norm": 0.968429684638977, "learning_rate": 5.605868622874291e-06, "loss": 0.3162, "step": 39600 }, { "epoch": 2.3820211539719525, "eval_loss": 0.28406116366386414, "eval_runtime": 50.9992, "eval_samples_per_second": 196.081, "eval_steps_per_second": 24.51, "step": 39600 }, { "epoch": 2.3880363589062252, "grad_norm": 0.9351980686187744, "learning_rate": 5.60486828942981e-06, "loss": 0.3087, "step": 39700 }, { "epoch": 2.3880363589062252, "eval_loss": 0.2797408103942871, "eval_runtime": 51.0101, "eval_samples_per_second": 196.04, "eval_steps_per_second": 24.505, "step": 39700 }, { "epoch": 2.394051563840498, "grad_norm": 0.9547052383422852, "learning_rate": 5.603867955985329e-06, "loss": 0.3139, "step": 39800 }, { "epoch": 2.394051563840498, "eval_loss": 0.2779112458229065, "eval_runtime": 50.9598, "eval_samples_per_second": 196.233, "eval_steps_per_second": 24.529, "step": 39800 }, { "epoch": 2.4000667687747703, "grad_norm": 0.8971194624900818, "learning_rate": 5.6028676225408465e-06, "loss": 0.3113, "step": 39900 }, { "epoch": 2.4000667687747703, "eval_loss": 0.28396087884902954, "eval_runtime": 51.1122, "eval_samples_per_second": 195.648, "eval_steps_per_second": 24.456, "step": 39900 }, { "epoch": 2.406081973709043, "grad_norm": 0.9058307409286499, "learning_rate": 5.601867289096365e-06, "loss": 0.314, "step": 40000 }, { "epoch": 2.406081973709043, "eval_loss": 0.2806677222251892, "eval_runtime": 50.9901, "eval_samples_per_second": 196.117, "eval_steps_per_second": 24.515, "step": 40000 }, { "epoch": 2.4120971786433154, "grad_norm": 0.9002136588096619, "learning_rate": 5.600866955651885e-06, "loss": 0.3107, "step": 40100 }, { "epoch": 2.4120971786433154, "eval_loss": 0.2816166579723358, "eval_runtime": 50.992, "eval_samples_per_second": 196.109, "eval_steps_per_second": 24.514, "step": 40100 }, { "epoch": 2.418112383577588, "grad_norm": 0.9614746570587158, "learning_rate": 5.599866622207403e-06, "loss": 0.3107, "step": 40200 }, { "epoch": 2.418112383577588, "eval_loss": 0.2749168276786804, "eval_runtime": 51.0276, "eval_samples_per_second": 195.972, "eval_steps_per_second": 24.497, "step": 40200 }, { "epoch": 2.424127588511861, "grad_norm": 0.8742543458938599, "learning_rate": 5.598866288762921e-06, "loss": 0.3149, "step": 40300 }, { "epoch": 2.424127588511861, "eval_loss": 0.2682496905326843, "eval_runtime": 49.6212, "eval_samples_per_second": 201.527, "eval_steps_per_second": 25.191, "step": 40300 }, { "epoch": 2.4301427934461337, "grad_norm": 0.9011858105659485, "learning_rate": 5.59786595531844e-06, "loss": 0.3094, "step": 40400 }, { "epoch": 2.4301427934461337, "eval_loss": 0.277034729719162, "eval_runtime": 51.0504, "eval_samples_per_second": 195.885, "eval_steps_per_second": 24.486, "step": 40400 }, { "epoch": 2.436157998380406, "grad_norm": 0.9290640950202942, "learning_rate": 5.596865621873958e-06, "loss": 0.3114, "step": 40500 }, { "epoch": 2.436157998380406, "eval_loss": 0.27406954765319824, "eval_runtime": 51.0172, "eval_samples_per_second": 196.012, "eval_steps_per_second": 24.502, "step": 40500 }, { "epoch": 2.442173203314679, "grad_norm": 0.89925616979599, "learning_rate": 5.5958652884294766e-06, "loss": 0.3096, "step": 40600 }, { "epoch": 2.442173203314679, "eval_loss": 0.2777319848537445, "eval_runtime": 51.1656, "eval_samples_per_second": 195.444, "eval_steps_per_second": 24.43, "step": 40600 }, { "epoch": 2.448188408248951, "grad_norm": 0.8584897518157959, "learning_rate": 5.594864954984994e-06, "loss": 0.3123, "step": 40700 }, { "epoch": 2.448188408248951, "eval_loss": 0.27250877022743225, "eval_runtime": 51.0648, "eval_samples_per_second": 195.829, "eval_steps_per_second": 24.479, "step": 40700 }, { "epoch": 2.454203613183224, "grad_norm": 0.9398366808891296, "learning_rate": 5.593864621540514e-06, "loss": 0.3108, "step": 40800 }, { "epoch": 2.454203613183224, "eval_loss": 0.27442407608032227, "eval_runtime": 51.06, "eval_samples_per_second": 195.848, "eval_steps_per_second": 24.481, "step": 40800 }, { "epoch": 2.4602188181174967, "grad_norm": 0.8771011233329773, "learning_rate": 5.592864288096033e-06, "loss": 0.3107, "step": 40900 }, { "epoch": 2.4602188181174967, "eval_loss": 0.27768152952194214, "eval_runtime": 51.1346, "eval_samples_per_second": 195.562, "eval_steps_per_second": 24.445, "step": 40900 }, { "epoch": 2.466234023051769, "grad_norm": 0.922232449054718, "learning_rate": 5.5918639546515505e-06, "loss": 0.3082, "step": 41000 }, { "epoch": 2.466234023051769, "eval_loss": 0.27813389897346497, "eval_runtime": 28.068, "eval_samples_per_second": 356.278, "eval_steps_per_second": 44.535, "step": 41000 }, { "epoch": 2.4722492279860417, "grad_norm": 0.9415081143379211, "learning_rate": 5.590863621207069e-06, "loss": 0.3105, "step": 41100 }, { "epoch": 2.4722492279860417, "eval_loss": 0.27401283383369446, "eval_runtime": 50.7464, "eval_samples_per_second": 197.058, "eval_steps_per_second": 24.632, "step": 41100 }, { "epoch": 2.4782644329203145, "grad_norm": 0.8894750475883484, "learning_rate": 5.589863287762588e-06, "loss": 0.31, "step": 41200 }, { "epoch": 2.4782644329203145, "eval_loss": 0.2711414694786072, "eval_runtime": 50.7544, "eval_samples_per_second": 197.027, "eval_steps_per_second": 24.628, "step": 41200 }, { "epoch": 2.484279637854587, "grad_norm": 0.8910822868347168, "learning_rate": 5.588862954318106e-06, "loss": 0.3064, "step": 41300 }, { "epoch": 2.484279637854587, "eval_loss": 0.2753881514072418, "eval_runtime": 48.2521, "eval_samples_per_second": 207.245, "eval_steps_per_second": 25.906, "step": 41300 }, { "epoch": 2.4902948427888596, "grad_norm": 0.890864908695221, "learning_rate": 5.5878626208736245e-06, "loss": 0.3042, "step": 41400 }, { "epoch": 2.4902948427888596, "eval_loss": 0.27833056449890137, "eval_runtime": 44.9081, "eval_samples_per_second": 222.677, "eval_steps_per_second": 27.835, "step": 41400 }, { "epoch": 2.4963100477231324, "grad_norm": 0.8507567048072815, "learning_rate": 5.586862287429143e-06, "loss": 0.308, "step": 41500 }, { "epoch": 2.4963100477231324, "eval_loss": 0.2749514579772949, "eval_runtime": 45.6991, "eval_samples_per_second": 218.823, "eval_steps_per_second": 27.353, "step": 41500 }, { "epoch": 2.5023252526574047, "grad_norm": 1.0246086120605469, "learning_rate": 5.585861953984662e-06, "loss": 0.308, "step": 41600 }, { "epoch": 2.5023252526574047, "eval_loss": 0.2693102955818176, "eval_runtime": 48.6013, "eval_samples_per_second": 205.756, "eval_steps_per_second": 25.719, "step": 41600 }, { "epoch": 2.5083404575916775, "grad_norm": 1.015673279762268, "learning_rate": 5.584861620540181e-06, "loss": 0.3062, "step": 41700 }, { "epoch": 2.5083404575916775, "eval_loss": 0.2740586996078491, "eval_runtime": 49.0311, "eval_samples_per_second": 203.952, "eval_steps_per_second": 25.494, "step": 41700 }, { "epoch": 2.5143556625259498, "grad_norm": 0.9325861930847168, "learning_rate": 5.5838612870956985e-06, "loss": 0.3085, "step": 41800 }, { "epoch": 2.5143556625259498, "eval_loss": 0.2755836844444275, "eval_runtime": 49.0354, "eval_samples_per_second": 203.934, "eval_steps_per_second": 25.492, "step": 41800 }, { "epoch": 2.5203708674602225, "grad_norm": 0.8402740359306335, "learning_rate": 5.582860953651217e-06, "loss": 0.3074, "step": 41900 }, { "epoch": 2.5203708674602225, "eval_loss": 0.2750794291496277, "eval_runtime": 49.6049, "eval_samples_per_second": 201.593, "eval_steps_per_second": 25.199, "step": 41900 }, { "epoch": 2.5263860723944953, "grad_norm": 0.8873264193534851, "learning_rate": 5.581860620206736e-06, "loss": 0.3073, "step": 42000 }, { "epoch": 2.5263860723944953, "eval_loss": 0.2801840901374817, "eval_runtime": 49.3914, "eval_samples_per_second": 202.464, "eval_steps_per_second": 25.308, "step": 42000 }, { "epoch": 2.5324012773287676, "grad_norm": 0.9626051187515259, "learning_rate": 5.580860286762254e-06, "loss": 0.3068, "step": 42100 }, { "epoch": 2.5324012773287676, "eval_loss": 0.2711939811706543, "eval_runtime": 49.617, "eval_samples_per_second": 201.544, "eval_steps_per_second": 25.193, "step": 42100 }, { "epoch": 2.5384164822630404, "grad_norm": 0.9168198108673096, "learning_rate": 5.5798599533177725e-06, "loss": 0.3059, "step": 42200 }, { "epoch": 2.5384164822630404, "eval_loss": 0.270614355802536, "eval_runtime": 50.1412, "eval_samples_per_second": 199.437, "eval_steps_per_second": 24.93, "step": 42200 }, { "epoch": 2.544431687197313, "grad_norm": 0.9542158842086792, "learning_rate": 5.578859619873291e-06, "loss": 0.3061, "step": 42300 }, { "epoch": 2.544431687197313, "eval_loss": 0.2705308198928833, "eval_runtime": 50.4655, "eval_samples_per_second": 198.155, "eval_steps_per_second": 24.769, "step": 42300 }, { "epoch": 2.5504468921315855, "grad_norm": 0.8468143939971924, "learning_rate": 5.57785928642881e-06, "loss": 0.3048, "step": 42400 }, { "epoch": 2.5504468921315855, "eval_loss": 0.27329984307289124, "eval_runtime": 50.4318, "eval_samples_per_second": 198.288, "eval_steps_per_second": 24.786, "step": 42400 }, { "epoch": 2.5564620970658583, "grad_norm": 0.9493191838264465, "learning_rate": 5.576858952984329e-06, "loss": 0.3019, "step": 42500 }, { "epoch": 2.5564620970658583, "eval_loss": 0.2731817364692688, "eval_runtime": 50.5666, "eval_samples_per_second": 197.759, "eval_steps_per_second": 24.72, "step": 42500 }, { "epoch": 2.562477302000131, "grad_norm": 0.9617642760276794, "learning_rate": 5.5758586195398465e-06, "loss": 0.3012, "step": 42600 }, { "epoch": 2.562477302000131, "eval_loss": 0.26970621943473816, "eval_runtime": 51.0766, "eval_samples_per_second": 195.784, "eval_steps_per_second": 24.473, "step": 42600 }, { "epoch": 2.5684925069344033, "grad_norm": 0.9389893412590027, "learning_rate": 5.574858286095365e-06, "loss": 0.3027, "step": 42700 }, { "epoch": 2.5684925069344033, "eval_loss": 0.27145934104919434, "eval_runtime": 51.074, "eval_samples_per_second": 195.794, "eval_steps_per_second": 24.474, "step": 42700 }, { "epoch": 2.574507711868676, "grad_norm": 0.9073367714881897, "learning_rate": 5.573857952650884e-06, "loss": 0.3021, "step": 42800 }, { "epoch": 2.574507711868676, "eval_loss": 0.2711017429828644, "eval_runtime": 51.072, "eval_samples_per_second": 195.802, "eval_steps_per_second": 24.475, "step": 42800 }, { "epoch": 2.5805229168029484, "grad_norm": 0.8948126435279846, "learning_rate": 5.572857619206402e-06, "loss": 0.302, "step": 42900 }, { "epoch": 2.5805229168029484, "eval_loss": 0.2703753113746643, "eval_runtime": 51.0323, "eval_samples_per_second": 195.954, "eval_steps_per_second": 24.494, "step": 42900 }, { "epoch": 2.586538121737221, "grad_norm": 0.943368136882782, "learning_rate": 5.5718572857619205e-06, "loss": 0.3007, "step": 43000 }, { "epoch": 2.586538121737221, "eval_loss": 0.2676005959510803, "eval_runtime": 51.147, "eval_samples_per_second": 195.515, "eval_steps_per_second": 24.439, "step": 43000 }, { "epoch": 2.592553326671494, "grad_norm": 0.9073809385299683, "learning_rate": 5.570856952317439e-06, "loss": 0.3004, "step": 43100 }, { "epoch": 2.592553326671494, "eval_loss": 0.26843926310539246, "eval_runtime": 51.0148, "eval_samples_per_second": 196.021, "eval_steps_per_second": 24.503, "step": 43100 }, { "epoch": 2.5985685316057667, "grad_norm": 0.9534226655960083, "learning_rate": 5.569856618872958e-06, "loss": 0.3039, "step": 43200 }, { "epoch": 2.5985685316057667, "eval_loss": 0.2675269842147827, "eval_runtime": 51.1418, "eval_samples_per_second": 195.535, "eval_steps_per_second": 24.442, "step": 43200 }, { "epoch": 2.604583736540039, "grad_norm": 0.8546542525291443, "learning_rate": 5.5688562854284766e-06, "loss": 0.3008, "step": 43300 }, { "epoch": 2.604583736540039, "eval_loss": 0.2680804133415222, "eval_runtime": 51.0519, "eval_samples_per_second": 195.879, "eval_steps_per_second": 24.485, "step": 43300 }, { "epoch": 2.610598941474312, "grad_norm": 0.9167499542236328, "learning_rate": 5.567855951983995e-06, "loss": 0.3001, "step": 43400 }, { "epoch": 2.610598941474312, "eval_loss": 0.26866093277931213, "eval_runtime": 51.2331, "eval_samples_per_second": 195.186, "eval_steps_per_second": 24.398, "step": 43400 }, { "epoch": 2.616614146408584, "grad_norm": 0.9243641495704651, "learning_rate": 5.566855618539513e-06, "loss": 0.3007, "step": 43500 }, { "epoch": 2.616614146408584, "eval_loss": 0.27828356623649597, "eval_runtime": 35.4476, "eval_samples_per_second": 282.107, "eval_steps_per_second": 35.263, "step": 43500 }, { "epoch": 2.622629351342857, "grad_norm": 0.9069240689277649, "learning_rate": 5.565855285095032e-06, "loss": 0.3039, "step": 43600 }, { "epoch": 2.622629351342857, "eval_loss": 0.27373048663139343, "eval_runtime": 51.0712, "eval_samples_per_second": 195.805, "eval_steps_per_second": 24.476, "step": 43600 }, { "epoch": 2.6286445562771297, "grad_norm": 0.8967992663383484, "learning_rate": 5.56485495165055e-06, "loss": 0.3026, "step": 43700 }, { "epoch": 2.6286445562771297, "eval_loss": 0.2672281861305237, "eval_runtime": 51.0214, "eval_samples_per_second": 195.996, "eval_steps_per_second": 24.5, "step": 43700 }, { "epoch": 2.634659761211402, "grad_norm": 0.8463547229766846, "learning_rate": 5.563854618206068e-06, "loss": 0.3018, "step": 43800 }, { "epoch": 2.634659761211402, "eval_loss": 0.2690221071243286, "eval_runtime": 51.0223, "eval_samples_per_second": 195.993, "eval_steps_per_second": 24.499, "step": 43800 }, { "epoch": 2.6406749661456748, "grad_norm": 0.8656585812568665, "learning_rate": 5.562854284761587e-06, "loss": 0.3019, "step": 43900 }, { "epoch": 2.6406749661456748, "eval_loss": 0.2694147229194641, "eval_runtime": 51.2059, "eval_samples_per_second": 195.29, "eval_steps_per_second": 24.411, "step": 43900 }, { "epoch": 2.646690171079947, "grad_norm": 0.8388367891311646, "learning_rate": 5.561853951317106e-06, "loss": 0.299, "step": 44000 }, { "epoch": 2.646690171079947, "eval_loss": 0.27004268765449524, "eval_runtime": 51.0385, "eval_samples_per_second": 195.93, "eval_steps_per_second": 24.491, "step": 44000 }, { "epoch": 2.65270537601422, "grad_norm": 0.8733914494514465, "learning_rate": 5.5608536178726245e-06, "loss": 0.2996, "step": 44100 }, { "epoch": 2.65270537601422, "eval_loss": 0.2620984613895416, "eval_runtime": 51.1206, "eval_samples_per_second": 195.616, "eval_steps_per_second": 24.452, "step": 44100 }, { "epoch": 2.6587205809484926, "grad_norm": 0.825485348701477, "learning_rate": 5.559853284428143e-06, "loss": 0.2996, "step": 44200 }, { "epoch": 2.6587205809484926, "eval_loss": 0.26619336009025574, "eval_runtime": 50.9856, "eval_samples_per_second": 196.134, "eval_steps_per_second": 24.517, "step": 44200 }, { "epoch": 2.6647357858827654, "grad_norm": 0.9234973192214966, "learning_rate": 5.558852950983661e-06, "loss": 0.2994, "step": 44300 }, { "epoch": 2.6647357858827654, "eval_loss": 0.269397497177124, "eval_runtime": 51.1229, "eval_samples_per_second": 195.607, "eval_steps_per_second": 24.451, "step": 44300 }, { "epoch": 2.6707509908170377, "grad_norm": 0.9815935492515564, "learning_rate": 5.55785261753918e-06, "loss": 0.2964, "step": 44400 }, { "epoch": 2.6707509908170377, "eval_loss": 0.26540160179138184, "eval_runtime": 51.0268, "eval_samples_per_second": 195.975, "eval_steps_per_second": 24.497, "step": 44400 }, { "epoch": 2.6767661957513105, "grad_norm": 0.8895259499549866, "learning_rate": 5.5568522840946985e-06, "loss": 0.2943, "step": 44500 }, { "epoch": 2.6767661957513105, "eval_loss": 0.2682526707649231, "eval_runtime": 51.188, "eval_samples_per_second": 195.358, "eval_steps_per_second": 24.42, "step": 44500 }, { "epoch": 2.682781400685583, "grad_norm": 0.8415577411651611, "learning_rate": 5.555851950650216e-06, "loss": 0.2972, "step": 44600 }, { "epoch": 2.682781400685583, "eval_loss": 0.2677549421787262, "eval_runtime": 51.1092, "eval_samples_per_second": 195.66, "eval_steps_per_second": 24.457, "step": 44600 }, { "epoch": 2.6887966056198556, "grad_norm": 0.8922407031059265, "learning_rate": 5.554851617205736e-06, "loss": 0.2969, "step": 44700 }, { "epoch": 2.6887966056198556, "eval_loss": 0.2671573758125305, "eval_runtime": 51.0789, "eval_samples_per_second": 195.776, "eval_steps_per_second": 24.472, "step": 44700 }, { "epoch": 2.6948118105541283, "grad_norm": 1.0156275033950806, "learning_rate": 5.553851283761254e-06, "loss": 0.2972, "step": 44800 }, { "epoch": 2.6948118105541283, "eval_loss": 0.26524412631988525, "eval_runtime": 51.0819, "eval_samples_per_second": 195.764, "eval_steps_per_second": 24.471, "step": 44800 }, { "epoch": 2.7008270154884007, "grad_norm": 0.9283206462860107, "learning_rate": 5.5528509503167725e-06, "loss": 0.2953, "step": 44900 }, { "epoch": 2.7008270154884007, "eval_loss": 0.26051226258277893, "eval_runtime": 51.0731, "eval_samples_per_second": 195.798, "eval_steps_per_second": 24.475, "step": 44900 }, { "epoch": 2.7068422204226734, "grad_norm": 0.9081267714500427, "learning_rate": 5.551850616872291e-06, "loss": 0.2956, "step": 45000 }, { "epoch": 2.7068422204226734, "eval_loss": 0.26829174160957336, "eval_runtime": 51.0764, "eval_samples_per_second": 195.785, "eval_steps_per_second": 24.473, "step": 45000 }, { "epoch": 2.712857425356946, "grad_norm": 0.9797186255455017, "learning_rate": 5.550850283427809e-06, "loss": 0.2951, "step": 45100 }, { "epoch": 2.712857425356946, "eval_loss": 0.2626285254955292, "eval_runtime": 51.0441, "eval_samples_per_second": 195.909, "eval_steps_per_second": 24.489, "step": 45100 }, { "epoch": 2.7188726302912185, "grad_norm": 0.972873866558075, "learning_rate": 5.549849949983328e-06, "loss": 0.2938, "step": 45200 }, { "epoch": 2.7188726302912185, "eval_loss": 0.2651112675666809, "eval_runtime": 51.1856, "eval_samples_per_second": 195.368, "eval_steps_per_second": 24.421, "step": 45200 }, { "epoch": 2.7248878352254913, "grad_norm": 0.8637024164199829, "learning_rate": 5.5488496165388465e-06, "loss": 0.2951, "step": 45300 }, { "epoch": 2.7248878352254913, "eval_loss": 0.26248618960380554, "eval_runtime": 51.1456, "eval_samples_per_second": 195.52, "eval_steps_per_second": 24.44, "step": 45300 }, { "epoch": 2.730903040159764, "grad_norm": 0.9163945317268372, "learning_rate": 5.547849283094365e-06, "loss": 0.2948, "step": 45400 }, { "epoch": 2.730903040159764, "eval_loss": 0.2693786025047302, "eval_runtime": 51.0867, "eval_samples_per_second": 195.746, "eval_steps_per_second": 24.468, "step": 45400 }, { "epoch": 2.7369182450940364, "grad_norm": 1.0530128479003906, "learning_rate": 5.546848949649884e-06, "loss": 0.2944, "step": 45500 }, { "epoch": 2.7369182450940364, "eval_loss": 0.2621295750141144, "eval_runtime": 51.1036, "eval_samples_per_second": 195.681, "eval_steps_per_second": 24.46, "step": 45500 }, { "epoch": 2.742933450028309, "grad_norm": 0.9258381128311157, "learning_rate": 5.545848616205402e-06, "loss": 0.2943, "step": 45600 }, { "epoch": 2.742933450028309, "eval_loss": 0.25974345207214355, "eval_runtime": 51.1397, "eval_samples_per_second": 195.543, "eval_steps_per_second": 24.443, "step": 45600 }, { "epoch": 2.7489486549625815, "grad_norm": 0.8768019676208496, "learning_rate": 5.5448482827609205e-06, "loss": 0.2934, "step": 45700 }, { "epoch": 2.7489486549625815, "eval_loss": 0.26323673129081726, "eval_runtime": 51.1134, "eval_samples_per_second": 195.643, "eval_steps_per_second": 24.455, "step": 45700 }, { "epoch": 2.754963859896854, "grad_norm": 0.8610267639160156, "learning_rate": 5.543847949316439e-06, "loss": 0.2934, "step": 45800 }, { "epoch": 2.754963859896854, "eval_loss": 0.2621345818042755, "eval_runtime": 51.0875, "eval_samples_per_second": 195.743, "eval_steps_per_second": 24.468, "step": 45800 }, { "epoch": 2.760979064831127, "grad_norm": 0.8272863626480103, "learning_rate": 5.542847615871957e-06, "loss": 0.2952, "step": 45900 }, { "epoch": 2.760979064831127, "eval_loss": 0.2651170790195465, "eval_runtime": 51.1189, "eval_samples_per_second": 195.622, "eval_steps_per_second": 24.453, "step": 45900 }, { "epoch": 2.7669942697653997, "grad_norm": 0.8691322207450867, "learning_rate": 5.541847282427476e-06, "loss": 0.2903, "step": 46000 }, { "epoch": 2.7669942697653997, "eval_loss": 0.2674708664417267, "eval_runtime": 51.0977, "eval_samples_per_second": 195.704, "eval_steps_per_second": 24.463, "step": 46000 }, { "epoch": 2.773009474699672, "grad_norm": 0.9887429475784302, "learning_rate": 5.5408469489829944e-06, "loss": 0.2931, "step": 46100 }, { "epoch": 2.773009474699672, "eval_loss": 0.2632472515106201, "eval_runtime": 51.1106, "eval_samples_per_second": 195.654, "eval_steps_per_second": 24.457, "step": 46100 }, { "epoch": 2.779024679633945, "grad_norm": 0.9419971704483032, "learning_rate": 5.539846615538513e-06, "loss": 0.2933, "step": 46200 }, { "epoch": 2.779024679633945, "eval_loss": 0.2613042891025543, "eval_runtime": 51.0338, "eval_samples_per_second": 195.949, "eval_steps_per_second": 24.494, "step": 46200 }, { "epoch": 2.785039884568217, "grad_norm": 0.9267482161521912, "learning_rate": 5.538846282094032e-06, "loss": 0.2915, "step": 46300 }, { "epoch": 2.785039884568217, "eval_loss": 0.2661626935005188, "eval_runtime": 51.084, "eval_samples_per_second": 195.756, "eval_steps_per_second": 24.469, "step": 46300 }, { "epoch": 2.79105508950249, "grad_norm": 0.9020786285400391, "learning_rate": 5.5378459486495506e-06, "loss": 0.2933, "step": 46400 }, { "epoch": 2.79105508950249, "eval_loss": 0.2588748335838318, "eval_runtime": 51.1198, "eval_samples_per_second": 195.619, "eval_steps_per_second": 24.452, "step": 46400 }, { "epoch": 2.7970702944367627, "grad_norm": 0.893649160861969, "learning_rate": 5.5368456152050684e-06, "loss": 0.2914, "step": 46500 }, { "epoch": 2.7970702944367627, "eval_loss": 0.2560584545135498, "eval_runtime": 51.1578, "eval_samples_per_second": 195.474, "eval_steps_per_second": 24.434, "step": 46500 }, { "epoch": 2.803085499371035, "grad_norm": 0.8569892644882202, "learning_rate": 5.535845281760587e-06, "loss": 0.2921, "step": 46600 }, { "epoch": 2.803085499371035, "eval_loss": 0.26415926218032837, "eval_runtime": 48.8588, "eval_samples_per_second": 204.672, "eval_steps_per_second": 25.584, "step": 46600 }, { "epoch": 2.809100704305308, "grad_norm": 0.967966616153717, "learning_rate": 5.534844948316105e-06, "loss": 0.2932, "step": 46700 }, { "epoch": 2.809100704305308, "eval_loss": 0.262004554271698, "eval_runtime": 51.1167, "eval_samples_per_second": 195.631, "eval_steps_per_second": 24.454, "step": 46700 }, { "epoch": 2.81511590923958, "grad_norm": 0.8977293968200684, "learning_rate": 5.533844614871624e-06, "loss": 0.291, "step": 46800 }, { "epoch": 2.81511590923958, "eval_loss": 0.26304325461387634, "eval_runtime": 51.1071, "eval_samples_per_second": 195.668, "eval_steps_per_second": 24.458, "step": 46800 }, { "epoch": 2.821131114173853, "grad_norm": 0.8833451271057129, "learning_rate": 5.532844281427142e-06, "loss": 0.2879, "step": 46900 }, { "epoch": 2.821131114173853, "eval_loss": 0.2652186155319214, "eval_runtime": 51.1212, "eval_samples_per_second": 195.614, "eval_steps_per_second": 24.452, "step": 46900 }, { "epoch": 2.8271463191081256, "grad_norm": 0.916098415851593, "learning_rate": 5.531843947982661e-06, "loss": 0.29, "step": 47000 }, { "epoch": 2.8271463191081256, "eval_loss": 0.2618425190448761, "eval_runtime": 51.1419, "eval_samples_per_second": 195.534, "eval_steps_per_second": 24.442, "step": 47000 }, { "epoch": 2.8331615240423984, "grad_norm": 0.8808870315551758, "learning_rate": 5.53084361453818e-06, "loss": 0.2912, "step": 47100 }, { "epoch": 2.8331615240423984, "eval_loss": 0.26288196444511414, "eval_runtime": 51.1216, "eval_samples_per_second": 195.612, "eval_steps_per_second": 24.452, "step": 47100 }, { "epoch": 2.8391767289766707, "grad_norm": 0.8972067832946777, "learning_rate": 5.5298432810936985e-06, "loss": 0.2914, "step": 47200 }, { "epoch": 2.8391767289766707, "eval_loss": 0.2557620704174042, "eval_runtime": 51.1227, "eval_samples_per_second": 195.608, "eval_steps_per_second": 24.451, "step": 47200 }, { "epoch": 2.8451919339109435, "grad_norm": 0.8946945667266846, "learning_rate": 5.528842947649216e-06, "loss": 0.2894, "step": 47300 }, { "epoch": 2.8451919339109435, "eval_loss": 0.26096677780151367, "eval_runtime": 48.2836, "eval_samples_per_second": 207.109, "eval_steps_per_second": 25.889, "step": 47300 }, { "epoch": 2.851207138845216, "grad_norm": 0.9023754000663757, "learning_rate": 5.527842614204735e-06, "loss": 0.2875, "step": 47400 }, { "epoch": 2.851207138845216, "eval_loss": 0.25718143582344055, "eval_runtime": 51.1174, "eval_samples_per_second": 195.628, "eval_steps_per_second": 24.453, "step": 47400 }, { "epoch": 2.8572223437794886, "grad_norm": 0.8229103088378906, "learning_rate": 5.526842280760254e-06, "loss": 0.2875, "step": 47500 }, { "epoch": 2.8572223437794886, "eval_loss": 0.26064789295196533, "eval_runtime": 51.0796, "eval_samples_per_second": 195.773, "eval_steps_per_second": 24.472, "step": 47500 }, { "epoch": 2.8632375487137613, "grad_norm": 0.7903328537940979, "learning_rate": 5.525841947315772e-06, "loss": 0.2888, "step": 47600 }, { "epoch": 2.8632375487137613, "eval_loss": 0.25777605175971985, "eval_runtime": 51.0732, "eval_samples_per_second": 195.797, "eval_steps_per_second": 24.475, "step": 47600 }, { "epoch": 2.8692527536480337, "grad_norm": 0.9628756046295166, "learning_rate": 5.52484161387129e-06, "loss": 0.2909, "step": 47700 }, { "epoch": 2.8692527536480337, "eval_loss": 0.2552904188632965, "eval_runtime": 51.1083, "eval_samples_per_second": 195.663, "eval_steps_per_second": 24.458, "step": 47700 }, { "epoch": 2.8752679585823064, "grad_norm": 0.8853189945220947, "learning_rate": 5.523841280426809e-06, "loss": 0.2885, "step": 47800 }, { "epoch": 2.8752679585823064, "eval_loss": 0.2585737407207489, "eval_runtime": 51.0832, "eval_samples_per_second": 195.759, "eval_steps_per_second": 24.47, "step": 47800 }, { "epoch": 2.8812831635165788, "grad_norm": 0.9299560785293579, "learning_rate": 5.522840946982328e-06, "loss": 0.2865, "step": 47900 }, { "epoch": 2.8812831635165788, "eval_loss": 0.2563331425189972, "eval_runtime": 51.0909, "eval_samples_per_second": 195.729, "eval_steps_per_second": 24.466, "step": 47900 }, { "epoch": 2.8872983684508515, "grad_norm": 0.9286957383155823, "learning_rate": 5.5218406135378465e-06, "loss": 0.2873, "step": 48000 }, { "epoch": 2.8872983684508515, "eval_loss": 0.2592049837112427, "eval_runtime": 48.2359, "eval_samples_per_second": 207.315, "eval_steps_per_second": 25.914, "step": 48000 }, { "epoch": 2.8933135733851243, "grad_norm": 0.8729236125946045, "learning_rate": 5.520840280093364e-06, "loss": 0.2861, "step": 48100 }, { "epoch": 2.8933135733851243, "eval_loss": 0.25870123505592346, "eval_runtime": 51.1066, "eval_samples_per_second": 195.669, "eval_steps_per_second": 24.459, "step": 48100 }, { "epoch": 2.899328778319397, "grad_norm": 0.8652471899986267, "learning_rate": 5.519839946648883e-06, "loss": 0.2867, "step": 48200 }, { "epoch": 2.899328778319397, "eval_loss": 0.2612285017967224, "eval_runtime": 51.1028, "eval_samples_per_second": 195.684, "eval_steps_per_second": 24.46, "step": 48200 }, { "epoch": 2.9053439832536694, "grad_norm": 0.8425643444061279, "learning_rate": 5.518839613204402e-06, "loss": 0.2852, "step": 48300 }, { "epoch": 2.9053439832536694, "eval_loss": 0.2628696858882904, "eval_runtime": 51.123, "eval_samples_per_second": 195.607, "eval_steps_per_second": 24.451, "step": 48300 }, { "epoch": 2.911359188187942, "grad_norm": 0.9844802021980286, "learning_rate": 5.51783927975992e-06, "loss": 0.2877, "step": 48400 }, { "epoch": 2.911359188187942, "eval_loss": 0.2612448036670685, "eval_runtime": 51.0987, "eval_samples_per_second": 195.7, "eval_steps_per_second": 24.462, "step": 48400 }, { "epoch": 2.9173743931222145, "grad_norm": 0.878381073474884, "learning_rate": 5.516838946315438e-06, "loss": 0.2869, "step": 48500 }, { "epoch": 2.9173743931222145, "eval_loss": 0.25639012455940247, "eval_runtime": 51.1127, "eval_samples_per_second": 195.646, "eval_steps_per_second": 24.456, "step": 48500 }, { "epoch": 2.9233895980564872, "grad_norm": 0.8658349514007568, "learning_rate": 5.515838612870957e-06, "loss": 0.2862, "step": 48600 }, { "epoch": 2.9233895980564872, "eval_loss": 0.24971692264080048, "eval_runtime": 51.1228, "eval_samples_per_second": 195.607, "eval_steps_per_second": 24.451, "step": 48600 }, { "epoch": 2.92940480299076, "grad_norm": 0.8590924143791199, "learning_rate": 5.514838279426476e-06, "loss": 0.2868, "step": 48700 }, { "epoch": 2.92940480299076, "eval_loss": 0.2601747214794159, "eval_runtime": 51.129, "eval_samples_per_second": 195.584, "eval_steps_per_second": 24.448, "step": 48700 }, { "epoch": 2.9354200079250328, "grad_norm": 0.8948882222175598, "learning_rate": 5.5138379459819945e-06, "loss": 0.2876, "step": 48800 }, { "epoch": 2.9354200079250328, "eval_loss": 0.256122350692749, "eval_runtime": 51.1826, "eval_samples_per_second": 195.379, "eval_steps_per_second": 24.422, "step": 48800 }, { "epoch": 2.941435212859305, "grad_norm": 0.8714300990104675, "learning_rate": 5.512837612537512e-06, "loss": 0.2854, "step": 48900 }, { "epoch": 2.941435212859305, "eval_loss": 0.2527640163898468, "eval_runtime": 51.143, "eval_samples_per_second": 195.53, "eval_steps_per_second": 24.441, "step": 48900 }, { "epoch": 2.947450417793578, "grad_norm": 0.8347595930099487, "learning_rate": 5.511837279093031e-06, "loss": 0.2859, "step": 49000 }, { "epoch": 2.947450417793578, "eval_loss": 0.2613712549209595, "eval_runtime": 51.1079, "eval_samples_per_second": 195.664, "eval_steps_per_second": 24.458, "step": 49000 }, { "epoch": 2.95346562272785, "grad_norm": 0.8538709878921509, "learning_rate": 5.51083694564855e-06, "loss": 0.2852, "step": 49100 }, { "epoch": 2.95346562272785, "eval_loss": 0.25488194823265076, "eval_runtime": 51.1132, "eval_samples_per_second": 195.644, "eval_steps_per_second": 24.456, "step": 49100 }, { "epoch": 2.959480827662123, "grad_norm": 0.922144889831543, "learning_rate": 5.509836612204068e-06, "loss": 0.2847, "step": 49200 }, { "epoch": 2.959480827662123, "eval_loss": 0.2526051700115204, "eval_runtime": 51.1124, "eval_samples_per_second": 195.647, "eval_steps_per_second": 24.456, "step": 49200 }, { "epoch": 2.9654960325963957, "grad_norm": 0.8684960007667542, "learning_rate": 5.508836278759587e-06, "loss": 0.2837, "step": 49300 }, { "epoch": 2.9654960325963957, "eval_loss": 0.25194811820983887, "eval_runtime": 51.0578, "eval_samples_per_second": 195.857, "eval_steps_per_second": 24.482, "step": 49300 }, { "epoch": 2.971511237530668, "grad_norm": 0.9055145978927612, "learning_rate": 5.507835945315106e-06, "loss": 0.2817, "step": 49400 }, { "epoch": 2.971511237530668, "eval_loss": 0.25218260288238525, "eval_runtime": 51.0821, "eval_samples_per_second": 195.763, "eval_steps_per_second": 24.47, "step": 49400 }, { "epoch": 2.977526442464941, "grad_norm": 0.8636729121208191, "learning_rate": 5.506835611870624e-06, "loss": 0.2855, "step": 49500 }, { "epoch": 2.977526442464941, "eval_loss": 0.25728458166122437, "eval_runtime": 51.06, "eval_samples_per_second": 195.848, "eval_steps_per_second": 24.481, "step": 49500 }, { "epoch": 2.983541647399213, "grad_norm": 0.9919777512550354, "learning_rate": 5.5058352784261424e-06, "loss": 0.2816, "step": 49600 }, { "epoch": 2.983541647399213, "eval_loss": 0.2515828311443329, "eval_runtime": 51.2113, "eval_samples_per_second": 195.269, "eval_steps_per_second": 24.409, "step": 49600 }, { "epoch": 2.989556852333486, "grad_norm": 0.9122774600982666, "learning_rate": 5.50483494498166e-06, "loss": 0.2832, "step": 49700 }, { "epoch": 2.989556852333486, "eval_loss": 0.25426608324050903, "eval_runtime": 51.098, "eval_samples_per_second": 195.702, "eval_steps_per_second": 24.463, "step": 49700 }, { "epoch": 2.9955720572677587, "grad_norm": 0.8778186440467834, "learning_rate": 5.503834611537179e-06, "loss": 0.2821, "step": 49800 }, { "epoch": 2.9955720572677587, "eval_loss": 0.2510456442832947, "eval_runtime": 51.0495, "eval_samples_per_second": 195.888, "eval_steps_per_second": 24.486, "step": 49800 }, { "epoch": 3.001587262202031, "grad_norm": 0.8645954132080078, "learning_rate": 5.502834278092698e-06, "loss": 0.283, "step": 49900 }, { "epoch": 3.001587262202031, "eval_loss": 0.2549561858177185, "eval_runtime": 51.1194, "eval_samples_per_second": 195.62, "eval_steps_per_second": 24.453, "step": 49900 }, { "epoch": 3.0076024671363037, "grad_norm": 0.971116304397583, "learning_rate": 5.501833944648216e-06, "loss": 0.2833, "step": 50000 }, { "epoch": 3.0076024671363037, "eval_loss": 0.24709643423557281, "eval_runtime": 50.6183, "eval_samples_per_second": 197.557, "eval_steps_per_second": 24.695, "step": 50000 }, { "epoch": 3.0136176720705765, "grad_norm": 0.9352070093154907, "learning_rate": 5.500833611203735e-06, "loss": 0.2829, "step": 50100 }, { "epoch": 3.0136176720705765, "eval_loss": 0.2510698139667511, "eval_runtime": 50.9108, "eval_samples_per_second": 196.422, "eval_steps_per_second": 24.553, "step": 50100 }, { "epoch": 3.019632877004849, "grad_norm": 0.8702713847160339, "learning_rate": 5.499833277759254e-06, "loss": 0.2806, "step": 50200 }, { "epoch": 3.019632877004849, "eval_loss": 0.25517037510871887, "eval_runtime": 51.143, "eval_samples_per_second": 195.53, "eval_steps_per_second": 24.441, "step": 50200 }, { "epoch": 3.0256480819391216, "grad_norm": 0.8589245676994324, "learning_rate": 5.498832944314772e-06, "loss": 0.2828, "step": 50300 }, { "epoch": 3.0256480819391216, "eval_loss": 0.25433140993118286, "eval_runtime": 48.9769, "eval_samples_per_second": 204.178, "eval_steps_per_second": 25.522, "step": 50300 }, { "epoch": 3.0316632868733944, "grad_norm": 0.8240871429443359, "learning_rate": 5.49783261087029e-06, "loss": 0.2786, "step": 50400 }, { "epoch": 3.0316632868733944, "eval_loss": 0.2537357807159424, "eval_runtime": 43.57, "eval_samples_per_second": 229.516, "eval_steps_per_second": 28.689, "step": 50400 }, { "epoch": 3.0376784918076667, "grad_norm": 0.8937031030654907, "learning_rate": 5.496832277425809e-06, "loss": 0.2818, "step": 50500 }, { "epoch": 3.0376784918076667, "eval_loss": 0.25536617636680603, "eval_runtime": 43.9342, "eval_samples_per_second": 227.613, "eval_steps_per_second": 28.452, "step": 50500 }, { "epoch": 3.0436936967419395, "grad_norm": 0.8851022720336914, "learning_rate": 5.495831943981327e-06, "loss": 0.28, "step": 50600 }, { "epoch": 3.0436936967419395, "eval_loss": 0.2511354684829712, "eval_runtime": 43.4697, "eval_samples_per_second": 230.045, "eval_steps_per_second": 28.756, "step": 50600 }, { "epoch": 3.0497089016762122, "grad_norm": 0.9308133125305176, "learning_rate": 5.494831610536846e-06, "loss": 0.2822, "step": 50700 }, { "epoch": 3.0497089016762122, "eval_loss": 0.2528564929962158, "eval_runtime": 38.8722, "eval_samples_per_second": 257.253, "eval_steps_per_second": 32.157, "step": 50700 }, { "epoch": 3.0557241066104845, "grad_norm": 1.0158571004867554, "learning_rate": 5.493831277092364e-06, "loss": 0.2829, "step": 50800 }, { "epoch": 3.0557241066104845, "eval_loss": 0.24908022582530975, "eval_runtime": 37.7881, "eval_samples_per_second": 264.634, "eval_steps_per_second": 33.079, "step": 50800 }, { "epoch": 3.0617393115447573, "grad_norm": 0.8238421082496643, "learning_rate": 5.492830943647883e-06, "loss": 0.2804, "step": 50900 }, { "epoch": 3.0617393115447573, "eval_loss": 0.24608242511749268, "eval_runtime": 40.8226, "eval_samples_per_second": 244.962, "eval_steps_per_second": 30.62, "step": 50900 }, { "epoch": 3.06775451647903, "grad_norm": 0.8686819672584534, "learning_rate": 5.491830610203402e-06, "loss": 0.2793, "step": 51000 }, { "epoch": 3.06775451647903, "eval_loss": 0.24653884768486023, "eval_runtime": 43.055, "eval_samples_per_second": 232.261, "eval_steps_per_second": 29.033, "step": 51000 }, { "epoch": 3.0737697214133024, "grad_norm": 0.9399664998054504, "learning_rate": 5.49083027675892e-06, "loss": 0.2812, "step": 51100 }, { "epoch": 3.0737697214133024, "eval_loss": 0.25110530853271484, "eval_runtime": 44.132, "eval_samples_per_second": 226.593, "eval_steps_per_second": 28.324, "step": 51100 }, { "epoch": 3.079784926347575, "grad_norm": 0.9775184988975525, "learning_rate": 5.489829943314438e-06, "loss": 0.2791, "step": 51200 }, { "epoch": 3.079784926347575, "eval_loss": 0.24785326421260834, "eval_runtime": 39.588, "eval_samples_per_second": 252.602, "eval_steps_per_second": 31.575, "step": 51200 }, { "epoch": 3.0858001312818475, "grad_norm": 0.9678452014923096, "learning_rate": 5.488829609869957e-06, "loss": 0.2799, "step": 51300 }, { "epoch": 3.0858001312818475, "eval_loss": 0.25371748208999634, "eval_runtime": 40.7507, "eval_samples_per_second": 245.395, "eval_steps_per_second": 30.674, "step": 51300 }, { "epoch": 3.0918153362161203, "grad_norm": 0.9417468309402466, "learning_rate": 5.487829276425475e-06, "loss": 0.2794, "step": 51400 }, { "epoch": 3.0918153362161203, "eval_loss": 0.2551732659339905, "eval_runtime": 42.2338, "eval_samples_per_second": 236.777, "eval_steps_per_second": 29.597, "step": 51400 }, { "epoch": 3.097830541150393, "grad_norm": 0.8855278491973877, "learning_rate": 5.486828942980994e-06, "loss": 0.2798, "step": 51500 }, { "epoch": 3.097830541150393, "eval_loss": 0.24791452288627625, "eval_runtime": 48.1906, "eval_samples_per_second": 207.509, "eval_steps_per_second": 25.939, "step": 51500 }, { "epoch": 3.1038457460846653, "grad_norm": 0.8699272274971008, "learning_rate": 5.485828609536512e-06, "loss": 0.2777, "step": 51600 }, { "epoch": 3.1038457460846653, "eval_loss": 0.24532942473888397, "eval_runtime": 45.8295, "eval_samples_per_second": 218.2, "eval_steps_per_second": 27.275, "step": 51600 }, { "epoch": 3.109860951018938, "grad_norm": 0.8299559950828552, "learning_rate": 5.484828276092031e-06, "loss": 0.277, "step": 51700 }, { "epoch": 3.109860951018938, "eval_loss": 0.24607662856578827, "eval_runtime": 46.3442, "eval_samples_per_second": 215.777, "eval_steps_per_second": 26.972, "step": 51700 }, { "epoch": 3.115876155953211, "grad_norm": 0.8937397003173828, "learning_rate": 5.48382794264755e-06, "loss": 0.2823, "step": 51800 }, { "epoch": 3.115876155953211, "eval_loss": 0.2510640621185303, "eval_runtime": 47.5854, "eval_samples_per_second": 210.148, "eval_steps_per_second": 26.269, "step": 51800 }, { "epoch": 3.121891360887483, "grad_norm": 0.7908412218093872, "learning_rate": 5.482827609203068e-06, "loss": 0.2764, "step": 51900 }, { "epoch": 3.121891360887483, "eval_loss": 0.24473002552986145, "eval_runtime": 48.2096, "eval_samples_per_second": 207.427, "eval_steps_per_second": 25.928, "step": 51900 }, { "epoch": 3.127906565821756, "grad_norm": 0.8543498516082764, "learning_rate": 5.481827275758586e-06, "loss": 0.2782, "step": 52000 }, { "epoch": 3.127906565821756, "eval_loss": 0.24760138988494873, "eval_runtime": 48.6773, "eval_samples_per_second": 205.435, "eval_steps_per_second": 25.679, "step": 52000 }, { "epoch": 3.1339217707560287, "grad_norm": 0.869742751121521, "learning_rate": 5.480826942314105e-06, "loss": 0.2778, "step": 52100 }, { "epoch": 3.1339217707560287, "eval_loss": 0.2506987452507019, "eval_runtime": 49.27, "eval_samples_per_second": 202.963, "eval_steps_per_second": 25.37, "step": 52100 }, { "epoch": 3.139936975690301, "grad_norm": 0.97697514295578, "learning_rate": 5.479826608869623e-06, "loss": 0.2765, "step": 52200 }, { "epoch": 3.139936975690301, "eval_loss": 0.248337984085083, "eval_runtime": 50.0788, "eval_samples_per_second": 199.685, "eval_steps_per_second": 24.961, "step": 52200 }, { "epoch": 3.145952180624574, "grad_norm": 0.9102049469947815, "learning_rate": 5.478826275425142e-06, "loss": 0.2776, "step": 52300 }, { "epoch": 3.145952180624574, "eval_loss": 0.24709181487560272, "eval_runtime": 50.384, "eval_samples_per_second": 198.476, "eval_steps_per_second": 24.809, "step": 52300 }, { "epoch": 3.151967385558846, "grad_norm": 0.9332506656646729, "learning_rate": 5.47782594198066e-06, "loss": 0.2777, "step": 52400 }, { "epoch": 3.151967385558846, "eval_loss": 0.2484249472618103, "eval_runtime": 50.292, "eval_samples_per_second": 198.839, "eval_steps_per_second": 24.855, "step": 52400 }, { "epoch": 3.157982590493119, "grad_norm": 0.8517917394638062, "learning_rate": 5.476825608536179e-06, "loss": 0.278, "step": 52500 }, { "epoch": 3.157982590493119, "eval_loss": 0.24207893013954163, "eval_runtime": 48.3341, "eval_samples_per_second": 206.893, "eval_steps_per_second": 25.862, "step": 52500 }, { "epoch": 3.1639977954273917, "grad_norm": 0.8629357814788818, "learning_rate": 5.475825275091698e-06, "loss": 0.2775, "step": 52600 }, { "epoch": 3.1639977954273917, "eval_loss": 0.24527695775032043, "eval_runtime": 50.4058, "eval_samples_per_second": 198.39, "eval_steps_per_second": 24.799, "step": 52600 }, { "epoch": 3.170013000361664, "grad_norm": 0.9194425940513611, "learning_rate": 5.4748249416472156e-06, "loss": 0.2775, "step": 52700 }, { "epoch": 3.170013000361664, "eval_loss": 0.2455427497625351, "eval_runtime": 48.0608, "eval_samples_per_second": 208.07, "eval_steps_per_second": 26.009, "step": 52700 }, { "epoch": 3.1760282052959368, "grad_norm": 0.8746848702430725, "learning_rate": 5.473824608202734e-06, "loss": 0.278, "step": 52800 }, { "epoch": 3.1760282052959368, "eval_loss": 0.24813415110111237, "eval_runtime": 42.6735, "eval_samples_per_second": 234.338, "eval_steps_per_second": 29.292, "step": 52800 }, { "epoch": 3.1820434102302095, "grad_norm": 0.9082689881324768, "learning_rate": 5.472824274758253e-06, "loss": 0.2732, "step": 52900 }, { "epoch": 3.1820434102302095, "eval_loss": 0.24827983975410461, "eval_runtime": 44.2364, "eval_samples_per_second": 226.058, "eval_steps_per_second": 28.257, "step": 52900 }, { "epoch": 3.188058615164482, "grad_norm": 0.8607956171035767, "learning_rate": 5.471823941313771e-06, "loss": 0.2772, "step": 53000 }, { "epoch": 3.188058615164482, "eval_loss": 0.24322330951690674, "eval_runtime": 44.8161, "eval_samples_per_second": 223.134, "eval_steps_per_second": 27.892, "step": 53000 }, { "epoch": 3.1940738200987546, "grad_norm": 0.9439307451248169, "learning_rate": 5.4708236078692896e-06, "loss": 0.2734, "step": 53100 }, { "epoch": 3.1940738200987546, "eval_loss": 0.24696892499923706, "eval_runtime": 47.0223, "eval_samples_per_second": 212.665, "eval_steps_per_second": 26.583, "step": 53100 }, { "epoch": 3.2000890250330274, "grad_norm": 1.0130066871643066, "learning_rate": 5.469823274424808e-06, "loss": 0.2737, "step": 53200 }, { "epoch": 3.2000890250330274, "eval_loss": 0.2521739602088928, "eval_runtime": 46.5164, "eval_samples_per_second": 214.978, "eval_steps_per_second": 26.872, "step": 53200 }, { "epoch": 3.2061042299672997, "grad_norm": 0.9969391822814941, "learning_rate": 5.468822940980327e-06, "loss": 0.2767, "step": 53300 }, { "epoch": 3.2061042299672997, "eval_loss": 0.25239297747612, "eval_runtime": 46.7418, "eval_samples_per_second": 213.941, "eval_steps_per_second": 26.743, "step": 53300 }, { "epoch": 3.2121194349015725, "grad_norm": 0.9380843639373779, "learning_rate": 5.467822607535846e-06, "loss": 0.2743, "step": 53400 }, { "epoch": 3.2121194349015725, "eval_loss": 0.2427060306072235, "eval_runtime": 47.8166, "eval_samples_per_second": 209.133, "eval_steps_per_second": 26.142, "step": 53400 }, { "epoch": 3.2181346398358452, "grad_norm": 0.8498116135597229, "learning_rate": 5.466822274091364e-06, "loss": 0.2752, "step": 53500 }, { "epoch": 3.2181346398358452, "eval_loss": 0.23972123861312866, "eval_runtime": 48.9235, "eval_samples_per_second": 204.401, "eval_steps_per_second": 25.55, "step": 53500 }, { "epoch": 3.2241498447701176, "grad_norm": 0.8372825980186462, "learning_rate": 5.465821940646882e-06, "loss": 0.273, "step": 53600 }, { "epoch": 3.2241498447701176, "eval_loss": 0.2440669685602188, "eval_runtime": 47.5669, "eval_samples_per_second": 210.23, "eval_steps_per_second": 26.279, "step": 53600 }, { "epoch": 3.2301650497043903, "grad_norm": 0.9698020815849304, "learning_rate": 5.464821607202401e-06, "loss": 0.2767, "step": 53700 }, { "epoch": 3.2301650497043903, "eval_loss": 0.23816044628620148, "eval_runtime": 34.9463, "eval_samples_per_second": 286.153, "eval_steps_per_second": 35.769, "step": 53700 }, { "epoch": 3.236180254638663, "grad_norm": 0.822875440120697, "learning_rate": 5.463821273757919e-06, "loss": 0.2751, "step": 53800 }, { "epoch": 3.236180254638663, "eval_loss": 0.24079230427742004, "eval_runtime": 35.4307, "eval_samples_per_second": 282.241, "eval_steps_per_second": 35.28, "step": 53800 }, { "epoch": 3.2421954595729354, "grad_norm": 0.8933221101760864, "learning_rate": 5.4628209403134375e-06, "loss": 0.2753, "step": 53900 }, { "epoch": 3.2421954595729354, "eval_loss": 0.25047245621681213, "eval_runtime": 36.1364, "eval_samples_per_second": 276.729, "eval_steps_per_second": 34.591, "step": 53900 }, { "epoch": 3.248210664507208, "grad_norm": 0.915135383605957, "learning_rate": 5.461820606868957e-06, "loss": 0.2736, "step": 54000 }, { "epoch": 3.248210664507208, "eval_loss": 0.24464978277683258, "eval_runtime": 35.7495, "eval_samples_per_second": 279.724, "eval_steps_per_second": 34.966, "step": 54000 }, { "epoch": 3.2542258694414805, "grad_norm": 0.8490029573440552, "learning_rate": 5.460820273424475e-06, "loss": 0.274, "step": 54100 }, { "epoch": 3.2542258694414805, "eval_loss": 0.2507534325122833, "eval_runtime": 38.4129, "eval_samples_per_second": 260.329, "eval_steps_per_second": 32.541, "step": 54100 }, { "epoch": 3.2602410743757533, "grad_norm": 0.9220608472824097, "learning_rate": 5.459819939979994e-06, "loss": 0.2736, "step": 54200 }, { "epoch": 3.2602410743757533, "eval_loss": 0.24634374678134918, "eval_runtime": 41.8157, "eval_samples_per_second": 239.145, "eval_steps_per_second": 29.893, "step": 54200 }, { "epoch": 3.266256279310026, "grad_norm": 0.8318041563034058, "learning_rate": 5.458819606535512e-06, "loss": 0.271, "step": 54300 }, { "epoch": 3.266256279310026, "eval_loss": 0.24672181904315948, "eval_runtime": 39.1233, "eval_samples_per_second": 255.602, "eval_steps_per_second": 31.95, "step": 54300 }, { "epoch": 3.2722714842442984, "grad_norm": 0.8373593091964722, "learning_rate": 5.45781927309103e-06, "loss": 0.272, "step": 54400 }, { "epoch": 3.2722714842442984, "eval_loss": 0.24106918275356293, "eval_runtime": 36.4825, "eval_samples_per_second": 274.104, "eval_steps_per_second": 34.263, "step": 54400 }, { "epoch": 3.278286689178571, "grad_norm": 0.8802669644355774, "learning_rate": 5.456818939646549e-06, "loss": 0.2683, "step": 54500 }, { "epoch": 3.278286689178571, "eval_loss": 0.24452929198741913, "eval_runtime": 33.0976, "eval_samples_per_second": 302.137, "eval_steps_per_second": 37.767, "step": 54500 }, { "epoch": 3.284301894112844, "grad_norm": 0.8867002129554749, "learning_rate": 5.455818606202067e-06, "loss": 0.2697, "step": 54600 }, { "epoch": 3.284301894112844, "eval_loss": 0.23936684429645538, "eval_runtime": 40.904, "eval_samples_per_second": 244.475, "eval_steps_per_second": 30.559, "step": 54600 }, { "epoch": 3.2903170990471162, "grad_norm": 0.91335529088974, "learning_rate": 5.454818272757586e-06, "loss": 0.2739, "step": 54700 }, { "epoch": 3.2903170990471162, "eval_loss": 0.24262717366218567, "eval_runtime": 43.6033, "eval_samples_per_second": 229.34, "eval_steps_per_second": 28.668, "step": 54700 }, { "epoch": 3.296332303981389, "grad_norm": 0.8662433624267578, "learning_rate": 5.453817939313105e-06, "loss": 0.2715, "step": 54800 }, { "epoch": 3.296332303981389, "eval_loss": 0.24885956943035126, "eval_runtime": 45.6743, "eval_samples_per_second": 218.942, "eval_steps_per_second": 27.368, "step": 54800 }, { "epoch": 3.3023475089156618, "grad_norm": 0.943458616733551, "learning_rate": 5.452817605868623e-06, "loss": 0.2709, "step": 54900 }, { "epoch": 3.3023475089156618, "eval_loss": 0.24570631980895996, "eval_runtime": 46.9183, "eval_samples_per_second": 213.136, "eval_steps_per_second": 26.642, "step": 54900 }, { "epoch": 3.308362713849934, "grad_norm": 0.8767443299293518, "learning_rate": 5.451817272424142e-06, "loss": 0.2724, "step": 55000 }, { "epoch": 3.308362713849934, "eval_loss": 0.24481208622455597, "eval_runtime": 47.56, "eval_samples_per_second": 210.261, "eval_steps_per_second": 26.283, "step": 55000 }, { "epoch": 3.314377918784207, "grad_norm": 0.9032852053642273, "learning_rate": 5.45081693897966e-06, "loss": 0.2733, "step": 55100 }, { "epoch": 3.314377918784207, "eval_loss": 0.24037285149097443, "eval_runtime": 48.5117, "eval_samples_per_second": 206.136, "eval_steps_per_second": 25.767, "step": 55100 }, { "epoch": 3.320393123718479, "grad_norm": 0.8414300084114075, "learning_rate": 5.449816605535178e-06, "loss": 0.2709, "step": 55200 }, { "epoch": 3.320393123718479, "eval_loss": 0.24620996415615082, "eval_runtime": 48.2151, "eval_samples_per_second": 207.404, "eval_steps_per_second": 25.925, "step": 55200 }, { "epoch": 3.326408328652752, "grad_norm": 0.9093489646911621, "learning_rate": 5.448816272090697e-06, "loss": 0.2683, "step": 55300 }, { "epoch": 3.326408328652752, "eval_loss": 0.24467670917510986, "eval_runtime": 49.7086, "eval_samples_per_second": 201.172, "eval_steps_per_second": 25.147, "step": 55300 }, { "epoch": 3.3324235335870247, "grad_norm": 0.920391857624054, "learning_rate": 5.447815938646216e-06, "loss": 0.2703, "step": 55400 }, { "epoch": 3.3324235335870247, "eval_loss": 0.24019140005111694, "eval_runtime": 50.0394, "eval_samples_per_second": 199.843, "eval_steps_per_second": 24.98, "step": 55400 }, { "epoch": 3.338438738521297, "grad_norm": 0.9286474585533142, "learning_rate": 5.446815605201734e-06, "loss": 0.2705, "step": 55500 }, { "epoch": 3.338438738521297, "eval_loss": 0.24543143808841705, "eval_runtime": 50.344, "eval_samples_per_second": 198.633, "eval_steps_per_second": 24.829, "step": 55500 }, { "epoch": 3.34445394345557, "grad_norm": 0.9175123572349548, "learning_rate": 5.445815271757253e-06, "loss": 0.2713, "step": 55600 }, { "epoch": 3.34445394345557, "eval_loss": 0.23898915946483612, "eval_runtime": 50.3195, "eval_samples_per_second": 198.73, "eval_steps_per_second": 24.841, "step": 55600 }, { "epoch": 3.3504691483898426, "grad_norm": 0.8990902900695801, "learning_rate": 5.444814938312771e-06, "loss": 0.2713, "step": 55700 }, { "epoch": 3.3504691483898426, "eval_loss": 0.24149462580680847, "eval_runtime": 50.7504, "eval_samples_per_second": 197.043, "eval_steps_per_second": 24.63, "step": 55700 }, { "epoch": 3.356484353324115, "grad_norm": 0.8217372298240662, "learning_rate": 5.4438146048682896e-06, "loss": 0.2694, "step": 55800 }, { "epoch": 3.356484353324115, "eval_loss": 0.24138091504573822, "eval_runtime": 50.9006, "eval_samples_per_second": 196.461, "eval_steps_per_second": 24.558, "step": 55800 }, { "epoch": 3.3624995582583876, "grad_norm": 0.8727395534515381, "learning_rate": 5.442814271423808e-06, "loss": 0.2694, "step": 55900 }, { "epoch": 3.3624995582583876, "eval_loss": 0.24046172201633453, "eval_runtime": 36.3936, "eval_samples_per_second": 274.773, "eval_steps_per_second": 34.347, "step": 55900 }, { "epoch": 3.3685147631926604, "grad_norm": 0.8453567028045654, "learning_rate": 5.441813937979326e-06, "loss": 0.2683, "step": 56000 }, { "epoch": 3.3685147631926604, "eval_loss": 0.24423474073410034, "eval_runtime": 50.8544, "eval_samples_per_second": 196.64, "eval_steps_per_second": 24.58, "step": 56000 }, { "epoch": 3.3745299681269327, "grad_norm": 0.86241614818573, "learning_rate": 5.440813604534845e-06, "loss": 0.2649, "step": 56100 }, { "epoch": 3.3745299681269327, "eval_loss": 0.2407056838274002, "eval_runtime": 50.778, "eval_samples_per_second": 196.936, "eval_steps_per_second": 24.617, "step": 56100 }, { "epoch": 3.3805451730612055, "grad_norm": 0.9142568111419678, "learning_rate": 5.4398132710903636e-06, "loss": 0.2696, "step": 56200 }, { "epoch": 3.3805451730612055, "eval_loss": 0.24098168313503265, "eval_runtime": 51.0703, "eval_samples_per_second": 195.809, "eval_steps_per_second": 24.476, "step": 56200 }, { "epoch": 3.386560377995478, "grad_norm": 0.8302989602088928, "learning_rate": 5.438812937645882e-06, "loss": 0.2695, "step": 56300 }, { "epoch": 3.386560377995478, "eval_loss": 0.23798757791519165, "eval_runtime": 50.9646, "eval_samples_per_second": 196.215, "eval_steps_per_second": 24.527, "step": 56300 }, { "epoch": 3.3925755829297506, "grad_norm": 0.8420681357383728, "learning_rate": 5.437812604201401e-06, "loss": 0.2682, "step": 56400 }, { "epoch": 3.3925755829297506, "eval_loss": 0.24360163509845734, "eval_runtime": 51.0498, "eval_samples_per_second": 195.887, "eval_steps_per_second": 24.486, "step": 56400 }, { "epoch": 3.3985907878640234, "grad_norm": 0.8456258773803711, "learning_rate": 5.436812270756919e-06, "loss": 0.2661, "step": 56500 }, { "epoch": 3.3985907878640234, "eval_loss": 0.23989547789096832, "eval_runtime": 49.5593, "eval_samples_per_second": 201.778, "eval_steps_per_second": 25.222, "step": 56500 }, { "epoch": 3.404605992798296, "grad_norm": 0.9097959399223328, "learning_rate": 5.4358119373124375e-06, "loss": 0.2684, "step": 56600 }, { "epoch": 3.404605992798296, "eval_loss": 0.2373836487531662, "eval_runtime": 48.7156, "eval_samples_per_second": 205.273, "eval_steps_per_second": 25.659, "step": 56600 }, { "epoch": 3.4106211977325684, "grad_norm": 0.8549370169639587, "learning_rate": 5.434811603867956e-06, "loss": 0.266, "step": 56700 }, { "epoch": 3.4106211977325684, "eval_loss": 0.2353491634130478, "eval_runtime": 48.0299, "eval_samples_per_second": 208.204, "eval_steps_per_second": 26.025, "step": 56700 }, { "epoch": 3.416636402666841, "grad_norm": 0.9058821797370911, "learning_rate": 5.433811270423474e-06, "loss": 0.2712, "step": 56800 }, { "epoch": 3.416636402666841, "eval_loss": 0.24013860523700714, "eval_runtime": 48.2564, "eval_samples_per_second": 207.226, "eval_steps_per_second": 25.903, "step": 56800 }, { "epoch": 3.4226516076011135, "grad_norm": 0.7843255400657654, "learning_rate": 5.432810936978993e-06, "loss": 0.2667, "step": 56900 }, { "epoch": 3.4226516076011135, "eval_loss": 0.2440056949853897, "eval_runtime": 49.156, "eval_samples_per_second": 203.434, "eval_steps_per_second": 25.429, "step": 56900 }, { "epoch": 3.4286668125353863, "grad_norm": 0.8476096987724304, "learning_rate": 5.4318106035345115e-06, "loss": 0.2647, "step": 57000 }, { "epoch": 3.4286668125353863, "eval_loss": 0.24185192584991455, "eval_runtime": 48.8755, "eval_samples_per_second": 204.602, "eval_steps_per_second": 25.575, "step": 57000 }, { "epoch": 3.434682017469659, "grad_norm": 0.8693493008613586, "learning_rate": 5.43081027009003e-06, "loss": 0.2667, "step": 57100 }, { "epoch": 3.434682017469659, "eval_loss": 0.23922978341579437, "eval_runtime": 49.1662, "eval_samples_per_second": 203.392, "eval_steps_per_second": 25.424, "step": 57100 }, { "epoch": 3.4406972224039314, "grad_norm": 0.7601708769798279, "learning_rate": 5.429809936645549e-06, "loss": 0.268, "step": 57200 }, { "epoch": 3.4406972224039314, "eval_loss": 0.2391706109046936, "eval_runtime": 49.6653, "eval_samples_per_second": 201.348, "eval_steps_per_second": 25.168, "step": 57200 }, { "epoch": 3.446712427338204, "grad_norm": 0.8476257920265198, "learning_rate": 5.428809603201068e-06, "loss": 0.2668, "step": 57300 }, { "epoch": 3.446712427338204, "eval_loss": 0.23998339474201202, "eval_runtime": 49.9477, "eval_samples_per_second": 200.209, "eval_steps_per_second": 25.026, "step": 57300 }, { "epoch": 3.452727632272477, "grad_norm": 0.9185997843742371, "learning_rate": 5.4278092697565855e-06, "loss": 0.2649, "step": 57400 }, { "epoch": 3.452727632272477, "eval_loss": 0.2374006062746048, "eval_runtime": 49.539, "eval_samples_per_second": 201.861, "eval_steps_per_second": 25.233, "step": 57400 }, { "epoch": 3.4587428372067492, "grad_norm": 0.8186565041542053, "learning_rate": 5.426808936312104e-06, "loss": 0.2667, "step": 57500 }, { "epoch": 3.4587428372067492, "eval_loss": 0.23729223012924194, "eval_runtime": 50.958, "eval_samples_per_second": 196.24, "eval_steps_per_second": 24.53, "step": 57500 }, { "epoch": 3.464758042141022, "grad_norm": 0.876054048538208, "learning_rate": 5.425808602867622e-06, "loss": 0.2644, "step": 57600 }, { "epoch": 3.464758042141022, "eval_loss": 0.2387179434299469, "eval_runtime": 36.8167, "eval_samples_per_second": 271.616, "eval_steps_per_second": 33.952, "step": 57600 }, { "epoch": 3.4707732470752948, "grad_norm": 0.8078221678733826, "learning_rate": 5.424808269423141e-06, "loss": 0.2671, "step": 57700 }, { "epoch": 3.4707732470752948, "eval_loss": 0.23494240641593933, "eval_runtime": 50.9663, "eval_samples_per_second": 196.208, "eval_steps_per_second": 24.526, "step": 57700 }, { "epoch": 3.476788452009567, "grad_norm": 0.8425822257995605, "learning_rate": 5.4238079359786595e-06, "loss": 0.2662, "step": 57800 }, { "epoch": 3.476788452009567, "eval_loss": 0.23349033296108246, "eval_runtime": 50.9349, "eval_samples_per_second": 196.329, "eval_steps_per_second": 24.541, "step": 57800 }, { "epoch": 3.48280365694384, "grad_norm": 0.8718583583831787, "learning_rate": 5.422807602534178e-06, "loss": 0.267, "step": 57900 }, { "epoch": 3.48280365694384, "eval_loss": 0.23534800112247467, "eval_runtime": 50.6689, "eval_samples_per_second": 197.36, "eval_steps_per_second": 24.67, "step": 57900 }, { "epoch": 3.488818861878112, "grad_norm": 0.8161312341690063, "learning_rate": 5.421807269089697e-06, "loss": 0.2641, "step": 58000 }, { "epoch": 3.488818861878112, "eval_loss": 0.23691873252391815, "eval_runtime": 50.9223, "eval_samples_per_second": 196.377, "eval_steps_per_second": 24.547, "step": 58000 }, { "epoch": 3.494834066812385, "grad_norm": 0.781482458114624, "learning_rate": 5.420806935645216e-06, "loss": 0.2652, "step": 58100 }, { "epoch": 3.494834066812385, "eval_loss": 0.2412412315607071, "eval_runtime": 51.059, "eval_samples_per_second": 195.852, "eval_steps_per_second": 24.481, "step": 58100 }, { "epoch": 3.5008492717466577, "grad_norm": 0.869367778301239, "learning_rate": 5.4198066022007335e-06, "loss": 0.2639, "step": 58200 }, { "epoch": 3.5008492717466577, "eval_loss": 0.23919972777366638, "eval_runtime": 50.9672, "eval_samples_per_second": 196.205, "eval_steps_per_second": 24.526, "step": 58200 }, { "epoch": 3.5068644766809305, "grad_norm": 0.8614550828933716, "learning_rate": 5.418806268756252e-06, "loss": 0.2637, "step": 58300 }, { "epoch": 3.5068644766809305, "eval_loss": 0.23232702910900116, "eval_runtime": 50.8155, "eval_samples_per_second": 196.79, "eval_steps_per_second": 24.599, "step": 58300 }, { "epoch": 3.512879681615203, "grad_norm": 0.9519971609115601, "learning_rate": 5.417805935311771e-06, "loss": 0.2636, "step": 58400 }, { "epoch": 3.512879681615203, "eval_loss": 0.2359647899866104, "eval_runtime": 51.0167, "eval_samples_per_second": 196.014, "eval_steps_per_second": 24.502, "step": 58400 }, { "epoch": 3.5188948865494756, "grad_norm": 0.7815201282501221, "learning_rate": 5.416805601867289e-06, "loss": 0.263, "step": 58500 }, { "epoch": 3.5188948865494756, "eval_loss": 0.2390337437391281, "eval_runtime": 50.9327, "eval_samples_per_second": 196.337, "eval_steps_per_second": 24.542, "step": 58500 }, { "epoch": 3.524910091483748, "grad_norm": 0.9015016555786133, "learning_rate": 5.415805268422808e-06, "loss": 0.2635, "step": 58600 }, { "epoch": 3.524910091483748, "eval_loss": 0.23515385389328003, "eval_runtime": 50.6423, "eval_samples_per_second": 197.463, "eval_steps_per_second": 24.683, "step": 58600 }, { "epoch": 3.5309252964180207, "grad_norm": 0.9041895866394043, "learning_rate": 5.414804934978326e-06, "loss": 0.2633, "step": 58700 }, { "epoch": 3.5309252964180207, "eval_loss": 0.2379036694765091, "eval_runtime": 50.2383, "eval_samples_per_second": 199.051, "eval_steps_per_second": 24.881, "step": 58700 }, { "epoch": 3.5369405013522934, "grad_norm": 0.884931743144989, "learning_rate": 5.413804601533845e-06, "loss": 0.2612, "step": 58800 }, { "epoch": 3.5369405013522934, "eval_loss": 0.23683039844036102, "eval_runtime": 50.2696, "eval_samples_per_second": 198.928, "eval_steps_per_second": 24.866, "step": 58800 }, { "epoch": 3.5429557062865658, "grad_norm": 0.862382709980011, "learning_rate": 5.4128042680893636e-06, "loss": 0.2623, "step": 58900 }, { "epoch": 3.5429557062865658, "eval_loss": 0.23638789355754852, "eval_runtime": 50.4759, "eval_samples_per_second": 198.114, "eval_steps_per_second": 24.764, "step": 58900 }, { "epoch": 3.5489709112208385, "grad_norm": 0.8239731788635254, "learning_rate": 5.4118039346448814e-06, "loss": 0.2652, "step": 59000 }, { "epoch": 3.5489709112208385, "eval_loss": 0.23644813895225525, "eval_runtime": 49.8805, "eval_samples_per_second": 200.479, "eval_steps_per_second": 25.06, "step": 59000 }, { "epoch": 3.554986116155111, "grad_norm": 0.8433008193969727, "learning_rate": 5.4108036012004e-06, "loss": 0.2628, "step": 59100 }, { "epoch": 3.554986116155111, "eval_loss": 0.23331347107887268, "eval_runtime": 50.0038, "eval_samples_per_second": 199.985, "eval_steps_per_second": 24.998, "step": 59100 }, { "epoch": 3.5610013210893836, "grad_norm": 0.8740643858909607, "learning_rate": 5.409803267755919e-06, "loss": 0.2615, "step": 59200 }, { "epoch": 3.5610013210893836, "eval_loss": 0.23751728236675262, "eval_runtime": 49.4105, "eval_samples_per_second": 202.386, "eval_steps_per_second": 25.298, "step": 59200 }, { "epoch": 3.5670165260236564, "grad_norm": 0.7903056144714355, "learning_rate": 5.4088029343114375e-06, "loss": 0.2621, "step": 59300 }, { "epoch": 3.5670165260236564, "eval_loss": 0.23228037357330322, "eval_runtime": 49.2273, "eval_samples_per_second": 203.139, "eval_steps_per_second": 25.392, "step": 59300 }, { "epoch": 3.573031730957929, "grad_norm": 0.8559598326683044, "learning_rate": 5.407802600866956e-06, "loss": 0.2621, "step": 59400 }, { "epoch": 3.573031730957929, "eval_loss": 0.23780353367328644, "eval_runtime": 49.4165, "eval_samples_per_second": 202.362, "eval_steps_per_second": 25.295, "step": 59400 }, { "epoch": 3.5790469358922015, "grad_norm": 0.9178751111030579, "learning_rate": 5.406802267422474e-06, "loss": 0.2635, "step": 59500 }, { "epoch": 3.5790469358922015, "eval_loss": 0.23736293613910675, "eval_runtime": 49.1576, "eval_samples_per_second": 203.427, "eval_steps_per_second": 25.428, "step": 59500 }, { "epoch": 3.5850621408264742, "grad_norm": 0.8310320377349854, "learning_rate": 5.405801933977993e-06, "loss": 0.2626, "step": 59600 }, { "epoch": 3.5850621408264742, "eval_loss": 0.2320030778646469, "eval_runtime": 49.4934, "eval_samples_per_second": 202.047, "eval_steps_per_second": 25.256, "step": 59600 }, { "epoch": 3.5910773457607466, "grad_norm": 0.7860143184661865, "learning_rate": 5.4048016005335115e-06, "loss": 0.2632, "step": 59700 }, { "epoch": 3.5910773457607466, "eval_loss": 0.2336650937795639, "eval_runtime": 49.1673, "eval_samples_per_second": 203.387, "eval_steps_per_second": 25.423, "step": 59700 }, { "epoch": 3.5970925506950193, "grad_norm": 0.836063027381897, "learning_rate": 5.403801267089029e-06, "loss": 0.2621, "step": 59800 }, { "epoch": 3.5970925506950193, "eval_loss": 0.23437707126140594, "eval_runtime": 49.5986, "eval_samples_per_second": 201.619, "eval_steps_per_second": 25.202, "step": 59800 }, { "epoch": 3.603107755629292, "grad_norm": 0.8768342137336731, "learning_rate": 5.402800933644548e-06, "loss": 0.2609, "step": 59900 }, { "epoch": 3.603107755629292, "eval_loss": 0.23560036718845367, "eval_runtime": 49.2225, "eval_samples_per_second": 203.159, "eval_steps_per_second": 25.395, "step": 59900 }, { "epoch": 3.6091229605635644, "grad_norm": 0.8093357682228088, "learning_rate": 5.401800600200067e-06, "loss": 0.26, "step": 60000 }, { "epoch": 3.6091229605635644, "eval_loss": 0.2340717762708664, "eval_runtime": 49.3844, "eval_samples_per_second": 202.493, "eval_steps_per_second": 25.312, "step": 60000 }, { "epoch": 3.615138165497837, "grad_norm": 0.8731770515441895, "learning_rate": 5.4008002667555855e-06, "loss": 0.2614, "step": 60100 }, { "epoch": 3.615138165497837, "eval_loss": 0.2342948466539383, "eval_runtime": 48.6563, "eval_samples_per_second": 205.523, "eval_steps_per_second": 25.69, "step": 60100 }, { "epoch": 3.6211533704321095, "grad_norm": 0.8906363844871521, "learning_rate": 5.399799933311104e-06, "loss": 0.2601, "step": 60200 }, { "epoch": 3.6211533704321095, "eval_loss": 0.2331141084432602, "eval_runtime": 49.3998, "eval_samples_per_second": 202.43, "eval_steps_per_second": 25.304, "step": 60200 }, { "epoch": 3.6271685753663823, "grad_norm": 0.8565790057182312, "learning_rate": 5.398799599866623e-06, "loss": 0.2603, "step": 60300 }, { "epoch": 3.6271685753663823, "eval_loss": 0.23420780897140503, "eval_runtime": 48.2983, "eval_samples_per_second": 207.046, "eval_steps_per_second": 25.881, "step": 60300 }, { "epoch": 3.633183780300655, "grad_norm": 0.9718087911605835, "learning_rate": 5.397799266422141e-06, "loss": 0.2635, "step": 60400 }, { "epoch": 3.633183780300655, "eval_loss": 0.2375570833683014, "eval_runtime": 48.9976, "eval_samples_per_second": 204.091, "eval_steps_per_second": 25.511, "step": 60400 }, { "epoch": 3.639198985234928, "grad_norm": 0.8572448492050171, "learning_rate": 5.3967989329776595e-06, "loss": 0.2626, "step": 60500 }, { "epoch": 3.639198985234928, "eval_loss": 0.23931777477264404, "eval_runtime": 49.0436, "eval_samples_per_second": 203.9, "eval_steps_per_second": 25.488, "step": 60500 }, { "epoch": 3.6452141901692, "grad_norm": 0.8994346857070923, "learning_rate": 5.395798599533177e-06, "loss": 0.2595, "step": 60600 }, { "epoch": 3.6452141901692, "eval_loss": 0.2317589819431305, "eval_runtime": 49.5846, "eval_samples_per_second": 201.675, "eval_steps_per_second": 25.209, "step": 60600 }, { "epoch": 3.651229395103473, "grad_norm": 0.8513436913490295, "learning_rate": 5.394798266088696e-06, "loss": 0.2614, "step": 60700 }, { "epoch": 3.651229395103473, "eval_loss": 0.23111025989055634, "eval_runtime": 49.7262, "eval_samples_per_second": 201.101, "eval_steps_per_second": 25.138, "step": 60700 }, { "epoch": 3.657244600037745, "grad_norm": 0.9126865267753601, "learning_rate": 5.393797932644215e-06, "loss": 0.2583, "step": 60800 }, { "epoch": 3.657244600037745, "eval_loss": 0.23351147770881653, "eval_runtime": 49.8967, "eval_samples_per_second": 200.414, "eval_steps_per_second": 25.052, "step": 60800 }, { "epoch": 3.663259804972018, "grad_norm": 0.8021876811981201, "learning_rate": 5.3927975991997335e-06, "loss": 0.2601, "step": 60900 }, { "epoch": 3.663259804972018, "eval_loss": 0.23443163931369781, "eval_runtime": 49.9056, "eval_samples_per_second": 200.378, "eval_steps_per_second": 25.047, "step": 60900 }, { "epoch": 3.6692750099062907, "grad_norm": 0.8586119413375854, "learning_rate": 5.391797265755252e-06, "loss": 0.2605, "step": 61000 }, { "epoch": 3.6692750099062907, "eval_loss": 0.229187473654747, "eval_runtime": 40.9269, "eval_samples_per_second": 244.338, "eval_steps_per_second": 30.542, "step": 61000 }, { "epoch": 3.6752902148405635, "grad_norm": 0.9336073398590088, "learning_rate": 5.390796932310771e-06, "loss": 0.2612, "step": 61100 }, { "epoch": 3.6752902148405635, "eval_loss": 0.23033183813095093, "eval_runtime": 49.8614, "eval_samples_per_second": 200.556, "eval_steps_per_second": 25.069, "step": 61100 }, { "epoch": 3.681305419774836, "grad_norm": 0.7944173812866211, "learning_rate": 5.389796598866289e-06, "loss": 0.2595, "step": 61200 }, { "epoch": 3.681305419774836, "eval_loss": 0.22884014248847961, "eval_runtime": 50.2375, "eval_samples_per_second": 199.055, "eval_steps_per_second": 24.882, "step": 61200 }, { "epoch": 3.6873206247091086, "grad_norm": 0.8038543462753296, "learning_rate": 5.3887962654218075e-06, "loss": 0.2588, "step": 61300 }, { "epoch": 3.6873206247091086, "eval_loss": 0.23328329622745514, "eval_runtime": 51.0649, "eval_samples_per_second": 195.829, "eval_steps_per_second": 24.479, "step": 61300 }, { "epoch": 3.693335829643381, "grad_norm": 0.8919224143028259, "learning_rate": 5.387795931977326e-06, "loss": 0.2592, "step": 61400 }, { "epoch": 3.693335829643381, "eval_loss": 0.23098503053188324, "eval_runtime": 51.0915, "eval_samples_per_second": 195.727, "eval_steps_per_second": 24.466, "step": 61400 }, { "epoch": 3.6993510345776537, "grad_norm": 0.81063312292099, "learning_rate": 5.386795598532844e-06, "loss": 0.2598, "step": 61500 }, { "epoch": 3.6993510345776537, "eval_loss": 0.23130032420158386, "eval_runtime": 51.1499, "eval_samples_per_second": 195.504, "eval_steps_per_second": 24.438, "step": 61500 }, { "epoch": 3.7053662395119265, "grad_norm": 0.8565428853034973, "learning_rate": 5.385795265088363e-06, "loss": 0.2569, "step": 61600 }, { "epoch": 3.7053662395119265, "eval_loss": 0.23042194545269012, "eval_runtime": 51.0719, "eval_samples_per_second": 195.802, "eval_steps_per_second": 24.475, "step": 61600 }, { "epoch": 3.7113814444461988, "grad_norm": 0.8808117508888245, "learning_rate": 5.3847949316438814e-06, "loss": 0.2579, "step": 61700 }, { "epoch": 3.7113814444461988, "eval_loss": 0.22964029014110565, "eval_runtime": 51.1788, "eval_samples_per_second": 195.393, "eval_steps_per_second": 24.424, "step": 61700 }, { "epoch": 3.7173966493804715, "grad_norm": 0.8812440037727356, "learning_rate": 5.3837945981994e-06, "loss": 0.2568, "step": 61800 }, { "epoch": 3.7173966493804715, "eval_loss": 0.23177900910377502, "eval_runtime": 51.1658, "eval_samples_per_second": 195.443, "eval_steps_per_second": 24.43, "step": 61800 }, { "epoch": 3.723411854314744, "grad_norm": 0.8692899346351624, "learning_rate": 5.382794264754919e-06, "loss": 0.2567, "step": 61900 }, { "epoch": 3.723411854314744, "eval_loss": 0.23119042813777924, "eval_runtime": 51.1394, "eval_samples_per_second": 195.544, "eval_steps_per_second": 24.443, "step": 61900 }, { "epoch": 3.7294270592490166, "grad_norm": 0.8057258725166321, "learning_rate": 5.381793931310437e-06, "loss": 0.2574, "step": 62000 }, { "epoch": 3.7294270592490166, "eval_loss": 0.2311127930879593, "eval_runtime": 51.1109, "eval_samples_per_second": 195.653, "eval_steps_per_second": 24.457, "step": 62000 }, { "epoch": 3.7354422641832894, "grad_norm": 0.7970178127288818, "learning_rate": 5.380793597865955e-06, "loss": 0.2589, "step": 62100 }, { "epoch": 3.7354422641832894, "eval_loss": 0.2320980727672577, "eval_runtime": 51.1619, "eval_samples_per_second": 195.458, "eval_steps_per_second": 24.432, "step": 62100 }, { "epoch": 3.741457469117562, "grad_norm": 0.8987645506858826, "learning_rate": 5.379793264421474e-06, "loss": 0.2565, "step": 62200 }, { "epoch": 3.741457469117562, "eval_loss": 0.22809037566184998, "eval_runtime": 51.1437, "eval_samples_per_second": 195.527, "eval_steps_per_second": 24.441, "step": 62200 }, { "epoch": 3.7474726740518345, "grad_norm": 0.8491466641426086, "learning_rate": 5.378792930976992e-06, "loss": 0.2572, "step": 62300 }, { "epoch": 3.7474726740518345, "eval_loss": 0.23448967933654785, "eval_runtime": 51.1016, "eval_samples_per_second": 195.688, "eval_steps_per_second": 24.461, "step": 62300 }, { "epoch": 3.7534878789861073, "grad_norm": 0.8310768008232117, "learning_rate": 5.377792597532511e-06, "loss": 0.2558, "step": 62400 }, { "epoch": 3.7534878789861073, "eval_loss": 0.2314356416463852, "eval_runtime": 51.1436, "eval_samples_per_second": 195.528, "eval_steps_per_second": 24.441, "step": 62400 }, { "epoch": 3.7595030839203796, "grad_norm": 0.8902222514152527, "learning_rate": 5.376792264088029e-06, "loss": 0.256, "step": 62500 }, { "epoch": 3.7595030839203796, "eval_loss": 0.23469364643096924, "eval_runtime": 51.1102, "eval_samples_per_second": 195.656, "eval_steps_per_second": 24.457, "step": 62500 }, { "epoch": 3.7655182888546523, "grad_norm": 0.7377832531929016, "learning_rate": 5.375791930643548e-06, "loss": 0.2574, "step": 62600 }, { "epoch": 3.7655182888546523, "eval_loss": 0.23291806876659393, "eval_runtime": 51.1312, "eval_samples_per_second": 195.575, "eval_steps_per_second": 24.447, "step": 62600 }, { "epoch": 3.771533493788925, "grad_norm": 0.7997824549674988, "learning_rate": 5.374791597199067e-06, "loss": 0.257, "step": 62700 }, { "epoch": 3.771533493788925, "eval_loss": 0.23000933229923248, "eval_runtime": 48.2655, "eval_samples_per_second": 207.187, "eval_steps_per_second": 25.898, "step": 62700 }, { "epoch": 3.7775486987231974, "grad_norm": 0.8683999180793762, "learning_rate": 5.373791263754585e-06, "loss": 0.2564, "step": 62800 }, { "epoch": 3.7775486987231974, "eval_loss": 0.23462143540382385, "eval_runtime": 51.0748, "eval_samples_per_second": 195.791, "eval_steps_per_second": 24.474, "step": 62800 }, { "epoch": 3.78356390365747, "grad_norm": 0.8755656480789185, "learning_rate": 5.372790930310103e-06, "loss": 0.2558, "step": 62900 }, { "epoch": 3.78356390365747, "eval_loss": 0.23621977865695953, "eval_runtime": 51.1202, "eval_samples_per_second": 195.617, "eval_steps_per_second": 24.452, "step": 62900 }, { "epoch": 3.7895791085917425, "grad_norm": 0.9032362699508667, "learning_rate": 5.371790596865622e-06, "loss": 0.2551, "step": 63000 }, { "epoch": 3.7895791085917425, "eval_loss": 0.2294510453939438, "eval_runtime": 51.1388, "eval_samples_per_second": 195.546, "eval_steps_per_second": 24.443, "step": 63000 } ], "logging_steps": 100, "max_steps": 600000, "num_input_tokens_seen": 0, "num_train_epochs": 37, "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 8 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.304354533994406e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }