| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 44.270833333333336, |
| "eval_steps": 576, |
| "global_step": 25500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001736111111111111, |
| "eval_loss": 9.594602584838867, |
| "eval_runtime": 41.3373, |
| "eval_samples_per_second": 90.209, |
| "eval_steps_per_second": 5.661, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.08680555555555555, |
| "grad_norm": 12.75, |
| "learning_rate": 0.000196, |
| "loss": 7.4156, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1736111111111111, |
| "grad_norm": 13.3125, |
| "learning_rate": 0.0001999985665413352, |
| "loss": 4.4164, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2604166666666667, |
| "grad_norm": 5.78125, |
| "learning_rate": 0.00019999414859436728, |
| "loss": 4.1765, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3472222222222222, |
| "grad_norm": 11.0, |
| "learning_rate": 0.00019998674569395055, |
| "loss": 4.0896, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4340277777777778, |
| "grad_norm": 6.625, |
| "learning_rate": 0.000199976358061071, |
| "loss": 3.9586, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5208333333333334, |
| "grad_norm": 6.65625, |
| "learning_rate": 0.00019996298600581287, |
| "loss": 3.9273, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6076388888888888, |
| "grad_norm": 13.125, |
| "learning_rate": 0.0001999466299273491, |
| "loss": 3.8612, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6944444444444444, |
| "grad_norm": 7.0625, |
| "learning_rate": 0.00019992729031392958, |
| "loss": 3.8205, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.78125, |
| "grad_norm": 8.75, |
| "learning_rate": 0.00019990496774286654, |
| "loss": 3.7956, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.8680555555555556, |
| "grad_norm": 8.75, |
| "learning_rate": 0.00019987966288051735, |
| "loss": 3.7654, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.9548611111111112, |
| "grad_norm": 14.0625, |
| "learning_rate": 0.00019985137648226457, |
| "loss": 3.6055, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 3.320380210876465, |
| "eval_runtime": 41.8114, |
| "eval_samples_per_second": 89.186, |
| "eval_steps_per_second": 5.597, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.0416666666666667, |
| "grad_norm": 13.875, |
| "learning_rate": 0.00019982010939249346, |
| "loss": 3.4141, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.1284722222222223, |
| "grad_norm": 15.125, |
| "learning_rate": 0.0001997858625445666, |
| "loss": 3.3461, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.2152777777777777, |
| "grad_norm": 13.25, |
| "learning_rate": 0.0001997486369607964, |
| "loss": 3.2968, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.3020833333333333, |
| "grad_norm": 11.25, |
| "learning_rate": 0.00019970843375241416, |
| "loss": 3.2924, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.3888888888888888, |
| "grad_norm": 12.5, |
| "learning_rate": 0.00019966525411953717, |
| "loss": 3.2577, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4756944444444444, |
| "grad_norm": 13.0625, |
| "learning_rate": 0.00019961909935113284, |
| "loss": 3.2544, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.5625, |
| "grad_norm": 14.125, |
| "learning_rate": 0.00019956997082498009, |
| "loss": 3.2245, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.6493055555555556, |
| "grad_norm": 9.1875, |
| "learning_rate": 0.00019951787000762835, |
| "loss": 3.2121, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.7361111111111112, |
| "grad_norm": 13.125, |
| "learning_rate": 0.00019946279845435382, |
| "loss": 3.1861, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.8229166666666665, |
| "grad_norm": 8.8125, |
| "learning_rate": 0.0001994047578091129, |
| "loss": 3.1813, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.9097222222222223, |
| "grad_norm": 10.5, |
| "learning_rate": 0.00019934374980449325, |
| "loss": 3.1483, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9965277777777777, |
| "grad_norm": 11.875, |
| "learning_rate": 0.00019927977626166193, |
| "loss": 3.1491, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 3.0681025981903076, |
| "eval_runtime": 41.9062, |
| "eval_samples_per_second": 88.984, |
| "eval_steps_per_second": 5.584, |
| "step": 1152 |
| }, |
| { |
| "epoch": 2.0833333333333335, |
| "grad_norm": 10.25, |
| "learning_rate": 0.00019921283909031114, |
| "loss": 3.1364, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.170138888888889, |
| "grad_norm": 10.375, |
| "learning_rate": 0.00019914294028860127, |
| "loss": 3.1123, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.2569444444444446, |
| "grad_norm": 10.3125, |
| "learning_rate": 0.00019907008194310102, |
| "loss": 3.1234, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.34375, |
| "grad_norm": 12.9375, |
| "learning_rate": 0.00019899426622872543, |
| "loss": 3.1215, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.4305555555555554, |
| "grad_norm": 12.5, |
| "learning_rate": 0.00019891549540867066, |
| "loss": 3.0999, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.517361111111111, |
| "grad_norm": 6.59375, |
| "learning_rate": 0.00019883377183434666, |
| "loss": 3.1192, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.6041666666666665, |
| "grad_norm": 7.6875, |
| "learning_rate": 0.00019874909794530675, |
| "loss": 3.0983, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6909722222222223, |
| "grad_norm": 8.625, |
| "learning_rate": 0.0001986614762691751, |
| "loss": 3.0853, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.7777777777777777, |
| "grad_norm": 12.875, |
| "learning_rate": 0.00019857090942157092, |
| "loss": 3.0822, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8645833333333335, |
| "grad_norm": 11.3125, |
| "learning_rate": 0.00019847740010603068, |
| "loss": 3.0779, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.951388888888889, |
| "grad_norm": 7.3125, |
| "learning_rate": 0.00019838095111392726, |
| "loss": 3.0747, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 3.002568483352661, |
| "eval_runtime": 40.5832, |
| "eval_samples_per_second": 91.885, |
| "eval_steps_per_second": 5.766, |
| "step": 1728 |
| }, |
| { |
| "epoch": 3.0381944444444446, |
| "grad_norm": 11.625, |
| "learning_rate": 0.00019828156532438666, |
| "loss": 3.0638, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.125, |
| "grad_norm": 10.9375, |
| "learning_rate": 0.00019817924570420198, |
| "loss": 3.0585, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2118055555555554, |
| "grad_norm": 7.0625, |
| "learning_rate": 0.00019807399530774502, |
| "loss": 3.0494, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.298611111111111, |
| "grad_norm": 9.125, |
| "learning_rate": 0.00019796581727687493, |
| "loss": 3.0628, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3854166666666665, |
| "grad_norm": 11.875, |
| "learning_rate": 0.00019785471484084458, |
| "loss": 3.0529, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.4722222222222223, |
| "grad_norm": 14.9375, |
| "learning_rate": 0.00019774069131620398, |
| "loss": 3.0594, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590277777777777, |
| "grad_norm": 8.4375, |
| "learning_rate": 0.00019762375010670143, |
| "loss": 3.0478, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.6458333333333335, |
| "grad_norm": 9.8125, |
| "learning_rate": 0.0001975038947031819, |
| "loss": 3.0401, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.732638888888889, |
| "grad_norm": 11.0, |
| "learning_rate": 0.0001973811286834827, |
| "loss": 3.0339, |
| "step": 2150 |
| }, |
| { |
| "epoch": 3.8194444444444446, |
| "grad_norm": 9.0625, |
| "learning_rate": 0.00019725545571232686, |
| "loss": 3.0461, |
| "step": 2200 |
| }, |
| { |
| "epoch": 3.90625, |
| "grad_norm": 7.21875, |
| "learning_rate": 0.0001971268795412135, |
| "loss": 3.0156, |
| "step": 2250 |
| }, |
| { |
| "epoch": 3.9930555555555554, |
| "grad_norm": 9.75, |
| "learning_rate": 0.00019699540400830616, |
| "loss": 3.0261, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 2.960036516189575, |
| "eval_runtime": 41.7286, |
| "eval_samples_per_second": 89.363, |
| "eval_steps_per_second": 5.608, |
| "step": 2304 |
| }, |
| { |
| "epoch": 4.079861111111111, |
| "grad_norm": 7.53125, |
| "learning_rate": 0.00019686103303831787, |
| "loss": 3.0194, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 8.0, |
| "learning_rate": 0.0001967237706423943, |
| "loss": 2.9982, |
| "step": 2400 |
| }, |
| { |
| "epoch": 4.253472222222222, |
| "grad_norm": 10.0, |
| "learning_rate": 0.00019658362091799374, |
| "loss": 3.0147, |
| "step": 2450 |
| }, |
| { |
| "epoch": 4.340277777777778, |
| "grad_norm": 8.9375, |
| "learning_rate": 0.00019644058804876513, |
| "loss": 3.0187, |
| "step": 2500 |
| }, |
| { |
| "epoch": 4.427083333333333, |
| "grad_norm": 7.28125, |
| "learning_rate": 0.0001962946763044228, |
| "loss": 3.0009, |
| "step": 2550 |
| }, |
| { |
| "epoch": 4.513888888888889, |
| "grad_norm": 6.96875, |
| "learning_rate": 0.00019614589004061928, |
| "loss": 3.0264, |
| "step": 2600 |
| }, |
| { |
| "epoch": 4.600694444444445, |
| "grad_norm": 8.6875, |
| "learning_rate": 0.0001959942336988152, |
| "loss": 3.0037, |
| "step": 2650 |
| }, |
| { |
| "epoch": 4.6875, |
| "grad_norm": 8.25, |
| "learning_rate": 0.0001958397118061466, |
| "loss": 3.0003, |
| "step": 2700 |
| }, |
| { |
| "epoch": 4.774305555555555, |
| "grad_norm": 7.1875, |
| "learning_rate": 0.00019568232897529002, |
| "loss": 2.9937, |
| "step": 2750 |
| }, |
| { |
| "epoch": 4.861111111111111, |
| "grad_norm": 7.5, |
| "learning_rate": 0.00019552208990432457, |
| "loss": 2.9977, |
| "step": 2800 |
| }, |
| { |
| "epoch": 4.947916666666667, |
| "grad_norm": 11.8125, |
| "learning_rate": 0.0001953589993765918, |
| "loss": 2.992, |
| "step": 2850 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 2.9334027767181396, |
| "eval_runtime": 42.3875, |
| "eval_samples_per_second": 87.974, |
| "eval_steps_per_second": 5.52, |
| "step": 2880 |
| }, |
| { |
| "epoch": 5.034722222222222, |
| "grad_norm": 8.8125, |
| "learning_rate": 0.000195193062260553, |
| "loss": 2.9851, |
| "step": 2900 |
| }, |
| { |
| "epoch": 5.121527777777778, |
| "grad_norm": 7.875, |
| "learning_rate": 0.00019502428350964355, |
| "loss": 2.9796, |
| "step": 2950 |
| }, |
| { |
| "epoch": 5.208333333333333, |
| "grad_norm": 6.53125, |
| "learning_rate": 0.00019485266816212548, |
| "loss": 2.977, |
| "step": 3000 |
| }, |
| { |
| "epoch": 5.295138888888889, |
| "grad_norm": 11.125, |
| "learning_rate": 0.00019467822134093684, |
| "loss": 2.9887, |
| "step": 3050 |
| }, |
| { |
| "epoch": 5.381944444444445, |
| "grad_norm": 8.0625, |
| "learning_rate": 0.00019450094825353864, |
| "loss": 2.982, |
| "step": 3100 |
| }, |
| { |
| "epoch": 5.46875, |
| "grad_norm": 8.75, |
| "learning_rate": 0.00019432085419175975, |
| "loss": 2.9896, |
| "step": 3150 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 8.0625, |
| "learning_rate": 0.00019413794453163857, |
| "loss": 2.9854, |
| "step": 3200 |
| }, |
| { |
| "epoch": 5.642361111111111, |
| "grad_norm": 10.4375, |
| "learning_rate": 0.00019395222473326284, |
| "loss": 2.9749, |
| "step": 3250 |
| }, |
| { |
| "epoch": 5.729166666666667, |
| "grad_norm": 7.03125, |
| "learning_rate": 0.00019376370034060653, |
| "loss": 2.9705, |
| "step": 3300 |
| }, |
| { |
| "epoch": 5.815972222222222, |
| "grad_norm": 9.8125, |
| "learning_rate": 0.00019357237698136427, |
| "loss": 2.9855, |
| "step": 3350 |
| }, |
| { |
| "epoch": 5.902777777777778, |
| "grad_norm": 6.78125, |
| "learning_rate": 0.00019337826036678338, |
| "loss": 2.9596, |
| "step": 3400 |
| }, |
| { |
| "epoch": 5.989583333333333, |
| "grad_norm": 8.6875, |
| "learning_rate": 0.00019318135629149363, |
| "loss": 2.9692, |
| "step": 3450 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 2.9161436557769775, |
| "eval_runtime": 41.8777, |
| "eval_samples_per_second": 89.045, |
| "eval_steps_per_second": 5.588, |
| "step": 3456 |
| }, |
| { |
| "epoch": 6.076388888888889, |
| "grad_norm": 8.5625, |
| "learning_rate": 0.0001929816706333339, |
| "loss": 2.9666, |
| "step": 3500 |
| }, |
| { |
| "epoch": 6.163194444444445, |
| "grad_norm": 11.625, |
| "learning_rate": 0.00019277920935317688, |
| "loss": 2.9451, |
| "step": 3550 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 7.625, |
| "learning_rate": 0.00019257397849475124, |
| "loss": 2.9624, |
| "step": 3600 |
| }, |
| { |
| "epoch": 6.336805555555555, |
| "grad_norm": 7.34375, |
| "learning_rate": 0.00019236598418446098, |
| "loss": 2.9722, |
| "step": 3650 |
| }, |
| { |
| "epoch": 6.423611111111111, |
| "grad_norm": 7.5, |
| "learning_rate": 0.00019215523263120283, |
| "loss": 2.9552, |
| "step": 3700 |
| }, |
| { |
| "epoch": 6.510416666666667, |
| "grad_norm": 10.625, |
| "learning_rate": 0.0001919417301261806, |
| "loss": 2.9844, |
| "step": 3750 |
| }, |
| { |
| "epoch": 6.597222222222222, |
| "grad_norm": 6.25, |
| "learning_rate": 0.00019172548304271768, |
| "loss": 2.9576, |
| "step": 3800 |
| }, |
| { |
| "epoch": 6.684027777777778, |
| "grad_norm": 8.25, |
| "learning_rate": 0.00019150649783606646, |
| "loss": 2.9598, |
| "step": 3850 |
| }, |
| { |
| "epoch": 6.770833333333333, |
| "grad_norm": 6.25, |
| "learning_rate": 0.00019128478104321603, |
| "loss": 2.9488, |
| "step": 3900 |
| }, |
| { |
| "epoch": 6.857638888888889, |
| "grad_norm": 8.25, |
| "learning_rate": 0.00019106033928269667, |
| "loss": 2.9591, |
| "step": 3950 |
| }, |
| { |
| "epoch": 6.944444444444445, |
| "grad_norm": 5.8125, |
| "learning_rate": 0.00019083317925438248, |
| "loss": 2.9501, |
| "step": 4000 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 2.90425968170166, |
| "eval_runtime": 41.3276, |
| "eval_samples_per_second": 90.23, |
| "eval_steps_per_second": 5.662, |
| "step": 4032 |
| }, |
| { |
| "epoch": 7.03125, |
| "grad_norm": 6.40625, |
| "learning_rate": 0.00019060330773929137, |
| "loss": 2.9478, |
| "step": 4050 |
| }, |
| { |
| "epoch": 7.118055555555555, |
| "grad_norm": 8.75, |
| "learning_rate": 0.00019037073159938256, |
| "loss": 2.9421, |
| "step": 4100 |
| }, |
| { |
| "epoch": 7.204861111111111, |
| "grad_norm": 6.1875, |
| "learning_rate": 0.00019013545777735183, |
| "loss": 2.9394, |
| "step": 4150 |
| }, |
| { |
| "epoch": 7.291666666666667, |
| "grad_norm": 6.71875, |
| "learning_rate": 0.00018989749329642418, |
| "loss": 2.9519, |
| "step": 4200 |
| }, |
| { |
| "epoch": 7.378472222222222, |
| "grad_norm": 6.25, |
| "learning_rate": 0.00018965684526014425, |
| "loss": 2.9475, |
| "step": 4250 |
| }, |
| { |
| "epoch": 7.465277777777778, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.00018941352085216425, |
| "loss": 2.9507, |
| "step": 4300 |
| }, |
| { |
| "epoch": 7.552083333333333, |
| "grad_norm": 7.5625, |
| "learning_rate": 0.0001891675273360295, |
| "loss": 2.956, |
| "step": 4350 |
| }, |
| { |
| "epoch": 7.638888888888889, |
| "grad_norm": 8.1875, |
| "learning_rate": 0.00018891887205496163, |
| "loss": 2.9422, |
| "step": 4400 |
| }, |
| { |
| "epoch": 7.725694444444445, |
| "grad_norm": 6.625, |
| "learning_rate": 0.00018866756243163938, |
| "loss": 2.9379, |
| "step": 4450 |
| }, |
| { |
| "epoch": 7.8125, |
| "grad_norm": 7.46875, |
| "learning_rate": 0.00018841360596797695, |
| "loss": 2.9477, |
| "step": 4500 |
| }, |
| { |
| "epoch": 7.899305555555555, |
| "grad_norm": 9.3125, |
| "learning_rate": 0.0001881570102449002, |
| "loss": 2.9293, |
| "step": 4550 |
| }, |
| { |
| "epoch": 7.986111111111111, |
| "grad_norm": 8.375, |
| "learning_rate": 0.0001878977829221201, |
| "loss": 2.9379, |
| "step": 4600 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 2.894627571105957, |
| "eval_runtime": 42.0326, |
| "eval_samples_per_second": 88.717, |
| "eval_steps_per_second": 5.567, |
| "step": 4608 |
| }, |
| { |
| "epoch": 8.072916666666666, |
| "grad_norm": 7.625, |
| "learning_rate": 0.00018763593173790454, |
| "loss": 2.9327, |
| "step": 4650 |
| }, |
| { |
| "epoch": 8.159722222222221, |
| "grad_norm": 6.25, |
| "learning_rate": 0.00018737146450884668, |
| "loss": 2.917, |
| "step": 4700 |
| }, |
| { |
| "epoch": 8.246527777777779, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.00018710438912963225, |
| "loss": 2.9335, |
| "step": 4750 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 5.90625, |
| "learning_rate": 0.00018683471357280347, |
| "loss": 2.9416, |
| "step": 4800 |
| }, |
| { |
| "epoch": 8.42013888888889, |
| "grad_norm": 6.84375, |
| "learning_rate": 0.00018656244588852124, |
| "loss": 2.9256, |
| "step": 4850 |
| }, |
| { |
| "epoch": 8.506944444444445, |
| "grad_norm": 5.5625, |
| "learning_rate": 0.00018628759420432473, |
| "loss": 2.9525, |
| "step": 4900 |
| }, |
| { |
| "epoch": 8.59375, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.00018601016672488888, |
| "loss": 2.9268, |
| "step": 4950 |
| }, |
| { |
| "epoch": 8.680555555555555, |
| "grad_norm": 6.90625, |
| "learning_rate": 0.00018573017173177938, |
| "loss": 2.9347, |
| "step": 5000 |
| }, |
| { |
| "epoch": 8.76736111111111, |
| "grad_norm": 5.78125, |
| "learning_rate": 0.0001854476175832055, |
| "loss": 2.9267, |
| "step": 5050 |
| }, |
| { |
| "epoch": 8.854166666666666, |
| "grad_norm": 6.375, |
| "learning_rate": 0.00018516251271377064, |
| "loss": 2.9246, |
| "step": 5100 |
| }, |
| { |
| "epoch": 8.940972222222221, |
| "grad_norm": 6.0, |
| "learning_rate": 0.00018487486563422036, |
| "loss": 2.9221, |
| "step": 5150 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 2.8883821964263916, |
| "eval_runtime": 41.385, |
| "eval_samples_per_second": 90.105, |
| "eval_steps_per_second": 5.654, |
| "step": 5184 |
| }, |
| { |
| "epoch": 9.027777777777779, |
| "grad_norm": 7.875, |
| "learning_rate": 0.00018458468493118857, |
| "loss": 2.9219, |
| "step": 5200 |
| }, |
| { |
| "epoch": 9.114583333333334, |
| "grad_norm": 7.90625, |
| "learning_rate": 0.000184291979266941, |
| "loss": 2.9209, |
| "step": 5250 |
| }, |
| { |
| "epoch": 9.20138888888889, |
| "grad_norm": 6.4375, |
| "learning_rate": 0.00018399675737911677, |
| "loss": 2.9127, |
| "step": 5300 |
| }, |
| { |
| "epoch": 9.288194444444445, |
| "grad_norm": 5.9375, |
| "learning_rate": 0.00018369902808046748, |
| "loss": 2.9262, |
| "step": 5350 |
| }, |
| { |
| "epoch": 9.375, |
| "grad_norm": 6.5625, |
| "learning_rate": 0.0001833988002585941, |
| "loss": 2.9258, |
| "step": 5400 |
| }, |
| { |
| "epoch": 9.461805555555555, |
| "grad_norm": 5.8125, |
| "learning_rate": 0.00018309608287568182, |
| "loss": 2.9275, |
| "step": 5450 |
| }, |
| { |
| "epoch": 9.54861111111111, |
| "grad_norm": 6.25, |
| "learning_rate": 0.00018279088496823235, |
| "loss": 2.9312, |
| "step": 5500 |
| }, |
| { |
| "epoch": 9.635416666666666, |
| "grad_norm": 6.09375, |
| "learning_rate": 0.00018248321564679425, |
| "loss": 2.9205, |
| "step": 5550 |
| }, |
| { |
| "epoch": 9.722222222222221, |
| "grad_norm": 8.0625, |
| "learning_rate": 0.0001821730840956909, |
| "loss": 2.9203, |
| "step": 5600 |
| }, |
| { |
| "epoch": 9.809027777777779, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.00018186049957274656, |
| "loss": 2.9264, |
| "step": 5650 |
| }, |
| { |
| "epoch": 9.895833333333334, |
| "grad_norm": 5.0, |
| "learning_rate": 0.0001815454714090096, |
| "loss": 2.9109, |
| "step": 5700 |
| }, |
| { |
| "epoch": 9.98263888888889, |
| "grad_norm": 5.875, |
| "learning_rate": 0.0001812280090084744, |
| "loss": 2.9139, |
| "step": 5750 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 2.8820853233337402, |
| "eval_runtime": 42.0383, |
| "eval_samples_per_second": 88.705, |
| "eval_steps_per_second": 5.566, |
| "step": 5760 |
| }, |
| { |
| "epoch": 10.069444444444445, |
| "grad_norm": 5.90625, |
| "learning_rate": 0.00018090812184780032, |
| "loss": 2.9105, |
| "step": 5800 |
| }, |
| { |
| "epoch": 10.15625, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.000180585819476029, |
| "loss": 2.9039, |
| "step": 5850 |
| }, |
| { |
| "epoch": 10.243055555555555, |
| "grad_norm": 5.84375, |
| "learning_rate": 0.0001802611115142991, |
| "loss": 2.9122, |
| "step": 5900 |
| }, |
| { |
| "epoch": 10.32986111111111, |
| "grad_norm": 6.75, |
| "learning_rate": 0.00017993400765555932, |
| "loss": 2.9233, |
| "step": 5950 |
| }, |
| { |
| "epoch": 10.416666666666666, |
| "grad_norm": 5.71875, |
| "learning_rate": 0.00017960451766427897, |
| "loss": 2.9075, |
| "step": 6000 |
| }, |
| { |
| "epoch": 10.503472222222221, |
| "grad_norm": 6.09375, |
| "learning_rate": 0.00017927265137615637, |
| "loss": 2.937, |
| "step": 6050 |
| }, |
| { |
| "epoch": 10.590277777777779, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.00017893841869782547, |
| "loss": 2.9075, |
| "step": 6100 |
| }, |
| { |
| "epoch": 10.677083333333334, |
| "grad_norm": 5.5625, |
| "learning_rate": 0.0001786018296065599, |
| "loss": 2.9184, |
| "step": 6150 |
| }, |
| { |
| "epoch": 10.76388888888889, |
| "grad_norm": 5.71875, |
| "learning_rate": 0.0001782628941499753, |
| "loss": 2.9093, |
| "step": 6200 |
| }, |
| { |
| "epoch": 10.850694444444445, |
| "grad_norm": 6.9375, |
| "learning_rate": 0.00017792162244572928, |
| "loss": 2.911, |
| "step": 6250 |
| }, |
| { |
| "epoch": 10.9375, |
| "grad_norm": 8.125, |
| "learning_rate": 0.00017757802468121946, |
| "loss": 2.9023, |
| "step": 6300 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 2.8765242099761963, |
| "eval_runtime": 40.8481, |
| "eval_samples_per_second": 91.289, |
| "eval_steps_per_second": 5.729, |
| "step": 6336 |
| }, |
| { |
| "epoch": 11.024305555555555, |
| "grad_norm": 4.3125, |
| "learning_rate": 0.00017723211111327934, |
| "loss": 2.9075, |
| "step": 6350 |
| }, |
| { |
| "epoch": 11.11111111111111, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.0001768838920678721, |
| "loss": 2.9027, |
| "step": 6400 |
| }, |
| { |
| "epoch": 11.197916666666666, |
| "grad_norm": 8.375, |
| "learning_rate": 0.00017653337793978237, |
| "loss": 2.8971, |
| "step": 6450 |
| }, |
| { |
| "epoch": 11.284722222222221, |
| "grad_norm": 6.34375, |
| "learning_rate": 0.00017618057919230597, |
| "loss": 2.9095, |
| "step": 6500 |
| }, |
| { |
| "epoch": 11.371527777777779, |
| "grad_norm": 10.125, |
| "learning_rate": 0.00017582550635693753, |
| "loss": 2.9108, |
| "step": 6550 |
| }, |
| { |
| "epoch": 11.458333333333334, |
| "grad_norm": 9.375, |
| "learning_rate": 0.0001754681700330561, |
| "loss": 2.9115, |
| "step": 6600 |
| }, |
| { |
| "epoch": 11.54513888888889, |
| "grad_norm": 5.96875, |
| "learning_rate": 0.00017510858088760876, |
| "loss": 2.9137, |
| "step": 6650 |
| }, |
| { |
| "epoch": 11.631944444444445, |
| "grad_norm": 6.9375, |
| "learning_rate": 0.00017474674965479222, |
| "loss": 2.91, |
| "step": 6700 |
| }, |
| { |
| "epoch": 11.71875, |
| "grad_norm": 9.8125, |
| "learning_rate": 0.00017438268713573237, |
| "loss": 2.9037, |
| "step": 6750 |
| }, |
| { |
| "epoch": 11.805555555555555, |
| "grad_norm": 4.75, |
| "learning_rate": 0.00017401640419816182, |
| "loss": 2.9103, |
| "step": 6800 |
| }, |
| { |
| "epoch": 11.89236111111111, |
| "grad_norm": 6.96875, |
| "learning_rate": 0.00017364791177609554, |
| "loss": 2.895, |
| "step": 6850 |
| }, |
| { |
| "epoch": 11.979166666666666, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.00017327722086950446, |
| "loss": 2.8989, |
| "step": 6900 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 2.872136116027832, |
| "eval_runtime": 41.6305, |
| "eval_samples_per_second": 89.574, |
| "eval_steps_per_second": 5.621, |
| "step": 6912 |
| }, |
| { |
| "epoch": 12.065972222222221, |
| "grad_norm": 7.75, |
| "learning_rate": 0.0001729043425439871, |
| "loss": 2.8952, |
| "step": 6950 |
| }, |
| { |
| "epoch": 12.152777777777779, |
| "grad_norm": 5.84375, |
| "learning_rate": 0.00017252928793043916, |
| "loss": 2.8915, |
| "step": 7000 |
| }, |
| { |
| "epoch": 12.239583333333334, |
| "grad_norm": 6.5625, |
| "learning_rate": 0.00017215206822472143, |
| "loss": 2.8955, |
| "step": 7050 |
| }, |
| { |
| "epoch": 12.32638888888889, |
| "grad_norm": 5.875, |
| "learning_rate": 0.00017177269468732535, |
| "loss": 2.9131, |
| "step": 7100 |
| }, |
| { |
| "epoch": 12.413194444444445, |
| "grad_norm": 6.65625, |
| "learning_rate": 0.00017139117864303714, |
| "loss": 2.8935, |
| "step": 7150 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 6.96875, |
| "learning_rate": 0.0001710075314805995, |
| "loss": 2.9223, |
| "step": 7200 |
| }, |
| { |
| "epoch": 12.586805555555555, |
| "grad_norm": 5.71875, |
| "learning_rate": 0.00017062176465237175, |
| "loss": 2.8979, |
| "step": 7250 |
| }, |
| { |
| "epoch": 12.67361111111111, |
| "grad_norm": 7.28125, |
| "learning_rate": 0.00017023388967398796, |
| "loss": 2.9076, |
| "step": 7300 |
| }, |
| { |
| "epoch": 12.760416666666666, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.00016984391812401316, |
| "loss": 2.8939, |
| "step": 7350 |
| }, |
| { |
| "epoch": 12.847222222222221, |
| "grad_norm": 5.03125, |
| "learning_rate": 0.00016945186164359782, |
| "loss": 2.9007, |
| "step": 7400 |
| }, |
| { |
| "epoch": 12.934027777777779, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.00016905773193613013, |
| "loss": 2.891, |
| "step": 7450 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 2.869907855987549, |
| "eval_runtime": 41.6939, |
| "eval_samples_per_second": 89.437, |
| "eval_steps_per_second": 5.612, |
| "step": 7488 |
| }, |
| { |
| "epoch": 13.020833333333334, |
| "grad_norm": 5.375, |
| "learning_rate": 0.00016866154076688683, |
| "loss": 2.8958, |
| "step": 7500 |
| }, |
| { |
| "epoch": 13.10763888888889, |
| "grad_norm": 5.03125, |
| "learning_rate": 0.00016826329996268196, |
| "loss": 2.8938, |
| "step": 7550 |
| }, |
| { |
| "epoch": 13.194444444444445, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.00016786302141151368, |
| "loss": 2.8862, |
| "step": 7600 |
| }, |
| { |
| "epoch": 13.28125, |
| "grad_norm": 5.21875, |
| "learning_rate": 0.00016746071706220966, |
| "loss": 2.8969, |
| "step": 7650 |
| }, |
| { |
| "epoch": 13.368055555555555, |
| "grad_norm": 6.8125, |
| "learning_rate": 0.00016705639892407014, |
| "loss": 2.9042, |
| "step": 7700 |
| }, |
| { |
| "epoch": 13.45486111111111, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.00016665007906650948, |
| "loss": 2.8953, |
| "step": 7750 |
| }, |
| { |
| "epoch": 13.541666666666666, |
| "grad_norm": 8.1875, |
| "learning_rate": 0.00016624176961869616, |
| "loss": 2.908, |
| "step": 7800 |
| }, |
| { |
| "epoch": 13.628472222222221, |
| "grad_norm": 5.34375, |
| "learning_rate": 0.0001658314827691902, |
| "loss": 2.8964, |
| "step": 7850 |
| }, |
| { |
| "epoch": 13.715277777777779, |
| "grad_norm": 5.0, |
| "learning_rate": 0.00016541923076557978, |
| "loss": 2.8924, |
| "step": 7900 |
| }, |
| { |
| "epoch": 13.802083333333334, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.0001650050259141154, |
| "loss": 2.9024, |
| "step": 7950 |
| }, |
| { |
| "epoch": 13.88888888888889, |
| "grad_norm": 4.71875, |
| "learning_rate": 0.00016458888057934248, |
| "loss": 2.884, |
| "step": 8000 |
| }, |
| { |
| "epoch": 13.975694444444445, |
| "grad_norm": 11.4375, |
| "learning_rate": 0.0001641708071837325, |
| "loss": 2.8926, |
| "step": 8050 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 2.8657476902008057, |
| "eval_runtime": 41.9302, |
| "eval_samples_per_second": 88.934, |
| "eval_steps_per_second": 5.581, |
| "step": 8064 |
| }, |
| { |
| "epoch": 14.0625, |
| "grad_norm": 6.40625, |
| "learning_rate": 0.00016375081820731193, |
| "loss": 2.8867, |
| "step": 8100 |
| }, |
| { |
| "epoch": 14.149305555555555, |
| "grad_norm": 4.625, |
| "learning_rate": 0.00016332892618728986, |
| "loss": 2.8829, |
| "step": 8150 |
| }, |
| { |
| "epoch": 14.23611111111111, |
| "grad_norm": 4.1875, |
| "learning_rate": 0.00016290514371768356, |
| "loss": 2.8852, |
| "step": 8200 |
| }, |
| { |
| "epoch": 14.322916666666666, |
| "grad_norm": 4.3125, |
| "learning_rate": 0.0001624794834489427, |
| "loss": 2.9058, |
| "step": 8250 |
| }, |
| { |
| "epoch": 14.409722222222221, |
| "grad_norm": 4.53125, |
| "learning_rate": 0.00016205195808757173, |
| "loss": 2.8848, |
| "step": 8300 |
| }, |
| { |
| "epoch": 14.496527777777779, |
| "grad_norm": 6.21875, |
| "learning_rate": 0.00016162258039575033, |
| "loss": 2.9088, |
| "step": 8350 |
| }, |
| { |
| "epoch": 14.583333333333334, |
| "grad_norm": 7.34375, |
| "learning_rate": 0.0001611913631909528, |
| "loss": 2.8913, |
| "step": 8400 |
| }, |
| { |
| "epoch": 14.67013888888889, |
| "grad_norm": 6.0, |
| "learning_rate": 0.00016075831934556518, |
| "loss": 2.9013, |
| "step": 8450 |
| }, |
| { |
| "epoch": 14.756944444444445, |
| "grad_norm": 8.9375, |
| "learning_rate": 0.00016032346178650105, |
| "loss": 2.8843, |
| "step": 8500 |
| }, |
| { |
| "epoch": 14.84375, |
| "grad_norm": 4.9375, |
| "learning_rate": 0.0001598868034948157, |
| "loss": 2.8901, |
| "step": 8550 |
| }, |
| { |
| "epoch": 14.930555555555555, |
| "grad_norm": 5.40625, |
| "learning_rate": 0.00015944835750531858, |
| "loss": 2.8824, |
| "step": 8600 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 2.8647055625915527, |
| "eval_runtime": 41.7282, |
| "eval_samples_per_second": 89.364, |
| "eval_steps_per_second": 5.608, |
| "step": 8640 |
| }, |
| { |
| "epoch": 15.01736111111111, |
| "grad_norm": 8.125, |
| "learning_rate": 0.0001590081369061842, |
| "loss": 2.8874, |
| "step": 8650 |
| }, |
| { |
| "epoch": 15.104166666666666, |
| "grad_norm": 6.375, |
| "learning_rate": 0.00015856615483856153, |
| "loss": 2.8822, |
| "step": 8700 |
| }, |
| { |
| "epoch": 15.190972222222221, |
| "grad_norm": 6.21875, |
| "learning_rate": 0.00015812242449618147, |
| "loss": 2.8752, |
| "step": 8750 |
| }, |
| { |
| "epoch": 15.277777777777779, |
| "grad_norm": 7.15625, |
| "learning_rate": 0.0001576769591249633, |
| "loss": 2.8873, |
| "step": 8800 |
| }, |
| { |
| "epoch": 15.364583333333334, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.0001572297720226191, |
| "loss": 2.8993, |
| "step": 8850 |
| }, |
| { |
| "epoch": 15.45138888888889, |
| "grad_norm": 5.8125, |
| "learning_rate": 0.00015678087653825675, |
| "loss": 2.8854, |
| "step": 8900 |
| }, |
| { |
| "epoch": 15.538194444444445, |
| "grad_norm": 4.03125, |
| "learning_rate": 0.0001563302860719816, |
| "loss": 2.8994, |
| "step": 8950 |
| }, |
| { |
| "epoch": 15.625, |
| "grad_norm": 6.59375, |
| "learning_rate": 0.00015587801407449648, |
| "loss": 2.8893, |
| "step": 9000 |
| }, |
| { |
| "epoch": 15.711805555555555, |
| "grad_norm": 6.25, |
| "learning_rate": 0.0001554240740466998, |
| "loss": 2.8871, |
| "step": 9050 |
| }, |
| { |
| "epoch": 15.79861111111111, |
| "grad_norm": 10.0625, |
| "learning_rate": 0.00015496847953928313, |
| "loss": 2.8935, |
| "step": 9100 |
| }, |
| { |
| "epoch": 15.885416666666666, |
| "grad_norm": 5.9375, |
| "learning_rate": 0.00015451124415232615, |
| "loss": 2.8775, |
| "step": 9150 |
| }, |
| { |
| "epoch": 15.972222222222221, |
| "grad_norm": 7.65625, |
| "learning_rate": 0.00015405238153489096, |
| "loss": 2.8831, |
| "step": 9200 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 2.8630547523498535, |
| "eval_runtime": 40.8978, |
| "eval_samples_per_second": 91.179, |
| "eval_steps_per_second": 5.722, |
| "step": 9216 |
| }, |
| { |
| "epoch": 16.05902777777778, |
| "grad_norm": 5.78125, |
| "learning_rate": 0.00015359190538461462, |
| "loss": 2.88, |
| "step": 9250 |
| }, |
| { |
| "epoch": 16.145833333333332, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.00015312982944730018, |
| "loss": 2.8777, |
| "step": 9300 |
| }, |
| { |
| "epoch": 16.23263888888889, |
| "grad_norm": 6.71875, |
| "learning_rate": 0.00015266616751650642, |
| "loss": 2.8785, |
| "step": 9350 |
| }, |
| { |
| "epoch": 16.319444444444443, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.00015220093343313592, |
| "loss": 2.8968, |
| "step": 9400 |
| }, |
| { |
| "epoch": 16.40625, |
| "grad_norm": 5.125, |
| "learning_rate": 0.00015173414108502224, |
| "loss": 2.877, |
| "step": 9450 |
| }, |
| { |
| "epoch": 16.493055555555557, |
| "grad_norm": 5.03125, |
| "learning_rate": 0.00015126580440651496, |
| "loss": 2.9016, |
| "step": 9500 |
| }, |
| { |
| "epoch": 16.57986111111111, |
| "grad_norm": 6.71875, |
| "learning_rate": 0.00015079593737806399, |
| "loss": 2.8841, |
| "step": 9550 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 5.21875, |
| "learning_rate": 0.00015032455402580217, |
| "loss": 2.8937, |
| "step": 9600 |
| }, |
| { |
| "epoch": 16.75347222222222, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.00014985166842112644, |
| "loss": 2.8789, |
| "step": 9650 |
| }, |
| { |
| "epoch": 16.84027777777778, |
| "grad_norm": 5.5625, |
| "learning_rate": 0.00014937729468027797, |
| "loss": 2.8883, |
| "step": 9700 |
| }, |
| { |
| "epoch": 16.927083333333332, |
| "grad_norm": 5.15625, |
| "learning_rate": 0.00014890144696392074, |
| "loss": 2.8751, |
| "step": 9750 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 2.862104892730713, |
| "eval_runtime": 41.9728, |
| "eval_samples_per_second": 88.843, |
| "eval_steps_per_second": 5.575, |
| "step": 9792 |
| }, |
| { |
| "epoch": 17.01388888888889, |
| "grad_norm": 8.0625, |
| "learning_rate": 0.00014842413947671872, |
| "loss": 2.8821, |
| "step": 9800 |
| }, |
| { |
| "epoch": 17.100694444444443, |
| "grad_norm": 5.71875, |
| "learning_rate": 0.0001479453864669119, |
| "loss": 2.8785, |
| "step": 9850 |
| }, |
| { |
| "epoch": 17.1875, |
| "grad_norm": 5.4375, |
| "learning_rate": 0.00014746520222589103, |
| "loss": 2.8715, |
| "step": 9900 |
| }, |
| { |
| "epoch": 17.274305555555557, |
| "grad_norm": 11.8125, |
| "learning_rate": 0.00014698360108777097, |
| "loss": 2.8826, |
| "step": 9950 |
| }, |
| { |
| "epoch": 17.36111111111111, |
| "grad_norm": 5.96875, |
| "learning_rate": 0.00014650059742896265, |
| "loss": 2.8958, |
| "step": 10000 |
| }, |
| { |
| "epoch": 17.447916666666668, |
| "grad_norm": 5.75, |
| "learning_rate": 0.00014601620566774415, |
| "loss": 2.8751, |
| "step": 10050 |
| }, |
| { |
| "epoch": 17.53472222222222, |
| "grad_norm": 4.53125, |
| "learning_rate": 0.00014553044026383014, |
| "loss": 2.8925, |
| "step": 10100 |
| }, |
| { |
| "epoch": 17.62152777777778, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.0001450433157179403, |
| "loss": 2.8889, |
| "step": 10150 |
| }, |
| { |
| "epoch": 17.708333333333332, |
| "grad_norm": 6.21875, |
| "learning_rate": 0.00014455484657136642, |
| "loss": 2.8807, |
| "step": 10200 |
| }, |
| { |
| "epoch": 17.79513888888889, |
| "grad_norm": 5.375, |
| "learning_rate": 0.00014406504740553837, |
| "loss": 2.8836, |
| "step": 10250 |
| }, |
| { |
| "epoch": 17.881944444444443, |
| "grad_norm": 5.125, |
| "learning_rate": 0.00014357393284158878, |
| "loss": 2.8723, |
| "step": 10300 |
| }, |
| { |
| "epoch": 17.96875, |
| "grad_norm": 5.3125, |
| "learning_rate": 0.00014308151753991658, |
| "loss": 2.881, |
| "step": 10350 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 2.8605997562408447, |
| "eval_runtime": 40.45, |
| "eval_samples_per_second": 92.188, |
| "eval_steps_per_second": 5.785, |
| "step": 10368 |
| }, |
| { |
| "epoch": 18.055555555555557, |
| "grad_norm": 4.5625, |
| "learning_rate": 0.00014258781619974945, |
| "loss": 2.8781, |
| "step": 10400 |
| }, |
| { |
| "epoch": 18.14236111111111, |
| "grad_norm": 4.625, |
| "learning_rate": 0.00014209284355870492, |
| "loss": 2.8705, |
| "step": 10450 |
| }, |
| { |
| "epoch": 18.229166666666668, |
| "grad_norm": 6.09375, |
| "learning_rate": 0.00014159661439235046, |
| "loss": 2.876, |
| "step": 10500 |
| }, |
| { |
| "epoch": 18.31597222222222, |
| "grad_norm": 4.875, |
| "learning_rate": 0.0001410991435137625, |
| "loss": 2.8918, |
| "step": 10550 |
| }, |
| { |
| "epoch": 18.40277777777778, |
| "grad_norm": 4.375, |
| "learning_rate": 0.00014060044577308408, |
| "loss": 2.8759, |
| "step": 10600 |
| }, |
| { |
| "epoch": 18.489583333333332, |
| "grad_norm": 5.90625, |
| "learning_rate": 0.00014010053605708174, |
| "loss": 2.8958, |
| "step": 10650 |
| }, |
| { |
| "epoch": 18.57638888888889, |
| "grad_norm": 5.125, |
| "learning_rate": 0.000139599429288701, |
| "loss": 2.8763, |
| "step": 10700 |
| }, |
| { |
| "epoch": 18.663194444444443, |
| "grad_norm": 4.96875, |
| "learning_rate": 0.00013909714042662085, |
| "loss": 2.8905, |
| "step": 10750 |
| }, |
| { |
| "epoch": 18.75, |
| "grad_norm": 5.625, |
| "learning_rate": 0.00013859368446480743, |
| "loss": 2.8782, |
| "step": 10800 |
| }, |
| { |
| "epoch": 18.836805555555557, |
| "grad_norm": 5.375, |
| "learning_rate": 0.0001380890764320662, |
| "loss": 2.8834, |
| "step": 10850 |
| }, |
| { |
| "epoch": 18.92361111111111, |
| "grad_norm": 3.984375, |
| "learning_rate": 0.00013758333139159343, |
| "loss": 2.8705, |
| "step": 10900 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 2.8602795600891113, |
| "eval_runtime": 40.4167, |
| "eval_samples_per_second": 92.264, |
| "eval_steps_per_second": 5.79, |
| "step": 10944 |
| }, |
| { |
| "epoch": 19.010416666666668, |
| "grad_norm": 5.71875, |
| "learning_rate": 0.00013707646444052656, |
| "loss": 2.8757, |
| "step": 10950 |
| }, |
| { |
| "epoch": 19.09722222222222, |
| "grad_norm": 7.3125, |
| "learning_rate": 0.0001365684907094935, |
| "loss": 2.8753, |
| "step": 11000 |
| }, |
| { |
| "epoch": 19.18402777777778, |
| "grad_norm": 5.78125, |
| "learning_rate": 0.0001360594253621609, |
| "loss": 2.8632, |
| "step": 11050 |
| }, |
| { |
| "epoch": 19.270833333333332, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.0001355492835947816, |
| "loss": 2.8771, |
| "step": 11100 |
| }, |
| { |
| "epoch": 19.35763888888889, |
| "grad_norm": 4.53125, |
| "learning_rate": 0.000135038080635741, |
| "loss": 2.8945, |
| "step": 11150 |
| }, |
| { |
| "epoch": 19.444444444444443, |
| "grad_norm": 4.25, |
| "learning_rate": 0.00013452583174510237, |
| "loss": 2.87, |
| "step": 11200 |
| }, |
| { |
| "epoch": 19.53125, |
| "grad_norm": 5.1875, |
| "learning_rate": 0.0001340125522141514, |
| "loss": 2.8948, |
| "step": 11250 |
| }, |
| { |
| "epoch": 19.618055555555557, |
| "grad_norm": 5.9375, |
| "learning_rate": 0.00013349825736493965, |
| "loss": 2.882, |
| "step": 11300 |
| }, |
| { |
| "epoch": 19.70486111111111, |
| "grad_norm": 7.0, |
| "learning_rate": 0.00013298296254982733, |
| "loss": 2.8753, |
| "step": 11350 |
| }, |
| { |
| "epoch": 19.791666666666668, |
| "grad_norm": 4.78125, |
| "learning_rate": 0.00013246668315102487, |
| "loss": 2.8823, |
| "step": 11400 |
| }, |
| { |
| "epoch": 19.87847222222222, |
| "grad_norm": 4.59375, |
| "learning_rate": 0.00013194943458013375, |
| "loss": 2.8675, |
| "step": 11450 |
| }, |
| { |
| "epoch": 19.96527777777778, |
| "grad_norm": 6.0, |
| "learning_rate": 0.00013143123227768658, |
| "loss": 2.8765, |
| "step": 11500 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 2.8591601848602295, |
| "eval_runtime": 41.4885, |
| "eval_samples_per_second": 89.88, |
| "eval_steps_per_second": 5.64, |
| "step": 11520 |
| }, |
| { |
| "epoch": 20.052083333333332, |
| "grad_norm": 4.9375, |
| "learning_rate": 0.00013091209171268599, |
| "loss": 2.8735, |
| "step": 11550 |
| }, |
| { |
| "epoch": 20.13888888888889, |
| "grad_norm": 5.40625, |
| "learning_rate": 0.00013039202838214294, |
| "loss": 2.8698, |
| "step": 11600 |
| }, |
| { |
| "epoch": 20.225694444444443, |
| "grad_norm": 5.34375, |
| "learning_rate": 0.0001298710578106142, |
| "loss": 2.8702, |
| "step": 11650 |
| }, |
| { |
| "epoch": 20.3125, |
| "grad_norm": 6.59375, |
| "learning_rate": 0.00012934919554973874, |
| "loss": 2.8871, |
| "step": 11700 |
| }, |
| { |
| "epoch": 20.399305555555557, |
| "grad_norm": 4.375, |
| "learning_rate": 0.00012882645717777376, |
| "loss": 2.8752, |
| "step": 11750 |
| }, |
| { |
| "epoch": 20.48611111111111, |
| "grad_norm": 6.78125, |
| "learning_rate": 0.00012830285829912926, |
| "loss": 2.8896, |
| "step": 11800 |
| }, |
| { |
| "epoch": 20.572916666666668, |
| "grad_norm": 5.21875, |
| "learning_rate": 0.00012777841454390275, |
| "loss": 2.8768, |
| "step": 11850 |
| }, |
| { |
| "epoch": 20.65972222222222, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.00012725314156741214, |
| "loss": 2.8846, |
| "step": 11900 |
| }, |
| { |
| "epoch": 20.74652777777778, |
| "grad_norm": 5.25, |
| "learning_rate": 0.00012672705504972884, |
| "loss": 2.873, |
| "step": 11950 |
| }, |
| { |
| "epoch": 20.833333333333332, |
| "grad_norm": 5.6875, |
| "learning_rate": 0.00012620017069520936, |
| "loss": 2.8809, |
| "step": 12000 |
| }, |
| { |
| "epoch": 20.92013888888889, |
| "grad_norm": 4.40625, |
| "learning_rate": 0.00012567250423202675, |
| "loss": 2.8656, |
| "step": 12050 |
| }, |
| { |
| "epoch": 21.0, |
| "eval_loss": 2.857980489730835, |
| "eval_runtime": 41.0572, |
| "eval_samples_per_second": 90.825, |
| "eval_steps_per_second": 5.699, |
| "step": 12096 |
| }, |
| { |
| "epoch": 21.006944444444443, |
| "grad_norm": 5.5, |
| "learning_rate": 0.00012514407141170104, |
| "loss": 2.8738, |
| "step": 12100 |
| }, |
| { |
| "epoch": 21.09375, |
| "grad_norm": 4.78125, |
| "learning_rate": 0.00012461488800862887, |
| "loss": 2.8725, |
| "step": 12150 |
| }, |
| { |
| "epoch": 21.180555555555557, |
| "grad_norm": 8.375, |
| "learning_rate": 0.00012408496981961288, |
| "loss": 2.8628, |
| "step": 12200 |
| }, |
| { |
| "epoch": 21.26736111111111, |
| "grad_norm": 7.375, |
| "learning_rate": 0.00012355433266338992, |
| "loss": 2.8733, |
| "step": 12250 |
| }, |
| { |
| "epoch": 21.354166666666668, |
| "grad_norm": 4.65625, |
| "learning_rate": 0.00012302299238015895, |
| "loss": 2.8901, |
| "step": 12300 |
| }, |
| { |
| "epoch": 21.44097222222222, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.0001224909648311082, |
| "loss": 2.8696, |
| "step": 12350 |
| }, |
| { |
| "epoch": 21.52777777777778, |
| "grad_norm": 6.09375, |
| "learning_rate": 0.00012195826589794162, |
| "loss": 2.8925, |
| "step": 12400 |
| }, |
| { |
| "epoch": 21.614583333333332, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.00012142491148240491, |
| "loss": 2.8764, |
| "step": 12450 |
| }, |
| { |
| "epoch": 21.70138888888889, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.00012089091750581067, |
| "loss": 2.8716, |
| "step": 12500 |
| }, |
| { |
| "epoch": 21.788194444444443, |
| "grad_norm": 5.9375, |
| "learning_rate": 0.0001203562999085633, |
| "loss": 2.8816, |
| "step": 12550 |
| }, |
| { |
| "epoch": 21.875, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.00011982107464968298, |
| "loss": 2.8677, |
| "step": 12600 |
| }, |
| { |
| "epoch": 21.961805555555557, |
| "grad_norm": 5.40625, |
| "learning_rate": 0.00011928525770632946, |
| "loss": 2.8729, |
| "step": 12650 |
| }, |
| { |
| "epoch": 22.0, |
| "eval_loss": 2.857877016067505, |
| "eval_runtime": 42.4855, |
| "eval_samples_per_second": 87.771, |
| "eval_steps_per_second": 5.508, |
| "step": 12672 |
| }, |
| { |
| "epoch": 22.04861111111111, |
| "grad_norm": 7.5, |
| "learning_rate": 0.000118748865073325, |
| "loss": 2.8712, |
| "step": 12700 |
| }, |
| { |
| "epoch": 22.135416666666668, |
| "grad_norm": 5.65625, |
| "learning_rate": 0.00011821191276267684, |
| "loss": 2.867, |
| "step": 12750 |
| }, |
| { |
| "epoch": 22.22222222222222, |
| "grad_norm": 6.1875, |
| "learning_rate": 0.00011767441680309955, |
| "loss": 2.8635, |
| "step": 12800 |
| }, |
| { |
| "epoch": 22.30902777777778, |
| "grad_norm": 8.625, |
| "learning_rate": 0.00011713639323953602, |
| "loss": 2.886, |
| "step": 12850 |
| }, |
| { |
| "epoch": 22.395833333333332, |
| "grad_norm": 6.09375, |
| "learning_rate": 0.00011659785813267905, |
| "loss": 2.872, |
| "step": 12900 |
| }, |
| { |
| "epoch": 22.48263888888889, |
| "grad_norm": 5.625, |
| "learning_rate": 0.0001160588275584915, |
| "loss": 2.8891, |
| "step": 12950 |
| }, |
| { |
| "epoch": 22.569444444444443, |
| "grad_norm": 4.59375, |
| "learning_rate": 0.00011551931760772661, |
| "loss": 2.8741, |
| "step": 13000 |
| }, |
| { |
| "epoch": 22.65625, |
| "grad_norm": 6.65625, |
| "learning_rate": 0.00011497934438544769, |
| "loss": 2.8815, |
| "step": 13050 |
| }, |
| { |
| "epoch": 22.743055555555557, |
| "grad_norm": 5.3125, |
| "learning_rate": 0.00011443892401054719, |
| "loss": 2.8705, |
| "step": 13100 |
| }, |
| { |
| "epoch": 22.82986111111111, |
| "grad_norm": 4.96875, |
| "learning_rate": 0.00011389807261526573, |
| "loss": 2.8823, |
| "step": 13150 |
| }, |
| { |
| "epoch": 22.916666666666668, |
| "grad_norm": 5.5, |
| "learning_rate": 0.00011335680634471035, |
| "loss": 2.8596, |
| "step": 13200 |
| }, |
| { |
| "epoch": 23.0, |
| "eval_loss": 2.856687545776367, |
| "eval_runtime": 42.2388, |
| "eval_samples_per_second": 88.284, |
| "eval_steps_per_second": 5.54, |
| "step": 13248 |
| }, |
| { |
| "epoch": 23.00347222222222, |
| "grad_norm": 7.3125, |
| "learning_rate": 0.00011281514135637278, |
| "loss": 2.8712, |
| "step": 13250 |
| }, |
| { |
| "epoch": 23.09027777777778, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.00011227309381964684, |
| "loss": 2.8741, |
| "step": 13300 |
| }, |
| { |
| "epoch": 23.177083333333332, |
| "grad_norm": 4.875, |
| "learning_rate": 0.00011173067991534598, |
| "loss": 2.8567, |
| "step": 13350 |
| }, |
| { |
| "epoch": 23.26388888888889, |
| "grad_norm": 4.59375, |
| "learning_rate": 0.00011118791583522023, |
| "loss": 2.8739, |
| "step": 13400 |
| }, |
| { |
| "epoch": 23.350694444444443, |
| "grad_norm": 5.96875, |
| "learning_rate": 0.00011064481778147275, |
| "loss": 2.8865, |
| "step": 13450 |
| }, |
| { |
| "epoch": 23.4375, |
| "grad_norm": 5.6875, |
| "learning_rate": 0.00011010140196627627, |
| "loss": 2.8657, |
| "step": 13500 |
| }, |
| { |
| "epoch": 23.524305555555557, |
| "grad_norm": 5.15625, |
| "learning_rate": 0.00010955768461128911, |
| "loss": 2.8911, |
| "step": 13550 |
| }, |
| { |
| "epoch": 23.61111111111111, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.00010901368194717091, |
| "loss": 2.8727, |
| "step": 13600 |
| }, |
| { |
| "epoch": 23.697916666666668, |
| "grad_norm": 5.34375, |
| "learning_rate": 0.00010846941021309817, |
| "loss": 2.8729, |
| "step": 13650 |
| }, |
| { |
| "epoch": 23.78472222222222, |
| "grad_norm": 5.53125, |
| "learning_rate": 0.00010792488565627953, |
| "loss": 2.8749, |
| "step": 13700 |
| }, |
| { |
| "epoch": 23.87152777777778, |
| "grad_norm": 4.84375, |
| "learning_rate": 0.00010738012453147062, |
| "loss": 2.87, |
| "step": 13750 |
| }, |
| { |
| "epoch": 23.958333333333332, |
| "grad_norm": 6.875, |
| "learning_rate": 0.00010683514310048894, |
| "loss": 2.8713, |
| "step": 13800 |
| }, |
| { |
| "epoch": 24.0, |
| "eval_loss": 2.856473922729492, |
| "eval_runtime": 40.551, |
| "eval_samples_per_second": 91.958, |
| "eval_steps_per_second": 5.771, |
| "step": 13824 |
| }, |
| { |
| "epoch": 24.04513888888889, |
| "grad_norm": 7.4375, |
| "learning_rate": 0.00010628995763172851, |
| "loss": 2.8675, |
| "step": 13850 |
| }, |
| { |
| "epoch": 24.131944444444443, |
| "grad_norm": 5.875, |
| "learning_rate": 0.00010574458439967401, |
| "loss": 2.8666, |
| "step": 13900 |
| }, |
| { |
| "epoch": 24.21875, |
| "grad_norm": 5.40625, |
| "learning_rate": 0.00010519903968441516, |
| "loss": 2.8586, |
| "step": 13950 |
| }, |
| { |
| "epoch": 24.305555555555557, |
| "grad_norm": 4.8125, |
| "learning_rate": 0.0001046533397711607, |
| "loss": 2.8836, |
| "step": 14000 |
| }, |
| { |
| "epoch": 24.39236111111111, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.00010410750094975215, |
| "loss": 2.8711, |
| "step": 14050 |
| }, |
| { |
| "epoch": 24.479166666666668, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.00010356153951417771, |
| "loss": 2.8866, |
| "step": 14100 |
| }, |
| { |
| "epoch": 24.56597222222222, |
| "grad_norm": 5.3125, |
| "learning_rate": 0.00010301547176208568, |
| "loss": 2.8723, |
| "step": 14150 |
| }, |
| { |
| "epoch": 24.65277777777778, |
| "grad_norm": 5.71875, |
| "learning_rate": 0.00010246931399429812, |
| "loss": 2.8754, |
| "step": 14200 |
| }, |
| { |
| "epoch": 24.739583333333332, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.00010192308251432412, |
| "loss": 2.8733, |
| "step": 14250 |
| }, |
| { |
| "epoch": 24.82638888888889, |
| "grad_norm": 5.8125, |
| "learning_rate": 0.0001013767936278732, |
| "loss": 2.8821, |
| "step": 14300 |
| }, |
| { |
| "epoch": 24.913194444444443, |
| "grad_norm": 4.96875, |
| "learning_rate": 0.00010083046364236854, |
| "loss": 2.8564, |
| "step": 14350 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 5.90625, |
| "learning_rate": 0.00010028410886646014, |
| "loss": 2.8708, |
| "step": 14400 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_loss": 2.8556883335113525, |
| "eval_runtime": 39.6238, |
| "eval_samples_per_second": 94.11, |
| "eval_steps_per_second": 5.906, |
| "step": 14400 |
| }, |
| { |
| "epoch": 25.086805555555557, |
| "grad_norm": 4.46875, |
| "learning_rate": 9.97377456095381e-05, |
| "loss": 2.8732, |
| "step": 14450 |
| }, |
| { |
| "epoch": 25.17361111111111, |
| "grad_norm": 4.1875, |
| "learning_rate": 9.91913901812456e-05, |
| "loss": 2.8537, |
| "step": 14500 |
| }, |
| { |
| "epoch": 25.260416666666668, |
| "grad_norm": 3.640625, |
| "learning_rate": 9.864505889099217e-05, |
| "loss": 2.8704, |
| "step": 14550 |
| }, |
| { |
| "epoch": 25.34722222222222, |
| "grad_norm": 5.5, |
| "learning_rate": 9.809876804746683e-05, |
| "loss": 2.8865, |
| "step": 14600 |
| }, |
| { |
| "epoch": 25.43402777777778, |
| "grad_norm": 6.6875, |
| "learning_rate": 9.755253395815116e-05, |
| "loss": 2.8648, |
| "step": 14650 |
| }, |
| { |
| "epoch": 25.520833333333332, |
| "grad_norm": 5.46875, |
| "learning_rate": 9.700637292883252e-05, |
| "loss": 2.8886, |
| "step": 14700 |
| }, |
| { |
| "epoch": 25.60763888888889, |
| "grad_norm": 4.96875, |
| "learning_rate": 9.646030126311743e-05, |
| "loss": 2.872, |
| "step": 14750 |
| }, |
| { |
| "epoch": 25.694444444444443, |
| "grad_norm": 7.9375, |
| "learning_rate": 9.591433526194474e-05, |
| "loss": 2.8698, |
| "step": 14800 |
| }, |
| { |
| "epoch": 25.78125, |
| "grad_norm": 5.0, |
| "learning_rate": 9.536849122309901e-05, |
| "loss": 2.8718, |
| "step": 14850 |
| }, |
| { |
| "epoch": 25.868055555555557, |
| "grad_norm": 6.1875, |
| "learning_rate": 9.482278544072425e-05, |
| "loss": 2.8712, |
| "step": 14900 |
| }, |
| { |
| "epoch": 25.95486111111111, |
| "grad_norm": 5.34375, |
| "learning_rate": 9.427723420483717e-05, |
| "loss": 2.8674, |
| "step": 14950 |
| }, |
| { |
| "epoch": 26.0, |
| "eval_loss": 2.855642080307007, |
| "eval_runtime": 41.248, |
| "eval_samples_per_second": 90.404, |
| "eval_steps_per_second": 5.673, |
| "step": 14976 |
| }, |
| { |
| "epoch": 26.041666666666668, |
| "grad_norm": 6.46875, |
| "learning_rate": 9.373185380084113e-05, |
| "loss": 2.8681, |
| "step": 15000 |
| }, |
| { |
| "epoch": 26.12847222222222, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.318666050903988e-05, |
| "loss": 2.8659, |
| "step": 15050 |
| }, |
| { |
| "epoch": 26.21527777777778, |
| "grad_norm": 5.65625, |
| "learning_rate": 9.264167060415178e-05, |
| "loss": 2.857, |
| "step": 15100 |
| }, |
| { |
| "epoch": 26.302083333333332, |
| "grad_norm": 4.4375, |
| "learning_rate": 9.209690035482372e-05, |
| "loss": 2.8821, |
| "step": 15150 |
| }, |
| { |
| "epoch": 26.38888888888889, |
| "grad_norm": 6.0, |
| "learning_rate": 9.155236602314552e-05, |
| "loss": 2.8707, |
| "step": 15200 |
| }, |
| { |
| "epoch": 26.475694444444443, |
| "grad_norm": 4.53125, |
| "learning_rate": 9.100808386416475e-05, |
| "loss": 2.8819, |
| "step": 15250 |
| }, |
| { |
| "epoch": 26.5625, |
| "grad_norm": 4.125, |
| "learning_rate": 9.046407012540115e-05, |
| "loss": 2.8716, |
| "step": 15300 |
| }, |
| { |
| "epoch": 26.649305555555557, |
| "grad_norm": 5.8125, |
| "learning_rate": 8.992034104636183e-05, |
| "loss": 2.8758, |
| "step": 15350 |
| }, |
| { |
| "epoch": 26.73611111111111, |
| "grad_norm": 4.84375, |
| "learning_rate": 8.937691285805634e-05, |
| "loss": 2.8716, |
| "step": 15400 |
| }, |
| { |
| "epoch": 26.822916666666668, |
| "grad_norm": 4.8125, |
| "learning_rate": 8.883380178251249e-05, |
| "loss": 2.8792, |
| "step": 15450 |
| }, |
| { |
| "epoch": 26.90972222222222, |
| "grad_norm": 4.59375, |
| "learning_rate": 8.829102403229163e-05, |
| "loss": 2.8585, |
| "step": 15500 |
| }, |
| { |
| "epoch": 26.99652777777778, |
| "grad_norm": 7.09375, |
| "learning_rate": 8.774859581000504e-05, |
| "loss": 2.8683, |
| "step": 15550 |
| }, |
| { |
| "epoch": 27.0, |
| "eval_loss": 2.8553037643432617, |
| "eval_runtime": 41.718, |
| "eval_samples_per_second": 89.386, |
| "eval_steps_per_second": 5.609, |
| "step": 15552 |
| }, |
| { |
| "epoch": 27.083333333333332, |
| "grad_norm": 4.78125, |
| "learning_rate": 8.720653330783013e-05, |
| "loss": 2.8705, |
| "step": 15600 |
| }, |
| { |
| "epoch": 27.17013888888889, |
| "grad_norm": 4.5, |
| "learning_rate": 8.666485270702704e-05, |
| "loss": 2.8559, |
| "step": 15650 |
| }, |
| { |
| "epoch": 27.256944444444443, |
| "grad_norm": 4.03125, |
| "learning_rate": 8.612357017745578e-05, |
| "loss": 2.87, |
| "step": 15700 |
| }, |
| { |
| "epoch": 27.34375, |
| "grad_norm": 5.65625, |
| "learning_rate": 8.558270187709328e-05, |
| "loss": 2.8804, |
| "step": 15750 |
| }, |
| { |
| "epoch": 27.430555555555557, |
| "grad_norm": 5.15625, |
| "learning_rate": 8.504226395155132e-05, |
| "loss": 2.8634, |
| "step": 15800 |
| }, |
| { |
| "epoch": 27.51736111111111, |
| "grad_norm": 5.40625, |
| "learning_rate": 8.450227253359439e-05, |
| "loss": 2.8878, |
| "step": 15850 |
| }, |
| { |
| "epoch": 27.604166666666668, |
| "grad_norm": 4.15625, |
| "learning_rate": 8.39627437426581e-05, |
| "loss": 2.8713, |
| "step": 15900 |
| }, |
| { |
| "epoch": 27.69097222222222, |
| "grad_norm": 4.84375, |
| "learning_rate": 8.34236936843682e-05, |
| "loss": 2.8689, |
| "step": 15950 |
| }, |
| { |
| "epoch": 27.77777777777778, |
| "grad_norm": 5.34375, |
| "learning_rate": 8.28851384500595e-05, |
| "loss": 2.8706, |
| "step": 16000 |
| }, |
| { |
| "epoch": 27.864583333333332, |
| "grad_norm": 4.3125, |
| "learning_rate": 8.234709411629572e-05, |
| "loss": 2.8689, |
| "step": 16050 |
| }, |
| { |
| "epoch": 27.95138888888889, |
| "grad_norm": 8.125, |
| "learning_rate": 8.180957674438966e-05, |
| "loss": 2.8677, |
| "step": 16100 |
| }, |
| { |
| "epoch": 28.0, |
| "eval_loss": 2.8553411960601807, |
| "eval_runtime": 41.9128, |
| "eval_samples_per_second": 88.97, |
| "eval_steps_per_second": 5.583, |
| "step": 16128 |
| }, |
| { |
| "epoch": 28.038194444444443, |
| "grad_norm": 4.34375, |
| "learning_rate": 8.12726023799235e-05, |
| "loss": 2.8652, |
| "step": 16150 |
| }, |
| { |
| "epoch": 28.125, |
| "grad_norm": 5.125, |
| "learning_rate": 8.073618705226998e-05, |
| "loss": 2.8667, |
| "step": 16200 |
| }, |
| { |
| "epoch": 28.211805555555557, |
| "grad_norm": 4.34375, |
| "learning_rate": 8.020034677411386e-05, |
| "loss": 2.8591, |
| "step": 16250 |
| }, |
| { |
| "epoch": 28.29861111111111, |
| "grad_norm": 4.9375, |
| "learning_rate": 7.966509754097404e-05, |
| "loss": 2.8778, |
| "step": 16300 |
| }, |
| { |
| "epoch": 28.385416666666668, |
| "grad_norm": 4.0625, |
| "learning_rate": 7.913045533072587e-05, |
| "loss": 2.8716, |
| "step": 16350 |
| }, |
| { |
| "epoch": 28.47222222222222, |
| "grad_norm": 5.125, |
| "learning_rate": 7.859643610312424e-05, |
| "loss": 2.8786, |
| "step": 16400 |
| }, |
| { |
| "epoch": 28.55902777777778, |
| "grad_norm": 5.375, |
| "learning_rate": 7.80630557993274e-05, |
| "loss": 2.8746, |
| "step": 16450 |
| }, |
| { |
| "epoch": 28.645833333333332, |
| "grad_norm": 4.75, |
| "learning_rate": 7.753033034142075e-05, |
| "loss": 2.871, |
| "step": 16500 |
| }, |
| { |
| "epoch": 28.73263888888889, |
| "grad_norm": 5.09375, |
| "learning_rate": 7.69982756319417e-05, |
| "loss": 2.8704, |
| "step": 16550 |
| }, |
| { |
| "epoch": 28.819444444444443, |
| "grad_norm": 5.03125, |
| "learning_rate": 7.646690755340504e-05, |
| "loss": 2.8813, |
| "step": 16600 |
| }, |
| { |
| "epoch": 28.90625, |
| "grad_norm": 4.53125, |
| "learning_rate": 7.59362419678287e-05, |
| "loss": 2.8563, |
| "step": 16650 |
| }, |
| { |
| "epoch": 28.993055555555557, |
| "grad_norm": 4.53125, |
| "learning_rate": 7.540629471626026e-05, |
| "loss": 2.868, |
| "step": 16700 |
| }, |
| { |
| "epoch": 29.0, |
| "eval_loss": 2.8549838066101074, |
| "eval_runtime": 40.2288, |
| "eval_samples_per_second": 92.695, |
| "eval_steps_per_second": 5.817, |
| "step": 16704 |
| }, |
| { |
| "epoch": 29.07986111111111, |
| "grad_norm": 4.90625, |
| "learning_rate": 7.48770816183042e-05, |
| "loss": 2.869, |
| "step": 16750 |
| }, |
| { |
| "epoch": 29.166666666666668, |
| "grad_norm": 4.1875, |
| "learning_rate": 7.434861847164955e-05, |
| "loss": 2.8525, |
| "step": 16800 |
| }, |
| { |
| "epoch": 29.25347222222222, |
| "grad_norm": 4.125, |
| "learning_rate": 7.382092105159825e-05, |
| "loss": 2.868, |
| "step": 16850 |
| }, |
| { |
| "epoch": 29.34027777777778, |
| "grad_norm": 6.125, |
| "learning_rate": 7.329400511059442e-05, |
| "loss": 2.8797, |
| "step": 16900 |
| }, |
| { |
| "epoch": 29.427083333333332, |
| "grad_norm": 4.71875, |
| "learning_rate": 7.276788637775393e-05, |
| "loss": 2.8629, |
| "step": 16950 |
| }, |
| { |
| "epoch": 29.51388888888889, |
| "grad_norm": 3.90625, |
| "learning_rate": 7.224258055839509e-05, |
| "loss": 2.8888, |
| "step": 17000 |
| }, |
| { |
| "epoch": 29.600694444444443, |
| "grad_norm": 4.8125, |
| "learning_rate": 7.171810333356961e-05, |
| "loss": 2.869, |
| "step": 17050 |
| }, |
| { |
| "epoch": 29.6875, |
| "grad_norm": 5.0625, |
| "learning_rate": 7.119447035959457e-05, |
| "loss": 2.8709, |
| "step": 17100 |
| }, |
| { |
| "epoch": 29.774305555555557, |
| "grad_norm": 3.828125, |
| "learning_rate": 7.067169726758522e-05, |
| "loss": 2.8669, |
| "step": 17150 |
| }, |
| { |
| "epoch": 29.86111111111111, |
| "grad_norm": 5.5625, |
| "learning_rate": 7.014979966298808e-05, |
| "loss": 2.8698, |
| "step": 17200 |
| }, |
| { |
| "epoch": 29.947916666666668, |
| "grad_norm": 3.546875, |
| "learning_rate": 6.962879312511531e-05, |
| "loss": 2.8669, |
| "step": 17250 |
| }, |
| { |
| "epoch": 30.0, |
| "eval_loss": 2.854860544204712, |
| "eval_runtime": 41.7924, |
| "eval_samples_per_second": 89.227, |
| "eval_steps_per_second": 5.599, |
| "step": 17280 |
| }, |
| { |
| "epoch": 30.03472222222222, |
| "grad_norm": 5.53125, |
| "learning_rate": 6.910869320667955e-05, |
| "loss": 2.8649, |
| "step": 17300 |
| }, |
| { |
| "epoch": 30.12152777777778, |
| "grad_norm": 6.15625, |
| "learning_rate": 6.858951543332978e-05, |
| "loss": 2.8648, |
| "step": 17350 |
| }, |
| { |
| "epoch": 30.208333333333332, |
| "grad_norm": 5.3125, |
| "learning_rate": 6.807127530318771e-05, |
| "loss": 2.8618, |
| "step": 17400 |
| }, |
| { |
| "epoch": 30.29513888888889, |
| "grad_norm": 4.625, |
| "learning_rate": 6.755398828638512e-05, |
| "loss": 2.8748, |
| "step": 17450 |
| }, |
| { |
| "epoch": 30.381944444444443, |
| "grad_norm": 4.6875, |
| "learning_rate": 6.703766982460231e-05, |
| "loss": 2.8702, |
| "step": 17500 |
| }, |
| { |
| "epoch": 30.46875, |
| "grad_norm": 4.5, |
| "learning_rate": 6.652233533060683e-05, |
| "loss": 2.8766, |
| "step": 17550 |
| }, |
| { |
| "epoch": 30.555555555555557, |
| "grad_norm": 4.65625, |
| "learning_rate": 6.600800018779356e-05, |
| "loss": 2.8766, |
| "step": 17600 |
| }, |
| { |
| "epoch": 30.64236111111111, |
| "grad_norm": 5.53125, |
| "learning_rate": 6.549467974972552e-05, |
| "loss": 2.8674, |
| "step": 17650 |
| }, |
| { |
| "epoch": 30.729166666666668, |
| "grad_norm": 5.28125, |
| "learning_rate": 6.498238933967544e-05, |
| "loss": 2.868, |
| "step": 17700 |
| }, |
| { |
| "epoch": 30.81597222222222, |
| "grad_norm": 4.09375, |
| "learning_rate": 6.44711442501684e-05, |
| "loss": 2.8798, |
| "step": 17750 |
| }, |
| { |
| "epoch": 30.90277777777778, |
| "grad_norm": 6.03125, |
| "learning_rate": 6.396095974252534e-05, |
| "loss": 2.8578, |
| "step": 17800 |
| }, |
| { |
| "epoch": 30.989583333333332, |
| "grad_norm": 5.59375, |
| "learning_rate": 6.345185104640747e-05, |
| "loss": 2.8672, |
| "step": 17850 |
| }, |
| { |
| "epoch": 31.0, |
| "eval_loss": 2.8543925285339355, |
| "eval_runtime": 41.2327, |
| "eval_samples_per_second": 90.438, |
| "eval_steps_per_second": 5.675, |
| "step": 17856 |
| }, |
| { |
| "epoch": 31.07638888888889, |
| "grad_norm": 4.625, |
| "learning_rate": 6.294383335936167e-05, |
| "loss": 2.87, |
| "step": 17900 |
| }, |
| { |
| "epoch": 31.163194444444443, |
| "grad_norm": 3.78125, |
| "learning_rate": 6.24369218463667e-05, |
| "loss": 2.8516, |
| "step": 17950 |
| }, |
| { |
| "epoch": 31.25, |
| "grad_norm": 5.4375, |
| "learning_rate": 6.193113163938075e-05, |
| "loss": 2.8673, |
| "step": 18000 |
| }, |
| { |
| "epoch": 31.336805555555557, |
| "grad_norm": 4.4375, |
| "learning_rate": 6.14264778368895e-05, |
| "loss": 2.8794, |
| "step": 18050 |
| }, |
| { |
| "epoch": 31.42361111111111, |
| "grad_norm": 5.125, |
| "learning_rate": 6.092297550345554e-05, |
| "loss": 2.8634, |
| "step": 18100 |
| }, |
| { |
| "epoch": 31.510416666666668, |
| "grad_norm": 5.46875, |
| "learning_rate": 6.0420639669268544e-05, |
| "loss": 2.8904, |
| "step": 18150 |
| }, |
| { |
| "epoch": 31.59722222222222, |
| "grad_norm": 4.21875, |
| "learning_rate": 5.991948532969685e-05, |
| "loss": 2.8651, |
| "step": 18200 |
| }, |
| { |
| "epoch": 31.68402777777778, |
| "grad_norm": 4.6875, |
| "learning_rate": 5.9419527444839515e-05, |
| "loss": 2.8727, |
| "step": 18250 |
| }, |
| { |
| "epoch": 31.770833333333332, |
| "grad_norm": 3.765625, |
| "learning_rate": 5.8920780939079955e-05, |
| "loss": 2.8645, |
| "step": 18300 |
| }, |
| { |
| "epoch": 31.85763888888889, |
| "grad_norm": 6.28125, |
| "learning_rate": 5.8423260700640417e-05, |
| "loss": 2.8713, |
| "step": 18350 |
| }, |
| { |
| "epoch": 31.944444444444443, |
| "grad_norm": 6.9375, |
| "learning_rate": 5.792698158113742e-05, |
| "loss": 2.8634, |
| "step": 18400 |
| }, |
| { |
| "epoch": 32.0, |
| "eval_loss": 2.8544044494628906, |
| "eval_runtime": 40.4905, |
| "eval_samples_per_second": 92.096, |
| "eval_steps_per_second": 5.779, |
| "step": 18432 |
| }, |
| { |
| "epoch": 32.03125, |
| "grad_norm": 3.84375, |
| "learning_rate": 5.743195839513852e-05, |
| "loss": 2.8657, |
| "step": 18450 |
| }, |
| { |
| "epoch": 32.11805555555556, |
| "grad_norm": 4.65625, |
| "learning_rate": 5.693820591971996e-05, |
| "loss": 2.8633, |
| "step": 18500 |
| }, |
| { |
| "epoch": 32.204861111111114, |
| "grad_norm": 5.25, |
| "learning_rate": 5.644573889402589e-05, |
| "loss": 2.8595, |
| "step": 18550 |
| }, |
| { |
| "epoch": 32.291666666666664, |
| "grad_norm": 4.8125, |
| "learning_rate": 5.5954572018827846e-05, |
| "loss": 2.8737, |
| "step": 18600 |
| }, |
| { |
| "epoch": 32.37847222222222, |
| "grad_norm": 5.46875, |
| "learning_rate": 5.5464719956086396e-05, |
| "loss": 2.8722, |
| "step": 18650 |
| }, |
| { |
| "epoch": 32.46527777777778, |
| "grad_norm": 5.15625, |
| "learning_rate": 5.49761973285132e-05, |
| "loss": 2.871, |
| "step": 18700 |
| }, |
| { |
| "epoch": 32.552083333333336, |
| "grad_norm": 4.1875, |
| "learning_rate": 5.4489018719134654e-05, |
| "loss": 2.8801, |
| "step": 18750 |
| }, |
| { |
| "epoch": 32.638888888888886, |
| "grad_norm": 4.875, |
| "learning_rate": 5.400319867085633e-05, |
| "loss": 2.8668, |
| "step": 18800 |
| }, |
| { |
| "epoch": 32.72569444444444, |
| "grad_norm": 4.8125, |
| "learning_rate": 5.3518751686029134e-05, |
| "loss": 2.8673, |
| "step": 18850 |
| }, |
| { |
| "epoch": 32.8125, |
| "grad_norm": 3.828125, |
| "learning_rate": 5.303569222601626e-05, |
| "loss": 2.875, |
| "step": 18900 |
| }, |
| { |
| "epoch": 32.89930555555556, |
| "grad_norm": 4.0625, |
| "learning_rate": 5.25540347107615e-05, |
| "loss": 2.8596, |
| "step": 18950 |
| }, |
| { |
| "epoch": 32.986111111111114, |
| "grad_norm": 4.25, |
| "learning_rate": 5.207379351835875e-05, |
| "loss": 2.8683, |
| "step": 19000 |
| }, |
| { |
| "epoch": 33.0, |
| "eval_loss": 2.854464054107666, |
| "eval_runtime": 40.2584, |
| "eval_samples_per_second": 92.627, |
| "eval_steps_per_second": 5.812, |
| "step": 19008 |
| }, |
| { |
| "epoch": 33.072916666666664, |
| "grad_norm": 4.5, |
| "learning_rate": 5.1594982984622906e-05, |
| "loss": 2.8657, |
| "step": 19050 |
| }, |
| { |
| "epoch": 33.15972222222222, |
| "grad_norm": 5.78125, |
| "learning_rate": 5.1117617402661865e-05, |
| "loss": 2.8538, |
| "step": 19100 |
| }, |
| { |
| "epoch": 33.24652777777778, |
| "grad_norm": 4.25, |
| "learning_rate": 5.064171102244985e-05, |
| "loss": 2.8671, |
| "step": 19150 |
| }, |
| { |
| "epoch": 33.333333333333336, |
| "grad_norm": 5.125, |
| "learning_rate": 5.0167278050402075e-05, |
| "loss": 2.879, |
| "step": 19200 |
| }, |
| { |
| "epoch": 33.420138888888886, |
| "grad_norm": 3.890625, |
| "learning_rate": 4.9694332648950536e-05, |
| "loss": 2.8637, |
| "step": 19250 |
| }, |
| { |
| "epoch": 33.50694444444444, |
| "grad_norm": 3.71875, |
| "learning_rate": 4.9222888936121494e-05, |
| "loss": 2.8891, |
| "step": 19300 |
| }, |
| { |
| "epoch": 33.59375, |
| "grad_norm": 3.578125, |
| "learning_rate": 4.875296098511365e-05, |
| "loss": 2.864, |
| "step": 19350 |
| }, |
| { |
| "epoch": 33.68055555555556, |
| "grad_norm": 4.9375, |
| "learning_rate": 4.828456282387859e-05, |
| "loss": 2.8731, |
| "step": 19400 |
| }, |
| { |
| "epoch": 33.767361111111114, |
| "grad_norm": 4.3125, |
| "learning_rate": 4.781770843470144e-05, |
| "loss": 2.8677, |
| "step": 19450 |
| }, |
| { |
| "epoch": 33.854166666666664, |
| "grad_norm": 5.15625, |
| "learning_rate": 4.735241175378386e-05, |
| "loss": 2.8649, |
| "step": 19500 |
| }, |
| { |
| "epoch": 33.94097222222222, |
| "grad_norm": 4.15625, |
| "learning_rate": 4.688868667082794e-05, |
| "loss": 2.8629, |
| "step": 19550 |
| }, |
| { |
| "epoch": 34.0, |
| "eval_loss": 2.8541414737701416, |
| "eval_runtime": 39.773, |
| "eval_samples_per_second": 93.757, |
| "eval_steps_per_second": 5.883, |
| "step": 19584 |
| }, |
| { |
| "epoch": 34.02777777777778, |
| "grad_norm": 4.09375, |
| "learning_rate": 4.642654702862157e-05, |
| "loss": 2.8661, |
| "step": 19600 |
| }, |
| { |
| "epoch": 34.114583333333336, |
| "grad_norm": 3.53125, |
| "learning_rate": 4.596600662262508e-05, |
| "loss": 2.8641, |
| "step": 19650 |
| }, |
| { |
| "epoch": 34.201388888888886, |
| "grad_norm": 4.46875, |
| "learning_rate": 4.55070792005597e-05, |
| "loss": 2.8574, |
| "step": 19700 |
| }, |
| { |
| "epoch": 34.28819444444444, |
| "grad_norm": 3.828125, |
| "learning_rate": 4.5049778461996926e-05, |
| "loss": 2.8735, |
| "step": 19750 |
| }, |
| { |
| "epoch": 34.375, |
| "grad_norm": 6.65625, |
| "learning_rate": 4.459411805794976e-05, |
| "loss": 2.8731, |
| "step": 19800 |
| }, |
| { |
| "epoch": 34.46180555555556, |
| "grad_norm": 4.25, |
| "learning_rate": 4.414011159046495e-05, |
| "loss": 2.8719, |
| "step": 19850 |
| }, |
| { |
| "epoch": 34.548611111111114, |
| "grad_norm": 3.90625, |
| "learning_rate": 4.368777261221737e-05, |
| "loss": 2.8769, |
| "step": 19900 |
| }, |
| { |
| "epoch": 34.635416666666664, |
| "grad_norm": 4.0625, |
| "learning_rate": 4.323711462610495e-05, |
| "loss": 2.8679, |
| "step": 19950 |
| }, |
| { |
| "epoch": 34.72222222222222, |
| "grad_norm": 3.953125, |
| "learning_rate": 4.278815108484602e-05, |
| "loss": 2.8681, |
| "step": 20000 |
| }, |
| { |
| "epoch": 34.80902777777778, |
| "grad_norm": 4.09375, |
| "learning_rate": 4.234089539057745e-05, |
| "loss": 2.8744, |
| "step": 20050 |
| }, |
| { |
| "epoch": 34.895833333333336, |
| "grad_norm": 4.03125, |
| "learning_rate": 4.1895360894454774e-05, |
| "loss": 2.8615, |
| "step": 20100 |
| }, |
| { |
| "epoch": 34.982638888888886, |
| "grad_norm": 4.21875, |
| "learning_rate": 4.1451560896253515e-05, |
| "loss": 2.8641, |
| "step": 20150 |
| }, |
| { |
| "epoch": 35.0, |
| "eval_loss": 2.8540618419647217, |
| "eval_runtime": 41.6189, |
| "eval_samples_per_second": 89.599, |
| "eval_steps_per_second": 5.622, |
| "step": 20160 |
| }, |
| { |
| "epoch": 35.06944444444444, |
| "grad_norm": 4.1875, |
| "learning_rate": 4.100950864397223e-05, |
| "loss": 2.8629, |
| "step": 20200 |
| }, |
| { |
| "epoch": 35.15625, |
| "grad_norm": 3.484375, |
| "learning_rate": 4.056921733343704e-05, |
| "loss": 2.8579, |
| "step": 20250 |
| }, |
| { |
| "epoch": 35.24305555555556, |
| "grad_norm": 5.28125, |
| "learning_rate": 4.013070010790759e-05, |
| "loss": 2.8641, |
| "step": 20300 |
| }, |
| { |
| "epoch": 35.329861111111114, |
| "grad_norm": 4.25, |
| "learning_rate": 3.9693970057684984e-05, |
| "loss": 2.8801, |
| "step": 20350 |
| }, |
| { |
| "epoch": 35.416666666666664, |
| "grad_norm": 4.84375, |
| "learning_rate": 3.9259040219720645e-05, |
| "loss": 2.8614, |
| "step": 20400 |
| }, |
| { |
| "epoch": 35.50347222222222, |
| "grad_norm": 5.21875, |
| "learning_rate": 3.882592357722761e-05, |
| "loss": 2.8902, |
| "step": 20450 |
| }, |
| { |
| "epoch": 35.59027777777778, |
| "grad_norm": 4.71875, |
| "learning_rate": 3.839463305929247e-05, |
| "loss": 2.8626, |
| "step": 20500 |
| }, |
| { |
| "epoch": 35.677083333333336, |
| "grad_norm": 4.3125, |
| "learning_rate": 3.7965181540489794e-05, |
| "loss": 2.8741, |
| "step": 20550 |
| }, |
| { |
| "epoch": 35.763888888888886, |
| "grad_norm": 3.46875, |
| "learning_rate": 3.753758184049764e-05, |
| "loss": 2.8656, |
| "step": 20600 |
| }, |
| { |
| "epoch": 35.85069444444444, |
| "grad_norm": 5.03125, |
| "learning_rate": 3.7111846723714916e-05, |
| "loss": 2.8661, |
| "step": 20650 |
| }, |
| { |
| "epoch": 35.9375, |
| "grad_norm": 3.25, |
| "learning_rate": 3.668798889888022e-05, |
| "loss": 2.8597, |
| "step": 20700 |
| }, |
| { |
| "epoch": 36.0, |
| "eval_loss": 2.853997230529785, |
| "eval_runtime": 41.7365, |
| "eval_samples_per_second": 89.346, |
| "eval_steps_per_second": 5.607, |
| "step": 20736 |
| }, |
| { |
| "epoch": 36.02430555555556, |
| "grad_norm": 4.5, |
| "learning_rate": 3.626602101869281e-05, |
| "loss": 2.8674, |
| "step": 20750 |
| }, |
| { |
| "epoch": 36.111111111111114, |
| "grad_norm": 5.15625, |
| "learning_rate": 3.5845955679434426e-05, |
| "loss": 2.8631, |
| "step": 20800 |
| }, |
| { |
| "epoch": 36.197916666666664, |
| "grad_norm": 3.6875, |
| "learning_rate": 3.542780542059373e-05, |
| "loss": 2.8576, |
| "step": 20850 |
| }, |
| { |
| "epoch": 36.28472222222222, |
| "grad_norm": 3.515625, |
| "learning_rate": 3.501158272449155e-05, |
| "loss": 2.8715, |
| "step": 20900 |
| }, |
| { |
| "epoch": 36.37152777777778, |
| "grad_norm": 3.4375, |
| "learning_rate": 3.45973000159088e-05, |
| "loss": 2.8754, |
| "step": 20950 |
| }, |
| { |
| "epoch": 36.458333333333336, |
| "grad_norm": 3.671875, |
| "learning_rate": 3.418496966171498e-05, |
| "loss": 2.8721, |
| "step": 21000 |
| }, |
| { |
| "epoch": 36.545138888888886, |
| "grad_norm": 3.84375, |
| "learning_rate": 3.377460397049951e-05, |
| "loss": 2.8741, |
| "step": 21050 |
| }, |
| { |
| "epoch": 36.63194444444444, |
| "grad_norm": 4.3125, |
| "learning_rate": 3.336621519220404e-05, |
| "loss": 2.8717, |
| "step": 21100 |
| }, |
| { |
| "epoch": 36.71875, |
| "grad_norm": 3.625, |
| "learning_rate": 3.295981551775679e-05, |
| "loss": 2.8655, |
| "step": 21150 |
| }, |
| { |
| "epoch": 36.80555555555556, |
| "grad_norm": 3.46875, |
| "learning_rate": 3.255541707870874e-05, |
| "loss": 2.8748, |
| "step": 21200 |
| }, |
| { |
| "epoch": 36.892361111111114, |
| "grad_norm": 3.640625, |
| "learning_rate": 3.2153031946871427e-05, |
| "loss": 2.8598, |
| "step": 21250 |
| }, |
| { |
| "epoch": 36.979166666666664, |
| "grad_norm": 3.875, |
| "learning_rate": 3.1752672133956596e-05, |
| "loss": 2.8632, |
| "step": 21300 |
| }, |
| { |
| "epoch": 37.0, |
| "eval_loss": 2.854156017303467, |
| "eval_runtime": 40.6449, |
| "eval_samples_per_second": 91.746, |
| "eval_steps_per_second": 5.757, |
| "step": 21312 |
| }, |
| { |
| "epoch": 37.06597222222222, |
| "grad_norm": 4.125, |
| "learning_rate": 3.135434959121756e-05, |
| "loss": 2.8613, |
| "step": 21350 |
| }, |
| { |
| "epoch": 37.15277777777778, |
| "grad_norm": 5.53125, |
| "learning_rate": 3.095807620909257e-05, |
| "loss": 2.859, |
| "step": 21400 |
| }, |
| { |
| "epoch": 37.239583333333336, |
| "grad_norm": 4.15625, |
| "learning_rate": 3.0563863816849795e-05, |
| "loss": 2.8618, |
| "step": 21450 |
| }, |
| { |
| "epoch": 37.326388888888886, |
| "grad_norm": 4.0625, |
| "learning_rate": 3.017172418223424e-05, |
| "loss": 2.8817, |
| "step": 21500 |
| }, |
| { |
| "epoch": 37.41319444444444, |
| "grad_norm": 3.953125, |
| "learning_rate": 2.9781669011116364e-05, |
| "loss": 2.8609, |
| "step": 21550 |
| }, |
| { |
| "epoch": 37.5, |
| "grad_norm": 3.96875, |
| "learning_rate": 2.939370994714278e-05, |
| "loss": 2.8872, |
| "step": 21600 |
| }, |
| { |
| "epoch": 37.58680555555556, |
| "grad_norm": 4.09375, |
| "learning_rate": 2.90078585713886e-05, |
| "loss": 2.864, |
| "step": 21650 |
| }, |
| { |
| "epoch": 37.673611111111114, |
| "grad_norm": 3.421875, |
| "learning_rate": 2.8624126402011798e-05, |
| "loss": 2.8757, |
| "step": 21700 |
| }, |
| { |
| "epoch": 37.760416666666664, |
| "grad_norm": 3.40625, |
| "learning_rate": 2.8242524893909162e-05, |
| "loss": 2.8623, |
| "step": 21750 |
| }, |
| { |
| "epoch": 37.84722222222222, |
| "grad_norm": 3.28125, |
| "learning_rate": 2.7863065438374748e-05, |
| "loss": 2.8695, |
| "step": 21800 |
| }, |
| { |
| "epoch": 37.93402777777778, |
| "grad_norm": 4.4375, |
| "learning_rate": 2.7485759362759378e-05, |
| "loss": 2.8596, |
| "step": 21850 |
| }, |
| { |
| "epoch": 38.0, |
| "eval_loss": 2.8540520668029785, |
| "eval_runtime": 41.5194, |
| "eval_samples_per_second": 89.813, |
| "eval_steps_per_second": 5.636, |
| "step": 21888 |
| }, |
| { |
| "epoch": 38.020833333333336, |
| "grad_norm": 3.25, |
| "learning_rate": 2.7110617930132877e-05, |
| "loss": 2.8658, |
| "step": 21900 |
| }, |
| { |
| "epoch": 38.107638888888886, |
| "grad_norm": 3.1875, |
| "learning_rate": 2.673765233894755e-05, |
| "loss": 2.8632, |
| "step": 21950 |
| }, |
| { |
| "epoch": 38.19444444444444, |
| "grad_norm": 3.765625, |
| "learning_rate": 2.6366873722704265e-05, |
| "loss": 2.8583, |
| "step": 22000 |
| }, |
| { |
| "epoch": 38.28125, |
| "grad_norm": 3.640625, |
| "learning_rate": 2.599829314961967e-05, |
| "loss": 2.8678, |
| "step": 22050 |
| }, |
| { |
| "epoch": 38.36805555555556, |
| "grad_norm": 3.609375, |
| "learning_rate": 2.5631921622296128e-05, |
| "loss": 2.8777, |
| "step": 22100 |
| }, |
| { |
| "epoch": 38.454861111111114, |
| "grad_norm": 3.15625, |
| "learning_rate": 2.526777007739316e-05, |
| "loss": 2.8671, |
| "step": 22150 |
| }, |
| { |
| "epoch": 38.541666666666664, |
| "grad_norm": 3.53125, |
| "learning_rate": 2.4905849385300883e-05, |
| "loss": 2.8782, |
| "step": 22200 |
| }, |
| { |
| "epoch": 38.62847222222222, |
| "grad_norm": 3.46875, |
| "learning_rate": 2.4546170349815666e-05, |
| "loss": 2.8699, |
| "step": 22250 |
| }, |
| { |
| "epoch": 38.71527777777778, |
| "grad_norm": 3.8125, |
| "learning_rate": 2.418874370781754e-05, |
| "loss": 2.8658, |
| "step": 22300 |
| }, |
| { |
| "epoch": 38.802083333333336, |
| "grad_norm": 3.9375, |
| "learning_rate": 2.3833580128949762e-05, |
| "loss": 2.8749, |
| "step": 22350 |
| }, |
| { |
| "epoch": 38.888888888888886, |
| "grad_norm": 3.75, |
| "learning_rate": 2.3480690215300105e-05, |
| "loss": 2.8573, |
| "step": 22400 |
| }, |
| { |
| "epoch": 38.97569444444444, |
| "grad_norm": 4.75, |
| "learning_rate": 2.313008450108468e-05, |
| "loss": 2.8656, |
| "step": 22450 |
| }, |
| { |
| "epoch": 39.0, |
| "eval_loss": 2.854092597961426, |
| "eval_runtime": 41.0282, |
| "eval_samples_per_second": 90.889, |
| "eval_steps_per_second": 5.703, |
| "step": 22464 |
| }, |
| { |
| "epoch": 39.0625, |
| "grad_norm": 3.40625, |
| "learning_rate": 2.278177345233323e-05, |
| "loss": 2.8622, |
| "step": 22500 |
| }, |
| { |
| "epoch": 39.14930555555556, |
| "grad_norm": 3.625, |
| "learning_rate": 2.2435767466576863e-05, |
| "loss": 2.8578, |
| "step": 22550 |
| }, |
| { |
| "epoch": 39.236111111111114, |
| "grad_norm": 4.15625, |
| "learning_rate": 2.209207687253746e-05, |
| "loss": 2.8602, |
| "step": 22600 |
| }, |
| { |
| "epoch": 39.322916666666664, |
| "grad_norm": 2.921875, |
| "learning_rate": 2.1750711929819723e-05, |
| "loss": 2.8825, |
| "step": 22650 |
| }, |
| { |
| "epoch": 39.40972222222222, |
| "grad_norm": 3.421875, |
| "learning_rate": 2.1411682828604452e-05, |
| "loss": 2.8618, |
| "step": 22700 |
| }, |
| { |
| "epoch": 39.49652777777778, |
| "grad_norm": 3.6875, |
| "learning_rate": 2.1074999689344755e-05, |
| "loss": 2.8834, |
| "step": 22750 |
| }, |
| { |
| "epoch": 39.583333333333336, |
| "grad_norm": 4.46875, |
| "learning_rate": 2.0740672562463602e-05, |
| "loss": 2.8664, |
| "step": 22800 |
| }, |
| { |
| "epoch": 39.670138888888886, |
| "grad_norm": 3.5625, |
| "learning_rate": 2.0408711428054195e-05, |
| "loss": 2.8771, |
| "step": 22850 |
| }, |
| { |
| "epoch": 39.75694444444444, |
| "grad_norm": 3.390625, |
| "learning_rate": 2.0079126195581612e-05, |
| "loss": 2.8629, |
| "step": 22900 |
| }, |
| { |
| "epoch": 39.84375, |
| "grad_norm": 4.28125, |
| "learning_rate": 1.9751926703587353e-05, |
| "loss": 2.867, |
| "step": 22950 |
| }, |
| { |
| "epoch": 39.93055555555556, |
| "grad_norm": 3.390625, |
| "learning_rate": 1.9427122719395452e-05, |
| "loss": 2.8591, |
| "step": 23000 |
| }, |
| { |
| "epoch": 40.0, |
| "eval_loss": 2.85404634475708, |
| "eval_runtime": 42.4648, |
| "eval_samples_per_second": 87.814, |
| "eval_steps_per_second": 5.51, |
| "step": 23040 |
| }, |
| { |
| "epoch": 40.017361111111114, |
| "grad_norm": 4.71875, |
| "learning_rate": 1.9104723938821012e-05, |
| "loss": 2.8661, |
| "step": 23050 |
| }, |
| { |
| "epoch": 40.104166666666664, |
| "grad_norm": 3.375, |
| "learning_rate": 1.8784739985880628e-05, |
| "loss": 2.8613, |
| "step": 23100 |
| }, |
| { |
| "epoch": 40.19097222222222, |
| "grad_norm": 3.078125, |
| "learning_rate": 1.8467180412505313e-05, |
| "loss": 2.8565, |
| "step": 23150 |
| }, |
| { |
| "epoch": 40.27777777777778, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.8152054698255194e-05, |
| "loss": 2.8671, |
| "step": 23200 |
| }, |
| { |
| "epoch": 40.364583333333336, |
| "grad_norm": 3.609375, |
| "learning_rate": 1.7839372250036534e-05, |
| "loss": 2.8812, |
| "step": 23250 |
| }, |
| { |
| "epoch": 40.451388888888886, |
| "grad_norm": 3.9375, |
| "learning_rate": 1.7529142401821062e-05, |
| "loss": 2.8657, |
| "step": 23300 |
| }, |
| { |
| "epoch": 40.53819444444444, |
| "grad_norm": 3.875, |
| "learning_rate": 1.722137441436721e-05, |
| "loss": 2.8782, |
| "step": 23350 |
| }, |
| { |
| "epoch": 40.625, |
| "grad_norm": 3.984375, |
| "learning_rate": 1.6916077474943736e-05, |
| "loss": 2.8685, |
| "step": 23400 |
| }, |
| { |
| "epoch": 40.71180555555556, |
| "grad_norm": 2.90625, |
| "learning_rate": 1.66132606970554e-05, |
| "loss": 2.8671, |
| "step": 23450 |
| }, |
| { |
| "epoch": 40.798611111111114, |
| "grad_norm": 3.328125, |
| "learning_rate": 1.631293312017099e-05, |
| "loss": 2.8723, |
| "step": 23500 |
| }, |
| { |
| "epoch": 40.885416666666664, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.6015103709453482e-05, |
| "loss": 2.8591, |
| "step": 23550 |
| }, |
| { |
| "epoch": 40.97222222222222, |
| "grad_norm": 3.375, |
| "learning_rate": 1.571978135549238e-05, |
| "loss": 2.8635, |
| "step": 23600 |
| }, |
| { |
| "epoch": 41.0, |
| "eval_loss": 2.8541696071624756, |
| "eval_runtime": 40.7796, |
| "eval_samples_per_second": 91.443, |
| "eval_steps_per_second": 5.738, |
| "step": 23616 |
| }, |
| { |
| "epoch": 41.05902777777778, |
| "grad_norm": 3.5625, |
| "learning_rate": 1.5426974874038247e-05, |
| "loss": 2.8627, |
| "step": 23650 |
| }, |
| { |
| "epoch": 41.145833333333336, |
| "grad_norm": 2.96875, |
| "learning_rate": 1.51366930057398e-05, |
| "loss": 2.8606, |
| "step": 23700 |
| }, |
| { |
| "epoch": 41.232638888888886, |
| "grad_norm": 3.25, |
| "learning_rate": 1.4848944415882648e-05, |
| "loss": 2.8608, |
| "step": 23750 |
| }, |
| { |
| "epoch": 41.31944444444444, |
| "grad_norm": 2.9375, |
| "learning_rate": 1.4563737694130885e-05, |
| "loss": 2.8802, |
| "step": 23800 |
| }, |
| { |
| "epoch": 41.40625, |
| "grad_norm": 3.21875, |
| "learning_rate": 1.4281081354270564e-05, |
| "loss": 2.8615, |
| "step": 23850 |
| }, |
| { |
| "epoch": 41.49305555555556, |
| "grad_norm": 3.59375, |
| "learning_rate": 1.4000983833955594e-05, |
| "loss": 2.8829, |
| "step": 23900 |
| }, |
| { |
| "epoch": 41.579861111111114, |
| "grad_norm": 2.828125, |
| "learning_rate": 1.3723453494455784e-05, |
| "loss": 2.8665, |
| "step": 23950 |
| }, |
| { |
| "epoch": 41.666666666666664, |
| "grad_norm": 3.015625, |
| "learning_rate": 1.3448498620407345e-05, |
| "loss": 2.8761, |
| "step": 24000 |
| }, |
| { |
| "epoch": 41.75347222222222, |
| "grad_norm": 4.03125, |
| "learning_rate": 1.3176127419565564e-05, |
| "loss": 2.8624, |
| "step": 24050 |
| }, |
| { |
| "epoch": 41.84027777777778, |
| "grad_norm": 3.09375, |
| "learning_rate": 1.2906348022559755e-05, |
| "loss": 2.8687, |
| "step": 24100 |
| }, |
| { |
| "epoch": 41.927083333333336, |
| "grad_norm": 3.078125, |
| "learning_rate": 1.2639168482650532e-05, |
| "loss": 2.8575, |
| "step": 24150 |
| }, |
| { |
| "epoch": 42.0, |
| "eval_loss": 2.854001045227051, |
| "eval_runtime": 40.3709, |
| "eval_samples_per_second": 92.368, |
| "eval_steps_per_second": 5.796, |
| "step": 24192 |
| }, |
| { |
| "epoch": 42.013888888888886, |
| "grad_norm": 3.96875, |
| "learning_rate": 1.2374596775489477e-05, |
| "loss": 2.8656, |
| "step": 24200 |
| }, |
| { |
| "epoch": 42.10069444444444, |
| "grad_norm": 3.4375, |
| "learning_rate": 1.2112640798881058e-05, |
| "loss": 2.8625, |
| "step": 24250 |
| }, |
| { |
| "epoch": 42.1875, |
| "grad_norm": 3.53125, |
| "learning_rate": 1.1853308372546756e-05, |
| "loss": 2.8571, |
| "step": 24300 |
| }, |
| { |
| "epoch": 42.27430555555556, |
| "grad_norm": 3.21875, |
| "learning_rate": 1.1596607237891766e-05, |
| "loss": 2.8664, |
| "step": 24350 |
| }, |
| { |
| "epoch": 42.361111111111114, |
| "grad_norm": 3.125, |
| "learning_rate": 1.1342545057773846e-05, |
| "loss": 2.881, |
| "step": 24400 |
| }, |
| { |
| "epoch": 42.447916666666664, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.1091129416274603e-05, |
| "loss": 2.8614, |
| "step": 24450 |
| }, |
| { |
| "epoch": 42.53472222222222, |
| "grad_norm": 3.28125, |
| "learning_rate": 1.0842367818472988e-05, |
| "loss": 2.8773, |
| "step": 24500 |
| }, |
| { |
| "epoch": 42.62152777777778, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.0596267690221496e-05, |
| "loss": 2.874, |
| "step": 24550 |
| }, |
| { |
| "epoch": 42.708333333333336, |
| "grad_norm": 4.0625, |
| "learning_rate": 1.0352836377924202e-05, |
| "loss": 2.8666, |
| "step": 24600 |
| }, |
| { |
| "epoch": 42.795138888888886, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.0112081148317687e-05, |
| "loss": 2.8681, |
| "step": 24650 |
| }, |
| { |
| "epoch": 42.88194444444444, |
| "grad_norm": 3.15625, |
| "learning_rate": 9.874009188253974e-06, |
| "loss": 2.8575, |
| "step": 24700 |
| }, |
| { |
| "epoch": 42.96875, |
| "grad_norm": 3.046875, |
| "learning_rate": 9.63862760448616e-06, |
| "loss": 2.8666, |
| "step": 24750 |
| }, |
| { |
| "epoch": 43.0, |
| "eval_loss": 2.8540420532226562, |
| "eval_runtime": 42.3273, |
| "eval_samples_per_second": 88.099, |
| "eval_steps_per_second": 5.528, |
| "step": 24768 |
| }, |
| { |
| "epoch": 43.05555555555556, |
| "grad_norm": 3.265625, |
| "learning_rate": 9.405943423456043e-06, |
| "loss": 2.8636, |
| "step": 24800 |
| }, |
| { |
| "epoch": 43.142361111111114, |
| "grad_norm": 3.25, |
| "learning_rate": 9.175963591084546e-06, |
| "loss": 2.858, |
| "step": 24850 |
| }, |
| { |
| "epoch": 43.229166666666664, |
| "grad_norm": 4.59375, |
| "learning_rate": 8.948694972564343e-06, |
| "loss": 2.8629, |
| "step": 24900 |
| }, |
| { |
| "epoch": 43.31597222222222, |
| "grad_norm": 3.078125, |
| "learning_rate": 8.724144352154861e-06, |
| "loss": 2.8783, |
| "step": 24950 |
| }, |
| { |
| "epoch": 43.40277777777778, |
| "grad_norm": 3.515625, |
| "learning_rate": 8.502318432979806e-06, |
| "loss": 2.8623, |
| "step": 25000 |
| }, |
| { |
| "epoch": 43.489583333333336, |
| "grad_norm": 2.65625, |
| "learning_rate": 8.28322383682707e-06, |
| "loss": 2.8827, |
| "step": 25050 |
| }, |
| { |
| "epoch": 43.576388888888886, |
| "grad_norm": 3.203125, |
| "learning_rate": 8.066867103951082e-06, |
| "loss": 2.8631, |
| "step": 25100 |
| }, |
| { |
| "epoch": 43.66319444444444, |
| "grad_norm": 2.890625, |
| "learning_rate": 7.853254692877476e-06, |
| "loss": 2.8769, |
| "step": 25150 |
| }, |
| { |
| "epoch": 43.75, |
| "grad_norm": 3.078125, |
| "learning_rate": 7.642392980210423e-06, |
| "loss": 2.8654, |
| "step": 25200 |
| }, |
| { |
| "epoch": 43.83680555555556, |
| "grad_norm": 3.21875, |
| "learning_rate": 7.4342882604422125e-06, |
| "loss": 2.87, |
| "step": 25250 |
| }, |
| { |
| "epoch": 43.923611111111114, |
| "grad_norm": 3.34375, |
| "learning_rate": 7.228946745765364e-06, |
| "loss": 2.8584, |
| "step": 25300 |
| }, |
| { |
| "epoch": 44.0, |
| "eval_loss": 2.8539493083953857, |
| "eval_runtime": 42.0373, |
| "eval_samples_per_second": 88.707, |
| "eval_steps_per_second": 5.566, |
| "step": 25344 |
| }, |
| { |
| "epoch": 44.010416666666664, |
| "grad_norm": 2.90625, |
| "learning_rate": 7.026374565887117e-06, |
| "loss": 2.8638, |
| "step": 25350 |
| }, |
| { |
| "epoch": 44.09722222222222, |
| "grad_norm": 2.46875, |
| "learning_rate": 6.826577767846665e-06, |
| "loss": 2.8638, |
| "step": 25400 |
| }, |
| { |
| "epoch": 44.18402777777778, |
| "grad_norm": 3.078125, |
| "learning_rate": 6.629562315834348e-06, |
| "loss": 2.8536, |
| "step": 25450 |
| }, |
| { |
| "epoch": 44.270833333333336, |
| "grad_norm": 3.09375, |
| "learning_rate": 6.435334091013856e-06, |
| "loss": 2.8646, |
| "step": 25500 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 28800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.177294293290189e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |