diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15401 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.999656605198997, + "eval_steps": 1, + "global_step": 218400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004578597346702838, + "grad_norm": 1.834977626800537, + "learning_rate": 4.578754578754579e-07, + "loss": 0.7024, + "step": 100 + }, + { + "epoch": 0.009157194693405675, + "grad_norm": 1.148632287979126, + "learning_rate": 9.157509157509158e-07, + "loss": 0.6769, + "step": 200 + }, + { + "epoch": 0.013735792040108512, + "grad_norm": 0.9439795613288879, + "learning_rate": 1.3736263736263736e-06, + "loss": 0.6151, + "step": 300 + }, + { + "epoch": 0.01831438938681135, + "grad_norm": 0.7975717186927795, + "learning_rate": 1.8315018315018316e-06, + "loss": 0.5009, + "step": 400 + }, + { + "epoch": 0.022892986733514187, + "grad_norm": 0.6828369498252869, + "learning_rate": 2.2893772893772894e-06, + "loss": 0.4383, + "step": 500 + }, + { + "epoch": 0.027471584080217024, + "grad_norm": 0.7334387302398682, + "learning_rate": 2.747252747252747e-06, + "loss": 0.3948, + "step": 600 + }, + { + "epoch": 0.032050181426919865, + "grad_norm": 0.5922483205795288, + "learning_rate": 3.205128205128205e-06, + "loss": 0.3682, + "step": 700 + }, + { + "epoch": 0.0366287787736227, + "grad_norm": 0.42683616280555725, + "learning_rate": 3.663003663003663e-06, + "loss": 0.3471, + "step": 800 + }, + { + "epoch": 0.04120737612032554, + "grad_norm": 0.798675537109375, + "learning_rate": 4.120879120879121e-06, + "loss": 0.3391, + "step": 900 + }, + { + "epoch": 0.045785973467028375, + "grad_norm": 0.6987248659133911, + "learning_rate": 4.578754578754579e-06, + "loss": 0.3213, + "step": 1000 + }, + { + "epoch": 0.05036457081373121, + "grad_norm": 0.697691798210144, + "learning_rate": 5.036630036630037e-06, + "loss": 0.3021, + "step": 1100 + }, + { + "epoch": 0.05494316816043405, + "grad_norm": 1.0107570886611938, + "learning_rate": 5.494505494505494e-06, + "loss": 0.2854, + "step": 1200 + }, + { + "epoch": 0.059521765507136885, + "grad_norm": 0.9664294719696045, + "learning_rate": 5.9523809523809525e-06, + "loss": 0.2712, + "step": 1300 + }, + { + "epoch": 0.06410036285383973, + "grad_norm": 0.8629726767539978, + "learning_rate": 6.41025641025641e-06, + "loss": 0.2591, + "step": 1400 + }, + { + "epoch": 0.06867896020054257, + "grad_norm": 1.2182508707046509, + "learning_rate": 6.868131868131869e-06, + "loss": 0.2523, + "step": 1500 + }, + { + "epoch": 0.0732575575472454, + "grad_norm": 0.8369842171669006, + "learning_rate": 7.326007326007326e-06, + "loss": 0.2498, + "step": 1600 + }, + { + "epoch": 0.07783615489394824, + "grad_norm": 1.195522427558899, + "learning_rate": 7.783882783882785e-06, + "loss": 0.2345, + "step": 1700 + }, + { + "epoch": 0.08241475224065108, + "grad_norm": 1.7609663009643555, + "learning_rate": 8.241758241758243e-06, + "loss": 0.226, + "step": 1800 + }, + { + "epoch": 0.08699334958735391, + "grad_norm": 1.2075083255767822, + "learning_rate": 8.6996336996337e-06, + "loss": 0.2187, + "step": 1900 + }, + { + "epoch": 0.09157194693405675, + "grad_norm": 1.0836577415466309, + "learning_rate": 9.157509157509158e-06, + "loss": 0.2121, + "step": 2000 + }, + { + "epoch": 0.09615054428075959, + "grad_norm": 0.962322473526001, + "learning_rate": 9.615384615384616e-06, + "loss": 0.1984, + "step": 2100 + }, + { + "epoch": 0.10072914162746242, + "grad_norm": 0.6929520964622498, + "learning_rate": 1.0073260073260074e-05, + "loss": 0.1949, + "step": 2200 + }, + { + "epoch": 0.10530773897416526, + "grad_norm": 1.0407646894454956, + "learning_rate": 1.0531135531135532e-05, + "loss": 0.1968, + "step": 2300 + }, + { + "epoch": 0.1098863363208681, + "grad_norm": 1.3924702405929565, + "learning_rate": 1.0989010989010989e-05, + "loss": 0.1932, + "step": 2400 + }, + { + "epoch": 0.11446493366757093, + "grad_norm": 1.2128703594207764, + "learning_rate": 1.1446886446886447e-05, + "loss": 0.1855, + "step": 2500 + }, + { + "epoch": 0.11904353101427377, + "grad_norm": 1.8191227912902832, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.1866, + "step": 2600 + }, + { + "epoch": 0.12362212836097662, + "grad_norm": 1.528939127922058, + "learning_rate": 1.2362637362637363e-05, + "loss": 0.1798, + "step": 2700 + }, + { + "epoch": 0.12820072570767946, + "grad_norm": 1.4381574392318726, + "learning_rate": 1.282051282051282e-05, + "loss": 0.1788, + "step": 2800 + }, + { + "epoch": 0.13277932305438228, + "grad_norm": 1.9535735845565796, + "learning_rate": 1.327838827838828e-05, + "loss": 0.1751, + "step": 2900 + }, + { + "epoch": 0.13735792040108513, + "grad_norm": 1.095574140548706, + "learning_rate": 1.3736263736263738e-05, + "loss": 0.1745, + "step": 3000 + }, + { + "epoch": 0.14193651774778795, + "grad_norm": 1.4498175382614136, + "learning_rate": 1.4194139194139194e-05, + "loss": 0.1654, + "step": 3100 + }, + { + "epoch": 0.1465151150944908, + "grad_norm": 1.274528980255127, + "learning_rate": 1.4652014652014653e-05, + "loss": 0.1759, + "step": 3200 + }, + { + "epoch": 0.15109371244119363, + "grad_norm": 2.9102790355682373, + "learning_rate": 1.510989010989011e-05, + "loss": 0.1663, + "step": 3300 + }, + { + "epoch": 0.15567230978789648, + "grad_norm": 0.8863092660903931, + "learning_rate": 1.556776556776557e-05, + "loss": 0.1613, + "step": 3400 + }, + { + "epoch": 0.1602509071345993, + "grad_norm": 1.825391411781311, + "learning_rate": 1.602564102564103e-05, + "loss": 0.1655, + "step": 3500 + }, + { + "epoch": 0.16482950448130215, + "grad_norm": 1.2341893911361694, + "learning_rate": 1.6483516483516486e-05, + "loss": 0.1532, + "step": 3600 + }, + { + "epoch": 0.169408101828005, + "grad_norm": 1.6574805974960327, + "learning_rate": 1.6941391941391942e-05, + "loss": 0.1485, + "step": 3700 + }, + { + "epoch": 0.17398669917470783, + "grad_norm": 1.424926996231079, + "learning_rate": 1.73992673992674e-05, + "loss": 0.1514, + "step": 3800 + }, + { + "epoch": 0.17856529652141068, + "grad_norm": 1.5658457279205322, + "learning_rate": 1.785714285714286e-05, + "loss": 0.1532, + "step": 3900 + }, + { + "epoch": 0.1831438938681135, + "grad_norm": 2.2447550296783447, + "learning_rate": 1.8315018315018315e-05, + "loss": 0.1536, + "step": 4000 + }, + { + "epoch": 0.18772249121481635, + "grad_norm": 1.8312195539474487, + "learning_rate": 1.8772893772893775e-05, + "loss": 0.152, + "step": 4100 + }, + { + "epoch": 0.19230108856151917, + "grad_norm": 4.884443283081055, + "learning_rate": 1.923076923076923e-05, + "loss": 0.1423, + "step": 4200 + }, + { + "epoch": 0.19687968590822202, + "grad_norm": 2.421905994415283, + "learning_rate": 1.9688644688644688e-05, + "loss": 0.1493, + "step": 4300 + }, + { + "epoch": 0.20145828325492485, + "grad_norm": 1.5298246145248413, + "learning_rate": 2.0146520146520148e-05, + "loss": 0.1399, + "step": 4400 + }, + { + "epoch": 0.2060368806016277, + "grad_norm": 1.8627902269363403, + "learning_rate": 2.0604395604395604e-05, + "loss": 0.1418, + "step": 4500 + }, + { + "epoch": 0.21061547794833052, + "grad_norm": 1.0591548681259155, + "learning_rate": 2.1062271062271064e-05, + "loss": 0.146, + "step": 4600 + }, + { + "epoch": 0.21519407529503337, + "grad_norm": 2.3305251598358154, + "learning_rate": 2.152014652014652e-05, + "loss": 0.1394, + "step": 4700 + }, + { + "epoch": 0.2197726726417362, + "grad_norm": 2.5741324424743652, + "learning_rate": 2.1978021978021977e-05, + "loss": 0.1357, + "step": 4800 + }, + { + "epoch": 0.22435126998843904, + "grad_norm": 1.3497207164764404, + "learning_rate": 2.2435897435897437e-05, + "loss": 0.1293, + "step": 4900 + }, + { + "epoch": 0.22892986733514187, + "grad_norm": 1.5644819736480713, + "learning_rate": 2.2893772893772894e-05, + "loss": 0.132, + "step": 5000 + }, + { + "epoch": 0.23350846468184472, + "grad_norm": 1.2510719299316406, + "learning_rate": 2.3351648351648354e-05, + "loss": 0.1266, + "step": 5100 + }, + { + "epoch": 0.23808706202854754, + "grad_norm": 2.4705810546875, + "learning_rate": 2.380952380952381e-05, + "loss": 0.1334, + "step": 5200 + }, + { + "epoch": 0.2426656593752504, + "grad_norm": 2.0139317512512207, + "learning_rate": 2.4267399267399267e-05, + "loss": 0.1336, + "step": 5300 + }, + { + "epoch": 0.24724425672195324, + "grad_norm": 1.2344926595687866, + "learning_rate": 2.4725274725274727e-05, + "loss": 0.1309, + "step": 5400 + }, + { + "epoch": 0.25182285406865607, + "grad_norm": 1.8490768671035767, + "learning_rate": 2.5183150183150183e-05, + "loss": 0.1337, + "step": 5500 + }, + { + "epoch": 0.2564014514153589, + "grad_norm": 1.2988219261169434, + "learning_rate": 2.564102564102564e-05, + "loss": 0.1373, + "step": 5600 + }, + { + "epoch": 0.26098004876206177, + "grad_norm": 1.7260547876358032, + "learning_rate": 2.6098901098901103e-05, + "loss": 0.1199, + "step": 5700 + }, + { + "epoch": 0.26555864610876456, + "grad_norm": 2.653820037841797, + "learning_rate": 2.655677655677656e-05, + "loss": 0.1175, + "step": 5800 + }, + { + "epoch": 0.2701372434554674, + "grad_norm": 2.190546989440918, + "learning_rate": 2.7014652014652016e-05, + "loss": 0.1216, + "step": 5900 + }, + { + "epoch": 0.27471584080217026, + "grad_norm": 1.3163684606552124, + "learning_rate": 2.7472527472527476e-05, + "loss": 0.127, + "step": 6000 + }, + { + "epoch": 0.2792944381488731, + "grad_norm": 2.4772284030914307, + "learning_rate": 2.7930402930402932e-05, + "loss": 0.1218, + "step": 6100 + }, + { + "epoch": 0.2838730354955759, + "grad_norm": 2.5586929321289062, + "learning_rate": 2.838827838827839e-05, + "loss": 0.1258, + "step": 6200 + }, + { + "epoch": 0.28845163284227876, + "grad_norm": 1.8947139978408813, + "learning_rate": 2.8846153846153845e-05, + "loss": 0.1242, + "step": 6300 + }, + { + "epoch": 0.2930302301889816, + "grad_norm": 2.729238271713257, + "learning_rate": 2.9304029304029305e-05, + "loss": 0.1183, + "step": 6400 + }, + { + "epoch": 0.29760882753568446, + "grad_norm": 1.338982343673706, + "learning_rate": 2.9761904761904762e-05, + "loss": 0.1203, + "step": 6500 + }, + { + "epoch": 0.30218742488238726, + "grad_norm": 1.6393356323242188, + "learning_rate": 3.021978021978022e-05, + "loss": 0.1173, + "step": 6600 + }, + { + "epoch": 0.3067660222290901, + "grad_norm": 2.4386088848114014, + "learning_rate": 3.067765567765568e-05, + "loss": 0.1262, + "step": 6700 + }, + { + "epoch": 0.31134461957579296, + "grad_norm": 1.6236072778701782, + "learning_rate": 3.113553113553114e-05, + "loss": 0.1163, + "step": 6800 + }, + { + "epoch": 0.3159232169224958, + "grad_norm": 1.6855531930923462, + "learning_rate": 3.1593406593406595e-05, + "loss": 0.1116, + "step": 6900 + }, + { + "epoch": 0.3205018142691986, + "grad_norm": 0.9769238233566284, + "learning_rate": 3.205128205128206e-05, + "loss": 0.1152, + "step": 7000 + }, + { + "epoch": 0.32508041161590145, + "grad_norm": 2.372692823410034, + "learning_rate": 3.2509157509157515e-05, + "loss": 0.1148, + "step": 7100 + }, + { + "epoch": 0.3296590089626043, + "grad_norm": 1.6294013261795044, + "learning_rate": 3.296703296703297e-05, + "loss": 0.1149, + "step": 7200 + }, + { + "epoch": 0.33423760630930716, + "grad_norm": 1.4730180501937866, + "learning_rate": 3.342490842490843e-05, + "loss": 0.113, + "step": 7300 + }, + { + "epoch": 0.33881620365601, + "grad_norm": 1.435680866241455, + "learning_rate": 3.3882783882783884e-05, + "loss": 0.1084, + "step": 7400 + }, + { + "epoch": 0.3433948010027128, + "grad_norm": 1.2709417343139648, + "learning_rate": 3.434065934065934e-05, + "loss": 0.1076, + "step": 7500 + }, + { + "epoch": 0.34797339834941565, + "grad_norm": 2.1665501594543457, + "learning_rate": 3.47985347985348e-05, + "loss": 0.1074, + "step": 7600 + }, + { + "epoch": 0.3525519956961185, + "grad_norm": 1.0768879652023315, + "learning_rate": 3.525641025641026e-05, + "loss": 0.1124, + "step": 7700 + }, + { + "epoch": 0.35713059304282135, + "grad_norm": 2.1648874282836914, + "learning_rate": 3.571428571428572e-05, + "loss": 0.1139, + "step": 7800 + }, + { + "epoch": 0.36170919038952415, + "grad_norm": 1.0463404655456543, + "learning_rate": 3.6172161172161173e-05, + "loss": 0.1084, + "step": 7900 + }, + { + "epoch": 0.366287787736227, + "grad_norm": 2.0209906101226807, + "learning_rate": 3.663003663003663e-05, + "loss": 0.1143, + "step": 8000 + }, + { + "epoch": 0.37086638508292985, + "grad_norm": 1.6264885663986206, + "learning_rate": 3.708791208791209e-05, + "loss": 0.1117, + "step": 8100 + }, + { + "epoch": 0.3754449824296327, + "grad_norm": 1.8169121742248535, + "learning_rate": 3.754578754578755e-05, + "loss": 0.1055, + "step": 8200 + }, + { + "epoch": 0.3800235797763355, + "grad_norm": 1.3127943277359009, + "learning_rate": 3.8003663003663006e-05, + "loss": 0.1135, + "step": 8300 + }, + { + "epoch": 0.38460217712303835, + "grad_norm": 1.2721083164215088, + "learning_rate": 3.846153846153846e-05, + "loss": 0.1144, + "step": 8400 + }, + { + "epoch": 0.3891807744697412, + "grad_norm": 1.393925666809082, + "learning_rate": 3.891941391941392e-05, + "loss": 0.106, + "step": 8500 + }, + { + "epoch": 0.39375937181644405, + "grad_norm": 1.0821542739868164, + "learning_rate": 3.9377289377289376e-05, + "loss": 0.105, + "step": 8600 + }, + { + "epoch": 0.39833796916314684, + "grad_norm": 1.5736069679260254, + "learning_rate": 3.983516483516483e-05, + "loss": 0.111, + "step": 8700 + }, + { + "epoch": 0.4029165665098497, + "grad_norm": 1.8037768602371216, + "learning_rate": 4.0293040293040296e-05, + "loss": 0.1094, + "step": 8800 + }, + { + "epoch": 0.40749516385655254, + "grad_norm": 1.1317250728607178, + "learning_rate": 4.075091575091575e-05, + "loss": 0.1028, + "step": 8900 + }, + { + "epoch": 0.4120737612032554, + "grad_norm": 1.362167477607727, + "learning_rate": 4.120879120879121e-05, + "loss": 0.1087, + "step": 9000 + }, + { + "epoch": 0.41665235854995825, + "grad_norm": 1.9178133010864258, + "learning_rate": 4.166666666666667e-05, + "loss": 0.1036, + "step": 9100 + }, + { + "epoch": 0.42123095589666104, + "grad_norm": 1.3326084613800049, + "learning_rate": 4.212454212454213e-05, + "loss": 0.1034, + "step": 9200 + }, + { + "epoch": 0.4258095532433639, + "grad_norm": 2.299654245376587, + "learning_rate": 4.2582417582417585e-05, + "loss": 0.0938, + "step": 9300 + }, + { + "epoch": 0.43038815059006674, + "grad_norm": 1.5850861072540283, + "learning_rate": 4.304029304029304e-05, + "loss": 0.0991, + "step": 9400 + }, + { + "epoch": 0.4349667479367696, + "grad_norm": 1.0600929260253906, + "learning_rate": 4.34981684981685e-05, + "loss": 0.1001, + "step": 9500 + }, + { + "epoch": 0.4395453452834724, + "grad_norm": 0.8734288811683655, + "learning_rate": 4.3956043956043955e-05, + "loss": 0.0991, + "step": 9600 + }, + { + "epoch": 0.44412394263017524, + "grad_norm": 1.5875756740570068, + "learning_rate": 4.441391941391941e-05, + "loss": 0.0982, + "step": 9700 + }, + { + "epoch": 0.4487025399768781, + "grad_norm": 1.2083957195281982, + "learning_rate": 4.4871794871794874e-05, + "loss": 0.1027, + "step": 9800 + }, + { + "epoch": 0.45328113732358094, + "grad_norm": 1.5730398893356323, + "learning_rate": 4.532967032967033e-05, + "loss": 0.1035, + "step": 9900 + }, + { + "epoch": 0.45785973467028374, + "grad_norm": 1.0928138494491577, + "learning_rate": 4.578754578754579e-05, + "loss": 0.1017, + "step": 10000 + }, + { + "epoch": 0.4624383320169866, + "grad_norm": 1.858508586883545, + "learning_rate": 4.624542124542125e-05, + "loss": 0.1056, + "step": 10100 + }, + { + "epoch": 0.46701692936368944, + "grad_norm": 0.7009546756744385, + "learning_rate": 4.670329670329671e-05, + "loss": 0.1014, + "step": 10200 + }, + { + "epoch": 0.4715955267103923, + "grad_norm": 1.1056081056594849, + "learning_rate": 4.7161172161172164e-05, + "loss": 0.0997, + "step": 10300 + }, + { + "epoch": 0.4761741240570951, + "grad_norm": 1.5328575372695923, + "learning_rate": 4.761904761904762e-05, + "loss": 0.0971, + "step": 10400 + }, + { + "epoch": 0.48075272140379793, + "grad_norm": 2.9480137825012207, + "learning_rate": 4.8076923076923084e-05, + "loss": 0.1004, + "step": 10500 + }, + { + "epoch": 0.4853313187505008, + "grad_norm": 0.9198638796806335, + "learning_rate": 4.8534798534798533e-05, + "loss": 0.093, + "step": 10600 + }, + { + "epoch": 0.48990991609720363, + "grad_norm": 1.3510689735412598, + "learning_rate": 4.899267399267399e-05, + "loss": 0.0966, + "step": 10700 + }, + { + "epoch": 0.4944885134439065, + "grad_norm": 1.1206891536712646, + "learning_rate": 4.945054945054945e-05, + "loss": 0.1007, + "step": 10800 + }, + { + "epoch": 0.4990671107906093, + "grad_norm": 1.6948041915893555, + "learning_rate": 4.990842490842491e-05, + "loss": 0.0904, + "step": 10900 + }, + { + "epoch": 0.5036457081373121, + "grad_norm": 0.7195038199424744, + "learning_rate": 5.0366300366300366e-05, + "loss": 0.0894, + "step": 11000 + }, + { + "epoch": 0.508224305484015, + "grad_norm": 0.9326936602592468, + "learning_rate": 5.082417582417582e-05, + "loss": 0.0935, + "step": 11100 + }, + { + "epoch": 0.5128029028307178, + "grad_norm": 1.224360704421997, + "learning_rate": 5.128205128205128e-05, + "loss": 0.1011, + "step": 11200 + }, + { + "epoch": 0.5173815001774207, + "grad_norm": 0.7471579313278198, + "learning_rate": 5.173992673992675e-05, + "loss": 0.0936, + "step": 11300 + }, + { + "epoch": 0.5219600975241235, + "grad_norm": 0.8234615921974182, + "learning_rate": 5.2197802197802206e-05, + "loss": 0.092, + "step": 11400 + }, + { + "epoch": 0.5265386948708263, + "grad_norm": 1.204841136932373, + "learning_rate": 5.265567765567766e-05, + "loss": 0.1006, + "step": 11500 + }, + { + "epoch": 0.5311172922175291, + "grad_norm": 0.980890691280365, + "learning_rate": 5.311355311355312e-05, + "loss": 0.0909, + "step": 11600 + }, + { + "epoch": 0.535695889564232, + "grad_norm": 0.8736656308174133, + "learning_rate": 5.3571428571428575e-05, + "loss": 0.0921, + "step": 11700 + }, + { + "epoch": 0.5402744869109348, + "grad_norm": 1.8916438817977905, + "learning_rate": 5.402930402930403e-05, + "loss": 0.092, + "step": 11800 + }, + { + "epoch": 0.5448530842576377, + "grad_norm": 0.49095866084098816, + "learning_rate": 5.448717948717948e-05, + "loss": 0.0922, + "step": 11900 + }, + { + "epoch": 0.5494316816043405, + "grad_norm": 1.418338656425476, + "learning_rate": 5.494505494505495e-05, + "loss": 0.088, + "step": 12000 + }, + { + "epoch": 0.5540102789510434, + "grad_norm": 0.6211123466491699, + "learning_rate": 5.540293040293041e-05, + "loss": 0.0931, + "step": 12100 + }, + { + "epoch": 0.5585888762977462, + "grad_norm": 1.9046452045440674, + "learning_rate": 5.5860805860805865e-05, + "loss": 0.0934, + "step": 12200 + }, + { + "epoch": 0.563167473644449, + "grad_norm": 0.9247643351554871, + "learning_rate": 5.631868131868132e-05, + "loss": 0.0889, + "step": 12300 + }, + { + "epoch": 0.5677460709911518, + "grad_norm": 1.4018969535827637, + "learning_rate": 5.677655677655678e-05, + "loss": 0.0904, + "step": 12400 + }, + { + "epoch": 0.5723246683378547, + "grad_norm": 0.510405421257019, + "learning_rate": 5.7234432234432234e-05, + "loss": 0.0856, + "step": 12500 + }, + { + "epoch": 0.5769032656845575, + "grad_norm": 0.7951760292053223, + "learning_rate": 5.769230769230769e-05, + "loss": 0.0881, + "step": 12600 + }, + { + "epoch": 0.5814818630312604, + "grad_norm": 1.340402364730835, + "learning_rate": 5.8150183150183154e-05, + "loss": 0.0869, + "step": 12700 + }, + { + "epoch": 0.5860604603779632, + "grad_norm": 1.1029311418533325, + "learning_rate": 5.860805860805861e-05, + "loss": 0.0922, + "step": 12800 + }, + { + "epoch": 0.5906390577246661, + "grad_norm": 0.9942110180854797, + "learning_rate": 5.906593406593407e-05, + "loss": 0.094, + "step": 12900 + }, + { + "epoch": 0.5952176550713689, + "grad_norm": 0.9533814787864685, + "learning_rate": 5.9523809523809524e-05, + "loss": 0.0846, + "step": 13000 + }, + { + "epoch": 0.5997962524180718, + "grad_norm": 0.9805833101272583, + "learning_rate": 5.998168498168498e-05, + "loss": 0.0889, + "step": 13100 + }, + { + "epoch": 0.6043748497647745, + "grad_norm": 0.6185852885246277, + "learning_rate": 6.043956043956044e-05, + "loss": 0.0832, + "step": 13200 + }, + { + "epoch": 0.6089534471114774, + "grad_norm": 0.508185088634491, + "learning_rate": 6.089743589743589e-05, + "loss": 0.0828, + "step": 13300 + }, + { + "epoch": 0.6135320444581802, + "grad_norm": 0.8816813826560974, + "learning_rate": 6.135531135531136e-05, + "loss": 0.0907, + "step": 13400 + }, + { + "epoch": 0.6181106418048831, + "grad_norm": 0.7851380109786987, + "learning_rate": 6.181318681318681e-05, + "loss": 0.0838, + "step": 13500 + }, + { + "epoch": 0.6226892391515859, + "grad_norm": 1.2309856414794922, + "learning_rate": 6.227106227106228e-05, + "loss": 0.0892, + "step": 13600 + }, + { + "epoch": 0.6272678364982888, + "grad_norm": 0.9368516802787781, + "learning_rate": 6.272893772893773e-05, + "loss": 0.0826, + "step": 13700 + }, + { + "epoch": 0.6318464338449916, + "grad_norm": 2.122927188873291, + "learning_rate": 6.318681318681319e-05, + "loss": 0.0891, + "step": 13800 + }, + { + "epoch": 0.6364250311916945, + "grad_norm": 1.421099305152893, + "learning_rate": 6.364468864468864e-05, + "loss": 0.0856, + "step": 13900 + }, + { + "epoch": 0.6410036285383972, + "grad_norm": 1.240886926651001, + "learning_rate": 6.410256410256412e-05, + "loss": 0.084, + "step": 14000 + }, + { + "epoch": 0.6455822258851001, + "grad_norm": 1.5990924835205078, + "learning_rate": 6.456043956043957e-05, + "loss": 0.08, + "step": 14100 + }, + { + "epoch": 0.6501608232318029, + "grad_norm": 1.1593393087387085, + "learning_rate": 6.501831501831503e-05, + "loss": 0.0872, + "step": 14200 + }, + { + "epoch": 0.6547394205785058, + "grad_norm": 1.4352833032608032, + "learning_rate": 6.547619047619048e-05, + "loss": 0.0855, + "step": 14300 + }, + { + "epoch": 0.6593180179252086, + "grad_norm": 1.0805554389953613, + "learning_rate": 6.593406593406594e-05, + "loss": 0.076, + "step": 14400 + }, + { + "epoch": 0.6638966152719115, + "grad_norm": 1.4789384603500366, + "learning_rate": 6.639194139194139e-05, + "loss": 0.0787, + "step": 14500 + }, + { + "epoch": 0.6684752126186143, + "grad_norm": 0.5183298587799072, + "learning_rate": 6.684981684981686e-05, + "loss": 0.0818, + "step": 14600 + }, + { + "epoch": 0.6730538099653172, + "grad_norm": 0.540027916431427, + "learning_rate": 6.730769230769232e-05, + "loss": 0.0821, + "step": 14700 + }, + { + "epoch": 0.67763240731202, + "grad_norm": 0.4286615550518036, + "learning_rate": 6.776556776556777e-05, + "loss": 0.0835, + "step": 14800 + }, + { + "epoch": 0.6822110046587228, + "grad_norm": 0.6089257597923279, + "learning_rate": 6.822344322344323e-05, + "loss": 0.0774, + "step": 14900 + }, + { + "epoch": 0.6867896020054256, + "grad_norm": 1.8646626472473145, + "learning_rate": 6.868131868131868e-05, + "loss": 0.0768, + "step": 15000 + }, + { + "epoch": 0.6913681993521285, + "grad_norm": 1.5041414499282837, + "learning_rate": 6.913919413919414e-05, + "loss": 0.0811, + "step": 15100 + }, + { + "epoch": 0.6959467966988313, + "grad_norm": 1.2774971723556519, + "learning_rate": 6.95970695970696e-05, + "loss": 0.0824, + "step": 15200 + }, + { + "epoch": 0.7005253940455342, + "grad_norm": 0.7839298248291016, + "learning_rate": 7.005494505494506e-05, + "loss": 0.0825, + "step": 15300 + }, + { + "epoch": 0.705103991392237, + "grad_norm": 0.8007500767707825, + "learning_rate": 7.051282051282052e-05, + "loss": 0.0822, + "step": 15400 + }, + { + "epoch": 0.7096825887389399, + "grad_norm": 0.9601584672927856, + "learning_rate": 7.097069597069597e-05, + "loss": 0.0735, + "step": 15500 + }, + { + "epoch": 0.7142611860856427, + "grad_norm": 0.6283702254295349, + "learning_rate": 7.142857142857143e-05, + "loss": 0.0834, + "step": 15600 + }, + { + "epoch": 0.7188397834323454, + "grad_norm": 0.9371336102485657, + "learning_rate": 7.188644688644688e-05, + "loss": 0.0805, + "step": 15700 + }, + { + "epoch": 0.7234183807790483, + "grad_norm": 0.703433096408844, + "learning_rate": 7.234432234432235e-05, + "loss": 0.0797, + "step": 15800 + }, + { + "epoch": 0.7279969781257511, + "grad_norm": 1.1103012561798096, + "learning_rate": 7.28021978021978e-05, + "loss": 0.0702, + "step": 15900 + }, + { + "epoch": 0.732575575472454, + "grad_norm": 1.1333719491958618, + "learning_rate": 7.326007326007326e-05, + "loss": 0.0785, + "step": 16000 + }, + { + "epoch": 0.7371541728191568, + "grad_norm": 1.4945460557937622, + "learning_rate": 7.371794871794872e-05, + "loss": 0.0759, + "step": 16100 + }, + { + "epoch": 0.7417327701658597, + "grad_norm": 1.2516579627990723, + "learning_rate": 7.417582417582419e-05, + "loss": 0.0773, + "step": 16200 + }, + { + "epoch": 0.7463113675125626, + "grad_norm": 0.6910843253135681, + "learning_rate": 7.463369963369964e-05, + "loss": 0.0791, + "step": 16300 + }, + { + "epoch": 0.7508899648592654, + "grad_norm": 1.752681851387024, + "learning_rate": 7.50915750915751e-05, + "loss": 0.0729, + "step": 16400 + }, + { + "epoch": 0.7554685622059683, + "grad_norm": 0.46465998888015747, + "learning_rate": 7.554945054945055e-05, + "loss": 0.0691, + "step": 16500 + }, + { + "epoch": 0.760047159552671, + "grad_norm": 0.6676632165908813, + "learning_rate": 7.600732600732601e-05, + "loss": 0.0773, + "step": 16600 + }, + { + "epoch": 0.7646257568993738, + "grad_norm": 0.5497579574584961, + "learning_rate": 7.646520146520146e-05, + "loss": 0.0734, + "step": 16700 + }, + { + "epoch": 0.7692043542460767, + "grad_norm": 1.6269124746322632, + "learning_rate": 7.692307692307693e-05, + "loss": 0.075, + "step": 16800 + }, + { + "epoch": 0.7737829515927795, + "grad_norm": 1.7342535257339478, + "learning_rate": 7.738095238095239e-05, + "loss": 0.0738, + "step": 16900 + }, + { + "epoch": 0.7783615489394824, + "grad_norm": 0.5286089181900024, + "learning_rate": 7.783882783882784e-05, + "loss": 0.0791, + "step": 17000 + }, + { + "epoch": 0.7829401462861852, + "grad_norm": 1.0948727130889893, + "learning_rate": 7.82967032967033e-05, + "loss": 0.074, + "step": 17100 + }, + { + "epoch": 0.7875187436328881, + "grad_norm": 0.7580143809318542, + "learning_rate": 7.875457875457875e-05, + "loss": 0.0776, + "step": 17200 + }, + { + "epoch": 0.792097340979591, + "grad_norm": 0.6144015789031982, + "learning_rate": 7.921245421245422e-05, + "loss": 0.0698, + "step": 17300 + }, + { + "epoch": 0.7966759383262937, + "grad_norm": 1.054747462272644, + "learning_rate": 7.967032967032966e-05, + "loss": 0.0773, + "step": 17400 + }, + { + "epoch": 0.8012545356729965, + "grad_norm": 0.7159505486488342, + "learning_rate": 8.012820512820514e-05, + "loss": 0.0751, + "step": 17500 + }, + { + "epoch": 0.8058331330196994, + "grad_norm": 0.7566177248954773, + "learning_rate": 8.058608058608059e-05, + "loss": 0.0734, + "step": 17600 + }, + { + "epoch": 0.8104117303664022, + "grad_norm": 0.6282426714897156, + "learning_rate": 8.104395604395605e-05, + "loss": 0.0778, + "step": 17700 + }, + { + "epoch": 0.8149903277131051, + "grad_norm": 1.3555270433425903, + "learning_rate": 8.15018315018315e-05, + "loss": 0.0702, + "step": 17800 + }, + { + "epoch": 0.8195689250598079, + "grad_norm": 0.43876418471336365, + "learning_rate": 8.195970695970697e-05, + "loss": 0.0736, + "step": 17900 + }, + { + "epoch": 0.8241475224065108, + "grad_norm": 0.8096747994422913, + "learning_rate": 8.241758241758242e-05, + "loss": 0.0743, + "step": 18000 + }, + { + "epoch": 0.8287261197532136, + "grad_norm": 0.5688252449035645, + "learning_rate": 8.287545787545788e-05, + "loss": 0.0701, + "step": 18100 + }, + { + "epoch": 0.8333047170999165, + "grad_norm": 0.711829662322998, + "learning_rate": 8.333333333333334e-05, + "loss": 0.0795, + "step": 18200 + }, + { + "epoch": 0.8378833144466192, + "grad_norm": 0.9951382875442505, + "learning_rate": 8.37912087912088e-05, + "loss": 0.0752, + "step": 18300 + }, + { + "epoch": 0.8424619117933221, + "grad_norm": 1.2362946271896362, + "learning_rate": 8.424908424908426e-05, + "loss": 0.0726, + "step": 18400 + }, + { + "epoch": 0.8470405091400249, + "grad_norm": 0.6342608332633972, + "learning_rate": 8.470695970695971e-05, + "loss": 0.0784, + "step": 18500 + }, + { + "epoch": 0.8516191064867278, + "grad_norm": 0.4258309006690979, + "learning_rate": 8.516483516483517e-05, + "loss": 0.0725, + "step": 18600 + }, + { + "epoch": 0.8561977038334306, + "grad_norm": 0.6683163642883301, + "learning_rate": 8.562271062271062e-05, + "loss": 0.0711, + "step": 18700 + }, + { + "epoch": 0.8607763011801335, + "grad_norm": 0.7911510467529297, + "learning_rate": 8.608058608058608e-05, + "loss": 0.0683, + "step": 18800 + }, + { + "epoch": 0.8653548985268363, + "grad_norm": 0.5352203845977783, + "learning_rate": 8.653846153846155e-05, + "loss": 0.0702, + "step": 18900 + }, + { + "epoch": 0.8699334958735392, + "grad_norm": 0.850853443145752, + "learning_rate": 8.6996336996337e-05, + "loss": 0.0702, + "step": 19000 + }, + { + "epoch": 0.8745120932202419, + "grad_norm": 0.5566896796226501, + "learning_rate": 8.745421245421246e-05, + "loss": 0.0764, + "step": 19100 + }, + { + "epoch": 0.8790906905669448, + "grad_norm": 0.28583312034606934, + "learning_rate": 8.791208791208791e-05, + "loss": 0.0701, + "step": 19200 + }, + { + "epoch": 0.8836692879136476, + "grad_norm": 0.4633546471595764, + "learning_rate": 8.836996336996337e-05, + "loss": 0.0748, + "step": 19300 + }, + { + "epoch": 0.8882478852603505, + "grad_norm": 0.6778764724731445, + "learning_rate": 8.882783882783882e-05, + "loss": 0.0719, + "step": 19400 + }, + { + "epoch": 0.8928264826070533, + "grad_norm": 0.9359253644943237, + "learning_rate": 8.92857142857143e-05, + "loss": 0.0729, + "step": 19500 + }, + { + "epoch": 0.8974050799537562, + "grad_norm": 4.642319679260254, + "learning_rate": 8.974358974358975e-05, + "loss": 0.0704, + "step": 19600 + }, + { + "epoch": 0.901983677300459, + "grad_norm": 1.6843513250350952, + "learning_rate": 9.020146520146521e-05, + "loss": 0.0703, + "step": 19700 + }, + { + "epoch": 0.9065622746471619, + "grad_norm": 0.6702886819839478, + "learning_rate": 9.065934065934066e-05, + "loss": 0.072, + "step": 19800 + }, + { + "epoch": 0.9111408719938647, + "grad_norm": 0.7958008646965027, + "learning_rate": 9.111721611721613e-05, + "loss": 0.0717, + "step": 19900 + }, + { + "epoch": 0.9157194693405675, + "grad_norm": 0.41371116042137146, + "learning_rate": 9.157509157509158e-05, + "loss": 0.0701, + "step": 20000 + }, + { + "epoch": 0.9202980666872703, + "grad_norm": 0.446638286113739, + "learning_rate": 9.203296703296704e-05, + "loss": 0.0643, + "step": 20100 + }, + { + "epoch": 0.9248766640339732, + "grad_norm": 0.5474185347557068, + "learning_rate": 9.24908424908425e-05, + "loss": 0.0672, + "step": 20200 + }, + { + "epoch": 0.929455261380676, + "grad_norm": 1.0076775550842285, + "learning_rate": 9.294871794871795e-05, + "loss": 0.0699, + "step": 20300 + }, + { + "epoch": 0.9340338587273789, + "grad_norm": 0.45534393191337585, + "learning_rate": 9.340659340659341e-05, + "loss": 0.0712, + "step": 20400 + }, + { + "epoch": 0.9386124560740817, + "grad_norm": 0.6854729652404785, + "learning_rate": 9.386446886446886e-05, + "loss": 0.0695, + "step": 20500 + }, + { + "epoch": 0.9431910534207846, + "grad_norm": 1.4581429958343506, + "learning_rate": 9.432234432234433e-05, + "loss": 0.0676, + "step": 20600 + }, + { + "epoch": 0.9477696507674874, + "grad_norm": 1.6819262504577637, + "learning_rate": 9.478021978021978e-05, + "loss": 0.0683, + "step": 20700 + }, + { + "epoch": 0.9523482481141902, + "grad_norm": 0.8808913826942444, + "learning_rate": 9.523809523809524e-05, + "loss": 0.0668, + "step": 20800 + }, + { + "epoch": 0.956926845460893, + "grad_norm": 0.3119984269142151, + "learning_rate": 9.56959706959707e-05, + "loss": 0.074, + "step": 20900 + }, + { + "epoch": 0.9615054428075959, + "grad_norm": 0.6743124723434448, + "learning_rate": 9.615384615384617e-05, + "loss": 0.0728, + "step": 21000 + }, + { + "epoch": 0.9660840401542987, + "grad_norm": 0.6196538209915161, + "learning_rate": 9.661172161172162e-05, + "loss": 0.0677, + "step": 21100 + }, + { + "epoch": 0.9706626375010016, + "grad_norm": 0.7010948657989502, + "learning_rate": 9.706959706959707e-05, + "loss": 0.0716, + "step": 21200 + }, + { + "epoch": 0.9752412348477044, + "grad_norm": 0.7601842880249023, + "learning_rate": 9.752747252747253e-05, + "loss": 0.0675, + "step": 21300 + }, + { + "epoch": 0.9798198321944073, + "grad_norm": 0.5342845320701599, + "learning_rate": 9.798534798534798e-05, + "loss": 0.0732, + "step": 21400 + }, + { + "epoch": 0.9843984295411101, + "grad_norm": 0.7880052328109741, + "learning_rate": 9.844322344322346e-05, + "loss": 0.0671, + "step": 21500 + }, + { + "epoch": 0.988977026887813, + "grad_norm": 0.690728485584259, + "learning_rate": 9.89010989010989e-05, + "loss": 0.069, + "step": 21600 + }, + { + "epoch": 0.9935556242345157, + "grad_norm": 0.6646633148193359, + "learning_rate": 9.935897435897437e-05, + "loss": 0.0667, + "step": 21700 + }, + { + "epoch": 0.9981342215812186, + "grad_norm": 1.2037309408187866, + "learning_rate": 9.981684981684982e-05, + "loss": 0.0683, + "step": 21800 + }, + { + "epoch": 0.9999656605198998, + "eval_loss": 0.14296908676624298, + "eval_runtime": 256.7574, + "eval_samples_per_second": 21.421, + "eval_steps_per_second": 21.421, + "step": 21840 + }, + { + "epoch": 1.0027128189279215, + "grad_norm": 0.4327790439128876, + "learning_rate": 9.999997700931376e-05, + "loss": 0.0692, + "step": 21900 + }, + { + "epoch": 1.0072914162746243, + "grad_norm": 0.8181611895561218, + "learning_rate": 9.999983651075218e-05, + "loss": 0.0542, + "step": 22000 + }, + { + "epoch": 1.011870013621327, + "grad_norm": 0.6566409468650818, + "learning_rate": 9.999956828659095e-05, + "loss": 0.0571, + "step": 22100 + }, + { + "epoch": 1.01644861096803, + "grad_norm": 0.7238597273826599, + "learning_rate": 9.999917233751526e-05, + "loss": 0.0611, + "step": 22200 + }, + { + "epoch": 1.0210272083147327, + "grad_norm": 0.3051077127456665, + "learning_rate": 9.999864866453658e-05, + "loss": 0.059, + "step": 22300 + }, + { + "epoch": 1.0256058056614357, + "grad_norm": 1.5912861824035645, + "learning_rate": 9.999799726899262e-05, + "loss": 0.0615, + "step": 22400 + }, + { + "epoch": 1.0301844030081384, + "grad_norm": 0.6656569242477417, + "learning_rate": 9.999721815254742e-05, + "loss": 0.0618, + "step": 22500 + }, + { + "epoch": 1.0347630003548414, + "grad_norm": 1.1994621753692627, + "learning_rate": 9.999631131719119e-05, + "loss": 0.0614, + "step": 22600 + }, + { + "epoch": 1.039341597701544, + "grad_norm": 0.6420437097549438, + "learning_rate": 9.999527676524052e-05, + "loss": 0.0565, + "step": 22700 + }, + { + "epoch": 1.043920195048247, + "grad_norm": 0.6077245473861694, + "learning_rate": 9.999411449933816e-05, + "loss": 0.0609, + "step": 22800 + }, + { + "epoch": 1.0484987923949498, + "grad_norm": 0.6168214082717896, + "learning_rate": 9.999282452245315e-05, + "loss": 0.0579, + "step": 22900 + }, + { + "epoch": 1.0530773897416525, + "grad_norm": 0.4628690779209137, + "learning_rate": 9.999140683788078e-05, + "loss": 0.0576, + "step": 23000 + }, + { + "epoch": 1.0576559870883555, + "grad_norm": 0.43243736028671265, + "learning_rate": 9.998986144924251e-05, + "loss": 0.0615, + "step": 23100 + }, + { + "epoch": 1.0622345844350582, + "grad_norm": 0.7162685394287109, + "learning_rate": 9.998818836048611e-05, + "loss": 0.0598, + "step": 23200 + }, + { + "epoch": 1.0668131817817612, + "grad_norm": 0.7162106037139893, + "learning_rate": 9.99863875758855e-05, + "loss": 0.0574, + "step": 23300 + }, + { + "epoch": 1.071391779128464, + "grad_norm": 0.4392016530036926, + "learning_rate": 9.998445910004082e-05, + "loss": 0.0576, + "step": 23400 + }, + { + "epoch": 1.075970376475167, + "grad_norm": 0.8344998955726624, + "learning_rate": 9.998240293787841e-05, + "loss": 0.0639, + "step": 23500 + }, + { + "epoch": 1.0805489738218697, + "grad_norm": 0.9016310572624207, + "learning_rate": 9.998021909465076e-05, + "loss": 0.058, + "step": 23600 + }, + { + "epoch": 1.0851275711685724, + "grad_norm": 0.1677553951740265, + "learning_rate": 9.997790757593657e-05, + "loss": 0.0648, + "step": 23700 + }, + { + "epoch": 1.0897061685152754, + "grad_norm": 0.6796389222145081, + "learning_rate": 9.997546838764065e-05, + "loss": 0.0589, + "step": 23800 + }, + { + "epoch": 1.094284765861978, + "grad_norm": 0.667464554309845, + "learning_rate": 9.997290153599394e-05, + "loss": 0.0557, + "step": 23900 + }, + { + "epoch": 1.098863363208681, + "grad_norm": 0.9013321995735168, + "learning_rate": 9.997020702755353e-05, + "loss": 0.0555, + "step": 24000 + }, + { + "epoch": 1.1034419605553838, + "grad_norm": 0.3552779257297516, + "learning_rate": 9.996738486920259e-05, + "loss": 0.0568, + "step": 24100 + }, + { + "epoch": 1.1080205579020868, + "grad_norm": 0.6730219721794128, + "learning_rate": 9.996443506815039e-05, + "loss": 0.0556, + "step": 24200 + }, + { + "epoch": 1.1125991552487895, + "grad_norm": 0.29462745785713196, + "learning_rate": 9.996135763193225e-05, + "loss": 0.055, + "step": 24300 + }, + { + "epoch": 1.1171777525954925, + "grad_norm": 0.3105739653110504, + "learning_rate": 9.995815256840955e-05, + "loss": 0.0592, + "step": 24400 + }, + { + "epoch": 1.1217563499421952, + "grad_norm": 0.5383213758468628, + "learning_rate": 9.995481988576968e-05, + "loss": 0.0525, + "step": 24500 + }, + { + "epoch": 1.126334947288898, + "grad_norm": 0.6290645003318787, + "learning_rate": 9.995135959252605e-05, + "loss": 0.058, + "step": 24600 + }, + { + "epoch": 1.130913544635601, + "grad_norm": 0.4531712532043457, + "learning_rate": 9.994777169751806e-05, + "loss": 0.0515, + "step": 24700 + }, + { + "epoch": 1.1354921419823036, + "grad_norm": 0.5031425952911377, + "learning_rate": 9.994405620991102e-05, + "loss": 0.0591, + "step": 24800 + }, + { + "epoch": 1.1400707393290066, + "grad_norm": 0.8398526310920715, + "learning_rate": 9.994021313919628e-05, + "loss": 0.0608, + "step": 24900 + }, + { + "epoch": 1.1446493366757093, + "grad_norm": 0.3783178925514221, + "learning_rate": 9.9936242495191e-05, + "loss": 0.0589, + "step": 25000 + }, + { + "epoch": 1.1492279340224123, + "grad_norm": 0.3554207384586334, + "learning_rate": 9.99321442880383e-05, + "loss": 0.0561, + "step": 25100 + }, + { + "epoch": 1.153806531369115, + "grad_norm": 0.8848966956138611, + "learning_rate": 9.992791852820709e-05, + "loss": 0.0571, + "step": 25200 + }, + { + "epoch": 1.158385128715818, + "grad_norm": 0.4907087981700897, + "learning_rate": 9.99235652264922e-05, + "loss": 0.0593, + "step": 25300 + }, + { + "epoch": 1.1629637260625207, + "grad_norm": 0.6268092393875122, + "learning_rate": 9.991908439401421e-05, + "loss": 0.0526, + "step": 25400 + }, + { + "epoch": 1.1675423234092235, + "grad_norm": 0.5183268785476685, + "learning_rate": 9.991447604221951e-05, + "loss": 0.0536, + "step": 25500 + }, + { + "epoch": 1.1721209207559264, + "grad_norm": 0.4522722065448761, + "learning_rate": 9.990974018288022e-05, + "loss": 0.05, + "step": 25600 + }, + { + "epoch": 1.1766995181026292, + "grad_norm": 0.8773862719535828, + "learning_rate": 9.990487682809418e-05, + "loss": 0.0539, + "step": 25700 + }, + { + "epoch": 1.1812781154493321, + "grad_norm": 0.5325748920440674, + "learning_rate": 9.989988599028492e-05, + "loss": 0.0604, + "step": 25800 + }, + { + "epoch": 1.1858567127960349, + "grad_norm": 0.5544828772544861, + "learning_rate": 9.989476768220168e-05, + "loss": 0.0538, + "step": 25900 + }, + { + "epoch": 1.1904353101427378, + "grad_norm": 0.8816759586334229, + "learning_rate": 9.988952191691925e-05, + "loss": 0.0568, + "step": 26000 + }, + { + "epoch": 1.1950139074894406, + "grad_norm": 0.8002095222473145, + "learning_rate": 9.988414870783806e-05, + "loss": 0.0573, + "step": 26100 + }, + { + "epoch": 1.1995925048361435, + "grad_norm": 0.5534511208534241, + "learning_rate": 9.987864806868405e-05, + "loss": 0.0597, + "step": 26200 + }, + { + "epoch": 1.2041711021828463, + "grad_norm": 0.4148072898387909, + "learning_rate": 9.987302001350875e-05, + "loss": 0.049, + "step": 26300 + }, + { + "epoch": 1.208749699529549, + "grad_norm": 0.30762553215026855, + "learning_rate": 9.986726455668913e-05, + "loss": 0.0559, + "step": 26400 + }, + { + "epoch": 1.213328296876252, + "grad_norm": 0.7850671410560608, + "learning_rate": 9.986138171292762e-05, + "loss": 0.0515, + "step": 26500 + }, + { + "epoch": 1.2179068942229547, + "grad_norm": 0.45396122336387634, + "learning_rate": 9.985537149725205e-05, + "loss": 0.0529, + "step": 26600 + }, + { + "epoch": 1.2224854915696577, + "grad_norm": 0.4627123177051544, + "learning_rate": 9.984923392501567e-05, + "loss": 0.0555, + "step": 26700 + }, + { + "epoch": 1.2270640889163604, + "grad_norm": 0.8190097212791443, + "learning_rate": 9.984296901189702e-05, + "loss": 0.0507, + "step": 26800 + }, + { + "epoch": 1.2316426862630634, + "grad_norm": 0.6249597668647766, + "learning_rate": 9.983657677389992e-05, + "loss": 0.0538, + "step": 26900 + }, + { + "epoch": 1.2362212836097661, + "grad_norm": 0.8909338116645813, + "learning_rate": 9.983005722735351e-05, + "loss": 0.0458, + "step": 27000 + }, + { + "epoch": 1.240799880956469, + "grad_norm": 0.4777618944644928, + "learning_rate": 9.98234103889121e-05, + "loss": 0.0552, + "step": 27100 + }, + { + "epoch": 1.2453784783031718, + "grad_norm": 0.30679649114608765, + "learning_rate": 9.981663627555515e-05, + "loss": 0.0547, + "step": 27200 + }, + { + "epoch": 1.2499570756498746, + "grad_norm": 0.5480089783668518, + "learning_rate": 9.980973490458728e-05, + "loss": 0.0584, + "step": 27300 + }, + { + "epoch": 1.2545356729965775, + "grad_norm": 0.7595780491828918, + "learning_rate": 9.980270629363819e-05, + "loss": 0.056, + "step": 27400 + }, + { + "epoch": 1.2591142703432803, + "grad_norm": 0.34684839844703674, + "learning_rate": 9.979555046066261e-05, + "loss": 0.0545, + "step": 27500 + }, + { + "epoch": 1.2636928676899832, + "grad_norm": 0.4605325758457184, + "learning_rate": 9.978826742394027e-05, + "loss": 0.0588, + "step": 27600 + }, + { + "epoch": 1.268271465036686, + "grad_norm": 0.8060219287872314, + "learning_rate": 9.97808572020758e-05, + "loss": 0.0529, + "step": 27700 + }, + { + "epoch": 1.2728500623833887, + "grad_norm": 0.4551374614238739, + "learning_rate": 9.97733198139988e-05, + "loss": 0.0506, + "step": 27800 + }, + { + "epoch": 1.2774286597300917, + "grad_norm": 0.5313341617584229, + "learning_rate": 9.976565527896366e-05, + "loss": 0.0524, + "step": 27900 + }, + { + "epoch": 1.2820072570767946, + "grad_norm": 0.511184811592102, + "learning_rate": 9.97578636165496e-05, + "loss": 0.0522, + "step": 28000 + }, + { + "epoch": 1.2865858544234974, + "grad_norm": 0.8772425055503845, + "learning_rate": 9.974994484666058e-05, + "loss": 0.0546, + "step": 28100 + }, + { + "epoch": 1.2911644517702001, + "grad_norm": 0.4593620002269745, + "learning_rate": 9.974189898952524e-05, + "loss": 0.0527, + "step": 28200 + }, + { + "epoch": 1.295743049116903, + "grad_norm": 0.49878451228141785, + "learning_rate": 9.973372606569692e-05, + "loss": 0.0536, + "step": 28300 + }, + { + "epoch": 1.3003216464636058, + "grad_norm": 0.8320513367652893, + "learning_rate": 9.97254260960535e-05, + "loss": 0.0522, + "step": 28400 + }, + { + "epoch": 1.3049002438103088, + "grad_norm": 0.4917149245738983, + "learning_rate": 9.971699910179742e-05, + "loss": 0.0574, + "step": 28500 + }, + { + "epoch": 1.3094788411570115, + "grad_norm": 0.561815083026886, + "learning_rate": 9.97084451044556e-05, + "loss": 0.0557, + "step": 28600 + }, + { + "epoch": 1.3140574385037143, + "grad_norm": 0.43367573618888855, + "learning_rate": 9.969976412587944e-05, + "loss": 0.0522, + "step": 28700 + }, + { + "epoch": 1.3186360358504172, + "grad_norm": 1.0113517045974731, + "learning_rate": 9.969095618824462e-05, + "loss": 0.0491, + "step": 28800 + }, + { + "epoch": 1.3232146331971202, + "grad_norm": 0.548916220664978, + "learning_rate": 9.968202131405124e-05, + "loss": 0.0499, + "step": 28900 + }, + { + "epoch": 1.327793230543823, + "grad_norm": 0.5541431903839111, + "learning_rate": 9.967295952612361e-05, + "loss": 0.0464, + "step": 29000 + }, + { + "epoch": 1.3323718278905257, + "grad_norm": 0.47956761717796326, + "learning_rate": 9.966377084761023e-05, + "loss": 0.0548, + "step": 29100 + }, + { + "epoch": 1.3369504252372286, + "grad_norm": 0.9489524960517883, + "learning_rate": 9.965445530198378e-05, + "loss": 0.0576, + "step": 29200 + }, + { + "epoch": 1.3415290225839314, + "grad_norm": 0.7664705514907837, + "learning_rate": 9.964501291304101e-05, + "loss": 0.055, + "step": 29300 + }, + { + "epoch": 1.3461076199306343, + "grad_norm": 0.5601370930671692, + "learning_rate": 9.96354437049027e-05, + "loss": 0.0525, + "step": 29400 + }, + { + "epoch": 1.350686217277337, + "grad_norm": 0.3737477958202362, + "learning_rate": 9.962574770201358e-05, + "loss": 0.049, + "step": 29500 + }, + { + "epoch": 1.3552648146240398, + "grad_norm": 0.8171801567077637, + "learning_rate": 9.96159249291423e-05, + "loss": 0.0501, + "step": 29600 + }, + { + "epoch": 1.3598434119707428, + "grad_norm": 0.8035039305686951, + "learning_rate": 9.960597541138131e-05, + "loss": 0.0493, + "step": 29700 + }, + { + "epoch": 1.3644220093174457, + "grad_norm": 0.2262045294046402, + "learning_rate": 9.959589917414687e-05, + "loss": 0.0503, + "step": 29800 + }, + { + "epoch": 1.3690006066641485, + "grad_norm": 0.5973814725875854, + "learning_rate": 9.958569624317893e-05, + "loss": 0.0528, + "step": 29900 + }, + { + "epoch": 1.3735792040108512, + "grad_norm": 0.66443932056427, + "learning_rate": 9.957536664454108e-05, + "loss": 0.0509, + "step": 30000 + }, + { + "epoch": 1.3781578013575542, + "grad_norm": 1.04296875, + "learning_rate": 9.956491040462052e-05, + "loss": 0.0515, + "step": 30100 + }, + { + "epoch": 1.382736398704257, + "grad_norm": 1.5576283931732178, + "learning_rate": 9.955432755012788e-05, + "loss": 0.0533, + "step": 30200 + }, + { + "epoch": 1.3873149960509599, + "grad_norm": 0.3329857885837555, + "learning_rate": 9.954361810809732e-05, + "loss": 0.0523, + "step": 30300 + }, + { + "epoch": 1.3918935933976626, + "grad_norm": 1.7028000354766846, + "learning_rate": 9.953278210588628e-05, + "loss": 0.0516, + "step": 30400 + }, + { + "epoch": 1.3964721907443653, + "grad_norm": 0.49420544505119324, + "learning_rate": 9.952181957117559e-05, + "loss": 0.0505, + "step": 30500 + }, + { + "epoch": 1.4010507880910683, + "grad_norm": 0.15591812133789062, + "learning_rate": 9.951073053196926e-05, + "loss": 0.0512, + "step": 30600 + }, + { + "epoch": 1.4056293854377713, + "grad_norm": 0.4006904661655426, + "learning_rate": 9.949951501659445e-05, + "loss": 0.0522, + "step": 30700 + }, + { + "epoch": 1.410207982784474, + "grad_norm": 0.46110183000564575, + "learning_rate": 9.948817305370143e-05, + "loss": 0.049, + "step": 30800 + }, + { + "epoch": 1.4147865801311768, + "grad_norm": 0.24079594016075134, + "learning_rate": 9.947670467226349e-05, + "loss": 0.0521, + "step": 30900 + }, + { + "epoch": 1.4193651774778797, + "grad_norm": 0.6515139937400818, + "learning_rate": 9.946510990157682e-05, + "loss": 0.0495, + "step": 31000 + }, + { + "epoch": 1.4239437748245825, + "grad_norm": 0.5415006279945374, + "learning_rate": 9.945338877126052e-05, + "loss": 0.0526, + "step": 31100 + }, + { + "epoch": 1.4285223721712854, + "grad_norm": 0.8711938261985779, + "learning_rate": 9.944154131125642e-05, + "loss": 0.0484, + "step": 31200 + }, + { + "epoch": 1.4331009695179882, + "grad_norm": 0.5021001696586609, + "learning_rate": 9.942956755182916e-05, + "loss": 0.0567, + "step": 31300 + }, + { + "epoch": 1.437679566864691, + "grad_norm": 1.9676926136016846, + "learning_rate": 9.941746752356588e-05, + "loss": 0.0496, + "step": 31400 + }, + { + "epoch": 1.4422581642113939, + "grad_norm": 0.5120891332626343, + "learning_rate": 9.94052412573764e-05, + "loss": 0.0492, + "step": 31500 + }, + { + "epoch": 1.4468367615580966, + "grad_norm": 0.9182060956954956, + "learning_rate": 9.939288878449294e-05, + "loss": 0.0525, + "step": 31600 + }, + { + "epoch": 1.4514153589047996, + "grad_norm": 0.6737085580825806, + "learning_rate": 9.938041013647016e-05, + "loss": 0.0462, + "step": 31700 + }, + { + "epoch": 1.4559939562515023, + "grad_norm": 0.7034218311309814, + "learning_rate": 9.936780534518502e-05, + "loss": 0.0497, + "step": 31800 + }, + { + "epoch": 1.4605725535982053, + "grad_norm": 0.9228888750076294, + "learning_rate": 9.935507444283669e-05, + "loss": 0.0482, + "step": 31900 + }, + { + "epoch": 1.465151150944908, + "grad_norm": 0.3609278202056885, + "learning_rate": 9.934221746194655e-05, + "loss": 0.0594, + "step": 32000 + }, + { + "epoch": 1.469729748291611, + "grad_norm": 0.12724661827087402, + "learning_rate": 9.932923443535798e-05, + "loss": 0.0476, + "step": 32100 + }, + { + "epoch": 1.4743083456383137, + "grad_norm": 0.5686663389205933, + "learning_rate": 9.931612539623643e-05, + "loss": 0.0538, + "step": 32200 + }, + { + "epoch": 1.4788869429850164, + "grad_norm": 0.6813719868659973, + "learning_rate": 9.930289037806919e-05, + "loss": 0.0511, + "step": 32300 + }, + { + "epoch": 1.4834655403317194, + "grad_norm": 0.678242027759552, + "learning_rate": 9.928952941466538e-05, + "loss": 0.0492, + "step": 32400 + }, + { + "epoch": 1.4880441376784221, + "grad_norm": 0.7721807360649109, + "learning_rate": 9.927604254015585e-05, + "loss": 0.0529, + "step": 32500 + }, + { + "epoch": 1.492622735025125, + "grad_norm": 0.6314060688018799, + "learning_rate": 9.926242978899312e-05, + "loss": 0.0462, + "step": 32600 + }, + { + "epoch": 1.4972013323718278, + "grad_norm": 0.5451350212097168, + "learning_rate": 9.924869119595119e-05, + "loss": 0.0476, + "step": 32700 + }, + { + "epoch": 1.5017799297185306, + "grad_norm": 0.5342521071434021, + "learning_rate": 9.923482679612563e-05, + "loss": 0.0505, + "step": 32800 + }, + { + "epoch": 1.5063585270652335, + "grad_norm": 0.7561967968940735, + "learning_rate": 9.922083662493329e-05, + "loss": 0.0491, + "step": 32900 + }, + { + "epoch": 1.5109371244119365, + "grad_norm": 0.2349376529455185, + "learning_rate": 9.920672071811237e-05, + "loss": 0.0463, + "step": 33000 + }, + { + "epoch": 1.5155157217586392, + "grad_norm": 0.3987545073032379, + "learning_rate": 9.919247911172224e-05, + "loss": 0.0528, + "step": 33100 + }, + { + "epoch": 1.520094319105342, + "grad_norm": 0.4922156035900116, + "learning_rate": 9.917811184214337e-05, + "loss": 0.0479, + "step": 33200 + }, + { + "epoch": 1.524672916452045, + "grad_norm": 0.9758409261703491, + "learning_rate": 9.916361894607722e-05, + "loss": 0.0537, + "step": 33300 + }, + { + "epoch": 1.529251513798748, + "grad_norm": 0.5304883718490601, + "learning_rate": 9.914900046054623e-05, + "loss": 0.0504, + "step": 33400 + }, + { + "epoch": 1.5338301111454506, + "grad_norm": 0.4293117821216583, + "learning_rate": 9.913425642289358e-05, + "loss": 0.0481, + "step": 33500 + }, + { + "epoch": 1.5384087084921534, + "grad_norm": 0.354592889547348, + "learning_rate": 9.911938687078324e-05, + "loss": 0.0496, + "step": 33600 + }, + { + "epoch": 1.5429873058388561, + "grad_norm": 0.36046740412712097, + "learning_rate": 9.910439184219978e-05, + "loss": 0.0451, + "step": 33700 + }, + { + "epoch": 1.547565903185559, + "grad_norm": 0.4680946171283722, + "learning_rate": 9.90892713754483e-05, + "loss": 0.048, + "step": 33800 + }, + { + "epoch": 1.552144500532262, + "grad_norm": 0.4586212635040283, + "learning_rate": 9.907402550915433e-05, + "loss": 0.0462, + "step": 33900 + }, + { + "epoch": 1.5567230978789648, + "grad_norm": 0.2608386278152466, + "learning_rate": 9.905865428226376e-05, + "loss": 0.0472, + "step": 34000 + }, + { + "epoch": 1.5613016952256675, + "grad_norm": 0.5291585922241211, + "learning_rate": 9.90431577340427e-05, + "loss": 0.044, + "step": 34100 + }, + { + "epoch": 1.5658802925723705, + "grad_norm": 0.9200330376625061, + "learning_rate": 9.90275359040774e-05, + "loss": 0.0487, + "step": 34200 + }, + { + "epoch": 1.5704588899190735, + "grad_norm": 0.550689160823822, + "learning_rate": 9.901178883227414e-05, + "loss": 0.0515, + "step": 34300 + }, + { + "epoch": 1.5750374872657762, + "grad_norm": 0.7476568818092346, + "learning_rate": 9.899591655885912e-05, + "loss": 0.0457, + "step": 34400 + }, + { + "epoch": 1.579616084612479, + "grad_norm": 0.8736041188240051, + "learning_rate": 9.89799191243784e-05, + "loss": 0.0473, + "step": 34500 + }, + { + "epoch": 1.5841946819591817, + "grad_norm": 0.17842432856559753, + "learning_rate": 9.896379656969776e-05, + "loss": 0.0456, + "step": 34600 + }, + { + "epoch": 1.5887732793058846, + "grad_norm": 0.5870159864425659, + "learning_rate": 9.894754893600258e-05, + "loss": 0.052, + "step": 34700 + }, + { + "epoch": 1.5933518766525876, + "grad_norm": 0.33038216829299927, + "learning_rate": 9.893117626479777e-05, + "loss": 0.0498, + "step": 34800 + }, + { + "epoch": 1.5979304739992903, + "grad_norm": 0.7480065226554871, + "learning_rate": 9.891467859790767e-05, + "loss": 0.0484, + "step": 34900 + }, + { + "epoch": 1.602509071345993, + "grad_norm": 0.46852391958236694, + "learning_rate": 9.889805597747588e-05, + "loss": 0.0471, + "step": 35000 + }, + { + "epoch": 1.607087668692696, + "grad_norm": 0.33162882924079895, + "learning_rate": 9.888130844596524e-05, + "loss": 0.0477, + "step": 35100 + }, + { + "epoch": 1.6116662660393988, + "grad_norm": 1.0083402395248413, + "learning_rate": 9.886443604615764e-05, + "loss": 0.051, + "step": 35200 + }, + { + "epoch": 1.6162448633861017, + "grad_norm": 0.6158673763275146, + "learning_rate": 9.8847438821154e-05, + "loss": 0.0459, + "step": 35300 + }, + { + "epoch": 1.6208234607328045, + "grad_norm": 1.0110929012298584, + "learning_rate": 9.883031681437405e-05, + "loss": 0.0481, + "step": 35400 + }, + { + "epoch": 1.6254020580795072, + "grad_norm": 0.35791000723838806, + "learning_rate": 9.881307006955634e-05, + "loss": 0.0466, + "step": 35500 + }, + { + "epoch": 1.6299806554262102, + "grad_norm": 0.5888839364051819, + "learning_rate": 9.879569863075799e-05, + "loss": 0.048, + "step": 35600 + }, + { + "epoch": 1.6345592527729131, + "grad_norm": 0.7552986741065979, + "learning_rate": 9.877820254235471e-05, + "loss": 0.0482, + "step": 35700 + }, + { + "epoch": 1.6391378501196159, + "grad_norm": 0.5620241165161133, + "learning_rate": 9.87605818490406e-05, + "loss": 0.0461, + "step": 35800 + }, + { + "epoch": 1.6437164474663186, + "grad_norm": 0.40786847472190857, + "learning_rate": 9.87428365958281e-05, + "loss": 0.0501, + "step": 35900 + }, + { + "epoch": 1.6482950448130216, + "grad_norm": 0.3627175986766815, + "learning_rate": 9.872496682804781e-05, + "loss": 0.0495, + "step": 36000 + }, + { + "epoch": 1.6528736421597243, + "grad_norm": 0.3631226122379303, + "learning_rate": 9.870697259134844e-05, + "loss": 0.0415, + "step": 36100 + }, + { + "epoch": 1.6574522395064273, + "grad_norm": 0.41811424493789673, + "learning_rate": 9.86888539316966e-05, + "loss": 0.0444, + "step": 36200 + }, + { + "epoch": 1.66203083685313, + "grad_norm": 0.8351331949234009, + "learning_rate": 9.867061089537677e-05, + "loss": 0.0499, + "step": 36300 + }, + { + "epoch": 1.6666094341998328, + "grad_norm": 0.26144087314605713, + "learning_rate": 9.865224352899119e-05, + "loss": 0.0488, + "step": 36400 + }, + { + "epoch": 1.6711880315465357, + "grad_norm": 1.9417400360107422, + "learning_rate": 9.863375187945967e-05, + "loss": 0.0456, + "step": 36500 + }, + { + "epoch": 1.6757666288932387, + "grad_norm": 0.762496292591095, + "learning_rate": 9.861513599401948e-05, + "loss": 0.0446, + "step": 36600 + }, + { + "epoch": 1.6803452262399414, + "grad_norm": 0.6936019659042358, + "learning_rate": 9.859639592022528e-05, + "loss": 0.046, + "step": 36700 + }, + { + "epoch": 1.6849238235866442, + "grad_norm": 0.3661505877971649, + "learning_rate": 9.857753170594897e-05, + "loss": 0.0445, + "step": 36800 + }, + { + "epoch": 1.689502420933347, + "grad_norm": 0.6424843668937683, + "learning_rate": 9.85585433993796e-05, + "loss": 0.0473, + "step": 36900 + }, + { + "epoch": 1.6940810182800499, + "grad_norm": 0.520645022392273, + "learning_rate": 9.853943104902315e-05, + "loss": 0.0474, + "step": 37000 + }, + { + "epoch": 1.6986596156267528, + "grad_norm": 0.3194561302661896, + "learning_rate": 9.852019470370253e-05, + "loss": 0.0482, + "step": 37100 + }, + { + "epoch": 1.7032382129734556, + "grad_norm": 0.6570625305175781, + "learning_rate": 9.850083441255735e-05, + "loss": 0.0457, + "step": 37200 + }, + { + "epoch": 1.7078168103201583, + "grad_norm": 0.4810948371887207, + "learning_rate": 9.84813502250439e-05, + "loss": 0.0474, + "step": 37300 + }, + { + "epoch": 1.7123954076668613, + "grad_norm": 0.5983640551567078, + "learning_rate": 9.846174219093491e-05, + "loss": 0.0451, + "step": 37400 + }, + { + "epoch": 1.7169740050135642, + "grad_norm": 0.4565774202346802, + "learning_rate": 9.844201036031951e-05, + "loss": 0.0436, + "step": 37500 + }, + { + "epoch": 1.721552602360267, + "grad_norm": 0.4429413974285126, + "learning_rate": 9.842215478360306e-05, + "loss": 0.0415, + "step": 37600 + }, + { + "epoch": 1.7261311997069697, + "grad_norm": 0.461791068315506, + "learning_rate": 9.840217551150706e-05, + "loss": 0.0436, + "step": 37700 + }, + { + "epoch": 1.7307097970536724, + "grad_norm": 0.7613334059715271, + "learning_rate": 9.838207259506891e-05, + "loss": 0.0433, + "step": 37800 + }, + { + "epoch": 1.7352883944003754, + "grad_norm": 0.1547241359949112, + "learning_rate": 9.836184608564198e-05, + "loss": 0.044, + "step": 37900 + }, + { + "epoch": 1.7398669917470784, + "grad_norm": 0.45752155780792236, + "learning_rate": 9.834149603489526e-05, + "loss": 0.0436, + "step": 38000 + }, + { + "epoch": 1.7444455890937811, + "grad_norm": 1.1743345260620117, + "learning_rate": 9.832102249481338e-05, + "loss": 0.0443, + "step": 38100 + }, + { + "epoch": 1.7490241864404839, + "grad_norm": 0.9375355243682861, + "learning_rate": 9.830042551769641e-05, + "loss": 0.0437, + "step": 38200 + }, + { + "epoch": 1.7536027837871868, + "grad_norm": 0.3472870886325836, + "learning_rate": 9.827970515615977e-05, + "loss": 0.0445, + "step": 38300 + }, + { + "epoch": 1.7581813811338898, + "grad_norm": 1.0196037292480469, + "learning_rate": 9.825886146313402e-05, + "loss": 0.0452, + "step": 38400 + }, + { + "epoch": 1.7627599784805925, + "grad_norm": 0.774361789226532, + "learning_rate": 9.82378944918648e-05, + "loss": 0.0465, + "step": 38500 + }, + { + "epoch": 1.7673385758272953, + "grad_norm": 1.1351568698883057, + "learning_rate": 9.821680429591269e-05, + "loss": 0.0438, + "step": 38600 + }, + { + "epoch": 1.771917173173998, + "grad_norm": 0.3935364782810211, + "learning_rate": 9.819559092915299e-05, + "loss": 0.0477, + "step": 38700 + }, + { + "epoch": 1.776495770520701, + "grad_norm": 0.48644939064979553, + "learning_rate": 9.81742544457757e-05, + "loss": 0.0482, + "step": 38800 + }, + { + "epoch": 1.781074367867404, + "grad_norm": 0.7816250324249268, + "learning_rate": 9.815279490028529e-05, + "loss": 0.0418, + "step": 38900 + }, + { + "epoch": 1.7856529652141067, + "grad_norm": 0.9440283179283142, + "learning_rate": 9.81312123475006e-05, + "loss": 0.0445, + "step": 39000 + }, + { + "epoch": 1.7902315625608094, + "grad_norm": 0.5908809304237366, + "learning_rate": 9.810950684255473e-05, + "loss": 0.0467, + "step": 39100 + }, + { + "epoch": 1.7948101599075124, + "grad_norm": 0.8200555443763733, + "learning_rate": 9.80876784408948e-05, + "loss": 0.044, + "step": 39200 + }, + { + "epoch": 1.7993887572542153, + "grad_norm": 0.513478696346283, + "learning_rate": 9.806572719828193e-05, + "loss": 0.0437, + "step": 39300 + }, + { + "epoch": 1.803967354600918, + "grad_norm": 0.34043118357658386, + "learning_rate": 9.8043653170791e-05, + "loss": 0.0412, + "step": 39400 + }, + { + "epoch": 1.8085459519476208, + "grad_norm": 0.7160608172416687, + "learning_rate": 9.802145641481056e-05, + "loss": 0.0475, + "step": 39500 + }, + { + "epoch": 1.8131245492943235, + "grad_norm": 0.7825314998626709, + "learning_rate": 9.799913698704269e-05, + "loss": 0.0475, + "step": 39600 + }, + { + "epoch": 1.8177031466410265, + "grad_norm": 0.38378453254699707, + "learning_rate": 9.797669494450281e-05, + "loss": 0.0416, + "step": 39700 + }, + { + "epoch": 1.8222817439877295, + "grad_norm": 0.3362584710121155, + "learning_rate": 9.795413034451959e-05, + "loss": 0.0439, + "step": 39800 + }, + { + "epoch": 1.8268603413344322, + "grad_norm": 0.42681771516799927, + "learning_rate": 9.793144324473473e-05, + "loss": 0.0458, + "step": 39900 + }, + { + "epoch": 1.831438938681135, + "grad_norm": 0.7747517824172974, + "learning_rate": 9.790863370310293e-05, + "loss": 0.0442, + "step": 40000 + }, + { + "epoch": 1.836017536027838, + "grad_norm": 0.481751024723053, + "learning_rate": 9.788570177789158e-05, + "loss": 0.0479, + "step": 40100 + }, + { + "epoch": 1.8405961333745409, + "grad_norm": 0.4065397381782532, + "learning_rate": 9.78626475276808e-05, + "loss": 0.0433, + "step": 40200 + }, + { + "epoch": 1.8451747307212436, + "grad_norm": 0.47623851895332336, + "learning_rate": 9.78394710113631e-05, + "loss": 0.043, + "step": 40300 + }, + { + "epoch": 1.8497533280679463, + "grad_norm": 0.5850650668144226, + "learning_rate": 9.781617228814339e-05, + "loss": 0.0413, + "step": 40400 + }, + { + "epoch": 1.854331925414649, + "grad_norm": 0.5443374514579773, + "learning_rate": 9.77927514175387e-05, + "loss": 0.044, + "step": 40500 + }, + { + "epoch": 1.858910522761352, + "grad_norm": 0.5081647634506226, + "learning_rate": 9.776920845937816e-05, + "loss": 0.0417, + "step": 40600 + }, + { + "epoch": 1.863489120108055, + "grad_norm": 0.9533047080039978, + "learning_rate": 9.774554347380271e-05, + "loss": 0.0438, + "step": 40700 + }, + { + "epoch": 1.8680677174547577, + "grad_norm": 0.7583338618278503, + "learning_rate": 9.772175652126503e-05, + "loss": 0.0437, + "step": 40800 + }, + { + "epoch": 1.8726463148014605, + "grad_norm": 0.6980351209640503, + "learning_rate": 9.769784766252941e-05, + "loss": 0.0453, + "step": 40900 + }, + { + "epoch": 1.8772249121481634, + "grad_norm": 0.523002564907074, + "learning_rate": 9.767381695867149e-05, + "loss": 0.0436, + "step": 41000 + }, + { + "epoch": 1.8818035094948664, + "grad_norm": 0.20366418361663818, + "learning_rate": 9.764966447107819e-05, + "loss": 0.0436, + "step": 41100 + }, + { + "epoch": 1.8863821068415692, + "grad_norm": 0.5247259140014648, + "learning_rate": 9.762539026144755e-05, + "loss": 0.0436, + "step": 41200 + }, + { + "epoch": 1.890960704188272, + "grad_norm": 0.7223037481307983, + "learning_rate": 9.760099439178852e-05, + "loss": 0.0433, + "step": 41300 + }, + { + "epoch": 1.8955393015349746, + "grad_norm": 0.8085638880729675, + "learning_rate": 9.757647692442083e-05, + "loss": 0.0438, + "step": 41400 + }, + { + "epoch": 1.9001178988816776, + "grad_norm": 0.360061913728714, + "learning_rate": 9.755183792197486e-05, + "loss": 0.0437, + "step": 41500 + }, + { + "epoch": 1.9046964962283806, + "grad_norm": 0.33270975947380066, + "learning_rate": 9.752707744739145e-05, + "loss": 0.0417, + "step": 41600 + }, + { + "epoch": 1.9092750935750833, + "grad_norm": 0.40628722310066223, + "learning_rate": 9.750219556392175e-05, + "loss": 0.0442, + "step": 41700 + }, + { + "epoch": 1.913853690921786, + "grad_norm": 0.41940930485725403, + "learning_rate": 9.7477192335127e-05, + "loss": 0.0401, + "step": 41800 + }, + { + "epoch": 1.918432288268489, + "grad_norm": 0.41912841796875, + "learning_rate": 9.74520678248785e-05, + "loss": 0.0444, + "step": 41900 + }, + { + "epoch": 1.9230108856151917, + "grad_norm": 0.5386761426925659, + "learning_rate": 9.742682209735727e-05, + "loss": 0.0419, + "step": 42000 + }, + { + "epoch": 1.9275894829618947, + "grad_norm": 0.4845391809940338, + "learning_rate": 9.74014552170541e-05, + "loss": 0.0423, + "step": 42100 + }, + { + "epoch": 1.9321680803085974, + "grad_norm": 0.7683694958686829, + "learning_rate": 9.737596724876914e-05, + "loss": 0.0423, + "step": 42200 + }, + { + "epoch": 1.9367466776553002, + "grad_norm": 0.3862452805042267, + "learning_rate": 9.735035825761197e-05, + "loss": 0.0391, + "step": 42300 + }, + { + "epoch": 1.9413252750020031, + "grad_norm": 0.34973305463790894, + "learning_rate": 9.732462830900124e-05, + "loss": 0.0421, + "step": 42400 + }, + { + "epoch": 1.945903872348706, + "grad_norm": 0.6201797127723694, + "learning_rate": 9.729877746866465e-05, + "loss": 0.044, + "step": 42500 + }, + { + "epoch": 1.9504824696954088, + "grad_norm": 1.0637577772140503, + "learning_rate": 9.72728058026387e-05, + "loss": 0.0445, + "step": 42600 + }, + { + "epoch": 1.9550610670421116, + "grad_norm": 0.4952399432659149, + "learning_rate": 9.724671337726854e-05, + "loss": 0.0428, + "step": 42700 + }, + { + "epoch": 1.9596396643888145, + "grad_norm": 0.8040750026702881, + "learning_rate": 9.722050025920778e-05, + "loss": 0.0422, + "step": 42800 + }, + { + "epoch": 1.9642182617355173, + "grad_norm": 0.5028950572013855, + "learning_rate": 9.719416651541839e-05, + "loss": 0.0435, + "step": 42900 + }, + { + "epoch": 1.9687968590822202, + "grad_norm": 0.3617078959941864, + "learning_rate": 9.716771221317042e-05, + "loss": 0.0414, + "step": 43000 + }, + { + "epoch": 1.973375456428923, + "grad_norm": 0.6627247929573059, + "learning_rate": 9.714113742004198e-05, + "loss": 0.0442, + "step": 43100 + }, + { + "epoch": 1.9779540537756257, + "grad_norm": 0.5225775241851807, + "learning_rate": 9.711444220391886e-05, + "loss": 0.041, + "step": 43200 + }, + { + "epoch": 1.9825326511223287, + "grad_norm": 0.44373607635498047, + "learning_rate": 9.708762663299456e-05, + "loss": 0.0498, + "step": 43300 + }, + { + "epoch": 1.9871112484690316, + "grad_norm": 0.6220275163650513, + "learning_rate": 9.706069077577001e-05, + "loss": 0.0431, + "step": 43400 + }, + { + "epoch": 1.9916898458157344, + "grad_norm": 0.14181743562221527, + "learning_rate": 9.703363470105338e-05, + "loss": 0.0405, + "step": 43500 + }, + { + "epoch": 1.9962684431624371, + "grad_norm": 0.5560967922210693, + "learning_rate": 9.700645847796e-05, + "loss": 0.0393, + "step": 43600 + }, + { + "epoch": 1.9999771070132666, + "eval_loss": 0.16101938486099243, + "eval_runtime": 260.5165, + "eval_samples_per_second": 21.112, + "eval_steps_per_second": 21.112, + "step": 43681 + }, + { + "epoch": 2.00084704050914, + "grad_norm": 0.33940353989601135, + "learning_rate": 9.697916217591206e-05, + "loss": 0.0412, + "step": 43700 + }, + { + "epoch": 2.005425637855843, + "grad_norm": 0.5346599817276001, + "learning_rate": 9.695174586463848e-05, + "loss": 0.0324, + "step": 43800 + }, + { + "epoch": 2.010004235202546, + "grad_norm": 0.33371779322624207, + "learning_rate": 9.692420961417488e-05, + "loss": 0.0289, + "step": 43900 + }, + { + "epoch": 2.0145828325492485, + "grad_norm": 0.7601104974746704, + "learning_rate": 9.689655349486309e-05, + "loss": 0.0301, + "step": 44000 + }, + { + "epoch": 2.0191614298959513, + "grad_norm": 1.0337655544281006, + "learning_rate": 9.686877757735127e-05, + "loss": 0.0308, + "step": 44100 + }, + { + "epoch": 2.023740027242654, + "grad_norm": 0.39981570839881897, + "learning_rate": 9.684088193259355e-05, + "loss": 0.03, + "step": 44200 + }, + { + "epoch": 2.028318624589357, + "grad_norm": 0.6537315845489502, + "learning_rate": 9.681286663184994e-05, + "loss": 0.0319, + "step": 44300 + }, + { + "epoch": 2.03289722193606, + "grad_norm": 0.4220235347747803, + "learning_rate": 9.678473174668606e-05, + "loss": 0.03, + "step": 44400 + }, + { + "epoch": 2.0374758192827627, + "grad_norm": 0.20741093158721924, + "learning_rate": 9.675647734897309e-05, + "loss": 0.0315, + "step": 44500 + }, + { + "epoch": 2.0420544166294654, + "grad_norm": 0.46414992213249207, + "learning_rate": 9.672810351088743e-05, + "loss": 0.0304, + "step": 44600 + }, + { + "epoch": 2.0466330139761686, + "grad_norm": 0.4579373002052307, + "learning_rate": 9.669961030491064e-05, + "loss": 0.0312, + "step": 44700 + }, + { + "epoch": 2.0512116113228713, + "grad_norm": 0.7499117851257324, + "learning_rate": 9.66709978038292e-05, + "loss": 0.0306, + "step": 44800 + }, + { + "epoch": 2.055790208669574, + "grad_norm": 0.2862129807472229, + "learning_rate": 9.664226608073431e-05, + "loss": 0.031, + "step": 44900 + }, + { + "epoch": 2.060368806016277, + "grad_norm": 0.1198749765753746, + "learning_rate": 9.661341520902176e-05, + "loss": 0.0335, + "step": 45000 + }, + { + "epoch": 2.0649474033629795, + "grad_norm": 0.32276931405067444, + "learning_rate": 9.658444526239168e-05, + "loss": 0.0308, + "step": 45100 + }, + { + "epoch": 2.0695260007096827, + "grad_norm": 0.30404484272003174, + "learning_rate": 9.655535631484838e-05, + "loss": 0.0293, + "step": 45200 + }, + { + "epoch": 2.0741045980563855, + "grad_norm": 0.4213615357875824, + "learning_rate": 9.652614844070018e-05, + "loss": 0.0314, + "step": 45300 + }, + { + "epoch": 2.078683195403088, + "grad_norm": 0.26582449674606323, + "learning_rate": 9.64968217145592e-05, + "loss": 0.035, + "step": 45400 + }, + { + "epoch": 2.083261792749791, + "grad_norm": 0.462671160697937, + "learning_rate": 9.646737621134112e-05, + "loss": 0.0313, + "step": 45500 + }, + { + "epoch": 2.087840390096494, + "grad_norm": 0.7931702733039856, + "learning_rate": 9.643781200626511e-05, + "loss": 0.0341, + "step": 45600 + }, + { + "epoch": 2.092418987443197, + "grad_norm": 0.7540990710258484, + "learning_rate": 9.640812917485353e-05, + "loss": 0.03, + "step": 45700 + }, + { + "epoch": 2.0969975847898996, + "grad_norm": 0.23272991180419922, + "learning_rate": 9.637832779293177e-05, + "loss": 0.0282, + "step": 45800 + }, + { + "epoch": 2.1015761821366024, + "grad_norm": 0.4618046283721924, + "learning_rate": 9.634840793662807e-05, + "loss": 0.0345, + "step": 45900 + }, + { + "epoch": 2.106154779483305, + "grad_norm": 0.5170308947563171, + "learning_rate": 9.63183696823733e-05, + "loss": 0.0301, + "step": 46000 + }, + { + "epoch": 2.1107333768300083, + "grad_norm": 0.4604352116584778, + "learning_rate": 9.628821310690082e-05, + "loss": 0.0304, + "step": 46100 + }, + { + "epoch": 2.115311974176711, + "grad_norm": 0.5687543749809265, + "learning_rate": 9.625793828724618e-05, + "loss": 0.0326, + "step": 46200 + }, + { + "epoch": 2.1198905715234138, + "grad_norm": 0.6474491357803345, + "learning_rate": 9.622754530074705e-05, + "loss": 0.0323, + "step": 46300 + }, + { + "epoch": 2.1244691688701165, + "grad_norm": 0.3418339490890503, + "learning_rate": 9.619703422504291e-05, + "loss": 0.0311, + "step": 46400 + }, + { + "epoch": 2.1290477662168197, + "grad_norm": 0.542149543762207, + "learning_rate": 9.616640513807493e-05, + "loss": 0.0302, + "step": 46500 + }, + { + "epoch": 2.1336263635635224, + "grad_norm": 0.3595920205116272, + "learning_rate": 9.613565811808576e-05, + "loss": 0.033, + "step": 46600 + }, + { + "epoch": 2.138204960910225, + "grad_norm": 0.30227652192115784, + "learning_rate": 9.610479324361926e-05, + "loss": 0.0333, + "step": 46700 + }, + { + "epoch": 2.142783558256928, + "grad_norm": 0.3367413580417633, + "learning_rate": 9.607381059352038e-05, + "loss": 0.0291, + "step": 46800 + }, + { + "epoch": 2.1473621556036306, + "grad_norm": 0.33046597242355347, + "learning_rate": 9.604271024693495e-05, + "loss": 0.0324, + "step": 46900 + }, + { + "epoch": 2.151940752950334, + "grad_norm": 0.3185320794582367, + "learning_rate": 9.601149228330944e-05, + "loss": 0.03, + "step": 47000 + }, + { + "epoch": 2.1565193502970366, + "grad_norm": 0.4530138075351715, + "learning_rate": 9.598015678239074e-05, + "loss": 0.0301, + "step": 47100 + }, + { + "epoch": 2.1610979476437393, + "grad_norm": 1.8580175638198853, + "learning_rate": 9.594870382422604e-05, + "loss": 0.0279, + "step": 47200 + }, + { + "epoch": 2.165676544990442, + "grad_norm": 0.7226896286010742, + "learning_rate": 9.591713348916258e-05, + "loss": 0.0376, + "step": 47300 + }, + { + "epoch": 2.170255142337145, + "grad_norm": 0.5682775974273682, + "learning_rate": 9.588544585784741e-05, + "loss": 0.0319, + "step": 47400 + }, + { + "epoch": 2.174833739683848, + "grad_norm": 0.35570502281188965, + "learning_rate": 9.585364101122723e-05, + "loss": 0.0323, + "step": 47500 + }, + { + "epoch": 2.1794123370305507, + "grad_norm": 0.34631285071372986, + "learning_rate": 9.582171903054816e-05, + "loss": 0.0349, + "step": 47600 + }, + { + "epoch": 2.1839909343772534, + "grad_norm": 0.6178816556930542, + "learning_rate": 9.578967999735556e-05, + "loss": 0.0309, + "step": 47700 + }, + { + "epoch": 2.188569531723956, + "grad_norm": 0.37293490767478943, + "learning_rate": 9.575752399349378e-05, + "loss": 0.0288, + "step": 47800 + }, + { + "epoch": 2.1931481290706594, + "grad_norm": 0.6338769197463989, + "learning_rate": 9.572525110110601e-05, + "loss": 0.0321, + "step": 47900 + }, + { + "epoch": 2.197726726417362, + "grad_norm": 0.19380028545856476, + "learning_rate": 9.569286140263399e-05, + "loss": 0.0306, + "step": 48000 + }, + { + "epoch": 2.202305323764065, + "grad_norm": 0.3065268099308014, + "learning_rate": 9.566035498081784e-05, + "loss": 0.03, + "step": 48100 + }, + { + "epoch": 2.2068839211107676, + "grad_norm": 0.29010531306266785, + "learning_rate": 9.562773191869594e-05, + "loss": 0.0327, + "step": 48200 + }, + { + "epoch": 2.2114625184574708, + "grad_norm": 0.40536558628082275, + "learning_rate": 9.559499229960451e-05, + "loss": 0.028, + "step": 48300 + }, + { + "epoch": 2.2160411158041735, + "grad_norm": 0.30061614513397217, + "learning_rate": 9.55621362071776e-05, + "loss": 0.0306, + "step": 48400 + }, + { + "epoch": 2.2206197131508763, + "grad_norm": 0.5350512266159058, + "learning_rate": 9.552916372534674e-05, + "loss": 0.0301, + "step": 48500 + }, + { + "epoch": 2.225198310497579, + "grad_norm": 0.4163435101509094, + "learning_rate": 9.549607493834085e-05, + "loss": 0.0333, + "step": 48600 + }, + { + "epoch": 2.2297769078442817, + "grad_norm": 0.6648384928703308, + "learning_rate": 9.546286993068588e-05, + "loss": 0.0323, + "step": 48700 + }, + { + "epoch": 2.234355505190985, + "grad_norm": 0.3643403947353363, + "learning_rate": 9.54295487872047e-05, + "loss": 0.0287, + "step": 48800 + }, + { + "epoch": 2.2389341025376877, + "grad_norm": 0.8857894539833069, + "learning_rate": 9.539611159301684e-05, + "loss": 0.0299, + "step": 48900 + }, + { + "epoch": 2.2435126998843904, + "grad_norm": 0.4569896459579468, + "learning_rate": 9.536255843353832e-05, + "loss": 0.0317, + "step": 49000 + }, + { + "epoch": 2.248091297231093, + "grad_norm": 0.46430703997612, + "learning_rate": 9.532888939448134e-05, + "loss": 0.0342, + "step": 49100 + }, + { + "epoch": 2.252669894577796, + "grad_norm": 0.4034232795238495, + "learning_rate": 9.529510456185417e-05, + "loss": 0.0316, + "step": 49200 + }, + { + "epoch": 2.257248491924499, + "grad_norm": 0.5079818964004517, + "learning_rate": 9.526120402196083e-05, + "loss": 0.0302, + "step": 49300 + }, + { + "epoch": 2.261827089271202, + "grad_norm": 0.4281846880912781, + "learning_rate": 9.522718786140097e-05, + "loss": 0.0328, + "step": 49400 + }, + { + "epoch": 2.2664056866179045, + "grad_norm": 1.395179033279419, + "learning_rate": 9.519305616706953e-05, + "loss": 0.0321, + "step": 49500 + }, + { + "epoch": 2.2709842839646073, + "grad_norm": 0.22532618045806885, + "learning_rate": 9.515880902615661e-05, + "loss": 0.0323, + "step": 49600 + }, + { + "epoch": 2.2755628813113105, + "grad_norm": 0.2474541962146759, + "learning_rate": 9.512444652614728e-05, + "loss": 0.0318, + "step": 49700 + }, + { + "epoch": 2.280141478658013, + "grad_norm": 0.2567112445831299, + "learning_rate": 9.50899687548212e-05, + "loss": 0.0334, + "step": 49800 + }, + { + "epoch": 2.284720076004716, + "grad_norm": 0.446916401386261, + "learning_rate": 9.505537580025256e-05, + "loss": 0.0314, + "step": 49900 + }, + { + "epoch": 2.2892986733514187, + "grad_norm": 0.4602959454059601, + "learning_rate": 9.502066775080976e-05, + "loss": 0.0287, + "step": 50000 + }, + { + "epoch": 2.293877270698122, + "grad_norm": 0.16146403551101685, + "learning_rate": 9.49858446951552e-05, + "loss": 0.0304, + "step": 50100 + }, + { + "epoch": 2.2984558680448246, + "grad_norm": 0.5386151075363159, + "learning_rate": 9.495090672224511e-05, + "loss": 0.0312, + "step": 50200 + }, + { + "epoch": 2.3030344653915273, + "grad_norm": 0.4908753037452698, + "learning_rate": 9.491585392132924e-05, + "loss": 0.0266, + "step": 50300 + }, + { + "epoch": 2.30761306273823, + "grad_norm": 0.2822779715061188, + "learning_rate": 9.48806863819507e-05, + "loss": 0.031, + "step": 50400 + }, + { + "epoch": 2.312191660084933, + "grad_norm": 0.4930579364299774, + "learning_rate": 9.484540419394568e-05, + "loss": 0.0264, + "step": 50500 + }, + { + "epoch": 2.316770257431636, + "grad_norm": 0.3992035686969757, + "learning_rate": 9.481000744744321e-05, + "loss": 0.0331, + "step": 50600 + }, + { + "epoch": 2.3213488547783387, + "grad_norm": 0.8977420926094055, + "learning_rate": 9.477449623286505e-05, + "loss": 0.0294, + "step": 50700 + }, + { + "epoch": 2.3259274521250415, + "grad_norm": 0.26158419251441956, + "learning_rate": 9.473887064092531e-05, + "loss": 0.0332, + "step": 50800 + }, + { + "epoch": 2.3305060494717442, + "grad_norm": 0.4764149487018585, + "learning_rate": 9.470313076263025e-05, + "loss": 0.0335, + "step": 50900 + }, + { + "epoch": 2.335084646818447, + "grad_norm": 0.21730680763721466, + "learning_rate": 9.466727668927816e-05, + "loss": 0.0285, + "step": 51000 + }, + { + "epoch": 2.33966324416515, + "grad_norm": 0.7260767817497253, + "learning_rate": 9.463130851245898e-05, + "loss": 0.0336, + "step": 51100 + }, + { + "epoch": 2.344241841511853, + "grad_norm": 0.688016951084137, + "learning_rate": 9.459522632405415e-05, + "loss": 0.0291, + "step": 51200 + }, + { + "epoch": 2.3488204388585556, + "grad_norm": 0.172237828373909, + "learning_rate": 9.455903021623637e-05, + "loss": 0.0287, + "step": 51300 + }, + { + "epoch": 2.3533990362052584, + "grad_norm": 0.42502665519714355, + "learning_rate": 9.452272028146932e-05, + "loss": 0.0304, + "step": 51400 + }, + { + "epoch": 2.3579776335519615, + "grad_norm": 0.6109219193458557, + "learning_rate": 9.448629661250745e-05, + "loss": 0.0353, + "step": 51500 + }, + { + "epoch": 2.3625562308986643, + "grad_norm": 0.6103388071060181, + "learning_rate": 9.444975930239581e-05, + "loss": 0.0293, + "step": 51600 + }, + { + "epoch": 2.367134828245367, + "grad_norm": 0.6018409132957458, + "learning_rate": 9.441310844446965e-05, + "loss": 0.0349, + "step": 51700 + }, + { + "epoch": 2.3717134255920698, + "grad_norm": 0.3021933436393738, + "learning_rate": 9.437634413235436e-05, + "loss": 0.0294, + "step": 51800 + }, + { + "epoch": 2.376292022938773, + "grad_norm": 0.42470723390579224, + "learning_rate": 9.433946645996514e-05, + "loss": 0.0296, + "step": 51900 + }, + { + "epoch": 2.3808706202854757, + "grad_norm": 0.3379852771759033, + "learning_rate": 9.430247552150673e-05, + "loss": 0.0294, + "step": 52000 + }, + { + "epoch": 2.3854492176321784, + "grad_norm": 0.22888457775115967, + "learning_rate": 9.426537141147322e-05, + "loss": 0.0286, + "step": 52100 + }, + { + "epoch": 2.390027814978881, + "grad_norm": 0.5915461778640747, + "learning_rate": 9.422815422464786e-05, + "loss": 0.0353, + "step": 52200 + }, + { + "epoch": 2.394606412325584, + "grad_norm": 0.4212239980697632, + "learning_rate": 9.419082405610267e-05, + "loss": 0.0293, + "step": 52300 + }, + { + "epoch": 2.399185009672287, + "grad_norm": 0.21963568031787872, + "learning_rate": 9.415338100119833e-05, + "loss": 0.0291, + "step": 52400 + }, + { + "epoch": 2.40376360701899, + "grad_norm": 0.40482693910598755, + "learning_rate": 9.41158251555839e-05, + "loss": 0.0299, + "step": 52500 + }, + { + "epoch": 2.4083422043656926, + "grad_norm": 1.0071722269058228, + "learning_rate": 9.407815661519655e-05, + "loss": 0.0272, + "step": 52600 + }, + { + "epoch": 2.4129208017123953, + "grad_norm": 0.9266312718391418, + "learning_rate": 9.404037547626134e-05, + "loss": 0.0292, + "step": 52700 + }, + { + "epoch": 2.417499399059098, + "grad_norm": 0.9991750121116638, + "learning_rate": 9.400248183529093e-05, + "loss": 0.0341, + "step": 52800 + }, + { + "epoch": 2.4220779964058012, + "grad_norm": 0.4451786279678345, + "learning_rate": 9.396447578908543e-05, + "loss": 0.0308, + "step": 52900 + }, + { + "epoch": 2.426656593752504, + "grad_norm": 0.7537618279457092, + "learning_rate": 9.392635743473204e-05, + "loss": 0.0335, + "step": 53000 + }, + { + "epoch": 2.4312351910992067, + "grad_norm": 0.3898552358150482, + "learning_rate": 9.388812686960486e-05, + "loss": 0.0303, + "step": 53100 + }, + { + "epoch": 2.4358137884459095, + "grad_norm": 0.1613057404756546, + "learning_rate": 9.384978419136468e-05, + "loss": 0.0319, + "step": 53200 + }, + { + "epoch": 2.440392385792612, + "grad_norm": 0.34397152066230774, + "learning_rate": 9.381132949795861e-05, + "loss": 0.0343, + "step": 53300 + }, + { + "epoch": 2.4449709831393154, + "grad_norm": 0.38366371393203735, + "learning_rate": 9.377276288761997e-05, + "loss": 0.0296, + "step": 53400 + }, + { + "epoch": 2.449549580486018, + "grad_norm": 0.15570569038391113, + "learning_rate": 9.373408445886798e-05, + "loss": 0.0294, + "step": 53500 + }, + { + "epoch": 2.454128177832721, + "grad_norm": 0.2775089144706726, + "learning_rate": 9.369529431050743e-05, + "loss": 0.0301, + "step": 53600 + }, + { + "epoch": 2.4587067751794236, + "grad_norm": 0.20707450807094574, + "learning_rate": 9.365639254162854e-05, + "loss": 0.0301, + "step": 53700 + }, + { + "epoch": 2.463285372526127, + "grad_norm": 0.16948607563972473, + "learning_rate": 9.36173792516067e-05, + "loss": 0.0329, + "step": 53800 + }, + { + "epoch": 2.4678639698728295, + "grad_norm": 0.4837573766708374, + "learning_rate": 9.357825454010213e-05, + "loss": 0.0299, + "step": 53900 + }, + { + "epoch": 2.4724425672195323, + "grad_norm": 0.4705110490322113, + "learning_rate": 9.353901850705972e-05, + "loss": 0.0312, + "step": 54000 + }, + { + "epoch": 2.477021164566235, + "grad_norm": 0.6251786947250366, + "learning_rate": 9.349967125270871e-05, + "loss": 0.0306, + "step": 54100 + }, + { + "epoch": 2.481599761912938, + "grad_norm": 0.27536630630493164, + "learning_rate": 9.346021287756246e-05, + "loss": 0.03, + "step": 54200 + }, + { + "epoch": 2.486178359259641, + "grad_norm": 0.581510066986084, + "learning_rate": 9.342064348241818e-05, + "loss": 0.0326, + "step": 54300 + }, + { + "epoch": 2.4907569566063437, + "grad_norm": 0.4884732961654663, + "learning_rate": 9.338096316835671e-05, + "loss": 0.0281, + "step": 54400 + }, + { + "epoch": 2.4953355539530464, + "grad_norm": 0.34184959530830383, + "learning_rate": 9.334117203674219e-05, + "loss": 0.0308, + "step": 54500 + }, + { + "epoch": 2.499914151299749, + "grad_norm": 1.2529618740081787, + "learning_rate": 9.330127018922194e-05, + "loss": 0.0286, + "step": 54600 + }, + { + "epoch": 2.5044927486464523, + "grad_norm": 0.3773830831050873, + "learning_rate": 9.326125772772597e-05, + "loss": 0.0313, + "step": 54700 + }, + { + "epoch": 2.509071345993155, + "grad_norm": 0.5453410744667053, + "learning_rate": 9.322113475446698e-05, + "loss": 0.029, + "step": 54800 + }, + { + "epoch": 2.513649943339858, + "grad_norm": 0.4246394634246826, + "learning_rate": 9.318090137193988e-05, + "loss": 0.0333, + "step": 54900 + }, + { + "epoch": 2.5182285406865605, + "grad_norm": 0.46837061643600464, + "learning_rate": 9.314055768292169e-05, + "loss": 0.0311, + "step": 55000 + }, + { + "epoch": 2.5228071380332633, + "grad_norm": 0.31000879406929016, + "learning_rate": 9.310010379047119e-05, + "loss": 0.0288, + "step": 55100 + }, + { + "epoch": 2.5273857353799665, + "grad_norm": 0.36738067865371704, + "learning_rate": 9.305953979792865e-05, + "loss": 0.0318, + "step": 55200 + }, + { + "epoch": 2.531964332726669, + "grad_norm": 0.29930517077445984, + "learning_rate": 9.301886580891562e-05, + "loss": 0.0285, + "step": 55300 + }, + { + "epoch": 2.536542930073372, + "grad_norm": 0.22497807443141937, + "learning_rate": 9.297808192733464e-05, + "loss": 0.0283, + "step": 55400 + }, + { + "epoch": 2.541121527420075, + "grad_norm": 0.6719942688941956, + "learning_rate": 9.293718825736897e-05, + "loss": 0.0283, + "step": 55500 + }, + { + "epoch": 2.5457001247667774, + "grad_norm": 0.32624194025993347, + "learning_rate": 9.289618490348228e-05, + "loss": 0.0309, + "step": 55600 + }, + { + "epoch": 2.5502787221134806, + "grad_norm": 0.5497521162033081, + "learning_rate": 9.285507197041853e-05, + "loss": 0.0288, + "step": 55700 + }, + { + "epoch": 2.5548573194601834, + "grad_norm": 0.8471511006355286, + "learning_rate": 9.281384956320153e-05, + "loss": 0.0302, + "step": 55800 + }, + { + "epoch": 2.559435916806886, + "grad_norm": 0.40366891026496887, + "learning_rate": 9.277251778713474e-05, + "loss": 0.0279, + "step": 55900 + }, + { + "epoch": 2.5640145141535893, + "grad_norm": 0.10733508318662643, + "learning_rate": 9.273107674780102e-05, + "loss": 0.0285, + "step": 56000 + }, + { + "epoch": 2.568593111500292, + "grad_norm": 0.2292618602514267, + "learning_rate": 9.268952655106236e-05, + "loss": 0.0266, + "step": 56100 + }, + { + "epoch": 2.5731717088469948, + "grad_norm": 0.4450601637363434, + "learning_rate": 9.26478673030596e-05, + "loss": 0.0297, + "step": 56200 + }, + { + "epoch": 2.5777503061936975, + "grad_norm": 1.0813257694244385, + "learning_rate": 9.260609911021209e-05, + "loss": 0.0319, + "step": 56300 + }, + { + "epoch": 2.5823289035404002, + "grad_norm": 0.3026310205459595, + "learning_rate": 9.256422207921757e-05, + "loss": 0.0315, + "step": 56400 + }, + { + "epoch": 2.5869075008871034, + "grad_norm": 0.23144447803497314, + "learning_rate": 9.252223631705175e-05, + "loss": 0.0294, + "step": 56500 + }, + { + "epoch": 2.591486098233806, + "grad_norm": 0.38160964846611023, + "learning_rate": 9.248014193096811e-05, + "loss": 0.031, + "step": 56600 + }, + { + "epoch": 2.596064695580509, + "grad_norm": 0.2660236060619354, + "learning_rate": 9.243793902849763e-05, + "loss": 0.0279, + "step": 56700 + }, + { + "epoch": 2.6006432929272116, + "grad_norm": 0.7620320320129395, + "learning_rate": 9.239562771744848e-05, + "loss": 0.0318, + "step": 56800 + }, + { + "epoch": 2.6052218902739144, + "grad_norm": 0.5840933918952942, + "learning_rate": 9.235320810590575e-05, + "loss": 0.0317, + "step": 56900 + }, + { + "epoch": 2.6098004876206176, + "grad_norm": 0.3403662443161011, + "learning_rate": 9.231068030223122e-05, + "loss": 0.0322, + "step": 57000 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.2513747811317444, + "learning_rate": 9.226804441506302e-05, + "loss": 0.0295, + "step": 57100 + }, + { + "epoch": 2.618957682314023, + "grad_norm": 0.433006227016449, + "learning_rate": 9.22253005533154e-05, + "loss": 0.0308, + "step": 57200 + }, + { + "epoch": 2.623536279660726, + "grad_norm": 0.3026902675628662, + "learning_rate": 9.218244882617842e-05, + "loss": 0.0253, + "step": 57300 + }, + { + "epoch": 2.6281148770074285, + "grad_norm": 0.4516427516937256, + "learning_rate": 9.213948934311767e-05, + "loss": 0.0295, + "step": 57400 + }, + { + "epoch": 2.6326934743541317, + "grad_norm": 0.26671695709228516, + "learning_rate": 9.209642221387405e-05, + "loss": 0.028, + "step": 57500 + }, + { + "epoch": 2.6372720717008344, + "grad_norm": 0.5790793299674988, + "learning_rate": 9.20532475484634e-05, + "loss": 0.0301, + "step": 57600 + }, + { + "epoch": 2.641850669047537, + "grad_norm": 0.37218374013900757, + "learning_rate": 9.200996545717629e-05, + "loss": 0.0302, + "step": 57700 + }, + { + "epoch": 2.6464292663942404, + "grad_norm": 0.5596415400505066, + "learning_rate": 9.196657605057769e-05, + "loss": 0.0332, + "step": 57800 + }, + { + "epoch": 2.651007863740943, + "grad_norm": 0.6026178002357483, + "learning_rate": 9.192307943950675e-05, + "loss": 0.0297, + "step": 57900 + }, + { + "epoch": 2.655586461087646, + "grad_norm": 0.13471604883670807, + "learning_rate": 9.187947573507642e-05, + "loss": 0.031, + "step": 58000 + }, + { + "epoch": 2.6601650584343486, + "grad_norm": 0.7578465342521667, + "learning_rate": 9.183576504867327e-05, + "loss": 0.0255, + "step": 58100 + }, + { + "epoch": 2.6647436557810513, + "grad_norm": 0.32717058062553406, + "learning_rate": 9.179194749195713e-05, + "loss": 0.0295, + "step": 58200 + }, + { + "epoch": 2.6693222531277545, + "grad_norm": 0.4371168911457062, + "learning_rate": 9.174802317686084e-05, + "loss": 0.0297, + "step": 58300 + }, + { + "epoch": 2.6739008504744572, + "grad_norm": 0.3458854854106903, + "learning_rate": 9.170399221558995e-05, + "loss": 0.0252, + "step": 58400 + }, + { + "epoch": 2.67847944782116, + "grad_norm": 0.8889488577842712, + "learning_rate": 9.165985472062246e-05, + "loss": 0.0292, + "step": 58500 + }, + { + "epoch": 2.6830580451678627, + "grad_norm": 0.6179521679878235, + "learning_rate": 9.161561080470847e-05, + "loss": 0.0304, + "step": 58600 + }, + { + "epoch": 2.6876366425145655, + "grad_norm": 0.3913422226905823, + "learning_rate": 9.157126058087e-05, + "loss": 0.0321, + "step": 58700 + }, + { + "epoch": 2.6922152398612686, + "grad_norm": 0.31714576482772827, + "learning_rate": 9.152680416240059e-05, + "loss": 0.0306, + "step": 58800 + }, + { + "epoch": 2.6967938372079714, + "grad_norm": 0.16598474979400635, + "learning_rate": 9.148224166286506e-05, + "loss": 0.0308, + "step": 58900 + }, + { + "epoch": 2.701372434554674, + "grad_norm": 0.4751458466053009, + "learning_rate": 9.14375731960992e-05, + "loss": 0.0328, + "step": 59000 + }, + { + "epoch": 2.705951031901377, + "grad_norm": 0.8825288414955139, + "learning_rate": 9.139279887620955e-05, + "loss": 0.0288, + "step": 59100 + }, + { + "epoch": 2.7105296292480796, + "grad_norm": 0.4172840714454651, + "learning_rate": 9.1347918817573e-05, + "loss": 0.0301, + "step": 59200 + }, + { + "epoch": 2.715108226594783, + "grad_norm": 0.3465460538864136, + "learning_rate": 9.13029331348366e-05, + "loss": 0.0252, + "step": 59300 + }, + { + "epoch": 2.7196868239414855, + "grad_norm": 1.264923095703125, + "learning_rate": 9.125784194291717e-05, + "loss": 0.0272, + "step": 59400 + }, + { + "epoch": 2.7242654212881883, + "grad_norm": 0.2547473907470703, + "learning_rate": 9.121264535700107e-05, + "loss": 0.0332, + "step": 59500 + }, + { + "epoch": 2.7288440186348915, + "grad_norm": 0.508148193359375, + "learning_rate": 9.116734349254393e-05, + "loss": 0.0317, + "step": 59600 + }, + { + "epoch": 2.733422615981594, + "grad_norm": 0.6783300638198853, + "learning_rate": 9.112193646527024e-05, + "loss": 0.0331, + "step": 59700 + }, + { + "epoch": 2.738001213328297, + "grad_norm": 0.16436424851417542, + "learning_rate": 9.107642439117321e-05, + "loss": 0.0288, + "step": 59800 + }, + { + "epoch": 2.7425798106749997, + "grad_norm": 0.4682653546333313, + "learning_rate": 9.103080738651434e-05, + "loss": 0.0287, + "step": 59900 + }, + { + "epoch": 2.7471584080217024, + "grad_norm": 0.6873565912246704, + "learning_rate": 9.09850855678232e-05, + "loss": 0.0337, + "step": 60000 + }, + { + "epoch": 2.7517370053684056, + "grad_norm": 0.6117233037948608, + "learning_rate": 9.093925905189713e-05, + "loss": 0.0298, + "step": 60100 + }, + { + "epoch": 2.7563156027151083, + "grad_norm": 0.17423506081104279, + "learning_rate": 9.089332795580086e-05, + "loss": 0.03, + "step": 60200 + }, + { + "epoch": 2.760894200061811, + "grad_norm": 0.5828815698623657, + "learning_rate": 9.084729239686633e-05, + "loss": 0.0289, + "step": 60300 + }, + { + "epoch": 2.765472797408514, + "grad_norm": 0.2698822021484375, + "learning_rate": 9.080115249269232e-05, + "loss": 0.0298, + "step": 60400 + }, + { + "epoch": 2.7700513947552166, + "grad_norm": 0.5367493629455566, + "learning_rate": 9.075490836114413e-05, + "loss": 0.0322, + "step": 60500 + }, + { + "epoch": 2.7746299921019197, + "grad_norm": 0.4073825478553772, + "learning_rate": 9.070856012035336e-05, + "loss": 0.0292, + "step": 60600 + }, + { + "epoch": 2.7792085894486225, + "grad_norm": 0.22106589376926422, + "learning_rate": 9.066210788871751e-05, + "loss": 0.0308, + "step": 60700 + }, + { + "epoch": 2.783787186795325, + "grad_norm": 0.575246274471283, + "learning_rate": 9.061555178489978e-05, + "loss": 0.0284, + "step": 60800 + }, + { + "epoch": 2.788365784142028, + "grad_norm": 0.44034871459007263, + "learning_rate": 9.056889192782866e-05, + "loss": 0.0277, + "step": 60900 + }, + { + "epoch": 2.7929443814887307, + "grad_norm": 0.2914714217185974, + "learning_rate": 9.05221284366977e-05, + "loss": 0.0298, + "step": 61000 + }, + { + "epoch": 2.797522978835434, + "grad_norm": 0.411410391330719, + "learning_rate": 9.04752614309652e-05, + "loss": 0.0256, + "step": 61100 + }, + { + "epoch": 2.8021015761821366, + "grad_norm": 0.172648623585701, + "learning_rate": 9.04282910303539e-05, + "loss": 0.0326, + "step": 61200 + }, + { + "epoch": 2.8066801735288394, + "grad_norm": 0.279862642288208, + "learning_rate": 9.038121735485062e-05, + "loss": 0.0275, + "step": 61300 + }, + { + "epoch": 2.8112587708755425, + "grad_norm": 0.2992120385169983, + "learning_rate": 9.033404052470602e-05, + "loss": 0.0287, + "step": 61400 + }, + { + "epoch": 2.815837368222245, + "grad_norm": 0.3917059004306793, + "learning_rate": 9.028676066043428e-05, + "loss": 0.0316, + "step": 61500 + }, + { + "epoch": 2.820415965568948, + "grad_norm": 0.5848602056503296, + "learning_rate": 9.023937788281278e-05, + "loss": 0.0303, + "step": 61600 + }, + { + "epoch": 2.8249945629156508, + "grad_norm": 0.4045267701148987, + "learning_rate": 9.019189231288176e-05, + "loss": 0.0282, + "step": 61700 + }, + { + "epoch": 2.8295731602623535, + "grad_norm": 0.38309866189956665, + "learning_rate": 9.014430407194413e-05, + "loss": 0.0287, + "step": 61800 + }, + { + "epoch": 2.8341517576090567, + "grad_norm": 0.7173412442207336, + "learning_rate": 9.009661328156498e-05, + "loss": 0.0274, + "step": 61900 + }, + { + "epoch": 2.8387303549557594, + "grad_norm": 0.37477946281433105, + "learning_rate": 9.00488200635714e-05, + "loss": 0.0303, + "step": 62000 + }, + { + "epoch": 2.843308952302462, + "grad_norm": 0.26493415236473083, + "learning_rate": 9.000092454005216e-05, + "loss": 0.0289, + "step": 62100 + }, + { + "epoch": 2.847887549649165, + "grad_norm": 0.15275776386260986, + "learning_rate": 8.995292683335733e-05, + "loss": 0.0304, + "step": 62200 + }, + { + "epoch": 2.8524661469958676, + "grad_norm": 0.2792358994483948, + "learning_rate": 8.990482706609805e-05, + "loss": 0.0311, + "step": 62300 + }, + { + "epoch": 2.857044744342571, + "grad_norm": 0.4240334630012512, + "learning_rate": 8.985662536114613e-05, + "loss": 0.0304, + "step": 62400 + }, + { + "epoch": 2.8616233416892736, + "grad_norm": 0.137941375374794, + "learning_rate": 8.980832184163382e-05, + "loss": 0.0309, + "step": 62500 + }, + { + "epoch": 2.8662019390359763, + "grad_norm": 0.2340019941329956, + "learning_rate": 8.975991663095344e-05, + "loss": 0.0296, + "step": 62600 + }, + { + "epoch": 2.870780536382679, + "grad_norm": 0.39523446559906006, + "learning_rate": 8.97114098527571e-05, + "loss": 0.0284, + "step": 62700 + }, + { + "epoch": 2.875359133729382, + "grad_norm": 0.5535847544670105, + "learning_rate": 8.966280163095633e-05, + "loss": 0.0325, + "step": 62800 + }, + { + "epoch": 2.879937731076085, + "grad_norm": 0.4570659101009369, + "learning_rate": 8.961409208972182e-05, + "loss": 0.0237, + "step": 62900 + }, + { + "epoch": 2.8845163284227877, + "grad_norm": 0.5584346055984497, + "learning_rate": 8.95652813534831e-05, + "loss": 0.0358, + "step": 63000 + }, + { + "epoch": 2.8890949257694905, + "grad_norm": 0.961768388748169, + "learning_rate": 8.951636954692819e-05, + "loss": 0.0299, + "step": 63100 + }, + { + "epoch": 2.893673523116193, + "grad_norm": 0.24575570225715637, + "learning_rate": 8.94673567950033e-05, + "loss": 0.0282, + "step": 63200 + }, + { + "epoch": 2.898252120462896, + "grad_norm": 0.32376107573509216, + "learning_rate": 8.941824322291246e-05, + "loss": 0.0263, + "step": 63300 + }, + { + "epoch": 2.902830717809599, + "grad_norm": 0.20682887732982635, + "learning_rate": 8.936902895611732e-05, + "loss": 0.0313, + "step": 63400 + }, + { + "epoch": 2.907409315156302, + "grad_norm": 0.29019802808761597, + "learning_rate": 8.931971412033673e-05, + "loss": 0.0327, + "step": 63500 + }, + { + "epoch": 2.9119879125030046, + "grad_norm": 0.6069703102111816, + "learning_rate": 8.927029884154646e-05, + "loss": 0.0272, + "step": 63600 + }, + { + "epoch": 2.9165665098497078, + "grad_norm": 0.5670173168182373, + "learning_rate": 8.922078324597879e-05, + "loss": 0.0317, + "step": 63700 + }, + { + "epoch": 2.9211451071964105, + "grad_norm": 0.29881516098976135, + "learning_rate": 8.917116746012235e-05, + "loss": 0.0283, + "step": 63800 + }, + { + "epoch": 2.9257237045431133, + "grad_norm": 0.722374439239502, + "learning_rate": 8.91214516107217e-05, + "loss": 0.0295, + "step": 63900 + }, + { + "epoch": 2.930302301889816, + "grad_norm": 0.4505271315574646, + "learning_rate": 8.907163582477693e-05, + "loss": 0.0282, + "step": 64000 + }, + { + "epoch": 2.9348808992365187, + "grad_norm": 0.9996728301048279, + "learning_rate": 8.902172022954353e-05, + "loss": 0.0283, + "step": 64100 + }, + { + "epoch": 2.939459496583222, + "grad_norm": 0.5205316543579102, + "learning_rate": 8.897170495253187e-05, + "loss": 0.0281, + "step": 64200 + }, + { + "epoch": 2.9440380939299247, + "grad_norm": 0.6521015763282776, + "learning_rate": 8.892159012150701e-05, + "loss": 0.0279, + "step": 64300 + }, + { + "epoch": 2.9486166912766274, + "grad_norm": 0.8637863397598267, + "learning_rate": 8.88713758644883e-05, + "loss": 0.0277, + "step": 64400 + }, + { + "epoch": 2.95319528862333, + "grad_norm": 0.9392446875572205, + "learning_rate": 8.88210623097491e-05, + "loss": 0.0256, + "step": 64500 + }, + { + "epoch": 2.957773885970033, + "grad_norm": 0.23240399360656738, + "learning_rate": 8.877064958581636e-05, + "loss": 0.0276, + "step": 64600 + }, + { + "epoch": 2.962352483316736, + "grad_norm": 0.5640022158622742, + "learning_rate": 8.872013782147047e-05, + "loss": 0.0294, + "step": 64700 + }, + { + "epoch": 2.966931080663439, + "grad_norm": 0.254486620426178, + "learning_rate": 8.86695271457447e-05, + "loss": 0.0267, + "step": 64800 + }, + { + "epoch": 2.9715096780101415, + "grad_norm": 0.4906103014945984, + "learning_rate": 8.86188176879251e-05, + "loss": 0.0279, + "step": 64900 + }, + { + "epoch": 2.9760882753568443, + "grad_norm": 0.3822503387928009, + "learning_rate": 8.856800957755e-05, + "loss": 0.0299, + "step": 65000 + }, + { + "epoch": 2.980666872703547, + "grad_norm": 0.4109038710594177, + "learning_rate": 8.851710294440973e-05, + "loss": 0.0297, + "step": 65100 + }, + { + "epoch": 2.98524547005025, + "grad_norm": 0.4413500130176544, + "learning_rate": 8.846609791854633e-05, + "loss": 0.0272, + "step": 65200 + }, + { + "epoch": 2.989824067396953, + "grad_norm": 0.762428879737854, + "learning_rate": 8.84149946302532e-05, + "loss": 0.0279, + "step": 65300 + }, + { + "epoch": 2.9944026647436557, + "grad_norm": 0.9755131602287292, + "learning_rate": 8.83637932100747e-05, + "loss": 0.0294, + "step": 65400 + }, + { + "epoch": 2.998981262090359, + "grad_norm": 0.3907323181629181, + "learning_rate": 8.831249378880591e-05, + "loss": 0.0312, + "step": 65500 + }, + { + "epoch": 2.9999885535066335, + "eval_loss": 0.14470230042934418, + "eval_runtime": 251.7148, + "eval_samples_per_second": 21.85, + "eval_steps_per_second": 21.85, + "step": 65522 + }, + { + "epoch": 3.0035598594370616, + "grad_norm": 0.09335774928331375, + "learning_rate": 8.826109649749224e-05, + "loss": 0.024, + "step": 65600 + }, + { + "epoch": 3.0081384567837643, + "grad_norm": 0.43074119091033936, + "learning_rate": 8.820960146742913e-05, + "loss": 0.0205, + "step": 65700 + }, + { + "epoch": 3.012717054130467, + "grad_norm": 0.5296483635902405, + "learning_rate": 8.815800883016168e-05, + "loss": 0.0223, + "step": 65800 + }, + { + "epoch": 3.01729565147717, + "grad_norm": 0.3759153187274933, + "learning_rate": 8.810631871748432e-05, + "loss": 0.0207, + "step": 65900 + }, + { + "epoch": 3.021874248823873, + "grad_norm": 0.6265881657600403, + "learning_rate": 8.805453126144047e-05, + "loss": 0.0218, + "step": 66000 + }, + { + "epoch": 3.0264528461705758, + "grad_norm": 1.2174720764160156, + "learning_rate": 8.800264659432232e-05, + "loss": 0.0217, + "step": 66100 + }, + { + "epoch": 3.0310314435172785, + "grad_norm": 0.9290931224822998, + "learning_rate": 8.795066484867023e-05, + "loss": 0.0199, + "step": 66200 + }, + { + "epoch": 3.0356100408639812, + "grad_norm": 0.6158362030982971, + "learning_rate": 8.789858615727265e-05, + "loss": 0.0182, + "step": 66300 + }, + { + "epoch": 3.040188638210684, + "grad_norm": 0.35175448656082153, + "learning_rate": 8.784641065316567e-05, + "loss": 0.0192, + "step": 66400 + }, + { + "epoch": 3.044767235557387, + "grad_norm": 0.6219223141670227, + "learning_rate": 8.779413846963267e-05, + "loss": 0.0174, + "step": 66500 + }, + { + "epoch": 3.04934583290409, + "grad_norm": 0.1079217791557312, + "learning_rate": 8.7741769740204e-05, + "loss": 0.0213, + "step": 66600 + }, + { + "epoch": 3.0539244302507926, + "grad_norm": 0.4346974790096283, + "learning_rate": 8.768930459865665e-05, + "loss": 0.0207, + "step": 66700 + }, + { + "epoch": 3.0585030275974954, + "grad_norm": 0.26265600323677063, + "learning_rate": 8.76367431790139e-05, + "loss": 0.0213, + "step": 66800 + }, + { + "epoch": 3.0630816249441986, + "grad_norm": 0.536638617515564, + "learning_rate": 8.758408561554495e-05, + "loss": 0.0207, + "step": 66900 + }, + { + "epoch": 3.0676602222909013, + "grad_norm": 0.4859350025653839, + "learning_rate": 8.753133204276462e-05, + "loss": 0.0208, + "step": 67000 + }, + { + "epoch": 3.072238819637604, + "grad_norm": 0.03394511342048645, + "learning_rate": 8.7478482595433e-05, + "loss": 0.0202, + "step": 67100 + }, + { + "epoch": 3.0768174169843068, + "grad_norm": 1.2979317903518677, + "learning_rate": 8.742553740855506e-05, + "loss": 0.02, + "step": 67200 + }, + { + "epoch": 3.0813960143310095, + "grad_norm": 0.7448957562446594, + "learning_rate": 8.737249661738036e-05, + "loss": 0.02, + "step": 67300 + }, + { + "epoch": 3.0859746116777127, + "grad_norm": 0.45380228757858276, + "learning_rate": 8.731936035740269e-05, + "loss": 0.0214, + "step": 67400 + }, + { + "epoch": 3.0905532090244154, + "grad_norm": 0.49080690741539, + "learning_rate": 8.726612876435972e-05, + "loss": 0.0206, + "step": 67500 + }, + { + "epoch": 3.095131806371118, + "grad_norm": 0.2271386682987213, + "learning_rate": 8.721280197423258e-05, + "loss": 0.0218, + "step": 67600 + }, + { + "epoch": 3.099710403717821, + "grad_norm": 0.7691048383712769, + "learning_rate": 8.71593801232457e-05, + "loss": 0.0184, + "step": 67700 + }, + { + "epoch": 3.104289001064524, + "grad_norm": 0.37762150168418884, + "learning_rate": 8.710586334786627e-05, + "loss": 0.0196, + "step": 67800 + }, + { + "epoch": 3.108867598411227, + "grad_norm": 0.4796387255191803, + "learning_rate": 8.705225178480398e-05, + "loss": 0.0194, + "step": 67900 + }, + { + "epoch": 3.1134461957579296, + "grad_norm": 0.1666077822446823, + "learning_rate": 8.699854557101063e-05, + "loss": 0.0215, + "step": 68000 + }, + { + "epoch": 3.1180247931046323, + "grad_norm": 0.287124365568161, + "learning_rate": 8.69447448436799e-05, + "loss": 0.0184, + "step": 68100 + }, + { + "epoch": 3.122603390451335, + "grad_norm": 0.2599179744720459, + "learning_rate": 8.689084974024677e-05, + "loss": 0.0185, + "step": 68200 + }, + { + "epoch": 3.1271819877980382, + "grad_norm": 0.33696624636650085, + "learning_rate": 8.683686039838742e-05, + "loss": 0.0199, + "step": 68300 + }, + { + "epoch": 3.131760585144741, + "grad_norm": 0.4512630105018616, + "learning_rate": 8.678277695601872e-05, + "loss": 0.0189, + "step": 68400 + }, + { + "epoch": 3.1363391824914437, + "grad_norm": 1.3083339929580688, + "learning_rate": 8.67285995512979e-05, + "loss": 0.0205, + "step": 68500 + }, + { + "epoch": 3.1409177798381465, + "grad_norm": 0.5254839658737183, + "learning_rate": 8.66743283226223e-05, + "loss": 0.021, + "step": 68600 + }, + { + "epoch": 3.145496377184849, + "grad_norm": 0.37214428186416626, + "learning_rate": 8.66199634086288e-05, + "loss": 0.0214, + "step": 68700 + }, + { + "epoch": 3.1500749745315524, + "grad_norm": 0.39814454317092896, + "learning_rate": 8.656550494819373e-05, + "loss": 0.0215, + "step": 68800 + }, + { + "epoch": 3.154653571878255, + "grad_norm": 0.7737843990325928, + "learning_rate": 8.651095308043232e-05, + "loss": 0.0199, + "step": 68900 + }, + { + "epoch": 3.159232169224958, + "grad_norm": 0.32976606488227844, + "learning_rate": 8.645630794469843e-05, + "loss": 0.0232, + "step": 69000 + }, + { + "epoch": 3.1638107665716606, + "grad_norm": 0.23388764262199402, + "learning_rate": 8.640156968058417e-05, + "loss": 0.0197, + "step": 69100 + }, + { + "epoch": 3.168389363918364, + "grad_norm": 0.15984760224819183, + "learning_rate": 8.634673842791956e-05, + "loss": 0.0212, + "step": 69200 + }, + { + "epoch": 3.1729679612650665, + "grad_norm": 0.20868225395679474, + "learning_rate": 8.629181432677213e-05, + "loss": 0.02, + "step": 69300 + }, + { + "epoch": 3.1775465586117693, + "grad_norm": 0.12190031260251999, + "learning_rate": 8.623679751744662e-05, + "loss": 0.0195, + "step": 69400 + }, + { + "epoch": 3.182125155958472, + "grad_norm": 0.7357327342033386, + "learning_rate": 8.61816881404846e-05, + "loss": 0.0212, + "step": 69500 + }, + { + "epoch": 3.186703753305175, + "grad_norm": 0.231657475233078, + "learning_rate": 8.612648633666406e-05, + "loss": 0.0181, + "step": 69600 + }, + { + "epoch": 3.191282350651878, + "grad_norm": 0.9028156995773315, + "learning_rate": 8.607119224699919e-05, + "loss": 0.0216, + "step": 69700 + }, + { + "epoch": 3.1958609479985807, + "grad_norm": 0.30773207545280457, + "learning_rate": 8.601580601273982e-05, + "loss": 0.0189, + "step": 69800 + }, + { + "epoch": 3.2004395453452834, + "grad_norm": 0.15716642141342163, + "learning_rate": 8.596032777537123e-05, + "loss": 0.022, + "step": 69900 + }, + { + "epoch": 3.205018142691986, + "grad_norm": 0.2637390196323395, + "learning_rate": 8.59047576766137e-05, + "loss": 0.0174, + "step": 70000 + }, + { + "epoch": 3.2095967400386893, + "grad_norm": 0.29018816351890564, + "learning_rate": 8.584909585842218e-05, + "loss": 0.0205, + "step": 70100 + }, + { + "epoch": 3.214175337385392, + "grad_norm": 0.6676698327064514, + "learning_rate": 8.579334246298593e-05, + "loss": 0.0176, + "step": 70200 + }, + { + "epoch": 3.218753934732095, + "grad_norm": 0.3571256101131439, + "learning_rate": 8.573749763272811e-05, + "loss": 0.0229, + "step": 70300 + }, + { + "epoch": 3.2233325320787976, + "grad_norm": 0.7378453016281128, + "learning_rate": 8.568156151030549e-05, + "loss": 0.0185, + "step": 70400 + }, + { + "epoch": 3.2279111294255003, + "grad_norm": 0.533330500125885, + "learning_rate": 8.562553423860802e-05, + "loss": 0.0207, + "step": 70500 + }, + { + "epoch": 3.2324897267722035, + "grad_norm": 0.28255611658096313, + "learning_rate": 8.556941596075852e-05, + "loss": 0.0185, + "step": 70600 + }, + { + "epoch": 3.237068324118906, + "grad_norm": 0.37244170904159546, + "learning_rate": 8.551320682011228e-05, + "loss": 0.0217, + "step": 70700 + }, + { + "epoch": 3.241646921465609, + "grad_norm": 0.16496537625789642, + "learning_rate": 8.545690696025666e-05, + "loss": 0.0238, + "step": 70800 + }, + { + "epoch": 3.2462255188123117, + "grad_norm": 1.0030924081802368, + "learning_rate": 8.540051652501082e-05, + "loss": 0.0213, + "step": 70900 + }, + { + "epoch": 3.250804116159015, + "grad_norm": 0.7419716715812683, + "learning_rate": 8.534403565842528e-05, + "loss": 0.0225, + "step": 71000 + }, + { + "epoch": 3.2553827135057176, + "grad_norm": 0.2792261242866516, + "learning_rate": 8.528746450478156e-05, + "loss": 0.0187, + "step": 71100 + }, + { + "epoch": 3.2599613108524204, + "grad_norm": 0.0836094543337822, + "learning_rate": 8.523080320859181e-05, + "loss": 0.0221, + "step": 71200 + }, + { + "epoch": 3.264539908199123, + "grad_norm": 0.10340839624404907, + "learning_rate": 8.517405191459847e-05, + "loss": 0.0213, + "step": 71300 + }, + { + "epoch": 3.2691185055458263, + "grad_norm": 0.7118562459945679, + "learning_rate": 8.511721076777389e-05, + "loss": 0.0193, + "step": 71400 + }, + { + "epoch": 3.273697102892529, + "grad_norm": 0.12246321886777878, + "learning_rate": 8.50602799133199e-05, + "loss": 0.0223, + "step": 71500 + }, + { + "epoch": 3.2782757002392318, + "grad_norm": 0.2873895764350891, + "learning_rate": 8.500325949666755e-05, + "loss": 0.0213, + "step": 71600 + }, + { + "epoch": 3.2828542975859345, + "grad_norm": 0.5243780016899109, + "learning_rate": 8.494614966347668e-05, + "loss": 0.0201, + "step": 71700 + }, + { + "epoch": 3.2874328949326372, + "grad_norm": 0.28602150082588196, + "learning_rate": 8.488895055963546e-05, + "loss": 0.0209, + "step": 71800 + }, + { + "epoch": 3.2920114922793404, + "grad_norm": 0.35241249203681946, + "learning_rate": 8.483166233126022e-05, + "loss": 0.0217, + "step": 71900 + }, + { + "epoch": 3.296590089626043, + "grad_norm": 0.6958779096603394, + "learning_rate": 8.477428512469488e-05, + "loss": 0.023, + "step": 72000 + }, + { + "epoch": 3.301168686972746, + "grad_norm": 0.13842323422431946, + "learning_rate": 8.471681908651067e-05, + "loss": 0.0202, + "step": 72100 + }, + { + "epoch": 3.3057472843194486, + "grad_norm": 0.21349883079528809, + "learning_rate": 8.46592643635058e-05, + "loss": 0.0209, + "step": 72200 + }, + { + "epoch": 3.3103258816661514, + "grad_norm": 0.3605678975582123, + "learning_rate": 8.460162110270494e-05, + "loss": 0.0241, + "step": 72300 + }, + { + "epoch": 3.3149044790128546, + "grad_norm": 0.46661075949668884, + "learning_rate": 8.454388945135895e-05, + "loss": 0.0193, + "step": 72400 + }, + { + "epoch": 3.3194830763595573, + "grad_norm": 0.24211075901985168, + "learning_rate": 8.448606955694457e-05, + "loss": 0.0214, + "step": 72500 + }, + { + "epoch": 3.32406167370626, + "grad_norm": 0.3622238337993622, + "learning_rate": 8.442816156716385e-05, + "loss": 0.0213, + "step": 72600 + }, + { + "epoch": 3.328640271052963, + "grad_norm": 1.0499359369277954, + "learning_rate": 8.437016562994397e-05, + "loss": 0.0196, + "step": 72700 + }, + { + "epoch": 3.3332188683996655, + "grad_norm": 0.2845001816749573, + "learning_rate": 8.43120818934367e-05, + "loss": 0.0202, + "step": 72800 + }, + { + "epoch": 3.3377974657463687, + "grad_norm": 0.5690521001815796, + "learning_rate": 8.42539105060181e-05, + "loss": 0.0209, + "step": 72900 + }, + { + "epoch": 3.3423760630930714, + "grad_norm": 0.09998586773872375, + "learning_rate": 8.419565161628823e-05, + "loss": 0.018, + "step": 73000 + }, + { + "epoch": 3.346954660439774, + "grad_norm": 0.9970934391021729, + "learning_rate": 8.413730537307056e-05, + "loss": 0.0213, + "step": 73100 + }, + { + "epoch": 3.351533257786477, + "grad_norm": 1.1385819911956787, + "learning_rate": 8.407887192541177e-05, + "loss": 0.0198, + "step": 73200 + }, + { + "epoch": 3.35611185513318, + "grad_norm": 0.6288115382194519, + "learning_rate": 8.402035142258131e-05, + "loss": 0.0211, + "step": 73300 + }, + { + "epoch": 3.360690452479883, + "grad_norm": 0.35352623462677, + "learning_rate": 8.396174401407095e-05, + "loss": 0.0189, + "step": 73400 + }, + { + "epoch": 3.3652690498265856, + "grad_norm": 0.5127176642417908, + "learning_rate": 8.390304984959454e-05, + "loss": 0.0195, + "step": 73500 + }, + { + "epoch": 3.3698476471732883, + "grad_norm": 0.9110797643661499, + "learning_rate": 8.384426907908754e-05, + "loss": 0.0219, + "step": 73600 + }, + { + "epoch": 3.3744262445199915, + "grad_norm": 0.22417746484279633, + "learning_rate": 8.378540185270656e-05, + "loss": 0.0194, + "step": 73700 + }, + { + "epoch": 3.3790048418666943, + "grad_norm": 0.49265140295028687, + "learning_rate": 8.372644832082917e-05, + "loss": 0.0205, + "step": 73800 + }, + { + "epoch": 3.383583439213397, + "grad_norm": 0.7536473870277405, + "learning_rate": 8.366740863405336e-05, + "loss": 0.0222, + "step": 73900 + }, + { + "epoch": 3.3881620365600997, + "grad_norm": 0.2447548657655716, + "learning_rate": 8.360828294319721e-05, + "loss": 0.0205, + "step": 74000 + }, + { + "epoch": 3.3927406339068025, + "grad_norm": 0.3335092067718506, + "learning_rate": 8.354907139929851e-05, + "loss": 0.0208, + "step": 74100 + }, + { + "epoch": 3.3973192312535057, + "grad_norm": 0.6961463689804077, + "learning_rate": 8.348977415361434e-05, + "loss": 0.018, + "step": 74200 + }, + { + "epoch": 3.4018978286002084, + "grad_norm": 0.4184730648994446, + "learning_rate": 8.343039135762071e-05, + "loss": 0.0198, + "step": 74300 + }, + { + "epoch": 3.406476425946911, + "grad_norm": 0.6484507918357849, + "learning_rate": 8.337092316301223e-05, + "loss": 0.0203, + "step": 74400 + }, + { + "epoch": 3.411055023293614, + "grad_norm": 0.31808891892433167, + "learning_rate": 8.331136972170155e-05, + "loss": 0.0202, + "step": 74500 + }, + { + "epoch": 3.4156336206403166, + "grad_norm": 0.6552246809005737, + "learning_rate": 8.325173118581919e-05, + "loss": 0.0198, + "step": 74600 + }, + { + "epoch": 3.42021221798702, + "grad_norm": 0.5105406641960144, + "learning_rate": 8.319200770771298e-05, + "loss": 0.0197, + "step": 74700 + }, + { + "epoch": 3.4247908153337225, + "grad_norm": 0.9565762877464294, + "learning_rate": 8.313219943994777e-05, + "loss": 0.019, + "step": 74800 + }, + { + "epoch": 3.4293694126804253, + "grad_norm": 0.7772880792617798, + "learning_rate": 8.3072306535305e-05, + "loss": 0.0207, + "step": 74900 + }, + { + "epoch": 3.433948010027128, + "grad_norm": 0.6711807250976562, + "learning_rate": 8.30123291467823e-05, + "loss": 0.0222, + "step": 75000 + }, + { + "epoch": 3.438526607373831, + "grad_norm": 0.10591955482959747, + "learning_rate": 8.295226742759315e-05, + "loss": 0.0199, + "step": 75100 + }, + { + "epoch": 3.443105204720534, + "grad_norm": 0.5128488540649414, + "learning_rate": 8.289212153116642e-05, + "loss": 0.0219, + "step": 75200 + }, + { + "epoch": 3.4476838020672367, + "grad_norm": 0.24297969043254852, + "learning_rate": 8.283189161114602e-05, + "loss": 0.0205, + "step": 75300 + }, + { + "epoch": 3.4522623994139394, + "grad_norm": 0.9164755344390869, + "learning_rate": 8.27715778213905e-05, + "loss": 0.0224, + "step": 75400 + }, + { + "epoch": 3.4568409967606426, + "grad_norm": 0.493466317653656, + "learning_rate": 8.271118031597271e-05, + "loss": 0.0204, + "step": 75500 + }, + { + "epoch": 3.4614195941073453, + "grad_norm": 0.27884870767593384, + "learning_rate": 8.265069924917925e-05, + "loss": 0.0199, + "step": 75600 + }, + { + "epoch": 3.465998191454048, + "grad_norm": 0.2624457776546478, + "learning_rate": 8.259013477551027e-05, + "loss": 0.0223, + "step": 75700 + }, + { + "epoch": 3.470576788800751, + "grad_norm": 0.6593875885009766, + "learning_rate": 8.252948704967896e-05, + "loss": 0.0186, + "step": 75800 + }, + { + "epoch": 3.4751553861474536, + "grad_norm": 0.398616760969162, + "learning_rate": 8.246875622661113e-05, + "loss": 0.0199, + "step": 75900 + }, + { + "epoch": 3.4797339834941567, + "grad_norm": 0.2612878978252411, + "learning_rate": 8.240794246144492e-05, + "loss": 0.0207, + "step": 76000 + }, + { + "epoch": 3.4843125808408595, + "grad_norm": 0.21333344280719757, + "learning_rate": 8.234704590953033e-05, + "loss": 0.0205, + "step": 76100 + }, + { + "epoch": 3.4888911781875622, + "grad_norm": 1.0213849544525146, + "learning_rate": 8.228606672642884e-05, + "loss": 0.0199, + "step": 76200 + }, + { + "epoch": 3.493469775534265, + "grad_norm": 0.29667162895202637, + "learning_rate": 8.222500506791304e-05, + "loss": 0.0215, + "step": 76300 + }, + { + "epoch": 3.4980483728809677, + "grad_norm": 0.20311638712882996, + "learning_rate": 8.216386108996614e-05, + "loss": 0.0219, + "step": 76400 + }, + { + "epoch": 3.502626970227671, + "grad_norm": 0.8317406177520752, + "learning_rate": 8.21026349487817e-05, + "loss": 0.0215, + "step": 76500 + }, + { + "epoch": 3.5072055675743736, + "grad_norm": 0.4841706156730652, + "learning_rate": 8.204132680076312e-05, + "loss": 0.0207, + "step": 76600 + }, + { + "epoch": 3.5117841649210764, + "grad_norm": 0.5647122263908386, + "learning_rate": 8.197993680252334e-05, + "loss": 0.0217, + "step": 76700 + }, + { + "epoch": 3.516362762267779, + "grad_norm": 0.9369067549705505, + "learning_rate": 8.191846511088435e-05, + "loss": 0.0215, + "step": 76800 + }, + { + "epoch": 3.520941359614482, + "grad_norm": 0.7805814743041992, + "learning_rate": 8.185691188287684e-05, + "loss": 0.0219, + "step": 76900 + }, + { + "epoch": 3.525519956961185, + "grad_norm": 1.2135581970214844, + "learning_rate": 8.179527727573975e-05, + "loss": 0.0193, + "step": 77000 + }, + { + "epoch": 3.5300985543078878, + "grad_norm": 0.14101019501686096, + "learning_rate": 8.173356144691999e-05, + "loss": 0.0211, + "step": 77100 + }, + { + "epoch": 3.5346771516545905, + "grad_norm": 0.7078022956848145, + "learning_rate": 8.167176455407187e-05, + "loss": 0.0204, + "step": 77200 + }, + { + "epoch": 3.5392557490012937, + "grad_norm": 1.2366012334823608, + "learning_rate": 8.160988675505679e-05, + "loss": 0.0183, + "step": 77300 + }, + { + "epoch": 3.5438343463479964, + "grad_norm": 0.26279062032699585, + "learning_rate": 8.15479282079429e-05, + "loss": 0.02, + "step": 77400 + }, + { + "epoch": 3.548412943694699, + "grad_norm": 0.21293646097183228, + "learning_rate": 8.148588907100454e-05, + "loss": 0.0203, + "step": 77500 + }, + { + "epoch": 3.552991541041402, + "grad_norm": 0.48216012120246887, + "learning_rate": 8.142376950272193e-05, + "loss": 0.0192, + "step": 77600 + }, + { + "epoch": 3.5575701383881047, + "grad_norm": 0.1273164004087448, + "learning_rate": 8.136156966178081e-05, + "loss": 0.0183, + "step": 77700 + }, + { + "epoch": 3.562148735734808, + "grad_norm": 0.621910035610199, + "learning_rate": 8.12992897070719e-05, + "loss": 0.0217, + "step": 77800 + }, + { + "epoch": 3.5667273330815106, + "grad_norm": 0.3813430964946747, + "learning_rate": 8.123692979769064e-05, + "loss": 0.0184, + "step": 77900 + }, + { + "epoch": 3.5713059304282133, + "grad_norm": 0.3676023781299591, + "learning_rate": 8.117449009293668e-05, + "loss": 0.0175, + "step": 78000 + }, + { + "epoch": 3.575884527774916, + "grad_norm": 0.41113948822021484, + "learning_rate": 8.111197075231351e-05, + "loss": 0.0194, + "step": 78100 + }, + { + "epoch": 3.580463125121619, + "grad_norm": 0.2245587855577469, + "learning_rate": 8.104937193552806e-05, + "loss": 0.0212, + "step": 78200 + }, + { + "epoch": 3.585041722468322, + "grad_norm": 0.08874198794364929, + "learning_rate": 8.098669380249029e-05, + "loss": 0.0192, + "step": 78300 + }, + { + "epoch": 3.5896203198150247, + "grad_norm": 0.29562532901763916, + "learning_rate": 8.092393651331275e-05, + "loss": 0.022, + "step": 78400 + }, + { + "epoch": 3.5941989171617275, + "grad_norm": 0.47509998083114624, + "learning_rate": 8.086110022831023e-05, + "loss": 0.0202, + "step": 78500 + }, + { + "epoch": 3.59877751450843, + "grad_norm": 0.41073593497276306, + "learning_rate": 8.079818510799928e-05, + "loss": 0.0214, + "step": 78600 + }, + { + "epoch": 3.603356111855133, + "grad_norm": 0.2985229790210724, + "learning_rate": 8.073519131309786e-05, + "loss": 0.0165, + "step": 78700 + }, + { + "epoch": 3.607934709201836, + "grad_norm": 0.7368443012237549, + "learning_rate": 8.067211900452492e-05, + "loss": 0.0177, + "step": 78800 + }, + { + "epoch": 3.612513306548539, + "grad_norm": 0.46281248331069946, + "learning_rate": 8.060896834339993e-05, + "loss": 0.0221, + "step": 78900 + }, + { + "epoch": 3.6170919038952416, + "grad_norm": 0.18318797647953033, + "learning_rate": 8.054573949104253e-05, + "loss": 0.0191, + "step": 79000 + }, + { + "epoch": 3.621670501241945, + "grad_norm": 0.19009487330913544, + "learning_rate": 8.048243260897217e-05, + "loss": 0.0212, + "step": 79100 + }, + { + "epoch": 3.6262490985886475, + "grad_norm": 0.38268911838531494, + "learning_rate": 8.041904785890749e-05, + "loss": 0.0197, + "step": 79200 + }, + { + "epoch": 3.6308276959353503, + "grad_norm": 0.3892700672149658, + "learning_rate": 8.035558540276618e-05, + "loss": 0.0214, + "step": 79300 + }, + { + "epoch": 3.635406293282053, + "grad_norm": 0.6497855186462402, + "learning_rate": 8.029204540266434e-05, + "loss": 0.0192, + "step": 79400 + }, + { + "epoch": 3.6399848906287557, + "grad_norm": 0.20039434731006622, + "learning_rate": 8.022842802091623e-05, + "loss": 0.0188, + "step": 79500 + }, + { + "epoch": 3.644563487975459, + "grad_norm": 0.19965870678424835, + "learning_rate": 8.016473342003372e-05, + "loss": 0.0204, + "step": 79600 + }, + { + "epoch": 3.6491420853221617, + "grad_norm": 0.12873798608779907, + "learning_rate": 8.010096176272595e-05, + "loss": 0.0189, + "step": 79700 + }, + { + "epoch": 3.6537206826688644, + "grad_norm": 0.26886749267578125, + "learning_rate": 8.003711321189895e-05, + "loss": 0.0206, + "step": 79800 + }, + { + "epoch": 3.658299280015567, + "grad_norm": 0.4891631305217743, + "learning_rate": 7.997318793065513e-05, + "loss": 0.0204, + "step": 79900 + }, + { + "epoch": 3.66287787736227, + "grad_norm": 0.2781907021999359, + "learning_rate": 7.99091860822929e-05, + "loss": 0.0192, + "step": 80000 + }, + { + "epoch": 3.667456474708973, + "grad_norm": 0.3009509742259979, + "learning_rate": 7.984510783030632e-05, + "loss": 0.0185, + "step": 80100 + }, + { + "epoch": 3.672035072055676, + "grad_norm": 0.5892056822776794, + "learning_rate": 7.978095333838457e-05, + "loss": 0.0191, + "step": 80200 + }, + { + "epoch": 3.6766136694023785, + "grad_norm": 0.3318547308444977, + "learning_rate": 7.97167227704116e-05, + "loss": 0.0194, + "step": 80300 + }, + { + "epoch": 3.6811922667490813, + "grad_norm": 0.4608217179775238, + "learning_rate": 7.965241629046571e-05, + "loss": 0.0215, + "step": 80400 + }, + { + "epoch": 3.685770864095784, + "grad_norm": 0.39660006761550903, + "learning_rate": 7.95880340628191e-05, + "loss": 0.0172, + "step": 80500 + }, + { + "epoch": 3.690349461442487, + "grad_norm": 0.182856485247612, + "learning_rate": 7.952357625193749e-05, + "loss": 0.0184, + "step": 80600 + }, + { + "epoch": 3.69492805878919, + "grad_norm": 0.6444191932678223, + "learning_rate": 7.945904302247969e-05, + "loss": 0.0179, + "step": 80700 + }, + { + "epoch": 3.6995066561358927, + "grad_norm": 0.4182109534740448, + "learning_rate": 7.939443453929712e-05, + "loss": 0.0217, + "step": 80800 + }, + { + "epoch": 3.704085253482596, + "grad_norm": 4.025650501251221, + "learning_rate": 7.932975096743346e-05, + "loss": 0.0203, + "step": 80900 + }, + { + "epoch": 3.708663850829298, + "grad_norm": 0.665017306804657, + "learning_rate": 7.926499247212422e-05, + "loss": 0.0186, + "step": 81000 + }, + { + "epoch": 3.7132424481760014, + "grad_norm": 0.12548814713954926, + "learning_rate": 7.920015921879631e-05, + "loss": 0.0182, + "step": 81100 + }, + { + "epoch": 3.717821045522704, + "grad_norm": 0.33034953474998474, + "learning_rate": 7.913525137306756e-05, + "loss": 0.0225, + "step": 81200 + }, + { + "epoch": 3.722399642869407, + "grad_norm": 0.2771977186203003, + "learning_rate": 7.907026910074643e-05, + "loss": 0.0206, + "step": 81300 + }, + { + "epoch": 3.72697824021611, + "grad_norm": 0.1603299379348755, + "learning_rate": 7.900521256783143e-05, + "loss": 0.0191, + "step": 81400 + }, + { + "epoch": 3.7315568375628128, + "grad_norm": 0.29296520352363586, + "learning_rate": 7.894008194051077e-05, + "loss": 0.0199, + "step": 81500 + }, + { + "epoch": 3.7361354349095155, + "grad_norm": 0.3158813416957855, + "learning_rate": 7.8874877385162e-05, + "loss": 0.0216, + "step": 81600 + }, + { + "epoch": 3.7407140322562182, + "grad_norm": 0.42911648750305176, + "learning_rate": 7.880959906835148e-05, + "loss": 0.0174, + "step": 81700 + }, + { + "epoch": 3.745292629602921, + "grad_norm": 0.3854501247406006, + "learning_rate": 7.8744247156834e-05, + "loss": 0.0217, + "step": 81800 + }, + { + "epoch": 3.749871226949624, + "grad_norm": 0.1661909967660904, + "learning_rate": 7.86788218175523e-05, + "loss": 0.0185, + "step": 81900 + }, + { + "epoch": 3.754449824296327, + "grad_norm": 0.3275599479675293, + "learning_rate": 7.861332321763682e-05, + "loss": 0.0172, + "step": 82000 + }, + { + "epoch": 3.7590284216430296, + "grad_norm": 0.4914777874946594, + "learning_rate": 7.854775152440501e-05, + "loss": 0.0206, + "step": 82100 + }, + { + "epoch": 3.7636070189897324, + "grad_norm": 0.6310822367668152, + "learning_rate": 7.84821069053611e-05, + "loss": 0.0193, + "step": 82200 + }, + { + "epoch": 3.768185616336435, + "grad_norm": 0.33729735016822815, + "learning_rate": 7.841638952819563e-05, + "loss": 0.0209, + "step": 82300 + }, + { + "epoch": 3.7727642136831383, + "grad_norm": 0.6020189523696899, + "learning_rate": 7.835059956078494e-05, + "loss": 0.0194, + "step": 82400 + }, + { + "epoch": 3.777342811029841, + "grad_norm": 0.3810158669948578, + "learning_rate": 7.828473717119088e-05, + "loss": 0.0199, + "step": 82500 + }, + { + "epoch": 3.781921408376544, + "grad_norm": 0.6647739410400391, + "learning_rate": 7.821880252766025e-05, + "loss": 0.0211, + "step": 82600 + }, + { + "epoch": 3.7865000057232465, + "grad_norm": 0.5358772873878479, + "learning_rate": 7.815279579862442e-05, + "loss": 0.0196, + "step": 82700 + }, + { + "epoch": 3.7910786030699493, + "grad_norm": 0.26241055130958557, + "learning_rate": 7.808671715269896e-05, + "loss": 0.0206, + "step": 82800 + }, + { + "epoch": 3.7956572004166524, + "grad_norm": 0.24061718583106995, + "learning_rate": 7.802056675868306e-05, + "loss": 0.0186, + "step": 82900 + }, + { + "epoch": 3.800235797763355, + "grad_norm": 0.16280798614025116, + "learning_rate": 7.79543447855593e-05, + "loss": 0.0185, + "step": 83000 + }, + { + "epoch": 3.804814395110058, + "grad_norm": 0.7385302186012268, + "learning_rate": 7.788805140249302e-05, + "loss": 0.0207, + "step": 83100 + }, + { + "epoch": 3.809392992456761, + "grad_norm": 0.20743854343891144, + "learning_rate": 7.782168677883206e-05, + "loss": 0.0177, + "step": 83200 + }, + { + "epoch": 3.813971589803464, + "grad_norm": 0.3482532501220703, + "learning_rate": 7.775525108410615e-05, + "loss": 0.0216, + "step": 83300 + }, + { + "epoch": 3.8185501871501666, + "grad_norm": 0.42130351066589355, + "learning_rate": 7.768874448802665e-05, + "loss": 0.0207, + "step": 83400 + }, + { + "epoch": 3.8231287844968693, + "grad_norm": 0.44204580783843994, + "learning_rate": 7.762216716048602e-05, + "loss": 0.0215, + "step": 83500 + }, + { + "epoch": 3.827707381843572, + "grad_norm": 0.20962856709957123, + "learning_rate": 7.755551927155739e-05, + "loss": 0.0183, + "step": 83600 + }, + { + "epoch": 3.8322859791902752, + "grad_norm": 0.19921015202999115, + "learning_rate": 7.748880099149415e-05, + "loss": 0.02, + "step": 83700 + }, + { + "epoch": 3.836864576536978, + "grad_norm": 0.2693636119365692, + "learning_rate": 7.742201249072948e-05, + "loss": 0.019, + "step": 83800 + }, + { + "epoch": 3.8414431738836807, + "grad_norm": 0.677135705947876, + "learning_rate": 7.735515393987602e-05, + "loss": 0.0195, + "step": 83900 + }, + { + "epoch": 3.8460217712303835, + "grad_norm": 0.34260210394859314, + "learning_rate": 7.728822550972523e-05, + "loss": 0.0194, + "step": 84000 + }, + { + "epoch": 3.850600368577086, + "grad_norm": 0.83556067943573, + "learning_rate": 7.72212273712472e-05, + "loss": 0.0226, + "step": 84100 + }, + { + "epoch": 3.8551789659237894, + "grad_norm": 0.22360268235206604, + "learning_rate": 7.715415969559002e-05, + "loss": 0.0177, + "step": 84200 + }, + { + "epoch": 3.859757563270492, + "grad_norm": 0.32109469175338745, + "learning_rate": 7.708702265407941e-05, + "loss": 0.0197, + "step": 84300 + }, + { + "epoch": 3.864336160617195, + "grad_norm": 0.4577140212059021, + "learning_rate": 7.701981641821834e-05, + "loss": 0.0173, + "step": 84400 + }, + { + "epoch": 3.8689147579638976, + "grad_norm": 0.30675482749938965, + "learning_rate": 7.695254115968648e-05, + "loss": 0.0198, + "step": 84500 + }, + { + "epoch": 3.8734933553106004, + "grad_norm": 0.6526969075202942, + "learning_rate": 7.688519705033989e-05, + "loss": 0.0222, + "step": 84600 + }, + { + "epoch": 3.8780719526573035, + "grad_norm": 0.09654036164283752, + "learning_rate": 7.681778426221042e-05, + "loss": 0.0194, + "step": 84700 + }, + { + "epoch": 3.8826505500040063, + "grad_norm": 0.2337755411863327, + "learning_rate": 7.675030296750542e-05, + "loss": 0.019, + "step": 84800 + }, + { + "epoch": 3.887229147350709, + "grad_norm": 0.05356181785464287, + "learning_rate": 7.668275333860724e-05, + "loss": 0.0202, + "step": 84900 + }, + { + "epoch": 3.891807744697412, + "grad_norm": 0.4630540907382965, + "learning_rate": 7.66151355480728e-05, + "loss": 0.0182, + "step": 85000 + }, + { + "epoch": 3.896386342044115, + "grad_norm": 0.21360653638839722, + "learning_rate": 7.65474497686331e-05, + "loss": 0.0198, + "step": 85100 + }, + { + "epoch": 3.9009649393908177, + "grad_norm": 0.2991812229156494, + "learning_rate": 7.647969617319282e-05, + "loss": 0.0201, + "step": 85200 + }, + { + "epoch": 3.9055435367375204, + "grad_norm": 0.214981809258461, + "learning_rate": 7.641187493482995e-05, + "loss": 0.0164, + "step": 85300 + }, + { + "epoch": 3.910122134084223, + "grad_norm": 0.48418205976486206, + "learning_rate": 7.634398622679517e-05, + "loss": 0.0192, + "step": 85400 + }, + { + "epoch": 3.9147007314309263, + "grad_norm": 0.5781142711639404, + "learning_rate": 7.62760302225116e-05, + "loss": 0.0199, + "step": 85500 + }, + { + "epoch": 3.919279328777629, + "grad_norm": 0.7809280157089233, + "learning_rate": 7.620800709557421e-05, + "loss": 0.0186, + "step": 85600 + }, + { + "epoch": 3.923857926124332, + "grad_norm": 0.1833581030368805, + "learning_rate": 7.61399170197495e-05, + "loss": 0.0189, + "step": 85700 + }, + { + "epoch": 3.9284365234710346, + "grad_norm": 0.3215663433074951, + "learning_rate": 7.60717601689749e-05, + "loss": 0.0168, + "step": 85800 + }, + { + "epoch": 3.9330151208177373, + "grad_norm": 0.41018444299697876, + "learning_rate": 7.600353671735853e-05, + "loss": 0.0208, + "step": 85900 + }, + { + "epoch": 3.9375937181644405, + "grad_norm": 0.34082677960395813, + "learning_rate": 7.593524683917854e-05, + "loss": 0.0191, + "step": 86000 + }, + { + "epoch": 3.942172315511143, + "grad_norm": 0.39426901936531067, + "learning_rate": 7.586689070888284e-05, + "loss": 0.0199, + "step": 86100 + }, + { + "epoch": 3.946750912857846, + "grad_norm": 0.4446451663970947, + "learning_rate": 7.579846850108855e-05, + "loss": 0.0204, + "step": 86200 + }, + { + "epoch": 3.9513295102045487, + "grad_norm": 0.3159216344356537, + "learning_rate": 7.572998039058159e-05, + "loss": 0.0183, + "step": 86300 + }, + { + "epoch": 3.9559081075512514, + "grad_norm": 0.3799346387386322, + "learning_rate": 7.566142655231622e-05, + "loss": 0.019, + "step": 86400 + }, + { + "epoch": 3.9604867048979546, + "grad_norm": 0.4832625687122345, + "learning_rate": 7.559280716141463e-05, + "loss": 0.0179, + "step": 86500 + }, + { + "epoch": 3.9650653022446574, + "grad_norm": 0.2456403523683548, + "learning_rate": 7.552412239316645e-05, + "loss": 0.0184, + "step": 86600 + }, + { + "epoch": 3.96964389959136, + "grad_norm": 0.3314709961414337, + "learning_rate": 7.545537242302829e-05, + "loss": 0.0177, + "step": 86700 + }, + { + "epoch": 3.9742224969380633, + "grad_norm": 0.4336375892162323, + "learning_rate": 7.53865574266234e-05, + "loss": 0.0187, + "step": 86800 + }, + { + "epoch": 3.978801094284766, + "grad_norm": 0.7629146575927734, + "learning_rate": 7.531767757974104e-05, + "loss": 0.0199, + "step": 86900 + }, + { + "epoch": 3.9833796916314688, + "grad_norm": 0.16511370241641998, + "learning_rate": 7.52487330583362e-05, + "loss": 0.0182, + "step": 87000 + }, + { + "epoch": 3.9879582889781715, + "grad_norm": 0.29885396361351013, + "learning_rate": 7.517972403852905e-05, + "loss": 0.0193, + "step": 87100 + }, + { + "epoch": 3.9925368863248742, + "grad_norm": 0.4066375494003296, + "learning_rate": 7.511065069660458e-05, + "loss": 0.0191, + "step": 87200 + }, + { + "epoch": 3.9971154836715774, + "grad_norm": 0.44243311882019043, + "learning_rate": 7.504151320901199e-05, + "loss": 0.0203, + "step": 87300 + }, + { + "epoch": 4.0, + "eval_loss": 0.15203019976615906, + "eval_runtime": 258.1696, + "eval_samples_per_second": 21.304, + "eval_steps_per_second": 21.304, + "step": 87363 + }, + { + "epoch": 4.00169408101828, + "grad_norm": 0.2750494182109833, + "learning_rate": 7.497231175236442e-05, + "loss": 0.0174, + "step": 87400 + }, + { + "epoch": 4.006272678364983, + "grad_norm": 0.4887785315513611, + "learning_rate": 7.490304650343841e-05, + "loss": 0.0131, + "step": 87500 + }, + { + "epoch": 4.010851275711686, + "grad_norm": 0.21974627673625946, + "learning_rate": 7.483371763917345e-05, + "loss": 0.0141, + "step": 87600 + }, + { + "epoch": 4.015429873058388, + "grad_norm": 0.34770917892456055, + "learning_rate": 7.476432533667151e-05, + "loss": 0.0139, + "step": 87700 + }, + { + "epoch": 4.020008470405092, + "grad_norm": 0.2878529727458954, + "learning_rate": 7.469486977319665e-05, + "loss": 0.0118, + "step": 87800 + }, + { + "epoch": 4.024587067751794, + "grad_norm": 0.6604347229003906, + "learning_rate": 7.462535112617452e-05, + "loss": 0.0128, + "step": 87900 + }, + { + "epoch": 4.029165665098497, + "grad_norm": 0.4288138747215271, + "learning_rate": 7.455576957319194e-05, + "loss": 0.0145, + "step": 88000 + }, + { + "epoch": 4.0337442624452, + "grad_norm": 0.19010120630264282, + "learning_rate": 7.448612529199637e-05, + "loss": 0.0114, + "step": 88100 + }, + { + "epoch": 4.0383228597919025, + "grad_norm": 0.3835040032863617, + "learning_rate": 7.441641846049556e-05, + "loss": 0.0152, + "step": 88200 + }, + { + "epoch": 4.042901457138606, + "grad_norm": 2.3910844326019287, + "learning_rate": 7.434664925675702e-05, + "loss": 0.0153, + "step": 88300 + }, + { + "epoch": 4.047480054485308, + "grad_norm": 0.40593621134757996, + "learning_rate": 7.427681785900761e-05, + "loss": 0.0143, + "step": 88400 + }, + { + "epoch": 4.052058651832011, + "grad_norm": 0.08815860003232956, + "learning_rate": 7.420692444563305e-05, + "loss": 0.014, + "step": 88500 + }, + { + "epoch": 4.056637249178714, + "grad_norm": 0.33992356061935425, + "learning_rate": 7.413696919517749e-05, + "loss": 0.0135, + "step": 88600 + }, + { + "epoch": 4.061215846525417, + "grad_norm": 0.32726776599884033, + "learning_rate": 7.406695228634305e-05, + "loss": 0.0131, + "step": 88700 + }, + { + "epoch": 4.06579444387212, + "grad_norm": 0.3524836301803589, + "learning_rate": 7.399687389798933e-05, + "loss": 0.0136, + "step": 88800 + }, + { + "epoch": 4.070373041218823, + "grad_norm": 0.18603968620300293, + "learning_rate": 7.3926734209133e-05, + "loss": 0.0123, + "step": 88900 + }, + { + "epoch": 4.074951638565525, + "grad_norm": 0.4780280888080597, + "learning_rate": 7.385653339894733e-05, + "loss": 0.0142, + "step": 89000 + }, + { + "epoch": 4.0795302359122285, + "grad_norm": 0.22851374745368958, + "learning_rate": 7.378627164676173e-05, + "loss": 0.013, + "step": 89100 + }, + { + "epoch": 4.084108833258931, + "grad_norm": 0.4251825511455536, + "learning_rate": 7.371594913206124e-05, + "loss": 0.0153, + "step": 89200 + }, + { + "epoch": 4.088687430605634, + "grad_norm": 0.3959885239601135, + "learning_rate": 7.364556603448619e-05, + "loss": 0.0166, + "step": 89300 + }, + { + "epoch": 4.093266027952337, + "grad_norm": 0.8459362387657166, + "learning_rate": 7.357512253383162e-05, + "loss": 0.0152, + "step": 89400 + }, + { + "epoch": 4.0978446252990395, + "grad_norm": 0.5725641250610352, + "learning_rate": 7.35046188100469e-05, + "loss": 0.0135, + "step": 89500 + }, + { + "epoch": 4.102423222645743, + "grad_norm": 0.2906801402568817, + "learning_rate": 7.343405504323519e-05, + "loss": 0.013, + "step": 89600 + }, + { + "epoch": 4.107001819992445, + "grad_norm": 0.10050017386674881, + "learning_rate": 7.33634314136531e-05, + "loss": 0.0114, + "step": 89700 + }, + { + "epoch": 4.111580417339148, + "grad_norm": 0.6948938965797424, + "learning_rate": 7.329274810171014e-05, + "loss": 0.0138, + "step": 89800 + }, + { + "epoch": 4.116159014685851, + "grad_norm": 0.4069768190383911, + "learning_rate": 7.322200528796822e-05, + "loss": 0.0124, + "step": 89900 + }, + { + "epoch": 4.120737612032554, + "grad_norm": 0.09699010848999023, + "learning_rate": 7.315120315314134e-05, + "loss": 0.0128, + "step": 90000 + }, + { + "epoch": 4.125316209379257, + "grad_norm": 0.3347591161727905, + "learning_rate": 7.308034187809498e-05, + "loss": 0.0166, + "step": 90100 + }, + { + "epoch": 4.129894806725959, + "grad_norm": 0.22168204188346863, + "learning_rate": 7.300942164384571e-05, + "loss": 0.0151, + "step": 90200 + }, + { + "epoch": 4.134473404072662, + "grad_norm": 0.5564683675765991, + "learning_rate": 7.293844263156072e-05, + "loss": 0.0126, + "step": 90300 + }, + { + "epoch": 4.1390520014193655, + "grad_norm": 0.32226261496543884, + "learning_rate": 7.28674050225573e-05, + "loss": 0.0131, + "step": 90400 + }, + { + "epoch": 4.143630598766068, + "grad_norm": 0.36912479996681213, + "learning_rate": 7.279630899830252e-05, + "loss": 0.0143, + "step": 90500 + }, + { + "epoch": 4.148209196112771, + "grad_norm": 0.2860753834247589, + "learning_rate": 7.272515474041259e-05, + "loss": 0.0152, + "step": 90600 + }, + { + "epoch": 4.152787793459474, + "grad_norm": 0.3625887930393219, + "learning_rate": 7.265394243065253e-05, + "loss": 0.0143, + "step": 90700 + }, + { + "epoch": 4.157366390806176, + "grad_norm": 0.24506491422653198, + "learning_rate": 7.258267225093563e-05, + "loss": 0.015, + "step": 90800 + }, + { + "epoch": 4.16194498815288, + "grad_norm": 0.03290629759430885, + "learning_rate": 7.251134438332299e-05, + "loss": 0.0126, + "step": 90900 + }, + { + "epoch": 4.166523585499582, + "grad_norm": 0.4261631667613983, + "learning_rate": 7.243995901002312e-05, + "loss": 0.0148, + "step": 91000 + }, + { + "epoch": 4.171102182846285, + "grad_norm": 0.14463308453559875, + "learning_rate": 7.23685163133914e-05, + "loss": 0.0113, + "step": 91100 + }, + { + "epoch": 4.175680780192988, + "grad_norm": 0.53131502866745, + "learning_rate": 7.229701647592966e-05, + "loss": 0.0136, + "step": 91200 + }, + { + "epoch": 4.180259377539691, + "grad_norm": 0.30526795983314514, + "learning_rate": 7.222545968028569e-05, + "loss": 0.0142, + "step": 91300 + }, + { + "epoch": 4.184837974886394, + "grad_norm": 0.07798325270414352, + "learning_rate": 7.215384610925278e-05, + "loss": 0.0134, + "step": 91400 + }, + { + "epoch": 4.189416572233096, + "grad_norm": 0.164367213845253, + "learning_rate": 7.208217594576923e-05, + "loss": 0.0127, + "step": 91500 + }, + { + "epoch": 4.193995169579799, + "grad_norm": 0.0945630893111229, + "learning_rate": 7.201044937291797e-05, + "loss": 0.0118, + "step": 91600 + }, + { + "epoch": 4.198573766926502, + "grad_norm": 0.38682791590690613, + "learning_rate": 7.193866657392597e-05, + "loss": 0.0141, + "step": 91700 + }, + { + "epoch": 4.203152364273205, + "grad_norm": 0.49326708912849426, + "learning_rate": 7.186682773216384e-05, + "loss": 0.0125, + "step": 91800 + }, + { + "epoch": 4.207730961619908, + "grad_norm": 0.2276126593351364, + "learning_rate": 7.179493303114537e-05, + "loss": 0.014, + "step": 91900 + }, + { + "epoch": 4.21230955896661, + "grad_norm": 0.5109021067619324, + "learning_rate": 7.172298265452706e-05, + "loss": 0.0138, + "step": 92000 + }, + { + "epoch": 4.216888156313313, + "grad_norm": 0.23471687734127045, + "learning_rate": 7.165097678610759e-05, + "loss": 0.014, + "step": 92100 + }, + { + "epoch": 4.221466753660017, + "grad_norm": 0.4894104301929474, + "learning_rate": 7.15789156098274e-05, + "loss": 0.0155, + "step": 92200 + }, + { + "epoch": 4.226045351006719, + "grad_norm": 0.1319025456905365, + "learning_rate": 7.150679930976825e-05, + "loss": 0.0135, + "step": 92300 + }, + { + "epoch": 4.230623948353422, + "grad_norm": 0.32496750354766846, + "learning_rate": 7.143462807015271e-05, + "loss": 0.0136, + "step": 92400 + }, + { + "epoch": 4.235202545700124, + "grad_norm": 0.380876749753952, + "learning_rate": 7.136240207534365e-05, + "loss": 0.0148, + "step": 92500 + }, + { + "epoch": 4.2397811430468275, + "grad_norm": 0.18530067801475525, + "learning_rate": 7.129012150984387e-05, + "loss": 0.0143, + "step": 92600 + }, + { + "epoch": 4.244359740393531, + "grad_norm": 0.9411688446998596, + "learning_rate": 7.121778655829554e-05, + "loss": 0.0115, + "step": 92700 + }, + { + "epoch": 4.248938337740233, + "grad_norm": 0.22460629045963287, + "learning_rate": 7.114539740547974e-05, + "loss": 0.0159, + "step": 92800 + }, + { + "epoch": 4.253516935086936, + "grad_norm": 0.19735155999660492, + "learning_rate": 7.107295423631606e-05, + "loss": 0.0133, + "step": 92900 + }, + { + "epoch": 4.258095532433639, + "grad_norm": 0.2656545341014862, + "learning_rate": 7.100045723586204e-05, + "loss": 0.0125, + "step": 93000 + }, + { + "epoch": 4.262674129780342, + "grad_norm": 1.059777021408081, + "learning_rate": 7.092790658931273e-05, + "loss": 0.0148, + "step": 93100 + }, + { + "epoch": 4.267252727127045, + "grad_norm": 0.3590608835220337, + "learning_rate": 7.085530248200027e-05, + "loss": 0.0139, + "step": 93200 + }, + { + "epoch": 4.271831324473747, + "grad_norm": 0.133284792304039, + "learning_rate": 7.07826450993933e-05, + "loss": 0.0153, + "step": 93300 + }, + { + "epoch": 4.27640992182045, + "grad_norm": 0.3305582106113434, + "learning_rate": 7.070993462709656e-05, + "loss": 0.0129, + "step": 93400 + }, + { + "epoch": 4.2809885191671535, + "grad_norm": 0.4209526777267456, + "learning_rate": 7.06371712508505e-05, + "loss": 0.0125, + "step": 93500 + }, + { + "epoch": 4.285567116513856, + "grad_norm": 0.10924796760082245, + "learning_rate": 7.056435515653059e-05, + "loss": 0.0162, + "step": 93600 + }, + { + "epoch": 4.290145713860559, + "grad_norm": 0.4727434515953064, + "learning_rate": 7.049148653014702e-05, + "loss": 0.0126, + "step": 93700 + }, + { + "epoch": 4.294724311207261, + "grad_norm": 0.5440820455551147, + "learning_rate": 7.041856555784421e-05, + "loss": 0.0131, + "step": 93800 + }, + { + "epoch": 4.2993029085539645, + "grad_norm": 0.07101954519748688, + "learning_rate": 7.034559242590027e-05, + "loss": 0.0163, + "step": 93900 + }, + { + "epoch": 4.303881505900668, + "grad_norm": 1.4522393941879272, + "learning_rate": 7.027256732072651e-05, + "loss": 0.014, + "step": 94000 + }, + { + "epoch": 4.30846010324737, + "grad_norm": 0.1080670952796936, + "learning_rate": 7.019949042886708e-05, + "loss": 0.013, + "step": 94100 + }, + { + "epoch": 4.313038700594073, + "grad_norm": 0.4725320339202881, + "learning_rate": 7.012636193699837e-05, + "loss": 0.0133, + "step": 94200 + }, + { + "epoch": 4.317617297940776, + "grad_norm": 0.7752532362937927, + "learning_rate": 7.005318203192864e-05, + "loss": 0.0136, + "step": 94300 + }, + { + "epoch": 4.322195895287479, + "grad_norm": 0.39167362451553345, + "learning_rate": 6.997995090059739e-05, + "loss": 0.0132, + "step": 94400 + }, + { + "epoch": 4.326774492634182, + "grad_norm": 0.16077743470668793, + "learning_rate": 6.990666873007505e-05, + "loss": 0.0126, + "step": 94500 + }, + { + "epoch": 4.331353089980884, + "grad_norm": 0.20132170617580414, + "learning_rate": 6.983333570756245e-05, + "loss": 0.0125, + "step": 94600 + }, + { + "epoch": 4.335931687327587, + "grad_norm": 0.4036431610584259, + "learning_rate": 6.975995202039025e-05, + "loss": 0.0149, + "step": 94700 + }, + { + "epoch": 4.34051028467429, + "grad_norm": 0.8535305261611938, + "learning_rate": 6.968651785601859e-05, + "loss": 0.0136, + "step": 94800 + }, + { + "epoch": 4.345088882020993, + "grad_norm": 0.3927995562553406, + "learning_rate": 6.961303340203653e-05, + "loss": 0.0146, + "step": 94900 + }, + { + "epoch": 4.349667479367696, + "grad_norm": 0.371528297662735, + "learning_rate": 6.953949884616162e-05, + "loss": 0.0124, + "step": 95000 + }, + { + "epoch": 4.354246076714398, + "grad_norm": 0.06207489222288132, + "learning_rate": 6.946591437623934e-05, + "loss": 0.0129, + "step": 95100 + }, + { + "epoch": 4.358824674061101, + "grad_norm": 0.05522959679365158, + "learning_rate": 6.939228018024275e-05, + "loss": 0.0133, + "step": 95200 + }, + { + "epoch": 4.363403271407805, + "grad_norm": 0.5625087022781372, + "learning_rate": 6.931859644627189e-05, + "loss": 0.0141, + "step": 95300 + }, + { + "epoch": 4.367981868754507, + "grad_norm": 0.13779932260513306, + "learning_rate": 6.924486336255337e-05, + "loss": 0.0135, + "step": 95400 + }, + { + "epoch": 4.37256046610121, + "grad_norm": 1.0762056112289429, + "learning_rate": 6.917108111743984e-05, + "loss": 0.0142, + "step": 95500 + }, + { + "epoch": 4.377139063447912, + "grad_norm": 0.22283124923706055, + "learning_rate": 6.909724989940953e-05, + "loss": 0.0133, + "step": 95600 + }, + { + "epoch": 4.3817176607946156, + "grad_norm": 0.5186660289764404, + "learning_rate": 6.902336989706581e-05, + "loss": 0.0136, + "step": 95700 + }, + { + "epoch": 4.386296258141319, + "grad_norm": 0.47632691264152527, + "learning_rate": 6.894944129913667e-05, + "loss": 0.0147, + "step": 95800 + }, + { + "epoch": 4.390874855488021, + "grad_norm": 1.1676534414291382, + "learning_rate": 6.887546429447419e-05, + "loss": 0.0128, + "step": 95900 + }, + { + "epoch": 4.395453452834724, + "grad_norm": 1.0476038455963135, + "learning_rate": 6.880143907205411e-05, + "loss": 0.0132, + "step": 96000 + }, + { + "epoch": 4.4000320501814265, + "grad_norm": 0.656058669090271, + "learning_rate": 6.872736582097541e-05, + "loss": 0.0152, + "step": 96100 + }, + { + "epoch": 4.40461064752813, + "grad_norm": 0.3963877856731415, + "learning_rate": 6.86532447304597e-05, + "loss": 0.0122, + "step": 96200 + }, + { + "epoch": 4.409189244874833, + "grad_norm": 0.23698298633098602, + "learning_rate": 6.857907598985081e-05, + "loss": 0.0135, + "step": 96300 + }, + { + "epoch": 4.413767842221535, + "grad_norm": 0.20948071777820587, + "learning_rate": 6.850485978861431e-05, + "loss": 0.0136, + "step": 96400 + }, + { + "epoch": 4.418346439568238, + "grad_norm": 0.3551422357559204, + "learning_rate": 6.843059631633699e-05, + "loss": 0.0143, + "step": 96500 + }, + { + "epoch": 4.4229250369149415, + "grad_norm": 0.21045321226119995, + "learning_rate": 6.835628576272638e-05, + "loss": 0.0149, + "step": 96600 + }, + { + "epoch": 4.427503634261644, + "grad_norm": 1.5752928256988525, + "learning_rate": 6.828192831761033e-05, + "loss": 0.0151, + "step": 96700 + }, + { + "epoch": 4.432082231608347, + "grad_norm": 0.4416331350803375, + "learning_rate": 6.820752417093644e-05, + "loss": 0.0133, + "step": 96800 + }, + { + "epoch": 4.436660828955049, + "grad_norm": 0.44132721424102783, + "learning_rate": 6.81330735127716e-05, + "loss": 0.0101, + "step": 96900 + }, + { + "epoch": 4.4412394263017525, + "grad_norm": 0.2506002187728882, + "learning_rate": 6.805857653330156e-05, + "loss": 0.0128, + "step": 97000 + }, + { + "epoch": 4.445818023648456, + "grad_norm": 0.11981073021888733, + "learning_rate": 6.798403342283034e-05, + "loss": 0.0127, + "step": 97100 + }, + { + "epoch": 4.450396620995158, + "grad_norm": 0.9063414335250854, + "learning_rate": 6.790944437177984e-05, + "loss": 0.0136, + "step": 97200 + }, + { + "epoch": 4.454975218341861, + "grad_norm": 1.0382390022277832, + "learning_rate": 6.783480957068934e-05, + "loss": 0.0116, + "step": 97300 + }, + { + "epoch": 4.4595538156885635, + "grad_norm": 0.22426804900169373, + "learning_rate": 6.776012921021492e-05, + "loss": 0.0149, + "step": 97400 + }, + { + "epoch": 4.464132413035267, + "grad_norm": 0.4911547899246216, + "learning_rate": 6.768540348112907e-05, + "loss": 0.0123, + "step": 97500 + }, + { + "epoch": 4.46871101038197, + "grad_norm": 0.6653274893760681, + "learning_rate": 6.761063257432023e-05, + "loss": 0.0121, + "step": 97600 + }, + { + "epoch": 4.473289607728672, + "grad_norm": 0.37786972522735596, + "learning_rate": 6.753581668079219e-05, + "loss": 0.0133, + "step": 97700 + }, + { + "epoch": 4.477868205075375, + "grad_norm": 0.15616688132286072, + "learning_rate": 6.746095599166362e-05, + "loss": 0.013, + "step": 97800 + }, + { + "epoch": 4.482446802422078, + "grad_norm": 0.11935741454362869, + "learning_rate": 6.738605069816775e-05, + "loss": 0.0148, + "step": 97900 + }, + { + "epoch": 4.487025399768781, + "grad_norm": 0.18721537292003632, + "learning_rate": 6.731110099165164e-05, + "loss": 0.0139, + "step": 98000 + }, + { + "epoch": 4.491603997115484, + "grad_norm": 0.3637322783470154, + "learning_rate": 6.723610706357582e-05, + "loss": 0.0148, + "step": 98100 + }, + { + "epoch": 4.496182594462186, + "grad_norm": 0.1633034497499466, + "learning_rate": 6.716106910551385e-05, + "loss": 0.0127, + "step": 98200 + }, + { + "epoch": 4.5007611918088894, + "grad_norm": 0.19283847510814667, + "learning_rate": 6.708598730915168e-05, + "loss": 0.0132, + "step": 98300 + }, + { + "epoch": 4.505339789155592, + "grad_norm": 0.17327933013439178, + "learning_rate": 6.701086186628732e-05, + "loss": 0.0156, + "step": 98400 + }, + { + "epoch": 4.509918386502295, + "grad_norm": 0.06521926075220108, + "learning_rate": 6.693569296883022e-05, + "loss": 0.0137, + "step": 98500 + }, + { + "epoch": 4.514496983848998, + "grad_norm": 0.4145078659057617, + "learning_rate": 6.686048080880086e-05, + "loss": 0.0144, + "step": 98600 + }, + { + "epoch": 4.5190755811957, + "grad_norm": 0.5390291810035706, + "learning_rate": 6.678522557833024e-05, + "loss": 0.0132, + "step": 98700 + }, + { + "epoch": 4.523654178542404, + "grad_norm": 0.2249838411808014, + "learning_rate": 6.670992746965938e-05, + "loss": 0.0122, + "step": 98800 + }, + { + "epoch": 4.528232775889107, + "grad_norm": 0.09684702008962631, + "learning_rate": 6.663458667513882e-05, + "loss": 0.0122, + "step": 98900 + }, + { + "epoch": 4.532811373235809, + "grad_norm": 0.5852058529853821, + "learning_rate": 6.655920338722816e-05, + "loss": 0.014, + "step": 99000 + }, + { + "epoch": 4.537389970582512, + "grad_norm": 0.4523356258869171, + "learning_rate": 6.648377779849554e-05, + "loss": 0.0129, + "step": 99100 + }, + { + "epoch": 4.5419685679292146, + "grad_norm": 0.4520733058452606, + "learning_rate": 6.640831010161716e-05, + "loss": 0.0123, + "step": 99200 + }, + { + "epoch": 4.546547165275918, + "grad_norm": 0.42760178446769714, + "learning_rate": 6.633280048937678e-05, + "loss": 0.0171, + "step": 99300 + }, + { + "epoch": 4.551125762622621, + "grad_norm": 0.27447327971458435, + "learning_rate": 6.625724915466526e-05, + "loss": 0.0136, + "step": 99400 + }, + { + "epoch": 4.555704359969323, + "grad_norm": 0.30612578988075256, + "learning_rate": 6.618165629048e-05, + "loss": 0.0133, + "step": 99500 + }, + { + "epoch": 4.560282957316026, + "grad_norm": 0.48825210332870483, + "learning_rate": 6.610602208992454e-05, + "loss": 0.0123, + "step": 99600 + }, + { + "epoch": 4.564861554662729, + "grad_norm": 0.43417781591415405, + "learning_rate": 6.603034674620794e-05, + "loss": 0.0149, + "step": 99700 + }, + { + "epoch": 4.569440152009432, + "grad_norm": 0.6489459276199341, + "learning_rate": 6.595463045264445e-05, + "loss": 0.0118, + "step": 99800 + }, + { + "epoch": 4.574018749356135, + "grad_norm": 0.29751142859458923, + "learning_rate": 6.587887340265286e-05, + "loss": 0.0122, + "step": 99900 + }, + { + "epoch": 4.578597346702837, + "grad_norm": 0.1352328062057495, + "learning_rate": 6.580307578975608e-05, + "loss": 0.0139, + "step": 100000 + }, + { + "epoch": 4.5831759440495405, + "grad_norm": 0.2985703945159912, + "learning_rate": 6.572723780758069e-05, + "loss": 0.0121, + "step": 100100 + }, + { + "epoch": 4.587754541396244, + "grad_norm": 0.1775195151567459, + "learning_rate": 6.565135964985634e-05, + "loss": 0.0139, + "step": 100200 + }, + { + "epoch": 4.592333138742946, + "grad_norm": 0.41841718554496765, + "learning_rate": 6.557544151041531e-05, + "loss": 0.0146, + "step": 100300 + }, + { + "epoch": 4.596911736089649, + "grad_norm": 0.07853005081415176, + "learning_rate": 6.549948358319206e-05, + "loss": 0.0138, + "step": 100400 + }, + { + "epoch": 4.6014903334363515, + "grad_norm": 0.39813074469566345, + "learning_rate": 6.542348606222266e-05, + "loss": 0.0127, + "step": 100500 + }, + { + "epoch": 4.606068930783055, + "grad_norm": 0.3754967749118805, + "learning_rate": 6.53474491416443e-05, + "loss": 0.0156, + "step": 100600 + }, + { + "epoch": 4.610647528129757, + "grad_norm": 0.6578196287155151, + "learning_rate": 6.527137301569486e-05, + "loss": 0.0125, + "step": 100700 + }, + { + "epoch": 4.61522612547646, + "grad_norm": 0.7814628481864929, + "learning_rate": 6.519525787871235e-05, + "loss": 0.0142, + "step": 100800 + }, + { + "epoch": 4.619804722823163, + "grad_norm": 0.23694345355033875, + "learning_rate": 6.511910392513443e-05, + "loss": 0.0115, + "step": 100900 + }, + { + "epoch": 4.624383320169866, + "grad_norm": 0.18302284181118011, + "learning_rate": 6.504291134949792e-05, + "loss": 0.0138, + "step": 101000 + }, + { + "epoch": 4.628961917516569, + "grad_norm": 0.5445951223373413, + "learning_rate": 6.496668034643831e-05, + "loss": 0.0149, + "step": 101100 + }, + { + "epoch": 4.633540514863272, + "grad_norm": 0.6721272468566895, + "learning_rate": 6.489041111068926e-05, + "loss": 0.014, + "step": 101200 + }, + { + "epoch": 4.638119112209974, + "grad_norm": 0.40816518664360046, + "learning_rate": 6.481410383708206e-05, + "loss": 0.012, + "step": 101300 + }, + { + "epoch": 4.6426977095566775, + "grad_norm": 0.28873249888420105, + "learning_rate": 6.473775872054521e-05, + "loss": 0.0148, + "step": 101400 + }, + { + "epoch": 4.64727630690338, + "grad_norm": 0.5939431190490723, + "learning_rate": 6.466137595610388e-05, + "loss": 0.0124, + "step": 101500 + }, + { + "epoch": 4.651854904250083, + "grad_norm": 0.08564829081296921, + "learning_rate": 6.458495573887933e-05, + "loss": 0.0128, + "step": 101600 + }, + { + "epoch": 4.656433501596786, + "grad_norm": 0.8717368245124817, + "learning_rate": 6.450849826408865e-05, + "loss": 0.0137, + "step": 101700 + }, + { + "epoch": 4.6610120989434884, + "grad_norm": 0.02314877323806286, + "learning_rate": 6.443200372704395e-05, + "loss": 0.0151, + "step": 101800 + }, + { + "epoch": 4.665590696290192, + "grad_norm": 0.2785604000091553, + "learning_rate": 6.43554723231521e-05, + "loss": 0.0111, + "step": 101900 + }, + { + "epoch": 4.670169293636894, + "grad_norm": 0.14578752219676971, + "learning_rate": 6.427890424791415e-05, + "loss": 0.0131, + "step": 102000 + }, + { + "epoch": 4.674747890983597, + "grad_norm": 0.14544513821601868, + "learning_rate": 6.420229969692477e-05, + "loss": 0.0136, + "step": 102100 + }, + { + "epoch": 4.6793264883303, + "grad_norm": 0.36046111583709717, + "learning_rate": 6.412565886587185e-05, + "loss": 0.0135, + "step": 102200 + }, + { + "epoch": 4.683905085677003, + "grad_norm": 0.208379328250885, + "learning_rate": 6.404898195053597e-05, + "loss": 0.0132, + "step": 102300 + }, + { + "epoch": 4.688483683023706, + "grad_norm": 0.04505769535899162, + "learning_rate": 6.397226914678986e-05, + "loss": 0.014, + "step": 102400 + }, + { + "epoch": 4.693062280370409, + "grad_norm": 0.12393535673618317, + "learning_rate": 6.389552065059795e-05, + "loss": 0.0142, + "step": 102500 + }, + { + "epoch": 4.697640877717111, + "grad_norm": 0.14113786816596985, + "learning_rate": 6.381873665801581e-05, + "loss": 0.0146, + "step": 102600 + }, + { + "epoch": 4.702219475063814, + "grad_norm": 0.14992213249206543, + "learning_rate": 6.374191736518974e-05, + "loss": 0.01, + "step": 102700 + }, + { + "epoch": 4.706798072410517, + "grad_norm": 0.24738559126853943, + "learning_rate": 6.366506296835616e-05, + "loss": 0.0114, + "step": 102800 + }, + { + "epoch": 4.71137666975722, + "grad_norm": 0.6193427443504333, + "learning_rate": 6.358817366384122e-05, + "loss": 0.0139, + "step": 102900 + }, + { + "epoch": 4.715955267103923, + "grad_norm": 0.24367505311965942, + "learning_rate": 6.35112496480602e-05, + "loss": 0.0113, + "step": 103000 + }, + { + "epoch": 4.720533864450625, + "grad_norm": 0.11543486267328262, + "learning_rate": 6.343429111751704e-05, + "loss": 0.015, + "step": 103100 + }, + { + "epoch": 4.725112461797329, + "grad_norm": 0.23988036811351776, + "learning_rate": 6.33572982688039e-05, + "loss": 0.0121, + "step": 103200 + }, + { + "epoch": 4.729691059144031, + "grad_norm": 0.26978039741516113, + "learning_rate": 6.328027129860057e-05, + "loss": 0.0117, + "step": 103300 + }, + { + "epoch": 4.734269656490734, + "grad_norm": 0.047924984246492386, + "learning_rate": 6.3203210403674e-05, + "loss": 0.0141, + "step": 103400 + }, + { + "epoch": 4.738848253837437, + "grad_norm": 0.23787090182304382, + "learning_rate": 6.312611578087784e-05, + "loss": 0.0133, + "step": 103500 + }, + { + "epoch": 4.7434268511841395, + "grad_norm": 0.9701817035675049, + "learning_rate": 6.304898762715186e-05, + "loss": 0.0121, + "step": 103600 + }, + { + "epoch": 4.748005448530843, + "grad_norm": 0.5129296183586121, + "learning_rate": 6.29718261395215e-05, + "loss": 0.0161, + "step": 103700 + }, + { + "epoch": 4.752584045877546, + "grad_norm": 0.2481413185596466, + "learning_rate": 6.289463151509733e-05, + "loss": 0.0142, + "step": 103800 + }, + { + "epoch": 4.757162643224248, + "grad_norm": 0.4262784719467163, + "learning_rate": 6.281740395107462e-05, + "loss": 0.0152, + "step": 103900 + }, + { + "epoch": 4.761741240570951, + "grad_norm": 0.42060771584510803, + "learning_rate": 6.274014364473274e-05, + "loss": 0.0132, + "step": 104000 + }, + { + "epoch": 4.766319837917654, + "grad_norm": 0.2619081437587738, + "learning_rate": 6.26628507934347e-05, + "loss": 0.0124, + "step": 104100 + }, + { + "epoch": 4.770898435264357, + "grad_norm": 0.47017577290534973, + "learning_rate": 6.258552559462668e-05, + "loss": 0.0132, + "step": 104200 + }, + { + "epoch": 4.775477032611059, + "grad_norm": 0.5897017121315002, + "learning_rate": 6.250816824583747e-05, + "loss": 0.0134, + "step": 104300 + }, + { + "epoch": 4.780055629957762, + "grad_norm": 0.41096287965774536, + "learning_rate": 6.243077894467799e-05, + "loss": 0.0139, + "step": 104400 + }, + { + "epoch": 4.7846342273044655, + "grad_norm": 0.9277390241622925, + "learning_rate": 6.235335788884079e-05, + "loss": 0.0114, + "step": 104500 + }, + { + "epoch": 4.789212824651168, + "grad_norm": 0.4034029245376587, + "learning_rate": 6.227590527609952e-05, + "loss": 0.0117, + "step": 104600 + }, + { + "epoch": 4.793791421997871, + "grad_norm": 0.08527888357639313, + "learning_rate": 6.219842130430846e-05, + "loss": 0.0139, + "step": 104700 + }, + { + "epoch": 4.798370019344574, + "grad_norm": 0.43536534905433655, + "learning_rate": 6.2120906171402e-05, + "loss": 0.0136, + "step": 104800 + }, + { + "epoch": 4.8029486166912765, + "grad_norm": 0.14146916568279266, + "learning_rate": 6.204336007539412e-05, + "loss": 0.014, + "step": 104900 + }, + { + "epoch": 4.80752721403798, + "grad_norm": 0.2524791657924652, + "learning_rate": 6.19657832143779e-05, + "loss": 0.0149, + "step": 105000 + }, + { + "epoch": 4.812105811384682, + "grad_norm": 0.14325548708438873, + "learning_rate": 6.1888175786525e-05, + "loss": 0.0135, + "step": 105100 + }, + { + "epoch": 4.816684408731385, + "grad_norm": 0.08125073462724686, + "learning_rate": 6.181053799008519e-05, + "loss": 0.012, + "step": 105200 + }, + { + "epoch": 4.821263006078088, + "grad_norm": 0.765481173992157, + "learning_rate": 6.173287002338577e-05, + "loss": 0.0123, + "step": 105300 + }, + { + "epoch": 4.825841603424791, + "grad_norm": 0.6038789749145508, + "learning_rate": 6.165517208483117e-05, + "loss": 0.0135, + "step": 105400 + }, + { + "epoch": 4.830420200771494, + "grad_norm": 0.3267226219177246, + "learning_rate": 6.157744437290236e-05, + "loss": 0.012, + "step": 105500 + }, + { + "epoch": 4.834998798118196, + "grad_norm": 0.08704890310764313, + "learning_rate": 6.149968708615634e-05, + "loss": 0.0136, + "step": 105600 + }, + { + "epoch": 4.839577395464899, + "grad_norm": 0.22156277298927307, + "learning_rate": 6.142190042322569e-05, + "loss": 0.013, + "step": 105700 + }, + { + "epoch": 4.8441559928116025, + "grad_norm": 0.12729842960834503, + "learning_rate": 6.134408458281805e-05, + "loss": 0.014, + "step": 105800 + }, + { + "epoch": 4.848734590158305, + "grad_norm": 0.21868254244327545, + "learning_rate": 6.12662397637155e-05, + "loss": 0.0154, + "step": 105900 + }, + { + "epoch": 4.853313187505008, + "grad_norm": 0.3205544352531433, + "learning_rate": 6.118836616477427e-05, + "loss": 0.0132, + "step": 106000 + }, + { + "epoch": 4.857891784851711, + "grad_norm": 0.26208868622779846, + "learning_rate": 6.111046398492404e-05, + "loss": 0.0139, + "step": 106100 + }, + { + "epoch": 4.862470382198413, + "grad_norm": 0.6751037836074829, + "learning_rate": 6.103253342316753e-05, + "loss": 0.0129, + "step": 106200 + }, + { + "epoch": 4.867048979545117, + "grad_norm": 0.2062651365995407, + "learning_rate": 6.095457467857989e-05, + "loss": 0.0145, + "step": 106300 + }, + { + "epoch": 4.871627576891819, + "grad_norm": 0.18155290186405182, + "learning_rate": 6.087658795030837e-05, + "loss": 0.0127, + "step": 106400 + }, + { + "epoch": 4.876206174238522, + "grad_norm": 0.17720471322536469, + "learning_rate": 6.079857343757165e-05, + "loss": 0.0134, + "step": 106500 + }, + { + "epoch": 4.880784771585224, + "grad_norm": 0.09973806142807007, + "learning_rate": 6.072053133965938e-05, + "loss": 0.0116, + "step": 106600 + }, + { + "epoch": 4.885363368931928, + "grad_norm": 0.25288718938827515, + "learning_rate": 6.064246185593167e-05, + "loss": 0.0127, + "step": 106700 + }, + { + "epoch": 4.889941966278631, + "grad_norm": 0.19430892169475555, + "learning_rate": 6.056436518581864e-05, + "loss": 0.0147, + "step": 106800 + }, + { + "epoch": 4.894520563625333, + "grad_norm": 0.31932905316352844, + "learning_rate": 6.0486241528819795e-05, + "loss": 0.0127, + "step": 106900 + }, + { + "epoch": 4.899099160972036, + "grad_norm": 0.06558812409639359, + "learning_rate": 6.040809108450363e-05, + "loss": 0.0124, + "step": 107000 + }, + { + "epoch": 4.903677758318739, + "grad_norm": 0.20380474627017975, + "learning_rate": 6.032991405250702e-05, + "loss": 0.0147, + "step": 107100 + }, + { + "epoch": 4.908256355665442, + "grad_norm": 0.08541610836982727, + "learning_rate": 6.025171063253479e-05, + "loss": 0.014, + "step": 107200 + }, + { + "epoch": 4.912834953012145, + "grad_norm": 0.3804337978363037, + "learning_rate": 6.017348102435918e-05, + "loss": 0.0116, + "step": 107300 + }, + { + "epoch": 4.917413550358847, + "grad_norm": 0.3044677674770355, + "learning_rate": 6.00952254278193e-05, + "loss": 0.0141, + "step": 107400 + }, + { + "epoch": 4.92199214770555, + "grad_norm": 0.4350314438343048, + "learning_rate": 6.001694404282068e-05, + "loss": 0.0129, + "step": 107500 + }, + { + "epoch": 4.926570745052254, + "grad_norm": 0.19222760200500488, + "learning_rate": 5.993863706933468e-05, + "loss": 0.0124, + "step": 107600 + }, + { + "epoch": 4.931149342398956, + "grad_norm": 0.36865904927253723, + "learning_rate": 5.986030470739811e-05, + "loss": 0.0113, + "step": 107700 + }, + { + "epoch": 4.935727939745659, + "grad_norm": 0.20282283425331116, + "learning_rate": 5.9781947157112536e-05, + "loss": 0.013, + "step": 107800 + }, + { + "epoch": 4.940306537092361, + "grad_norm": 0.11859617382287979, + "learning_rate": 5.970356461864391e-05, + "loss": 0.0138, + "step": 107900 + }, + { + "epoch": 4.9448851344390645, + "grad_norm": 0.5312494039535522, + "learning_rate": 5.962515729222208e-05, + "loss": 0.0128, + "step": 108000 + }, + { + "epoch": 4.949463731785768, + "grad_norm": 0.40164250135421753, + "learning_rate": 5.95467253781401e-05, + "loss": 0.0117, + "step": 108100 + }, + { + "epoch": 4.95404232913247, + "grad_norm": 0.11808757483959198, + "learning_rate": 5.9468269076753894e-05, + "loss": 0.0121, + "step": 108200 + }, + { + "epoch": 4.958620926479173, + "grad_norm": 0.20174367725849152, + "learning_rate": 5.938978858848171e-05, + "loss": 0.0122, + "step": 108300 + }, + { + "epoch": 4.963199523825876, + "grad_norm": 0.33299440145492554, + "learning_rate": 5.9311284113803524e-05, + "loss": 0.0115, + "step": 108400 + }, + { + "epoch": 4.967778121172579, + "grad_norm": 0.6904717683792114, + "learning_rate": 5.9232755853260635e-05, + "loss": 0.0139, + "step": 108500 + }, + { + "epoch": 4.972356718519282, + "grad_norm": 0.17567585408687592, + "learning_rate": 5.915420400745507e-05, + "loss": 0.0118, + "step": 108600 + }, + { + "epoch": 4.976935315865984, + "grad_norm": 0.16880100965499878, + "learning_rate": 5.907562877704912e-05, + "loss": 0.015, + "step": 108700 + }, + { + "epoch": 4.981513913212687, + "grad_norm": 0.2917187213897705, + "learning_rate": 5.899703036276482e-05, + "loss": 0.0135, + "step": 108800 + }, + { + "epoch": 4.9860925105593905, + "grad_norm": 0.028255263343453407, + "learning_rate": 5.891840896538339e-05, + "loss": 0.0112, + "step": 108900 + }, + { + "epoch": 4.990671107906093, + "grad_norm": 0.2152412086725235, + "learning_rate": 5.883976478574482e-05, + "loss": 0.014, + "step": 109000 + }, + { + "epoch": 4.995249705252796, + "grad_norm": 0.3723663091659546, + "learning_rate": 5.876109802474725e-05, + "loss": 0.0123, + "step": 109100 + }, + { + "epoch": 4.999828302599498, + "grad_norm": 0.6162732243537903, + "learning_rate": 5.868240888334653e-05, + "loss": 0.0161, + "step": 109200 + }, + { + "epoch": 4.9999656605199, + "eval_loss": 0.17184050381183624, + "eval_runtime": 244.2658, + "eval_samples_per_second": 22.516, + "eval_steps_per_second": 22.516, + "step": 109203 + }, + { + "epoch": 5.0044068999462015, + "grad_norm": 0.5017980337142944, + "learning_rate": 5.860369756255566e-05, + "loss": 0.0083, + "step": 109300 + }, + { + "epoch": 5.008985497292905, + "grad_norm": 0.14825376868247986, + "learning_rate": 5.8524964263444324e-05, + "loss": 0.0097, + "step": 109400 + }, + { + "epoch": 5.013564094639607, + "grad_norm": 1.7440462112426758, + "learning_rate": 5.8446209187138324e-05, + "loss": 0.0083, + "step": 109500 + }, + { + "epoch": 5.01814269198631, + "grad_norm": 0.25318461656570435, + "learning_rate": 5.8367432534819124e-05, + "loss": 0.0094, + "step": 109600 + }, + { + "epoch": 5.022721289333012, + "grad_norm": 0.0751919150352478, + "learning_rate": 5.8288634507723274e-05, + "loss": 0.0089, + "step": 109700 + }, + { + "epoch": 5.027299886679716, + "grad_norm": 0.3842028081417084, + "learning_rate": 5.820981530714191e-05, + "loss": 0.0088, + "step": 109800 + }, + { + "epoch": 5.031878484026419, + "grad_norm": 0.11625286936759949, + "learning_rate": 5.813097513442035e-05, + "loss": 0.008, + "step": 109900 + }, + { + "epoch": 5.036457081373121, + "grad_norm": 0.25438615679740906, + "learning_rate": 5.805211419095736e-05, + "loss": 0.009, + "step": 110000 + }, + { + "epoch": 5.041035678719824, + "grad_norm": 0.13749825954437256, + "learning_rate": 5.797323267820484e-05, + "loss": 0.0092, + "step": 110100 + }, + { + "epoch": 5.0456142760665275, + "grad_norm": 0.06733408570289612, + "learning_rate": 5.789433079766723e-05, + "loss": 0.0097, + "step": 110200 + }, + { + "epoch": 5.05019287341323, + "grad_norm": 0.2959531843662262, + "learning_rate": 5.7815408750900993e-05, + "loss": 0.0071, + "step": 110300 + }, + { + "epoch": 5.054771470759933, + "grad_norm": 0.10893545299768448, + "learning_rate": 5.773646673951406e-05, + "loss": 0.0096, + "step": 110400 + }, + { + "epoch": 5.059350068106635, + "grad_norm": 0.9517889618873596, + "learning_rate": 5.765750496516547e-05, + "loss": 0.0108, + "step": 110500 + }, + { + "epoch": 5.063928665453338, + "grad_norm": 0.31945428252220154, + "learning_rate": 5.757852362956463e-05, + "loss": 0.0107, + "step": 110600 + }, + { + "epoch": 5.068507262800042, + "grad_norm": 0.2407699078321457, + "learning_rate": 5.7499522934470994e-05, + "loss": 0.0083, + "step": 110700 + }, + { + "epoch": 5.073085860146744, + "grad_norm": 0.15435832738876343, + "learning_rate": 5.7420503081693446e-05, + "loss": 0.0086, + "step": 110800 + }, + { + "epoch": 5.077664457493447, + "grad_norm": 0.4791698455810547, + "learning_rate": 5.734146427308979e-05, + "loss": 0.0072, + "step": 110900 + }, + { + "epoch": 5.082243054840149, + "grad_norm": 0.14484897255897522, + "learning_rate": 5.7262406710566296e-05, + "loss": 0.0105, + "step": 111000 + }, + { + "epoch": 5.086821652186853, + "grad_norm": 0.5574690103530884, + "learning_rate": 5.71833305960771e-05, + "loss": 0.0092, + "step": 111100 + }, + { + "epoch": 5.091400249533556, + "grad_norm": 0.3678722679615021, + "learning_rate": 5.7104236131623736e-05, + "loss": 0.0099, + "step": 111200 + }, + { + "epoch": 5.095978846880258, + "grad_norm": 0.8227113485336304, + "learning_rate": 5.702512351925464e-05, + "loss": 0.008, + "step": 111300 + }, + { + "epoch": 5.100557444226961, + "grad_norm": 0.13089661300182343, + "learning_rate": 5.6945992961064586e-05, + "loss": 0.0081, + "step": 111400 + }, + { + "epoch": 5.1051360415736635, + "grad_norm": 0.008971684612333775, + "learning_rate": 5.6866844659194185e-05, + "loss": 0.0084, + "step": 111500 + }, + { + "epoch": 5.109714638920367, + "grad_norm": 0.0824974775314331, + "learning_rate": 5.6787678815829404e-05, + "loss": 0.0098, + "step": 111600 + }, + { + "epoch": 5.11429323626707, + "grad_norm": 0.17469094693660736, + "learning_rate": 5.6708495633200964e-05, + "loss": 0.0078, + "step": 111700 + }, + { + "epoch": 5.118871833613772, + "grad_norm": 0.13333024084568024, + "learning_rate": 5.6629295313583974e-05, + "loss": 0.0082, + "step": 111800 + }, + { + "epoch": 5.123450430960475, + "grad_norm": 0.43794387578964233, + "learning_rate": 5.6550078059297205e-05, + "loss": 0.0089, + "step": 111900 + }, + { + "epoch": 5.1280290283071785, + "grad_norm": 0.37814435362815857, + "learning_rate": 5.6470844072702764e-05, + "loss": 0.0105, + "step": 112000 + }, + { + "epoch": 5.132607625653881, + "grad_norm": 0.3779330253601074, + "learning_rate": 5.639159355620551e-05, + "loss": 0.0084, + "step": 112100 + }, + { + "epoch": 5.137186223000584, + "grad_norm": 0.30869078636169434, + "learning_rate": 5.631232671225247e-05, + "loss": 0.0093, + "step": 112200 + }, + { + "epoch": 5.141764820347286, + "grad_norm": 0.3333792984485626, + "learning_rate": 5.623304374333239e-05, + "loss": 0.0108, + "step": 112300 + }, + { + "epoch": 5.1463434176939895, + "grad_norm": 1.2692680358886719, + "learning_rate": 5.6153744851975274e-05, + "loss": 0.0081, + "step": 112400 + }, + { + "epoch": 5.150922015040693, + "grad_norm": 0.017233431339263916, + "learning_rate": 5.607443024075173e-05, + "loss": 0.0075, + "step": 112500 + }, + { + "epoch": 5.155500612387395, + "grad_norm": 0.46397635340690613, + "learning_rate": 5.5995100112272545e-05, + "loss": 0.0108, + "step": 112600 + }, + { + "epoch": 5.160079209734098, + "grad_norm": 0.23527605831623077, + "learning_rate": 5.591575466918816e-05, + "loss": 0.0094, + "step": 112700 + }, + { + "epoch": 5.1646578070808005, + "grad_norm": 0.19655343890190125, + "learning_rate": 5.583639411418811e-05, + "loss": 0.0092, + "step": 112800 + }, + { + "epoch": 5.169236404427504, + "grad_norm": 0.6157360076904297, + "learning_rate": 5.575701865000054e-05, + "loss": 0.0085, + "step": 112900 + }, + { + "epoch": 5.173815001774207, + "grad_norm": 0.4467610716819763, + "learning_rate": 5.56776284793917e-05, + "loss": 0.0092, + "step": 113000 + }, + { + "epoch": 5.178393599120909, + "grad_norm": 0.16839289665222168, + "learning_rate": 5.559822380516539e-05, + "loss": 0.0093, + "step": 113100 + }, + { + "epoch": 5.182972196467612, + "grad_norm": 0.08081818372011185, + "learning_rate": 5.551880483016248e-05, + "loss": 0.0088, + "step": 113200 + }, + { + "epoch": 5.187550793814315, + "grad_norm": 0.7287288308143616, + "learning_rate": 5.543937175726035e-05, + "loss": 0.0084, + "step": 113300 + }, + { + "epoch": 5.192129391161018, + "grad_norm": 0.18267770111560822, + "learning_rate": 5.5359924789372396e-05, + "loss": 0.0083, + "step": 113400 + }, + { + "epoch": 5.196707988507721, + "grad_norm": 0.3210001587867737, + "learning_rate": 5.528046412944752e-05, + "loss": 0.0094, + "step": 113500 + }, + { + "epoch": 5.201286585854423, + "grad_norm": 0.21997089684009552, + "learning_rate": 5.520098998046958e-05, + "loss": 0.0089, + "step": 113600 + }, + { + "epoch": 5.2058651832011265, + "grad_norm": 0.24578975141048431, + "learning_rate": 5.5121502545456925e-05, + "loss": 0.0095, + "step": 113700 + }, + { + "epoch": 5.210443780547829, + "grad_norm": 1.2959401607513428, + "learning_rate": 5.504200202746182e-05, + "loss": 0.0085, + "step": 113800 + }, + { + "epoch": 5.215022377894532, + "grad_norm": 0.12553347647190094, + "learning_rate": 5.496248862956994e-05, + "loss": 0.0089, + "step": 113900 + }, + { + "epoch": 5.219600975241235, + "grad_norm": 0.7202230095863342, + "learning_rate": 5.488296255489991e-05, + "loss": 0.008, + "step": 114000 + }, + { + "epoch": 5.224179572587937, + "grad_norm": 0.7170085310935974, + "learning_rate": 5.480342400660268e-05, + "loss": 0.0104, + "step": 114100 + }, + { + "epoch": 5.228758169934641, + "grad_norm": 0.029888896271586418, + "learning_rate": 5.4723873187861085e-05, + "loss": 0.0092, + "step": 114200 + }, + { + "epoch": 5.233336767281344, + "grad_norm": 0.2950020730495453, + "learning_rate": 5.4644310301889334e-05, + "loss": 0.0089, + "step": 114300 + }, + { + "epoch": 5.237915364628046, + "grad_norm": 0.12343444675207138, + "learning_rate": 5.456473555193242e-05, + "loss": 0.008, + "step": 114400 + }, + { + "epoch": 5.242493961974749, + "grad_norm": 0.5347928404808044, + "learning_rate": 5.4485149141265667e-05, + "loss": 0.0079, + "step": 114500 + }, + { + "epoch": 5.247072559321452, + "grad_norm": 0.9914150834083557, + "learning_rate": 5.440555127319418e-05, + "loss": 0.0111, + "step": 114600 + }, + { + "epoch": 5.251651156668155, + "grad_norm": 0.24366235733032227, + "learning_rate": 5.432594215105234e-05, + "loss": 0.0085, + "step": 114700 + }, + { + "epoch": 5.256229754014858, + "grad_norm": 0.1021379604935646, + "learning_rate": 5.424632197820324e-05, + "loss": 0.0091, + "step": 114800 + }, + { + "epoch": 5.26080835136156, + "grad_norm": 0.11071757227182388, + "learning_rate": 5.4166690958038265e-05, + "loss": 0.0082, + "step": 114900 + }, + { + "epoch": 5.265386948708263, + "grad_norm": 1.4259638786315918, + "learning_rate": 5.408704929397648e-05, + "loss": 0.0085, + "step": 115000 + }, + { + "epoch": 5.269965546054966, + "grad_norm": 0.2681211531162262, + "learning_rate": 5.4007397189464105e-05, + "loss": 0.0108, + "step": 115100 + }, + { + "epoch": 5.274544143401669, + "grad_norm": 0.4776928126811981, + "learning_rate": 5.3927734847974064e-05, + "loss": 0.008, + "step": 115200 + }, + { + "epoch": 5.279122740748372, + "grad_norm": 0.38615280389785767, + "learning_rate": 5.3848062473005464e-05, + "loss": 0.0092, + "step": 115300 + }, + { + "epoch": 5.283701338095074, + "grad_norm": 0.23448576033115387, + "learning_rate": 5.376838026808298e-05, + "loss": 0.0099, + "step": 115400 + }, + { + "epoch": 5.2882799354417775, + "grad_norm": 0.11435823887586594, + "learning_rate": 5.368868843675642e-05, + "loss": 0.0093, + "step": 115500 + }, + { + "epoch": 5.29285853278848, + "grad_norm": 0.22706013917922974, + "learning_rate": 5.360898718260021e-05, + "loss": 0.0085, + "step": 115600 + }, + { + "epoch": 5.297437130135183, + "grad_norm": 0.04221300780773163, + "learning_rate": 5.3529276709212816e-05, + "loss": 0.0084, + "step": 115700 + }, + { + "epoch": 5.302015727481886, + "grad_norm": 0.3892548382282257, + "learning_rate": 5.344955722021624e-05, + "loss": 0.0101, + "step": 115800 + }, + { + "epoch": 5.3065943248285885, + "grad_norm": 0.13219723105430603, + "learning_rate": 5.336982891925559e-05, + "loss": 0.0087, + "step": 115900 + }, + { + "epoch": 5.311172922175292, + "grad_norm": 0.18125391006469727, + "learning_rate": 5.32900920099984e-05, + "loss": 0.0097, + "step": 116000 + }, + { + "epoch": 5.315751519521994, + "grad_norm": 0.14028698205947876, + "learning_rate": 5.321034669613422e-05, + "loss": 0.0088, + "step": 116100 + }, + { + "epoch": 5.320330116868697, + "grad_norm": 0.1114293709397316, + "learning_rate": 5.31305931813741e-05, + "loss": 0.0086, + "step": 116200 + }, + { + "epoch": 5.3249087142154, + "grad_norm": 0.20969901978969574, + "learning_rate": 5.3050831669450005e-05, + "loss": 0.0082, + "step": 116300 + }, + { + "epoch": 5.329487311562103, + "grad_norm": 0.07742590457201004, + "learning_rate": 5.297106236411432e-05, + "loss": 0.0083, + "step": 116400 + }, + { + "epoch": 5.334065908908806, + "grad_norm": 0.259859174489975, + "learning_rate": 5.2891285469139395e-05, + "loss": 0.0087, + "step": 116500 + }, + { + "epoch": 5.338644506255509, + "grad_norm": 0.3085865080356598, + "learning_rate": 5.2811501188316915e-05, + "loss": 0.0103, + "step": 116600 + }, + { + "epoch": 5.343223103602211, + "grad_norm": 0.27554938197135925, + "learning_rate": 5.2731709725457434e-05, + "loss": 0.0084, + "step": 116700 + }, + { + "epoch": 5.3478017009489145, + "grad_norm": 0.36539149284362793, + "learning_rate": 5.2651911284389896e-05, + "loss": 0.0085, + "step": 116800 + }, + { + "epoch": 5.352380298295617, + "grad_norm": 0.47007834911346436, + "learning_rate": 5.2572106068961026e-05, + "loss": 0.0106, + "step": 116900 + }, + { + "epoch": 5.35695889564232, + "grad_norm": 0.22008706629276276, + "learning_rate": 5.249229428303486e-05, + "loss": 0.0086, + "step": 117000 + }, + { + "epoch": 5.361537492989023, + "grad_norm": 0.02755674161016941, + "learning_rate": 5.241247613049225e-05, + "loss": 0.0093, + "step": 117100 + }, + { + "epoch": 5.3661160903357255, + "grad_norm": 0.11869332939386368, + "learning_rate": 5.233265181523028e-05, + "loss": 0.0086, + "step": 117200 + }, + { + "epoch": 5.370694687682429, + "grad_norm": 0.6038843393325806, + "learning_rate": 5.225282154116179e-05, + "loss": 0.0089, + "step": 117300 + }, + { + "epoch": 5.375273285029131, + "grad_norm": 0.34202539920806885, + "learning_rate": 5.217298551221483e-05, + "loss": 0.0101, + "step": 117400 + }, + { + "epoch": 5.379851882375834, + "grad_norm": 0.18048258125782013, + "learning_rate": 5.2093143932332176e-05, + "loss": 0.0089, + "step": 117500 + }, + { + "epoch": 5.384430479722537, + "grad_norm": 0.14283466339111328, + "learning_rate": 5.201329700547076e-05, + "loss": 0.0076, + "step": 117600 + }, + { + "epoch": 5.38900907706924, + "grad_norm": 0.5224958658218384, + "learning_rate": 5.193344493560117e-05, + "loss": 0.0091, + "step": 117700 + }, + { + "epoch": 5.393587674415943, + "grad_norm": 0.0608445443212986, + "learning_rate": 5.185358792670718e-05, + "loss": 0.0091, + "step": 117800 + }, + { + "epoch": 5.398166271762646, + "grad_norm": 0.3700086176395416, + "learning_rate": 5.177372618278511e-05, + "loss": 0.0087, + "step": 117900 + }, + { + "epoch": 5.402744869109348, + "grad_norm": 0.20753388106822968, + "learning_rate": 5.16938599078434e-05, + "loss": 0.0099, + "step": 118000 + }, + { + "epoch": 5.407323466456051, + "grad_norm": 0.13068944215774536, + "learning_rate": 5.161398930590212e-05, + "loss": 0.0099, + "step": 118100 + }, + { + "epoch": 5.411902063802754, + "grad_norm": 0.34820255637168884, + "learning_rate": 5.153411458099231e-05, + "loss": 0.0087, + "step": 118200 + }, + { + "epoch": 5.416480661149457, + "grad_norm": 0.3474198281764984, + "learning_rate": 5.145423593715557e-05, + "loss": 0.0104, + "step": 118300 + }, + { + "epoch": 5.42105925849616, + "grad_norm": 0.11103557795286179, + "learning_rate": 5.137435357844357e-05, + "loss": 0.0065, + "step": 118400 + }, + { + "epoch": 5.425637855842862, + "grad_norm": 0.08837764710187912, + "learning_rate": 5.129446770891738e-05, + "loss": 0.0078, + "step": 118500 + }, + { + "epoch": 5.430216453189566, + "grad_norm": 0.07470713555812836, + "learning_rate": 5.121457853264708e-05, + "loss": 0.0074, + "step": 118600 + }, + { + "epoch": 5.434795050536268, + "grad_norm": 0.08019549399614334, + "learning_rate": 5.1134686253711215e-05, + "loss": 0.0104, + "step": 118700 + }, + { + "epoch": 5.439373647882971, + "grad_norm": 0.1745513528585434, + "learning_rate": 5.105479107619624e-05, + "loss": 0.009, + "step": 118800 + }, + { + "epoch": 5.443952245229674, + "grad_norm": 0.07470156252384186, + "learning_rate": 5.097489320419598e-05, + "loss": 0.0083, + "step": 118900 + }, + { + "epoch": 5.4485308425763765, + "grad_norm": 0.5151394605636597, + "learning_rate": 5.089499284181122e-05, + "loss": 0.0083, + "step": 119000 + }, + { + "epoch": 5.45310943992308, + "grad_norm": 0.11218901723623276, + "learning_rate": 5.081509019314902e-05, + "loss": 0.0097, + "step": 119100 + }, + { + "epoch": 5.457688037269782, + "grad_norm": 0.25493118166923523, + "learning_rate": 5.073518546232234e-05, + "loss": 0.0084, + "step": 119200 + }, + { + "epoch": 5.462266634616485, + "grad_norm": 0.39373013377189636, + "learning_rate": 5.065527885344944e-05, + "loss": 0.0098, + "step": 119300 + }, + { + "epoch": 5.466845231963188, + "grad_norm": 0.5648688673973083, + "learning_rate": 5.057537057065338e-05, + "loss": 0.009, + "step": 119400 + }, + { + "epoch": 5.471423829309891, + "grad_norm": 0.2762792408466339, + "learning_rate": 5.049546081806149e-05, + "loss": 0.0077, + "step": 119500 + }, + { + "epoch": 5.476002426656594, + "grad_norm": 0.10117408633232117, + "learning_rate": 5.041554979980486e-05, + "loss": 0.0078, + "step": 119600 + }, + { + "epoch": 5.480581024003296, + "grad_norm": 0.7319039106369019, + "learning_rate": 5.0335637720017817e-05, + "loss": 0.0085, + "step": 119700 + }, + { + "epoch": 5.485159621349999, + "grad_norm": 0.4741845428943634, + "learning_rate": 5.025572478283738e-05, + "loss": 0.0084, + "step": 119800 + }, + { + "epoch": 5.4897382186967025, + "grad_norm": 0.2592092752456665, + "learning_rate": 5.0175811192402767e-05, + "loss": 0.0075, + "step": 119900 + }, + { + "epoch": 5.494316816043405, + "grad_norm": 0.03605992719531059, + "learning_rate": 5.009589715285492e-05, + "loss": 0.0056, + "step": 120000 + }, + { + "epoch": 5.498895413390108, + "grad_norm": 0.518429696559906, + "learning_rate": 5.0015982868335834e-05, + "loss": 0.0104, + "step": 120100 + }, + { + "epoch": 5.503474010736811, + "grad_norm": 0.42362892627716064, + "learning_rate": 4.993606854298817e-05, + "loss": 0.0106, + "step": 120200 + }, + { + "epoch": 5.5080526080835135, + "grad_norm": 0.27914491295814514, + "learning_rate": 4.985615438095473e-05, + "loss": 0.008, + "step": 120300 + }, + { + "epoch": 5.512631205430217, + "grad_norm": 0.12702660262584686, + "learning_rate": 4.977624058637783e-05, + "loss": 0.0094, + "step": 120400 + }, + { + "epoch": 5.517209802776919, + "grad_norm": 0.06755949556827545, + "learning_rate": 4.969632736339893e-05, + "loss": 0.0089, + "step": 120500 + }, + { + "epoch": 5.521788400123622, + "grad_norm": 0.2052990347146988, + "learning_rate": 4.961641491615794e-05, + "loss": 0.0079, + "step": 120600 + }, + { + "epoch": 5.526366997470325, + "grad_norm": 0.27255722880363464, + "learning_rate": 4.953650344879286e-05, + "loss": 0.0076, + "step": 120700 + }, + { + "epoch": 5.530945594817028, + "grad_norm": 0.10563024878501892, + "learning_rate": 4.945659316543916e-05, + "loss": 0.0087, + "step": 120800 + }, + { + "epoch": 5.535524192163731, + "grad_norm": 0.31879550218582153, + "learning_rate": 4.9376684270229254e-05, + "loss": 0.009, + "step": 120900 + }, + { + "epoch": 5.540102789510433, + "grad_norm": 0.21383854746818542, + "learning_rate": 4.929677696729207e-05, + "loss": 0.0085, + "step": 121000 + }, + { + "epoch": 5.544681386857136, + "grad_norm": 0.2081623524427414, + "learning_rate": 4.921687146075244e-05, + "loss": 0.0095, + "step": 121100 + }, + { + "epoch": 5.5492599842038395, + "grad_norm": 0.12125098705291748, + "learning_rate": 4.913696795473058e-05, + "loss": 0.0084, + "step": 121200 + }, + { + "epoch": 5.553838581550542, + "grad_norm": 0.17820671200752258, + "learning_rate": 4.905706665334165e-05, + "loss": 0.0081, + "step": 121300 + }, + { + "epoch": 5.558417178897245, + "grad_norm": 0.2230408489704132, + "learning_rate": 4.897716776069512e-05, + "loss": 0.0079, + "step": 121400 + }, + { + "epoch": 5.562995776243948, + "grad_norm": 0.3595784604549408, + "learning_rate": 4.889727148089439e-05, + "loss": 0.0104, + "step": 121500 + }, + { + "epoch": 5.56757437359065, + "grad_norm": 0.08180402964353561, + "learning_rate": 4.8817378018036073e-05, + "loss": 0.008, + "step": 121600 + }, + { + "epoch": 5.572152970937354, + "grad_norm": 0.13690640032291412, + "learning_rate": 4.873748757620967e-05, + "loss": 0.0093, + "step": 121700 + }, + { + "epoch": 5.576731568284056, + "grad_norm": 0.048987165093421936, + "learning_rate": 4.865760035949695e-05, + "loss": 0.0088, + "step": 121800 + }, + { + "epoch": 5.581310165630759, + "grad_norm": 0.7239773869514465, + "learning_rate": 4.857771657197142e-05, + "loss": 0.0098, + "step": 121900 + }, + { + "epoch": 5.585888762977461, + "grad_norm": 0.13404466211795807, + "learning_rate": 4.849783641769783e-05, + "loss": 0.0095, + "step": 122000 + }, + { + "epoch": 5.590467360324165, + "grad_norm": 0.30230358242988586, + "learning_rate": 4.8417960100731706e-05, + "loss": 0.0076, + "step": 122100 + }, + { + "epoch": 5.595045957670868, + "grad_norm": 0.169099822640419, + "learning_rate": 4.8338087825118675e-05, + "loss": 0.009, + "step": 122200 + }, + { + "epoch": 5.59962455501757, + "grad_norm": 0.7153336405754089, + "learning_rate": 4.8258219794894095e-05, + "loss": 0.0088, + "step": 122300 + }, + { + "epoch": 5.604203152364273, + "grad_norm": 0.167174831032753, + "learning_rate": 4.817835621408251e-05, + "loss": 0.0076, + "step": 122400 + }, + { + "epoch": 5.608781749710976, + "grad_norm": 0.16803164780139923, + "learning_rate": 4.809849728669702e-05, + "loss": 0.0079, + "step": 122500 + }, + { + "epoch": 5.613360347057679, + "grad_norm": 0.645155131816864, + "learning_rate": 4.80186432167389e-05, + "loss": 0.008, + "step": 122600 + }, + { + "epoch": 5.617938944404382, + "grad_norm": 0.1512228399515152, + "learning_rate": 4.7938794208197005e-05, + "loss": 0.0091, + "step": 122700 + }, + { + "epoch": 5.622517541751084, + "grad_norm": 0.28644976019859314, + "learning_rate": 4.7858950465047224e-05, + "loss": 0.0081, + "step": 122800 + }, + { + "epoch": 5.627096139097787, + "grad_norm": 0.5135303735733032, + "learning_rate": 4.7779112191252054e-05, + "loss": 0.0092, + "step": 122900 + }, + { + "epoch": 5.631674736444491, + "grad_norm": 0.38240012526512146, + "learning_rate": 4.769927959075999e-05, + "loss": 0.0105, + "step": 123000 + }, + { + "epoch": 5.636253333791193, + "grad_norm": 0.565757155418396, + "learning_rate": 4.761945286750499e-05, + "loss": 0.0093, + "step": 123100 + }, + { + "epoch": 5.640831931137896, + "grad_norm": 0.12311606109142303, + "learning_rate": 4.7539632225406095e-05, + "loss": 0.0076, + "step": 123200 + }, + { + "epoch": 5.645410528484598, + "grad_norm": 0.2507004737854004, + "learning_rate": 4.745981786836672e-05, + "loss": 0.0088, + "step": 123300 + }, + { + "epoch": 5.6499891258313015, + "grad_norm": 0.3408881425857544, + "learning_rate": 4.738001000027431e-05, + "loss": 0.0088, + "step": 123400 + }, + { + "epoch": 5.654567723178005, + "grad_norm": 0.6254268884658813, + "learning_rate": 4.730020882499964e-05, + "loss": 0.0091, + "step": 123500 + }, + { + "epoch": 5.659146320524707, + "grad_norm": 0.046281538903713226, + "learning_rate": 4.722041454639645e-05, + "loss": 0.0084, + "step": 123600 + }, + { + "epoch": 5.66372491787141, + "grad_norm": 0.12148924171924591, + "learning_rate": 4.714062736830088e-05, + "loss": 0.0078, + "step": 123700 + }, + { + "epoch": 5.668303515218113, + "grad_norm": 0.06817379593849182, + "learning_rate": 4.706084749453085e-05, + "loss": 0.0078, + "step": 123800 + }, + { + "epoch": 5.672882112564816, + "grad_norm": 0.11472304165363312, + "learning_rate": 4.6981075128885693e-05, + "loss": 0.0092, + "step": 123900 + }, + { + "epoch": 5.677460709911519, + "grad_norm": 0.7873682975769043, + "learning_rate": 4.690131047514556e-05, + "loss": 0.0082, + "step": 124000 + }, + { + "epoch": 5.682039307258221, + "grad_norm": 0.34170079231262207, + "learning_rate": 4.6821553737070856e-05, + "loss": 0.008, + "step": 124100 + }, + { + "epoch": 5.686617904604924, + "grad_norm": 0.562393844127655, + "learning_rate": 4.674180511840178e-05, + "loss": 0.0079, + "step": 124200 + }, + { + "epoch": 5.691196501951627, + "grad_norm": 0.32295772433280945, + "learning_rate": 4.6662064822857844e-05, + "loss": 0.0088, + "step": 124300 + }, + { + "epoch": 5.69577509929833, + "grad_norm": 0.09313233196735382, + "learning_rate": 4.658233305413722e-05, + "loss": 0.0083, + "step": 124400 + }, + { + "epoch": 5.700353696645033, + "grad_norm": 0.27240103483200073, + "learning_rate": 4.650261001591633e-05, + "loss": 0.0076, + "step": 124500 + }, + { + "epoch": 5.704932293991735, + "grad_norm": 0.5987135767936707, + "learning_rate": 4.642289591184934e-05, + "loss": 0.0072, + "step": 124600 + }, + { + "epoch": 5.7095108913384385, + "grad_norm": 0.044540900737047195, + "learning_rate": 4.6343190945567504e-05, + "loss": 0.0084, + "step": 124700 + }, + { + "epoch": 5.714089488685142, + "grad_norm": 0.19168873131275177, + "learning_rate": 4.626349532067879e-05, + "loss": 0.0085, + "step": 124800 + }, + { + "epoch": 5.718668086031844, + "grad_norm": 0.3095737397670746, + "learning_rate": 4.6183809240767314e-05, + "loss": 0.0102, + "step": 124900 + }, + { + "epoch": 5.723246683378547, + "grad_norm": 0.34387272596359253, + "learning_rate": 4.6104132909392765e-05, + "loss": 0.0084, + "step": 125000 + }, + { + "epoch": 5.727825280725249, + "grad_norm": 0.18629814684391022, + "learning_rate": 4.602446653008997e-05, + "loss": 0.0091, + "step": 125100 + }, + { + "epoch": 5.732403878071953, + "grad_norm": 0.1663748174905777, + "learning_rate": 4.594481030636832e-05, + "loss": 0.0094, + "step": 125200 + }, + { + "epoch": 5.736982475418656, + "grad_norm": 0.21490418910980225, + "learning_rate": 4.586516444171122e-05, + "loss": 0.0083, + "step": 125300 + }, + { + "epoch": 5.741561072765358, + "grad_norm": 0.17258259654045105, + "learning_rate": 4.57855291395757e-05, + "loss": 0.0089, + "step": 125400 + }, + { + "epoch": 5.746139670112061, + "grad_norm": 0.25354665517807007, + "learning_rate": 4.5705904603391716e-05, + "loss": 0.0077, + "step": 125500 + }, + { + "epoch": 5.750718267458764, + "grad_norm": 0.28657224774360657, + "learning_rate": 4.562629103656183e-05, + "loss": 0.0074, + "step": 125600 + }, + { + "epoch": 5.755296864805467, + "grad_norm": 0.36166995763778687, + "learning_rate": 4.5546688642460446e-05, + "loss": 0.0091, + "step": 125700 + }, + { + "epoch": 5.75987546215217, + "grad_norm": 0.19394946098327637, + "learning_rate": 4.5467097624433524e-05, + "loss": 0.0097, + "step": 125800 + }, + { + "epoch": 5.764454059498872, + "grad_norm": 0.16516007483005524, + "learning_rate": 4.538751818579797e-05, + "loss": 0.0085, + "step": 125900 + }, + { + "epoch": 5.769032656845575, + "grad_norm": 0.2279433161020279, + "learning_rate": 4.530795052984104e-05, + "loss": 0.0078, + "step": 126000 + }, + { + "epoch": 5.773611254192279, + "grad_norm": 0.5914369225502014, + "learning_rate": 4.522839485981994e-05, + "loss": 0.0085, + "step": 126100 + }, + { + "epoch": 5.778189851538981, + "grad_norm": 0.06345394253730774, + "learning_rate": 4.514885137896127e-05, + "loss": 0.0096, + "step": 126200 + }, + { + "epoch": 5.782768448885684, + "grad_norm": 0.2646149694919586, + "learning_rate": 4.506932029046044e-05, + "loss": 0.0073, + "step": 126300 + }, + { + "epoch": 5.787347046232386, + "grad_norm": 0.8094835877418518, + "learning_rate": 4.498980179748123e-05, + "loss": 0.0082, + "step": 126400 + }, + { + "epoch": 5.79192564357909, + "grad_norm": 0.4164597988128662, + "learning_rate": 4.4910296103155296e-05, + "loss": 0.0079, + "step": 126500 + }, + { + "epoch": 5.796504240925793, + "grad_norm": 0.3092726469039917, + "learning_rate": 4.48308034105815e-05, + "loss": 0.0102, + "step": 126600 + }, + { + "epoch": 5.801082838272495, + "grad_norm": 0.2584327161312103, + "learning_rate": 4.475132392282556e-05, + "loss": 0.0084, + "step": 126700 + }, + { + "epoch": 5.805661435619198, + "grad_norm": 0.07558545470237732, + "learning_rate": 4.467185784291946e-05, + "loss": 0.008, + "step": 126800 + }, + { + "epoch": 5.8102400329659005, + "grad_norm": 0.1425691694021225, + "learning_rate": 4.459240537386089e-05, + "loss": 0.0095, + "step": 126900 + }, + { + "epoch": 5.814818630312604, + "grad_norm": 0.4250103235244751, + "learning_rate": 4.451296671861282e-05, + "loss": 0.009, + "step": 127000 + }, + { + "epoch": 5.819397227659307, + "grad_norm": 0.06756921857595444, + "learning_rate": 4.443354208010291e-05, + "loss": 0.0073, + "step": 127100 + }, + { + "epoch": 5.823975825006009, + "grad_norm": 0.2185693234205246, + "learning_rate": 4.4354131661222996e-05, + "loss": 0.0072, + "step": 127200 + }, + { + "epoch": 5.828554422352712, + "grad_norm": 0.3645274341106415, + "learning_rate": 4.427473566482863e-05, + "loss": 0.0106, + "step": 127300 + }, + { + "epoch": 5.8331330196994156, + "grad_norm": 0.26136744022369385, + "learning_rate": 4.4195354293738484e-05, + "loss": 0.0085, + "step": 127400 + }, + { + "epoch": 5.837711617046118, + "grad_norm": 0.1584431380033493, + "learning_rate": 4.4115987750733914e-05, + "loss": 0.0067, + "step": 127500 + }, + { + "epoch": 5.842290214392821, + "grad_norm": 0.3366251587867737, + "learning_rate": 4.4036636238558335e-05, + "loss": 0.0072, + "step": 127600 + }, + { + "epoch": 5.846868811739523, + "grad_norm": 0.1969982236623764, + "learning_rate": 4.39572999599168e-05, + "loss": 0.0099, + "step": 127700 + }, + { + "epoch": 5.8514474090862265, + "grad_norm": 0.178545281291008, + "learning_rate": 4.3877979117475486e-05, + "loss": 0.0063, + "step": 127800 + }, + { + "epoch": 5.856026006432929, + "grad_norm": 0.3591267168521881, + "learning_rate": 4.379867391386106e-05, + "loss": 0.0074, + "step": 127900 + }, + { + "epoch": 5.860604603779632, + "grad_norm": 0.11651629209518433, + "learning_rate": 4.371938455166028e-05, + "loss": 0.0079, + "step": 128000 + }, + { + "epoch": 5.865183201126335, + "grad_norm": 0.19086627662181854, + "learning_rate": 4.364011123341947e-05, + "loss": 0.0067, + "step": 128100 + }, + { + "epoch": 5.8697617984730375, + "grad_norm": 0.0712941512465477, + "learning_rate": 4.35608541616439e-05, + "loss": 0.0099, + "step": 128200 + }, + { + "epoch": 5.874340395819741, + "grad_norm": 0.26921433210372925, + "learning_rate": 4.348161353879737e-05, + "loss": 0.0107, + "step": 128300 + }, + { + "epoch": 5.878918993166444, + "grad_norm": 0.6659551858901978, + "learning_rate": 4.340238956730169e-05, + "loss": 0.0081, + "step": 128400 + }, + { + "epoch": 5.883497590513146, + "grad_norm": 1.7324509620666504, + "learning_rate": 4.3323182449536095e-05, + "loss": 0.0076, + "step": 128500 + }, + { + "epoch": 5.888076187859849, + "grad_norm": 0.4373182952404022, + "learning_rate": 4.3243992387836755e-05, + "loss": 0.0063, + "step": 128600 + }, + { + "epoch": 5.892654785206552, + "grad_norm": 0.45876213908195496, + "learning_rate": 4.316481958449634e-05, + "loss": 0.008, + "step": 128700 + }, + { + "epoch": 5.897233382553255, + "grad_norm": 0.18616245687007904, + "learning_rate": 4.308566424176336e-05, + "loss": 0.0072, + "step": 128800 + }, + { + "epoch": 5.901811979899958, + "grad_norm": 0.056702371686697006, + "learning_rate": 4.3006526561841725e-05, + "loss": 0.0086, + "step": 128900 + }, + { + "epoch": 5.90639057724666, + "grad_norm": 0.38554903864860535, + "learning_rate": 4.292740674689031e-05, + "loss": 0.0078, + "step": 129000 + }, + { + "epoch": 5.9109691745933635, + "grad_norm": 0.6524538397789001, + "learning_rate": 4.284830499902223e-05, + "loss": 0.0093, + "step": 129100 + }, + { + "epoch": 5.915547771940066, + "grad_norm": 0.3187253475189209, + "learning_rate": 4.276922152030454e-05, + "loss": 0.0075, + "step": 129200 + }, + { + "epoch": 5.920126369286769, + "grad_norm": 0.208381786942482, + "learning_rate": 4.269015651275761e-05, + "loss": 0.0073, + "step": 129300 + }, + { + "epoch": 5.924704966633472, + "grad_norm": 0.2706379294395447, + "learning_rate": 4.261111017835456e-05, + "loss": 0.0074, + "step": 129400 + }, + { + "epoch": 5.929283563980174, + "grad_norm": 0.8774177432060242, + "learning_rate": 4.253208271902091e-05, + "loss": 0.008, + "step": 129500 + }, + { + "epoch": 5.933862161326878, + "grad_norm": 0.22220508754253387, + "learning_rate": 4.245307433663388e-05, + "loss": 0.0078, + "step": 129600 + }, + { + "epoch": 5.938440758673581, + "grad_norm": 0.37277668714523315, + "learning_rate": 4.237408523302203e-05, + "loss": 0.0073, + "step": 129700 + }, + { + "epoch": 5.943019356020283, + "grad_norm": 0.1921541541814804, + "learning_rate": 4.229511560996459e-05, + "loss": 0.0082, + "step": 129800 + }, + { + "epoch": 5.947597953366986, + "grad_norm": 0.8308386206626892, + "learning_rate": 4.221616566919107e-05, + "loss": 0.0085, + "step": 129900 + }, + { + "epoch": 5.952176550713689, + "grad_norm": 0.11215928941965103, + "learning_rate": 4.213723561238074e-05, + "loss": 0.0081, + "step": 130000 + }, + { + "epoch": 5.956755148060392, + "grad_norm": 0.6458770632743835, + "learning_rate": 4.205832564116201e-05, + "loss": 0.0091, + "step": 130100 + }, + { + "epoch": 5.961333745407094, + "grad_norm": 0.2930019199848175, + "learning_rate": 4.197943595711198e-05, + "loss": 0.0059, + "step": 130200 + }, + { + "epoch": 5.965912342753797, + "grad_norm": 0.08667781949043274, + "learning_rate": 4.190056676175602e-05, + "loss": 0.0072, + "step": 130300 + }, + { + "epoch": 5.9704909401005, + "grad_norm": 0.34257155656814575, + "learning_rate": 4.1821718256567034e-05, + "loss": 0.0076, + "step": 130400 + }, + { + "epoch": 5.975069537447203, + "grad_norm": 0.2989988327026367, + "learning_rate": 4.174289064296514e-05, + "loss": 0.0104, + "step": 130500 + }, + { + "epoch": 5.979648134793906, + "grad_norm": 0.6057233810424805, + "learning_rate": 4.1664084122317124e-05, + "loss": 0.0065, + "step": 130600 + }, + { + "epoch": 5.984226732140609, + "grad_norm": 0.16379669308662415, + "learning_rate": 4.15852988959358e-05, + "loss": 0.0072, + "step": 130700 + }, + { + "epoch": 5.988805329487311, + "grad_norm": 0.061728738248348236, + "learning_rate": 4.150653516507964e-05, + "loss": 0.0076, + "step": 130800 + }, + { + "epoch": 5.9933839268340146, + "grad_norm": 0.19023200869560242, + "learning_rate": 4.142779313095223e-05, + "loss": 0.0074, + "step": 130900 + }, + { + "epoch": 5.997962524180718, + "grad_norm": 0.2615407109260559, + "learning_rate": 4.134907299470165e-05, + "loss": 0.0087, + "step": 131000 + }, + { + "epoch": 5.999977107013266, + "eval_loss": 0.13592004776000977, + "eval_runtime": 244.1354, + "eval_samples_per_second": 22.528, + "eval_steps_per_second": 22.528, + "step": 131044 + }, + { + "epoch": 6.00254112152742, + "grad_norm": 0.12518206238746643, + "learning_rate": 4.127037495742013e-05, + "loss": 0.0077, + "step": 131100 + }, + { + "epoch": 6.007119718874123, + "grad_norm": 0.1018320843577385, + "learning_rate": 4.119169922014339e-05, + "loss": 0.0043, + "step": 131200 + }, + { + "epoch": 6.0116983162208255, + "grad_norm": 0.09295986592769623, + "learning_rate": 4.111304598385018e-05, + "loss": 0.0061, + "step": 131300 + }, + { + "epoch": 6.016276913567529, + "grad_norm": 0.05357728898525238, + "learning_rate": 4.103441544946184e-05, + "loss": 0.0056, + "step": 131400 + }, + { + "epoch": 6.020855510914231, + "grad_norm": 0.08241847157478333, + "learning_rate": 4.095580781784162e-05, + "loss": 0.0059, + "step": 131500 + }, + { + "epoch": 6.025434108260934, + "grad_norm": 0.12265779078006744, + "learning_rate": 4.087722328979438e-05, + "loss": 0.0033, + "step": 131600 + }, + { + "epoch": 6.030012705607637, + "grad_norm": 0.11975305527448654, + "learning_rate": 4.079866206606582e-05, + "loss": 0.0061, + "step": 131700 + }, + { + "epoch": 6.03459130295434, + "grad_norm": 0.15824288129806519, + "learning_rate": 4.072012434734222e-05, + "loss": 0.0066, + "step": 131800 + }, + { + "epoch": 6.039169900301043, + "grad_norm": 0.2796044647693634, + "learning_rate": 4.06416103342498e-05, + "loss": 0.0055, + "step": 131900 + }, + { + "epoch": 6.043748497647746, + "grad_norm": 0.1359216570854187, + "learning_rate": 4.056312022735417e-05, + "loss": 0.006, + "step": 132000 + }, + { + "epoch": 6.048327094994448, + "grad_norm": 0.24055655300617218, + "learning_rate": 4.0484654227159914e-05, + "loss": 0.0072, + "step": 132100 + }, + { + "epoch": 6.0529056923411515, + "grad_norm": 0.4629483222961426, + "learning_rate": 4.040621253411004e-05, + "loss": 0.0059, + "step": 132200 + }, + { + "epoch": 6.057484289687854, + "grad_norm": 0.3944862186908722, + "learning_rate": 4.032779534858544e-05, + "loss": 0.0059, + "step": 132300 + }, + { + "epoch": 6.062062887034557, + "grad_norm": 0.45347368717193604, + "learning_rate": 4.0249402870904396e-05, + "loss": 0.0061, + "step": 132400 + }, + { + "epoch": 6.06664148438126, + "grad_norm": 0.587853729724884, + "learning_rate": 4.017103530132212e-05, + "loss": 0.0074, + "step": 132500 + }, + { + "epoch": 6.0712200817279625, + "grad_norm": 0.6638960242271423, + "learning_rate": 4.0092692840030134e-05, + "loss": 0.0071, + "step": 132600 + }, + { + "epoch": 6.075798679074666, + "grad_norm": 0.28217336535453796, + "learning_rate": 4.0014375687155844e-05, + "loss": 0.0055, + "step": 132700 + }, + { + "epoch": 6.080377276421368, + "grad_norm": 0.19813333451747894, + "learning_rate": 3.993608404276205e-05, + "loss": 0.0066, + "step": 132800 + }, + { + "epoch": 6.084955873768071, + "grad_norm": 0.06923089921474457, + "learning_rate": 3.985781810684631e-05, + "loss": 0.006, + "step": 132900 + }, + { + "epoch": 6.089534471114774, + "grad_norm": 0.5418972969055176, + "learning_rate": 3.9779578079340554e-05, + "loss": 0.0051, + "step": 133000 + }, + { + "epoch": 6.094113068461477, + "grad_norm": 0.1508362740278244, + "learning_rate": 3.970136416011056e-05, + "loss": 0.0049, + "step": 133100 + }, + { + "epoch": 6.09869166580818, + "grad_norm": 0.26092636585235596, + "learning_rate": 3.962317654895533e-05, + "loss": 0.0054, + "step": 133200 + }, + { + "epoch": 6.103270263154882, + "grad_norm": 0.3573048412799835, + "learning_rate": 3.9545015445606736e-05, + "loss": 0.007, + "step": 133300 + }, + { + "epoch": 6.107848860501585, + "grad_norm": 0.033060140907764435, + "learning_rate": 3.946688104972891e-05, + "loss": 0.0045, + "step": 133400 + }, + { + "epoch": 6.1124274578482884, + "grad_norm": 0.05039636045694351, + "learning_rate": 3.9388773560917724e-05, + "loss": 0.0048, + "step": 133500 + }, + { + "epoch": 6.117006055194991, + "grad_norm": 0.41924425959587097, + "learning_rate": 3.931069317870039e-05, + "loss": 0.0065, + "step": 133600 + }, + { + "epoch": 6.121584652541694, + "grad_norm": 0.33294302225112915, + "learning_rate": 3.9232640102534786e-05, + "loss": 0.0054, + "step": 133700 + }, + { + "epoch": 6.126163249888397, + "grad_norm": 0.025311682373285294, + "learning_rate": 3.915461453180914e-05, + "loss": 0.0048, + "step": 133800 + }, + { + "epoch": 6.130741847235099, + "grad_norm": 0.13680239021778107, + "learning_rate": 3.907661666584131e-05, + "loss": 0.0055, + "step": 133900 + }, + { + "epoch": 6.135320444581803, + "grad_norm": 0.2641524076461792, + "learning_rate": 3.899864670387844e-05, + "loss": 0.0063, + "step": 134000 + }, + { + "epoch": 6.139899041928505, + "grad_norm": 0.28719770908355713, + "learning_rate": 3.892070484509642e-05, + "loss": 0.0052, + "step": 134100 + }, + { + "epoch": 6.144477639275208, + "grad_norm": 0.19892792403697968, + "learning_rate": 3.884279128859927e-05, + "loss": 0.0045, + "step": 134200 + }, + { + "epoch": 6.149056236621911, + "grad_norm": 0.16647031903266907, + "learning_rate": 3.8764906233418775e-05, + "loss": 0.0062, + "step": 134300 + }, + { + "epoch": 6.1536348339686135, + "grad_norm": 0.18115417659282684, + "learning_rate": 3.86870498785139e-05, + "loss": 0.0053, + "step": 134400 + }, + { + "epoch": 6.158213431315317, + "grad_norm": 0.7846788167953491, + "learning_rate": 3.860922242277028e-05, + "loss": 0.006, + "step": 134500 + }, + { + "epoch": 6.162792028662019, + "grad_norm": 0.056557830423116684, + "learning_rate": 3.853142406499972e-05, + "loss": 0.0068, + "step": 134600 + }, + { + "epoch": 6.167370626008722, + "grad_norm": 0.37449362874031067, + "learning_rate": 3.845365500393974e-05, + "loss": 0.0055, + "step": 134700 + }, + { + "epoch": 6.171949223355425, + "grad_norm": 1.3642663955688477, + "learning_rate": 3.837591543825296e-05, + "loss": 0.0052, + "step": 134800 + }, + { + "epoch": 6.176527820702128, + "grad_norm": 0.022911841049790382, + "learning_rate": 3.8298205566526676e-05, + "loss": 0.0042, + "step": 134900 + }, + { + "epoch": 6.181106418048831, + "grad_norm": 0.028689689934253693, + "learning_rate": 3.8220525587272384e-05, + "loss": 0.0062, + "step": 135000 + }, + { + "epoch": 6.185685015395533, + "grad_norm": 0.3197433650493622, + "learning_rate": 3.814287569892512e-05, + "loss": 0.0059, + "step": 135100 + }, + { + "epoch": 6.190263612742236, + "grad_norm": 0.06785603612661362, + "learning_rate": 3.806525609984312e-05, + "loss": 0.0049, + "step": 135200 + }, + { + "epoch": 6.1948422100889395, + "grad_norm": 0.3414902985095978, + "learning_rate": 3.7987666988307244e-05, + "loss": 0.0053, + "step": 135300 + }, + { + "epoch": 6.199420807435642, + "grad_norm": 0.1857975274324417, + "learning_rate": 3.791010856252043e-05, + "loss": 0.0058, + "step": 135400 + }, + { + "epoch": 6.203999404782345, + "grad_norm": 0.1203976720571518, + "learning_rate": 3.7832581020607284e-05, + "loss": 0.0077, + "step": 135500 + }, + { + "epoch": 6.208578002129048, + "grad_norm": 0.3408762514591217, + "learning_rate": 3.7755084560613455e-05, + "loss": 0.0065, + "step": 135600 + }, + { + "epoch": 6.2131565994757505, + "grad_norm": 0.0590222142636776, + "learning_rate": 3.767761938050528e-05, + "loss": 0.0055, + "step": 135700 + }, + { + "epoch": 6.217735196822454, + "grad_norm": 0.662112295627594, + "learning_rate": 3.760018567816908e-05, + "loss": 0.0059, + "step": 135800 + }, + { + "epoch": 6.222313794169156, + "grad_norm": 0.005584437865763903, + "learning_rate": 3.752278365141084e-05, + "loss": 0.0067, + "step": 135900 + }, + { + "epoch": 6.226892391515859, + "grad_norm": 0.15262584388256073, + "learning_rate": 3.744541349795564e-05, + "loss": 0.0065, + "step": 136000 + }, + { + "epoch": 6.231470988862562, + "grad_norm": 0.644844114780426, + "learning_rate": 3.7368075415447086e-05, + "loss": 0.0047, + "step": 136100 + }, + { + "epoch": 6.236049586209265, + "grad_norm": 0.06777459383010864, + "learning_rate": 3.729076960144687e-05, + "loss": 0.0052, + "step": 136200 + }, + { + "epoch": 6.240628183555968, + "grad_norm": 0.7604510188102722, + "learning_rate": 3.721349625343431e-05, + "loss": 0.0054, + "step": 136300 + }, + { + "epoch": 6.24520678090267, + "grad_norm": 0.3464463949203491, + "learning_rate": 3.71362555688057e-05, + "loss": 0.0053, + "step": 136400 + }, + { + "epoch": 6.249785378249373, + "grad_norm": 0.032986678183078766, + "learning_rate": 3.705904774487396e-05, + "loss": 0.0053, + "step": 136500 + }, + { + "epoch": 6.2543639755960765, + "grad_norm": 0.270702987909317, + "learning_rate": 3.6981872978868065e-05, + "loss": 0.0056, + "step": 136600 + }, + { + "epoch": 6.258942572942779, + "grad_norm": 0.5898098945617676, + "learning_rate": 3.6904731467932493e-05, + "loss": 0.0047, + "step": 136700 + }, + { + "epoch": 6.263521170289482, + "grad_norm": 0.07433097064495087, + "learning_rate": 3.682762340912681e-05, + "loss": 0.0057, + "step": 136800 + }, + { + "epoch": 6.268099767636184, + "grad_norm": 0.208632692694664, + "learning_rate": 3.675054899942515e-05, + "loss": 0.0064, + "step": 136900 + }, + { + "epoch": 6.2726783649828874, + "grad_norm": 0.48827114701271057, + "learning_rate": 3.6673508435715634e-05, + "loss": 0.0056, + "step": 137000 + }, + { + "epoch": 6.277256962329591, + "grad_norm": 0.15773746371269226, + "learning_rate": 3.659650191479994e-05, + "loss": 0.0059, + "step": 137100 + }, + { + "epoch": 6.281835559676293, + "grad_norm": 0.46037283539772034, + "learning_rate": 3.651952963339282e-05, + "loss": 0.0052, + "step": 137200 + }, + { + "epoch": 6.286414157022996, + "grad_norm": 0.07779065519571304, + "learning_rate": 3.6442591788121505e-05, + "loss": 0.0051, + "step": 137300 + }, + { + "epoch": 6.290992754369698, + "grad_norm": 0.5138252377510071, + "learning_rate": 3.6365688575525315e-05, + "loss": 0.0053, + "step": 137400 + }, + { + "epoch": 6.295571351716402, + "grad_norm": 0.21173468232154846, + "learning_rate": 3.628882019205506e-05, + "loss": 0.0058, + "step": 137500 + }, + { + "epoch": 6.300149949063105, + "grad_norm": 0.4661062955856323, + "learning_rate": 3.621198683407258e-05, + "loss": 0.0053, + "step": 137600 + }, + { + "epoch": 6.304728546409807, + "grad_norm": 0.2002924084663391, + "learning_rate": 3.613518869785025e-05, + "loss": 0.0054, + "step": 137700 + }, + { + "epoch": 6.30930714375651, + "grad_norm": 0.24317267537117004, + "learning_rate": 3.6058425979570485e-05, + "loss": 0.0057, + "step": 137800 + }, + { + "epoch": 6.313885741103213, + "grad_norm": 0.14312195777893066, + "learning_rate": 3.598169887532521e-05, + "loss": 0.0059, + "step": 137900 + }, + { + "epoch": 6.318464338449916, + "grad_norm": 0.07625292241573334, + "learning_rate": 3.590500758111537e-05, + "loss": 0.006, + "step": 138000 + }, + { + "epoch": 6.323042935796619, + "grad_norm": 0.07330285757780075, + "learning_rate": 3.582835229285042e-05, + "loss": 0.0044, + "step": 138100 + }, + { + "epoch": 6.327621533143321, + "grad_norm": 0.025587473064661026, + "learning_rate": 3.5751733206347894e-05, + "loss": 0.0054, + "step": 138200 + }, + { + "epoch": 6.332200130490024, + "grad_norm": 0.09335857629776001, + "learning_rate": 3.567515051733277e-05, + "loss": 0.0062, + "step": 138300 + }, + { + "epoch": 6.336778727836728, + "grad_norm": 0.031704433262348175, + "learning_rate": 3.559860442143709e-05, + "loss": 0.0063, + "step": 138400 + }, + { + "epoch": 6.34135732518343, + "grad_norm": 0.09114887565374374, + "learning_rate": 3.552209511419943e-05, + "loss": 0.0045, + "step": 138500 + }, + { + "epoch": 6.345935922530133, + "grad_norm": 0.023929867893457413, + "learning_rate": 3.5445622791064356e-05, + "loss": 0.0053, + "step": 138600 + }, + { + "epoch": 6.350514519876835, + "grad_norm": 1.4821025133132935, + "learning_rate": 3.5369187647381974e-05, + "loss": 0.0056, + "step": 138700 + }, + { + "epoch": 6.3550931172235385, + "grad_norm": 0.16608977317810059, + "learning_rate": 3.529278987840744e-05, + "loss": 0.0055, + "step": 138800 + }, + { + "epoch": 6.359671714570242, + "grad_norm": 0.21598820388317108, + "learning_rate": 3.5216429679300376e-05, + "loss": 0.0051, + "step": 138900 + }, + { + "epoch": 6.364250311916944, + "grad_norm": 0.016882436349987984, + "learning_rate": 3.5140107245124476e-05, + "loss": 0.0052, + "step": 139000 + }, + { + "epoch": 6.368828909263647, + "grad_norm": 0.05500126630067825, + "learning_rate": 3.506382277084696e-05, + "loss": 0.0043, + "step": 139100 + }, + { + "epoch": 6.37340750661035, + "grad_norm": 0.03151680901646614, + "learning_rate": 3.4987576451338055e-05, + "loss": 0.0056, + "step": 139200 + }, + { + "epoch": 6.377986103957053, + "grad_norm": 0.07861995697021484, + "learning_rate": 3.491136848137053e-05, + "loss": 0.0044, + "step": 139300 + }, + { + "epoch": 6.382564701303756, + "grad_norm": 0.12852996587753296, + "learning_rate": 3.483519905561924e-05, + "loss": 0.0045, + "step": 139400 + }, + { + "epoch": 6.387143298650458, + "grad_norm": 0.6488528847694397, + "learning_rate": 3.475906836866046e-05, + "loss": 0.0043, + "step": 139500 + }, + { + "epoch": 6.391721895997161, + "grad_norm": 0.3967334032058716, + "learning_rate": 3.468297661497164e-05, + "loss": 0.0069, + "step": 139600 + }, + { + "epoch": 6.396300493343864, + "grad_norm": 0.1223519891500473, + "learning_rate": 3.460692398893068e-05, + "loss": 0.0054, + "step": 139700 + }, + { + "epoch": 6.400879090690567, + "grad_norm": 0.15460754930973053, + "learning_rate": 3.453091068481559e-05, + "loss": 0.0056, + "step": 139800 + }, + { + "epoch": 6.40545768803727, + "grad_norm": 0.040079645812511444, + "learning_rate": 3.445493689680388e-05, + "loss": 0.0055, + "step": 139900 + }, + { + "epoch": 6.410036285383972, + "grad_norm": 0.020937960594892502, + "learning_rate": 3.4379002818972124e-05, + "loss": 0.0054, + "step": 140000 + }, + { + "epoch": 6.4146148827306755, + "grad_norm": 0.21271613240242004, + "learning_rate": 3.43031086452955e-05, + "loss": 0.0046, + "step": 140100 + }, + { + "epoch": 6.419193480077379, + "grad_norm": 0.09579429775476456, + "learning_rate": 3.4227254569647205e-05, + "loss": 0.0051, + "step": 140200 + }, + { + "epoch": 6.423772077424081, + "grad_norm": 0.043937426060438156, + "learning_rate": 3.4151440785798004e-05, + "loss": 0.0053, + "step": 140300 + }, + { + "epoch": 6.428350674770784, + "grad_norm": 0.4987468123435974, + "learning_rate": 3.4075667487415785e-05, + "loss": 0.0058, + "step": 140400 + }, + { + "epoch": 6.432929272117486, + "grad_norm": 0.6420878171920776, + "learning_rate": 3.399993486806495e-05, + "loss": 0.0067, + "step": 140500 + }, + { + "epoch": 6.43750786946419, + "grad_norm": 0.39332908391952515, + "learning_rate": 3.392424312120601e-05, + "loss": 0.0052, + "step": 140600 + }, + { + "epoch": 6.442086466810893, + "grad_norm": 0.06132081523537636, + "learning_rate": 3.384859244019511e-05, + "loss": 0.0047, + "step": 140700 + }, + { + "epoch": 6.446665064157595, + "grad_norm": 0.23858435451984406, + "learning_rate": 3.377298301828343e-05, + "loss": 0.0047, + "step": 140800 + }, + { + "epoch": 6.451243661504298, + "grad_norm": 0.32406067848205566, + "learning_rate": 3.3697415048616765e-05, + "loss": 0.0055, + "step": 140900 + }, + { + "epoch": 6.455822258851001, + "grad_norm": 1.4054268598556519, + "learning_rate": 3.362188872423506e-05, + "loss": 0.0051, + "step": 141000 + }, + { + "epoch": 6.460400856197704, + "grad_norm": 0.05585220828652382, + "learning_rate": 3.354640423807183e-05, + "loss": 0.0062, + "step": 141100 + }, + { + "epoch": 6.464979453544407, + "grad_norm": 0.018032953143119812, + "learning_rate": 3.347096178295371e-05, + "loss": 0.0037, + "step": 141200 + }, + { + "epoch": 6.469558050891109, + "grad_norm": 0.907580554485321, + "learning_rate": 3.339556155160004e-05, + "loss": 0.006, + "step": 141300 + }, + { + "epoch": 6.474136648237812, + "grad_norm": 0.08372417092323303, + "learning_rate": 3.3320203736622184e-05, + "loss": 0.0057, + "step": 141400 + }, + { + "epoch": 6.478715245584516, + "grad_norm": 0.39907532930374146, + "learning_rate": 3.324488853052326e-05, + "loss": 0.0044, + "step": 141500 + }, + { + "epoch": 6.483293842931218, + "grad_norm": 1.1346983909606934, + "learning_rate": 3.3169616125697486e-05, + "loss": 0.0048, + "step": 141600 + }, + { + "epoch": 6.487872440277921, + "grad_norm": 0.8341863751411438, + "learning_rate": 3.3094386714429724e-05, + "loss": 0.0047, + "step": 141700 + }, + { + "epoch": 6.492451037624623, + "grad_norm": 0.07596173137426376, + "learning_rate": 3.301920048889506e-05, + "loss": 0.0041, + "step": 141800 + }, + { + "epoch": 6.497029634971327, + "grad_norm": 0.7775061130523682, + "learning_rate": 3.294405764115823e-05, + "loss": 0.0049, + "step": 141900 + }, + { + "epoch": 6.50160823231803, + "grad_norm": 0.30017733573913574, + "learning_rate": 3.286895836317319e-05, + "loss": 0.0043, + "step": 142000 + }, + { + "epoch": 6.506186829664732, + "grad_norm": 0.3541896343231201, + "learning_rate": 3.2793902846782534e-05, + "loss": 0.0055, + "step": 142100 + }, + { + "epoch": 6.510765427011435, + "grad_norm": 0.1445729285478592, + "learning_rate": 3.271889128371712e-05, + "loss": 0.0054, + "step": 142200 + }, + { + "epoch": 6.5153440243581375, + "grad_norm": 0.20019569993019104, + "learning_rate": 3.2643923865595536e-05, + "loss": 0.005, + "step": 142300 + }, + { + "epoch": 6.519922621704841, + "grad_norm": 0.21448808908462524, + "learning_rate": 3.2569000783923544e-05, + "loss": 0.0051, + "step": 142400 + }, + { + "epoch": 6.524501219051544, + "grad_norm": 0.13675835728645325, + "learning_rate": 3.249412223009368e-05, + "loss": 0.0046, + "step": 142500 + }, + { + "epoch": 6.529079816398246, + "grad_norm": 0.11393424868583679, + "learning_rate": 3.2419288395384785e-05, + "loss": 0.004, + "step": 142600 + }, + { + "epoch": 6.533658413744949, + "grad_norm": 0.2209634631872177, + "learning_rate": 3.234449947096135e-05, + "loss": 0.0042, + "step": 142700 + }, + { + "epoch": 6.538237011091653, + "grad_norm": 0.025969982147216797, + "learning_rate": 3.226975564787322e-05, + "loss": 0.0059, + "step": 142800 + }, + { + "epoch": 6.542815608438355, + "grad_norm": 0.8372477293014526, + "learning_rate": 3.2195057117055036e-05, + "loss": 0.0042, + "step": 142900 + }, + { + "epoch": 6.547394205785058, + "grad_norm": 0.8654465675354004, + "learning_rate": 3.212040406932569e-05, + "loss": 0.0046, + "step": 143000 + }, + { + "epoch": 6.55197280313176, + "grad_norm": 0.614989161491394, + "learning_rate": 3.204579669538792e-05, + "loss": 0.0052, + "step": 143100 + }, + { + "epoch": 6.5565514004784635, + "grad_norm": 0.31863656640052795, + "learning_rate": 3.19712351858278e-05, + "loss": 0.0053, + "step": 143200 + }, + { + "epoch": 6.561129997825166, + "grad_norm": 0.5576188564300537, + "learning_rate": 3.1896719731114186e-05, + "loss": 0.0053, + "step": 143300 + }, + { + "epoch": 6.565708595171869, + "grad_norm": 0.17591483891010284, + "learning_rate": 3.182225052159833e-05, + "loss": 0.0049, + "step": 143400 + }, + { + "epoch": 6.570287192518572, + "grad_norm": 0.29361802339553833, + "learning_rate": 3.174782774751338e-05, + "loss": 0.0053, + "step": 143500 + }, + { + "epoch": 6.5748657898652745, + "grad_norm": 0.046735215932130814, + "learning_rate": 3.167345159897378e-05, + "loss": 0.0047, + "step": 143600 + }, + { + "epoch": 6.579444387211978, + "grad_norm": 0.3643573224544525, + "learning_rate": 3.1599122265974946e-05, + "loss": 0.0041, + "step": 143700 + }, + { + "epoch": 6.584022984558681, + "grad_norm": 0.1328035444021225, + "learning_rate": 3.152483993839265e-05, + "loss": 0.0045, + "step": 143800 + }, + { + "epoch": 6.588601581905383, + "grad_norm": 0.11621426790952682, + "learning_rate": 3.145060480598263e-05, + "loss": 0.0045, + "step": 143900 + }, + { + "epoch": 6.593180179252086, + "grad_norm": 0.3485720753669739, + "learning_rate": 3.137641705838004e-05, + "loss": 0.0057, + "step": 144000 + }, + { + "epoch": 6.597758776598789, + "grad_norm": 0.7195566892623901, + "learning_rate": 3.1302276885098955e-05, + "loss": 0.0057, + "step": 144100 + }, + { + "epoch": 6.602337373945492, + "grad_norm": 0.07403887808322906, + "learning_rate": 3.122818447553201e-05, + "loss": 0.0052, + "step": 144200 + }, + { + "epoch": 6.606915971292195, + "grad_norm": 0.07861506193876266, + "learning_rate": 3.115414001894974e-05, + "loss": 0.0054, + "step": 144300 + }, + { + "epoch": 6.611494568638897, + "grad_norm": 0.20306392014026642, + "learning_rate": 3.108014370450021e-05, + "loss": 0.0063, + "step": 144400 + }, + { + "epoch": 6.6160731659856005, + "grad_norm": 0.5941068530082703, + "learning_rate": 3.100619572120854e-05, + "loss": 0.0052, + "step": 144500 + }, + { + "epoch": 6.620651763332303, + "grad_norm": 0.4385504126548767, + "learning_rate": 3.0932296257976336e-05, + "loss": 0.0049, + "step": 144600 + }, + { + "epoch": 6.625230360679006, + "grad_norm": 0.07183999568223953, + "learning_rate": 3.0858445503581266e-05, + "loss": 0.0054, + "step": 144700 + }, + { + "epoch": 6.629808958025709, + "grad_norm": 0.04285150766372681, + "learning_rate": 3.0784643646676635e-05, + "loss": 0.0046, + "step": 144800 + }, + { + "epoch": 6.634387555372411, + "grad_norm": 0.04266421124339104, + "learning_rate": 3.071089087579074e-05, + "loss": 0.0064, + "step": 144900 + }, + { + "epoch": 6.638966152719115, + "grad_norm": 0.4585297405719757, + "learning_rate": 3.063718737932655e-05, + "loss": 0.0053, + "step": 145000 + }, + { + "epoch": 6.643544750065818, + "grad_norm": 0.04796597734093666, + "learning_rate": 3.0563533345561155e-05, + "loss": 0.0052, + "step": 145100 + }, + { + "epoch": 6.64812334741252, + "grad_norm": 0.7312212586402893, + "learning_rate": 3.0489928962645275e-05, + "loss": 0.0047, + "step": 145200 + }, + { + "epoch": 6.652701944759223, + "grad_norm": 0.4970768988132477, + "learning_rate": 3.041637441860279e-05, + "loss": 0.005, + "step": 145300 + }, + { + "epoch": 6.657280542105926, + "grad_norm": 0.28591519594192505, + "learning_rate": 3.0342869901330313e-05, + "loss": 0.0047, + "step": 145400 + }, + { + "epoch": 6.661859139452629, + "grad_norm": 0.019086016342043877, + "learning_rate": 3.02694155985966e-05, + "loss": 0.0052, + "step": 145500 + }, + { + "epoch": 6.666437736799331, + "grad_norm": 0.06344935297966003, + "learning_rate": 3.019601169804216e-05, + "loss": 0.0054, + "step": 145600 + }, + { + "epoch": 6.671016334146034, + "grad_norm": 0.2660221755504608, + "learning_rate": 3.012265838717878e-05, + "loss": 0.0049, + "step": 145700 + }, + { + "epoch": 6.675594931492737, + "grad_norm": 0.22985537350177765, + "learning_rate": 3.0049355853388955e-05, + "loss": 0.0049, + "step": 145800 + }, + { + "epoch": 6.68017352883944, + "grad_norm": 0.06822630017995834, + "learning_rate": 2.9976104283925515e-05, + "loss": 0.004, + "step": 145900 + }, + { + "epoch": 6.684752126186143, + "grad_norm": 0.03378499671816826, + "learning_rate": 2.9902903865911068e-05, + "loss": 0.0062, + "step": 146000 + }, + { + "epoch": 6.689330723532846, + "grad_norm": 0.3799358904361725, + "learning_rate": 2.9829754786337603e-05, + "loss": 0.0056, + "step": 146100 + }, + { + "epoch": 6.693909320879548, + "grad_norm": 0.2396411895751953, + "learning_rate": 2.975665723206591e-05, + "loss": 0.0049, + "step": 146200 + }, + { + "epoch": 6.698487918226252, + "grad_norm": 0.19714663922786713, + "learning_rate": 2.9683611389825167e-05, + "loss": 0.0057, + "step": 146300 + }, + { + "epoch": 6.703066515572954, + "grad_norm": 0.07194243371486664, + "learning_rate": 2.9610617446212495e-05, + "loss": 0.0058, + "step": 146400 + }, + { + "epoch": 6.707645112919657, + "grad_norm": 0.567692220211029, + "learning_rate": 2.9537675587692382e-05, + "loss": 0.0045, + "step": 146500 + }, + { + "epoch": 6.71222371026636, + "grad_norm": 0.4618910551071167, + "learning_rate": 2.946478600059629e-05, + "loss": 0.0051, + "step": 146600 + }, + { + "epoch": 6.7168023076130625, + "grad_norm": 0.09115318953990936, + "learning_rate": 2.939194887112218e-05, + "loss": 0.0046, + "step": 146700 + }, + { + "epoch": 6.721380904959766, + "grad_norm": 0.1926048994064331, + "learning_rate": 2.9319164385333953e-05, + "loss": 0.0039, + "step": 146800 + }, + { + "epoch": 6.725959502306468, + "grad_norm": 0.5767799615859985, + "learning_rate": 2.9246432729161055e-05, + "loss": 0.0068, + "step": 146900 + }, + { + "epoch": 6.730538099653171, + "grad_norm": 0.5855737328529358, + "learning_rate": 2.917375408839803e-05, + "loss": 0.0039, + "step": 147000 + }, + { + "epoch": 6.735116696999874, + "grad_norm": 0.18008683621883392, + "learning_rate": 2.910112864870388e-05, + "loss": 0.0053, + "step": 147100 + }, + { + "epoch": 6.739695294346577, + "grad_norm": 0.013704614713788033, + "learning_rate": 2.9028556595601786e-05, + "loss": 0.0058, + "step": 147200 + }, + { + "epoch": 6.74427389169328, + "grad_norm": 0.2103748619556427, + "learning_rate": 2.895603811447858e-05, + "loss": 0.0053, + "step": 147300 + }, + { + "epoch": 6.748852489039983, + "grad_norm": 0.3199872374534607, + "learning_rate": 2.888357339058413e-05, + "loss": 0.0045, + "step": 147400 + }, + { + "epoch": 6.753431086386685, + "grad_norm": 0.1987699270248413, + "learning_rate": 2.8811162609031104e-05, + "loss": 0.0038, + "step": 147500 + }, + { + "epoch": 6.7580096837333885, + "grad_norm": 0.12141498178243637, + "learning_rate": 2.8738805954794295e-05, + "loss": 0.004, + "step": 147600 + }, + { + "epoch": 6.762588281080091, + "grad_norm": 0.24456042051315308, + "learning_rate": 2.8666503612710226e-05, + "loss": 0.0052, + "step": 147700 + }, + { + "epoch": 6.767166878426794, + "grad_norm": 0.21112100780010223, + "learning_rate": 2.8594255767476718e-05, + "loss": 0.0057, + "step": 147800 + }, + { + "epoch": 6.771745475773497, + "grad_norm": 0.395063579082489, + "learning_rate": 2.852206260365237e-05, + "loss": 0.0051, + "step": 147900 + }, + { + "epoch": 6.7763240731201995, + "grad_norm": 0.39365246891975403, + "learning_rate": 2.8449924305656107e-05, + "loss": 0.0043, + "step": 148000 + }, + { + "epoch": 6.780902670466903, + "grad_norm": 0.19307668507099152, + "learning_rate": 2.8377841057766624e-05, + "loss": 0.0057, + "step": 148100 + }, + { + "epoch": 6.785481267813605, + "grad_norm": 0.3313720226287842, + "learning_rate": 2.8305813044122097e-05, + "loss": 0.0054, + "step": 148200 + }, + { + "epoch": 6.790059865160308, + "grad_norm": 0.6470041871070862, + "learning_rate": 2.8233840448719532e-05, + "loss": 0.0048, + "step": 148300 + }, + { + "epoch": 6.794638462507011, + "grad_norm": 0.9007655382156372, + "learning_rate": 2.8161923455414367e-05, + "loss": 0.0055, + "step": 148400 + }, + { + "epoch": 6.799217059853714, + "grad_norm": 0.8383020758628845, + "learning_rate": 2.8090062247920045e-05, + "loss": 0.005, + "step": 148500 + }, + { + "epoch": 6.803795657200417, + "grad_norm": 0.2168063223361969, + "learning_rate": 2.80182570098075e-05, + "loss": 0.0045, + "step": 148600 + }, + { + "epoch": 6.80837425454712, + "grad_norm": 0.1763121336698532, + "learning_rate": 2.794650792450464e-05, + "loss": 0.0058, + "step": 148700 + }, + { + "epoch": 6.812952851893822, + "grad_norm": 0.022952038794755936, + "learning_rate": 2.7874815175296e-05, + "loss": 0.0043, + "step": 148800 + }, + { + "epoch": 6.8175314492405255, + "grad_norm": 0.01308775506913662, + "learning_rate": 2.7803178945322134e-05, + "loss": 0.0047, + "step": 148900 + }, + { + "epoch": 6.822110046587228, + "grad_norm": 0.0029964440036565065, + "learning_rate": 2.7731599417579245e-05, + "loss": 0.0052, + "step": 149000 + }, + { + "epoch": 6.826688643933931, + "grad_norm": 0.2904300093650818, + "learning_rate": 2.7660076774918708e-05, + "loss": 0.0039, + "step": 149100 + }, + { + "epoch": 6.831267241280633, + "grad_norm": 0.19335739314556122, + "learning_rate": 2.7588611200046592e-05, + "loss": 0.004, + "step": 149200 + }, + { + "epoch": 6.835845838627336, + "grad_norm": 0.04841936379671097, + "learning_rate": 2.7517202875523117e-05, + "loss": 0.0048, + "step": 149300 + }, + { + "epoch": 6.84042443597404, + "grad_norm": 0.12522141635417938, + "learning_rate": 2.7445851983762344e-05, + "loss": 0.004, + "step": 149400 + }, + { + "epoch": 6.845003033320742, + "grad_norm": 0.17885401844978333, + "learning_rate": 2.737455870703155e-05, + "loss": 0.0055, + "step": 149500 + }, + { + "epoch": 6.849581630667445, + "grad_norm": 0.47798067331314087, + "learning_rate": 2.7303323227450857e-05, + "loss": 0.005, + "step": 149600 + }, + { + "epoch": 6.854160228014148, + "grad_norm": 0.02613680437207222, + "learning_rate": 2.7232145726992752e-05, + "loss": 0.0065, + "step": 149700 + }, + { + "epoch": 6.8587388253608506, + "grad_norm": 0.2435833066701889, + "learning_rate": 2.7161026387481636e-05, + "loss": 0.0061, + "step": 149800 + }, + { + "epoch": 6.863317422707554, + "grad_norm": 0.08455272018909454, + "learning_rate": 2.7089965390593263e-05, + "loss": 0.0059, + "step": 149900 + }, + { + "epoch": 6.867896020054256, + "grad_norm": 0.08332820981740952, + "learning_rate": 2.7018962917854418e-05, + "loss": 0.0042, + "step": 150000 + }, + { + "epoch": 6.872474617400959, + "grad_norm": 0.34127193689346313, + "learning_rate": 2.6948019150642383e-05, + "loss": 0.0029, + "step": 150100 + }, + { + "epoch": 6.877053214747662, + "grad_norm": 0.1273692101240158, + "learning_rate": 2.6877134270184435e-05, + "loss": 0.0051, + "step": 150200 + }, + { + "epoch": 6.881631812094365, + "grad_norm": 0.14943909645080566, + "learning_rate": 2.6806308457557423e-05, + "loss": 0.0062, + "step": 150300 + }, + { + "epoch": 6.886210409441068, + "grad_norm": 0.14334943890571594, + "learning_rate": 2.6735541893687343e-05, + "loss": 0.0056, + "step": 150400 + }, + { + "epoch": 6.89078900678777, + "grad_norm": 0.15444263815879822, + "learning_rate": 2.666483475934885e-05, + "loss": 0.0045, + "step": 150500 + }, + { + "epoch": 6.895367604134473, + "grad_norm": 0.31661495566368103, + "learning_rate": 2.6594187235164713e-05, + "loss": 0.0063, + "step": 150600 + }, + { + "epoch": 6.8999462014811765, + "grad_norm": 0.4060909152030945, + "learning_rate": 2.65235995016055e-05, + "loss": 0.0052, + "step": 150700 + }, + { + "epoch": 6.904524798827879, + "grad_norm": 0.20253728330135345, + "learning_rate": 2.645307173898901e-05, + "loss": 0.0064, + "step": 150800 + }, + { + "epoch": 6.909103396174582, + "grad_norm": 0.7078954577445984, + "learning_rate": 2.6382604127479815e-05, + "loss": 0.0044, + "step": 150900 + }, + { + "epoch": 6.913681993521285, + "grad_norm": 0.14812923967838287, + "learning_rate": 2.6312196847088893e-05, + "loss": 0.0052, + "step": 151000 + }, + { + "epoch": 6.9182605908679875, + "grad_norm": 0.10642609000205994, + "learning_rate": 2.6241850077673087e-05, + "loss": 0.0052, + "step": 151100 + }, + { + "epoch": 6.922839188214691, + "grad_norm": 0.15536317229270935, + "learning_rate": 2.6171563998934605e-05, + "loss": 0.0053, + "step": 151200 + }, + { + "epoch": 6.927417785561393, + "grad_norm": 0.1677425354719162, + "learning_rate": 2.6101338790420715e-05, + "loss": 0.0048, + "step": 151300 + }, + { + "epoch": 6.931996382908096, + "grad_norm": 0.1952294558286667, + "learning_rate": 2.6031174631523118e-05, + "loss": 0.0059, + "step": 151400 + }, + { + "epoch": 6.9365749802547985, + "grad_norm": 0.171901673078537, + "learning_rate": 2.5961071701477567e-05, + "loss": 0.0049, + "step": 151500 + }, + { + "epoch": 6.941153577601502, + "grad_norm": 0.0632760226726532, + "learning_rate": 2.589103017936344e-05, + "loss": 0.0043, + "step": 151600 + }, + { + "epoch": 6.945732174948205, + "grad_norm": 0.14387081563472748, + "learning_rate": 2.582105024410325e-05, + "loss": 0.0046, + "step": 151700 + }, + { + "epoch": 6.950310772294907, + "grad_norm": 0.1789962351322174, + "learning_rate": 2.575113207446213e-05, + "loss": 0.0041, + "step": 151800 + }, + { + "epoch": 6.95488936964161, + "grad_norm": 0.05178796499967575, + "learning_rate": 2.5681275849047482e-05, + "loss": 0.0052, + "step": 151900 + }, + { + "epoch": 6.9594679669883135, + "grad_norm": 0.06059027463197708, + "learning_rate": 2.5611481746308473e-05, + "loss": 0.0049, + "step": 152000 + }, + { + "epoch": 6.964046564335016, + "grad_norm": 0.02760574221611023, + "learning_rate": 2.5541749944535554e-05, + "loss": 0.005, + "step": 152100 + }, + { + "epoch": 6.968625161681719, + "grad_norm": 0.5041255950927734, + "learning_rate": 2.547208062185999e-05, + "loss": 0.0034, + "step": 152200 + }, + { + "epoch": 6.973203759028421, + "grad_norm": 0.12533140182495117, + "learning_rate": 2.5402473956253515e-05, + "loss": 0.0059, + "step": 152300 + }, + { + "epoch": 6.9777823563751245, + "grad_norm": 0.0706457793712616, + "learning_rate": 2.5332930125527787e-05, + "loss": 0.006, + "step": 152400 + }, + { + "epoch": 6.982360953721828, + "grad_norm": 0.37089434266090393, + "learning_rate": 2.5263449307333908e-05, + "loss": 0.0052, + "step": 152500 + }, + { + "epoch": 6.98693955106853, + "grad_norm": 0.034625936299562454, + "learning_rate": 2.5194031679162067e-05, + "loss": 0.0048, + "step": 152600 + }, + { + "epoch": 6.991518148415233, + "grad_norm": 0.19594725966453552, + "learning_rate": 2.512467741834099e-05, + "loss": 0.0048, + "step": 152700 + }, + { + "epoch": 6.996096745761935, + "grad_norm": 0.09410729259252548, + "learning_rate": 2.505538670203754e-05, + "loss": 0.0043, + "step": 152800 + }, + { + "epoch": 6.9999885535066335, + "eval_loss": 0.1694260537624359, + "eval_runtime": 268.0609, + "eval_samples_per_second": 20.518, + "eval_steps_per_second": 20.518, + "step": 152885 + }, + { + "epoch": 7.000675343108639, + "grad_norm": 0.03295362740755081, + "learning_rate": 2.4986159707256274e-05, + "loss": 0.0039, + "step": 152900 + }, + { + "epoch": 7.005253940455342, + "grad_norm": 0.06891167163848877, + "learning_rate": 2.4916996610838973e-05, + "loss": 0.0035, + "step": 153000 + }, + { + "epoch": 7.009832537802044, + "grad_norm": 0.12001962214708328, + "learning_rate": 2.484789758946414e-05, + "loss": 0.0034, + "step": 153100 + }, + { + "epoch": 7.014411135148747, + "grad_norm": 0.2218388170003891, + "learning_rate": 2.477886281964667e-05, + "loss": 0.0026, + "step": 153200 + }, + { + "epoch": 7.01898973249545, + "grad_norm": 0.006664707791060209, + "learning_rate": 2.4709892477737262e-05, + "loss": 0.0028, + "step": 153300 + }, + { + "epoch": 7.023568329842153, + "grad_norm": 0.0030799272935837507, + "learning_rate": 2.464098673992205e-05, + "loss": 0.0035, + "step": 153400 + }, + { + "epoch": 7.028146927188856, + "grad_norm": 0.14536774158477783, + "learning_rate": 2.457214578222215e-05, + "loss": 0.0028, + "step": 153500 + }, + { + "epoch": 7.032725524535558, + "grad_norm": 0.14756208658218384, + "learning_rate": 2.450336978049322e-05, + "loss": 0.0031, + "step": 153600 + }, + { + "epoch": 7.037304121882261, + "grad_norm": 0.22443515062332153, + "learning_rate": 2.44346589104249e-05, + "loss": 0.0044, + "step": 153700 + }, + { + "epoch": 7.041882719228965, + "grad_norm": 0.05271737277507782, + "learning_rate": 2.4366013347540545e-05, + "loss": 0.0034, + "step": 153800 + }, + { + "epoch": 7.046461316575667, + "grad_norm": 0.10101396590471268, + "learning_rate": 2.4297433267196668e-05, + "loss": 0.0043, + "step": 153900 + }, + { + "epoch": 7.05103991392237, + "grad_norm": 0.042054641991853714, + "learning_rate": 2.422891884458241e-05, + "loss": 0.0034, + "step": 154000 + }, + { + "epoch": 7.055618511269072, + "grad_norm": 0.3988908529281616, + "learning_rate": 2.4160470254719285e-05, + "loss": 0.0033, + "step": 154100 + }, + { + "epoch": 7.0601971086157755, + "grad_norm": 0.02858237735927105, + "learning_rate": 2.4092087672460623e-05, + "loss": 0.004, + "step": 154200 + }, + { + "epoch": 7.064775705962479, + "grad_norm": 0.023146772757172585, + "learning_rate": 2.4023771272491125e-05, + "loss": 0.0033, + "step": 154300 + }, + { + "epoch": 7.069354303309181, + "grad_norm": 0.2908150553703308, + "learning_rate": 2.39555212293264e-05, + "loss": 0.0028, + "step": 154400 + }, + { + "epoch": 7.073932900655884, + "grad_norm": 0.15492355823516846, + "learning_rate": 2.38873377173126e-05, + "loss": 0.003, + "step": 154500 + }, + { + "epoch": 7.0785114980025865, + "grad_norm": 0.15679115056991577, + "learning_rate": 2.3819220910625882e-05, + "loss": 0.002, + "step": 154600 + }, + { + "epoch": 7.08309009534929, + "grad_norm": 0.08304117619991302, + "learning_rate": 2.3751170983272e-05, + "loss": 0.0037, + "step": 154700 + }, + { + "epoch": 7.087668692695993, + "grad_norm": 0.19912482798099518, + "learning_rate": 2.368318810908588e-05, + "loss": 0.0038, + "step": 154800 + }, + { + "epoch": 7.092247290042695, + "grad_norm": 0.163644939661026, + "learning_rate": 2.3615272461731186e-05, + "loss": 0.0046, + "step": 154900 + }, + { + "epoch": 7.096825887389398, + "grad_norm": 0.564191460609436, + "learning_rate": 2.3547424214699786e-05, + "loss": 0.0027, + "step": 155000 + }, + { + "epoch": 7.1014044847361015, + "grad_norm": 0.21988272666931152, + "learning_rate": 2.347964354131144e-05, + "loss": 0.0032, + "step": 155100 + }, + { + "epoch": 7.105983082082804, + "grad_norm": 0.028988847509026527, + "learning_rate": 2.3411930614713247e-05, + "loss": 0.0038, + "step": 155200 + }, + { + "epoch": 7.110561679429507, + "grad_norm": 0.23106561601161957, + "learning_rate": 2.3344285607879224e-05, + "loss": 0.0026, + "step": 155300 + }, + { + "epoch": 7.115140276776209, + "grad_norm": 0.08767526596784592, + "learning_rate": 2.3276708693609943e-05, + "loss": 0.0044, + "step": 155400 + }, + { + "epoch": 7.1197188741229125, + "grad_norm": 0.11939793080091476, + "learning_rate": 2.3209200044532027e-05, + "loss": 0.0028, + "step": 155500 + }, + { + "epoch": 7.124297471469616, + "grad_norm": 0.19785170257091522, + "learning_rate": 2.3141759833097653e-05, + "loss": 0.003, + "step": 155600 + }, + { + "epoch": 7.128876068816318, + "grad_norm": 0.889499843120575, + "learning_rate": 2.307438823158425e-05, + "loss": 0.0024, + "step": 155700 + }, + { + "epoch": 7.133454666163021, + "grad_norm": 0.5035886764526367, + "learning_rate": 2.300708541209393e-05, + "loss": 0.0039, + "step": 155800 + }, + { + "epoch": 7.1380332635097234, + "grad_norm": 0.04118403419852257, + "learning_rate": 2.2939851546553094e-05, + "loss": 0.0038, + "step": 155900 + }, + { + "epoch": 7.142611860856427, + "grad_norm": 0.3574579358100891, + "learning_rate": 2.2872686806712035e-05, + "loss": 0.0028, + "step": 156000 + }, + { + "epoch": 7.14719045820313, + "grad_norm": 0.005380525719374418, + "learning_rate": 2.2805591364144447e-05, + "loss": 0.0028, + "step": 156100 + }, + { + "epoch": 7.151769055549832, + "grad_norm": 0.0275371465831995, + "learning_rate": 2.273856539024703e-05, + "loss": 0.0029, + "step": 156200 + }, + { + "epoch": 7.156347652896535, + "grad_norm": 0.05387549847364426, + "learning_rate": 2.2671609056238952e-05, + "loss": 0.0026, + "step": 156300 + }, + { + "epoch": 7.160926250243238, + "grad_norm": 0.25863537192344666, + "learning_rate": 2.2604722533161572e-05, + "loss": 0.0022, + "step": 156400 + }, + { + "epoch": 7.165504847589941, + "grad_norm": 0.03250390663743019, + "learning_rate": 2.2537905991877855e-05, + "loss": 0.0026, + "step": 156500 + }, + { + "epoch": 7.170083444936644, + "grad_norm": 0.15917915105819702, + "learning_rate": 2.2471159603071995e-05, + "loss": 0.0047, + "step": 156600 + }, + { + "epoch": 7.174662042283346, + "grad_norm": 0.25873464345932007, + "learning_rate": 2.2404483537249023e-05, + "loss": 0.0041, + "step": 156700 + }, + { + "epoch": 7.179240639630049, + "grad_norm": 0.03446133807301521, + "learning_rate": 2.233787796473432e-05, + "loss": 0.0027, + "step": 156800 + }, + { + "epoch": 7.183819236976753, + "grad_norm": 0.39116761088371277, + "learning_rate": 2.2271343055673144e-05, + "loss": 0.0027, + "step": 156900 + }, + { + "epoch": 7.188397834323455, + "grad_norm": 0.005667871795594692, + "learning_rate": 2.22048789800303e-05, + "loss": 0.0032, + "step": 157000 + }, + { + "epoch": 7.192976431670158, + "grad_norm": 0.2927045226097107, + "learning_rate": 2.2138485907589613e-05, + "loss": 0.0033, + "step": 157100 + }, + { + "epoch": 7.19755502901686, + "grad_norm": 0.15872865915298462, + "learning_rate": 2.2072164007953517e-05, + "loss": 0.0029, + "step": 157200 + }, + { + "epoch": 7.202133626363564, + "grad_norm": 0.5440332293510437, + "learning_rate": 2.200591345054267e-05, + "loss": 0.0037, + "step": 157300 + }, + { + "epoch": 7.206712223710267, + "grad_norm": 0.2492242008447647, + "learning_rate": 2.193973440459549e-05, + "loss": 0.0029, + "step": 157400 + }, + { + "epoch": 7.211290821056969, + "grad_norm": 0.0664735659956932, + "learning_rate": 2.187362703916766e-05, + "loss": 0.0036, + "step": 157500 + }, + { + "epoch": 7.215869418403672, + "grad_norm": 0.006082352716475725, + "learning_rate": 2.1807591523131827e-05, + "loss": 0.0023, + "step": 157600 + }, + { + "epoch": 7.2204480157503745, + "grad_norm": 0.008978066965937614, + "learning_rate": 2.1741628025177036e-05, + "loss": 0.0031, + "step": 157700 + }, + { + "epoch": 7.225026613097078, + "grad_norm": 0.034269288182258606, + "learning_rate": 2.167573671380837e-05, + "loss": 0.005, + "step": 157800 + }, + { + "epoch": 7.229605210443781, + "grad_norm": 0.2424388974905014, + "learning_rate": 2.1609917757346542e-05, + "loss": 0.0031, + "step": 157900 + }, + { + "epoch": 7.234183807790483, + "grad_norm": 0.07712133228778839, + "learning_rate": 2.1544171323927415e-05, + "loss": 0.003, + "step": 158000 + }, + { + "epoch": 7.238762405137186, + "grad_norm": 0.4302210509777069, + "learning_rate": 2.1478497581501616e-05, + "loss": 0.0034, + "step": 158100 + }, + { + "epoch": 7.243341002483889, + "grad_norm": 0.09475143998861313, + "learning_rate": 2.141289669783401e-05, + "loss": 0.0028, + "step": 158200 + }, + { + "epoch": 7.247919599830592, + "grad_norm": 0.35180267691612244, + "learning_rate": 2.134736884050343e-05, + "loss": 0.0042, + "step": 158300 + }, + { + "epoch": 7.252498197177295, + "grad_norm": 0.013500731438398361, + "learning_rate": 2.1281914176902108e-05, + "loss": 0.0043, + "step": 158400 + }, + { + "epoch": 7.257076794523997, + "grad_norm": 0.23346847295761108, + "learning_rate": 2.1216532874235285e-05, + "loss": 0.0031, + "step": 158500 + }, + { + "epoch": 7.2616553918707005, + "grad_norm": 0.16270193457603455, + "learning_rate": 2.115122509952085e-05, + "loss": 0.004, + "step": 158600 + }, + { + "epoch": 7.266233989217403, + "grad_norm": 0.23846475780010223, + "learning_rate": 2.1085991019588863e-05, + "loss": 0.0027, + "step": 158700 + }, + { + "epoch": 7.270812586564106, + "grad_norm": 0.027605965733528137, + "learning_rate": 2.1020830801081077e-05, + "loss": 0.0026, + "step": 158800 + }, + { + "epoch": 7.275391183910809, + "grad_norm": 0.01757560484111309, + "learning_rate": 2.0955744610450618e-05, + "loss": 0.0036, + "step": 158900 + }, + { + "epoch": 7.2799697812575115, + "grad_norm": 0.02324344404041767, + "learning_rate": 2.0890732613961478e-05, + "loss": 0.0029, + "step": 159000 + }, + { + "epoch": 7.284548378604215, + "grad_norm": 0.01704220287501812, + "learning_rate": 2.0825794977688108e-05, + "loss": 0.0037, + "step": 159100 + }, + { + "epoch": 7.289126975950918, + "grad_norm": 0.08089294284582138, + "learning_rate": 2.0760931867515032e-05, + "loss": 0.0035, + "step": 159200 + }, + { + "epoch": 7.29370557329762, + "grad_norm": 0.15005187690258026, + "learning_rate": 2.0696143449136402e-05, + "loss": 0.0022, + "step": 159300 + }, + { + "epoch": 7.298284170644323, + "grad_norm": 0.0878557413816452, + "learning_rate": 2.063142988805552e-05, + "loss": 0.0035, + "step": 159400 + }, + { + "epoch": 7.302862767991026, + "grad_norm": 0.012229022569954395, + "learning_rate": 2.056679134958453e-05, + "loss": 0.0026, + "step": 159500 + }, + { + "epoch": 7.307441365337729, + "grad_norm": 0.053704481571912766, + "learning_rate": 2.050222799884387e-05, + "loss": 0.0036, + "step": 159600 + }, + { + "epoch": 7.312019962684432, + "grad_norm": 0.5345095992088318, + "learning_rate": 2.0437740000761925e-05, + "loss": 0.0038, + "step": 159700 + }, + { + "epoch": 7.316598560031134, + "grad_norm": 0.09854476153850555, + "learning_rate": 2.037332752007461e-05, + "loss": 0.0031, + "step": 159800 + }, + { + "epoch": 7.3211771573778375, + "grad_norm": 0.04005116969347, + "learning_rate": 2.0308990721324927e-05, + "loss": 0.0027, + "step": 159900 + }, + { + "epoch": 7.32575575472454, + "grad_norm": 1.264863133430481, + "learning_rate": 2.0244729768862518e-05, + "loss": 0.0034, + "step": 160000 + }, + { + "epoch": 7.330334352071243, + "grad_norm": 0.017268653959035873, + "learning_rate": 2.01805448268433e-05, + "loss": 0.0037, + "step": 160100 + }, + { + "epoch": 7.334912949417946, + "grad_norm": 0.10752640664577484, + "learning_rate": 2.0116436059229038e-05, + "loss": 0.0035, + "step": 160200 + }, + { + "epoch": 7.339491546764648, + "grad_norm": 0.43235811591148376, + "learning_rate": 2.0052403629786858e-05, + "loss": 0.0027, + "step": 160300 + }, + { + "epoch": 7.344070144111352, + "grad_norm": 0.014576783403754234, + "learning_rate": 1.9988447702088898e-05, + "loss": 0.0035, + "step": 160400 + }, + { + "epoch": 7.348648741458054, + "grad_norm": 0.1350947916507721, + "learning_rate": 1.9924568439511876e-05, + "loss": 0.0032, + "step": 160500 + }, + { + "epoch": 7.353227338804757, + "grad_norm": 0.24974310398101807, + "learning_rate": 1.98607660052367e-05, + "loss": 0.0038, + "step": 160600 + }, + { + "epoch": 7.35780593615146, + "grad_norm": 0.05233803018927574, + "learning_rate": 1.9797040562247948e-05, + "loss": 0.0041, + "step": 160700 + }, + { + "epoch": 7.362384533498163, + "grad_norm": 0.18822649121284485, + "learning_rate": 1.9733392273333596e-05, + "loss": 0.0037, + "step": 160800 + }, + { + "epoch": 7.366963130844866, + "grad_norm": 0.19756104052066803, + "learning_rate": 1.9669821301084475e-05, + "loss": 0.0027, + "step": 160900 + }, + { + "epoch": 7.371541728191568, + "grad_norm": 0.00448650261387229, + "learning_rate": 1.9606327807893902e-05, + "loss": 0.0032, + "step": 161000 + }, + { + "epoch": 7.376120325538271, + "grad_norm": 0.14489981532096863, + "learning_rate": 1.954291195595733e-05, + "loss": 0.0031, + "step": 161100 + }, + { + "epoch": 7.380698922884974, + "grad_norm": 0.0051267268136143684, + "learning_rate": 1.947957390727185e-05, + "loss": 0.003, + "step": 161200 + }, + { + "epoch": 7.385277520231677, + "grad_norm": 0.38486120104789734, + "learning_rate": 1.941631382363576e-05, + "loss": 0.0035, + "step": 161300 + }, + { + "epoch": 7.38985611757838, + "grad_norm": 0.004985155537724495, + "learning_rate": 1.9353131866648273e-05, + "loss": 0.0024, + "step": 161400 + }, + { + "epoch": 7.394434714925083, + "grad_norm": 0.002783630508929491, + "learning_rate": 1.929002819770896e-05, + "loss": 0.0034, + "step": 161500 + }, + { + "epoch": 7.399013312271785, + "grad_norm": 0.2842748165130615, + "learning_rate": 1.922700297801741e-05, + "loss": 0.0034, + "step": 161600 + }, + { + "epoch": 7.403591909618489, + "grad_norm": 0.050929997116327286, + "learning_rate": 1.9164056368572846e-05, + "loss": 0.003, + "step": 161700 + }, + { + "epoch": 7.408170506965191, + "grad_norm": 0.06748020648956299, + "learning_rate": 1.9101188530173687e-05, + "loss": 0.0032, + "step": 161800 + }, + { + "epoch": 7.412749104311894, + "grad_norm": 0.03134176880121231, + "learning_rate": 1.9038399623417063e-05, + "loss": 0.0023, + "step": 161900 + }, + { + "epoch": 7.417327701658597, + "grad_norm": 0.06679194420576096, + "learning_rate": 1.897568980869855e-05, + "loss": 0.0032, + "step": 162000 + }, + { + "epoch": 7.4219062990052995, + "grad_norm": 0.22911858558654785, + "learning_rate": 1.8913059246211612e-05, + "loss": 0.0033, + "step": 162100 + }, + { + "epoch": 7.426484896352003, + "grad_norm": 0.12825864553451538, + "learning_rate": 1.8850508095947332e-05, + "loss": 0.0029, + "step": 162200 + }, + { + "epoch": 7.431063493698705, + "grad_norm": 0.022259972989559174, + "learning_rate": 1.8788036517693858e-05, + "loss": 0.004, + "step": 162300 + }, + { + "epoch": 7.435642091045408, + "grad_norm": 0.09766406565904617, + "learning_rate": 1.8725644671036126e-05, + "loss": 0.0033, + "step": 162400 + }, + { + "epoch": 7.440220688392111, + "grad_norm": 0.6670352816581726, + "learning_rate": 1.8663332715355396e-05, + "loss": 0.0032, + "step": 162500 + }, + { + "epoch": 7.444799285738814, + "grad_norm": 0.009802890941500664, + "learning_rate": 1.8601100809828787e-05, + "loss": 0.0039, + "step": 162600 + }, + { + "epoch": 7.449377883085517, + "grad_norm": 0.08977996557950974, + "learning_rate": 1.853894911342901e-05, + "loss": 0.0029, + "step": 162700 + }, + { + "epoch": 7.45395648043222, + "grad_norm": 0.713555097579956, + "learning_rate": 1.847687778492382e-05, + "loss": 0.0027, + "step": 162800 + }, + { + "epoch": 7.458535077778922, + "grad_norm": 0.3743430972099304, + "learning_rate": 1.8414886982875664e-05, + "loss": 0.0034, + "step": 162900 + }, + { + "epoch": 7.4631136751256255, + "grad_norm": 0.0767466276884079, + "learning_rate": 1.8352976865641326e-05, + "loss": 0.0032, + "step": 163000 + }, + { + "epoch": 7.467692272472328, + "grad_norm": 0.28391310572624207, + "learning_rate": 1.8291147591371482e-05, + "loss": 0.0035, + "step": 163100 + }, + { + "epoch": 7.472270869819031, + "grad_norm": 0.25534164905548096, + "learning_rate": 1.822939931801024e-05, + "loss": 0.0028, + "step": 163200 + }, + { + "epoch": 7.476849467165734, + "grad_norm": 0.03635001927614212, + "learning_rate": 1.816773220329484e-05, + "loss": 0.0035, + "step": 163300 + }, + { + "epoch": 7.4814280645124365, + "grad_norm": 0.06547212600708008, + "learning_rate": 1.810614640475518e-05, + "loss": 0.004, + "step": 163400 + }, + { + "epoch": 7.48600666185914, + "grad_norm": 0.10231446474790573, + "learning_rate": 1.8044642079713408e-05, + "loss": 0.0026, + "step": 163500 + }, + { + "epoch": 7.490585259205842, + "grad_norm": 0.08887581527233124, + "learning_rate": 1.79832193852836e-05, + "loss": 0.002, + "step": 163600 + }, + { + "epoch": 7.495163856552545, + "grad_norm": 0.01825689524412155, + "learning_rate": 1.792187847837129e-05, + "loss": 0.0032, + "step": 163700 + }, + { + "epoch": 7.499742453899248, + "grad_norm": 0.0413985475897789, + "learning_rate": 1.7860619515673033e-05, + "loss": 0.003, + "step": 163800 + }, + { + "epoch": 7.504321051245951, + "grad_norm": 0.11123603582382202, + "learning_rate": 1.779944265367614e-05, + "loss": 0.0031, + "step": 163900 + }, + { + "epoch": 7.508899648592654, + "grad_norm": 0.11079199612140656, + "learning_rate": 1.7738348048658127e-05, + "loss": 0.0029, + "step": 164000 + }, + { + "epoch": 7.513478245939356, + "grad_norm": 0.026996923610568047, + "learning_rate": 1.767733585668639e-05, + "loss": 0.0028, + "step": 164100 + }, + { + "epoch": 7.518056843286059, + "grad_norm": 0.2861877381801605, + "learning_rate": 1.7616406233617832e-05, + "loss": 0.0033, + "step": 164200 + }, + { + "epoch": 7.5226354406327625, + "grad_norm": 0.013889641501009464, + "learning_rate": 1.7555559335098414e-05, + "loss": 0.0034, + "step": 164300 + }, + { + "epoch": 7.527214037979465, + "grad_norm": 0.5749355554580688, + "learning_rate": 1.749479531656279e-05, + "loss": 0.0034, + "step": 164400 + }, + { + "epoch": 7.531792635326168, + "grad_norm": 0.03499993681907654, + "learning_rate": 1.7434114333233852e-05, + "loss": 0.0029, + "step": 164500 + }, + { + "epoch": 7.53637123267287, + "grad_norm": 0.1424218863248825, + "learning_rate": 1.737351654012244e-05, + "loss": 0.0025, + "step": 164600 + }, + { + "epoch": 7.540949830019573, + "grad_norm": 0.009633993729948997, + "learning_rate": 1.7313002092026837e-05, + "loss": 0.0032, + "step": 164700 + }, + { + "epoch": 7.545528427366277, + "grad_norm": 0.02650436945259571, + "learning_rate": 1.725257114353241e-05, + "loss": 0.0044, + "step": 164800 + }, + { + "epoch": 7.550107024712979, + "grad_norm": 0.0338139683008194, + "learning_rate": 1.7192223849011258e-05, + "loss": 0.0029, + "step": 164900 + }, + { + "epoch": 7.554685622059682, + "grad_norm": 1.0118355751037598, + "learning_rate": 1.7131960362621796e-05, + "loss": 0.0041, + "step": 165000 + }, + { + "epoch": 7.559264219406385, + "grad_norm": 0.014256274327635765, + "learning_rate": 1.7071780838308288e-05, + "loss": 0.0027, + "step": 165100 + }, + { + "epoch": 7.563842816753088, + "grad_norm": 0.05664459615945816, + "learning_rate": 1.7011685429800595e-05, + "loss": 0.0026, + "step": 165200 + }, + { + "epoch": 7.568421414099791, + "grad_norm": 0.14832501113414764, + "learning_rate": 1.695167429061364e-05, + "loss": 0.0027, + "step": 165300 + }, + { + "epoch": 7.573000011446493, + "grad_norm": 0.19807232916355133, + "learning_rate": 1.6891747574047078e-05, + "loss": 0.0026, + "step": 165400 + }, + { + "epoch": 7.577578608793196, + "grad_norm": 0.09145753085613251, + "learning_rate": 1.6831905433184946e-05, + "loss": 0.0032, + "step": 165500 + }, + { + "epoch": 7.582157206139899, + "grad_norm": 0.021602990105748177, + "learning_rate": 1.6772148020895228e-05, + "loss": 0.0022, + "step": 165600 + }, + { + "epoch": 7.586735803486602, + "grad_norm": 0.2839347720146179, + "learning_rate": 1.671247548982941e-05, + "loss": 0.0034, + "step": 165700 + }, + { + "epoch": 7.591314400833305, + "grad_norm": 0.02294602431356907, + "learning_rate": 1.6652887992422235e-05, + "loss": 0.0023, + "step": 165800 + }, + { + "epoch": 7.595892998180007, + "grad_norm": 0.027606772258877754, + "learning_rate": 1.659338568089114e-05, + "loss": 0.0032, + "step": 165900 + }, + { + "epoch": 7.60047159552671, + "grad_norm": 0.01902574673295021, + "learning_rate": 1.653396870723599e-05, + "loss": 0.0036, + "step": 166000 + }, + { + "epoch": 7.6050501928734136, + "grad_norm": 0.06941546499729156, + "learning_rate": 1.6474637223238665e-05, + "loss": 0.0031, + "step": 166100 + }, + { + "epoch": 7.609628790220116, + "grad_norm": 0.06622402369976044, + "learning_rate": 1.641539138046264e-05, + "loss": 0.003, + "step": 166200 + }, + { + "epoch": 7.614207387566819, + "grad_norm": 0.0019321365980431437, + "learning_rate": 1.6356231330252657e-05, + "loss": 0.0031, + "step": 166300 + }, + { + "epoch": 7.618785984913522, + "grad_norm": 0.11348855495452881, + "learning_rate": 1.629715722373423e-05, + "loss": 0.0039, + "step": 166400 + }, + { + "epoch": 7.6233645822602245, + "grad_norm": 0.14493609964847565, + "learning_rate": 1.6238169211813387e-05, + "loss": 0.0019, + "step": 166500 + }, + { + "epoch": 7.627943179606928, + "grad_norm": 0.11578594148159027, + "learning_rate": 1.6179267445176206e-05, + "loss": 0.0031, + "step": 166600 + }, + { + "epoch": 7.63252177695363, + "grad_norm": 0.026161905378103256, + "learning_rate": 1.6120452074288416e-05, + "loss": 0.0031, + "step": 166700 + }, + { + "epoch": 7.637100374300333, + "grad_norm": 0.048572130501270294, + "learning_rate": 1.6061723249395104e-05, + "loss": 0.0027, + "step": 166800 + }, + { + "epoch": 7.6416789716470355, + "grad_norm": 0.08658236265182495, + "learning_rate": 1.600308112052027e-05, + "loss": 0.0048, + "step": 166900 + }, + { + "epoch": 7.646257568993739, + "grad_norm": 0.03995939716696739, + "learning_rate": 1.594452583746638e-05, + "loss": 0.0029, + "step": 167000 + }, + { + "epoch": 7.650836166340442, + "grad_norm": 0.5306475758552551, + "learning_rate": 1.588605754981413e-05, + "loss": 0.0032, + "step": 167100 + }, + { + "epoch": 7.655414763687144, + "grad_norm": 0.008948258124291897, + "learning_rate": 1.582767640692194e-05, + "loss": 0.0024, + "step": 167200 + }, + { + "epoch": 7.659993361033847, + "grad_norm": 0.09350460022687912, + "learning_rate": 1.576938255792561e-05, + "loss": 0.0032, + "step": 167300 + }, + { + "epoch": 7.6645719583805505, + "grad_norm": 0.34027963876724243, + "learning_rate": 1.5711176151737984e-05, + "loss": 0.0029, + "step": 167400 + }, + { + "epoch": 7.669150555727253, + "grad_norm": 0.012650508433580399, + "learning_rate": 1.5653057337048514e-05, + "loss": 0.0031, + "step": 167500 + }, + { + "epoch": 7.673729153073956, + "grad_norm": 0.07974658906459808, + "learning_rate": 1.5595026262322875e-05, + "loss": 0.0023, + "step": 167600 + }, + { + "epoch": 7.678307750420658, + "grad_norm": 0.06705432385206223, + "learning_rate": 1.553708307580265e-05, + "loss": 0.0032, + "step": 167700 + }, + { + "epoch": 7.6828863477673615, + "grad_norm": 0.027641797438263893, + "learning_rate": 1.547922792550488e-05, + "loss": 0.0036, + "step": 167800 + }, + { + "epoch": 7.687464945114065, + "grad_norm": 0.44552162289619446, + "learning_rate": 1.5421460959221707e-05, + "loss": 0.0036, + "step": 167900 + }, + { + "epoch": 7.692043542460767, + "grad_norm": 0.02241067960858345, + "learning_rate": 1.536378232452003e-05, + "loss": 0.0037, + "step": 168000 + }, + { + "epoch": 7.69662213980747, + "grad_norm": 0.2189732789993286, + "learning_rate": 1.5306192168741117e-05, + "loss": 0.0026, + "step": 168100 + }, + { + "epoch": 7.701200737154172, + "grad_norm": 0.046641841530799866, + "learning_rate": 1.5248690639000162e-05, + "loss": 0.0035, + "step": 168200 + }, + { + "epoch": 7.705779334500876, + "grad_norm": 0.02562684379518032, + "learning_rate": 1.5191277882186023e-05, + "loss": 0.003, + "step": 168300 + }, + { + "epoch": 7.710357931847579, + "grad_norm": 0.22241626679897308, + "learning_rate": 1.513395404496072e-05, + "loss": 0.0022, + "step": 168400 + }, + { + "epoch": 7.714936529194281, + "grad_norm": 0.2740160822868347, + "learning_rate": 1.5076719273759198e-05, + "loss": 0.0033, + "step": 168500 + }, + { + "epoch": 7.719515126540984, + "grad_norm": 0.02267398126423359, + "learning_rate": 1.5019573714788809e-05, + "loss": 0.002, + "step": 168600 + }, + { + "epoch": 7.7240937238876874, + "grad_norm": 0.008224272169172764, + "learning_rate": 1.4962517514029067e-05, + "loss": 0.0022, + "step": 168700 + }, + { + "epoch": 7.72867232123439, + "grad_norm": 0.11832094937562943, + "learning_rate": 1.4905550817231206e-05, + "loss": 0.0029, + "step": 168800 + }, + { + "epoch": 7.733250918581093, + "grad_norm": 0.3029548227787018, + "learning_rate": 1.4848673769917787e-05, + "loss": 0.0042, + "step": 168900 + }, + { + "epoch": 7.737829515927795, + "grad_norm": 0.026391340419650078, + "learning_rate": 1.4791886517382413e-05, + "loss": 0.0031, + "step": 169000 + }, + { + "epoch": 7.742408113274498, + "grad_norm": 0.4289281666278839, + "learning_rate": 1.473518920468926e-05, + "loss": 0.0033, + "step": 169100 + }, + { + "epoch": 7.746986710621201, + "grad_norm": 0.1801924854516983, + "learning_rate": 1.4678581976672751e-05, + "loss": 0.0028, + "step": 169200 + }, + { + "epoch": 7.751565307967904, + "grad_norm": 0.06808359920978546, + "learning_rate": 1.4622064977937222e-05, + "loss": 0.0037, + "step": 169300 + }, + { + "epoch": 7.756143905314607, + "grad_norm": 0.5008605122566223, + "learning_rate": 1.4565638352856503e-05, + "loss": 0.0032, + "step": 169400 + }, + { + "epoch": 7.760722502661309, + "grad_norm": 0.13920585811138153, + "learning_rate": 1.4509302245573536e-05, + "loss": 0.0032, + "step": 169500 + }, + { + "epoch": 7.7653011000080125, + "grad_norm": 0.002380757825449109, + "learning_rate": 1.4453056800000076e-05, + "loss": 0.0025, + "step": 169600 + }, + { + "epoch": 7.769879697354716, + "grad_norm": 0.03281938657164574, + "learning_rate": 1.4396902159816245e-05, + "loss": 0.0028, + "step": 169700 + }, + { + "epoch": 7.774458294701418, + "grad_norm": 0.2583022117614746, + "learning_rate": 1.4340838468470197e-05, + "loss": 0.0031, + "step": 169800 + }, + { + "epoch": 7.779036892048121, + "grad_norm": 0.0035414681769907475, + "learning_rate": 1.4284865869177789e-05, + "loss": 0.0031, + "step": 169900 + }, + { + "epoch": 7.783615489394824, + "grad_norm": 0.23097677528858185, + "learning_rate": 1.4228984504922178e-05, + "loss": 0.0034, + "step": 170000 + }, + { + "epoch": 7.788194086741527, + "grad_norm": 0.515470027923584, + "learning_rate": 1.4173194518453414e-05, + "loss": 0.004, + "step": 170100 + }, + { + "epoch": 7.79277268408823, + "grad_norm": 0.03734416887164116, + "learning_rate": 1.4117496052288193e-05, + "loss": 0.0025, + "step": 170200 + }, + { + "epoch": 7.797351281434932, + "grad_norm": 0.270358681678772, + "learning_rate": 1.4061889248709343e-05, + "loss": 0.0017, + "step": 170300 + }, + { + "epoch": 7.801929878781635, + "grad_norm": 0.027283625677227974, + "learning_rate": 1.4006374249765597e-05, + "loss": 0.0028, + "step": 170400 + }, + { + "epoch": 7.806508476128338, + "grad_norm": 0.06574155390262604, + "learning_rate": 1.3950951197271134e-05, + "loss": 0.0031, + "step": 170500 + }, + { + "epoch": 7.811087073475041, + "grad_norm": 0.05151946470141411, + "learning_rate": 1.3895620232805279e-05, + "loss": 0.0017, + "step": 170600 + }, + { + "epoch": 7.815665670821744, + "grad_norm": 0.012561053037643433, + "learning_rate": 1.3840381497712113e-05, + "loss": 0.0025, + "step": 170700 + }, + { + "epoch": 7.820244268168446, + "grad_norm": 0.005159000866115093, + "learning_rate": 1.3785235133100088e-05, + "loss": 0.0034, + "step": 170800 + }, + { + "epoch": 7.8248228655151495, + "grad_norm": 0.04550444707274437, + "learning_rate": 1.3730181279841748e-05, + "loss": 0.0024, + "step": 170900 + }, + { + "epoch": 7.829401462861853, + "grad_norm": 0.05944928154349327, + "learning_rate": 1.3675220078573253e-05, + "loss": 0.0022, + "step": 171000 + }, + { + "epoch": 7.833980060208555, + "grad_norm": 0.31237590312957764, + "learning_rate": 1.3620351669694103e-05, + "loss": 0.0023, + "step": 171100 + }, + { + "epoch": 7.838558657555258, + "grad_norm": 0.0012041196459904313, + "learning_rate": 1.356557619336678e-05, + "loss": 0.0027, + "step": 171200 + }, + { + "epoch": 7.8431372549019605, + "grad_norm": 0.1280195415019989, + "learning_rate": 1.3510893789516372e-05, + "loss": 0.0034, + "step": 171300 + }, + { + "epoch": 7.847715852248664, + "grad_norm": 0.2050485610961914, + "learning_rate": 1.345630459783015e-05, + "loss": 0.0028, + "step": 171400 + }, + { + "epoch": 7.852294449595367, + "grad_norm": 0.15840676426887512, + "learning_rate": 1.340180875775735e-05, + "loss": 0.002, + "step": 171500 + }, + { + "epoch": 7.856873046942069, + "grad_norm": 0.7529467344284058, + "learning_rate": 1.3347406408508695e-05, + "loss": 0.0022, + "step": 171600 + }, + { + "epoch": 7.861451644288772, + "grad_norm": 0.03594828397035599, + "learning_rate": 1.3293097689056078e-05, + "loss": 0.0025, + "step": 171700 + }, + { + "epoch": 7.866030241635475, + "grad_norm": 0.4587234854698181, + "learning_rate": 1.323888273813223e-05, + "loss": 0.0029, + "step": 171800 + }, + { + "epoch": 7.870608838982178, + "grad_norm": 0.05882592126727104, + "learning_rate": 1.3184761694230375e-05, + "loss": 0.0026, + "step": 171900 + }, + { + "epoch": 7.875187436328881, + "grad_norm": 0.07484336197376251, + "learning_rate": 1.3130734695603786e-05, + "loss": 0.0028, + "step": 172000 + }, + { + "epoch": 7.879766033675583, + "grad_norm": 0.008674757555127144, + "learning_rate": 1.3076801880265554e-05, + "loss": 0.0028, + "step": 172100 + }, + { + "epoch": 7.884344631022286, + "grad_norm": 0.41222670674324036, + "learning_rate": 1.3022963385988151e-05, + "loss": 0.0036, + "step": 172200 + }, + { + "epoch": 7.88892322836899, + "grad_norm": 0.10513575375080109, + "learning_rate": 1.296921935030308e-05, + "loss": 0.0029, + "step": 172300 + }, + { + "epoch": 7.893501825715692, + "grad_norm": 0.29091617465019226, + "learning_rate": 1.2915569910500591e-05, + "loss": 0.004, + "step": 172400 + }, + { + "epoch": 7.898080423062395, + "grad_norm": 0.09394501894712448, + "learning_rate": 1.2862015203629274e-05, + "loss": 0.0032, + "step": 172500 + }, + { + "epoch": 7.902659020409097, + "grad_norm": 0.0589442253112793, + "learning_rate": 1.2808555366495728e-05, + "loss": 0.0027, + "step": 172600 + }, + { + "epoch": 7.907237617755801, + "grad_norm": 0.02068307250738144, + "learning_rate": 1.2755190535664168e-05, + "loss": 0.0024, + "step": 172700 + }, + { + "epoch": 7.911816215102503, + "grad_norm": 0.08841919153928757, + "learning_rate": 1.2701920847456166e-05, + "loss": 0.0027, + "step": 172800 + }, + { + "epoch": 7.916394812449206, + "grad_norm": 0.22736288607120514, + "learning_rate": 1.264874643795021e-05, + "loss": 0.0034, + "step": 172900 + }, + { + "epoch": 7.920973409795909, + "grad_norm": 0.16831666231155396, + "learning_rate": 1.2595667442981401e-05, + "loss": 0.0023, + "step": 173000 + }, + { + "epoch": 7.9255520071426115, + "grad_norm": 0.04770100489258766, + "learning_rate": 1.2542683998141119e-05, + "loss": 0.0025, + "step": 173100 + }, + { + "epoch": 7.930130604489315, + "grad_norm": 0.6141162514686584, + "learning_rate": 1.2489796238776675e-05, + "loss": 0.004, + "step": 173200 + }, + { + "epoch": 7.934709201836018, + "grad_norm": 0.7967793345451355, + "learning_rate": 1.243700429999089e-05, + "loss": 0.0027, + "step": 173300 + }, + { + "epoch": 7.93928779918272, + "grad_norm": 0.015516542829573154, + "learning_rate": 1.2384308316641874e-05, + "loss": 0.0017, + "step": 173400 + }, + { + "epoch": 7.943866396529423, + "grad_norm": 0.0020021158270537853, + "learning_rate": 1.233170842334258e-05, + "loss": 0.0029, + "step": 173500 + }, + { + "epoch": 7.948444993876126, + "grad_norm": 0.014905404299497604, + "learning_rate": 1.2279204754460493e-05, + "loss": 0.0026, + "step": 173600 + }, + { + "epoch": 7.953023591222829, + "grad_norm": 0.04339270293712616, + "learning_rate": 1.222679744411731e-05, + "loss": 0.0031, + "step": 173700 + }, + { + "epoch": 7.957602188569532, + "grad_norm": 0.10109388083219528, + "learning_rate": 1.2174486626188586e-05, + "loss": 0.0033, + "step": 173800 + }, + { + "epoch": 7.962180785916234, + "grad_norm": 0.018510516732931137, + "learning_rate": 1.2122272434303344e-05, + "loss": 0.0026, + "step": 173900 + }, + { + "epoch": 7.9667593832629375, + "grad_norm": 0.014604040421545506, + "learning_rate": 1.2070155001843835e-05, + "loss": 0.0024, + "step": 174000 + }, + { + "epoch": 7.97133798060964, + "grad_norm": 0.20794948935508728, + "learning_rate": 1.2018134461945075e-05, + "loss": 0.0033, + "step": 174100 + }, + { + "epoch": 7.975916577956343, + "grad_norm": 0.06476528197526932, + "learning_rate": 1.1966210947494583e-05, + "loss": 0.0024, + "step": 174200 + }, + { + "epoch": 7.980495175303046, + "grad_norm": 0.0063975718803703785, + "learning_rate": 1.1914384591132044e-05, + "loss": 0.0022, + "step": 174300 + }, + { + "epoch": 7.9850737726497485, + "grad_norm": 0.03397635370492935, + "learning_rate": 1.1862655525248945e-05, + "loss": 0.0025, + "step": 174400 + }, + { + "epoch": 7.989652369996452, + "grad_norm": 0.030696725472807884, + "learning_rate": 1.1811023881988248e-05, + "loss": 0.0021, + "step": 174500 + }, + { + "epoch": 7.994230967343155, + "grad_norm": 0.08137042820453644, + "learning_rate": 1.1759489793244022e-05, + "loss": 0.0025, + "step": 174600 + }, + { + "epoch": 7.998809564689857, + "grad_norm": 0.0656815618276596, + "learning_rate": 1.1708053390661128e-05, + "loss": 0.0026, + "step": 174700 + }, + { + "epoch": 8.0, + "eval_loss": 0.17588233947753906, + "eval_runtime": 260.0784, + "eval_samples_per_second": 21.147, + "eval_steps_per_second": 21.147, + "step": 174726 + }, + { + "epoch": 8.00338816203656, + "grad_norm": 0.10640919208526611, + "learning_rate": 1.1656714805634938e-05, + "loss": 0.0018, + "step": 174800 + }, + { + "epoch": 8.007966759383264, + "grad_norm": 0.0020934424828737974, + "learning_rate": 1.1605474169310881e-05, + "loss": 0.002, + "step": 174900 + }, + { + "epoch": 8.012545356729966, + "grad_norm": 0.09055866301059723, + "learning_rate": 1.1554331612584218e-05, + "loss": 0.0017, + "step": 175000 + }, + { + "epoch": 8.017123954076668, + "grad_norm": 0.49149322509765625, + "learning_rate": 1.1503287266099666e-05, + "loss": 0.0025, + "step": 175100 + }, + { + "epoch": 8.021702551423372, + "grad_norm": 0.01625397428870201, + "learning_rate": 1.145234126025102e-05, + "loss": 0.0021, + "step": 175200 + }, + { + "epoch": 8.026281148770074, + "grad_norm": 0.8061564564704895, + "learning_rate": 1.1401493725180912e-05, + "loss": 0.0015, + "step": 175300 + }, + { + "epoch": 8.030859746116777, + "grad_norm": 0.6298221349716187, + "learning_rate": 1.1350744790780388e-05, + "loss": 0.0018, + "step": 175400 + }, + { + "epoch": 8.035438343463479, + "grad_norm": 0.051574669778347015, + "learning_rate": 1.130009458668863e-05, + "loss": 0.0019, + "step": 175500 + }, + { + "epoch": 8.040016940810183, + "grad_norm": 0.034144267439842224, + "learning_rate": 1.1249543242292627e-05, + "loss": 0.0019, + "step": 175600 + }, + { + "epoch": 8.044595538156885, + "grad_norm": 0.05505882203578949, + "learning_rate": 1.119909088672682e-05, + "loss": 0.0019, + "step": 175700 + }, + { + "epoch": 8.049174135503588, + "grad_norm": 0.01235408615320921, + "learning_rate": 1.1148737648872759e-05, + "loss": 0.0019, + "step": 175800 + }, + { + "epoch": 8.053752732850292, + "grad_norm": 0.07047531008720398, + "learning_rate": 1.1098483657358844e-05, + "loss": 0.0017, + "step": 175900 + }, + { + "epoch": 8.058331330196994, + "grad_norm": 0.048473093658685684, + "learning_rate": 1.1048329040559896e-05, + "loss": 0.0019, + "step": 176000 + }, + { + "epoch": 8.062909927543696, + "grad_norm": 0.019425269216299057, + "learning_rate": 1.0998273926596897e-05, + "loss": 0.0015, + "step": 176100 + }, + { + "epoch": 8.0674885248904, + "grad_norm": 0.0072584389708936214, + "learning_rate": 1.094831844333667e-05, + "loss": 0.0024, + "step": 176200 + }, + { + "epoch": 8.072067122237103, + "grad_norm": 0.0020360236521810293, + "learning_rate": 1.0898462718391523e-05, + "loss": 0.0014, + "step": 176300 + }, + { + "epoch": 8.076645719583805, + "grad_norm": 0.5871603488922119, + "learning_rate": 1.0848706879118892e-05, + "loss": 0.0019, + "step": 176400 + }, + { + "epoch": 8.08122431693051, + "grad_norm": 0.13031832873821259, + "learning_rate": 1.0799051052621106e-05, + "loss": 0.0017, + "step": 176500 + }, + { + "epoch": 8.085802914277211, + "grad_norm": 0.008929682895541191, + "learning_rate": 1.074949536574496e-05, + "loss": 0.0016, + "step": 176600 + }, + { + "epoch": 8.090381511623914, + "grad_norm": 0.003812073729932308, + "learning_rate": 1.0700039945081498e-05, + "loss": 0.0017, + "step": 176700 + }, + { + "epoch": 8.094960108970616, + "grad_norm": 0.011707616969943047, + "learning_rate": 1.0650684916965559e-05, + "loss": 0.0016, + "step": 176800 + }, + { + "epoch": 8.09953870631732, + "grad_norm": 0.037662629038095474, + "learning_rate": 1.0601430407475582e-05, + "loss": 0.002, + "step": 176900 + }, + { + "epoch": 8.104117303664022, + "grad_norm": 0.012711996212601662, + "learning_rate": 1.0552276542433237e-05, + "loss": 0.0015, + "step": 177000 + }, + { + "epoch": 8.108695901010725, + "grad_norm": 0.19156889617443085, + "learning_rate": 1.0503223447403032e-05, + "loss": 0.0011, + "step": 177100 + }, + { + "epoch": 8.113274498357429, + "grad_norm": 0.010708093643188477, + "learning_rate": 1.0454271247692137e-05, + "loss": 0.0013, + "step": 177200 + }, + { + "epoch": 8.117853095704131, + "grad_norm": 0.02960583008825779, + "learning_rate": 1.040542006834992e-05, + "loss": 0.0024, + "step": 177300 + }, + { + "epoch": 8.122431693050833, + "grad_norm": 0.1249750480055809, + "learning_rate": 1.0356670034167698e-05, + "loss": 0.0015, + "step": 177400 + }, + { + "epoch": 8.127010290397537, + "grad_norm": 0.0189303457736969, + "learning_rate": 1.0308021269678442e-05, + "loss": 0.0021, + "step": 177500 + }, + { + "epoch": 8.13158888774424, + "grad_norm": 0.004004355985671282, + "learning_rate": 1.025947389915643e-05, + "loss": 0.0025, + "step": 177600 + }, + { + "epoch": 8.136167485090942, + "grad_norm": 0.021703239530324936, + "learning_rate": 1.0211028046616866e-05, + "loss": 0.0012, + "step": 177700 + }, + { + "epoch": 8.140746082437646, + "grad_norm": 0.41312482953071594, + "learning_rate": 1.0162683835815705e-05, + "loss": 0.0013, + "step": 177800 + }, + { + "epoch": 8.145324679784348, + "grad_norm": 0.021725183352828026, + "learning_rate": 1.0114441390249202e-05, + "loss": 0.0012, + "step": 177900 + }, + { + "epoch": 8.14990327713105, + "grad_norm": 0.33544811606407166, + "learning_rate": 1.0066300833153647e-05, + "loss": 0.002, + "step": 178000 + }, + { + "epoch": 8.154481874477753, + "grad_norm": 0.024289660155773163, + "learning_rate": 1.0018262287505086e-05, + "loss": 0.0023, + "step": 178100 + }, + { + "epoch": 8.159060471824457, + "grad_norm": 0.49725693464279175, + "learning_rate": 9.970325876018982e-06, + "loss": 0.002, + "step": 178200 + }, + { + "epoch": 8.16363906917116, + "grad_norm": 0.018485499545931816, + "learning_rate": 9.922491721149845e-06, + "loss": 0.0019, + "step": 178300 + }, + { + "epoch": 8.168217666517862, + "grad_norm": 0.009344914928078651, + "learning_rate": 9.874759945091016e-06, + "loss": 0.0016, + "step": 178400 + }, + { + "epoch": 8.172796263864566, + "grad_norm": 0.019952520728111267, + "learning_rate": 9.82713066977427e-06, + "loss": 0.0012, + "step": 178500 + }, + { + "epoch": 8.177374861211268, + "grad_norm": 0.5553386211395264, + "learning_rate": 9.77960401686958e-06, + "loss": 0.0019, + "step": 178600 + }, + { + "epoch": 8.18195345855797, + "grad_norm": 0.009466302581131458, + "learning_rate": 9.732180107784727e-06, + "loss": 0.0022, + "step": 178700 + }, + { + "epoch": 8.186532055904674, + "grad_norm": 0.5055824518203735, + "learning_rate": 9.684859063665059e-06, + "loss": 0.0017, + "step": 178800 + }, + { + "epoch": 8.191110653251377, + "grad_norm": 0.38719162344932556, + "learning_rate": 9.637641005393167e-06, + "loss": 0.002, + "step": 178900 + }, + { + "epoch": 8.195689250598079, + "grad_norm": 0.0033107008785009384, + "learning_rate": 9.590526053588505e-06, + "loss": 0.0013, + "step": 179000 + }, + { + "epoch": 8.200267847944781, + "grad_norm": 0.015472437255084515, + "learning_rate": 9.543514328607212e-06, + "loss": 0.0019, + "step": 179100 + }, + { + "epoch": 8.204846445291485, + "grad_norm": 0.004773670807480812, + "learning_rate": 9.496605950541676e-06, + "loss": 0.002, + "step": 179200 + }, + { + "epoch": 8.209425042638188, + "grad_norm": 0.0060659064911305904, + "learning_rate": 9.44980103922029e-06, + "loss": 0.0018, + "step": 179300 + }, + { + "epoch": 8.21400363998489, + "grad_norm": 0.004397235810756683, + "learning_rate": 9.403099714207175e-06, + "loss": 0.0017, + "step": 179400 + }, + { + "epoch": 8.218582237331594, + "grad_norm": 0.004803112708032131, + "learning_rate": 9.356502094801816e-06, + "loss": 0.0015, + "step": 179500 + }, + { + "epoch": 8.223160834678296, + "grad_norm": 0.0035059794317930937, + "learning_rate": 9.310008300038758e-06, + "loss": 0.0018, + "step": 179600 + }, + { + "epoch": 8.227739432024999, + "grad_norm": 0.025477442890405655, + "learning_rate": 9.263618448687377e-06, + "loss": 0.002, + "step": 179700 + }, + { + "epoch": 8.232318029371703, + "grad_norm": 0.3329303562641144, + "learning_rate": 9.217332659251477e-06, + "loss": 0.0018, + "step": 179800 + }, + { + "epoch": 8.236896626718405, + "grad_norm": 0.2675701379776001, + "learning_rate": 9.171151049969029e-06, + "loss": 0.0012, + "step": 179900 + }, + { + "epoch": 8.241475224065107, + "grad_norm": 1.2457773685455322, + "learning_rate": 9.125073738811918e-06, + "loss": 0.0019, + "step": 180000 + }, + { + "epoch": 8.246053821411811, + "grad_norm": 0.1400783210992813, + "learning_rate": 9.079100843485578e-06, + "loss": 0.0021, + "step": 180100 + }, + { + "epoch": 8.250632418758514, + "grad_norm": 0.025368591770529747, + "learning_rate": 9.033232481428678e-06, + "loss": 0.0018, + "step": 180200 + }, + { + "epoch": 8.255211016105216, + "grad_norm": 0.0014903460396453738, + "learning_rate": 8.987468769812912e-06, + "loss": 0.0014, + "step": 180300 + }, + { + "epoch": 8.259789613451918, + "grad_norm": 0.22623829543590546, + "learning_rate": 8.941809825542596e-06, + "loss": 0.0025, + "step": 180400 + }, + { + "epoch": 8.264368210798622, + "grad_norm": 0.017613211646676064, + "learning_rate": 8.896255765254424e-06, + "loss": 0.0012, + "step": 180500 + }, + { + "epoch": 8.268946808145325, + "grad_norm": 0.005598566494882107, + "learning_rate": 8.850806705317183e-06, + "loss": 0.001, + "step": 180600 + }, + { + "epoch": 8.273525405492027, + "grad_norm": 0.17524650692939758, + "learning_rate": 8.805462761831418e-06, + "loss": 0.001, + "step": 180700 + }, + { + "epoch": 8.278104002838731, + "grad_norm": 0.03338591754436493, + "learning_rate": 8.760224050629162e-06, + "loss": 0.0014, + "step": 180800 + }, + { + "epoch": 8.282682600185433, + "grad_norm": 0.017168212682008743, + "learning_rate": 8.715090687273614e-06, + "loss": 0.001, + "step": 180900 + }, + { + "epoch": 8.287261197532136, + "grad_norm": 0.09427805244922638, + "learning_rate": 8.67006278705888e-06, + "loss": 0.0013, + "step": 181000 + }, + { + "epoch": 8.29183979487884, + "grad_norm": 0.0094602657482028, + "learning_rate": 8.625140465009635e-06, + "loss": 0.0013, + "step": 181100 + }, + { + "epoch": 8.296418392225542, + "grad_norm": 0.06793930381536484, + "learning_rate": 8.58032383588086e-06, + "loss": 0.0018, + "step": 181200 + }, + { + "epoch": 8.300996989572244, + "grad_norm": 0.08039774000644684, + "learning_rate": 8.535613014157557e-06, + "loss": 0.0019, + "step": 181300 + }, + { + "epoch": 8.305575586918948, + "grad_norm": 0.03726482763886452, + "learning_rate": 8.491008114054439e-06, + "loss": 0.0021, + "step": 181400 + }, + { + "epoch": 8.31015418426565, + "grad_norm": 0.10031867027282715, + "learning_rate": 8.446509249515605e-06, + "loss": 0.0021, + "step": 181500 + }, + { + "epoch": 8.314732781612353, + "grad_norm": 0.38206222653388977, + "learning_rate": 8.402116534214338e-06, + "loss": 0.0021, + "step": 181600 + }, + { + "epoch": 8.319311378959055, + "grad_norm": 0.05652381107211113, + "learning_rate": 8.35783008155272e-06, + "loss": 0.0009, + "step": 181700 + }, + { + "epoch": 8.32388997630576, + "grad_norm": 0.0731114000082016, + "learning_rate": 8.313650004661383e-06, + "loss": 0.0016, + "step": 181800 + }, + { + "epoch": 8.328468573652462, + "grad_norm": 0.43218135833740234, + "learning_rate": 8.26957641639924e-06, + "loss": 0.0024, + "step": 181900 + }, + { + "epoch": 8.333047170999164, + "grad_norm": 0.08536510914564133, + "learning_rate": 8.225609429353187e-06, + "loss": 0.0021, + "step": 182000 + }, + { + "epoch": 8.337625768345868, + "grad_norm": 0.011019705794751644, + "learning_rate": 8.181749155837754e-06, + "loss": 0.0016, + "step": 182100 + }, + { + "epoch": 8.34220436569257, + "grad_norm": 0.040587395429611206, + "learning_rate": 8.137995707894942e-06, + "loss": 0.0018, + "step": 182200 + }, + { + "epoch": 8.346782963039272, + "grad_norm": 0.0023947455920279026, + "learning_rate": 8.094349197293793e-06, + "loss": 0.0015, + "step": 182300 + }, + { + "epoch": 8.351361560385977, + "grad_norm": 0.007556082680821419, + "learning_rate": 8.050809735530207e-06, + "loss": 0.0016, + "step": 182400 + }, + { + "epoch": 8.355940157732679, + "grad_norm": 0.11117005348205566, + "learning_rate": 8.007377433826634e-06, + "loss": 0.0016, + "step": 182500 + }, + { + "epoch": 8.360518755079381, + "grad_norm": 0.0016330329235643148, + "learning_rate": 7.964052403131773e-06, + "loss": 0.0013, + "step": 182600 + }, + { + "epoch": 8.365097352426083, + "grad_norm": 0.4123118221759796, + "learning_rate": 7.920834754120304e-06, + "loss": 0.0021, + "step": 182700 + }, + { + "epoch": 8.369675949772788, + "grad_norm": 0.014765871688723564, + "learning_rate": 7.877724597192582e-06, + "loss": 0.0022, + "step": 182800 + }, + { + "epoch": 8.37425454711949, + "grad_norm": 0.004433237481862307, + "learning_rate": 7.834722042474374e-06, + "loss": 0.0012, + "step": 182900 + }, + { + "epoch": 8.378833144466192, + "grad_norm": 0.0037168385460972786, + "learning_rate": 7.791827199816593e-06, + "loss": 0.0016, + "step": 183000 + }, + { + "epoch": 8.383411741812896, + "grad_norm": 0.04149395972490311, + "learning_rate": 7.74904017879497e-06, + "loss": 0.0029, + "step": 183100 + }, + { + "epoch": 8.387990339159598, + "grad_norm": 0.011970234103500843, + "learning_rate": 7.70636108870983e-06, + "loss": 0.0022, + "step": 183200 + }, + { + "epoch": 8.3925689365063, + "grad_norm": 0.049423061311244965, + "learning_rate": 7.663790038585793e-06, + "loss": 0.0021, + "step": 183300 + }, + { + "epoch": 8.397147533853005, + "grad_norm": 0.029166920110583305, + "learning_rate": 7.621327137171447e-06, + "loss": 0.0015, + "step": 183400 + }, + { + "epoch": 8.401726131199707, + "grad_norm": 0.029471127316355705, + "learning_rate": 7.5789724929391625e-06, + "loss": 0.0019, + "step": 183500 + }, + { + "epoch": 8.40630472854641, + "grad_norm": 0.039268478751182556, + "learning_rate": 7.536726214084722e-06, + "loss": 0.0019, + "step": 183600 + }, + { + "epoch": 8.410883325893113, + "grad_norm": 0.4737110137939453, + "learning_rate": 7.494588408527103e-06, + "loss": 0.0018, + "step": 183700 + }, + { + "epoch": 8.415461923239816, + "grad_norm": 0.03173527121543884, + "learning_rate": 7.4525591839081865e-06, + "loss": 0.0019, + "step": 183800 + }, + { + "epoch": 8.420040520586518, + "grad_norm": 0.013487137854099274, + "learning_rate": 7.4106386475925046e-06, + "loss": 0.0013, + "step": 183900 + }, + { + "epoch": 8.42461911793322, + "grad_norm": 0.0010746048064902425, + "learning_rate": 7.368826906666887e-06, + "loss": 0.0019, + "step": 184000 + }, + { + "epoch": 8.429197715279924, + "grad_norm": 0.01748150959610939, + "learning_rate": 7.327124067940311e-06, + "loss": 0.0025, + "step": 184100 + }, + { + "epoch": 8.433776312626627, + "grad_norm": 0.0159548781812191, + "learning_rate": 7.285530237943505e-06, + "loss": 0.0022, + "step": 184200 + }, + { + "epoch": 8.438354909973329, + "grad_norm": 0.0013540086802095175, + "learning_rate": 7.24404552292875e-06, + "loss": 0.0012, + "step": 184300 + }, + { + "epoch": 8.442933507320033, + "grad_norm": 0.12859028577804565, + "learning_rate": 7.202670028869601e-06, + "loss": 0.002, + "step": 184400 + }, + { + "epoch": 8.447512104666735, + "grad_norm": 0.006918368861079216, + "learning_rate": 7.161403861460614e-06, + "loss": 0.0014, + "step": 184500 + }, + { + "epoch": 8.452090702013438, + "grad_norm": 0.014983629807829857, + "learning_rate": 7.1202471261170245e-06, + "loss": 0.0016, + "step": 184600 + }, + { + "epoch": 8.456669299360142, + "grad_norm": 0.03064214624464512, + "learning_rate": 7.079199927974584e-06, + "loss": 0.0021, + "step": 184700 + }, + { + "epoch": 8.461247896706844, + "grad_norm": 0.030842667445540428, + "learning_rate": 7.038262371889159e-06, + "loss": 0.0012, + "step": 184800 + }, + { + "epoch": 8.465826494053546, + "grad_norm": 0.0024680488277226686, + "learning_rate": 6.997434562436606e-06, + "loss": 0.002, + "step": 184900 + }, + { + "epoch": 8.470405091400249, + "grad_norm": 0.06068078801035881, + "learning_rate": 6.956716603912361e-06, + "loss": 0.0021, + "step": 185000 + }, + { + "epoch": 8.474983688746953, + "grad_norm": 0.5528777241706848, + "learning_rate": 6.9161086003312945e-06, + "loss": 0.0015, + "step": 185100 + }, + { + "epoch": 8.479562286093655, + "grad_norm": 0.05493824928998947, + "learning_rate": 6.875610655427389e-06, + "loss": 0.0017, + "step": 185200 + }, + { + "epoch": 8.484140883440357, + "grad_norm": 0.07727139443159103, + "learning_rate": 6.83522287265344e-06, + "loss": 0.0017, + "step": 185300 + }, + { + "epoch": 8.488719480787061, + "grad_norm": 0.5921161770820618, + "learning_rate": 6.794945355180893e-06, + "loss": 0.0019, + "step": 185400 + }, + { + "epoch": 8.493298078133764, + "grad_norm": 0.1638752669095993, + "learning_rate": 6.754778205899465e-06, + "loss": 0.0011, + "step": 185500 + }, + { + "epoch": 8.497876675480466, + "grad_norm": 0.0014271615073084831, + "learning_rate": 6.714721527416956e-06, + "loss": 0.0017, + "step": 185600 + }, + { + "epoch": 8.50245527282717, + "grad_norm": 0.010398001410067081, + "learning_rate": 6.674775422058965e-06, + "loss": 0.0024, + "step": 185700 + }, + { + "epoch": 8.507033870173872, + "grad_norm": 0.019598359242081642, + "learning_rate": 6.63493999186865e-06, + "loss": 0.0024, + "step": 185800 + }, + { + "epoch": 8.511612467520575, + "grad_norm": 0.00348674226552248, + "learning_rate": 6.595215338606397e-06, + "loss": 0.0012, + "step": 185900 + }, + { + "epoch": 8.516191064867279, + "grad_norm": 0.0019242248963564634, + "learning_rate": 6.555601563749675e-06, + "loss": 0.0012, + "step": 186000 + }, + { + "epoch": 8.520769662213981, + "grad_norm": 0.008147502318024635, + "learning_rate": 6.516098768492662e-06, + "loss": 0.0015, + "step": 186100 + }, + { + "epoch": 8.525348259560683, + "grad_norm": 0.04510408639907837, + "learning_rate": 6.47670705374604e-06, + "loss": 0.001, + "step": 186200 + }, + { + "epoch": 8.529926856907386, + "grad_norm": 0.011768829077482224, + "learning_rate": 6.437426520136758e-06, + "loss": 0.0019, + "step": 186300 + }, + { + "epoch": 8.53450545425409, + "grad_norm": 0.049900226294994354, + "learning_rate": 6.398257268007746e-06, + "loss": 0.001, + "step": 186400 + }, + { + "epoch": 8.539084051600792, + "grad_norm": 0.030545897781848907, + "learning_rate": 6.359199397417637e-06, + "loss": 0.0019, + "step": 186500 + }, + { + "epoch": 8.543662648947494, + "grad_norm": 0.007900966331362724, + "learning_rate": 6.320253008140575e-06, + "loss": 0.0018, + "step": 186600 + }, + { + "epoch": 8.548241246294198, + "grad_norm": 0.002196391811594367, + "learning_rate": 6.281418199665884e-06, + "loss": 0.002, + "step": 186700 + }, + { + "epoch": 8.5528198436409, + "grad_norm": 0.08688097447156906, + "learning_rate": 6.242695071197896e-06, + "loss": 0.0014, + "step": 186800 + }, + { + "epoch": 8.557398440987603, + "grad_norm": 0.06626435369253159, + "learning_rate": 6.204083721655607e-06, + "loss": 0.0017, + "step": 186900 + }, + { + "epoch": 8.561977038334307, + "grad_norm": 0.08788962662220001, + "learning_rate": 6.165584249672507e-06, + "loss": 0.0016, + "step": 187000 + }, + { + "epoch": 8.56655563568101, + "grad_norm": 0.021715328097343445, + "learning_rate": 6.127196753596287e-06, + "loss": 0.0017, + "step": 187100 + }, + { + "epoch": 8.571134233027712, + "grad_norm": 0.0046193236485123634, + "learning_rate": 6.088921331488568e-06, + "loss": 0.001, + "step": 187200 + }, + { + "epoch": 8.575712830374414, + "grad_norm": 0.03280609846115112, + "learning_rate": 6.050758081124719e-06, + "loss": 0.0021, + "step": 187300 + }, + { + "epoch": 8.580291427721118, + "grad_norm": 0.01722414791584015, + "learning_rate": 6.012707099993525e-06, + "loss": 0.0015, + "step": 187400 + }, + { + "epoch": 8.58487002506782, + "grad_norm": 0.003511949675157666, + "learning_rate": 5.974768485296977e-06, + "loss": 0.0019, + "step": 187500 + }, + { + "epoch": 8.589448622414523, + "grad_norm": 0.1540764719247818, + "learning_rate": 5.936942333950063e-06, + "loss": 0.0022, + "step": 187600 + }, + { + "epoch": 8.594027219761227, + "grad_norm": 0.013510748744010925, + "learning_rate": 5.8992287425804485e-06, + "loss": 0.0012, + "step": 187700 + }, + { + "epoch": 8.598605817107929, + "grad_norm": 0.07007778435945511, + "learning_rate": 5.861627807528264e-06, + "loss": 0.001, + "step": 187800 + }, + { + "epoch": 8.603184414454631, + "grad_norm": 0.010667093098163605, + "learning_rate": 5.82413962484587e-06, + "loss": 0.0015, + "step": 187900 + }, + { + "epoch": 8.607763011801335, + "grad_norm": 0.024145985022187233, + "learning_rate": 5.7867642902975975e-06, + "loss": 0.0025, + "step": 188000 + }, + { + "epoch": 8.612341609148038, + "grad_norm": 0.12270953506231308, + "learning_rate": 5.749501899359477e-06, + "loss": 0.0019, + "step": 188100 + }, + { + "epoch": 8.61692020649474, + "grad_norm": 0.36917853355407715, + "learning_rate": 5.712352547219058e-06, + "loss": 0.0018, + "step": 188200 + }, + { + "epoch": 8.621498803841444, + "grad_norm": 0.8480884432792664, + "learning_rate": 5.675316328775126e-06, + "loss": 0.0023, + "step": 188300 + }, + { + "epoch": 8.626077401188146, + "grad_norm": 0.009566806256771088, + "learning_rate": 5.638393338637432e-06, + "loss": 0.0018, + "step": 188400 + }, + { + "epoch": 8.630655998534849, + "grad_norm": 0.15766866505146027, + "learning_rate": 5.601583671126531e-06, + "loss": 0.0015, + "step": 188500 + }, + { + "epoch": 8.635234595881553, + "grad_norm": 0.003668803023174405, + "learning_rate": 5.5648874202734565e-06, + "loss": 0.0014, + "step": 188600 + }, + { + "epoch": 8.639813193228255, + "grad_norm": 0.01124663557857275, + "learning_rate": 5.528304679819513e-06, + "loss": 0.0012, + "step": 188700 + }, + { + "epoch": 8.644391790574957, + "grad_norm": 0.06179165840148926, + "learning_rate": 5.4918355432160726e-06, + "loss": 0.0013, + "step": 188800 + }, + { + "epoch": 8.64897038792166, + "grad_norm": 0.1397646963596344, + "learning_rate": 5.455480103624283e-06, + "loss": 0.0018, + "step": 188900 + }, + { + "epoch": 8.653548985268364, + "grad_norm": 0.0030619765166193247, + "learning_rate": 5.41923845391486e-06, + "loss": 0.002, + "step": 189000 + }, + { + "epoch": 8.658127582615066, + "grad_norm": 0.015350698493421078, + "learning_rate": 5.383110686667831e-06, + "loss": 0.0018, + "step": 189100 + }, + { + "epoch": 8.662706179961768, + "grad_norm": 0.6472819447517395, + "learning_rate": 5.347096894172304e-06, + "loss": 0.0014, + "step": 189200 + }, + { + "epoch": 8.667284777308472, + "grad_norm": 1.0994236469268799, + "learning_rate": 5.3111971684262574e-06, + "loss": 0.0017, + "step": 189300 + }, + { + "epoch": 8.671863374655175, + "grad_norm": 0.010606258176267147, + "learning_rate": 5.275411601136254e-06, + "loss": 0.0016, + "step": 189400 + }, + { + "epoch": 8.676441972001877, + "grad_norm": 0.0032367429230362177, + "learning_rate": 5.239740283717265e-06, + "loss": 0.002, + "step": 189500 + }, + { + "epoch": 8.68102056934858, + "grad_norm": 0.0026265005581080914, + "learning_rate": 5.20418330729241e-06, + "loss": 0.001, + "step": 189600 + }, + { + "epoch": 8.685599166695283, + "grad_norm": 0.05965089425444603, + "learning_rate": 5.168740762692681e-06, + "loss": 0.0016, + "step": 189700 + }, + { + "epoch": 8.690177764041985, + "grad_norm": 0.040079813450574875, + "learning_rate": 5.133412740456806e-06, + "loss": 0.0022, + "step": 189800 + }, + { + "epoch": 8.694756361388688, + "grad_norm": 0.06221432238817215, + "learning_rate": 5.098199330830922e-06, + "loss": 0.002, + "step": 189900 + }, + { + "epoch": 8.699334958735392, + "grad_norm": 0.013662228360772133, + "learning_rate": 5.063100623768391e-06, + "loss": 0.0013, + "step": 190000 + }, + { + "epoch": 8.703913556082094, + "grad_norm": 0.00042499735718593, + "learning_rate": 5.028116708929587e-06, + "loss": 0.0017, + "step": 190100 + }, + { + "epoch": 8.708492153428796, + "grad_norm": 0.6862035989761353, + "learning_rate": 4.993247675681639e-06, + "loss": 0.0019, + "step": 190200 + }, + { + "epoch": 8.7130707507755, + "grad_norm": 0.10454216599464417, + "learning_rate": 4.958493613098186e-06, + "loss": 0.0017, + "step": 190300 + }, + { + "epoch": 8.717649348122203, + "grad_norm": 0.26306042075157166, + "learning_rate": 4.9238546099592e-06, + "loss": 0.0013, + "step": 190400 + }, + { + "epoch": 8.722227945468905, + "grad_norm": 0.026483699679374695, + "learning_rate": 4.8893307547507205e-06, + "loss": 0.0016, + "step": 190500 + }, + { + "epoch": 8.72680654281561, + "grad_norm": 0.033151958137750626, + "learning_rate": 4.854922135664619e-06, + "loss": 0.002, + "step": 190600 + }, + { + "epoch": 8.731385140162311, + "grad_norm": 0.03364422544836998, + "learning_rate": 4.820628840598423e-06, + "loss": 0.0018, + "step": 190700 + }, + { + "epoch": 8.735963737509014, + "grad_norm": 0.004185411147773266, + "learning_rate": 4.786450957155064e-06, + "loss": 0.0021, + "step": 190800 + }, + { + "epoch": 8.740542334855718, + "grad_norm": 0.007191179320216179, + "learning_rate": 4.7523885726426355e-06, + "loss": 0.0017, + "step": 190900 + }, + { + "epoch": 8.74512093220242, + "grad_norm": 0.0175678301602602, + "learning_rate": 4.71844177407419e-06, + "loss": 0.002, + "step": 191000 + }, + { + "epoch": 8.749699529549122, + "grad_norm": 0.20129109919071198, + "learning_rate": 4.684610648167503e-06, + "loss": 0.0017, + "step": 191100 + }, + { + "epoch": 8.754278126895825, + "grad_norm": 0.22425204515457153, + "learning_rate": 4.6508952813448965e-06, + "loss": 0.0015, + "step": 191200 + }, + { + "epoch": 8.758856724242529, + "grad_norm": 0.011632180772721767, + "learning_rate": 4.617295759732937e-06, + "loss": 0.0019, + "step": 191300 + }, + { + "epoch": 8.763435321589231, + "grad_norm": 0.00452096201479435, + "learning_rate": 4.5838121691623e-06, + "loss": 0.0012, + "step": 191400 + }, + { + "epoch": 8.768013918935933, + "grad_norm": 0.004986160434782505, + "learning_rate": 4.550444595167502e-06, + "loss": 0.0014, + "step": 191500 + }, + { + "epoch": 8.772592516282637, + "grad_norm": 0.0067661721259355545, + "learning_rate": 4.517193122986679e-06, + "loss": 0.0013, + "step": 191600 + }, + { + "epoch": 8.77717111362934, + "grad_norm": 0.06658513098955154, + "learning_rate": 4.484057837561406e-06, + "loss": 0.003, + "step": 191700 + }, + { + "epoch": 8.781749710976042, + "grad_norm": 0.003857834730297327, + "learning_rate": 4.4510388235364405e-06, + "loss": 0.0015, + "step": 191800 + }, + { + "epoch": 8.786328308322746, + "grad_norm": 0.00674202898517251, + "learning_rate": 4.418136165259512e-06, + "loss": 0.001, + "step": 191900 + }, + { + "epoch": 8.790906905669448, + "grad_norm": 0.004305652808398008, + "learning_rate": 4.385349946781136e-06, + "loss": 0.0008, + "step": 192000 + }, + { + "epoch": 8.79548550301615, + "grad_norm": 0.011842915788292885, + "learning_rate": 4.352680251854391e-06, + "loss": 0.0015, + "step": 192100 + }, + { + "epoch": 8.800064100362853, + "grad_norm": 0.030167168006300926, + "learning_rate": 4.320127163934657e-06, + "loss": 0.0015, + "step": 192200 + }, + { + "epoch": 8.804642697709557, + "grad_norm": 0.006344472989439964, + "learning_rate": 4.2876907661794755e-06, + "loss": 0.0016, + "step": 192300 + }, + { + "epoch": 8.80922129505626, + "grad_norm": 0.8136438131332397, + "learning_rate": 4.255371141448272e-06, + "loss": 0.0015, + "step": 192400 + }, + { + "epoch": 8.813799892402962, + "grad_norm": 0.03604700043797493, + "learning_rate": 4.223168372302189e-06, + "loss": 0.0019, + "step": 192500 + }, + { + "epoch": 8.818378489749666, + "grad_norm": 0.023059792816638947, + "learning_rate": 4.191082541003849e-06, + "loss": 0.0009, + "step": 192600 + }, + { + "epoch": 8.822957087096368, + "grad_norm": 0.04644302278757095, + "learning_rate": 4.159113729517184e-06, + "loss": 0.0023, + "step": 192700 + }, + { + "epoch": 8.82753568444307, + "grad_norm": 0.038498032838106155, + "learning_rate": 4.127262019507145e-06, + "loss": 0.0017, + "step": 192800 + }, + { + "epoch": 8.832114281789774, + "grad_norm": 0.010661243461072445, + "learning_rate": 4.095527492339596e-06, + "loss": 0.0017, + "step": 192900 + }, + { + "epoch": 8.836692879136477, + "grad_norm": 0.03207453712821007, + "learning_rate": 4.0639102290810135e-06, + "loss": 0.0024, + "step": 193000 + }, + { + "epoch": 8.841271476483179, + "grad_norm": 0.1272786557674408, + "learning_rate": 4.032410310498358e-06, + "loss": 0.0015, + "step": 193100 + }, + { + "epoch": 8.845850073829883, + "grad_norm": 0.004953332711011171, + "learning_rate": 4.001027817058789e-06, + "loss": 0.0015, + "step": 193200 + }, + { + "epoch": 8.850428671176585, + "grad_norm": 0.08756324648857117, + "learning_rate": 3.969762828929547e-06, + "loss": 0.0006, + "step": 193300 + }, + { + "epoch": 8.855007268523288, + "grad_norm": 0.5247501134872437, + "learning_rate": 3.938615425977676e-06, + "loss": 0.0018, + "step": 193400 + }, + { + "epoch": 8.85958586586999, + "grad_norm": 0.0369555726647377, + "learning_rate": 3.907585687769838e-06, + "loss": 0.0012, + "step": 193500 + }, + { + "epoch": 8.864164463216694, + "grad_norm": 0.13189056515693665, + "learning_rate": 3.876673693572147e-06, + "loss": 0.0009, + "step": 193600 + }, + { + "epoch": 8.868743060563396, + "grad_norm": 0.022357501089572906, + "learning_rate": 3.84587952234991e-06, + "loss": 0.001, + "step": 193700 + }, + { + "epoch": 8.873321657910099, + "grad_norm": 0.0008908796007744968, + "learning_rate": 3.815203252767463e-06, + "loss": 0.001, + "step": 193800 + }, + { + "epoch": 8.877900255256803, + "grad_norm": 0.003647018224000931, + "learning_rate": 3.7846449631879667e-06, + "loss": 0.0017, + "step": 193900 + }, + { + "epoch": 8.882478852603505, + "grad_norm": 0.427442729473114, + "learning_rate": 3.754204731673194e-06, + "loss": 0.0018, + "step": 194000 + }, + { + "epoch": 8.887057449950207, + "grad_norm": 0.07006958872079849, + "learning_rate": 3.723882635983328e-06, + "loss": 0.0018, + "step": 194100 + }, + { + "epoch": 8.891636047296911, + "grad_norm": 0.18033552169799805, + "learning_rate": 3.6936787535767903e-06, + "loss": 0.002, + "step": 194200 + }, + { + "epoch": 8.896214644643614, + "grad_norm": 0.6087344288825989, + "learning_rate": 3.6635931616100073e-06, + "loss": 0.0016, + "step": 194300 + }, + { + "epoch": 8.900793241990316, + "grad_norm": 0.14380215108394623, + "learning_rate": 3.6336259369372296e-06, + "loss": 0.0019, + "step": 194400 + }, + { + "epoch": 8.905371839337018, + "grad_norm": 0.09099259227514267, + "learning_rate": 3.6037771561103496e-06, + "loss": 0.0007, + "step": 194500 + }, + { + "epoch": 8.909950436683722, + "grad_norm": 0.3938591480255127, + "learning_rate": 3.5740468953786855e-06, + "loss": 0.002, + "step": 194600 + }, + { + "epoch": 8.914529034030425, + "grad_norm": 0.024904364719986916, + "learning_rate": 3.544435230688792e-06, + "loss": 0.0007, + "step": 194700 + }, + { + "epoch": 8.919107631377127, + "grad_norm": 0.0034130678977817297, + "learning_rate": 3.514942237684271e-06, + "loss": 0.0015, + "step": 194800 + }, + { + "epoch": 8.923686228723831, + "grad_norm": 0.007133205886930227, + "learning_rate": 3.485567991705563e-06, + "loss": 0.0012, + "step": 194900 + }, + { + "epoch": 8.928264826070533, + "grad_norm": 0.0010082671651616693, + "learning_rate": 3.4563125677897932e-06, + "loss": 0.0016, + "step": 195000 + }, + { + "epoch": 8.932843423417236, + "grad_norm": 0.011788592673838139, + "learning_rate": 3.427176040670521e-06, + "loss": 0.0023, + "step": 195100 + }, + { + "epoch": 8.93742202076394, + "grad_norm": 0.11746617406606674, + "learning_rate": 3.3981584847776026e-06, + "loss": 0.0014, + "step": 195200 + }, + { + "epoch": 8.942000618110642, + "grad_norm": 0.06418687850236893, + "learning_rate": 3.369259974236988e-06, + "loss": 0.0018, + "step": 195300 + }, + { + "epoch": 8.946579215457344, + "grad_norm": 0.012977411039173603, + "learning_rate": 3.340480582870503e-06, + "loss": 0.0014, + "step": 195400 + }, + { + "epoch": 8.951157812804048, + "grad_norm": 0.0007548317080363631, + "learning_rate": 3.311820384195674e-06, + "loss": 0.0013, + "step": 195500 + }, + { + "epoch": 8.95573641015075, + "grad_norm": 0.004369072150439024, + "learning_rate": 3.2832794514255803e-06, + "loss": 0.0011, + "step": 195600 + }, + { + "epoch": 8.960315007497453, + "grad_norm": 0.14467285573482513, + "learning_rate": 3.2548578574686018e-06, + "loss": 0.0016, + "step": 195700 + }, + { + "epoch": 8.964893604844155, + "grad_norm": 0.018095914274454117, + "learning_rate": 3.2265556749282834e-06, + "loss": 0.0013, + "step": 195800 + }, + { + "epoch": 8.96947220219086, + "grad_norm": 0.031725652515888214, + "learning_rate": 3.198372976103137e-06, + "loss": 0.0013, + "step": 195900 + }, + { + "epoch": 8.974050799537562, + "grad_norm": 0.12283707410097122, + "learning_rate": 3.1703098329864233e-06, + "loss": 0.0019, + "step": 196000 + }, + { + "epoch": 8.978629396884264, + "grad_norm": 0.0016571872401982546, + "learning_rate": 3.1423663172660267e-06, + "loss": 0.002, + "step": 196100 + }, + { + "epoch": 8.983207994230968, + "grad_norm": 0.005055413115769625, + "learning_rate": 3.114542500324219e-06, + "loss": 0.001, + "step": 196200 + }, + { + "epoch": 8.98778659157767, + "grad_norm": 0.008997324854135513, + "learning_rate": 3.086838453237506e-06, + "loss": 0.0007, + "step": 196300 + }, + { + "epoch": 8.992365188924373, + "grad_norm": 0.42291346192359924, + "learning_rate": 3.059254246776433e-06, + "loss": 0.0017, + "step": 196400 + }, + { + "epoch": 8.996943786271077, + "grad_norm": 0.23169781267642975, + "learning_rate": 3.0317899514054336e-06, + "loss": 0.0015, + "step": 196500 + }, + { + "epoch": 8.9999656605199, + "eval_loss": 0.2418144792318344, + "eval_runtime": 261.8983, + "eval_samples_per_second": 21.001, + "eval_steps_per_second": 21.001, + "step": 196566 + }, + { + "epoch": 9.001522383617779, + "grad_norm": 0.08088653534650803, + "learning_rate": 3.0044456372825992e-06, + "loss": 0.0019, + "step": 196600 + }, + { + "epoch": 9.006100980964481, + "grad_norm": 0.13099326193332672, + "learning_rate": 2.9772213742595367e-06, + "loss": 0.001, + "step": 196700 + }, + { + "epoch": 9.010679578311183, + "grad_norm": 0.008611609227955341, + "learning_rate": 2.950117231881183e-06, + "loss": 0.0008, + "step": 196800 + }, + { + "epoch": 9.015258175657888, + "grad_norm": 0.6876717209815979, + "learning_rate": 2.923133279385615e-06, + "loss": 0.0006, + "step": 196900 + }, + { + "epoch": 9.01983677300459, + "grad_norm": 0.2837451100349426, + "learning_rate": 2.8962695857038922e-06, + "loss": 0.0009, + "step": 197000 + }, + { + "epoch": 9.024415370351292, + "grad_norm": 0.012697268277406693, + "learning_rate": 2.8695262194598615e-06, + "loss": 0.0011, + "step": 197100 + }, + { + "epoch": 9.028993967697996, + "grad_norm": 0.002202101983129978, + "learning_rate": 2.8429032489700135e-06, + "loss": 0.0007, + "step": 197200 + }, + { + "epoch": 9.033572565044699, + "grad_norm": 0.00148550805170089, + "learning_rate": 2.8164007422432583e-06, + "loss": 0.001, + "step": 197300 + }, + { + "epoch": 9.0381511623914, + "grad_norm": 0.002804514952003956, + "learning_rate": 2.790018766980773e-06, + "loss": 0.0008, + "step": 197400 + }, + { + "epoch": 9.042729759738105, + "grad_norm": 0.004013043362647295, + "learning_rate": 2.763757390575872e-06, + "loss": 0.0013, + "step": 197500 + }, + { + "epoch": 9.047308357084807, + "grad_norm": 0.3350330591201782, + "learning_rate": 2.737616680113758e-06, + "loss": 0.0013, + "step": 197600 + }, + { + "epoch": 9.05188695443151, + "grad_norm": 0.2957717478275299, + "learning_rate": 2.7115967023714215e-06, + "loss": 0.0009, + "step": 197700 + }, + { + "epoch": 9.056465551778214, + "grad_norm": 0.13251306116580963, + "learning_rate": 2.6856975238174266e-06, + "loss": 0.0008, + "step": 197800 + }, + { + "epoch": 9.061044149124916, + "grad_norm": 0.0039755236357450485, + "learning_rate": 2.6599192106117333e-06, + "loss": 0.0021, + "step": 197900 + }, + { + "epoch": 9.065622746471618, + "grad_norm": 0.040325064212083817, + "learning_rate": 2.634261828605594e-06, + "loss": 0.001, + "step": 198000 + }, + { + "epoch": 9.07020134381832, + "grad_norm": 0.001606648089364171, + "learning_rate": 2.608725443341292e-06, + "loss": 0.0014, + "step": 198100 + }, + { + "epoch": 9.074779941165025, + "grad_norm": 0.0010452588321641088, + "learning_rate": 2.583310120052046e-06, + "loss": 0.0012, + "step": 198200 + }, + { + "epoch": 9.079358538511727, + "grad_norm": 0.06777454912662506, + "learning_rate": 2.5580159236618162e-06, + "loss": 0.0006, + "step": 198300 + }, + { + "epoch": 9.083937135858429, + "grad_norm": 0.014749701134860516, + "learning_rate": 2.5328429187851552e-06, + "loss": 0.0012, + "step": 198400 + }, + { + "epoch": 9.088515733205133, + "grad_norm": 0.0003846607287414372, + "learning_rate": 2.507791169727003e-06, + "loss": 0.0008, + "step": 198500 + }, + { + "epoch": 9.093094330551835, + "grad_norm": 0.004392546135932207, + "learning_rate": 2.4828607404825677e-06, + "loss": 0.0006, + "step": 198600 + }, + { + "epoch": 9.097672927898538, + "grad_norm": 0.006986264605075121, + "learning_rate": 2.4580516947371348e-06, + "loss": 0.001, + "step": 198700 + }, + { + "epoch": 9.102251525245242, + "grad_norm": 0.009725336916744709, + "learning_rate": 2.4333640958659143e-06, + "loss": 0.0007, + "step": 198800 + }, + { + "epoch": 9.106830122591944, + "grad_norm": 0.02167440392076969, + "learning_rate": 2.408798006933882e-06, + "loss": 0.001, + "step": 198900 + }, + { + "epoch": 9.111408719938646, + "grad_norm": 0.054156869649887085, + "learning_rate": 2.3843534906956123e-06, + "loss": 0.0013, + "step": 199000 + }, + { + "epoch": 9.11598731728535, + "grad_norm": 0.011062448844313622, + "learning_rate": 2.3600306095951264e-06, + "loss": 0.0013, + "step": 199100 + }, + { + "epoch": 9.120565914632053, + "grad_norm": 0.0029075967613607645, + "learning_rate": 2.335829425765712e-06, + "loss": 0.0015, + "step": 199200 + }, + { + "epoch": 9.125144511978755, + "grad_norm": 0.013693880289793015, + "learning_rate": 2.311750001029783e-06, + "loss": 0.0014, + "step": 199300 + }, + { + "epoch": 9.129723109325457, + "grad_norm": 0.011990150436758995, + "learning_rate": 2.2877923968987247e-06, + "loss": 0.0011, + "step": 199400 + }, + { + "epoch": 9.134301706672161, + "grad_norm": 0.01806030236184597, + "learning_rate": 2.2639566745727205e-06, + "loss": 0.0007, + "step": 199500 + }, + { + "epoch": 9.138880304018864, + "grad_norm": 0.005009980872273445, + "learning_rate": 2.2402428949406086e-06, + "loss": 0.0007, + "step": 199600 + }, + { + "epoch": 9.143458901365566, + "grad_norm": 0.031974907964468, + "learning_rate": 2.216651118579727e-06, + "loss": 0.0013, + "step": 199700 + }, + { + "epoch": 9.14803749871227, + "grad_norm": 0.0008488456369377673, + "learning_rate": 2.19318140575574e-06, + "loss": 0.0009, + "step": 199800 + }, + { + "epoch": 9.152616096058972, + "grad_norm": 0.12342657893896103, + "learning_rate": 2.169833816422517e-06, + "loss": 0.001, + "step": 199900 + }, + { + "epoch": 9.157194693405675, + "grad_norm": 0.0888073593378067, + "learning_rate": 2.1466084102219452e-06, + "loss": 0.0011, + "step": 200000 + }, + { + "epoch": 9.161773290752379, + "grad_norm": 0.0031123904045671225, + "learning_rate": 2.123505246483787e-06, + "loss": 0.0012, + "step": 200100 + }, + { + "epoch": 9.166351888099081, + "grad_norm": 0.021208738908171654, + "learning_rate": 2.100524384225555e-06, + "loss": 0.001, + "step": 200200 + }, + { + "epoch": 9.170930485445783, + "grad_norm": 0.025913584977388382, + "learning_rate": 2.077665882152335e-06, + "loss": 0.0012, + "step": 200300 + }, + { + "epoch": 9.175509082792486, + "grad_norm": 0.16090908646583557, + "learning_rate": 2.0549297986566186e-06, + "loss": 0.0014, + "step": 200400 + }, + { + "epoch": 9.18008768013919, + "grad_norm": 0.08480704575777054, + "learning_rate": 2.032316191818212e-06, + "loss": 0.0018, + "step": 200500 + }, + { + "epoch": 9.184666277485892, + "grad_norm": 0.0889834314584732, + "learning_rate": 2.009825119404024e-06, + "loss": 0.0012, + "step": 200600 + }, + { + "epoch": 9.189244874832594, + "grad_norm": 0.10864217579364777, + "learning_rate": 1.9874566388679518e-06, + "loss": 0.001, + "step": 200700 + }, + { + "epoch": 9.193823472179298, + "grad_norm": 0.0008274565334431827, + "learning_rate": 1.9652108073507425e-06, + "loss": 0.0011, + "step": 200800 + }, + { + "epoch": 9.198402069526, + "grad_norm": 0.002116286661475897, + "learning_rate": 1.943087681679823e-06, + "loss": 0.001, + "step": 200900 + }, + { + "epoch": 9.202980666872703, + "grad_norm": 0.029289819300174713, + "learning_rate": 1.9210873183691692e-06, + "loss": 0.0007, + "step": 201000 + }, + { + "epoch": 9.207559264219407, + "grad_norm": 0.006239714100956917, + "learning_rate": 1.899209773619154e-06, + "loss": 0.0011, + "step": 201100 + }, + { + "epoch": 9.21213786156611, + "grad_norm": 0.002388751832768321, + "learning_rate": 1.8774551033164112e-06, + "loss": 0.0009, + "step": 201200 + }, + { + "epoch": 9.216716458912812, + "grad_norm": 0.15311861038208008, + "learning_rate": 1.8558233630336929e-06, + "loss": 0.0011, + "step": 201300 + }, + { + "epoch": 9.221295056259516, + "grad_norm": 0.20378436148166656, + "learning_rate": 1.8343146080297135e-06, + "loss": 0.0007, + "step": 201400 + }, + { + "epoch": 9.225873653606218, + "grad_norm": 0.01194208487868309, + "learning_rate": 1.8129288932490274e-06, + "loss": 0.0008, + "step": 201500 + }, + { + "epoch": 9.23045225095292, + "grad_norm": 0.002687977161258459, + "learning_rate": 1.7916662733218847e-06, + "loss": 0.001, + "step": 201600 + }, + { + "epoch": 9.235030848299623, + "grad_norm": 0.0027624531649053097, + "learning_rate": 1.7705268025640709e-06, + "loss": 0.0005, + "step": 201700 + }, + { + "epoch": 9.239609445646327, + "grad_norm": 0.04479651898145676, + "learning_rate": 1.7495105349767948e-06, + "loss": 0.0012, + "step": 201800 + }, + { + "epoch": 9.244188042993029, + "grad_norm": 0.21192124485969543, + "learning_rate": 1.7286175242465509e-06, + "loss": 0.0012, + "step": 201900 + }, + { + "epoch": 9.248766640339731, + "grad_norm": 0.0029038949869573116, + "learning_rate": 1.7078478237449402e-06, + "loss": 0.0008, + "step": 202000 + }, + { + "epoch": 9.253345237686435, + "grad_norm": 0.009675376117229462, + "learning_rate": 1.6872014865286057e-06, + "loss": 0.0013, + "step": 202100 + }, + { + "epoch": 9.257923835033138, + "grad_norm": 0.007381136529147625, + "learning_rate": 1.6666785653390249e-06, + "loss": 0.001, + "step": 202200 + }, + { + "epoch": 9.26250243237984, + "grad_norm": 0.001989328535273671, + "learning_rate": 1.6462791126024169e-06, + "loss": 0.0007, + "step": 202300 + }, + { + "epoch": 9.267081029726544, + "grad_norm": 0.0015792534686625004, + "learning_rate": 1.6260031804296084e-06, + "loss": 0.0008, + "step": 202400 + }, + { + "epoch": 9.271659627073246, + "grad_norm": 0.0014649959048256278, + "learning_rate": 1.6058508206158728e-06, + "loss": 0.0008, + "step": 202500 + }, + { + "epoch": 9.276238224419949, + "grad_norm": 0.017108794301748276, + "learning_rate": 1.58582208464082e-06, + "loss": 0.0009, + "step": 202600 + }, + { + "epoch": 9.280816821766653, + "grad_norm": 0.0022038191091269255, + "learning_rate": 1.5659170236682674e-06, + "loss": 0.0007, + "step": 202700 + }, + { + "epoch": 9.285395419113355, + "grad_norm": 0.002022168133407831, + "learning_rate": 1.5461356885461075e-06, + "loss": 0.0014, + "step": 202800 + }, + { + "epoch": 9.289974016460057, + "grad_norm": 0.003931309096515179, + "learning_rate": 1.5264781298061415e-06, + "loss": 0.0013, + "step": 202900 + }, + { + "epoch": 9.29455261380676, + "grad_norm": 0.13134440779685974, + "learning_rate": 1.5069443976640284e-06, + "loss": 0.0009, + "step": 203000 + }, + { + "epoch": 9.299131211153464, + "grad_norm": 0.004310674965381622, + "learning_rate": 1.4875345420190645e-06, + "loss": 0.0012, + "step": 203100 + }, + { + "epoch": 9.303709808500166, + "grad_norm": 0.00843301322311163, + "learning_rate": 1.4682486124541373e-06, + "loss": 0.0011, + "step": 203200 + }, + { + "epoch": 9.308288405846868, + "grad_norm": 0.01970786415040493, + "learning_rate": 1.4490866582355267e-06, + "loss": 0.0015, + "step": 203300 + }, + { + "epoch": 9.312867003193572, + "grad_norm": 0.012120225466787815, + "learning_rate": 1.4300487283128495e-06, + "loss": 0.0011, + "step": 203400 + }, + { + "epoch": 9.317445600540275, + "grad_norm": 0.0033403183333575726, + "learning_rate": 1.4111348713188866e-06, + "loss": 0.0007, + "step": 203500 + }, + { + "epoch": 9.322024197886977, + "grad_norm": 0.07732047885656357, + "learning_rate": 1.3923451355694617e-06, + "loss": 0.0008, + "step": 203600 + }, + { + "epoch": 9.326602795233681, + "grad_norm": 0.004604123532772064, + "learning_rate": 1.3736795690633354e-06, + "loss": 0.0011, + "step": 203700 + }, + { + "epoch": 9.331181392580383, + "grad_norm": 0.003377101384103298, + "learning_rate": 1.3551382194820884e-06, + "loss": 0.0008, + "step": 203800 + }, + { + "epoch": 9.335759989927086, + "grad_norm": 0.0024128479417413473, + "learning_rate": 1.3367211341899667e-06, + "loss": 0.0009, + "step": 203900 + }, + { + "epoch": 9.340338587273788, + "grad_norm": 0.01782609149813652, + "learning_rate": 1.3184283602337865e-06, + "loss": 0.001, + "step": 204000 + }, + { + "epoch": 9.344917184620492, + "grad_norm": 0.0036396984942257404, + "learning_rate": 1.3002599443428243e-06, + "loss": 0.0009, + "step": 204100 + }, + { + "epoch": 9.349495781967194, + "grad_norm": 0.08581870794296265, + "learning_rate": 1.2822159329286598e-06, + "loss": 0.0009, + "step": 204200 + }, + { + "epoch": 9.354074379313897, + "grad_norm": 0.044847775250673294, + "learning_rate": 1.264296372085083e-06, + "loss": 0.0011, + "step": 204300 + }, + { + "epoch": 9.3586529766606, + "grad_norm": 0.001584995654411614, + "learning_rate": 1.2465013075879883e-06, + "loss": 0.0018, + "step": 204400 + }, + { + "epoch": 9.363231574007303, + "grad_norm": 0.009455603547394276, + "learning_rate": 1.2288307848952186e-06, + "loss": 0.0007, + "step": 204500 + }, + { + "epoch": 9.367810171354005, + "grad_norm": 0.08216769993305206, + "learning_rate": 1.2112848491464824e-06, + "loss": 0.0012, + "step": 204600 + }, + { + "epoch": 9.37238876870071, + "grad_norm": 0.00180336635094136, + "learning_rate": 1.1938635451632429e-06, + "loss": 0.0013, + "step": 204700 + }, + { + "epoch": 9.376967366047412, + "grad_norm": 0.007049913518130779, + "learning_rate": 1.1765669174485684e-06, + "loss": 0.0008, + "step": 204800 + }, + { + "epoch": 9.381545963394114, + "grad_norm": 0.07437339425086975, + "learning_rate": 1.1593950101870422e-06, + "loss": 0.0006, + "step": 204900 + }, + { + "epoch": 9.386124560740818, + "grad_norm": 0.0391901396214962, + "learning_rate": 1.1423478672446586e-06, + "loss": 0.001, + "step": 205000 + }, + { + "epoch": 9.39070315808752, + "grad_norm": 0.16931991279125214, + "learning_rate": 1.1254255321686836e-06, + "loss": 0.0012, + "step": 205100 + }, + { + "epoch": 9.395281755434223, + "grad_norm": 0.0040185777470469475, + "learning_rate": 1.1086280481875654e-06, + "loss": 0.0012, + "step": 205200 + }, + { + "epoch": 9.399860352780925, + "grad_norm": 0.1352093517780304, + "learning_rate": 1.0919554582108249e-06, + "loss": 0.0005, + "step": 205300 + }, + { + "epoch": 9.404438950127629, + "grad_norm": 0.11504676938056946, + "learning_rate": 1.0754078048289374e-06, + "loss": 0.0006, + "step": 205400 + }, + { + "epoch": 9.409017547474331, + "grad_norm": 0.015873286873102188, + "learning_rate": 1.0589851303132114e-06, + "loss": 0.0017, + "step": 205500 + }, + { + "epoch": 9.413596144821033, + "grad_norm": 0.12407149374485016, + "learning_rate": 1.0426874766157003e-06, + "loss": 0.001, + "step": 205600 + }, + { + "epoch": 9.418174742167738, + "grad_norm": 0.002775526139885187, + "learning_rate": 1.0265148853691009e-06, + "loss": 0.0017, + "step": 205700 + }, + { + "epoch": 9.42275333951444, + "grad_norm": 0.15169379115104675, + "learning_rate": 1.0104673978866164e-06, + "loss": 0.0011, + "step": 205800 + }, + { + "epoch": 9.427331936861142, + "grad_norm": 0.007750590797513723, + "learning_rate": 9.945450551618884e-07, + "loss": 0.001, + "step": 205900 + }, + { + "epoch": 9.431910534207846, + "grad_norm": 0.0007150355377234519, + "learning_rate": 9.787478978688646e-07, + "loss": 0.001, + "step": 206000 + }, + { + "epoch": 9.436489131554548, + "grad_norm": 0.01111397985368967, + "learning_rate": 9.630759663616983e-07, + "loss": 0.0011, + "step": 206100 + }, + { + "epoch": 9.44106772890125, + "grad_norm": 0.027570601552724838, + "learning_rate": 9.475293006746711e-07, + "loss": 0.0007, + "step": 206200 + }, + { + "epoch": 9.445646326247953, + "grad_norm": 0.24770981073379517, + "learning_rate": 9.321079405220423e-07, + "loss": 0.0008, + "step": 206300 + }, + { + "epoch": 9.450224923594657, + "grad_norm": 0.003402331378310919, + "learning_rate": 9.168119252979946e-07, + "loss": 0.0005, + "step": 206400 + }, + { + "epoch": 9.45480352094136, + "grad_norm": 0.0008960131090134382, + "learning_rate": 9.016412940765106e-07, + "loss": 0.0007, + "step": 206500 + }, + { + "epoch": 9.459382118288062, + "grad_norm": 0.02515571191906929, + "learning_rate": 8.865960856112799e-07, + "loss": 0.0015, + "step": 206600 + }, + { + "epoch": 9.463960715634766, + "grad_norm": 0.1351311206817627, + "learning_rate": 8.716763383355864e-07, + "loss": 0.0013, + "step": 206700 + }, + { + "epoch": 9.468539312981468, + "grad_norm": 0.0024123205803334713, + "learning_rate": 8.568820903622376e-07, + "loss": 0.0012, + "step": 206800 + }, + { + "epoch": 9.47311791032817, + "grad_norm": 0.010249449871480465, + "learning_rate": 8.422133794834363e-07, + "loss": 0.001, + "step": 206900 + }, + { + "epoch": 9.477696507674874, + "grad_norm": 0.006125771440565586, + "learning_rate": 8.276702431706973e-07, + "loss": 0.0009, + "step": 207000 + }, + { + "epoch": 9.482275105021577, + "grad_norm": 0.0066106487065553665, + "learning_rate": 8.132527185747641e-07, + "loss": 0.0008, + "step": 207100 + }, + { + "epoch": 9.486853702368279, + "grad_norm": 0.010087539441883564, + "learning_rate": 7.989608425254924e-07, + "loss": 0.001, + "step": 207200 + }, + { + "epoch": 9.491432299714983, + "grad_norm": 0.06039687991142273, + "learning_rate": 7.847946515317839e-07, + "loss": 0.0011, + "step": 207300 + }, + { + "epoch": 9.496010897061685, + "grad_norm": 0.0053437924943864346, + "learning_rate": 7.707541817814468e-07, + "loss": 0.001, + "step": 207400 + }, + { + "epoch": 9.500589494408388, + "grad_norm": 0.0029611322097480297, + "learning_rate": 7.568394691411462e-07, + "loss": 0.0005, + "step": 207500 + }, + { + "epoch": 9.50516809175509, + "grad_norm": 0.007037085480988026, + "learning_rate": 7.4305054915631e-07, + "loss": 0.0007, + "step": 207600 + }, + { + "epoch": 9.509746689101794, + "grad_norm": 0.0024143033660948277, + "learning_rate": 7.293874570510062e-07, + "loss": 0.001, + "step": 207700 + }, + { + "epoch": 9.514325286448496, + "grad_norm": 0.0061892117373645306, + "learning_rate": 7.158502277278823e-07, + "loss": 0.0011, + "step": 207800 + }, + { + "epoch": 9.518903883795199, + "grad_norm": 0.006517268251627684, + "learning_rate": 7.024388957680705e-07, + "loss": 0.0013, + "step": 207900 + }, + { + "epoch": 9.523482481141903, + "grad_norm": 0.02947130799293518, + "learning_rate": 6.891534954310885e-07, + "loss": 0.0005, + "step": 208000 + }, + { + "epoch": 9.528061078488605, + "grad_norm": 0.009931573644280434, + "learning_rate": 6.75994060654761e-07, + "loss": 0.0009, + "step": 208100 + }, + { + "epoch": 9.532639675835307, + "grad_norm": 0.0006516918656416237, + "learning_rate": 6.629606250551368e-07, + "loss": 0.0006, + "step": 208200 + }, + { + "epoch": 9.537218273182011, + "grad_norm": 0.0033905524760484695, + "learning_rate": 6.500532219263833e-07, + "loss": 0.0008, + "step": 208300 + }, + { + "epoch": 9.541796870528714, + "grad_norm": 0.035781797021627426, + "learning_rate": 6.372718842407255e-07, + "loss": 0.001, + "step": 208400 + }, + { + "epoch": 9.546375467875416, + "grad_norm": 0.006109519395977259, + "learning_rate": 6.24616644648357e-07, + "loss": 0.0013, + "step": 208500 + }, + { + "epoch": 9.550954065222118, + "grad_norm": 0.10211105644702911, + "learning_rate": 6.120875354773459e-07, + "loss": 0.0008, + "step": 208600 + }, + { + "epoch": 9.555532662568822, + "grad_norm": 0.002378986682742834, + "learning_rate": 5.996845887335511e-07, + "loss": 0.0011, + "step": 208700 + }, + { + "epoch": 9.560111259915525, + "grad_norm": 0.007951854728162289, + "learning_rate": 5.874078361005564e-07, + "loss": 0.0012, + "step": 208800 + }, + { + "epoch": 9.564689857262227, + "grad_norm": 0.01049245335161686, + "learning_rate": 5.75257308939564e-07, + "loss": 0.0009, + "step": 208900 + }, + { + "epoch": 9.569268454608931, + "grad_norm": 0.00224009295925498, + "learning_rate": 5.632330382893569e-07, + "loss": 0.0005, + "step": 209000 + }, + { + "epoch": 9.573847051955633, + "grad_norm": 0.004827695898711681, + "learning_rate": 5.513350548661811e-07, + "loss": 0.0007, + "step": 209100 + }, + { + "epoch": 9.578425649302336, + "grad_norm": 0.021496234461665154, + "learning_rate": 5.395633890636631e-07, + "loss": 0.0007, + "step": 209200 + }, + { + "epoch": 9.58300424664904, + "grad_norm": 0.001609979895874858, + "learning_rate": 5.279180709527765e-07, + "loss": 0.0009, + "step": 209300 + }, + { + "epoch": 9.587582843995742, + "grad_norm": 0.006100552622228861, + "learning_rate": 5.163991302817139e-07, + "loss": 0.0012, + "step": 209400 + }, + { + "epoch": 9.592161441342444, + "grad_norm": 0.0038456227630376816, + "learning_rate": 5.050065964758488e-07, + "loss": 0.0009, + "step": 209500 + }, + { + "epoch": 9.596740038689148, + "grad_norm": 0.0012561274925246835, + "learning_rate": 4.937404986376348e-07, + "loss": 0.0011, + "step": 209600 + }, + { + "epoch": 9.60131863603585, + "grad_norm": 0.007517179474234581, + "learning_rate": 4.826008655465508e-07, + "loss": 0.0011, + "step": 209700 + }, + { + "epoch": 9.605897233382553, + "grad_norm": 0.16051574051380157, + "learning_rate": 4.7158772565902843e-07, + "loss": 0.0003, + "step": 209800 + }, + { + "epoch": 9.610475830729255, + "grad_norm": 0.004418348427861929, + "learning_rate": 4.6070110710834116e-07, + "loss": 0.0013, + "step": 209900 + }, + { + "epoch": 9.61505442807596, + "grad_norm": 0.003388006007298827, + "learning_rate": 4.4994103770457653e-07, + "loss": 0.0007, + "step": 210000 + }, + { + "epoch": 9.619633025422662, + "grad_norm": 0.1264602690935135, + "learning_rate": 4.3930754493456403e-07, + "loss": 0.0006, + "step": 210100 + }, + { + "epoch": 9.624211622769364, + "grad_norm": 0.0033033695071935654, + "learning_rate": 4.2880065596176967e-07, + "loss": 0.0009, + "step": 210200 + }, + { + "epoch": 9.628790220116068, + "grad_norm": 0.002471612999215722, + "learning_rate": 4.184203976262513e-07, + "loss": 0.0011, + "step": 210300 + }, + { + "epoch": 9.63336881746277, + "grad_norm": 0.02394956909120083, + "learning_rate": 4.081667964446034e-07, + "loss": 0.0009, + "step": 210400 + }, + { + "epoch": 9.637947414809473, + "grad_norm": 0.0018725660629570484, + "learning_rate": 3.980398786098405e-07, + "loss": 0.0013, + "step": 210500 + }, + { + "epoch": 9.642526012156177, + "grad_norm": 0.0007891812711022794, + "learning_rate": 3.8803966999139684e-07, + "loss": 0.0006, + "step": 210600 + }, + { + "epoch": 9.647104609502879, + "grad_norm": 0.01715545915067196, + "learning_rate": 3.7816619613499913e-07, + "loss": 0.0017, + "step": 210700 + }, + { + "epoch": 9.651683206849581, + "grad_norm": 0.004238943103700876, + "learning_rate": 3.6841948226263854e-07, + "loss": 0.0009, + "step": 210800 + }, + { + "epoch": 9.656261804196284, + "grad_norm": 0.07635319977998734, + "learning_rate": 3.587995532724986e-07, + "loss": 0.0013, + "step": 210900 + }, + { + "epoch": 9.660840401542988, + "grad_norm": 0.016986342146992683, + "learning_rate": 3.493064337388774e-07, + "loss": 0.0004, + "step": 211000 + }, + { + "epoch": 9.66541899888969, + "grad_norm": 0.004902221262454987, + "learning_rate": 3.399401479121489e-07, + "loss": 0.0009, + "step": 211100 + }, + { + "epoch": 9.669997596236392, + "grad_norm": 0.004907084163278341, + "learning_rate": 3.30700719718674e-07, + "loss": 0.0011, + "step": 211200 + }, + { + "epoch": 9.674576193583096, + "grad_norm": 0.0083727166056633, + "learning_rate": 3.215881727607617e-07, + "loss": 0.0015, + "step": 211300 + }, + { + "epoch": 9.679154790929799, + "grad_norm": 0.005267091561108828, + "learning_rate": 3.126025303166025e-07, + "loss": 0.0009, + "step": 211400 + }, + { + "epoch": 9.6837333882765, + "grad_norm": 0.0178202036768198, + "learning_rate": 3.0374381534019613e-07, + "loss": 0.0009, + "step": 211500 + }, + { + "epoch": 9.688311985623205, + "grad_norm": 0.05054371431469917, + "learning_rate": 2.9501205046131295e-07, + "loss": 0.0006, + "step": 211600 + }, + { + "epoch": 9.692890582969907, + "grad_norm": 0.01484039518982172, + "learning_rate": 2.8640725798543266e-07, + "loss": 0.0009, + "step": 211700 + }, + { + "epoch": 9.69746918031661, + "grad_norm": 0.1556658148765564, + "learning_rate": 2.7792945989366105e-07, + "loss": 0.0013, + "step": 211800 + }, + { + "epoch": 9.702047777663314, + "grad_norm": 0.0019681896083056927, + "learning_rate": 2.6957867784270787e-07, + "loss": 0.0008, + "step": 211900 + }, + { + "epoch": 9.706626375010016, + "grad_norm": 0.014196610078215599, + "learning_rate": 2.6135493316482017e-07, + "loss": 0.0008, + "step": 212000 + }, + { + "epoch": 9.711204972356718, + "grad_norm": 0.0016565111000090837, + "learning_rate": 2.532582468677214e-07, + "loss": 0.0012, + "step": 212100 + }, + { + "epoch": 9.715783569703422, + "grad_norm": 0.0729876384139061, + "learning_rate": 2.452886396345555e-07, + "loss": 0.0008, + "step": 212200 + }, + { + "epoch": 9.720362167050125, + "grad_norm": 0.05016588792204857, + "learning_rate": 2.3744613182384856e-07, + "loss": 0.0008, + "step": 212300 + }, + { + "epoch": 9.724940764396827, + "grad_norm": 0.0004378720186650753, + "learning_rate": 2.2973074346944734e-07, + "loss": 0.001, + "step": 212400 + }, + { + "epoch": 9.72951936174353, + "grad_norm": 0.005694561637938023, + "learning_rate": 2.2214249428046952e-07, + "loss": 0.001, + "step": 212500 + }, + { + "epoch": 9.734097959090233, + "grad_norm": 0.006911196745932102, + "learning_rate": 2.1468140364125367e-07, + "loss": 0.0015, + "step": 212600 + }, + { + "epoch": 9.738676556436936, + "grad_norm": 0.002870377618819475, + "learning_rate": 2.0734749061130377e-07, + "loss": 0.0012, + "step": 212700 + }, + { + "epoch": 9.743255153783638, + "grad_norm": 0.029163073748350143, + "learning_rate": 2.0014077392525031e-07, + "loss": 0.0007, + "step": 212800 + }, + { + "epoch": 9.747833751130342, + "grad_norm": 0.5740547180175781, + "learning_rate": 1.930612719927949e-07, + "loss": 0.0013, + "step": 212900 + }, + { + "epoch": 9.752412348477044, + "grad_norm": 0.017990708351135254, + "learning_rate": 1.8610900289867673e-07, + "loss": 0.0009, + "step": 213000 + }, + { + "epoch": 9.756990945823746, + "grad_norm": 0.0017326328670606017, + "learning_rate": 1.792839844026062e-07, + "loss": 0.001, + "step": 213100 + }, + { + "epoch": 9.76156954317045, + "grad_norm": 0.0013371937675401568, + "learning_rate": 1.725862339392259e-07, + "loss": 0.0008, + "step": 213200 + }, + { + "epoch": 9.766148140517153, + "grad_norm": 0.0008829734288156033, + "learning_rate": 1.66015768618083e-07, + "loss": 0.0017, + "step": 213300 + }, + { + "epoch": 9.770726737863855, + "grad_norm": 0.09034759551286697, + "learning_rate": 1.5957260522356243e-07, + "loss": 0.0009, + "step": 213400 + }, + { + "epoch": 9.775305335210557, + "grad_norm": 0.0011923068668693304, + "learning_rate": 1.5325676021484825e-07, + "loss": 0.001, + "step": 213500 + }, + { + "epoch": 9.779883932557262, + "grad_norm": 0.0013882212806493044, + "learning_rate": 1.4706824972591238e-07, + "loss": 0.001, + "step": 213600 + }, + { + "epoch": 9.784462529903964, + "grad_norm": 0.004688725806772709, + "learning_rate": 1.410070895654203e-07, + "loss": 0.0012, + "step": 213700 + }, + { + "epoch": 9.789041127250666, + "grad_norm": 0.0011394763132557273, + "learning_rate": 1.3507329521672552e-07, + "loss": 0.0005, + "step": 213800 + }, + { + "epoch": 9.79361972459737, + "grad_norm": 0.015895133838057518, + "learning_rate": 1.2926688183783066e-07, + "loss": 0.0006, + "step": 213900 + }, + { + "epoch": 9.798198321944072, + "grad_norm": 0.032990917563438416, + "learning_rate": 1.235878642613375e-07, + "loss": 0.0006, + "step": 214000 + }, + { + "epoch": 9.802776919290775, + "grad_norm": 0.012515276670455933, + "learning_rate": 1.1803625699440824e-07, + "loss": 0.001, + "step": 214100 + }, + { + "epoch": 9.807355516637479, + "grad_norm": 0.002379771787673235, + "learning_rate": 1.1261207421874309e-07, + "loss": 0.0007, + "step": 214200 + }, + { + "epoch": 9.811934113984181, + "grad_norm": 0.09647377580404282, + "learning_rate": 1.0731532979051939e-07, + "loss": 0.0007, + "step": 214300 + }, + { + "epoch": 9.816512711330883, + "grad_norm": 0.0067366501316428185, + "learning_rate": 1.021460372403915e-07, + "loss": 0.0012, + "step": 214400 + }, + { + "epoch": 9.821091308677588, + "grad_norm": 0.005677223205566406, + "learning_rate": 9.710420977340762e-08, + "loss": 0.0015, + "step": 214500 + }, + { + "epoch": 9.82566990602429, + "grad_norm": 0.004333006218075752, + "learning_rate": 9.218986026902632e-08, + "loss": 0.0014, + "step": 214600 + }, + { + "epoch": 9.830248503370992, + "grad_norm": 0.0027217718306928873, + "learning_rate": 8.740300128105005e-08, + "loss": 0.0003, + "step": 214700 + }, + { + "epoch": 9.834827100717694, + "grad_norm": 0.30776289105415344, + "learning_rate": 8.274364503760845e-08, + "loss": 0.0014, + "step": 214800 + }, + { + "epoch": 9.839405698064398, + "grad_norm": 0.005994019098579884, + "learning_rate": 7.8211803441125e-08, + "loss": 0.0009, + "step": 214900 + }, + { + "epoch": 9.8439842954111, + "grad_norm": 0.0010411434341222048, + "learning_rate": 7.380748806827819e-08, + "loss": 0.0011, + "step": 215000 + }, + { + "epoch": 9.848562892757803, + "grad_norm": 0.04015972465276718, + "learning_rate": 6.953071016998491e-08, + "loss": 0.0005, + "step": 215100 + }, + { + "epoch": 9.853141490104507, + "grad_norm": 0.0010900140041485429, + "learning_rate": 6.538148067135596e-08, + "loss": 0.001, + "step": 215200 + }, + { + "epoch": 9.85772008745121, + "grad_norm": 0.5897096991539001, + "learning_rate": 6.135981017167947e-08, + "loss": 0.0007, + "step": 215300 + }, + { + "epoch": 9.862298684797912, + "grad_norm": 0.03506353124976158, + "learning_rate": 5.7465708944404175e-08, + "loss": 0.0006, + "step": 215400 + }, + { + "epoch": 9.866877282144616, + "grad_norm": 0.042582686990499496, + "learning_rate": 5.3699186937089526e-08, + "loss": 0.001, + "step": 215500 + }, + { + "epoch": 9.871455879491318, + "grad_norm": 0.008183780126273632, + "learning_rate": 5.006025377138901e-08, + "loss": 0.0009, + "step": 215600 + }, + { + "epoch": 9.87603447683802, + "grad_norm": 0.0004912464646622539, + "learning_rate": 4.6548918743033464e-08, + "loss": 0.0013, + "step": 215700 + }, + { + "epoch": 9.880613074184723, + "grad_norm": 0.008000398054718971, + "learning_rate": 4.316519082179227e-08, + "loss": 0.0008, + "step": 215800 + }, + { + "epoch": 9.885191671531427, + "grad_norm": 0.0017890778835862875, + "learning_rate": 3.9909078651478856e-08, + "loss": 0.0008, + "step": 215900 + }, + { + "epoch": 9.889770268878129, + "grad_norm": 0.002216469496488571, + "learning_rate": 3.678059054988969e-08, + "loss": 0.0008, + "step": 216000 + }, + { + "epoch": 9.894348866224831, + "grad_norm": 0.07988656312227249, + "learning_rate": 3.377973450881533e-08, + "loss": 0.001, + "step": 216100 + }, + { + "epoch": 9.898927463571535, + "grad_norm": 0.0008704246138222516, + "learning_rate": 3.0906518194001586e-08, + "loss": 0.0006, + "step": 216200 + }, + { + "epoch": 9.903506060918238, + "grad_norm": 0.0025461604818701744, + "learning_rate": 2.8160948945138434e-08, + "loss": 0.0012, + "step": 216300 + }, + { + "epoch": 9.90808465826494, + "grad_norm": 0.00465493043884635, + "learning_rate": 2.554303377584333e-08, + "loss": 0.0006, + "step": 216400 + }, + { + "epoch": 9.912663255611644, + "grad_norm": 0.0038689495995640755, + "learning_rate": 2.305277937362238e-08, + "loss": 0.0011, + "step": 216500 + }, + { + "epoch": 9.917241852958346, + "grad_norm": 0.7484769225120544, + "learning_rate": 2.0690192099892535e-08, + "loss": 0.001, + "step": 216600 + }, + { + "epoch": 9.921820450305049, + "grad_norm": 0.01186487078666687, + "learning_rate": 1.845527798992608e-08, + "loss": 0.0008, + "step": 216700 + }, + { + "epoch": 9.926399047651753, + "grad_norm": 0.030242715030908585, + "learning_rate": 1.6348042752856173e-08, + "loss": 0.0005, + "step": 216800 + }, + { + "epoch": 9.930977644998455, + "grad_norm": 0.007519678212702274, + "learning_rate": 1.436849177166022e-08, + "loss": 0.0005, + "step": 216900 + }, + { + "epoch": 9.935556242345157, + "grad_norm": 0.0013559481594711542, + "learning_rate": 1.2516630103137638e-08, + "loss": 0.0007, + "step": 217000 + }, + { + "epoch": 9.94013483969186, + "grad_norm": 0.0019348141504451632, + "learning_rate": 1.0792462477909882e-08, + "loss": 0.0008, + "step": 217100 + }, + { + "epoch": 9.944713437038564, + "grad_norm": 0.004812970757484436, + "learning_rate": 9.195993300398221e-09, + "loss": 0.0015, + "step": 217200 + }, + { + "epoch": 9.949292034385266, + "grad_norm": 0.0022001820616424084, + "learning_rate": 7.727226648818198e-09, + "loss": 0.0014, + "step": 217300 + }, + { + "epoch": 9.953870631731968, + "grad_norm": 0.07122460752725601, + "learning_rate": 6.386166275157424e-09, + "loss": 0.0011, + "step": 217400 + }, + { + "epoch": 9.958449229078672, + "grad_norm": 0.10452170670032501, + "learning_rate": 5.172815605186676e-09, + "loss": 0.0016, + "step": 217500 + }, + { + "epoch": 9.963027826425375, + "grad_norm": 0.009487117640674114, + "learning_rate": 4.087177738432146e-09, + "loss": 0.001, + "step": 217600 + }, + { + "epoch": 9.967606423772077, + "grad_norm": 0.0035909826401621103, + "learning_rate": 3.12925544818099e-09, + "loss": 0.0016, + "step": 217700 + }, + { + "epoch": 9.972185021118781, + "grad_norm": 0.0012206127867102623, + "learning_rate": 2.299051181464673e-09, + "loss": 0.0007, + "step": 217800 + }, + { + "epoch": 9.976763618465483, + "grad_norm": 0.016972968354821205, + "learning_rate": 1.596567059053422e-09, + "loss": 0.001, + "step": 217900 + }, + { + "epoch": 9.981342215812186, + "grad_norm": 0.0021455176174640656, + "learning_rate": 1.0218048754617738e-09, + "loss": 0.0006, + "step": 218000 + }, + { + "epoch": 9.985920813158888, + "grad_norm": 0.0012913336977362633, + "learning_rate": 5.747660989263714e-10, + "loss": 0.001, + "step": 218100 + }, + { + "epoch": 9.990499410505592, + "grad_norm": 0.0040626926347613335, + "learning_rate": 2.554518714226184e-10, + "loss": 0.001, + "step": 218200 + }, + { + "epoch": 9.995078007852294, + "grad_norm": 0.0009735480998642743, + "learning_rate": 6.386300864247297e-11, + "loss": 0.0008, + "step": 218300 + }, + { + "epoch": 9.999656605198997, + "grad_norm": 0.01641685888171196, + "learning_rate": 0.0, + "loss": 0.0009, + "step": 218400 + }, + { + "epoch": 9.999656605198997, + "eval_loss": 0.29180198907852173, + "eval_runtime": 243.5776, + "eval_samples_per_second": 22.58, + "eval_steps_per_second": 22.58, + "step": 218400 + } + ], + "logging_steps": 100, + "max_steps": 218400, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.5885525416859468e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}