{ "best_metric": null, "best_model_checkpoint": null, "epoch": 16.791044776119403, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03731343283582089, "grad_norm": 0.8186072111129761, "learning_rate": 1.0000000000000002e-06, "loss": 1.3847, "step": 10 }, { "epoch": 0.07462686567164178, "grad_norm": 0.5007426142692566, "learning_rate": 2.0000000000000003e-06, "loss": 1.4283, "step": 20 }, { "epoch": 0.11194029850746269, "grad_norm": 0.49460887908935547, "learning_rate": 3e-06, "loss": 1.4868, "step": 30 }, { "epoch": 0.14925373134328357, "grad_norm": 0.5032920837402344, "learning_rate": 4.000000000000001e-06, "loss": 1.4491, "step": 40 }, { "epoch": 0.1865671641791045, "grad_norm": 0.5688469409942627, "learning_rate": 5e-06, "loss": 1.3703, "step": 50 }, { "epoch": 0.22388059701492538, "grad_norm": 0.5052517652511597, "learning_rate": 6e-06, "loss": 1.419, "step": 60 }, { "epoch": 0.26119402985074625, "grad_norm": 0.6315643787384033, "learning_rate": 7.000000000000001e-06, "loss": 1.3058, "step": 70 }, { "epoch": 0.29850746268656714, "grad_norm": 0.6060447692871094, "learning_rate": 8.000000000000001e-06, "loss": 1.2908, "step": 80 }, { "epoch": 0.3358208955223881, "grad_norm": 0.5513179302215576, "learning_rate": 9e-06, "loss": 1.2311, "step": 90 }, { "epoch": 0.373134328358209, "grad_norm": 0.8467404246330261, "learning_rate": 1e-05, "loss": 1.2043, "step": 100 }, { "epoch": 0.41044776119402987, "grad_norm": 0.8141824007034302, "learning_rate": 1.1000000000000001e-05, "loss": 1.0707, "step": 110 }, { "epoch": 0.44776119402985076, "grad_norm": 0.7932347059249878, "learning_rate": 1.2e-05, "loss": 0.9377, "step": 120 }, { "epoch": 0.48507462686567165, "grad_norm": 0.684220552444458, "learning_rate": 1.3000000000000001e-05, "loss": 0.714, "step": 130 }, { "epoch": 0.5223880597014925, "grad_norm": 0.5886895060539246, "learning_rate": 1.4000000000000001e-05, "loss": 0.6479, "step": 140 }, { "epoch": 0.5597014925373134, "grad_norm": 0.4764939248561859, "learning_rate": 1.5e-05, "loss": 0.5463, "step": 150 }, { "epoch": 0.5970149253731343, "grad_norm": 0.4621008038520813, "learning_rate": 1.6000000000000003e-05, "loss": 0.4641, "step": 160 }, { "epoch": 0.6343283582089553, "grad_norm": 0.46492910385131836, "learning_rate": 1.7000000000000003e-05, "loss": 0.4159, "step": 170 }, { "epoch": 0.6716417910447762, "grad_norm": 0.5017415881156921, "learning_rate": 1.8e-05, "loss": 0.4094, "step": 180 }, { "epoch": 0.7089552238805971, "grad_norm": 0.34392210841178894, "learning_rate": 1.9e-05, "loss": 0.3478, "step": 190 }, { "epoch": 0.746268656716418, "grad_norm": 0.3240516483783722, "learning_rate": 2e-05, "loss": 0.3821, "step": 200 }, { "epoch": 0.7835820895522388, "grad_norm": 0.26301339268684387, "learning_rate": 2.1e-05, "loss": 0.3606, "step": 210 }, { "epoch": 0.8208955223880597, "grad_norm": 0.34712520241737366, "learning_rate": 2.2000000000000003e-05, "loss": 0.3421, "step": 220 }, { "epoch": 0.8582089552238806, "grad_norm": 0.3248469829559326, "learning_rate": 2.3000000000000003e-05, "loss": 0.3389, "step": 230 }, { "epoch": 0.8955223880597015, "grad_norm": 0.298149436712265, "learning_rate": 2.4e-05, "loss": 0.3145, "step": 240 }, { "epoch": 0.9328358208955224, "grad_norm": 0.2757190763950348, "learning_rate": 2.5e-05, "loss": 0.3065, "step": 250 }, { "epoch": 0.9701492537313433, "grad_norm": 0.30510950088500977, "learning_rate": 2.6000000000000002e-05, "loss": 0.2971, "step": 260 }, { "epoch": 1.007462686567164, "grad_norm": 0.37349891662597656, "learning_rate": 2.7000000000000002e-05, "loss": 0.3273, "step": 270 }, { "epoch": 1.044776119402985, "grad_norm": 0.3667634129524231, "learning_rate": 2.8000000000000003e-05, "loss": 0.308, "step": 280 }, { "epoch": 1.0820895522388059, "grad_norm": 0.3463355004787445, "learning_rate": 2.9e-05, "loss": 0.3109, "step": 290 }, { "epoch": 1.1194029850746268, "grad_norm": 0.3888525366783142, "learning_rate": 3e-05, "loss": 0.2644, "step": 300 }, { "epoch": 1.1567164179104479, "grad_norm": 0.3749147951602936, "learning_rate": 3.1e-05, "loss": 0.2858, "step": 310 }, { "epoch": 1.1940298507462686, "grad_norm": 0.3270276188850403, "learning_rate": 3.2000000000000005e-05, "loss": 0.2573, "step": 320 }, { "epoch": 1.2313432835820897, "grad_norm": 0.3658592998981476, "learning_rate": 3.3e-05, "loss": 0.2613, "step": 330 }, { "epoch": 1.2686567164179103, "grad_norm": 0.3526328206062317, "learning_rate": 3.4000000000000007e-05, "loss": 0.2328, "step": 340 }, { "epoch": 1.3059701492537314, "grad_norm": 0.4528139531612396, "learning_rate": 3.5e-05, "loss": 0.2429, "step": 350 }, { "epoch": 1.3432835820895521, "grad_norm": 0.5426791310310364, "learning_rate": 3.6e-05, "loss": 0.2209, "step": 360 }, { "epoch": 1.3805970149253732, "grad_norm": 0.41844552755355835, "learning_rate": 3.7e-05, "loss": 0.2319, "step": 370 }, { "epoch": 1.417910447761194, "grad_norm": 0.4749431908130646, "learning_rate": 3.8e-05, "loss": 0.2233, "step": 380 }, { "epoch": 1.455223880597015, "grad_norm": 0.7010189890861511, "learning_rate": 3.9000000000000006e-05, "loss": 0.2181, "step": 390 }, { "epoch": 1.4925373134328357, "grad_norm": 0.5747635960578918, "learning_rate": 4e-05, "loss": 0.213, "step": 400 }, { "epoch": 1.5298507462686568, "grad_norm": 0.3661474287509918, "learning_rate": 4.1e-05, "loss": 0.2171, "step": 410 }, { "epoch": 1.5671641791044775, "grad_norm": 0.467835396528244, "learning_rate": 4.2e-05, "loss": 0.1985, "step": 420 }, { "epoch": 1.6044776119402986, "grad_norm": 0.5470123291015625, "learning_rate": 4.3e-05, "loss": 0.2176, "step": 430 }, { "epoch": 1.6417910447761193, "grad_norm": 0.5761199593544006, "learning_rate": 4.4000000000000006e-05, "loss": 0.2007, "step": 440 }, { "epoch": 1.6791044776119404, "grad_norm": 0.48257485032081604, "learning_rate": 4.5e-05, "loss": 0.2043, "step": 450 }, { "epoch": 1.716417910447761, "grad_norm": 0.48353052139282227, "learning_rate": 4.600000000000001e-05, "loss": 0.1872, "step": 460 }, { "epoch": 1.7537313432835822, "grad_norm": 0.4388391375541687, "learning_rate": 4.7e-05, "loss": 0.206, "step": 470 }, { "epoch": 1.7910447761194028, "grad_norm": 0.47332626581192017, "learning_rate": 4.8e-05, "loss": 0.1876, "step": 480 }, { "epoch": 1.828358208955224, "grad_norm": 0.8053535223007202, "learning_rate": 4.9e-05, "loss": 0.1839, "step": 490 }, { "epoch": 1.8656716417910446, "grad_norm": 0.413979709148407, "learning_rate": 5e-05, "loss": 0.1732, "step": 500 }, { "epoch": 1.9029850746268657, "grad_norm": 0.36910712718963623, "learning_rate": 5.1000000000000006e-05, "loss": 0.1827, "step": 510 }, { "epoch": 1.9402985074626866, "grad_norm": 0.8458298444747925, "learning_rate": 5.2000000000000004e-05, "loss": 0.1727, "step": 520 }, { "epoch": 1.9776119402985075, "grad_norm": 0.5452115535736084, "learning_rate": 5.300000000000001e-05, "loss": 0.1818, "step": 530 }, { "epoch": 2.014925373134328, "grad_norm": 0.4518108069896698, "learning_rate": 5.4000000000000005e-05, "loss": 0.177, "step": 540 }, { "epoch": 2.0522388059701493, "grad_norm": 0.66865074634552, "learning_rate": 5.500000000000001e-05, "loss": 0.1726, "step": 550 }, { "epoch": 2.08955223880597, "grad_norm": 0.6536034345626831, "learning_rate": 5.6000000000000006e-05, "loss": 0.1541, "step": 560 }, { "epoch": 2.126865671641791, "grad_norm": 0.5571377277374268, "learning_rate": 5.6999999999999996e-05, "loss": 0.1671, "step": 570 }, { "epoch": 2.1641791044776117, "grad_norm": 0.5385546684265137, "learning_rate": 5.8e-05, "loss": 0.1582, "step": 580 }, { "epoch": 2.201492537313433, "grad_norm": 0.577961266040802, "learning_rate": 5.9e-05, "loss": 0.1528, "step": 590 }, { "epoch": 2.2388059701492535, "grad_norm": 0.5082416534423828, "learning_rate": 6e-05, "loss": 0.1638, "step": 600 }, { "epoch": 2.2761194029850746, "grad_norm": 0.5490861535072327, "learning_rate": 6.1e-05, "loss": 0.166, "step": 610 }, { "epoch": 2.3134328358208958, "grad_norm": 0.492366760969162, "learning_rate": 6.2e-05, "loss": 0.1481, "step": 620 }, { "epoch": 2.3507462686567164, "grad_norm": 0.3702855110168457, "learning_rate": 6.3e-05, "loss": 0.1514, "step": 630 }, { "epoch": 2.388059701492537, "grad_norm": 0.664667010307312, "learning_rate": 6.400000000000001e-05, "loss": 0.1441, "step": 640 }, { "epoch": 2.425373134328358, "grad_norm": 0.33382174372673035, "learning_rate": 6.500000000000001e-05, "loss": 0.1573, "step": 650 }, { "epoch": 2.4626865671641793, "grad_norm": 0.4848814010620117, "learning_rate": 6.6e-05, "loss": 0.1457, "step": 660 }, { "epoch": 2.5, "grad_norm": 0.3649997413158417, "learning_rate": 6.7e-05, "loss": 0.1467, "step": 670 }, { "epoch": 2.5373134328358207, "grad_norm": 0.6385223865509033, "learning_rate": 6.800000000000001e-05, "loss": 0.145, "step": 680 }, { "epoch": 2.574626865671642, "grad_norm": 0.4580625891685486, "learning_rate": 6.9e-05, "loss": 0.1352, "step": 690 }, { "epoch": 2.611940298507463, "grad_norm": 0.5141746401786804, "learning_rate": 7e-05, "loss": 0.1444, "step": 700 }, { "epoch": 2.6492537313432836, "grad_norm": 0.40220722556114197, "learning_rate": 7.1e-05, "loss": 0.1493, "step": 710 }, { "epoch": 2.6865671641791042, "grad_norm": 0.5510571002960205, "learning_rate": 7.2e-05, "loss": 0.1387, "step": 720 }, { "epoch": 2.7238805970149254, "grad_norm": 0.43814659118652344, "learning_rate": 7.3e-05, "loss": 0.1374, "step": 730 }, { "epoch": 2.7611940298507465, "grad_norm": 0.4118008613586426, "learning_rate": 7.4e-05, "loss": 0.1297, "step": 740 }, { "epoch": 2.798507462686567, "grad_norm": 0.5626503229141235, "learning_rate": 7.500000000000001e-05, "loss": 0.1299, "step": 750 }, { "epoch": 2.835820895522388, "grad_norm": 0.4066360592842102, "learning_rate": 7.6e-05, "loss": 0.1102, "step": 760 }, { "epoch": 2.873134328358209, "grad_norm": 0.47184985876083374, "learning_rate": 7.7e-05, "loss": 0.1219, "step": 770 }, { "epoch": 2.91044776119403, "grad_norm": 0.6611475348472595, "learning_rate": 7.800000000000001e-05, "loss": 0.1267, "step": 780 }, { "epoch": 2.9477611940298507, "grad_norm": 0.3570108413696289, "learning_rate": 7.900000000000001e-05, "loss": 0.1191, "step": 790 }, { "epoch": 2.9850746268656714, "grad_norm": 0.4581681489944458, "learning_rate": 8e-05, "loss": 0.1209, "step": 800 }, { "epoch": 3.0223880597014925, "grad_norm": 0.4643435776233673, "learning_rate": 8.1e-05, "loss": 0.129, "step": 810 }, { "epoch": 3.0597014925373136, "grad_norm": 0.5595763921737671, "learning_rate": 8.2e-05, "loss": 0.1158, "step": 820 }, { "epoch": 3.0970149253731343, "grad_norm": 0.48848605155944824, "learning_rate": 8.3e-05, "loss": 0.1188, "step": 830 }, { "epoch": 3.1343283582089554, "grad_norm": 0.4496570825576782, "learning_rate": 8.4e-05, "loss": 0.114, "step": 840 }, { "epoch": 3.171641791044776, "grad_norm": 0.31364986300468445, "learning_rate": 8.5e-05, "loss": 0.1196, "step": 850 }, { "epoch": 3.208955223880597, "grad_norm": 0.3395878076553345, "learning_rate": 8.6e-05, "loss": 0.1124, "step": 860 }, { "epoch": 3.246268656716418, "grad_norm": 0.4917413592338562, "learning_rate": 8.7e-05, "loss": 0.1074, "step": 870 }, { "epoch": 3.283582089552239, "grad_norm": 0.44114553928375244, "learning_rate": 8.800000000000001e-05, "loss": 0.1095, "step": 880 }, { "epoch": 3.3208955223880596, "grad_norm": 0.3323831558227539, "learning_rate": 8.900000000000001e-05, "loss": 0.106, "step": 890 }, { "epoch": 3.3582089552238807, "grad_norm": 0.4495660066604614, "learning_rate": 9e-05, "loss": 0.1222, "step": 900 }, { "epoch": 3.3955223880597014, "grad_norm": 0.40784788131713867, "learning_rate": 9.1e-05, "loss": 0.1048, "step": 910 }, { "epoch": 3.4328358208955225, "grad_norm": 0.4643700420856476, "learning_rate": 9.200000000000001e-05, "loss": 0.1097, "step": 920 }, { "epoch": 3.470149253731343, "grad_norm": 0.472494512796402, "learning_rate": 9.300000000000001e-05, "loss": 0.1041, "step": 930 }, { "epoch": 3.5074626865671643, "grad_norm": 0.6110897660255432, "learning_rate": 9.4e-05, "loss": 0.0959, "step": 940 }, { "epoch": 3.544776119402985, "grad_norm": 0.5313069820404053, "learning_rate": 9.5e-05, "loss": 0.113, "step": 950 }, { "epoch": 3.582089552238806, "grad_norm": 0.4223133623600006, "learning_rate": 9.6e-05, "loss": 0.099, "step": 960 }, { "epoch": 3.6194029850746268, "grad_norm": 0.5464731454849243, "learning_rate": 9.7e-05, "loss": 0.1008, "step": 970 }, { "epoch": 3.656716417910448, "grad_norm": 0.3538314402103424, "learning_rate": 9.8e-05, "loss": 0.1049, "step": 980 }, { "epoch": 3.6940298507462686, "grad_norm": 0.7460148334503174, "learning_rate": 9.900000000000001e-05, "loss": 0.1088, "step": 990 }, { "epoch": 3.7313432835820897, "grad_norm": 0.3210597038269043, "learning_rate": 0.0001, "loss": 0.1041, "step": 1000 }, { "epoch": 3.7686567164179103, "grad_norm": 0.4450497627258301, "learning_rate": 9.999993165095463e-05, "loss": 0.0985, "step": 1010 }, { "epoch": 3.8059701492537314, "grad_norm": 0.4348960816860199, "learning_rate": 9.999972660400536e-05, "loss": 0.1015, "step": 1020 }, { "epoch": 3.843283582089552, "grad_norm": 0.462782621383667, "learning_rate": 9.999938485971279e-05, "loss": 0.1068, "step": 1030 }, { "epoch": 3.8805970149253732, "grad_norm": 0.3801368474960327, "learning_rate": 9.999890641901125e-05, "loss": 0.1117, "step": 1040 }, { "epoch": 3.917910447761194, "grad_norm": 0.45135366916656494, "learning_rate": 9.999829128320874e-05, "loss": 0.0917, "step": 1050 }, { "epoch": 3.955223880597015, "grad_norm": 0.41138389706611633, "learning_rate": 9.999753945398704e-05, "loss": 0.1049, "step": 1060 }, { "epoch": 3.9925373134328357, "grad_norm": 0.4976252317428589, "learning_rate": 9.999665093340165e-05, "loss": 0.1029, "step": 1070 }, { "epoch": 4.029850746268656, "grad_norm": 0.46372008323669434, "learning_rate": 9.99956257238817e-05, "loss": 0.1012, "step": 1080 }, { "epoch": 4.067164179104478, "grad_norm": 0.546938955783844, "learning_rate": 9.999446382823013e-05, "loss": 0.0829, "step": 1090 }, { "epoch": 4.104477611940299, "grad_norm": 0.40513405203819275, "learning_rate": 9.999316524962345e-05, "loss": 0.0933, "step": 1100 }, { "epoch": 4.141791044776119, "grad_norm": 0.4198484420776367, "learning_rate": 9.999172999161198e-05, "loss": 0.0895, "step": 1110 }, { "epoch": 4.17910447761194, "grad_norm": 0.3965628743171692, "learning_rate": 9.999015805811965e-05, "loss": 0.0917, "step": 1120 }, { "epoch": 4.2164179104477615, "grad_norm": 0.3095884621143341, "learning_rate": 9.998844945344405e-05, "loss": 0.0953, "step": 1130 }, { "epoch": 4.253731343283582, "grad_norm": 0.7962276339530945, "learning_rate": 9.998660418225645e-05, "loss": 0.0979, "step": 1140 }, { "epoch": 4.291044776119403, "grad_norm": 0.42066490650177, "learning_rate": 9.998462224960175e-05, "loss": 0.099, "step": 1150 }, { "epoch": 4.3283582089552235, "grad_norm": 0.3894193470478058, "learning_rate": 9.998250366089848e-05, "loss": 0.0887, "step": 1160 }, { "epoch": 4.365671641791045, "grad_norm": 0.28998032212257385, "learning_rate": 9.998024842193876e-05, "loss": 0.0943, "step": 1170 }, { "epoch": 4.402985074626866, "grad_norm": 0.3919823467731476, "learning_rate": 9.997785653888835e-05, "loss": 0.0916, "step": 1180 }, { "epoch": 4.440298507462686, "grad_norm": 0.3708650469779968, "learning_rate": 9.997532801828658e-05, "loss": 0.0858, "step": 1190 }, { "epoch": 4.477611940298507, "grad_norm": 0.2935069799423218, "learning_rate": 9.997266286704631e-05, "loss": 0.0992, "step": 1200 }, { "epoch": 4.514925373134329, "grad_norm": 0.4675377607345581, "learning_rate": 9.996986109245395e-05, "loss": 0.0854, "step": 1210 }, { "epoch": 4.552238805970149, "grad_norm": 0.31374865770339966, "learning_rate": 9.996692270216947e-05, "loss": 0.0788, "step": 1220 }, { "epoch": 4.58955223880597, "grad_norm": 0.419249951839447, "learning_rate": 9.996384770422629e-05, "loss": 0.0873, "step": 1230 }, { "epoch": 4.6268656716417915, "grad_norm": 0.26002731919288635, "learning_rate": 9.996063610703137e-05, "loss": 0.0845, "step": 1240 }, { "epoch": 4.664179104477612, "grad_norm": 0.29573896527290344, "learning_rate": 9.995728791936504e-05, "loss": 0.091, "step": 1250 }, { "epoch": 4.701492537313433, "grad_norm": 0.33090147376060486, "learning_rate": 9.995380315038119e-05, "loss": 0.0827, "step": 1260 }, { "epoch": 4.7388059701492535, "grad_norm": 0.24417485296726227, "learning_rate": 9.9950181809607e-05, "loss": 0.0859, "step": 1270 }, { "epoch": 4.776119402985074, "grad_norm": 0.48290401697158813, "learning_rate": 9.994642390694308e-05, "loss": 0.0889, "step": 1280 }, { "epoch": 4.813432835820896, "grad_norm": 0.4479697048664093, "learning_rate": 9.99425294526634e-05, "loss": 0.097, "step": 1290 }, { "epoch": 4.850746268656716, "grad_norm": 0.3560147285461426, "learning_rate": 9.993849845741524e-05, "loss": 0.0904, "step": 1300 }, { "epoch": 4.888059701492537, "grad_norm": 0.6645416617393494, "learning_rate": 9.99343309322192e-05, "loss": 0.0922, "step": 1310 }, { "epoch": 4.925373134328359, "grad_norm": 0.29696759581565857, "learning_rate": 9.993002688846913e-05, "loss": 0.093, "step": 1320 }, { "epoch": 4.962686567164179, "grad_norm": 0.47146692872047424, "learning_rate": 9.992558633793212e-05, "loss": 0.085, "step": 1330 }, { "epoch": 5.0, "grad_norm": 0.3430916368961334, "learning_rate": 9.992100929274846e-05, "loss": 0.0805, "step": 1340 }, { "epoch": 5.037313432835821, "grad_norm": 0.3205055892467499, "learning_rate": 9.991629576543163e-05, "loss": 0.0766, "step": 1350 }, { "epoch": 5.074626865671641, "grad_norm": 0.3664805293083191, "learning_rate": 9.991144576886823e-05, "loss": 0.0766, "step": 1360 }, { "epoch": 5.111940298507463, "grad_norm": 0.3753412663936615, "learning_rate": 9.990645931631796e-05, "loss": 0.0688, "step": 1370 }, { "epoch": 5.149253731343284, "grad_norm": 0.31633055210113525, "learning_rate": 9.990133642141359e-05, "loss": 0.0796, "step": 1380 }, { "epoch": 5.186567164179104, "grad_norm": 0.3355732262134552, "learning_rate": 9.989607709816091e-05, "loss": 0.0716, "step": 1390 }, { "epoch": 5.223880597014926, "grad_norm": 0.24850831925868988, "learning_rate": 9.989068136093873e-05, "loss": 0.0778, "step": 1400 }, { "epoch": 5.2611940298507465, "grad_norm": 0.29537102580070496, "learning_rate": 9.988514922449879e-05, "loss": 0.0759, "step": 1410 }, { "epoch": 5.298507462686567, "grad_norm": 0.3430945873260498, "learning_rate": 9.987948070396571e-05, "loss": 0.0774, "step": 1420 }, { "epoch": 5.335820895522388, "grad_norm": 0.5220637917518616, "learning_rate": 9.987367581483705e-05, "loss": 0.0836, "step": 1430 }, { "epoch": 5.373134328358209, "grad_norm": 0.28184008598327637, "learning_rate": 9.986773457298311e-05, "loss": 0.0752, "step": 1440 }, { "epoch": 5.41044776119403, "grad_norm": 0.36261311173439026, "learning_rate": 9.986165699464705e-05, "loss": 0.075, "step": 1450 }, { "epoch": 5.447761194029851, "grad_norm": 0.5107380151748657, "learning_rate": 9.985544309644475e-05, "loss": 0.0814, "step": 1460 }, { "epoch": 5.485074626865671, "grad_norm": 0.2446671426296234, "learning_rate": 9.984909289536473e-05, "loss": 0.0704, "step": 1470 }, { "epoch": 5.522388059701493, "grad_norm": 0.30449381470680237, "learning_rate": 9.984260640876821e-05, "loss": 0.0794, "step": 1480 }, { "epoch": 5.559701492537314, "grad_norm": 0.25645050406455994, "learning_rate": 9.983598365438902e-05, "loss": 0.0709, "step": 1490 }, { "epoch": 5.597014925373134, "grad_norm": 0.23825006186962128, "learning_rate": 9.98292246503335e-05, "loss": 0.0828, "step": 1500 }, { "epoch": 5.634328358208955, "grad_norm": 0.3259269893169403, "learning_rate": 9.98223294150805e-05, "loss": 0.0824, "step": 1510 }, { "epoch": 5.6716417910447765, "grad_norm": 0.24058914184570312, "learning_rate": 9.981529796748134e-05, "loss": 0.073, "step": 1520 }, { "epoch": 5.708955223880597, "grad_norm": 0.34457242488861084, "learning_rate": 9.980813032675974e-05, "loss": 0.0845, "step": 1530 }, { "epoch": 5.746268656716418, "grad_norm": 0.32940393686294556, "learning_rate": 9.980082651251175e-05, "loss": 0.0832, "step": 1540 }, { "epoch": 5.7835820895522385, "grad_norm": 0.5683007836341858, "learning_rate": 9.979338654470569e-05, "loss": 0.0836, "step": 1550 }, { "epoch": 5.82089552238806, "grad_norm": 0.31041061878204346, "learning_rate": 9.97858104436822e-05, "loss": 0.07, "step": 1560 }, { "epoch": 5.858208955223881, "grad_norm": 0.37858131527900696, "learning_rate": 9.977809823015401e-05, "loss": 0.0738, "step": 1570 }, { "epoch": 5.895522388059701, "grad_norm": 0.2743091583251953, "learning_rate": 9.977024992520602e-05, "loss": 0.0761, "step": 1580 }, { "epoch": 5.932835820895522, "grad_norm": 0.29117098450660706, "learning_rate": 9.976226555029522e-05, "loss": 0.0777, "step": 1590 }, { "epoch": 5.970149253731344, "grad_norm": 0.31398633122444153, "learning_rate": 9.975414512725057e-05, "loss": 0.0664, "step": 1600 }, { "epoch": 6.007462686567164, "grad_norm": 0.2684272527694702, "learning_rate": 9.974588867827301e-05, "loss": 0.0686, "step": 1610 }, { "epoch": 6.044776119402985, "grad_norm": 0.3945397436618805, "learning_rate": 9.973749622593534e-05, "loss": 0.0614, "step": 1620 }, { "epoch": 6.082089552238806, "grad_norm": 0.2747954726219177, "learning_rate": 9.972896779318219e-05, "loss": 0.0681, "step": 1630 }, { "epoch": 6.119402985074627, "grad_norm": 0.43257200717926025, "learning_rate": 9.972030340333001e-05, "loss": 0.0725, "step": 1640 }, { "epoch": 6.156716417910448, "grad_norm": 0.3559250831604004, "learning_rate": 9.97115030800669e-05, "loss": 0.0804, "step": 1650 }, { "epoch": 6.1940298507462686, "grad_norm": 0.3079264760017395, "learning_rate": 9.970256684745258e-05, "loss": 0.0649, "step": 1660 }, { "epoch": 6.231343283582089, "grad_norm": 0.32298946380615234, "learning_rate": 9.969349472991838e-05, "loss": 0.0668, "step": 1670 }, { "epoch": 6.268656716417911, "grad_norm": 0.2826225459575653, "learning_rate": 9.968428675226714e-05, "loss": 0.0734, "step": 1680 }, { "epoch": 6.3059701492537314, "grad_norm": 0.39002349972724915, "learning_rate": 9.967494293967312e-05, "loss": 0.0728, "step": 1690 }, { "epoch": 6.343283582089552, "grad_norm": 0.403890997171402, "learning_rate": 9.966546331768191e-05, "loss": 0.067, "step": 1700 }, { "epoch": 6.380597014925373, "grad_norm": 0.3755359351634979, "learning_rate": 9.965584791221048e-05, "loss": 0.0755, "step": 1710 }, { "epoch": 6.417910447761194, "grad_norm": 0.26346635818481445, "learning_rate": 9.964609674954696e-05, "loss": 0.0728, "step": 1720 }, { "epoch": 6.455223880597015, "grad_norm": 0.45292145013809204, "learning_rate": 9.963620985635065e-05, "loss": 0.0731, "step": 1730 }, { "epoch": 6.492537313432836, "grad_norm": 0.3568434715270996, "learning_rate": 9.962618725965196e-05, "loss": 0.0761, "step": 1740 }, { "epoch": 6.529850746268656, "grad_norm": 0.2551257014274597, "learning_rate": 9.961602898685226e-05, "loss": 0.0694, "step": 1750 }, { "epoch": 6.567164179104478, "grad_norm": 0.6106354594230652, "learning_rate": 9.96057350657239e-05, "loss": 0.0827, "step": 1760 }, { "epoch": 6.604477611940299, "grad_norm": 0.3226093053817749, "learning_rate": 9.959530552441005e-05, "loss": 0.0716, "step": 1770 }, { "epoch": 6.641791044776119, "grad_norm": 0.4297254979610443, "learning_rate": 9.95847403914247e-05, "loss": 0.0748, "step": 1780 }, { "epoch": 6.67910447761194, "grad_norm": 0.26469680666923523, "learning_rate": 9.95740396956525e-05, "loss": 0.074, "step": 1790 }, { "epoch": 6.7164179104477615, "grad_norm": 0.22717897593975067, "learning_rate": 9.956320346634876e-05, "loss": 0.0739, "step": 1800 }, { "epoch": 6.753731343283582, "grad_norm": 0.4513498544692993, "learning_rate": 9.955223173313931e-05, "loss": 0.0664, "step": 1810 }, { "epoch": 6.791044776119403, "grad_norm": 0.31683439016342163, "learning_rate": 9.954112452602045e-05, "loss": 0.069, "step": 1820 }, { "epoch": 6.8283582089552235, "grad_norm": 0.3350532650947571, "learning_rate": 9.952988187535886e-05, "loss": 0.0699, "step": 1830 }, { "epoch": 6.865671641791045, "grad_norm": 0.29829463362693787, "learning_rate": 9.95185038118915e-05, "loss": 0.0663, "step": 1840 }, { "epoch": 6.902985074626866, "grad_norm": 0.31650781631469727, "learning_rate": 9.950699036672559e-05, "loss": 0.0668, "step": 1850 }, { "epoch": 6.940298507462686, "grad_norm": 0.360944926738739, "learning_rate": 9.949534157133844e-05, "loss": 0.0696, "step": 1860 }, { "epoch": 6.977611940298507, "grad_norm": 0.31337013840675354, "learning_rate": 9.948355745757741e-05, "loss": 0.073, "step": 1870 }, { "epoch": 7.014925373134329, "grad_norm": 0.4675919711589813, "learning_rate": 9.94716380576598e-05, "loss": 0.0688, "step": 1880 }, { "epoch": 7.052238805970149, "grad_norm": 0.3031919002532959, "learning_rate": 9.945958340417283e-05, "loss": 0.0596, "step": 1890 }, { "epoch": 7.08955223880597, "grad_norm": 0.24858474731445312, "learning_rate": 9.944739353007344e-05, "loss": 0.0717, "step": 1900 }, { "epoch": 7.126865671641791, "grad_norm": 0.20959483087062836, "learning_rate": 9.943506846868826e-05, "loss": 0.0694, "step": 1910 }, { "epoch": 7.164179104477612, "grad_norm": 0.35621434450149536, "learning_rate": 9.942260825371358e-05, "loss": 0.063, "step": 1920 }, { "epoch": 7.201492537313433, "grad_norm": 0.3462587594985962, "learning_rate": 9.941001291921512e-05, "loss": 0.068, "step": 1930 }, { "epoch": 7.2388059701492535, "grad_norm": 0.38649681210517883, "learning_rate": 9.939728249962807e-05, "loss": 0.0638, "step": 1940 }, { "epoch": 7.276119402985074, "grad_norm": 0.29564595222473145, "learning_rate": 9.938441702975689e-05, "loss": 0.0626, "step": 1950 }, { "epoch": 7.313432835820896, "grad_norm": 0.339857816696167, "learning_rate": 9.937141654477528e-05, "loss": 0.0535, "step": 1960 }, { "epoch": 7.350746268656716, "grad_norm": 0.2591215670108795, "learning_rate": 9.93582810802261e-05, "loss": 0.0645, "step": 1970 }, { "epoch": 7.388059701492537, "grad_norm": 0.30237796902656555, "learning_rate": 9.934501067202117e-05, "loss": 0.0675, "step": 1980 }, { "epoch": 7.425373134328359, "grad_norm": 0.28394174575805664, "learning_rate": 9.93316053564413e-05, "loss": 0.0643, "step": 1990 }, { "epoch": 7.462686567164179, "grad_norm": 0.3124663233757019, "learning_rate": 9.931806517013612e-05, "loss": 0.059, "step": 2000 }, { "epoch": 7.5, "grad_norm": 0.36073037981987, "learning_rate": 9.930439015012396e-05, "loss": 0.0606, "step": 2010 }, { "epoch": 7.537313432835821, "grad_norm": 0.4091481864452362, "learning_rate": 9.929058033379181e-05, "loss": 0.0603, "step": 2020 }, { "epoch": 7.574626865671641, "grad_norm": 0.44718074798583984, "learning_rate": 9.927663575889521e-05, "loss": 0.0741, "step": 2030 }, { "epoch": 7.611940298507463, "grad_norm": 0.3819601833820343, "learning_rate": 9.926255646355804e-05, "loss": 0.0707, "step": 2040 }, { "epoch": 7.649253731343284, "grad_norm": 0.23336420953273773, "learning_rate": 9.92483424862726e-05, "loss": 0.0676, "step": 2050 }, { "epoch": 7.686567164179104, "grad_norm": 0.24415315687656403, "learning_rate": 9.923399386589933e-05, "loss": 0.0594, "step": 2060 }, { "epoch": 7.723880597014926, "grad_norm": 0.3735473155975342, "learning_rate": 9.921951064166684e-05, "loss": 0.062, "step": 2070 }, { "epoch": 7.7611940298507465, "grad_norm": 0.31629472970962524, "learning_rate": 9.92048928531717e-05, "loss": 0.0606, "step": 2080 }, { "epoch": 7.798507462686567, "grad_norm": 0.37902557849884033, "learning_rate": 9.919014054037836e-05, "loss": 0.0584, "step": 2090 }, { "epoch": 7.835820895522388, "grad_norm": 0.3486720323562622, "learning_rate": 9.917525374361912e-05, "loss": 0.056, "step": 2100 }, { "epoch": 7.8731343283582085, "grad_norm": 0.3731362521648407, "learning_rate": 9.91602325035939e-05, "loss": 0.0601, "step": 2110 }, { "epoch": 7.91044776119403, "grad_norm": 0.3560399115085602, "learning_rate": 9.914507686137019e-05, "loss": 0.06, "step": 2120 }, { "epoch": 7.947761194029851, "grad_norm": 0.30075564980506897, "learning_rate": 9.912978685838294e-05, "loss": 0.0657, "step": 2130 }, { "epoch": 7.985074626865671, "grad_norm": 0.2984028458595276, "learning_rate": 9.911436253643445e-05, "loss": 0.0587, "step": 2140 }, { "epoch": 8.022388059701493, "grad_norm": 0.1980169117450714, "learning_rate": 9.90988039376942e-05, "loss": 0.0718, "step": 2150 }, { "epoch": 8.059701492537313, "grad_norm": 0.31339579820632935, "learning_rate": 9.90831111046988e-05, "loss": 0.0557, "step": 2160 }, { "epoch": 8.097014925373134, "grad_norm": 0.1968696266412735, "learning_rate": 9.90672840803519e-05, "loss": 0.0571, "step": 2170 }, { "epoch": 8.134328358208956, "grad_norm": 0.23931682109832764, "learning_rate": 9.905132290792394e-05, "loss": 0.0566, "step": 2180 }, { "epoch": 8.171641791044776, "grad_norm": 0.21741189062595367, "learning_rate": 9.903522763105218e-05, "loss": 0.0575, "step": 2190 }, { "epoch": 8.208955223880597, "grad_norm": 0.22874368727207184, "learning_rate": 9.901899829374047e-05, "loss": 0.0565, "step": 2200 }, { "epoch": 8.246268656716419, "grad_norm": 0.3441888093948364, "learning_rate": 9.900263494035921e-05, "loss": 0.0565, "step": 2210 }, { "epoch": 8.283582089552239, "grad_norm": 0.2539830803871155, "learning_rate": 9.89861376156452e-05, "loss": 0.0538, "step": 2220 }, { "epoch": 8.32089552238806, "grad_norm": 0.2235102653503418, "learning_rate": 9.896950636470147e-05, "loss": 0.0609, "step": 2230 }, { "epoch": 8.35820895522388, "grad_norm": 0.1941322684288025, "learning_rate": 9.895274123299723e-05, "loss": 0.0562, "step": 2240 }, { "epoch": 8.395522388059701, "grad_norm": 0.2691369950771332, "learning_rate": 9.893584226636772e-05, "loss": 0.0608, "step": 2250 }, { "epoch": 8.432835820895523, "grad_norm": 0.24730461835861206, "learning_rate": 9.891880951101407e-05, "loss": 0.0582, "step": 2260 }, { "epoch": 8.470149253731343, "grad_norm": 0.34785839915275574, "learning_rate": 9.890164301350318e-05, "loss": 0.0506, "step": 2270 }, { "epoch": 8.507462686567164, "grad_norm": 0.3625825345516205, "learning_rate": 9.888434282076758e-05, "loss": 0.0614, "step": 2280 }, { "epoch": 8.544776119402986, "grad_norm": 0.25210148096084595, "learning_rate": 9.886690898010535e-05, "loss": 0.0611, "step": 2290 }, { "epoch": 8.582089552238806, "grad_norm": 0.27312466502189636, "learning_rate": 9.884934153917997e-05, "loss": 0.0537, "step": 2300 }, { "epoch": 8.619402985074627, "grad_norm": 0.314647912979126, "learning_rate": 9.883164054602012e-05, "loss": 0.0602, "step": 2310 }, { "epoch": 8.656716417910447, "grad_norm": 0.21531912684440613, "learning_rate": 9.881380604901964e-05, "loss": 0.0552, "step": 2320 }, { "epoch": 8.694029850746269, "grad_norm": 0.23920664191246033, "learning_rate": 9.879583809693738e-05, "loss": 0.0613, "step": 2330 }, { "epoch": 8.73134328358209, "grad_norm": 0.21864956617355347, "learning_rate": 9.877773673889701e-05, "loss": 0.0649, "step": 2340 }, { "epoch": 8.76865671641791, "grad_norm": 0.27523377537727356, "learning_rate": 9.8759502024387e-05, "loss": 0.0606, "step": 2350 }, { "epoch": 8.805970149253731, "grad_norm": 0.24805469810962677, "learning_rate": 9.87411340032603e-05, "loss": 0.0549, "step": 2360 }, { "epoch": 8.843283582089553, "grad_norm": 0.23070092499256134, "learning_rate": 9.872263272573443e-05, "loss": 0.0562, "step": 2370 }, { "epoch": 8.880597014925373, "grad_norm": 0.20833946764469147, "learning_rate": 9.870399824239117e-05, "loss": 0.05, "step": 2380 }, { "epoch": 8.917910447761194, "grad_norm": 0.34507372975349426, "learning_rate": 9.868523060417646e-05, "loss": 0.0613, "step": 2390 }, { "epoch": 8.955223880597014, "grad_norm": 0.32865110039711, "learning_rate": 9.86663298624003e-05, "loss": 0.0621, "step": 2400 }, { "epoch": 8.992537313432836, "grad_norm": 0.21305270493030548, "learning_rate": 9.864729606873663e-05, "loss": 0.0572, "step": 2410 }, { "epoch": 9.029850746268657, "grad_norm": 0.28193730115890503, "learning_rate": 9.862812927522309e-05, "loss": 0.0555, "step": 2420 }, { "epoch": 9.067164179104477, "grad_norm": 0.3953789472579956, "learning_rate": 9.860882953426099e-05, "loss": 0.0536, "step": 2430 }, { "epoch": 9.104477611940299, "grad_norm": 0.23013322055339813, "learning_rate": 9.858939689861506e-05, "loss": 0.0572, "step": 2440 }, { "epoch": 9.14179104477612, "grad_norm": 0.2906680107116699, "learning_rate": 9.856983142141339e-05, "loss": 0.0592, "step": 2450 }, { "epoch": 9.17910447761194, "grad_norm": 0.23490828275680542, "learning_rate": 9.855013315614725e-05, "loss": 0.0583, "step": 2460 }, { "epoch": 9.216417910447761, "grad_norm": 0.22825880348682404, "learning_rate": 9.853030215667093e-05, "loss": 0.059, "step": 2470 }, { "epoch": 9.253731343283581, "grad_norm": 0.25871285796165466, "learning_rate": 9.851033847720166e-05, "loss": 0.0555, "step": 2480 }, { "epoch": 9.291044776119403, "grad_norm": 0.27220776677131653, "learning_rate": 9.849024217231935e-05, "loss": 0.0542, "step": 2490 }, { "epoch": 9.328358208955224, "grad_norm": 0.26534005999565125, "learning_rate": 9.847001329696653e-05, "loss": 0.0526, "step": 2500 }, { "epoch": 9.365671641791044, "grad_norm": 0.33486032485961914, "learning_rate": 9.844965190644817e-05, "loss": 0.0563, "step": 2510 }, { "epoch": 9.402985074626866, "grad_norm": 0.2949483394622803, "learning_rate": 9.842915805643155e-05, "loss": 0.0556, "step": 2520 }, { "epoch": 9.440298507462687, "grad_norm": 0.24123981595039368, "learning_rate": 9.840853180294608e-05, "loss": 0.05, "step": 2530 }, { "epoch": 9.477611940298507, "grad_norm": 0.22536049783229828, "learning_rate": 9.838777320238312e-05, "loss": 0.0522, "step": 2540 }, { "epoch": 9.514925373134329, "grad_norm": 0.23206663131713867, "learning_rate": 9.836688231149592e-05, "loss": 0.0591, "step": 2550 }, { "epoch": 9.552238805970148, "grad_norm": 0.28573134541511536, "learning_rate": 9.834585918739936e-05, "loss": 0.0568, "step": 2560 }, { "epoch": 9.58955223880597, "grad_norm": 0.2628820538520813, "learning_rate": 9.832470388756987e-05, "loss": 0.0571, "step": 2570 }, { "epoch": 9.626865671641792, "grad_norm": 0.2880440652370453, "learning_rate": 9.830341646984521e-05, "loss": 0.0559, "step": 2580 }, { "epoch": 9.664179104477611, "grad_norm": 0.1786259263753891, "learning_rate": 9.82819969924244e-05, "loss": 0.058, "step": 2590 }, { "epoch": 9.701492537313433, "grad_norm": 0.3501608073711395, "learning_rate": 9.826044551386744e-05, "loss": 0.0523, "step": 2600 }, { "epoch": 9.738805970149254, "grad_norm": 0.24757252633571625, "learning_rate": 9.823876209309527e-05, "loss": 0.0587, "step": 2610 }, { "epoch": 9.776119402985074, "grad_norm": 0.2556290626525879, "learning_rate": 9.821694678938953e-05, "loss": 0.0555, "step": 2620 }, { "epoch": 9.813432835820896, "grad_norm": 0.2561217248439789, "learning_rate": 9.819499966239243e-05, "loss": 0.052, "step": 2630 }, { "epoch": 9.850746268656717, "grad_norm": 0.2776634097099304, "learning_rate": 9.817292077210659e-05, "loss": 0.0498, "step": 2640 }, { "epoch": 9.888059701492537, "grad_norm": 0.20668549835681915, "learning_rate": 9.815071017889482e-05, "loss": 0.0517, "step": 2650 }, { "epoch": 9.925373134328359, "grad_norm": 0.3100263178348541, "learning_rate": 9.812836794348004e-05, "loss": 0.0633, "step": 2660 }, { "epoch": 9.962686567164178, "grad_norm": 0.2780782878398895, "learning_rate": 9.81058941269451e-05, "loss": 0.0581, "step": 2670 }, { "epoch": 10.0, "grad_norm": 0.28903728723526, "learning_rate": 9.808328879073251e-05, "loss": 0.0538, "step": 2680 }, { "epoch": 10.037313432835822, "grad_norm": 0.22727562487125397, "learning_rate": 9.806055199664446e-05, "loss": 0.0491, "step": 2690 }, { "epoch": 10.074626865671641, "grad_norm": 0.267918199300766, "learning_rate": 9.803768380684242e-05, "loss": 0.0562, "step": 2700 }, { "epoch": 10.111940298507463, "grad_norm": 0.2988606095314026, "learning_rate": 9.801468428384716e-05, "loss": 0.0566, "step": 2710 }, { "epoch": 10.149253731343283, "grad_norm": 0.2710281312465668, "learning_rate": 9.799155349053851e-05, "loss": 0.0541, "step": 2720 }, { "epoch": 10.186567164179104, "grad_norm": 0.15320520102977753, "learning_rate": 9.796829149015517e-05, "loss": 0.0548, "step": 2730 }, { "epoch": 10.223880597014926, "grad_norm": 0.2653089463710785, "learning_rate": 9.794489834629455e-05, "loss": 0.0599, "step": 2740 }, { "epoch": 10.261194029850746, "grad_norm": 0.19223959743976593, "learning_rate": 9.792137412291265e-05, "loss": 0.0494, "step": 2750 }, { "epoch": 10.298507462686567, "grad_norm": 0.20455987751483917, "learning_rate": 9.789771888432375e-05, "loss": 0.0538, "step": 2760 }, { "epoch": 10.335820895522389, "grad_norm": 0.24908749759197235, "learning_rate": 9.787393269520039e-05, "loss": 0.0481, "step": 2770 }, { "epoch": 10.373134328358208, "grad_norm": 0.3131813406944275, "learning_rate": 9.785001562057309e-05, "loss": 0.0526, "step": 2780 }, { "epoch": 10.41044776119403, "grad_norm": 0.24828971922397614, "learning_rate": 9.782596772583026e-05, "loss": 0.0489, "step": 2790 }, { "epoch": 10.447761194029852, "grad_norm": 0.21727119386196136, "learning_rate": 9.780178907671789e-05, "loss": 0.0532, "step": 2800 }, { "epoch": 10.485074626865671, "grad_norm": 0.20279547572135925, "learning_rate": 9.777747973933948e-05, "loss": 0.0565, "step": 2810 }, { "epoch": 10.522388059701493, "grad_norm": 0.17726702988147736, "learning_rate": 9.775303978015585e-05, "loss": 0.0437, "step": 2820 }, { "epoch": 10.559701492537313, "grad_norm": 0.18961119651794434, "learning_rate": 9.772846926598491e-05, "loss": 0.0584, "step": 2830 }, { "epoch": 10.597014925373134, "grad_norm": 0.2498980015516281, "learning_rate": 9.77037682640015e-05, "loss": 0.0496, "step": 2840 }, { "epoch": 10.634328358208956, "grad_norm": 0.16978798806667328, "learning_rate": 9.767893684173721e-05, "loss": 0.0469, "step": 2850 }, { "epoch": 10.671641791044776, "grad_norm": 0.16128584742546082, "learning_rate": 9.765397506708023e-05, "loss": 0.0533, "step": 2860 }, { "epoch": 10.708955223880597, "grad_norm": 0.20463155210018158, "learning_rate": 9.762888300827507e-05, "loss": 0.0464, "step": 2870 }, { "epoch": 10.746268656716419, "grad_norm": 0.30601629614830017, "learning_rate": 9.760366073392246e-05, "loss": 0.0489, "step": 2880 }, { "epoch": 10.783582089552239, "grad_norm": 0.2730671763420105, "learning_rate": 9.757830831297914e-05, "loss": 0.0495, "step": 2890 }, { "epoch": 10.82089552238806, "grad_norm": 0.251432865858078, "learning_rate": 9.755282581475769e-05, "loss": 0.0549, "step": 2900 }, { "epoch": 10.85820895522388, "grad_norm": 0.26670166850090027, "learning_rate": 9.752721330892624e-05, "loss": 0.061, "step": 2910 }, { "epoch": 10.895522388059701, "grad_norm": 0.2965967655181885, "learning_rate": 9.750147086550844e-05, "loss": 0.0473, "step": 2920 }, { "epoch": 10.932835820895523, "grad_norm": 0.683840274810791, "learning_rate": 9.747559855488313e-05, "loss": 0.0509, "step": 2930 }, { "epoch": 10.970149253731343, "grad_norm": 0.25740495324134827, "learning_rate": 9.744959644778422e-05, "loss": 0.0515, "step": 2940 }, { "epoch": 11.007462686567164, "grad_norm": 0.2880542278289795, "learning_rate": 9.742346461530048e-05, "loss": 0.0482, "step": 2950 }, { "epoch": 11.044776119402986, "grad_norm": 0.45032551884651184, "learning_rate": 9.739720312887535e-05, "loss": 0.0557, "step": 2960 }, { "epoch": 11.082089552238806, "grad_norm": 0.2829900085926056, "learning_rate": 9.73708120603067e-05, "loss": 0.052, "step": 2970 }, { "epoch": 11.119402985074627, "grad_norm": 0.309597373008728, "learning_rate": 9.734429148174675e-05, "loss": 0.0541, "step": 2980 }, { "epoch": 11.156716417910447, "grad_norm": 0.2433389127254486, "learning_rate": 9.731764146570173e-05, "loss": 0.0482, "step": 2990 }, { "epoch": 11.194029850746269, "grad_norm": 0.24458132684230804, "learning_rate": 9.729086208503174e-05, "loss": 0.0505, "step": 3000 }, { "epoch": 11.23134328358209, "grad_norm": 0.2305087298154831, "learning_rate": 9.726395341295062e-05, "loss": 0.0504, "step": 3010 }, { "epoch": 11.26865671641791, "grad_norm": 0.18110457062721252, "learning_rate": 9.723691552302562e-05, "loss": 0.0575, "step": 3020 }, { "epoch": 11.305970149253731, "grad_norm": 0.20407621562480927, "learning_rate": 9.720974848917735e-05, "loss": 0.0494, "step": 3030 }, { "epoch": 11.343283582089553, "grad_norm": 0.25924697518348694, "learning_rate": 9.718245238567939e-05, "loss": 0.0472, "step": 3040 }, { "epoch": 11.380597014925373, "grad_norm": 0.23041822016239166, "learning_rate": 9.715502728715826e-05, "loss": 0.0481, "step": 3050 }, { "epoch": 11.417910447761194, "grad_norm": 0.25381171703338623, "learning_rate": 9.712747326859315e-05, "loss": 0.0543, "step": 3060 }, { "epoch": 11.455223880597014, "grad_norm": 0.18027640879154205, "learning_rate": 9.709979040531569e-05, "loss": 0.055, "step": 3070 }, { "epoch": 11.492537313432836, "grad_norm": 0.2954868674278259, "learning_rate": 9.707197877300974e-05, "loss": 0.0473, "step": 3080 }, { "epoch": 11.529850746268657, "grad_norm": 0.25323861837387085, "learning_rate": 9.704403844771128e-05, "loss": 0.0509, "step": 3090 }, { "epoch": 11.567164179104477, "grad_norm": 0.36910176277160645, "learning_rate": 9.701596950580806e-05, "loss": 0.0504, "step": 3100 }, { "epoch": 11.604477611940299, "grad_norm": 0.34199246764183044, "learning_rate": 9.698777202403953e-05, "loss": 0.0526, "step": 3110 }, { "epoch": 11.64179104477612, "grad_norm": 0.2146557718515396, "learning_rate": 9.695944607949649e-05, "loss": 0.0579, "step": 3120 }, { "epoch": 11.67910447761194, "grad_norm": 0.20559175312519073, "learning_rate": 9.693099174962103e-05, "loss": 0.0514, "step": 3130 }, { "epoch": 11.716417910447761, "grad_norm": 0.2689419090747833, "learning_rate": 9.690240911220618e-05, "loss": 0.0534, "step": 3140 }, { "epoch": 11.753731343283581, "grad_norm": 0.34870603680610657, "learning_rate": 9.687369824539577e-05, "loss": 0.0485, "step": 3150 }, { "epoch": 11.791044776119403, "grad_norm": 0.15433363616466522, "learning_rate": 9.684485922768422e-05, "loss": 0.0418, "step": 3160 }, { "epoch": 11.828358208955224, "grad_norm": 0.26874423027038574, "learning_rate": 9.681589213791633e-05, "loss": 0.0537, "step": 3170 }, { "epoch": 11.865671641791044, "grad_norm": 0.3361654281616211, "learning_rate": 9.6786797055287e-05, "loss": 0.0474, "step": 3180 }, { "epoch": 11.902985074626866, "grad_norm": 0.17938771843910217, "learning_rate": 9.675757405934103e-05, "loss": 0.0443, "step": 3190 }, { "epoch": 11.940298507462687, "grad_norm": 0.31368622183799744, "learning_rate": 9.672822322997305e-05, "loss": 0.0594, "step": 3200 }, { "epoch": 11.977611940298507, "grad_norm": 0.16268151998519897, "learning_rate": 9.669874464742705e-05, "loss": 0.0487, "step": 3210 }, { "epoch": 12.014925373134329, "grad_norm": 0.23879969120025635, "learning_rate": 9.66691383922964e-05, "loss": 0.0484, "step": 3220 }, { "epoch": 12.052238805970148, "grad_norm": 0.2321789413690567, "learning_rate": 9.663940454552342e-05, "loss": 0.051, "step": 3230 }, { "epoch": 12.08955223880597, "grad_norm": 0.22873088717460632, "learning_rate": 9.660954318839933e-05, "loss": 0.0406, "step": 3240 }, { "epoch": 12.126865671641792, "grad_norm": 0.3767557740211487, "learning_rate": 9.657955440256395e-05, "loss": 0.0432, "step": 3250 }, { "epoch": 12.164179104477611, "grad_norm": 0.21569453179836273, "learning_rate": 9.654943827000548e-05, "loss": 0.0528, "step": 3260 }, { "epoch": 12.201492537313433, "grad_norm": 0.23698291182518005, "learning_rate": 9.651919487306025e-05, "loss": 0.0457, "step": 3270 }, { "epoch": 12.238805970149254, "grad_norm": 0.21086478233337402, "learning_rate": 9.648882429441257e-05, "loss": 0.0508, "step": 3280 }, { "epoch": 12.276119402985074, "grad_norm": 0.19763463735580444, "learning_rate": 9.645832661709444e-05, "loss": 0.0497, "step": 3290 }, { "epoch": 12.313432835820896, "grad_norm": 0.18413852155208588, "learning_rate": 9.642770192448536e-05, "loss": 0.0441, "step": 3300 }, { "epoch": 12.350746268656717, "grad_norm": 0.13946911692619324, "learning_rate": 9.639695030031204e-05, "loss": 0.0453, "step": 3310 }, { "epoch": 12.388059701492537, "grad_norm": 0.21613670885562897, "learning_rate": 9.636607182864827e-05, "loss": 0.0511, "step": 3320 }, { "epoch": 12.425373134328359, "grad_norm": 0.24953646957874298, "learning_rate": 9.63350665939146e-05, "loss": 0.0451, "step": 3330 }, { "epoch": 12.462686567164178, "grad_norm": 0.2993795871734619, "learning_rate": 9.630393468087818e-05, "loss": 0.0469, "step": 3340 }, { "epoch": 12.5, "grad_norm": 0.2261819839477539, "learning_rate": 9.627267617465243e-05, "loss": 0.0484, "step": 3350 }, { "epoch": 12.537313432835822, "grad_norm": 0.23026186227798462, "learning_rate": 9.624129116069694e-05, "loss": 0.0452, "step": 3360 }, { "epoch": 12.574626865671641, "grad_norm": 0.27859947085380554, "learning_rate": 9.620977972481716e-05, "loss": 0.0593, "step": 3370 }, { "epoch": 12.611940298507463, "grad_norm": 0.23060785233974457, "learning_rate": 9.617814195316411e-05, "loss": 0.05, "step": 3380 }, { "epoch": 12.649253731343283, "grad_norm": 0.20185025036334991, "learning_rate": 9.614637793223425e-05, "loss": 0.0573, "step": 3390 }, { "epoch": 12.686567164179104, "grad_norm": 0.3584498167037964, "learning_rate": 9.611448774886924e-05, "loss": 0.052, "step": 3400 }, { "epoch": 12.723880597014926, "grad_norm": 0.19336827099323273, "learning_rate": 9.60824714902556e-05, "loss": 0.0535, "step": 3410 }, { "epoch": 12.761194029850746, "grad_norm": 0.22223635017871857, "learning_rate": 9.605032924392457e-05, "loss": 0.05, "step": 3420 }, { "epoch": 12.798507462686567, "grad_norm": 0.17108851671218872, "learning_rate": 9.601806109775179e-05, "loss": 0.0475, "step": 3430 }, { "epoch": 12.835820895522389, "grad_norm": 0.3861902952194214, "learning_rate": 9.598566713995718e-05, "loss": 0.0439, "step": 3440 }, { "epoch": 12.873134328358208, "grad_norm": 0.18927253782749176, "learning_rate": 9.595314745910456e-05, "loss": 0.052, "step": 3450 }, { "epoch": 12.91044776119403, "grad_norm": 0.21963383257389069, "learning_rate": 9.59205021441015e-05, "loss": 0.0504, "step": 3460 }, { "epoch": 12.947761194029852, "grad_norm": 0.18016670644283295, "learning_rate": 9.588773128419906e-05, "loss": 0.0467, "step": 3470 }, { "epoch": 12.985074626865671, "grad_norm": 0.1776365041732788, "learning_rate": 9.58548349689915e-05, "loss": 0.0414, "step": 3480 }, { "epoch": 13.022388059701493, "grad_norm": 0.2616482973098755, "learning_rate": 9.582181328841611e-05, "loss": 0.0442, "step": 3490 }, { "epoch": 13.059701492537313, "grad_norm": 0.20341171324253082, "learning_rate": 9.578866633275288e-05, "loss": 0.0533, "step": 3500 }, { "epoch": 13.097014925373134, "grad_norm": 0.2223699688911438, "learning_rate": 9.575539419262434e-05, "loss": 0.0458, "step": 3510 }, { "epoch": 13.134328358208956, "grad_norm": 0.22557464241981506, "learning_rate": 9.572199695899522e-05, "loss": 0.0445, "step": 3520 }, { "epoch": 13.171641791044776, "grad_norm": 0.25104308128356934, "learning_rate": 9.568847472317232e-05, "loss": 0.0435, "step": 3530 }, { "epoch": 13.208955223880597, "grad_norm": 0.18720711767673492, "learning_rate": 9.565482757680415e-05, "loss": 0.0453, "step": 3540 }, { "epoch": 13.246268656716419, "grad_norm": 0.16838951408863068, "learning_rate": 9.562105561188069e-05, "loss": 0.0505, "step": 3550 }, { "epoch": 13.283582089552239, "grad_norm": 0.31681734323501587, "learning_rate": 9.558715892073323e-05, "loss": 0.0494, "step": 3560 }, { "epoch": 13.32089552238806, "grad_norm": 0.2390700727701187, "learning_rate": 9.555313759603402e-05, "loss": 0.0538, "step": 3570 }, { "epoch": 13.35820895522388, "grad_norm": 0.20680709183216095, "learning_rate": 9.551899173079607e-05, "loss": 0.0519, "step": 3580 }, { "epoch": 13.395522388059701, "grad_norm": 0.2758580148220062, "learning_rate": 9.548472141837286e-05, "loss": 0.0512, "step": 3590 }, { "epoch": 13.432835820895523, "grad_norm": 0.3653097450733185, "learning_rate": 9.545032675245813e-05, "loss": 0.0496, "step": 3600 }, { "epoch": 13.470149253731343, "grad_norm": 0.23886866867542267, "learning_rate": 9.541580782708557e-05, "loss": 0.0455, "step": 3610 }, { "epoch": 13.507462686567164, "grad_norm": 0.3280908465385437, "learning_rate": 9.538116473662861e-05, "loss": 0.0489, "step": 3620 }, { "epoch": 13.544776119402986, "grad_norm": 0.20268180966377258, "learning_rate": 9.534639757580013e-05, "loss": 0.0484, "step": 3630 }, { "epoch": 13.582089552238806, "grad_norm": 0.2582015097141266, "learning_rate": 9.531150643965223e-05, "loss": 0.0487, "step": 3640 }, { "epoch": 13.619402985074627, "grad_norm": 0.18157973885536194, "learning_rate": 9.527649142357596e-05, "loss": 0.0496, "step": 3650 }, { "epoch": 13.656716417910447, "grad_norm": 0.22841542959213257, "learning_rate": 9.524135262330098e-05, "loss": 0.0467, "step": 3660 }, { "epoch": 13.694029850746269, "grad_norm": 0.2519935369491577, "learning_rate": 9.520609013489547e-05, "loss": 0.0487, "step": 3670 }, { "epoch": 13.73134328358209, "grad_norm": 0.24680495262145996, "learning_rate": 9.517070405476575e-05, "loss": 0.0457, "step": 3680 }, { "epoch": 13.76865671641791, "grad_norm": 0.26362067461013794, "learning_rate": 9.513519447965595e-05, "loss": 0.0495, "step": 3690 }, { "epoch": 13.805970149253731, "grad_norm": 0.3240712583065033, "learning_rate": 9.509956150664796e-05, "loss": 0.0496, "step": 3700 }, { "epoch": 13.843283582089553, "grad_norm": 0.21009013056755066, "learning_rate": 9.50638052331609e-05, "loss": 0.0457, "step": 3710 }, { "epoch": 13.880597014925373, "grad_norm": 0.1669154316186905, "learning_rate": 9.502792575695112e-05, "loss": 0.0496, "step": 3720 }, { "epoch": 13.917910447761194, "grad_norm": 0.22347605228424072, "learning_rate": 9.499192317611167e-05, "loss": 0.0426, "step": 3730 }, { "epoch": 13.955223880597014, "grad_norm": 0.15208907425403595, "learning_rate": 9.49557975890723e-05, "loss": 0.0447, "step": 3740 }, { "epoch": 13.992537313432836, "grad_norm": 0.3206101059913635, "learning_rate": 9.491954909459895e-05, "loss": 0.0471, "step": 3750 }, { "epoch": 14.029850746268657, "grad_norm": 0.15873713791370392, "learning_rate": 9.488317779179361e-05, "loss": 0.0401, "step": 3760 }, { "epoch": 14.067164179104477, "grad_norm": 0.19690357148647308, "learning_rate": 9.484668378009408e-05, "loss": 0.0491, "step": 3770 }, { "epoch": 14.104477611940299, "grad_norm": 0.3211113214492798, "learning_rate": 9.481006715927351e-05, "loss": 0.049, "step": 3780 }, { "epoch": 14.14179104477612, "grad_norm": 0.27657604217529297, "learning_rate": 9.477332802944044e-05, "loss": 0.0396, "step": 3790 }, { "epoch": 14.17910447761194, "grad_norm": 0.20194031298160553, "learning_rate": 9.473646649103818e-05, "loss": 0.0442, "step": 3800 }, { "epoch": 14.216417910447761, "grad_norm": 0.20344595611095428, "learning_rate": 9.46994826448448e-05, "loss": 0.0427, "step": 3810 }, { "epoch": 14.253731343283581, "grad_norm": 0.2067718505859375, "learning_rate": 9.46623765919727e-05, "loss": 0.0501, "step": 3820 }, { "epoch": 14.291044776119403, "grad_norm": 0.29719170928001404, "learning_rate": 9.462514843386845e-05, "loss": 0.0519, "step": 3830 }, { "epoch": 14.328358208955224, "grad_norm": 0.2347182184457779, "learning_rate": 9.458779827231237e-05, "loss": 0.0413, "step": 3840 }, { "epoch": 14.365671641791044, "grad_norm": 0.1558852344751358, "learning_rate": 9.45503262094184e-05, "loss": 0.0442, "step": 3850 }, { "epoch": 14.402985074626866, "grad_norm": 0.23085005581378937, "learning_rate": 9.451273234763371e-05, "loss": 0.047, "step": 3860 }, { "epoch": 14.440298507462687, "grad_norm": 0.1515151560306549, "learning_rate": 9.447501678973852e-05, "loss": 0.0481, "step": 3870 }, { "epoch": 14.477611940298507, "grad_norm": 0.1916729211807251, "learning_rate": 9.443717963884569e-05, "loss": 0.0474, "step": 3880 }, { "epoch": 14.514925373134329, "grad_norm": 0.2536492943763733, "learning_rate": 9.439922099840054e-05, "loss": 0.0382, "step": 3890 }, { "epoch": 14.552238805970148, "grad_norm": 0.1672086864709854, "learning_rate": 9.43611409721806e-05, "loss": 0.0497, "step": 3900 }, { "epoch": 14.58955223880597, "grad_norm": 0.3644237518310547, "learning_rate": 9.432293966429514e-05, "loss": 0.0444, "step": 3910 }, { "epoch": 14.626865671641792, "grad_norm": 0.20307251811027527, "learning_rate": 9.428461717918511e-05, "loss": 0.0452, "step": 3920 }, { "epoch": 14.664179104477611, "grad_norm": 0.20441733300685883, "learning_rate": 9.424617362162271e-05, "loss": 0.0454, "step": 3930 }, { "epoch": 14.701492537313433, "grad_norm": 0.26315611600875854, "learning_rate": 9.420760909671118e-05, "loss": 0.0486, "step": 3940 }, { "epoch": 14.738805970149254, "grad_norm": 0.1983092874288559, "learning_rate": 9.416892370988444e-05, "loss": 0.0483, "step": 3950 }, { "epoch": 14.776119402985074, "grad_norm": 0.18301443755626678, "learning_rate": 9.413011756690685e-05, "loss": 0.0456, "step": 3960 }, { "epoch": 14.813432835820896, "grad_norm": 0.2433597594499588, "learning_rate": 9.409119077387294e-05, "loss": 0.0463, "step": 3970 }, { "epoch": 14.850746268656717, "grad_norm": 0.27949392795562744, "learning_rate": 9.405214343720707e-05, "loss": 0.0412, "step": 3980 }, { "epoch": 14.888059701492537, "grad_norm": 0.22806599736213684, "learning_rate": 9.401297566366318e-05, "loss": 0.0448, "step": 3990 }, { "epoch": 14.925373134328359, "grad_norm": 0.25421562790870667, "learning_rate": 9.397368756032445e-05, "loss": 0.0426, "step": 4000 }, { "epoch": 14.962686567164178, "grad_norm": 0.2436474859714508, "learning_rate": 9.393427923460308e-05, "loss": 0.0474, "step": 4010 }, { "epoch": 15.0, "grad_norm": 0.3756405711174011, "learning_rate": 9.389475079423988e-05, "loss": 0.0438, "step": 4020 }, { "epoch": 15.037313432835822, "grad_norm": 0.25687697529792786, "learning_rate": 9.385510234730415e-05, "loss": 0.0435, "step": 4030 }, { "epoch": 15.074626865671641, "grad_norm": 0.17263716459274292, "learning_rate": 9.381533400219318e-05, "loss": 0.0455, "step": 4040 }, { "epoch": 15.111940298507463, "grad_norm": 0.2471216470003128, "learning_rate": 9.377544586763215e-05, "loss": 0.0429, "step": 4050 }, { "epoch": 15.149253731343283, "grad_norm": 0.20195460319519043, "learning_rate": 9.373543805267368e-05, "loss": 0.0432, "step": 4060 }, { "epoch": 15.186567164179104, "grad_norm": 0.1709851622581482, "learning_rate": 9.369531066669758e-05, "loss": 0.0477, "step": 4070 }, { "epoch": 15.223880597014926, "grad_norm": 0.23063932359218597, "learning_rate": 9.365506381941066e-05, "loss": 0.0379, "step": 4080 }, { "epoch": 15.261194029850746, "grad_norm": 0.3265426754951477, "learning_rate": 9.36146976208462e-05, "loss": 0.0435, "step": 4090 }, { "epoch": 15.298507462686567, "grad_norm": 0.26373934745788574, "learning_rate": 9.357421218136386e-05, "loss": 0.047, "step": 4100 }, { "epoch": 15.335820895522389, "grad_norm": 0.16861388087272644, "learning_rate": 9.353360761164931e-05, "loss": 0.0448, "step": 4110 }, { "epoch": 15.373134328358208, "grad_norm": 0.303790807723999, "learning_rate": 9.349288402271388e-05, "loss": 0.0396, "step": 4120 }, { "epoch": 15.41044776119403, "grad_norm": 0.1940719038248062, "learning_rate": 9.345204152589428e-05, "loss": 0.0474, "step": 4130 }, { "epoch": 15.447761194029852, "grad_norm": 0.34091615676879883, "learning_rate": 9.341108023285238e-05, "loss": 0.0424, "step": 4140 }, { "epoch": 15.485074626865671, "grad_norm": 0.27036693692207336, "learning_rate": 9.337000025557476e-05, "loss": 0.0482, "step": 4150 }, { "epoch": 15.522388059701493, "grad_norm": 0.16908007860183716, "learning_rate": 9.332880170637252e-05, "loss": 0.0381, "step": 4160 }, { "epoch": 15.559701492537313, "grad_norm": 0.23332923650741577, "learning_rate": 9.328748469788093e-05, "loss": 0.0427, "step": 4170 }, { "epoch": 15.597014925373134, "grad_norm": 0.16899706423282623, "learning_rate": 9.32460493430591e-05, "loss": 0.0439, "step": 4180 }, { "epoch": 15.634328358208956, "grad_norm": 0.12869524955749512, "learning_rate": 9.320449575518972e-05, "loss": 0.0481, "step": 4190 }, { "epoch": 15.671641791044776, "grad_norm": 0.21159130334854126, "learning_rate": 9.316282404787871e-05, "loss": 0.0446, "step": 4200 }, { "epoch": 15.708955223880597, "grad_norm": 0.1849961131811142, "learning_rate": 9.31210343350549e-05, "loss": 0.041, "step": 4210 }, { "epoch": 15.746268656716419, "grad_norm": 0.16107840836048126, "learning_rate": 9.30791267309698e-05, "loss": 0.0429, "step": 4220 }, { "epoch": 15.783582089552239, "grad_norm": 0.14206446707248688, "learning_rate": 9.30371013501972e-05, "loss": 0.0409, "step": 4230 }, { "epoch": 15.82089552238806, "grad_norm": 0.2168441116809845, "learning_rate": 9.299495830763286e-05, "loss": 0.0413, "step": 4240 }, { "epoch": 15.85820895522388, "grad_norm": 0.21431951224803925, "learning_rate": 9.295269771849427e-05, "loss": 0.0472, "step": 4250 }, { "epoch": 15.895522388059701, "grad_norm": 0.16851255297660828, "learning_rate": 9.291031969832026e-05, "loss": 0.0508, "step": 4260 }, { "epoch": 15.932835820895523, "grad_norm": 0.18404732644557953, "learning_rate": 9.286782436297073e-05, "loss": 0.0402, "step": 4270 }, { "epoch": 15.970149253731343, "grad_norm": 0.21722930669784546, "learning_rate": 9.282521182862629e-05, "loss": 0.0397, "step": 4280 }, { "epoch": 16.007462686567163, "grad_norm": 0.2523709833621979, "learning_rate": 9.278248221178798e-05, "loss": 0.0427, "step": 4290 }, { "epoch": 16.044776119402986, "grad_norm": 0.17736563086509705, "learning_rate": 9.273963562927695e-05, "loss": 0.0458, "step": 4300 }, { "epoch": 16.082089552238806, "grad_norm": 0.20613858103752136, "learning_rate": 9.269667219823412e-05, "loss": 0.0387, "step": 4310 }, { "epoch": 16.119402985074625, "grad_norm": 0.16557513177394867, "learning_rate": 9.265359203611987e-05, "loss": 0.0411, "step": 4320 }, { "epoch": 16.15671641791045, "grad_norm": 0.28119519352912903, "learning_rate": 9.261039526071374e-05, "loss": 0.0468, "step": 4330 }, { "epoch": 16.19402985074627, "grad_norm": 0.21538576483726501, "learning_rate": 9.256708199011401e-05, "loss": 0.0368, "step": 4340 }, { "epoch": 16.23134328358209, "grad_norm": 0.19657357037067413, "learning_rate": 9.252365234273755e-05, "loss": 0.038, "step": 4350 }, { "epoch": 16.26865671641791, "grad_norm": 0.19258421659469604, "learning_rate": 9.248010643731935e-05, "loss": 0.0414, "step": 4360 }, { "epoch": 16.30597014925373, "grad_norm": 0.28801625967025757, "learning_rate": 9.243644439291223e-05, "loss": 0.0387, "step": 4370 }, { "epoch": 16.34328358208955, "grad_norm": 0.16581468284130096, "learning_rate": 9.239266632888659e-05, "loss": 0.0383, "step": 4380 }, { "epoch": 16.380597014925375, "grad_norm": 0.34664949774742126, "learning_rate": 9.234877236492997e-05, "loss": 0.0453, "step": 4390 }, { "epoch": 16.417910447761194, "grad_norm": 0.1439947783946991, "learning_rate": 9.230476262104677e-05, "loss": 0.0466, "step": 4400 }, { "epoch": 16.455223880597014, "grad_norm": 0.15509940683841705, "learning_rate": 9.226063721755799e-05, "loss": 0.0488, "step": 4410 }, { "epoch": 16.492537313432837, "grad_norm": 0.18005985021591187, "learning_rate": 9.221639627510076e-05, "loss": 0.0407, "step": 4420 }, { "epoch": 16.529850746268657, "grad_norm": 0.16012470424175262, "learning_rate": 9.217203991462815e-05, "loss": 0.0394, "step": 4430 }, { "epoch": 16.567164179104477, "grad_norm": 0.2978847920894623, "learning_rate": 9.212756825740873e-05, "loss": 0.0451, "step": 4440 }, { "epoch": 16.604477611940297, "grad_norm": 0.2236834019422531, "learning_rate": 9.208298142502636e-05, "loss": 0.0487, "step": 4450 }, { "epoch": 16.64179104477612, "grad_norm": 0.2686060667037964, "learning_rate": 9.20382795393797e-05, "loss": 0.0403, "step": 4460 }, { "epoch": 16.67910447761194, "grad_norm": 0.33534038066864014, "learning_rate": 9.199346272268199e-05, "loss": 0.0385, "step": 4470 }, { "epoch": 16.71641791044776, "grad_norm": 0.19250528514385223, "learning_rate": 9.194853109746074e-05, "loss": 0.0441, "step": 4480 }, { "epoch": 16.753731343283583, "grad_norm": 0.19218407571315765, "learning_rate": 9.190348478655724e-05, "loss": 0.0474, "step": 4490 }, { "epoch": 16.791044776119403, "grad_norm": 0.21163488924503326, "learning_rate": 9.185832391312644e-05, "loss": 0.0411, "step": 4500 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 75, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.3828481543643136e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }