diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,5688 +1,2850 @@ { - "best_metric": 0.03688763454556465, - "best_model_checkpoint": "saves/psy-course/Llama-3.1-8B-Instruct/train/fold8/checkpoint-1300", - "epoch": 9.993365557767037, + "best_metric": 0.03543492406606674, + "best_model_checkpoint": "saves/psy-course/Llama-3.1-8B-Instruct/train/fold8/checkpoint-1200", + "epoch": 4.996682778883518, "eval_steps": 50, - "global_step": 6590, + "global_step": 3295, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015164439389631315, - "grad_norm": 4.874523639678955, - "learning_rate": 1.5174506828528075e-06, - "loss": 1.5896, + "grad_norm": 4.908313751220703, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.59, "step": 10 }, { "epoch": 0.03032887877926263, - "grad_norm": 4.5314226150512695, - "learning_rate": 3.034901365705615e-06, - "loss": 1.6239, + "grad_norm": 4.702552318572998, + "learning_rate": 6.060606060606061e-06, + "loss": 1.5951, "step": 20 }, { "epoch": 0.045493318168893945, - "grad_norm": 5.180280685424805, - "learning_rate": 4.552352048558422e-06, - "loss": 1.4105, + "grad_norm": 7.187352657318115, + "learning_rate": 9.090909090909091e-06, + "loss": 1.2503, "step": 30 }, { "epoch": 0.06065775755852526, - "grad_norm": 5.546182155609131, - "learning_rate": 6.06980273141123e-06, - "loss": 1.3376, + "grad_norm": 2.202908515930176, + "learning_rate": 1.2121212121212122e-05, + "loss": 0.9145, "step": 40 }, { "epoch": 0.07582219694815658, - "grad_norm": 2.2393367290496826, - "learning_rate": 7.587253414264037e-06, - "loss": 1.0146, + "grad_norm": 1.6536071300506592, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.6782, "step": 50 }, { "epoch": 0.07582219694815658, - "eval_loss": 0.7523382902145386, - "eval_runtime": 159.7996, - "eval_samples_per_second": 7.34, - "eval_steps_per_second": 7.34, + "eval_loss": 0.43230628967285156, + "eval_runtime": 159.6252, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, "step": 50 }, { "epoch": 0.09098663633778789, - "grad_norm": 1.793792963027954, - "learning_rate": 9.104704097116844e-06, - "loss": 0.7441, + "grad_norm": 1.2327362298965454, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.3929, "step": 60 }, { "epoch": 0.1061510757274192, - "grad_norm": 1.471730351448059, - "learning_rate": 1.0622154779969651e-05, - "loss": 0.5322, + "grad_norm": 0.9732389450073242, + "learning_rate": 2.1212121212121215e-05, + "loss": 0.2232, "step": 70 }, { "epoch": 0.12131551511705052, - "grad_norm": 1.1876047849655151, - "learning_rate": 1.213960546282246e-05, - "loss": 0.4693, + "grad_norm": 0.822551429271698, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.2301, "step": 80 }, { "epoch": 0.13647995450668182, - "grad_norm": 1.0602554082870483, - "learning_rate": 1.3657056145675265e-05, - "loss": 0.2379, + "grad_norm": 0.8734151124954224, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.127, "step": 90 }, { "epoch": 0.15164439389631315, - "grad_norm": 0.9468147158622742, - "learning_rate": 1.5174506828528074e-05, - "loss": 0.2615, + "grad_norm": 0.9926938414573669, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.1528, "step": 100 }, { "epoch": 0.15164439389631315, - "eval_loss": 0.15230286121368408, - "eval_runtime": 159.4525, - "eval_samples_per_second": 7.356, - "eval_steps_per_second": 7.356, + "eval_loss": 0.0901423990726471, + "eval_runtime": 159.655, + "eval_samples_per_second": 7.347, + "eval_steps_per_second": 7.347, "step": 100 }, { "epoch": 0.16680883328594445, - "grad_norm": 0.7414047718048096, - "learning_rate": 1.6691957511380883e-05, - "loss": 0.1503, + "grad_norm": 0.5548447966575623, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0945, "step": 110 }, { "epoch": 0.18197327267557578, - "grad_norm": 1.0056959390640259, - "learning_rate": 1.8209408194233688e-05, - "loss": 0.1237, + "grad_norm": 1.0051425695419312, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.0798, "step": 120 }, { "epoch": 0.19713771206520708, - "grad_norm": 0.907751202583313, - "learning_rate": 1.9726858877086497e-05, - "loss": 0.0952, + "grad_norm": 1.2151364088058472, + "learning_rate": 3.939393939393939e-05, + "loss": 0.0769, "step": 130 }, { "epoch": 0.2123021514548384, - "grad_norm": 0.9936216473579407, - "learning_rate": 2.1244309559939302e-05, - "loss": 0.0872, + "grad_norm": 0.7133122086524963, + "learning_rate": 4.242424242424243e-05, + "loss": 0.0684, "step": 140 }, { "epoch": 0.2274665908444697, - "grad_norm": 1.5066570043563843, - "learning_rate": 2.276176024279211e-05, - "loss": 0.0918, + "grad_norm": 1.0978891849517822, + "learning_rate": 4.545454545454546e-05, + "loss": 0.0741, "step": 150 }, { "epoch": 0.2274665908444697, - "eval_loss": 0.08111745119094849, - "eval_runtime": 159.4403, - "eval_samples_per_second": 7.357, - "eval_steps_per_second": 7.357, + "eval_loss": 0.06654910743236542, + "eval_runtime": 159.7218, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, "step": 150 }, { "epoch": 0.24263103023410104, - "grad_norm": 1.069483995437622, - "learning_rate": 2.427921092564492e-05, - "loss": 0.087, + "grad_norm": 0.9477673768997192, + "learning_rate": 4.848484848484849e-05, + "loss": 0.0795, "step": 160 }, { "epoch": 0.25779546962373234, - "grad_norm": 1.585922122001648, - "learning_rate": 2.5796661608497725e-05, - "loss": 0.0735, + "grad_norm": 1.3763097524642944, + "learning_rate": 5.151515151515152e-05, + "loss": 0.0702, "step": 170 }, { "epoch": 0.27295990901336364, - "grad_norm": 0.7989636659622192, - "learning_rate": 2.731411229135053e-05, - "loss": 0.0922, + "grad_norm": 0.5879313945770264, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.0818, "step": 180 }, { "epoch": 0.288124348402995, - "grad_norm": 0.7541722655296326, - "learning_rate": 2.883156297420334e-05, - "loss": 0.0796, + "grad_norm": 0.590936541557312, + "learning_rate": 5.757575757575758e-05, + "loss": 0.0682, "step": 190 }, { "epoch": 0.3032887877926263, - "grad_norm": 1.2159948348999023, - "learning_rate": 3.0349013657056148e-05, - "loss": 0.0608, + "grad_norm": 1.0090534687042236, + "learning_rate": 6.060606060606061e-05, + "loss": 0.0544, "step": 200 }, { "epoch": 0.3032887877926263, - "eval_loss": 0.0636054202914238, - "eval_runtime": 159.4295, - "eval_samples_per_second": 7.357, - "eval_steps_per_second": 7.357, + "eval_loss": 0.05628184229135513, + "eval_runtime": 159.7671, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 7.342, "step": 200 }, { "epoch": 0.3184532271822576, - "grad_norm": 0.8055582046508789, - "learning_rate": 3.1866464339908954e-05, - "loss": 0.0592, + "grad_norm": 0.6095383167266846, + "learning_rate": 6.363636363636364e-05, + "loss": 0.0538, "step": 210 }, { "epoch": 0.3336176665718889, - "grad_norm": 1.0923850536346436, - "learning_rate": 3.3383915022761766e-05, - "loss": 0.0633, + "grad_norm": 0.9077891707420349, + "learning_rate": 6.666666666666667e-05, + "loss": 0.0571, "step": 220 }, { "epoch": 0.34878210596152026, - "grad_norm": 0.6764631271362305, - "learning_rate": 3.490136570561457e-05, - "loss": 0.0671, + "grad_norm": 0.4802258014678955, + "learning_rate": 6.96969696969697e-05, + "loss": 0.0628, "step": 230 }, { "epoch": 0.36394654535115156, - "grad_norm": 0.9003273844718933, - "learning_rate": 3.6418816388467377e-05, - "loss": 0.052, + "grad_norm": 0.7471550107002258, + "learning_rate": 7.272727272727273e-05, + "loss": 0.0476, "step": 240 }, { "epoch": 0.37911098474078286, - "grad_norm": 1.0069081783294678, - "learning_rate": 3.793626707132019e-05, - "loss": 0.0577, + "grad_norm": 0.8440163731575012, + "learning_rate": 7.575757575757576e-05, + "loss": 0.053, "step": 250 }, { "epoch": 0.37911098474078286, - "eval_loss": 0.05946296826004982, - "eval_runtime": 159.4133, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.0572020523250103, + "eval_runtime": 159.824, + "eval_samples_per_second": 7.339, + "eval_steps_per_second": 7.339, "step": 250 }, { "epoch": 0.39427542413041416, - "grad_norm": 0.9949697852134705, - "learning_rate": 3.9453717754172994e-05, - "loss": 0.0539, + "grad_norm": 0.8062713146209717, + "learning_rate": 7.878787878787879e-05, + "loss": 0.0516, "step": 260 }, { "epoch": 0.40943986352004547, - "grad_norm": 0.4372946321964264, - "learning_rate": 4.09711684370258e-05, - "loss": 0.0767, + "grad_norm": 0.33338773250579834, + "learning_rate": 8.181818181818183e-05, + "loss": 0.0712, "step": 270 }, { "epoch": 0.4246043029096768, - "grad_norm": 0.5403107404708862, - "learning_rate": 4.2488619119878605e-05, - "loss": 0.057, + "grad_norm": 0.550988495349884, + "learning_rate": 8.484848484848486e-05, + "loss": 0.0531, "step": 280 }, { "epoch": 0.4397687422993081, - "grad_norm": 0.7988236546516418, - "learning_rate": 4.400606980273142e-05, - "loss": 0.0511, + "grad_norm": 0.37053975462913513, + "learning_rate": 8.787878787878789e-05, + "loss": 0.0466, "step": 290 }, { "epoch": 0.4549331816889394, - "grad_norm": 1.121690273284912, - "learning_rate": 4.552352048558422e-05, - "loss": 0.0629, + "grad_norm": 0.7273926734924316, + "learning_rate": 9.090909090909092e-05, + "loss": 0.0626, "step": 300 }, { "epoch": 0.4549331816889394, - "eval_loss": 0.06021720543503761, - "eval_runtime": 159.4348, - "eval_samples_per_second": 7.357, - "eval_steps_per_second": 7.357, + "eval_loss": 0.05890120193362236, + "eval_runtime": 159.7681, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 7.342, "step": 300 }, { "epoch": 0.4700976210785707, - "grad_norm": 0.9702941179275513, - "learning_rate": 4.704097116843703e-05, - "loss": 0.0499, + "grad_norm": 0.6740069389343262, + "learning_rate": 9.393939393939395e-05, + "loss": 0.0501, "step": 310 }, { "epoch": 0.4852620604682021, - "grad_norm": 0.871698260307312, - "learning_rate": 4.855842185128984e-05, - "loss": 0.0495, + "grad_norm": 0.5992850065231323, + "learning_rate": 9.696969696969698e-05, + "loss": 0.0443, "step": 320 }, { "epoch": 0.5004264998578334, - "grad_norm": 1.3688944578170776, - "learning_rate": 5.0075872534142645e-05, - "loss": 0.0463, + "grad_norm": 0.6561434268951416, + "learning_rate": 0.0001, + "loss": 0.0479, "step": 330 }, { "epoch": 0.5155909392474647, - "grad_norm": 0.9986417293548584, - "learning_rate": 5.159332321699545e-05, - "loss": 0.0559, + "grad_norm": 0.614966630935669, + "learning_rate": 9.999719336268101e-05, + "loss": 0.0507, "step": 340 }, { "epoch": 0.530755378637096, - "grad_norm": 0.6767694354057312, - "learning_rate": 5.3110773899848256e-05, - "loss": 0.0642, + "grad_norm": 0.4214603304862976, + "learning_rate": 9.998877376581251e-05, + "loss": 0.0646, "step": 350 }, { "epoch": 0.530755378637096, - "eval_loss": 0.05028086155653, - "eval_runtime": 159.4174, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.04616093635559082, + "eval_runtime": 159.8294, + "eval_samples_per_second": 7.339, + "eval_steps_per_second": 7.339, "step": 350 }, { "epoch": 0.5459198180267273, - "grad_norm": 0.8358095288276672, - "learning_rate": 5.462822458270106e-05, - "loss": 0.0554, + "grad_norm": 0.5347225069999695, + "learning_rate": 9.997474215462472e-05, + "loss": 0.0549, "step": 360 }, { "epoch": 0.5610842574163586, - "grad_norm": 1.043109655380249, - "learning_rate": 5.6145675265553874e-05, - "loss": 0.0455, + "grad_norm": 0.6282948851585388, + "learning_rate": 9.995510010438337e-05, + "loss": 0.0442, "step": 370 }, { "epoch": 0.57624869680599, - "grad_norm": 1.3854273557662964, - "learning_rate": 5.766312594840668e-05, - "loss": 0.0632, + "grad_norm": 0.8127485513687134, + "learning_rate": 9.992984982021295e-05, + "loss": 0.0607, "step": 380 }, { "epoch": 0.5914131361956213, - "grad_norm": 0.7631179690361023, - "learning_rate": 5.9180576631259484e-05, - "loss": 0.0556, + "grad_norm": 0.7069970965385437, + "learning_rate": 9.9898994136849e-05, + "loss": 0.0526, "step": 390 }, { "epoch": 0.6065775755852526, - "grad_norm": 1.1543526649475098, - "learning_rate": 6.0698027314112297e-05, - "loss": 0.0359, + "grad_norm": 0.3750600516796112, + "learning_rate": 9.986253651832005e-05, + "loss": 0.0336, "step": 400 }, { "epoch": 0.6065775755852526, - "eval_loss": 0.046978481113910675, - "eval_runtime": 159.4838, - "eval_samples_per_second": 7.355, - "eval_steps_per_second": 7.355, + "eval_loss": 0.04486996307969093, + "eval_runtime": 159.7479, + "eval_samples_per_second": 7.343, + "eval_steps_per_second": 7.343, "step": 400 }, { "epoch": 0.6217420149748839, - "grad_norm": 0.504883348941803, - "learning_rate": 6.22154779969651e-05, - "loss": 0.0413, + "grad_norm": 0.2943829298019409, + "learning_rate": 9.982048105755859e-05, + "loss": 0.0409, "step": 410 }, { "epoch": 0.6369064543645152, - "grad_norm": 0.4671061038970947, - "learning_rate": 6.373292867981791e-05, + "grad_norm": 0.3517925441265106, + "learning_rate": 9.977283247594166e-05, "loss": 0.052, "step": 420 }, { "epoch": 0.6520708937541465, - "grad_norm": 0.5526520609855652, - "learning_rate": 6.525037936267073e-05, - "loss": 0.0457, + "grad_norm": 0.47345930337905884, + "learning_rate": 9.971959612276076e-05, + "loss": 0.047, "step": 430 }, { "epoch": 0.6672353331437778, - "grad_norm": 0.48194679617881775, - "learning_rate": 6.676783004552353e-05, - "loss": 0.0488, + "grad_norm": 0.5188047289848328, + "learning_rate": 9.966077797462129e-05, + "loss": 0.0456, "step": 440 }, { "epoch": 0.6823997725334091, - "grad_norm": 0.4185965955257416, - "learning_rate": 6.828528072837634e-05, - "loss": 0.0432, + "grad_norm": 0.3099527060985565, + "learning_rate": 9.959638463477165e-05, + "loss": 0.0426, "step": 450 }, { "epoch": 0.6823997725334091, - "eval_loss": 0.044805318117141724, - "eval_runtime": 159.5001, - "eval_samples_per_second": 7.354, - "eval_steps_per_second": 7.354, + "eval_loss": 0.043455954641103745, + "eval_runtime": 159.7181, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, "step": 450 }, { "epoch": 0.6975642119230405, - "grad_norm": 0.48105889558792114, - "learning_rate": 6.980273141122914e-05, - "loss": 0.0515, + "grad_norm": 0.4291819632053375, + "learning_rate": 9.952642333236186e-05, + "loss": 0.0471, "step": 460 }, { "epoch": 0.7127286513126718, - "grad_norm": 0.6213817596435547, - "learning_rate": 7.132018209408195e-05, - "loss": 0.0552, + "grad_norm": 0.47742322087287903, + "learning_rate": 9.945090192163202e-05, + "loss": 0.0514, "step": 470 }, { "epoch": 0.7278930907023031, - "grad_norm": 0.6921278834342957, - "learning_rate": 7.283763277693475e-05, - "loss": 0.0413, + "grad_norm": 0.5357955098152161, + "learning_rate": 9.936982888103051e-05, + "loss": 0.0372, "step": 480 }, { "epoch": 0.7430575300919344, - "grad_norm": 0.6581971645355225, - "learning_rate": 7.435508345978756e-05, - "loss": 0.0412, + "grad_norm": 0.3578835725784302, + "learning_rate": 9.928321331226219e-05, + "loss": 0.036, "step": 490 }, { "epoch": 0.7582219694815657, - "grad_norm": 0.9359895586967468, - "learning_rate": 7.587253414264038e-05, - "loss": 0.0391, + "grad_norm": 1.1390984058380127, + "learning_rate": 9.919106493926655e-05, + "loss": 0.0354, "step": 500 }, { "epoch": 0.7582219694815657, - "eval_loss": 0.04941130056977272, - "eval_runtime": 159.5084, - "eval_samples_per_second": 7.354, - "eval_steps_per_second": 7.354, + "eval_loss": 0.04678919166326523, + "eval_runtime": 159.8624, + "eval_samples_per_second": 7.338, + "eval_steps_per_second": 7.338, "step": 500 }, { "epoch": 0.773386408871197, - "grad_norm": 0.39575156569480896, - "learning_rate": 7.738998482549318e-05, - "loss": 0.0563, + "grad_norm": 0.2849079966545105, + "learning_rate": 9.909339410712612e-05, + "loss": 0.0463, "step": 510 }, { "epoch": 0.7885508482608283, - "grad_norm": 0.6010810732841492, - "learning_rate": 7.890743550834599e-05, - "loss": 0.0439, + "grad_norm": 0.43958020210266113, + "learning_rate": 9.8990211780905e-05, + "loss": 0.0388, "step": 520 }, { "epoch": 0.8037152876504596, - "grad_norm": 0.5600255727767944, - "learning_rate": 8.04248861911988e-05, - "loss": 0.0426, + "grad_norm": 0.4281414747238159, + "learning_rate": 9.888152954441785e-05, + "loss": 0.0368, "step": 530 }, { "epoch": 0.8188797270400909, - "grad_norm": 0.2710343301296234, - "learning_rate": 8.19423368740516e-05, - "loss": 0.0463, + "grad_norm": 0.20032383501529694, + "learning_rate": 9.876735959892953e-05, + "loss": 0.039, "step": 540 }, { "epoch": 0.8340441664297223, - "grad_norm": 0.7152983546257019, - "learning_rate": 8.34597875569044e-05, - "loss": 0.0534, + "grad_norm": 0.697373628616333, + "learning_rate": 9.864771476178522e-05, + "loss": 0.0506, "step": 550 }, { "epoch": 0.8340441664297223, - "eval_loss": 0.04858021065592766, - "eval_runtime": 159.4647, - "eval_samples_per_second": 7.356, - "eval_steps_per_second": 7.356, + "eval_loss": 0.04438818246126175, + "eval_runtime": 159.8887, + "eval_samples_per_second": 7.336, + "eval_steps_per_second": 7.336, "step": 550 }, { "epoch": 0.8492086058193536, - "grad_norm": 0.559477686882019, - "learning_rate": 8.497723823975721e-05, - "loss": 0.0617, + "grad_norm": 0.4126269817352295, + "learning_rate": 9.852260846497153e-05, + "loss": 0.0515, "step": 560 }, { "epoch": 0.864373045208985, - "grad_norm": 0.43257811665534973, - "learning_rate": 8.649468892261003e-05, - "loss": 0.0559, + "grad_norm": 0.31782063841819763, + "learning_rate": 9.839205475360851e-05, + "loss": 0.0537, "step": 570 }, { "epoch": 0.8795374845986162, - "grad_norm": 0.4455533027648926, - "learning_rate": 8.801213960546283e-05, - "loss": 0.0545, + "grad_norm": 0.28636643290519714, + "learning_rate": 9.825606828437291e-05, + "loss": 0.0497, "step": 580 }, { "epoch": 0.8947019239882475, - "grad_norm": 0.5453502535820007, - "learning_rate": 8.952959028831564e-05, - "loss": 0.044, + "grad_norm": 0.3322339951992035, + "learning_rate": 9.811466432385267e-05, + "loss": 0.0392, "step": 590 }, { "epoch": 0.9098663633778789, - "grad_norm": 0.5415958762168884, - "learning_rate": 9.104704097116844e-05, - "loss": 0.0398, + "grad_norm": 0.40966206789016724, + "learning_rate": 9.796785874683314e-05, + "loss": 0.0346, "step": 600 }, { "epoch": 0.9098663633778789, - "eval_loss": 0.044507917016744614, - "eval_runtime": 159.5001, - "eval_samples_per_second": 7.354, - "eval_steps_per_second": 7.354, + "eval_loss": 0.04112881049513817, + "eval_runtime": 159.8597, + "eval_samples_per_second": 7.338, + "eval_steps_per_second": 7.338, "step": 600 }, { "epoch": 0.9250308027675102, - "grad_norm": 0.44225940108299255, - "learning_rate": 9.256449165402125e-05, - "loss": 0.0529, + "grad_norm": 0.34210318326950073, + "learning_rate": 9.781566803451475e-05, + "loss": 0.05, "step": 610 }, { "epoch": 0.9401952421571415, - "grad_norm": 0.2991369664669037, - "learning_rate": 9.408194233687406e-05, - "loss": 0.041, + "grad_norm": 0.45713552832603455, + "learning_rate": 9.765810927266281e-05, + "loss": 0.0414, "step": 620 }, { "epoch": 0.9553596815467729, - "grad_norm": 0.37005698680877686, - "learning_rate": 9.559939301972687e-05, - "loss": 0.0418, + "grad_norm": 0.320637047290802, + "learning_rate": 9.749520014968934e-05, + "loss": 0.0389, "step": 630 }, { "epoch": 0.9705241209364042, - "grad_norm": 0.4909774661064148, - "learning_rate": 9.711684370257968e-05, - "loss": 0.049, + "grad_norm": 0.3496937155723572, + "learning_rate": 9.732695895466735e-05, + "loss": 0.0449, "step": 640 }, { "epoch": 0.9856885603260355, - "grad_norm": 0.4178905189037323, - "learning_rate": 9.863429438543249e-05, - "loss": 0.0581, + "grad_norm": 0.3174608647823334, + "learning_rate": 9.715340457527746e-05, + "loss": 0.0514, "step": 650 }, { "epoch": 0.9856885603260355, - "eval_loss": 0.04815329611301422, - "eval_runtime": 159.3847, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.04426902160048485, + "eval_runtime": 159.8605, + "eval_samples_per_second": 7.338, + "eval_steps_per_second": 7.338, "step": 650 }, { "epoch": 1.0008529997156668, - "grad_norm": 0.337106317281723, - "learning_rate": 9.999999298570713e-05, - "loss": 0.0421, + "grad_norm": 0.26187363266944885, + "learning_rate": 9.697455649568761e-05, + "loss": 0.04, "step": 660 }, { "epoch": 1.016017439105298, - "grad_norm": 0.8468524813652039, - "learning_rate": 9.99991512729427e-05, - "loss": 0.0345, + "grad_norm": 0.6161110997200012, + "learning_rate": 9.679043479436556e-05, + "loss": 0.0294, "step": 670 }, { "epoch": 1.0311818784949294, - "grad_norm": 0.2221977561712265, - "learning_rate": 9.999690672866212e-05, - "loss": 0.0299, + "grad_norm": 0.18970751762390137, + "learning_rate": 9.660106014182489e-05, + "loss": 0.0277, "step": 680 }, { "epoch": 1.0463463178845607, - "grad_norm": 1.1688432693481445, - "learning_rate": 9.999325941584081e-05, - "loss": 0.0526, + "grad_norm": 0.5361956357955933, + "learning_rate": 9.640645379830424e-05, + "loss": 0.0444, "step": 690 }, { "epoch": 1.061510757274192, - "grad_norm": 0.2383510023355484, - "learning_rate": 9.99882094368118e-05, - "loss": 0.0337, + "grad_norm": 0.288272500038147, + "learning_rate": 9.620663761138067e-05, + "loss": 0.0323, "step": 700 }, { "epoch": 1.061510757274192, - "eval_loss": 0.04285712540149689, - "eval_runtime": 159.3852, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.041203178465366364, + "eval_runtime": 159.9229, + "eval_samples_per_second": 7.335, + "eval_steps_per_second": 7.335, "step": 700 }, { "epoch": 1.0766751966638233, - "grad_norm": 0.1373281627893448, - "learning_rate": 9.99817569332629e-05, - "loss": 0.029, + "grad_norm": 0.11579044908285141, + "learning_rate": 9.600163401351688e-05, + "loss": 0.0261, "step": 710 }, { "epoch": 1.0918396360534546, - "grad_norm": 0.5537685751914978, - "learning_rate": 9.99739020862327e-05, - "loss": 0.0514, + "grad_norm": 0.3901958167552948, + "learning_rate": 9.579146601954276e-05, + "loss": 0.0453, "step": 720 }, { "epoch": 1.1070040754430859, - "grad_norm": 0.24004210531711578, - "learning_rate": 9.996464511610545e-05, - "loss": 0.0377, + "grad_norm": 0.15443949401378632, + "learning_rate": 9.557615722407177e-05, + "loss": 0.0364, "step": 730 }, { "epoch": 1.1221685148327172, - "grad_norm": 0.11950217932462692, - "learning_rate": 9.9953986282605e-05, - "loss": 0.0362, + "grad_norm": 0.1345423012971878, + "learning_rate": 9.535573179885191e-05, + "loss": 0.0279, "step": 740 }, { "epoch": 1.1373329542223485, - "grad_norm": 0.2975088059902191, - "learning_rate": 9.994192588478732e-05, - "loss": 0.0331, + "grad_norm": 0.17373351752758026, + "learning_rate": 9.513021449005214e-05, + "loss": 0.0261, "step": 750 }, { "epoch": 1.1373329542223485, - "eval_loss": 0.05172675475478172, - "eval_runtime": 159.3847, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.04307902976870537, + "eval_runtime": 159.8102, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 750 }, { "epoch": 1.15249739361198, - "grad_norm": 0.21092985570430756, - "learning_rate": 9.992846426103231e-05, - "loss": 0.0379, + "grad_norm": 0.1945541799068451, + "learning_rate": 9.489963061548428e-05, + "loss": 0.0326, "step": 760 }, { "epoch": 1.1676618330016113, - "grad_norm": 0.19494560360908508, - "learning_rate": 9.991360178903418e-05, - "loss": 0.0258, + "grad_norm": 0.14704962074756622, + "learning_rate": 9.466400606176062e-05, + "loss": 0.0232, "step": 770 }, { "epoch": 1.1828262723912426, - "grad_norm": 0.35451167821884155, - "learning_rate": 9.989733888579087e-05, - "loss": 0.0262, + "grad_norm": 0.2738613486289978, + "learning_rate": 9.442336728138779e-05, + "loss": 0.0235, "step": 780 }, { "epoch": 1.197990711780874, - "grad_norm": 0.1696435958147049, - "learning_rate": 9.98796760075924e-05, - "loss": 0.0408, + "grad_norm": 0.14102770388126373, + "learning_rate": 9.417774128979706e-05, + "loss": 0.0364, "step": 790 }, { "epoch": 1.2131551511705052, - "grad_norm": 0.4249808192253113, - "learning_rate": 9.986061365000804e-05, - "loss": 0.0343, + "grad_norm": 0.3271743953227997, + "learning_rate": 9.39271556623114e-05, + "loss": 0.0307, "step": 800 }, { "epoch": 1.2131551511705052, - "eval_loss": 0.044131066650152206, - "eval_runtime": 159.4325, - "eval_samples_per_second": 7.357, - "eval_steps_per_second": 7.357, + "eval_loss": 0.03978438675403595, + "eval_runtime": 159.8781, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 800 }, { "epoch": 1.2283195905601365, - "grad_norm": 0.42603185772895813, - "learning_rate": 9.984015234787239e-05, - "loss": 0.0442, + "grad_norm": 0.4989463686943054, + "learning_rate": 9.367163853104975e-05, + "loss": 0.039, "step": 810 }, { "epoch": 1.2434840299497678, - "grad_norm": 0.21767574548721313, - "learning_rate": 9.981829267527036e-05, - "loss": 0.0415, + "grad_norm": 0.19276122748851776, + "learning_rate": 9.341121858176876e-05, + "loss": 0.0413, "step": 820 }, { "epoch": 1.2586484693393991, - "grad_norm": 0.29455825686454773, - "learning_rate": 9.979503524552112e-05, - "loss": 0.0376, + "grad_norm": 0.29775118827819824, + "learning_rate": 9.314592505064239e-05, + "loss": 0.036, "step": 830 }, { "epoch": 1.2738129087290304, - "grad_norm": 0.2217111587524414, - "learning_rate": 9.977038071116087e-05, - "loss": 0.0267, + "grad_norm": 0.23416060209274292, + "learning_rate": 9.28757877209796e-05, + "loss": 0.0258, "step": 840 }, { "epoch": 1.2889773481186617, - "grad_norm": 0.7142578363418579, - "learning_rate": 9.974432976392451e-05, - "loss": 0.0363, + "grad_norm": 0.14653538167476654, + "learning_rate": 9.260083691988084e-05, + "loss": 0.0284, "step": 850 }, { "epoch": 1.2889773481186617, - "eval_loss": 0.047669440507888794, - "eval_runtime": 159.4191, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.04156646877527237, + "eval_runtime": 159.8802, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 850 }, { "epoch": 1.304141787508293, - "grad_norm": 0.3395718038082123, - "learning_rate": 9.971688313472623e-05, - "loss": 0.0407, + "grad_norm": 0.4026676416397095, + "learning_rate": 9.232110351483327e-05, + "loss": 0.0368, "step": 860 }, { "epoch": 1.3193062268979243, - "grad_norm": 0.5649396181106567, - "learning_rate": 9.968804159363903e-05, - "loss": 0.0375, + "grad_norm": 0.4935552179813385, + "learning_rate": 9.203661891024547e-05, + "loss": 0.0369, "step": 870 }, { "epoch": 1.3344706662875556, - "grad_norm": 0.3515680134296417, - "learning_rate": 9.965780594987311e-05, - "loss": 0.0354, + "grad_norm": 0.3299759328365326, + "learning_rate": 9.174741504392173e-05, + "loss": 0.0345, "step": 880 }, { "epoch": 1.349635105677187, - "grad_norm": 0.2906147539615631, - "learning_rate": 9.962617705175314e-05, - "loss": 0.0434, + "grad_norm": 0.24818608164787292, + "learning_rate": 9.145352438347662e-05, + "loss": 0.0413, "step": 890 }, { "epoch": 1.3647995450668184, - "grad_norm": 0.45233169198036194, - "learning_rate": 9.95931557866945e-05, - "loss": 0.0329, + "grad_norm": 0.2329672873020172, + "learning_rate": 9.115497992268995e-05, + "loss": 0.0316, "step": 900 }, { "epoch": 1.3647995450668184, - "eval_loss": 0.04343215748667717, - "eval_runtime": 159.4082, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.03982315585017204, + "eval_runtime": 159.8004, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 900 }, { "epoch": 1.3799639844564497, - "grad_norm": 0.7280616760253906, - "learning_rate": 9.955874308117835e-05, - "loss": 0.0344, + "grad_norm": 0.07506946474313736, + "learning_rate": 9.085181517780273e-05, + "loss": 0.0309, "step": 910 }, { "epoch": 1.395128423846081, - "grad_norm": 0.19865655899047852, - "learning_rate": 9.952293990072558e-05, - "loss": 0.0378, + "grad_norm": 0.19676469266414642, + "learning_rate": 9.054406418375443e-05, + "loss": 0.0355, "step": 920 }, { "epoch": 1.4102928632357123, - "grad_norm": 0.27480316162109375, - "learning_rate": 9.948574724986992e-05, + "grad_norm": 0.2670162320137024, + "learning_rate": 9.023176149036203e-05, "loss": 0.0299, "step": 930 }, { "epoch": 1.4254573026253436, - "grad_norm": 0.2553894519805908, - "learning_rate": 9.944716617212948e-05, - "loss": 0.0513, + "grad_norm": 0.24655964970588684, + "learning_rate": 8.991494215844132e-05, + "loss": 0.0416, "step": 940 }, { "epoch": 1.440621742014975, - "grad_norm": 0.4464769959449768, - "learning_rate": 9.94071977499777e-05, - "loss": 0.0324, + "grad_norm": 0.42746463418006897, + "learning_rate": 8.959364175587069e-05, + "loss": 0.0288, "step": 950 }, { "epoch": 1.440621742014975, - "eval_loss": 0.0429108627140522, - "eval_runtime": 159.438, - "eval_samples_per_second": 7.357, - "eval_steps_per_second": 7.357, + "eval_loss": 0.039692338556051254, + "eval_runtime": 159.8479, + "eval_samples_per_second": 7.338, + "eval_steps_per_second": 7.338, "step": 950 }, { "epoch": 1.4557861814046062, - "grad_norm": 0.19798366725444794, - "learning_rate": 9.936584310481285e-05, - "loss": 0.0491, + "grad_norm": 0.20224446058273315, + "learning_rate": 8.926789635359817e-05, + "loss": 0.0448, "step": 960 }, { "epoch": 1.4709506207942376, - "grad_norm": 0.17483264207839966, - "learning_rate": 9.932310339692661e-05, - "loss": 0.0326, + "grad_norm": 0.16787214577198029, + "learning_rate": 8.893774252159187e-05, + "loss": 0.0294, "step": 970 }, { "epoch": 1.4861150601838689, - "grad_norm": 0.19839631021022797, - "learning_rate": 9.927897982547154e-05, - "loss": 0.0384, + "grad_norm": 0.20546148717403412, + "learning_rate": 8.860321732473439e-05, + "loss": 0.0396, "step": 980 }, { "epoch": 1.5012794995735002, - "grad_norm": 0.47600170969963074, - "learning_rate": 9.923347362842736e-05, - "loss": 0.0404, + "grad_norm": 0.4057621657848358, + "learning_rate": 8.826435831866184e-05, + "loss": 0.0389, "step": 990 }, { "epoch": 1.5164439389631315, - "grad_norm": 0.12121453881263733, - "learning_rate": 9.918658608256633e-05, - "loss": 0.0315, + "grad_norm": 0.09442304074764252, + "learning_rate": 8.79212035455475e-05, + "loss": 0.0306, "step": 1000 }, { "epoch": 1.5164439389631315, - "eval_loss": 0.04633578285574913, - "eval_runtime": 159.3871, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.041055887937545776, + "eval_runtime": 159.778, + "eval_samples_per_second": 7.341, + "eval_steps_per_second": 7.341, "step": 1000 }, { "epoch": 1.5316083783527628, - "grad_norm": 0.3031330704689026, - "learning_rate": 9.913831850341726e-05, - "loss": 0.0329, + "grad_norm": 0.2399837076663971, + "learning_rate": 8.757379152983103e-05, + "loss": 0.03, "step": 1010 }, { "epoch": 1.546772817742394, - "grad_norm": 0.4583679735660553, - "learning_rate": 9.908867224522881e-05, - "loss": 0.0355, + "grad_norm": 0.38911059498786926, + "learning_rate": 8.722216127389363e-05, + "loss": 0.0335, "step": 1020 }, { "epoch": 1.5619372571320254, - "grad_norm": 0.49641188979148865, - "learning_rate": 9.903764870093136e-05, - "loss": 0.0408, + "grad_norm": 0.6509933471679688, + "learning_rate": 8.686635225367919e-05, + "loss": 0.0361, "step": 1030 }, { "epoch": 1.5771016965216567, - "grad_norm": 0.2882194519042969, - "learning_rate": 9.898524930209786e-05, - "loss": 0.0427, + "grad_norm": 0.20148853957653046, + "learning_rate": 8.650640441426274e-05, + "loss": 0.038, "step": 1040 }, { "epoch": 1.592266135911288, - "grad_norm": 0.5170530676841736, - "learning_rate": 9.893147551890387e-05, - "loss": 0.0482, + "grad_norm": 0.3823687434196472, + "learning_rate": 8.614235816536582e-05, + "loss": 0.0449, "step": 1050 }, { "epoch": 1.592266135911288, - "eval_loss": 0.04051831737160683, - "eval_runtime": 159.4265, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.03836758807301521, + "eval_runtime": 159.8354, + "eval_samples_per_second": 7.339, + "eval_steps_per_second": 7.339, "step": 1050 }, { "epoch": 1.6074305753009193, - "grad_norm": 0.28377339243888855, - "learning_rate": 9.887632886008616e-05, - "loss": 0.0402, + "grad_norm": 0.3107754588127136, + "learning_rate": 8.577425437681994e-05, + "loss": 0.0377, "step": 1060 }, { "epoch": 1.6225950146905506, - "grad_norm": 0.15742602944374084, - "learning_rate": 9.88198108729004e-05, - "loss": 0.0375, + "grad_norm": 0.17179957032203674, + "learning_rate": 8.540213437397833e-05, + "loss": 0.0322, "step": 1070 }, { "epoch": 1.6377594540801819, - "grad_norm": 0.14881542325019836, - "learning_rate": 9.876192314307777e-05, - "loss": 0.0306, + "grad_norm": 0.10634970664978027, + "learning_rate": 8.502603993307647e-05, + "loss": 0.0273, "step": 1080 }, { "epoch": 1.6529238934698132, - "grad_norm": 0.5151090621948242, - "learning_rate": 9.87026672947805e-05, - "loss": 0.0281, + "grad_norm": 0.32147282361984253, + "learning_rate": 8.464601327654207e-05, + "loss": 0.0251, "step": 1090 }, { "epoch": 1.6680883328594445, - "grad_norm": 0.2572789490222931, - "learning_rate": 9.864204499055624e-05, - "loss": 0.0318, + "grad_norm": 0.29508742690086365, + "learning_rate": 8.4262097068255e-05, + "loss": 0.0281, "step": 1100 }, { "epoch": 1.6680883328594445, - "eval_loss": 0.042203232645988464, - "eval_runtime": 159.3976, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.038478951901197433, + "eval_runtime": 159.8811, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 1100 }, { "epoch": 1.6832527722490758, - "grad_norm": 0.6860706806182861, - "learning_rate": 9.858005793129143e-05, - "loss": 0.0363, + "grad_norm": 0.5426458716392517, + "learning_rate": 8.387433440875758e-05, + "loss": 0.0331, "step": 1110 }, { "epoch": 1.698417211638707, - "grad_norm": 0.4389311969280243, - "learning_rate": 9.851670785616358e-05, - "loss": 0.0382, + "grad_norm": 0.2759105861186981, + "learning_rate": 8.348276883041583e-05, + "loss": 0.0336, "step": 1120 }, { "epoch": 1.7135816510283386, - "grad_norm": 0.2743322253227234, - "learning_rate": 9.845199654259254e-05, - "loss": 0.0439, + "grad_norm": 0.4834611415863037, + "learning_rate": 8.308744429253238e-05, + "loss": 0.0464, "step": 1130 }, { "epoch": 1.72874609041797, - "grad_norm": 0.41370782256126404, - "learning_rate": 9.83859258061905e-05, - "loss": 0.038, + "grad_norm": 0.3538722097873688, + "learning_rate": 8.268840517641123e-05, + "loss": 0.0345, "step": 1140 }, { "epoch": 1.7439105298076012, - "grad_norm": 0.2958690822124481, - "learning_rate": 9.831849750071118e-05, - "loss": 0.04, + "grad_norm": 0.41168689727783203, + "learning_rate": 8.228569628037543e-05, + "loss": 0.038, "step": 1150 }, { "epoch": 1.7439105298076012, - "eval_loss": 0.03917526453733444, - "eval_runtime": 159.3838, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.03758051246404648, + "eval_runtime": 159.8738, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 1150 }, { "epoch": 1.7590749691972325, - "grad_norm": 0.2892114818096161, - "learning_rate": 9.82497135179977e-05, - "loss": 0.0333, + "grad_norm": 0.3065544068813324, + "learning_rate": 8.187936281473758e-05, + "loss": 0.0325, "step": 1160 }, { "epoch": 1.7742394085868638, - "grad_norm": 0.3978842496871948, - "learning_rate": 9.817957578792962e-05, - "loss": 0.0313, + "grad_norm": 0.24398308992385864, + "learning_rate": 8.146945039672446e-05, + "loss": 0.0276, "step": 1170 }, { "epoch": 1.789403847976495, - "grad_norm": 0.37295064330101013, - "learning_rate": 9.810808627836869e-05, - "loss": 0.0324, + "grad_norm": 0.29391804337501526, + "learning_rate": 8.105600504535567e-05, + "loss": 0.0306, "step": 1180 }, { "epoch": 1.8045682873661264, - "grad_norm": 0.3397061824798584, - "learning_rate": 9.803524699510374e-05, - "loss": 0.0491, + "grad_norm": 0.33613964915275574, + "learning_rate": 8.06390731762773e-05, + "loss": 0.0416, "step": 1190 }, { "epoch": 1.8197327267557577, - "grad_norm": 0.6286438703536987, - "learning_rate": 9.796105998179424e-05, - "loss": 0.0307, + "grad_norm": 0.6730492115020752, + "learning_rate": 8.021870159655109e-05, + "loss": 0.0277, "step": 1200 }, { "epoch": 1.8197327267557577, - "eval_loss": 0.03775982931256294, - "eval_runtime": 159.417, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.03543492406606674, + "eval_runtime": 159.8678, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 1200 }, { "epoch": 1.8348971661453892, - "grad_norm": 0.3092387020587921, - "learning_rate": 9.788552731991316e-05, - "loss": 0.0337, + "grad_norm": 0.31159326434135437, + "learning_rate": 7.979493749939955e-05, + "loss": 0.0335, "step": 1210 }, { "epoch": 1.8500616055350205, - "grad_norm": 0.3837817907333374, - "learning_rate": 9.780865112868847e-05, - "loss": 0.0461, + "grad_norm": 0.2657136917114258, + "learning_rate": 7.93678284589079e-05, + "loss": 0.0421, "step": 1220 }, { "epoch": 1.8652260449246518, - "grad_norm": 0.1962728202342987, - "learning_rate": 9.773043356504362e-05, - "loss": 0.0316, + "grad_norm": 0.1698162853717804, + "learning_rate": 7.893742242468301e-05, + "loss": 0.0282, "step": 1230 }, { "epoch": 1.8803904843142831, - "grad_norm": 0.5413288474082947, - "learning_rate": 9.765087682353715e-05, - "loss": 0.0423, + "grad_norm": 0.5252974629402161, + "learning_rate": 7.850376771647038e-05, + "loss": 0.0386, "step": 1240 }, { "epoch": 1.8955549237039144, - "grad_norm": 0.2946687340736389, - "learning_rate": 9.756998313630103e-05, - "loss": 0.0397, + "grad_norm": 0.31378066539764404, + "learning_rate": 7.806691301872958e-05, + "loss": 0.0381, "step": 1250 }, { "epoch": 1.8955549237039144, - "eval_loss": 0.0390542708337307, - "eval_runtime": 159.3224, - "eval_samples_per_second": 7.362, - "eval_steps_per_second": 7.362, + "eval_loss": 0.037748388946056366, + "eval_runtime": 159.8804, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 1250 }, { "epoch": 1.9107193630935457, - "grad_norm": 0.2026185393333435, - "learning_rate": 9.748775477297807e-05, - "loss": 0.0395, + "grad_norm": 0.16329045593738556, + "learning_rate": 7.762690737516846e-05, + "loss": 0.0359, "step": 1260 }, { "epoch": 1.925883802483177, - "grad_norm": 0.22304978966712952, - "learning_rate": 9.740419404065822e-05, - "loss": 0.0336, + "grad_norm": 0.23127301037311554, + "learning_rate": 7.718380018323742e-05, + "loss": 0.0326, "step": 1270 }, { "epoch": 1.9410482418728083, - "grad_norm": 0.3638700246810913, - "learning_rate": 9.731930328381384e-05, - "loss": 0.03, + "grad_norm": 0.29971396923065186, + "learning_rate": 7.673764118858371e-05, + "loss": 0.0279, "step": 1280 }, { "epoch": 1.9562126812624396, - "grad_norm": 0.13318689167499542, - "learning_rate": 9.723308488423397e-05, - "loss": 0.0426, + "grad_norm": 0.14393766224384308, + "learning_rate": 7.628848047946675e-05, + "loss": 0.0408, "step": 1290 }, { "epoch": 1.971377120652071, - "grad_norm": 0.3296961188316345, - "learning_rate": 9.714554126095742e-05, - "loss": 0.0292, + "grad_norm": 0.24297180771827698, + "learning_rate": 7.583636848113483e-05, + "loss": 0.0278, "step": 1300 }, { "epoch": 1.971377120652071, - "eval_loss": 0.03688763454556465, - "eval_runtime": 159.379, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.03594334051012993, + "eval_runtime": 159.9049, + "eval_samples_per_second": 7.336, + "eval_steps_per_second": 7.336, "step": 1300 }, { "epoch": 1.9865415600417022, - "grad_norm": 0.36734312772750854, - "learning_rate": 9.705667487020495e-05, - "loss": 0.0339, + "grad_norm": 0.33698412775993347, + "learning_rate": 7.538135595016423e-05, + "loss": 0.0322, "step": 1310 }, { "epoch": 2.0017059994313335, - "grad_norm": 0.2776101529598236, - "learning_rate": 9.696648820531039e-05, - "loss": 0.0319, + "grad_norm": 0.28321677446365356, + "learning_rate": 7.492349396876096e-05, + "loss": 0.0292, "step": 1320 }, { "epoch": 2.016870438820965, - "grad_norm": 0.13725480437278748, - "learning_rate": 9.687498379665056e-05, - "loss": 0.0282, + "grad_norm": 0.11086804419755936, + "learning_rate": 7.4462833939026e-05, + "loss": 0.0237, "step": 1330 }, { "epoch": 2.032034878210596, - "grad_norm": 0.23157070577144623, - "learning_rate": 9.678216421157443e-05, - "loss": 0.0222, + "grad_norm": 0.3019621670246124, + "learning_rate": 7.399942757718455e-05, + "loss": 0.0219, "step": 1340 }, { "epoch": 2.0471993176002274, - "grad_norm": 0.26500123739242554, - "learning_rate": 9.668803205433101e-05, - "loss": 0.0225, + "grad_norm": 0.26927098631858826, + "learning_rate": 7.35333269077802e-05, + "loss": 0.0194, "step": 1350 }, { "epoch": 2.0471993176002274, - "eval_loss": 0.040766872465610504, - "eval_runtime": 159.3887, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.03932846337556839, + "eval_runtime": 159.9101, + "eval_samples_per_second": 7.335, + "eval_steps_per_second": 7.335, "step": 1350 }, { "epoch": 2.0623637569898587, - "grad_norm": 0.2015039324760437, - "learning_rate": 9.659258996599624e-05, - "loss": 0.0319, + "grad_norm": 0.1983146220445633, + "learning_rate": 7.306458425783426e-05, + "loss": 0.0287, "step": 1360 }, { "epoch": 2.07752819637949, - "grad_norm": 0.17752842605113983, - "learning_rate": 9.649584062439898e-05, - "loss": 0.0213, + "grad_norm": 0.2132590264081955, + "learning_rate": 7.25932522509713e-05, + "loss": 0.02, "step": 1370 }, { "epoch": 2.0926926357691213, - "grad_norm": 0.6231749653816223, - "learning_rate": 9.639778674404581e-05, - "loss": 0.0261, + "grad_norm": 0.4830004572868347, + "learning_rate": 7.211938380151133e-05, + "loss": 0.0244, "step": 1380 }, { "epoch": 2.1078570751587526, - "grad_norm": 0.41245266795158386, - "learning_rate": 9.629843107604491e-05, - "loss": 0.0354, + "grad_norm": 0.3054353594779968, + "learning_rate": 7.164303210852934e-05, + "loss": 0.0309, "step": 1390 }, { "epoch": 2.123021514548384, - "grad_norm": 0.22366976737976074, - "learning_rate": 9.619777640802885e-05, - "loss": 0.0179, + "grad_norm": 0.22349397838115692, + "learning_rate": 7.116425064988286e-05, + "loss": 0.0149, "step": 1400 }, { "epoch": 2.123021514548384, - "eval_loss": 0.03914536535739899, - "eval_runtime": 159.3589, - "eval_samples_per_second": 7.361, - "eval_steps_per_second": 7.361, + "eval_loss": 0.03746328130364418, + "eval_runtime": 159.9069, + "eval_samples_per_second": 7.336, + "eval_steps_per_second": 7.336, "step": 1400 }, { "epoch": 2.1381859539380152, - "grad_norm": 0.16097739338874817, - "learning_rate": 9.609582556407635e-05, - "loss": 0.025, + "grad_norm": 0.23747169971466064, + "learning_rate": 7.068309317620827e-05, + "loss": 0.0244, "step": 1410 }, { "epoch": 2.1533503933276466, - "grad_norm": 0.2614944875240326, - "learning_rate": 9.599258140463314e-05, - "loss": 0.023, + "grad_norm": 0.2875257730484009, + "learning_rate": 7.019961370488645e-05, + "loss": 0.0211, "step": 1420 }, { "epoch": 2.168514832717278, - "grad_norm": 0.2686197757720947, - "learning_rate": 9.588804682643158e-05, - "loss": 0.022, + "grad_norm": 0.24418668448925018, + "learning_rate": 6.971386651397849e-05, + "loss": 0.02, "step": 1430 }, { "epoch": 2.183679272106909, - "grad_norm": 0.42036938667297363, - "learning_rate": 9.57822247624095e-05, - "loss": 0.0195, + "grad_norm": 0.5265827178955078, + "learning_rate": 6.922590613613211e-05, + "loss": 0.0169, "step": 1440 }, { "epoch": 2.1988437114965405, - "grad_norm": 0.20641587674617767, - "learning_rate": 9.56751181816278e-05, - "loss": 0.0292, + "grad_norm": 0.2406657487154007, + "learning_rate": 6.873578735245961e-05, + "loss": 0.0243, "step": 1450 }, { "epoch": 2.1988437114965405, - "eval_loss": 0.041304007172584534, - "eval_runtime": 159.3958, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.04021035507321358, + "eval_runtime": 159.985, + "eval_samples_per_second": 7.332, + "eval_steps_per_second": 7.332, "step": 1450 }, { "epoch": 2.2140081508861718, - "grad_norm": 0.12383019179105759, - "learning_rate": 9.556673008918725e-05, - "loss": 0.0221, + "grad_norm": 0.14891931414604187, + "learning_rate": 6.824356518638775e-05, + "loss": 0.021, "step": 1460 }, { "epoch": 2.229172590275803, - "grad_norm": 0.21757106482982635, - "learning_rate": 9.54570635261441e-05, - "loss": 0.0243, + "grad_norm": 0.32252541184425354, + "learning_rate": 6.774929489748052e-05, + "loss": 0.0198, "step": 1470 }, { "epoch": 2.2443370296654344, - "grad_norm": 1.360998511314392, - "learning_rate": 9.534612156942479e-05, - "loss": 0.0331, + "grad_norm": 1.1908115148544312, + "learning_rate": 6.725303197523548e-05, + "loss": 0.0302, "step": 1480 }, { "epoch": 2.259501469055066, - "grad_norm": 0.20993027091026306, - "learning_rate": 9.523390733173966e-05, - "loss": 0.0237, + "grad_norm": 0.19598431885242462, + "learning_rate": 6.675483213285412e-05, + "loss": 0.0225, "step": 1490 }, { "epoch": 2.274665908444697, - "grad_norm": 0.36339879035949707, - "learning_rate": 9.512042396149549e-05, - "loss": 0.0285, + "grad_norm": 0.37645435333251953, + "learning_rate": 6.625475130098728e-05, + "loss": 0.0234, "step": 1500 }, { "epoch": 2.274665908444697, - "eval_loss": 0.041971635073423386, - "eval_runtime": 159.3803, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.039622507989406586, + "eval_runtime": 160.0131, + "eval_samples_per_second": 7.331, + "eval_steps_per_second": 7.331, "step": 1500 }, { "epoch": 2.2898303478343287, - "grad_norm": 0.26336702704429626, - "learning_rate": 9.500567464270734e-05, - "loss": 0.0239, + "grad_norm": 0.2178558111190796, + "learning_rate": 6.575284562145593e-05, + "loss": 0.0213, "step": 1510 }, { "epoch": 2.30499478722396, - "grad_norm": 0.3695557415485382, - "learning_rate": 9.488966259490904e-05, - "loss": 0.0315, + "grad_norm": 0.4462236762046814, + "learning_rate": 6.524917144094851e-05, + "loss": 0.0258, "step": 1520 }, { "epoch": 2.3201592266135913, - "grad_norm": 0.107005275785923, - "learning_rate": 9.477239107306299e-05, - "loss": 0.028, + "grad_norm": 0.0876719132065773, + "learning_rate": 6.474378530469509e-05, + "loss": 0.025, "step": 1530 }, { "epoch": 2.3353236660032226, - "grad_norm": 0.1988624483346939, - "learning_rate": 9.46538633674688e-05, - "loss": 0.0212, + "grad_norm": 0.16359026730060577, + "learning_rate": 6.42367439501193e-05, + "loss": 0.0205, "step": 1540 }, { "epoch": 2.350488105392854, - "grad_norm": 0.4089479446411133, - "learning_rate": 9.453408280367092e-05, - "loss": 0.0244, + "grad_norm": 0.25489160418510437, + "learning_rate": 6.372810430046862e-05, + "loss": 0.0189, "step": 1550 }, { "epoch": 2.350488105392854, - "eval_loss": 0.04119926691055298, - "eval_runtime": 159.391, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.03922748565673828, + "eval_runtime": 159.9514, + "eval_samples_per_second": 7.333, + "eval_steps_per_second": 7.333, "step": 1550 }, { "epoch": 2.365652544782485, - "grad_norm": 0.1164274513721466, - "learning_rate": 9.441305274236536e-05, - "loss": 0.0267, + "grad_norm": 0.11957678943872452, + "learning_rate": 6.321792345842402e-05, + "loss": 0.0228, "step": 1560 }, { "epoch": 2.3808169841721165, - "grad_norm": 0.2434811145067215, - "learning_rate": 9.429077657930551e-05, - "loss": 0.0344, + "grad_norm": 0.23994044959545135, + "learning_rate": 6.270625869968906e-05, + "loss": 0.0323, "step": 1570 }, { "epoch": 2.395981423561748, - "grad_norm": 0.20269888639450073, - "learning_rate": 9.416725774520667e-05, - "loss": 0.0364, + "grad_norm": 0.1954767256975174, + "learning_rate": 6.219316746656007e-05, + "loss": 0.0284, "step": 1580 }, { "epoch": 2.411145862951379, - "grad_norm": 0.360107958316803, - "learning_rate": 9.404249970564995e-05, - "loss": 0.0285, + "grad_norm": 0.25645068287849426, + "learning_rate": 6.167870736147713e-05, + "loss": 0.0234, "step": 1590 }, { "epoch": 2.4263103023410104, - "grad_norm": 0.4151430130004883, - "learning_rate": 9.391650596098496e-05, - "loss": 0.021, + "grad_norm": 0.7860223054885864, + "learning_rate": 6.116293614055744e-05, + "loss": 0.0195, "step": 1600 }, { "epoch": 2.4263103023410104, - "eval_loss": 0.04432837665081024, - "eval_runtime": 159.3883, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.03961315006017685, + "eval_runtime": 159.8725, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 1600 }, { "epoch": 2.4414747417306417, - "grad_norm": 0.2724536955356598, - "learning_rate": 9.378928004623163e-05, - "loss": 0.0231, + "grad_norm": 0.2385544329881668, + "learning_rate": 6.06459117071113e-05, + "loss": 0.0186, "step": 1610 }, { "epoch": 2.456639181120273, - "grad_norm": 0.3904866874217987, - "learning_rate": 9.366082553098097e-05, - "loss": 0.0249, + "grad_norm": 0.5610677599906921, + "learning_rate": 6.012769210514146e-05, + "loss": 0.0213, "step": 1620 }, { "epoch": 2.4718036205099043, - "grad_norm": 0.2163807600736618, - "learning_rate": 9.353114601929505e-05, - "loss": 0.0277, + "grad_norm": 0.1168915182352066, + "learning_rate": 5.9608335512826915e-05, + "loss": 0.0207, "step": 1630 }, { "epoch": 2.4869680598995356, - "grad_norm": 0.17040489614009857, - "learning_rate": 9.340024514960574e-05, - "loss": 0.0148, + "grad_norm": 0.08927187323570251, + "learning_rate": 5.908790023599144e-05, + "loss": 0.0143, "step": 1640 }, { "epoch": 2.502132499289167, - "grad_norm": 0.26220399141311646, - "learning_rate": 9.32681265946127e-05, - "loss": 0.0237, + "grad_norm": 0.2827674448490143, + "learning_rate": 5.856644470155781e-05, + "loss": 0.0221, "step": 1650 }, { "epoch": 2.502132499289167, - "eval_loss": 0.04865751415491104, - "eval_runtime": 159.3583, - "eval_samples_per_second": 7.361, - "eval_steps_per_second": 7.361, + "eval_loss": 0.04753277450799942, + "eval_runtime": 159.8134, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 1650 }, { "epoch": 2.5172969386787982, - "grad_norm": 0.4003421664237976, - "learning_rate": 9.313479406118029e-05, - "loss": 0.0336, + "grad_norm": 0.2750401496887207, + "learning_rate": 5.8044027450988546e-05, + "loss": 0.028, "step": 1660 }, { "epoch": 2.5324613780684295, - "grad_norm": 0.19889383018016815, - "learning_rate": 9.30002512902336e-05, - "loss": 0.0235, + "grad_norm": 0.19203156232833862, + "learning_rate": 5.752070713371371e-05, + "loss": 0.0161, "step": 1670 }, { "epoch": 2.547625817458061, - "grad_norm": 0.39777085185050964, - "learning_rate": 9.286450205665353e-05, - "loss": 0.0209, + "grad_norm": 0.1918914020061493, + "learning_rate": 5.699654250054662e-05, + "loss": 0.0165, "step": 1680 }, { "epoch": 2.562790256847692, - "grad_norm": 0.38904377818107605, - "learning_rate": 9.272755016917077e-05, - "loss": 0.0174, + "grad_norm": 0.1869034469127655, + "learning_rate": 5.647159239708809e-05, + "loss": 0.014, "step": 1690 }, { "epoch": 2.5779546962373234, - "grad_norm": 0.21256382763385773, - "learning_rate": 9.258939947025901e-05, - "loss": 0.0223, + "grad_norm": 0.18233343958854675, + "learning_rate": 5.5945915757120146e-05, + "loss": 0.0179, "step": 1700 }, { "epoch": 2.5779546962373234, - "eval_loss": 0.04217704012989998, - "eval_runtime": 159.3606, - "eval_samples_per_second": 7.361, - "eval_steps_per_second": 7.361, + "eval_loss": 0.04084445536136627, + "eval_runtime": 159.9315, + "eval_samples_per_second": 7.334, + "eval_steps_per_second": 7.334, "step": 1700 }, { "epoch": 2.5931191356269547, - "grad_norm": 0.4108022153377533, - "learning_rate": 9.245005383602719e-05, + "grad_norm": 0.4812876284122467, + "learning_rate": 5.5419571595989825e-05, "loss": 0.0272, "step": 1710 }, { "epoch": 2.608283575016586, - "grad_norm": 0.6556879281997681, - "learning_rate": 9.230951717611056e-05, - "loss": 0.028, + "grad_norm": 0.3959842920303345, + "learning_rate": 5.4892619003983734e-05, + "loss": 0.0248, "step": 1720 }, { "epoch": 2.6234480144062173, - "grad_norm": 0.2054949253797531, - "learning_rate": 9.21677934335612e-05, - "loss": 0.0194, + "grad_norm": 0.14155489206314087, + "learning_rate": 5.436511713969428e-05, + "loss": 0.0157, "step": 1730 }, { "epoch": 2.6386124537958486, - "grad_norm": 0.1257191002368927, - "learning_rate": 9.202488658473725e-05, - "loss": 0.028, + "grad_norm": 0.07145866006612778, + "learning_rate": 5.383712522337817e-05, + "loss": 0.024, "step": 1740 }, { "epoch": 2.65377689318548, - "grad_norm": 0.2633645534515381, - "learning_rate": 9.188080063919137e-05, - "loss": 0.0251, + "grad_norm": 0.26969072222709656, + "learning_rate": 5.3308702530308076e-05, + "loss": 0.0225, "step": 1750 }, { "epoch": 2.65377689318548, - "eval_loss": 0.04111066460609436, - "eval_runtime": 159.3666, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.038576629012823105, + "eval_runtime": 159.8548, + "eval_samples_per_second": 7.338, + "eval_steps_per_second": 7.338, "step": 1750 }, { "epoch": 2.6689413325751112, - "grad_norm": 0.264186292886734, - "learning_rate": 9.173553963955836e-05, - "loss": 0.0297, + "grad_norm": 0.3706766366958618, + "learning_rate": 5.2779908384118025e-05, + "loss": 0.0258, "step": 1760 }, { "epoch": 2.6841057719647425, - "grad_norm": 0.49441930651664734, - "learning_rate": 9.15891076614415e-05, - "loss": 0.0292, + "grad_norm": 0.36738407611846924, + "learning_rate": 5.22508021501434e-05, + "loss": 0.0215, "step": 1770 }, { "epoch": 2.699270211354374, - "grad_norm": 0.3683488368988037, - "learning_rate": 9.144150881329845e-05, - "loss": 0.0215, + "grad_norm": 0.34079742431640625, + "learning_rate": 5.1721443228756284e-05, + "loss": 0.0163, "step": 1780 }, { "epoch": 2.714434650744005, - "grad_norm": 0.37258797883987427, - "learning_rate": 9.129274723632579e-05, - "loss": 0.0214, + "grad_norm": 0.33863401412963867, + "learning_rate": 5.119189104869683e-05, + "loss": 0.0219, "step": 1790 }, { "epoch": 2.729599090133637, - "grad_norm": 0.3390253782272339, - "learning_rate": 9.114282710434297e-05, - "loss": 0.0284, + "grad_norm": 0.2650861442089081, + "learning_rate": 5.066220506040148e-05, + "loss": 0.0207, "step": 1800 }, { "epoch": 2.729599090133637, - "eval_loss": 0.04366917908191681, - "eval_runtime": 159.374, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.03975927457213402, + "eval_runtime": 159.781, + "eval_samples_per_second": 7.341, + "eval_steps_per_second": 7.341, "step": 1800 }, { "epoch": 2.7447635295232677, - "grad_norm": 0.28402847051620483, - "learning_rate": 9.099175262367509e-05, - "loss": 0.0298, + "grad_norm": 0.20335064828395844, + "learning_rate": 5.013244472932872e-05, + "loss": 0.0248, "step": 1810 }, { "epoch": 2.7599279689128995, - "grad_norm": 0.2853905260562897, - "learning_rate": 9.083952803303497e-05, - "loss": 0.0272, + "grad_norm": 0.29428455233573914, + "learning_rate": 4.960266952928316e-05, + "loss": 0.0215, "step": 1820 }, { "epoch": 2.7750924083025303, - "grad_norm": 0.31876683235168457, - "learning_rate": 9.068615760340416e-05, - "loss": 0.0273, + "grad_norm": 0.3635200262069702, + "learning_rate": 4.907293893573867e-05, + "loss": 0.0225, "step": 1830 }, { "epoch": 2.790256847692162, - "grad_norm": 0.30106455087661743, - "learning_rate": 9.053164563791316e-05, - "loss": 0.0247, + "grad_norm": 0.32583731412887573, + "learning_rate": 4.8543312419161396e-05, + "loss": 0.0197, "step": 1840 }, { "epoch": 2.805421287081793, - "grad_norm": 0.15850597620010376, - "learning_rate": 9.037599647172066e-05, - "loss": 0.019, + "grad_norm": 0.20632241666316986, + "learning_rate": 4.8013849438333165e-05, + "loss": 0.015, "step": 1850 }, { "epoch": 2.805421287081793, - "eval_loss": 0.04848329350352287, - "eval_runtime": 159.325, - "eval_samples_per_second": 7.362, - "eval_steps_per_second": 7.362, + "eval_loss": 0.041027262806892395, + "eval_runtime": 159.7438, + "eval_samples_per_second": 7.343, + "eval_steps_per_second": 7.343, "step": 1850 }, { "epoch": 2.8205857264714247, - "grad_norm": 0.6553308963775635, - "learning_rate": 9.021921447189188e-05, - "loss": 0.0322, + "grad_norm": 0.8117703795433044, + "learning_rate": 4.748460943367643e-05, + "loss": 0.0212, "step": 1860 }, { "epoch": 2.8357501658610555, - "grad_norm": 0.20365716516971588, - "learning_rate": 9.006130403727613e-05, - "loss": 0.033, + "grad_norm": 0.374080628156662, + "learning_rate": 4.695565182058113e-05, + "loss": 0.0312, "step": 1870 }, { "epoch": 2.8509146052506873, - "grad_norm": 0.1727505922317505, - "learning_rate": 8.990226959838328e-05, - "loss": 0.0278, + "grad_norm": 0.36783552169799805, + "learning_rate": 4.642703598273431e-05, + "loss": 0.0249, "step": 1880 }, { "epoch": 2.8660790446403186, - "grad_norm": 0.16985675692558289, - "learning_rate": 8.974211561725956e-05, - "loss": 0.0263, + "grad_norm": 0.2705755829811096, + "learning_rate": 4.589882126545352e-05, + "loss": 0.0234, "step": 1890 }, { "epoch": 2.88124348402995, - "grad_norm": 0.2777211368083954, - "learning_rate": 8.958084658736226e-05, - "loss": 0.0366, + "grad_norm": 0.25217315554618835, + "learning_rate": 4.537106696902425e-05, + "loss": 0.0295, "step": 1900 }, { "epoch": 2.88124348402995, - "eval_loss": 0.04076583683490753, - "eval_runtime": 159.3658, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.03719955310225487, + "eval_runtime": 159.7214, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, "step": 1900 }, { "epoch": 2.896407923419581, - "grad_norm": 1.2582695484161377, - "learning_rate": 8.941846703343373e-05, - "loss": 0.0297, + "grad_norm": 0.40652021765708923, + "learning_rate": 4.484383234204266e-05, + "loss": 0.0224, "step": 1910 }, { "epoch": 2.9115723628092125, - "grad_norm": 0.40788841247558594, - "learning_rate": 8.925498151137446e-05, - "loss": 0.027, + "grad_norm": 0.3361893892288208, + "learning_rate": 4.4317176574763935e-05, + "loss": 0.0247, "step": 1920 }, { "epoch": 2.926736802198844, - "grad_norm": 0.16330642998218536, - "learning_rate": 8.909039460811515e-05, - "loss": 0.0276, + "grad_norm": 0.35932040214538574, + "learning_rate": 4.3791158792457334e-05, + "loss": 0.024, "step": 1930 }, { "epoch": 2.941901241588475, - "grad_norm": 0.1846770942211151, - "learning_rate": 8.892471094148807e-05, - "loss": 0.0257, + "grad_norm": 0.15191532671451569, + "learning_rate": 4.3265838048768334e-05, + "loss": 0.0216, "step": 1940 }, { "epoch": 2.9570656809781064, - "grad_norm": 0.27030831575393677, - "learning_rate": 8.875793516009752e-05, - "loss": 0.023, + "grad_norm": 0.5233368873596191, + "learning_rate": 4.274127331908915e-05, + "loss": 0.0227, "step": 1950 }, { "epoch": 2.9570656809781064, - "eval_loss": 0.041393086314201355, - "eval_runtime": 159.3966, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.03795257583260536, + "eval_runtime": 159.8645, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 1950 }, { "epoch": 2.9722301203677377, - "grad_norm": 0.07221832871437073, - "learning_rate": 8.859007194318939e-05, - "loss": 0.0197, + "grad_norm": 0.05902265012264252, + "learning_rate": 4.221752349393758e-05, + "loss": 0.0173, "step": 1960 }, { "epoch": 2.987394559757369, - "grad_norm": 0.18859471380710602, - "learning_rate": 8.842112600051983e-05, - "loss": 0.0306, + "grad_norm": 0.1508518010377884, + "learning_rate": 4.1694647372345886e-05, + "loss": 0.0249, "step": 1970 }, { "epoch": 3.0025589991470003, - "grad_norm": 0.10923754423856735, - "learning_rate": 8.825110207222318e-05, - "loss": 0.0237, + "grad_norm": 0.13050998747348785, + "learning_rate": 4.117270365525946e-05, + "loss": 0.0182, "step": 1980 }, { "epoch": 3.0177234385366316, - "grad_norm": 0.1019091084599495, - "learning_rate": 8.808000492867888e-05, - "loss": 0.016, + "grad_norm": 0.06321557611227036, + "learning_rate": 4.065175093894694e-05, + "loss": 0.0132, "step": 1990 }, { "epoch": 3.032887877926263, - "grad_norm": 0.23444364964962006, - "learning_rate": 8.790783937037777e-05, - "loss": 0.0161, + "grad_norm": 0.155404731631279, + "learning_rate": 4.013184770842167e-05, + "loss": 0.0128, "step": 2000 }, { "epoch": 3.032887877926263, - "eval_loss": 0.04153500497341156, - "eval_runtime": 159.4222, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.03980863094329834, + "eval_runtime": 159.767, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 7.342, "step": 2000 }, { "epoch": 3.048052317315894, - "grad_norm": 0.22297435998916626, - "learning_rate": 8.773461022778722e-05, - "loss": 0.02, + "grad_norm": 0.18662160634994507, + "learning_rate": 3.9613052330876064e-05, + "loss": 0.0166, "step": 2010 }, { "epoch": 3.0632167567055255, - "grad_norm": 1.0138680934906006, - "learning_rate": 8.756032236121579e-05, - "loss": 0.0194, + "grad_norm": 0.5268431901931763, + "learning_rate": 3.909542304912881e-05, + "loss": 0.0136, "step": 2020 }, { "epoch": 3.078381196095157, - "grad_norm": 0.2017587423324585, - "learning_rate": 8.73849806606767e-05, - "loss": 0.018, + "grad_norm": 0.18333646655082703, + "learning_rate": 3.857901797508628e-05, + "loss": 0.0162, "step": 2030 }, { "epoch": 3.093545635484788, - "grad_norm": 0.12758751213550568, - "learning_rate": 8.720859004575074e-05, - "loss": 0.0202, + "grad_norm": 0.2035679817199707, + "learning_rate": 3.80638950832186e-05, + "loss": 0.0174, "step": 2040 }, { "epoch": 3.1087100748744194, - "grad_norm": 0.2535541355609894, - "learning_rate": 8.703115546544819e-05, - "loss": 0.0175, + "grad_norm": 0.200400710105896, + "learning_rate": 3.7550112204051014e-05, + "loss": 0.0096, "step": 2050 }, { "epoch": 3.1087100748744194, - "eval_loss": 0.04077655076980591, - "eval_runtime": 159.4217, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.04321278631687164, + "eval_runtime": 159.8384, + "eval_samples_per_second": 7.339, + "eval_steps_per_second": 7.339, "step": 2050 }, { "epoch": 3.1238745142640507, - "grad_norm": 0.10268954932689667, - "learning_rate": 8.685268189807e-05, - "loss": 0.0118, + "grad_norm": 0.10098811984062195, + "learning_rate": 3.703772701767167e-05, + "loss": 0.0098, "step": 2060 }, { "epoch": 3.139038953653682, - "grad_norm": 0.30055707693099976, - "learning_rate": 8.667317435106801e-05, - "loss": 0.0221, + "grad_norm": 0.3599345088005066, + "learning_rate": 3.652679704725596e-05, + "loss": 0.0095, "step": 2070 }, { "epoch": 3.1542033930433133, - "grad_norm": 0.29088059067726135, - "learning_rate": 8.649263786090466e-05, - "loss": 0.0137, + "grad_norm": 0.17654354870319366, + "learning_rate": 3.601737965260882e-05, + "loss": 0.0111, "step": 2080 }, { "epoch": 3.1693678324329446, - "grad_norm": 0.21581275761127472, - "learning_rate": 8.63110774929115e-05, - "loss": 0.0108, + "grad_norm": 0.3085581660270691, + "learning_rate": 3.550953202372503e-05, + "loss": 0.0072, "step": 2090 }, { "epoch": 3.184532271822576, - "grad_norm": 0.2984847128391266, - "learning_rate": 8.612849834114706e-05, - "loss": 0.0144, + "grad_norm": 0.5332884192466736, + "learning_rate": 3.500331117436895e-05, + "loss": 0.0076, "step": 2100 }, { "epoch": 3.184532271822576, - "eval_loss": 0.04835474491119385, - "eval_runtime": 159.3861, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.048784155398607254, + "eval_runtime": 159.7604, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 7.342, "step": 2100 }, { "epoch": 3.1996967112122072, - "grad_norm": 0.4704117178916931, - "learning_rate": 8.594490552825414e-05, - "loss": 0.0155, + "grad_norm": 0.20715317130088806, + "learning_rate": 3.4498773935673626e-05, + "loss": 0.0089, "step": 2110 }, { "epoch": 3.2148611506018385, - "grad_norm": 0.21770451962947845, - "learning_rate": 8.576030420531581e-05, - "loss": 0.0227, + "grad_norm": 0.1660224199295044, + "learning_rate": 3.399597694976081e-05, + "loss": 0.0116, "step": 2120 }, { "epoch": 3.23002558999147, - "grad_norm": 0.5758800506591797, - "learning_rate": 8.557469955171106e-05, - "loss": 0.0192, + "grad_norm": 0.19592803716659546, + "learning_rate": 3.349497666338187e-05, + "loss": 0.0152, "step": 2130 }, { "epoch": 3.245190029381101, - "grad_norm": 0.15580996870994568, - "learning_rate": 8.538809677496948e-05, - "loss": 0.0158, + "grad_norm": 0.19497735798358917, + "learning_rate": 3.299582932158085e-05, + "loss": 0.0118, "step": 2140 }, { "epoch": 3.260354468770733, - "grad_norm": 0.19080595672130585, - "learning_rate": 8.520050111062501e-05, - "loss": 0.0198, + "grad_norm": 0.23641538619995117, + "learning_rate": 3.2498590961379996e-05, + "loss": 0.0122, "step": 2150 }, { "epoch": 3.260354468770733, - "eval_loss": 0.046114206314086914, - "eval_runtime": 159.4004, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.04506111145019531, + "eval_runtime": 159.8166, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 2150 }, { "epoch": 3.2755189081603637, - "grad_norm": 0.10289459675550461, - "learning_rate": 8.501191782206923e-05, - "loss": 0.0141, + "grad_norm": 0.037290170788764954, + "learning_rate": 3.200331740548887e-05, + "loss": 0.0105, "step": 2160 }, { "epoch": 3.2906833475499955, - "grad_norm": 0.27776649594306946, - "learning_rate": 8.482235220040358e-05, - "loss": 0.0212, + "grad_norm": 0.43852153420448303, + "learning_rate": 3.1510064256037274e-05, + "loss": 0.0136, "step": 2170 }, { "epoch": 3.3058477869396263, - "grad_norm": 0.09492180496454239, - "learning_rate": 8.463180956429086e-05, - "loss": 0.0139, + "grad_norm": 0.11988549679517746, + "learning_rate": 3.1018886888333065e-05, + "loss": 0.0089, "step": 2180 }, { "epoch": 3.321012226329258, - "grad_norm": 0.26768144965171814, - "learning_rate": 8.444029525980617e-05, - "loss": 0.0243, + "grad_norm": 0.32281386852264404, + "learning_rate": 3.052984044464548e-05, + "loss": 0.0159, "step": 2190 }, { "epoch": 3.3361766657188894, - "grad_norm": 0.13106976449489594, - "learning_rate": 8.424781466028675e-05, - "loss": 0.0168, + "grad_norm": 0.1835431307554245, + "learning_rate": 3.0042979828014496e-05, + "loss": 0.0101, "step": 2200 }, { "epoch": 3.3361766657188894, - "eval_loss": 0.04420452192425728, - "eval_runtime": 159.385, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.046676941215991974, + "eval_runtime": 159.7855, + "eval_samples_per_second": 7.341, + "eval_steps_per_second": 7.341, "step": 2200 }, { "epoch": 3.3513411051085207, - "grad_norm": 0.24808184802532196, - "learning_rate": 8.405437316618128e-05, - "loss": 0.0205, + "grad_norm": 0.20971719920635223, + "learning_rate": 2.9558359696087152e-05, + "loss": 0.015, "step": 2210 }, { "epoch": 3.366505544498152, - "grad_norm": 0.20891816914081573, - "learning_rate": 8.385997620489838e-05, - "loss": 0.0188, + "grad_norm": 0.30802661180496216, + "learning_rate": 2.9076034454981332e-05, + "loss": 0.0143, "step": 2220 }, { "epoch": 3.3816699838877833, - "grad_norm": 0.4753153622150421, - "learning_rate": 8.366462923065432e-05, - "loss": 0.02, + "grad_norm": 0.24673651158809662, + "learning_rate": 2.8596058253177932e-05, + "loss": 0.0124, "step": 2230 }, { "epoch": 3.3968344232774146, - "grad_norm": 0.06872042268514633, - "learning_rate": 8.346833772432e-05, - "loss": 0.0176, + "grad_norm": 0.09349372982978821, + "learning_rate": 2.811848497544175e-05, + "loss": 0.013, "step": 2240 }, { "epoch": 3.411998862667046, - "grad_norm": 0.25017935037612915, - "learning_rate": 8.327110719326708e-05, - "loss": 0.0207, + "grad_norm": 0.14247673749923706, + "learning_rate": 2.764336823677216e-05, + "loss": 0.0124, "step": 2250 }, { "epoch": 3.411998862667046, - "eval_loss": 0.043796706944704056, - "eval_runtime": 159.3036, - "eval_samples_per_second": 7.363, - "eval_steps_per_second": 7.363, + "eval_loss": 0.04635697975754738, + "eval_runtime": 159.7764, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 7.342, "step": 2250 }, { "epoch": 3.427163302056677, - "grad_norm": 0.478998601436615, - "learning_rate": 8.307294317121361e-05, - "loss": 0.0161, + "grad_norm": 0.2467641681432724, + "learning_rate": 2.717076137638388e-05, + "loss": 0.0096, "step": 2260 }, { "epoch": 3.4423277414463085, - "grad_norm": 0.23058924078941345, - "learning_rate": 8.287385121806869e-05, - "loss": 0.018, + "grad_norm": 0.32236963510513306, + "learning_rate": 2.6700717451719066e-05, + "loss": 0.0153, "step": 2270 }, { "epoch": 3.45749218083594, - "grad_norm": 0.2511160373687744, - "learning_rate": 8.267383691977648e-05, - "loss": 0.0202, + "grad_norm": 0.33867254853248596, + "learning_rate": 2.6233289232490447e-05, + "loss": 0.0156, "step": 2280 }, { "epoch": 3.472656620225571, - "grad_norm": 0.0964888334274292, - "learning_rate": 8.247290588815944e-05, - "loss": 0.0174, + "grad_norm": 0.17547796666622162, + "learning_rate": 2.5768529194757474e-05, + "loss": 0.0071, "step": 2290 }, { "epoch": 3.4878210596152024, - "grad_norm": 0.29743412137031555, - "learning_rate": 8.227106376076095e-05, - "loss": 0.0141, + "grad_norm": 0.22590090334415436, + "learning_rate": 2.5306489515034713e-05, + "loss": 0.0113, "step": 2300 }, { "epoch": 3.4878210596152024, - "eval_loss": 0.048306405544281006, - "eval_runtime": 159.3919, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.047961652278900146, + "eval_runtime": 159.815, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 2300 }, { "epoch": 3.5029854990048337, - "grad_norm": 0.18017958104610443, - "learning_rate": 8.20683162006871e-05, - "loss": 0.0211, + "grad_norm": 0.1563863903284073, + "learning_rate": 2.484722206443455e-05, + "loss": 0.017, "step": 2310 }, { "epoch": 3.518149938394465, - "grad_norm": 0.08050891011953354, - "learning_rate": 8.186466889644775e-05, - "loss": 0.0204, + "grad_norm": 0.09933234751224518, + "learning_rate": 2.4390778402843605e-05, + "loss": 0.0129, "step": 2320 }, { "epoch": 3.5333143777840963, - "grad_norm": 0.30461400747299194, - "learning_rate": 8.166012756179706e-05, - "loss": 0.0159, + "grad_norm": 0.2538220286369324, + "learning_rate": 2.3937209773134472e-05, + "loss": 0.0094, "step": 2330 }, { "epoch": 3.5484788171737276, - "grad_norm": 0.19909420609474182, - "learning_rate": 8.145469793557306e-05, - "loss": 0.0161, + "grad_norm": 0.32708045840263367, + "learning_rate": 2.3486567095412864e-05, + "loss": 0.0098, "step": 2340 }, { "epoch": 3.563643256563359, - "grad_norm": 0.27763473987579346, - "learning_rate": 8.124838578153664e-05, - "loss": 0.0176, + "grad_norm": 0.3891366720199585, + "learning_rate": 2.3038900961301053e-05, + "loss": 0.0126, "step": 2350 }, { "epoch": 3.563643256563359, - "eval_loss": 0.04680194333195686, - "eval_runtime": 159.445, - "eval_samples_per_second": 7.357, - "eval_steps_per_second": 7.357, + "eval_loss": 0.047991689294576645, + "eval_runtime": 159.8161, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 2350 }, { "epoch": 3.57880769595299, - "grad_norm": 0.3020648658275604, - "learning_rate": 8.10411968882099e-05, - "loss": 0.0145, + "grad_norm": 0.06344733387231827, + "learning_rate": 2.259426162825819e-05, + "loss": 0.0099, "step": 2360 }, { "epoch": 3.5939721353426215, - "grad_norm": 0.5402631759643555, - "learning_rate": 8.083313706871372e-05, - "loss": 0.0222, + "grad_norm": 0.5568035244941711, + "learning_rate": 2.215269901393805e-05, + "loss": 0.0164, "step": 2370 }, { "epoch": 3.609136574732253, - "grad_norm": 0.25688526034355164, - "learning_rate": 8.062421216060463e-05, - "loss": 0.0152, + "grad_norm": 0.06175050139427185, + "learning_rate": 2.1714262690585192e-05, + "loss": 0.0094, "step": 2380 }, { "epoch": 3.624301014121884, - "grad_norm": 0.2796648442745209, - "learning_rate": 8.041442802571103e-05, - "loss": 0.0181, + "grad_norm": 0.6271355152130127, + "learning_rate": 2.1279001879469424e-05, + "loss": 0.0126, "step": 2390 }, { "epoch": 3.6394654535115154, - "grad_norm": 0.04485485702753067, - "learning_rate": 8.020379054996874e-05, - "loss": 0.0139, + "grad_norm": 0.14509186148643494, + "learning_rate": 2.084696544536025e-05, + "loss": 0.0114, "step": 2400 }, { "epoch": 3.6394654535115154, - "eval_loss": 0.046483203768730164, - "eval_runtime": 159.3412, - "eval_samples_per_second": 7.362, - "eval_steps_per_second": 7.362, + "eval_loss": 0.04358929395675659, + "eval_runtime": 159.7768, + "eval_samples_per_second": 7.341, + "eval_steps_per_second": 7.341, "step": 2400 }, { "epoch": 3.6546298929011467, - "grad_norm": 0.07298114895820618, - "learning_rate": 7.99923056432559e-05, - "loss": 0.0178, + "grad_norm": 0.04602226987481117, + "learning_rate": 2.0418201891040778e-05, + "loss": 0.0102, "step": 2410 }, { "epoch": 3.669794332290778, - "grad_norm": 0.3859151303768158, - "learning_rate": 7.977997923922707e-05, - "loss": 0.0304, + "grad_norm": 0.4110815227031708, + "learning_rate": 1.9992759351862772e-05, + "loss": 0.0201, "step": 2420 }, { "epoch": 3.6849587716804093, - "grad_norm": 0.2295611947774887, - "learning_rate": 7.956681729514677e-05, - "loss": 0.0158, + "grad_norm": 0.3334294557571411, + "learning_rate": 1.9570685590342463e-05, + "loss": 0.0098, "step": 2430 }, { "epoch": 3.7001232110700406, - "grad_norm": 0.29482001066207886, - "learning_rate": 7.935282579172239e-05, - "loss": 0.015, + "grad_norm": 0.19647225737571716, + "learning_rate": 1.9152027990798748e-05, + "loss": 0.0109, "step": 2440 }, { "epoch": 3.715287650459672, - "grad_norm": 0.5195229649543762, - "learning_rate": 7.913801073293639e-05, - "loss": 0.0124, + "grad_norm": 0.06987540423870087, + "learning_rate": 1.8736833554033263e-05, + "loss": 0.0071, "step": 2450 }, { "epoch": 3.715287650459672, - "eval_loss": 0.04703487083315849, - "eval_runtime": 159.4022, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.047177620232105255, + "eval_runtime": 159.8168, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 2450 }, { "epoch": 3.7304520898493037, - "grad_norm": 0.28464585542678833, - "learning_rate": 7.89223781458777e-05, - "loss": 0.0193, + "grad_norm": 0.3399925231933594, + "learning_rate": 1.8325148892054093e-05, + "loss": 0.0125, "step": 2460 }, { "epoch": 3.7456165292389345, - "grad_norm": 0.3494454324245453, - "learning_rate": 7.870593408057287e-05, - "loss": 0.0167, + "grad_norm": 0.1921069622039795, + "learning_rate": 1.7917020222842662e-05, + "loss": 0.0089, "step": 2470 }, { "epoch": 3.7607809686285663, - "grad_norm": 0.1398647278547287, - "learning_rate": 7.848868460981606e-05, - "loss": 0.0153, + "grad_norm": 0.10305212438106537, + "learning_rate": 1.751249336516513e-05, + "loss": 0.0106, "step": 2480 }, { "epoch": 3.775945408018197, - "grad_norm": 0.6757034063339233, - "learning_rate": 7.827063582899885e-05, - "loss": 0.0212, + "grad_norm": 0.07067303359508514, + "learning_rate": 1.7111613733428522e-05, + "loss": 0.0125, "step": 2490 }, { "epoch": 3.791109847407829, - "grad_norm": 0.10396584123373032, - "learning_rate": 7.805179385593912e-05, - "loss": 0.0207, + "grad_norm": 0.1127607673406601, + "learning_rate": 1.671442633258218e-05, + "loss": 0.0117, "step": 2500 }, { "epoch": 3.791109847407829, - "eval_loss": 0.04721406102180481, - "eval_runtime": 159.3581, - "eval_samples_per_second": 7.361, - "eval_steps_per_second": 7.361, + "eval_loss": 0.04904229938983917, + "eval_runtime": 159.7619, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 7.342, "step": 2500 }, { "epoch": 3.8062742867974597, - "grad_norm": 0.3236182630062103, - "learning_rate": 7.783216483070941e-05, - "loss": 0.0222, + "grad_norm": 0.47790926694869995, + "learning_rate": 1.6320975753065453e-05, + "loss": 0.0095, "step": 2510 }, { "epoch": 3.8214387261870915, - "grad_norm": 0.48767486214637756, - "learning_rate": 7.761175491546471e-05, - "loss": 0.0176, + "grad_norm": 0.6272779703140259, + "learning_rate": 1.5931306165801452e-05, + "loss": 0.01, "step": 2520 }, { "epoch": 3.8366031655767223, - "grad_norm": 0.13401205837726593, - "learning_rate": 7.73905702942695e-05, - "loss": 0.0224, + "grad_norm": 0.11979278177022934, + "learning_rate": 1.554546131723848e-05, + "loss": 0.0112, "step": 2530 }, { "epoch": 3.851767604966354, - "grad_norm": 0.35947826504707336, - "learning_rate": 7.716861717292425e-05, - "loss": 0.0142, + "grad_norm": 0.4187261760234833, + "learning_rate": 1.5163484524438516e-05, + "loss": 0.0075, "step": 2540 }, { "epoch": 3.8669320443559854, - "grad_norm": 0.18299825489521027, - "learning_rate": 7.694590177879137e-05, - "loss": 0.0243, + "grad_norm": 0.5277063846588135, + "learning_rate": 1.4785418670214496e-05, + "loss": 0.0145, "step": 2550 }, { "epoch": 3.8669320443559854, - "eval_loss": 0.043086327612400055, - "eval_runtime": 159.4014, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.047451041638851166, + "eval_runtime": 159.8306, + "eval_samples_per_second": 7.339, + "eval_steps_per_second": 7.339, "step": 2550 }, { "epoch": 3.8820964837456167, - "grad_norm": 0.3703920543193817, - "learning_rate": 7.67224303606204e-05, - "loss": 0.0198, + "grad_norm": 0.4268508553504944, + "learning_rate": 1.4411306198315777e-05, + "loss": 0.0123, "step": 2560 }, { "epoch": 3.897260923135248, - "grad_norm": 0.47570738196372986, - "learning_rate": 7.649820918837276e-05, - "loss": 0.0161, + "grad_norm": 0.10897725075483322, + "learning_rate": 1.4041189108663421e-05, + "loss": 0.008, "step": 2570 }, { "epoch": 3.9124253625248793, - "grad_norm": 0.16566331684589386, - "learning_rate": 7.627324455304575e-05, - "loss": 0.0145, + "grad_norm": 0.24288566410541534, + "learning_rate": 1.3675108952634824e-05, + "loss": 0.007, "step": 2580 }, { "epoch": 3.9275898019145106, - "grad_norm": 0.5200514793395996, - "learning_rate": 7.604754276649608e-05, - "loss": 0.0207, + "grad_norm": 0.3925674259662628, + "learning_rate": 1.3313106828399147e-05, + "loss": 0.0147, "step": 2590 }, { "epoch": 3.942754241304142, - "grad_norm": 0.21101753413677216, - "learning_rate": 7.582111016126292e-05, - "loss": 0.0162, + "grad_norm": 0.07586287707090378, + "learning_rate": 1.2955223376303205e-05, + "loss": 0.008, "step": 2600 }, { "epoch": 3.942754241304142, - "eval_loss": 0.044680364429950714, - "eval_runtime": 159.3302, - "eval_samples_per_second": 7.362, - "eval_steps_per_second": 7.362, + "eval_loss": 0.04594454541802406, + "eval_runtime": 159.8774, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 7.337, "step": 2600 }, { "epoch": 3.957918680693773, - "grad_norm": 0.11622188985347748, - "learning_rate": 7.559395309038994e-05, - "loss": 0.0202, + "grad_norm": 0.19912473857402802, + "learning_rate": 1.2601498774309112e-05, + "loss": 0.0104, "step": 2610 }, { "epoch": 3.9730831200834045, - "grad_norm": 0.25880640745162964, - "learning_rate": 7.536607792724732e-05, - "loss": 0.0198, + "grad_norm": 0.40546008944511414, + "learning_rate": 1.2251972733483612e-05, + "loss": 0.0117, "step": 2620 }, { "epoch": 3.9882475594730358, - "grad_norm": 0.5818021893501282, - "learning_rate": 7.513749106535278e-05, - "loss": 0.0167, + "grad_norm": 0.6201793551445007, + "learning_rate": 1.19066844935399e-05, + "loss": 0.0106, "step": 2630 }, { "epoch": 4.003411998862667, - "grad_norm": 0.16204509139060974, - "learning_rate": 7.490819891819228e-05, - "loss": 0.0219, + "grad_norm": 0.12441912293434143, + "learning_rate": 1.156567281843241e-05, + "loss": 0.0095, "step": 2640 }, { "epoch": 4.018576438252298, - "grad_norm": 0.3111201226711273, - "learning_rate": 7.467820791904004e-05, - "loss": 0.0087, + "grad_norm": 0.24595439434051514, + "learning_rate": 1.1228975992004842e-05, + "loss": 0.0044, "step": 2650 }, { "epoch": 4.018576438252298, - "eval_loss": 0.04547402262687683, - "eval_runtime": 159.3926, - "eval_samples_per_second": 7.359, - "eval_steps_per_second": 7.359, + "eval_loss": 0.048001233488321304, + "eval_runtime": 159.8103, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 2650 }, { "epoch": 4.03374087764193, - "grad_norm": 0.24685142934322357, - "learning_rate": 7.444752452077801e-05, - "loss": 0.0115, + "grad_norm": 0.33981209993362427, + "learning_rate": 1.0896631813692377e-05, + "loss": 0.0076, "step": 2660 }, { "epoch": 4.048905317031561, - "grad_norm": 0.20904898643493652, - "learning_rate": 7.421615519571489e-05, - "loss": 0.0097, + "grad_norm": 0.2191678285598755, + "learning_rate": 1.0568677594277899e-05, + "loss": 0.0043, "step": 2670 }, { "epoch": 4.064069756421192, - "grad_norm": 0.12282831966876984, - "learning_rate": 7.39841064354045e-05, - "loss": 0.0069, + "grad_norm": 0.06934363394975662, + "learning_rate": 1.0245150151703509e-05, + "loss": 0.003, "step": 2680 }, { "epoch": 4.079234195810824, - "grad_norm": 0.3911077678203583, - "learning_rate": 7.375138475046363e-05, - "loss": 0.0065, + "grad_norm": 0.19347621500492096, + "learning_rate": 9.926085806936918e-06, + "loss": 0.0057, "step": 2690 }, { "epoch": 4.094398635200455, - "grad_norm": 0.26328033208847046, - "learning_rate": 7.351799667038942e-05, - "loss": 0.0139, + "grad_norm": 0.28373149037361145, + "learning_rate": 9.611520379894068e-06, + "loss": 0.0061, "step": 2700 }, { "epoch": 4.094398635200455, - "eval_loss": 0.05206388607621193, - "eval_runtime": 159.4264, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.05424731224775314, + "eval_runtime": 159.7665, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 7.342, "step": 2700 }, { "epoch": 4.109563074590087, - "grad_norm": 0.10450565069913864, - "learning_rate": 7.32839487433761e-05, - "loss": 0.0072, + "grad_norm": 0.10718700289726257, + "learning_rate": 9.301489185417578e-06, + "loss": 0.0043, "step": 2710 }, { "epoch": 4.1247275139797175, - "grad_norm": 0.2634604573249817, - "learning_rate": 7.304924753613127e-05, - "loss": 0.0073, + "grad_norm": 0.06889745593070984, + "learning_rate": 8.996027029312314e-06, + "loss": 0.0038, "step": 2720 }, { "epoch": 4.139891953369349, - "grad_norm": 0.5017417669296265, - "learning_rate": 7.281389963369174e-05, - "loss": 0.0135, + "grad_norm": 0.4347403645515442, + "learning_rate": 8.695168204437721e-06, + "loss": 0.0102, "step": 2730 }, { "epoch": 4.15505639275898, - "grad_norm": 0.5688347816467285, - "learning_rate": 7.257791163923869e-05, - "loss": 0.0109, + "grad_norm": 1.3129316568374634, + "learning_rate": 8.398946486858029e-06, + "loss": 0.0066, "step": 2740 }, { "epoch": 4.170220832148612, - "grad_norm": 0.18953905999660492, - "learning_rate": 7.234129017391245e-05, - "loss": 0.0117, + "grad_norm": 0.044139549136161804, + "learning_rate": 8.107395132050333e-06, + "loss": 0.0048, "step": 2750 }, { "epoch": 4.170220832148612, - "eval_loss": 0.04974653944373131, - "eval_runtime": 159.4242, - "eval_samples_per_second": 7.358, - "eval_steps_per_second": 7.358, + "eval_loss": 0.05559329688549042, + "eval_runtime": 159.8213, + "eval_samples_per_second": 7.339, + "eval_steps_per_second": 7.339, "step": 2750 }, { "epoch": 4.185385271538243, - "grad_norm": 0.18478193879127502, - "learning_rate": 7.210404187662668e-05, - "loss": 0.0106, + "grad_norm": 0.0535513274371624, + "learning_rate": 7.820546871171142e-06, + "loss": 0.0067, "step": 2760 }, { "epoch": 4.200549710927874, - "grad_norm": 0.09356517344713211, - "learning_rate": 7.186617340388211e-05, - "loss": 0.007, + "grad_norm": 0.37897995114326477, + "learning_rate": 7.5384339073818145e-06, + "loss": 0.0052, "step": 2770 }, { "epoch": 4.215714150317505, - "grad_norm": 0.3026033639907837, - "learning_rate": 7.162769142957987e-05, - "loss": 0.0096, + "grad_norm": 0.18212661147117615, + "learning_rate": 7.261087912233228e-06, + "loss": 0.0057, "step": 2780 }, { "epoch": 4.230878589707137, - "grad_norm": 0.1788073480129242, - "learning_rate": 7.138860264483413e-05, - "loss": 0.008, + "grad_norm": 0.09377432614564896, + "learning_rate": 6.988540022110235e-06, + "loss": 0.0062, "step": 2790 }, { "epoch": 4.246043029096768, - "grad_norm": 0.17431391775608063, - "learning_rate": 7.114891375778438e-05, - "loss": 0.0069, + "grad_norm": 0.1912451833486557, + "learning_rate": 6.720820834735969e-06, + "loss": 0.0043, "step": 2800 }, { "epoch": 4.246043029096768, - "eval_loss": 0.06040658801794052, - "eval_runtime": 159.3654, - "eval_samples_per_second": 7.36, - "eval_steps_per_second": 7.36, + "eval_loss": 0.05572189763188362, + "eval_runtime": 159.8918, + "eval_samples_per_second": 7.336, + "eval_steps_per_second": 7.336, "step": 2800 }, { "epoch": 4.2612074684864, - "grad_norm": 0.5801697969436646, - "learning_rate": 7.090863149340731e-05, - "loss": 0.0087, + "grad_norm": 0.32161590456962585, + "learning_rate": 6.457960405736968e-06, + "loss": 0.0077, "step": 2810 }, { "epoch": 4.2763719078760305, - "grad_norm": 0.25983890891075134, - "learning_rate": 7.066776259332797e-05, - "loss": 0.007, + "grad_norm": 0.07208878546953201, + "learning_rate": 6.199988245268778e-06, + "loss": 0.0043, "step": 2820 }, { "epoch": 4.291536347265662, - "grad_norm": 0.47554492950439453, - "learning_rate": 7.042631381563081e-05, - "loss": 0.0076, + "grad_norm": 0.09758219122886658, + "learning_rate": 5.946933314703157e-06, + "loss": 0.003, "step": 2830 }, { "epoch": 4.306700786655293, - "grad_norm": 0.3678058087825775, - "learning_rate": 7.018429193466988e-05, - "loss": 0.0074, + "grad_norm": 0.22281204164028168, + "learning_rate": 5.698824023376531e-06, + "loss": 0.0051, "step": 2840 }, { "epoch": 4.321865226044925, - "grad_norm": 0.6562399864196777, - "learning_rate": 6.994170374087893e-05, - "loss": 0.0121, + "grad_norm": 0.4457966983318329, + "learning_rate": 5.455688225400802e-06, + "loss": 0.0049, "step": 2850 }, { "epoch": 4.321865226044925, - "eval_loss": 0.06339146196842194, - "eval_runtime": 159.5041, - "eval_samples_per_second": 7.354, - "eval_steps_per_second": 7.354, + "eval_loss": 0.05698266625404358, + "eval_runtime": 159.8196, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, "step": 2850 }, { "epoch": 4.337029665434556, - "grad_norm": 0.24855035543441772, - "learning_rate": 6.969855604058072e-05, - "loss": 0.0175, + "grad_norm": 0.22873935103416443, + "learning_rate": 5.217553216536098e-06, + "loss": 0.0069, "step": 2860 }, { "epoch": 4.3521941048241874, - "grad_norm": 0.14766111969947815, - "learning_rate": 6.945485565579622e-05, - "loss": 0.0129, + "grad_norm": 0.13292750716209412, + "learning_rate": 4.984445731126542e-06, + "loss": 0.0059, "step": 2870 }, { "epoch": 4.367358544213818, - "grad_norm": 0.25352802872657776, - "learning_rate": 6.92106094240531e-05, - "loss": 0.0063, + "grad_norm": 0.021369215101003647, + "learning_rate": 4.756391939098853e-06, + "loss": 0.0036, "step": 2880 }, { "epoch": 4.38252298360345, - "grad_norm": 0.05130884796380997, - "learning_rate": 6.896582419819392e-05, - "loss": 0.016, + "grad_norm": 0.06876254081726074, + "learning_rate": 4.533417443024374e-06, + "loss": 0.0077, "step": 2890 }, { "epoch": 4.397687422993081, - "grad_norm": 0.3185183107852936, - "learning_rate": 6.872050684618382e-05, - "loss": 0.0063, + "grad_norm": 0.1053222119808197, + "learning_rate": 4.315547275244769e-06, + "loss": 0.0019, "step": 2900 }, { "epoch": 4.397687422993081, - "eval_loss": 0.05942343547940254, - "eval_runtime": 159.5952, - "eval_samples_per_second": 7.35, - "eval_steps_per_second": 7.35, + "eval_loss": 0.05874261260032654, + "eval_runtime": 159.7069, + "eval_samples_per_second": 7.345, + "eval_steps_per_second": 7.345, "step": 2900 }, { "epoch": 4.412851862382713, - "grad_norm": 1.3478113412857056, - "learning_rate": 6.847466425091792e-05, - "loss": 0.0177, + "grad_norm": 2.4315688610076904, + "learning_rate": 4.10280589506179e-06, + "loss": 0.0069, "step": 2910 }, { "epoch": 4.4280163017723435, - "grad_norm": 0.16440452635288239, - "learning_rate": 6.822830331002812e-05, - "loss": 0.0115, + "grad_norm": 0.21709750592708588, + "learning_rate": 3.895217185991312e-06, + "loss": 0.0039, "step": 2920 }, { "epoch": 4.443180741161975, - "grad_norm": 0.1425771266222, - "learning_rate": 6.798143093568958e-05, - "loss": 0.0079, + "grad_norm": 0.3738054037094116, + "learning_rate": 3.692804453082038e-06, + "loss": 0.0046, "step": 2930 }, { "epoch": 4.458345180551606, - "grad_norm": 0.090119868516922, - "learning_rate": 6.773405405442683e-05, - "loss": 0.0086, + "grad_norm": 0.07227662950754166, + "learning_rate": 3.495590420299194e-06, + "loss": 0.0051, "step": 2940 }, { "epoch": 4.473509619941238, - "grad_norm": 0.28995010256767273, - "learning_rate": 6.748617960691937e-05, - "loss": 0.0087, + "grad_norm": 0.25434616208076477, + "learning_rate": 3.3035972279733207e-06, + "loss": 0.0053, "step": 2950 }, { "epoch": 4.473509619941238, - "eval_loss": 0.05463121458888054, - "eval_runtime": 159.5729, - "eval_samples_per_second": 7.351, - "eval_steps_per_second": 7.351, + "eval_loss": 0.05949469655752182, + "eval_runtime": 159.7116, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, "step": 2950 }, { "epoch": 4.488674059330869, - "grad_norm": 0.28919604420661926, - "learning_rate": 6.723781454780702e-05, - "loss": 0.0094, + "grad_norm": 0.5970948934555054, + "learning_rate": 3.116846430314796e-06, + "loss": 0.0053, "step": 2960 }, { "epoch": 4.5038384987205005, - "grad_norm": 0.02757749892771244, - "learning_rate": 6.69889658454947e-05, - "loss": 0.0078, + "grad_norm": 0.07033645361661911, + "learning_rate": 2.9353589929939087e-06, + "loss": 0.0025, "step": 2970 }, { "epoch": 4.519002938110132, - "grad_norm": 0.12801842391490936, - "learning_rate": 6.6739640481957e-05, - "loss": 0.0107, + "grad_norm": 0.0930805653333664, + "learning_rate": 2.7591552907872574e-06, + "loss": 0.0043, "step": 2980 }, { "epoch": 4.534167377499763, - "grad_norm": 0.0783754289150238, - "learning_rate": 6.648984545254217e-05, - "loss": 0.0092, + "grad_norm": 0.10770905762910843, + "learning_rate": 2.5882551052902883e-06, + "loss": 0.0049, "step": 2990 }, { "epoch": 4.549331816889394, - "grad_norm": 0.7740666270256042, - "learning_rate": 6.623958776577599e-05, - "loss": 0.012, + "grad_norm": 0.7434157133102417, + "learning_rate": 2.4226776226965453e-06, + "loss": 0.0072, "step": 3000 }, { "epoch": 4.549331816889394, - "eval_loss": 0.05945741385221481, - "eval_runtime": 159.625, + "eval_loss": 0.05965949594974518, + "eval_runtime": 159.644, "eval_samples_per_second": 7.348, "eval_steps_per_second": 7.348, "step": 3000 }, { "epoch": 4.564496256279026, - "grad_norm": 0.7128525972366333, - "learning_rate": 6.598887444316506e-05, - "loss": 0.014, + "grad_norm": 0.01642761193215847, + "learning_rate": 2.262441431643697e-06, + "loss": 0.0062, "step": 3010 }, { "epoch": 4.579660695668657, - "grad_norm": 0.0910588726401329, - "learning_rate": 6.573771251899981e-05, - "loss": 0.0086, + "grad_norm": 0.06564271450042725, + "learning_rate": 2.1075645211266927e-06, + "loss": 0.0017, "step": 3020 }, { "epoch": 4.594825135058288, - "grad_norm": 0.31712573766708374, - "learning_rate": 6.548610904015709e-05, - "loss": 0.0095, + "grad_norm": 0.38251250982284546, + "learning_rate": 1.9580642784782053e-06, + "loss": 0.0046, "step": 3030 }, { "epoch": 4.60998957444792, - "grad_norm": 0.08617256581783295, - "learning_rate": 6.523407106590252e-05, - "loss": 0.005, + "grad_norm": 0.10563746094703674, + "learning_rate": 1.813957487416651e-06, + "loss": 0.0023, "step": 3040 }, { "epoch": 4.625154013837551, - "grad_norm": 0.11196815967559814, - "learning_rate": 6.49816056676924e-05, - "loss": 0.0111, + "grad_norm": 0.1952509880065918, + "learning_rate": 1.6752603261619315e-06, + "loss": 0.0047, "step": 3050 }, { "epoch": 4.625154013837551, - "eval_loss": 0.056488387286663055, - "eval_runtime": 159.6271, + "eval_loss": 0.05990082770586014, + "eval_runtime": 159.6308, "eval_samples_per_second": 7.348, "eval_steps_per_second": 7.348, "step": 3050 }, { "epoch": 4.640318453227183, - "grad_norm": 0.35999494791030884, - "learning_rate": 6.472871992897536e-05, - "loss": 0.0111, + "grad_norm": 0.2965155243873596, + "learning_rate": 1.541988365619207e-06, + "loss": 0.0044, "step": 3060 }, { "epoch": 4.6554828926168135, - "grad_norm": 0.18991102278232574, - "learning_rate": 6.447542094499347e-05, - "loss": 0.0099, + "grad_norm": 0.28507962822914124, + "learning_rate": 1.4141565676307865e-06, + "loss": 0.0052, "step": 3070 }, { "epoch": 4.670647332006445, - "grad_norm": 0.21012216806411743, - "learning_rate": 6.422171582258334e-05, - "loss": 0.0072, + "grad_norm": 0.09388308972120285, + "learning_rate": 1.291779283296468e-06, + "loss": 0.0029, "step": 3080 }, { "epoch": 4.685811771396076, - "grad_norm": 0.15050624310970306, - "learning_rate": 6.396761167997664e-05, - "loss": 0.0149, + "grad_norm": 0.3553768992424011, + "learning_rate": 1.1748702513623922e-06, + "loss": 0.0064, "step": 3090 }, { "epoch": 4.700976210785708, - "grad_norm": 0.3615538477897644, - "learning_rate": 6.371311564660039e-05, - "loss": 0.012, + "grad_norm": 0.01811501383781433, + "learning_rate": 1.0634425966786155e-06, + "loss": 0.003, "step": 3100 }, { "epoch": 4.700976210785708, - "eval_loss": 0.049746330827474594, - "eval_runtime": 159.6194, - "eval_samples_per_second": 7.349, - "eval_steps_per_second": 7.349, + "eval_loss": 0.06024787947535515, + "eval_runtime": 159.5976, + "eval_samples_per_second": 7.35, + "eval_steps_per_second": 7.35, "step": 3100 }, { "epoch": 4.716140650175339, - "grad_norm": 0.1543639749288559, - "learning_rate": 6.345823486287687e-05, - "loss": 0.0084, + "grad_norm": 0.17447508871555328, + "learning_rate": 9.575088287257118e-07, + "loss": 0.005, "step": 3110 }, { "epoch": 4.73130508956497, - "grad_norm": 0.8648191690444946, - "learning_rate": 6.320297648002346e-05, - "loss": 0.0159, + "grad_norm": 2.3331735134124756, + "learning_rate": 8.570808402103392e-07, + "loss": 0.0059, "step": 3120 }, { "epoch": 4.746469528954601, - "grad_norm": 0.34107914566993713, - "learning_rate": 6.294734765985176e-05, - "loss": 0.0086, + "grad_norm": 0.20535585284233093, + "learning_rate": 7.62169905730109e-07, + "loss": 0.0048, "step": 3130 }, { "epoch": 4.761633968344233, - "grad_norm": 0.38437411189079285, - "learning_rate": 6.269135557456686e-05, - "loss": 0.0106, + "grad_norm": 0.5628078579902649, + "learning_rate": 6.727866805078531e-07, + "loss": 0.0029, "step": 3140 }, { "epoch": 4.776798407733864, - "grad_norm": 0.23792046308517456, - "learning_rate": 6.243500740656594e-05, - "loss": 0.0088, + "grad_norm": 0.09494868665933609, + "learning_rate": 5.889411991953975e-07, + "loss": 0.0025, "step": 3150 }, { "epoch": 4.776798407733864, - "eval_loss": 0.05470900237560272, - "eval_runtime": 159.6235, - "eval_samples_per_second": 7.349, - "eval_steps_per_second": 7.349, + "eval_loss": 0.06030402332544327, + "eval_runtime": 159.5412, + "eval_samples_per_second": 7.352, + "eval_steps_per_second": 7.352, "step": 3150 }, { "epoch": 4.791962847123496, - "grad_norm": 0.4615446925163269, - "learning_rate": 6.217831034823694e-05, - "loss": 0.0054, + "grad_norm": 0.21833842992782593, + "learning_rate": 5.106428747470137e-07, + "loss": 0.0033, "step": 3160 }, { "epoch": 4.8071272865131265, - "grad_norm": 0.11633367091417313, - "learning_rate": 6.19212716017565e-05, - "loss": 0.0135, + "grad_norm": 0.24270612001419067, + "learning_rate": 4.37900497362681e-07, + "loss": 0.0061, "step": 3170 }, { "epoch": 4.822291725902758, - "grad_norm": 0.16857339441776276, - "learning_rate": 6.166389837888819e-05, - "loss": 0.0083, + "grad_norm": 0.10475039482116699, + "learning_rate": 3.7072223350124167e-07, + "loss": 0.0028, "step": 3180 }, { "epoch": 4.837456165292389, - "grad_norm": 0.13763786852359772, - "learning_rate": 6.140619790077991e-05, - "loss": 0.0128, + "grad_norm": 0.19077710807323456, + "learning_rate": 3.0911562496358517e-07, + "loss": 0.0062, "step": 3190 }, { "epoch": 4.852620604682021, - "grad_norm": 0.261172890663147, - "learning_rate": 6.114817739776147e-05, - "loss": 0.0082, + "grad_norm": 0.36398255825042725, + "learning_rate": 2.53087588045986e-07, + "loss": 0.0027, "step": 3200 }, { "epoch": 4.852620604682021, - "eval_loss": 0.050943680107593536, - "eval_runtime": 159.6883, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, + "eval_loss": 0.06035413593053818, + "eval_runtime": 159.4969, + "eval_samples_per_second": 7.354, + "eval_steps_per_second": 7.354, "step": 3200 }, { "epoch": 4.867785044071652, - "grad_norm": 0.6665744185447693, - "learning_rate": 6.0889844109141626e-05, - "loss": 0.0078, + "grad_norm": 0.1687215268611908, + "learning_rate": 2.026444127636029e-07, + "loss": 0.0034, "step": 3210 }, { "epoch": 4.882949483461283, - "grad_norm": 0.16863170266151428, - "learning_rate": 6.0631205283004967e-05, - "loss": 0.0074, + "grad_norm": 0.03258746862411499, + "learning_rate": 1.577917621443825e-07, + "loss": 0.0051, "step": 3220 }, { "epoch": 4.898113922850914, - "grad_norm": 0.3022199869155884, - "learning_rate": 6.0372268176008605e-05, - "loss": 0.0127, + "grad_norm": 0.1656346619129181, + "learning_rate": 1.185346715932345e-07, + "loss": 0.0083, "step": 3230 }, { "epoch": 4.913278362240546, - "grad_norm": 0.2758694291114807, - "learning_rate": 6.011304005317856e-05, - "loss": 0.0097, + "grad_norm": 0.45706111192703247, + "learning_rate": 8.48775483267783e-08, + "loss": 0.0055, "step": 3240 }, { "epoch": 4.928442801630177, - "grad_norm": 0.429555207490921, - "learning_rate": 5.985352818770591e-05, - "loss": 0.01, + "grad_norm": 0.07887681573629379, + "learning_rate": 5.6824170878544414e-08, + "loss": 0.0022, "step": 3250 }, { "epoch": 4.928442801630177, - "eval_loss": 0.05470656231045723, - "eval_runtime": 159.6932, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, + "eval_loss": 0.060390591621398926, + "eval_runtime": 159.4858, + "eval_samples_per_second": 7.355, + "eval_steps_per_second": 7.355, "step": 3250 }, { "epoch": 4.943607241019809, - "grad_norm": 0.43170544505119324, - "learning_rate": 5.95937398607427e-05, - "loss": 0.0093, + "grad_norm": 0.4127300977706909, + "learning_rate": 3.4377688674774866e-08, + "loss": 0.0053, "step": 3260 }, { "epoch": 4.9587716804094395, - "grad_norm": 0.020290417596697807, - "learning_rate": 5.933368236119773e-05, - "loss": 0.0081, + "grad_norm": 0.03591546416282654, + "learning_rate": 1.7540621680850466e-08, + "loss": 0.0029, "step": 3270 }, { "epoch": 4.973936119799071, - "grad_norm": 0.07297362387180328, - "learning_rate": 5.907336298553197e-05, - "loss": 0.0089, + "grad_norm": 0.06558314710855484, + "learning_rate": 6.314860118400434e-09, + "loss": 0.0064, "step": 3280 }, { "epoch": 4.989100559188703, - "grad_norm": 0.1088402196764946, - "learning_rate": 5.8812789037553894e-05, - "loss": 0.0103, + "grad_norm": 0.06449679285287857, + "learning_rate": 7.016642530777162e-10, + "loss": 0.0032, "step": 3290 }, { - "epoch": 5.004264998578334, - "grad_norm": 0.14226531982421875, - "learning_rate": 5.855196782821452e-05, - "loss": 0.007, - "step": 3300 - }, - { - "epoch": 5.004264998578334, - "eval_loss": 0.05577705800533295, - "eval_runtime": 159.6667, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 3300 - }, - { - "epoch": 5.019429437967965, - "grad_norm": 0.6658801436424255, - "learning_rate": 5.829090667540231e-05, - "loss": 0.0062, - "step": 3310 - }, - { - "epoch": 5.0345938773575964, - "grad_norm": 0.1950637698173523, - "learning_rate": 5.802961290373788e-05, - "loss": 0.007, - "step": 3320 - }, - { - "epoch": 5.049758316747228, - "grad_norm": 0.31470271944999695, - "learning_rate": 5.776809384436843e-05, - "loss": 0.004, - "step": 3330 - }, - { - "epoch": 5.064922756136859, - "grad_norm": 0.012491337023675442, - "learning_rate": 5.7506356834762064e-05, - "loss": 0.0082, - "step": 3340 - }, - { - "epoch": 5.080087195526491, - "grad_norm": 0.37925124168395996, - "learning_rate": 5.724440921850196e-05, - "loss": 0.0022, - "step": 3350 - }, - { - "epoch": 5.080087195526491, - "eval_loss": 0.06049768999218941, - "eval_runtime": 159.5785, - "eval_samples_per_second": 7.351, - "eval_steps_per_second": 7.351, - "step": 3350 - }, - { - "epoch": 5.095251634916122, - "grad_norm": 0.08176101744174957, - "learning_rate": 5.6982258345080284e-05, - "loss": 0.0041, - "step": 3360 - }, - { - "epoch": 5.110416074305753, - "grad_norm": 0.17478878796100616, - "learning_rate": 5.671991156969209e-05, - "loss": 0.0094, - "step": 3370 - }, - { - "epoch": 5.125580513695384, - "grad_norm": 0.21786946058273315, - "learning_rate": 5.645737625302875e-05, - "loss": 0.0027, - "step": 3380 - }, - { - "epoch": 5.140744953085016, - "grad_norm": 0.10232345759868622, - "learning_rate": 5.619465976107168e-05, - "loss": 0.0058, - "step": 3390 - }, - { - "epoch": 5.155909392474647, - "grad_norm": 0.013475810177624226, - "learning_rate": 5.5931769464885455e-05, - "loss": 0.0025, - "step": 3400 - }, - { - "epoch": 5.155909392474647, - "eval_loss": 0.06552357226610184, - "eval_runtime": 159.6541, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 3400 - }, - { - "epoch": 5.171073831864279, - "grad_norm": 0.03600894659757614, - "learning_rate": 5.566871274041114e-05, - "loss": 0.0071, - "step": 3410 - }, - { - "epoch": 5.1862382712539095, - "grad_norm": 0.12144148349761963, - "learning_rate": 5.540549696825932e-05, - "loss": 0.0109, - "step": 3420 - }, - { - "epoch": 5.201402710643541, - "grad_norm": 0.056402988731861115, - "learning_rate": 5.514212953350293e-05, - "loss": 0.0054, - "step": 3430 - }, - { - "epoch": 5.216567150033172, - "grad_norm": 0.09186814725399017, - "learning_rate": 5.487861782547018e-05, - "loss": 0.0056, - "step": 3440 - }, - { - "epoch": 5.231731589422804, - "grad_norm": 0.018445612862706184, - "learning_rate": 5.4614969237537115e-05, - "loss": 0.0079, - "step": 3450 - }, - { - "epoch": 5.231731589422804, - "eval_loss": 0.05608835071325302, - "eval_runtime": 159.719, - "eval_samples_per_second": 7.344, - "eval_steps_per_second": 7.344, - "step": 3450 - }, - { - "epoch": 5.246896028812435, - "grad_norm": 0.023402627557516098, - "learning_rate": 5.435119116692032e-05, - "loss": 0.0052, - "step": 3460 - }, - { - "epoch": 5.262060468202066, - "grad_norm": 0.0944986343383789, - "learning_rate": 5.40872910144692e-05, - "loss": 0.0054, - "step": 3470 - }, - { - "epoch": 5.277224907591697, - "grad_norm": 0.11471125483512878, - "learning_rate": 5.382327618445847e-05, - "loss": 0.0054, - "step": 3480 - }, - { - "epoch": 5.292389346981329, - "grad_norm": 0.1250847578048706, - "learning_rate": 5.355915408438034e-05, - "loss": 0.0048, - "step": 3490 - }, - { - "epoch": 5.30755378637096, - "grad_norm": 0.013143649324774742, - "learning_rate": 5.329493212473673e-05, - "loss": 0.0072, - "step": 3500 - }, - { - "epoch": 5.30755378637096, - "eval_loss": 0.06184273958206177, - "eval_runtime": 159.6342, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 3500 - }, - { - "epoch": 5.322718225760592, - "grad_norm": 0.030204279348254204, - "learning_rate": 5.303061771883133e-05, - "loss": 0.004, - "step": 3510 - }, - { - "epoch": 5.3378826651502225, - "grad_norm": 0.4626109302043915, - "learning_rate": 5.27662182825616e-05, - "loss": 0.0048, - "step": 3520 - }, - { - "epoch": 5.353047104539854, - "grad_norm": 0.07302988320589066, - "learning_rate": 5.250174123421068e-05, - "loss": 0.0063, - "step": 3530 - }, - { - "epoch": 5.368211543929485, - "grad_norm": 0.01752207987010479, - "learning_rate": 5.223719399423931e-05, - "loss": 0.0031, - "step": 3540 - }, - { - "epoch": 5.383375983319117, - "grad_norm": 0.9372935891151428, - "learning_rate": 5.197258398507762e-05, - "loss": 0.0085, - "step": 3550 - }, - { - "epoch": 5.383375983319117, - "eval_loss": 0.06620989739894867, - "eval_runtime": 159.6486, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 3550 - }, - { - "epoch": 5.398540422708748, - "grad_norm": 0.10001783072948456, - "learning_rate": 5.170791863091682e-05, - "loss": 0.0049, - "step": 3560 - }, - { - "epoch": 5.413704862098379, - "grad_norm": 0.2315005660057068, - "learning_rate": 5.144320535750096e-05, - "loss": 0.0063, - "step": 3570 - }, - { - "epoch": 5.42886930148801, - "grad_norm": 0.02981536276638508, - "learning_rate": 5.1178451591918574e-05, - "loss": 0.0021, - "step": 3580 - }, - { - "epoch": 5.444033740877642, - "grad_norm": 0.03964484855532646, - "learning_rate": 5.091366476239425e-05, - "loss": 0.0069, - "step": 3590 - }, - { - "epoch": 5.459198180267273, - "grad_norm": 0.05413379520177841, - "learning_rate": 5.064885229808034e-05, - "loss": 0.0043, - "step": 3600 - }, - { - "epoch": 5.459198180267273, - "eval_loss": 0.0699431374669075, - "eval_runtime": 159.6742, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 3600 - }, - { - "epoch": 5.474362619656905, - "grad_norm": 0.8799638748168945, - "learning_rate": 5.0384021628848334e-05, - "loss": 0.0045, - "step": 3610 - }, - { - "epoch": 5.4895270590465355, - "grad_norm": 0.638287365436554, - "learning_rate": 5.011918018508057e-05, - "loss": 0.0078, - "step": 3620 - }, - { - "epoch": 5.504691498436167, - "grad_norm": 0.27183452248573303, - "learning_rate": 4.985433539746164e-05, - "loss": 0.0068, - "step": 3630 - }, - { - "epoch": 5.519855937825799, - "grad_norm": 0.42514100670814514, - "learning_rate": 4.958949469677001e-05, - "loss": 0.0057, - "step": 3640 - }, - { - "epoch": 5.53502037721543, - "grad_norm": 0.5132871866226196, - "learning_rate": 4.9324665513669385e-05, - "loss": 0.0091, - "step": 3650 - }, - { - "epoch": 5.53502037721543, - "eval_loss": 0.06599996238946915, - "eval_runtime": 159.6046, - "eval_samples_per_second": 7.349, - "eval_steps_per_second": 7.349, - "step": 3650 - }, - { - "epoch": 5.550184816605061, - "grad_norm": 0.13555912673473358, - "learning_rate": 4.9059855278500446e-05, - "loss": 0.0052, - "step": 3660 - }, - { - "epoch": 5.565349255994692, - "grad_norm": 0.06053599342703819, - "learning_rate": 4.879507142107215e-05, - "loss": 0.0081, - "step": 3670 - }, - { - "epoch": 5.580513695384324, - "grad_norm": 0.2532534897327423, - "learning_rate": 4.853032137045343e-05, - "loss": 0.0089, - "step": 3680 - }, - { - "epoch": 5.595678134773955, - "grad_norm": 0.44041526317596436, - "learning_rate": 4.826561255476463e-05, - "loss": 0.0073, - "step": 3690 - }, - { - "epoch": 5.610842574163586, - "grad_norm": 0.045529644936323166, - "learning_rate": 4.800095240096925e-05, - "loss": 0.0041, - "step": 3700 - }, - { - "epoch": 5.610842574163586, - "eval_loss": 0.057340603321790695, - "eval_runtime": 159.6358, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 3700 - }, - { - "epoch": 5.626007013553218, - "grad_norm": 0.3167114555835724, - "learning_rate": 4.773634833466542e-05, - "loss": 0.0081, - "step": 3710 - }, - { - "epoch": 5.641171452942849, - "grad_norm": 0.18999992311000824, - "learning_rate": 4.747180777987761e-05, - "loss": 0.0035, - "step": 3720 - }, - { - "epoch": 5.65633589233248, - "grad_norm": 0.25365421175956726, - "learning_rate": 4.720733815884836e-05, - "loss": 0.0045, - "step": 3730 - }, - { - "epoch": 5.671500331722112, - "grad_norm": 0.22395853698253632, - "learning_rate": 4.694294689183005e-05, - "loss": 0.0059, - "step": 3740 - }, - { - "epoch": 5.686664771111743, - "grad_norm": 0.11444402486085892, - "learning_rate": 4.667864139687661e-05, - "loss": 0.0079, - "step": 3750 - }, - { - "epoch": 5.686664771111743, - "eval_loss": 0.06023582071065903, - "eval_runtime": 159.6684, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 3750 - }, - { - "epoch": 5.701829210501375, - "grad_norm": 0.03902348875999451, - "learning_rate": 4.641442908963549e-05, - "loss": 0.0054, - "step": 3760 - }, - { - "epoch": 5.7169936498910054, - "grad_norm": 0.35566335916519165, - "learning_rate": 4.615031738313954e-05, - "loss": 0.0032, - "step": 3770 - }, - { - "epoch": 5.732158089280637, - "grad_norm": 0.20712420344352722, - "learning_rate": 4.588631368759908e-05, - "loss": 0.0055, - "step": 3780 - }, - { - "epoch": 5.747322528670268, - "grad_norm": 0.1179213747382164, - "learning_rate": 4.562242541019392e-05, - "loss": 0.0085, - "step": 3790 - }, - { - "epoch": 5.7624869680599, - "grad_norm": 0.46198901534080505, - "learning_rate": 4.535865995486559e-05, - "loss": 0.009, - "step": 3800 - }, - { - "epoch": 5.7624869680599, - "eval_loss": 0.06141876056790352, - "eval_runtime": 159.6458, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 3800 - }, - { - "epoch": 5.777651407449531, - "grad_norm": 0.5082039833068848, - "learning_rate": 4.509502472210956e-05, - "loss": 0.0048, - "step": 3810 - }, - { - "epoch": 5.792815846839162, - "grad_norm": 0.27177000045776367, - "learning_rate": 4.483152710876768e-05, - "loss": 0.0098, - "step": 3820 - }, - { - "epoch": 5.807980286228793, - "grad_norm": 0.1903979778289795, - "learning_rate": 4.456817450782058e-05, - "loss": 0.0048, - "step": 3830 - }, - { - "epoch": 5.823144725618425, - "grad_norm": 0.021568287163972855, - "learning_rate": 4.4304974308180225e-05, - "loss": 0.0069, - "step": 3840 - }, - { - "epoch": 5.838309165008056, - "grad_norm": 0.12434730678796768, - "learning_rate": 4.4041933894482675e-05, - "loss": 0.0021, - "step": 3850 - }, - { - "epoch": 5.838309165008056, - "eval_loss": 0.0678209662437439, - "eval_runtime": 159.6081, - "eval_samples_per_second": 7.349, - "eval_steps_per_second": 7.349, - "step": 3850 - }, - { - "epoch": 5.853473604397688, - "grad_norm": 0.5138853192329407, - "learning_rate": 4.3779060646880844e-05, - "loss": 0.0066, - "step": 3860 - }, - { - "epoch": 5.8686380437873185, - "grad_norm": 0.1913759857416153, - "learning_rate": 4.3516361940837485e-05, - "loss": 0.0034, - "step": 3870 - }, - { - "epoch": 5.88380248317695, - "grad_norm": 0.18525590002536774, - "learning_rate": 4.325384514691818e-05, - "loss": 0.0034, - "step": 3880 - }, - { - "epoch": 5.898966922566581, - "grad_norm": 0.18354234099388123, - "learning_rate": 4.299151763058458e-05, - "loss": 0.0046, - "step": 3890 - }, - { - "epoch": 5.914131361956213, - "grad_norm": 0.10036251693964005, - "learning_rate": 4.272938675198778e-05, - "loss": 0.0025, - "step": 3900 - }, - { - "epoch": 5.914131361956213, - "eval_loss": 0.07436000555753708, - "eval_runtime": 159.6604, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 3900 - }, - { - "epoch": 5.929295801345844, - "grad_norm": 0.03125413879752159, - "learning_rate": 4.2467459865761775e-05, - "loss": 0.0066, - "step": 3910 - }, - { - "epoch": 5.944460240735475, - "grad_norm": 0.010172716341912746, - "learning_rate": 4.220574432081714e-05, - "loss": 0.0044, - "step": 3920 - }, - { - "epoch": 5.959624680125106, - "grad_norm": 0.4898487627506256, - "learning_rate": 4.194424746013481e-05, - "loss": 0.0055, - "step": 3930 - }, - { - "epoch": 5.974789119514738, - "grad_norm": 0.1646544486284256, - "learning_rate": 4.168297662056005e-05, - "loss": 0.0053, - "step": 3940 - }, - { - "epoch": 5.989953558904369, - "grad_norm": 0.2785071134567261, - "learning_rate": 4.1421939132596676e-05, - "loss": 0.0039, - "step": 3950 - }, - { - "epoch": 5.989953558904369, - "eval_loss": 0.06879924982786179, - "eval_runtime": 159.6514, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 3950 - }, - { - "epoch": 6.005117998294001, - "grad_norm": 0.22272750735282898, - "learning_rate": 4.116114232020132e-05, - "loss": 0.0027, - "step": 3960 - }, - { - "epoch": 6.0202824376836315, - "grad_norm": 0.020733533427119255, - "learning_rate": 4.0900593500577925e-05, - "loss": 0.0033, - "step": 3970 - }, - { - "epoch": 6.035446877073263, - "grad_norm": 0.061399463564157486, - "learning_rate": 4.0640299983972474e-05, - "loss": 0.0014, - "step": 3980 - }, - { - "epoch": 6.050611316462894, - "grad_norm": 0.30906936526298523, - "learning_rate": 4.0380269073467944e-05, - "loss": 0.0042, - "step": 3990 - }, - { - "epoch": 6.065775755852526, - "grad_norm": 0.020939858630299568, - "learning_rate": 4.012050806477928e-05, - "loss": 0.0041, - "step": 4000 - }, - { - "epoch": 6.065775755852526, - "eval_loss": 0.06866902858018875, - "eval_runtime": 159.6741, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 4000 - }, - { - "epoch": 6.080940195242158, - "grad_norm": 0.10460769385099411, - "learning_rate": 3.986102424604881e-05, - "loss": 0.0016, - "step": 4010 - }, - { - "epoch": 6.096104634631788, - "grad_norm": 0.26983514428138733, - "learning_rate": 3.960182489764165e-05, - "loss": 0.0034, - "step": 4020 - }, - { - "epoch": 6.11126907402142, - "grad_norm": 0.005320222582668066, - "learning_rate": 3.934291729194159e-05, - "loss": 0.002, - "step": 4030 - }, - { - "epoch": 6.126433513411051, - "grad_norm": 0.02497193217277527, - "learning_rate": 3.9084308693146884e-05, - "loss": 0.0013, - "step": 4040 - }, - { - "epoch": 6.141597952800683, - "grad_norm": 0.05293813347816467, - "learning_rate": 3.8826006357066583e-05, - "loss": 0.0022, - "step": 4050 - }, - { - "epoch": 6.141597952800683, - "eval_loss": 0.0742153748869896, - "eval_runtime": 159.6497, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 4050 - }, - { - "epoch": 6.156762392190314, - "grad_norm": 0.023233849555253983, - "learning_rate": 3.8568017530916816e-05, - "loss": 0.0013, - "step": 4060 - }, - { - "epoch": 6.171926831579945, - "grad_norm": 0.20648224651813507, - "learning_rate": 3.8310349453117624e-05, - "loss": 0.0108, - "step": 4070 - }, - { - "epoch": 6.187091270969576, - "grad_norm": 0.1453983187675476, - "learning_rate": 3.805300935308968e-05, - "loss": 0.0018, - "step": 4080 - }, - { - "epoch": 6.202255710359208, - "grad_norm": 0.37825945019721985, - "learning_rate": 3.779600445105165e-05, - "loss": 0.0069, - "step": 4090 - }, - { - "epoch": 6.217420149748839, - "grad_norm": 0.005586002487689257, - "learning_rate": 3.7539341957817424e-05, - "loss": 0.0012, - "step": 4100 - }, - { - "epoch": 6.217420149748839, - "eval_loss": 0.06995175033807755, - "eval_runtime": 159.6566, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 4100 - }, - { - "epoch": 6.232584589138471, - "grad_norm": 0.014387178234755993, - "learning_rate": 3.7283029074593924e-05, - "loss": 0.0032, - "step": 4110 - }, - { - "epoch": 6.247749028528101, - "grad_norm": 0.0813213586807251, - "learning_rate": 3.702707299277906e-05, - "loss": 0.0025, - "step": 4120 - }, - { - "epoch": 6.262913467917733, - "grad_norm": 0.16309767961502075, - "learning_rate": 3.677148089375988e-05, - "loss": 0.0015, - "step": 4130 - }, - { - "epoch": 6.278077907307364, - "grad_norm": 0.12200695276260376, - "learning_rate": 3.6516259948711164e-05, - "loss": 0.0049, - "step": 4140 - }, - { - "epoch": 6.293242346696996, - "grad_norm": 0.012286040931940079, - "learning_rate": 3.626141731839414e-05, - "loss": 0.002, - "step": 4150 - }, - { - "epoch": 6.293242346696996, - "eval_loss": 0.06934823095798492, - "eval_runtime": 159.6771, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 4150 - }, - { - "epoch": 6.308406786086627, - "grad_norm": 0.037122540175914764, - "learning_rate": 3.6006960152955685e-05, - "loss": 0.003, - "step": 4160 - }, - { - "epoch": 6.323571225476258, - "grad_norm": 0.03756190463900566, - "learning_rate": 3.5752895591727575e-05, - "loss": 0.0038, - "step": 4170 - }, - { - "epoch": 6.338735664865889, - "grad_norm": 0.01007194072008133, - "learning_rate": 3.5499230763026316e-05, - "loss": 0.0032, - "step": 4180 - }, - { - "epoch": 6.353900104255521, - "grad_norm": 0.4352642297744751, - "learning_rate": 3.5245972783953e-05, - "loss": 0.0055, - "step": 4190 - }, - { - "epoch": 6.369064543645152, - "grad_norm": 0.010554233565926552, - "learning_rate": 3.499312876019378e-05, - "loss": 0.0037, - "step": 4200 - }, - { - "epoch": 6.369064543645152, - "eval_loss": 0.0697985365986824, - "eval_runtime": 159.7181, - "eval_samples_per_second": 7.344, - "eval_steps_per_second": 7.344, - "step": 4200 - }, - { - "epoch": 6.384228983034784, - "grad_norm": 0.3295115828514099, - "learning_rate": 3.474070578582037e-05, - "loss": 0.0063, - "step": 4210 - }, - { - "epoch": 6.3993934224244144, - "grad_norm": 1.6858514547348022, - "learning_rate": 3.4488710943091045e-05, - "loss": 0.0014, - "step": 4220 - }, - { - "epoch": 6.414557861814046, - "grad_norm": 0.09532463550567627, - "learning_rate": 3.423715130225196e-05, - "loss": 0.0028, - "step": 4230 - }, - { - "epoch": 6.429722301203677, - "grad_norm": 0.0165849681943655, - "learning_rate": 3.398603392133881e-05, - "loss": 0.0045, - "step": 4240 - }, - { - "epoch": 6.444886740593309, - "grad_norm": 0.05536497384309769, - "learning_rate": 3.3735365845978696e-05, - "loss": 0.0019, - "step": 4250 - }, - { - "epoch": 6.444886740593309, - "eval_loss": 0.06866072118282318, - "eval_runtime": 159.6626, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 4250 - }, - { - "epoch": 6.46005117998294, - "grad_norm": 0.045207660645246506, - "learning_rate": 3.34851541091925e-05, - "loss": 0.0039, - "step": 4260 - }, - { - "epoch": 6.475215619372571, - "grad_norm": 0.7177790999412537, - "learning_rate": 3.32354057311976e-05, - "loss": 0.0056, - "step": 4270 - }, - { - "epoch": 6.490380058762202, - "grad_norm": 0.04334711655974388, - "learning_rate": 3.298612771921088e-05, - "loss": 0.0011, - "step": 4280 - }, - { - "epoch": 6.505544498151834, - "grad_norm": 0.9892463088035583, - "learning_rate": 3.2737327067252074e-05, - "loss": 0.0024, - "step": 4290 - }, - { - "epoch": 6.520708937541466, - "grad_norm": 0.011243737302720547, - "learning_rate": 3.24890107559476e-05, - "loss": 0.0007, - "step": 4300 - }, - { - "epoch": 6.520708937541466, - "eval_loss": 0.07813987135887146, - "eval_runtime": 159.6548, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 4300 - }, - { - "epoch": 6.535873376931097, - "grad_norm": 0.002380318706855178, - "learning_rate": 3.2241185752334634e-05, - "loss": 0.0009, - "step": 4310 - }, - { - "epoch": 6.5510378163207275, - "grad_norm": 0.020601673051714897, - "learning_rate": 3.1993859009665744e-05, - "loss": 0.002, - "step": 4320 - }, - { - "epoch": 6.566202255710359, - "grad_norm": 0.031208863481879234, - "learning_rate": 3.1747037467213725e-05, - "loss": 0.0031, - "step": 4330 - }, - { - "epoch": 6.581366695099991, - "grad_norm": 0.010496349073946476, - "learning_rate": 3.150072805007688e-05, - "loss": 0.0004, - "step": 4340 - }, - { - "epoch": 6.596531134489622, - "grad_norm": 0.21850430965423584, - "learning_rate": 3.125493766898477e-05, - "loss": 0.0035, - "step": 4350 - }, - { - "epoch": 6.596531134489622, - "eval_loss": 0.08454664051532745, - "eval_runtime": 159.7205, - "eval_samples_per_second": 7.344, - "eval_steps_per_second": 7.344, - "step": 4350 - }, - { - "epoch": 6.611695573879253, - "grad_norm": 0.5013878345489502, - "learning_rate": 3.1009673220104305e-05, - "loss": 0.005, - "step": 4360 - }, - { - "epoch": 6.626860013268884, - "grad_norm": 0.038185011595487595, - "learning_rate": 3.076494158484631e-05, - "loss": 0.0026, - "step": 4370 - }, - { - "epoch": 6.642024452658516, - "grad_norm": 0.27242669463157654, - "learning_rate": 3.0520749629672334e-05, - "loss": 0.0034, - "step": 4380 - }, - { - "epoch": 6.657188892048147, - "grad_norm": 0.049427762627601624, - "learning_rate": 3.027710420590208e-05, - "loss": 0.0052, - "step": 4390 - }, - { - "epoch": 6.672353331437779, - "grad_norm": 0.02348467707633972, - "learning_rate": 3.0034012149521173e-05, - "loss": 0.0028, - "step": 4400 - }, - { - "epoch": 6.672353331437779, - "eval_loss": 0.08062376081943512, - "eval_runtime": 159.7095, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 4400 - }, - { - "epoch": 6.68751777082741, - "grad_norm": 0.031063517555594444, - "learning_rate": 2.979148028098936e-05, - "loss": 0.0041, - "step": 4410 - }, - { - "epoch": 6.702682210217041, - "grad_norm": 0.018211238086223602, - "learning_rate": 2.9549515405049122e-05, - "loss": 0.0021, - "step": 4420 - }, - { - "epoch": 6.717846649606672, - "grad_norm": 0.09643986076116562, - "learning_rate": 2.9308124310534774e-05, - "loss": 0.0021, - "step": 4430 - }, - { - "epoch": 6.733011088996304, - "grad_norm": 0.13222341239452362, - "learning_rate": 2.9067313770181944e-05, - "loss": 0.0039, - "step": 4440 - }, - { - "epoch": 6.748175528385935, - "grad_norm": 0.17908984422683716, - "learning_rate": 2.88270905404377e-05, - "loss": 0.0029, - "step": 4450 - }, - { - "epoch": 6.748175528385935, - "eval_loss": 0.07696054875850677, - "eval_runtime": 159.6924, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 4450 - }, - { - "epoch": 6.763339967775567, - "grad_norm": 0.13498340547084808, - "learning_rate": 2.858746136127074e-05, - "loss": 0.0068, - "step": 4460 - }, - { - "epoch": 6.778504407165197, - "grad_norm": 0.07055284827947617, - "learning_rate": 2.834843295598251e-05, - "loss": 0.0016, - "step": 4470 - }, - { - "epoch": 6.793668846554829, - "grad_norm": 0.006287859752774239, - "learning_rate": 2.811001203101845e-05, - "loss": 0.0007, - "step": 4480 - }, - { - "epoch": 6.80883328594446, - "grad_norm": 0.009424620307981968, - "learning_rate": 2.787220527577986e-05, - "loss": 0.0037, - "step": 4490 - }, - { - "epoch": 6.823997725334092, - "grad_norm": 0.005454878322780132, - "learning_rate": 2.7635019362436244e-05, - "loss": 0.0036, - "step": 4500 - }, - { - "epoch": 6.823997725334092, - "eval_loss": 0.07472400367259979, - "eval_runtime": 159.7149, - "eval_samples_per_second": 7.344, - "eval_steps_per_second": 7.344, - "step": 4500 - }, - { - "epoch": 6.839162164723723, - "grad_norm": 0.008601648733019829, - "learning_rate": 2.7398460945737997e-05, - "loss": 0.0018, - "step": 4510 - }, - { - "epoch": 6.854326604113354, - "grad_norm": 0.29192012548446655, - "learning_rate": 2.7162536662829842e-05, - "loss": 0.0017, - "step": 4520 - }, - { - "epoch": 6.869491043502985, - "grad_norm": 0.004134100396186113, - "learning_rate": 2.6927253133064577e-05, - "loss": 0.0012, - "step": 4530 - }, - { - "epoch": 6.884655482892617, - "grad_norm": 0.42937859892845154, - "learning_rate": 2.6692616957817233e-05, - "loss": 0.0023, - "step": 4540 - }, - { - "epoch": 6.899819922282248, - "grad_norm": 1.2721667289733887, - "learning_rate": 2.645863472029999e-05, - "loss": 0.003, - "step": 4550 - }, - { - "epoch": 6.899819922282248, - "eval_loss": 0.07495378702878952, - "eval_runtime": 159.6995, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 4550 - }, - { - "epoch": 6.91498436167188, - "grad_norm": 0.3483010232448578, - "learning_rate": 2.6225312985377447e-05, - "loss": 0.0031, - "step": 4560 - }, - { - "epoch": 6.93014880106151, - "grad_norm": 0.03516402468085289, - "learning_rate": 2.5992658299382333e-05, - "loss": 0.0059, - "step": 4570 - }, - { - "epoch": 6.945313240451142, - "grad_norm": 0.033090583980083466, - "learning_rate": 2.5760677189932044e-05, - "loss": 0.0046, - "step": 4580 - }, - { - "epoch": 6.960477679840773, - "grad_norm": 0.6742090582847595, - "learning_rate": 2.5529376165745233e-05, - "loss": 0.0095, - "step": 4590 - }, - { - "epoch": 6.975642119230405, - "grad_norm": 0.011352883651852608, - "learning_rate": 2.5298761716459406e-05, - "loss": 0.001, - "step": 4600 - }, - { - "epoch": 6.975642119230405, - "eval_loss": 0.06608187407255173, - "eval_runtime": 159.6676, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 4600 - }, - { - "epoch": 6.990806558620036, - "grad_norm": 0.06303270161151886, - "learning_rate": 2.506884031244875e-05, - "loss": 0.0015, - "step": 4610 - }, - { - "epoch": 7.005970998009667, - "grad_norm": 0.10852441936731339, - "learning_rate": 2.4839618404642602e-05, - "loss": 0.0024, - "step": 4620 - }, - { - "epoch": 7.021135437399298, - "grad_norm": 0.035054467618465424, - "learning_rate": 2.4611102424344473e-05, - "loss": 0.0031, - "step": 4630 - }, - { - "epoch": 7.03629987678893, - "grad_norm": 0.48672059178352356, - "learning_rate": 2.4383298783051543e-05, - "loss": 0.0016, - "step": 4640 - }, - { - "epoch": 7.051464316178561, - "grad_norm": 0.012282459996640682, - "learning_rate": 2.4156213872274874e-05, - "loss": 0.0013, - "step": 4650 - }, - { - "epoch": 7.051464316178561, - "eval_loss": 0.07282822579145432, - "eval_runtime": 159.6855, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 4650 - }, - { - "epoch": 7.066628755568193, - "grad_norm": 0.06393498927354813, - "learning_rate": 2.3929854063360013e-05, - "loss": 0.0023, - "step": 4660 - }, - { - "epoch": 7.081793194957824, - "grad_norm": 0.012780111283063889, - "learning_rate": 2.3704225707308243e-05, - "loss": 0.0019, - "step": 4670 - }, - { - "epoch": 7.096957634347455, - "grad_norm": 0.02090180106461048, - "learning_rate": 2.3479335134598407e-05, - "loss": 0.0008, - "step": 4680 - }, - { - "epoch": 7.112122073737087, - "grad_norm": 0.008085845038294792, - "learning_rate": 2.325518865500923e-05, - "loss": 0.0018, - "step": 4690 - }, - { - "epoch": 7.127286513126718, - "grad_norm": 0.011268621310591698, - "learning_rate": 2.303179255744243e-05, - "loss": 0.0016, - "step": 4700 - }, - { - "epoch": 7.127286513126718, - "eval_loss": 0.07448284327983856, - "eval_runtime": 159.6851, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 4700 - }, - { - "epoch": 7.1424509525163495, - "grad_norm": 0.01610792987048626, - "learning_rate": 2.280915310974612e-05, - "loss": 0.0026, - "step": 4710 - }, - { - "epoch": 7.15761539190598, - "grad_norm": 0.022252675145864487, - "learning_rate": 2.2587276558538978e-05, - "loss": 0.0021, - "step": 4720 - }, - { - "epoch": 7.172779831295612, - "grad_norm": 0.0143266087397933, - "learning_rate": 2.236616912903507e-05, - "loss": 0.0009, - "step": 4730 - }, - { - "epoch": 7.187944270685243, - "grad_norm": 0.034717097878456116, - "learning_rate": 2.2145837024869116e-05, - "loss": 0.0007, - "step": 4740 - }, - { - "epoch": 7.203108710074875, - "grad_norm": 0.010767974890768528, - "learning_rate": 2.1926286427922436e-05, - "loss": 0.0007, - "step": 4750 - }, - { - "epoch": 7.203108710074875, - "eval_loss": 0.07798051834106445, - "eval_runtime": 159.6697, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 4750 - }, - { - "epoch": 7.218273149464506, - "grad_norm": 0.16035127639770508, - "learning_rate": 2.170752349814955e-05, - "loss": 0.0015, - "step": 4760 - }, - { - "epoch": 7.233437588854137, - "grad_norm": 0.1238207072019577, - "learning_rate": 2.148955437340526e-05, - "loss": 0.0028, - "step": 4770 - }, - { - "epoch": 7.248602028243768, - "grad_norm": 0.017730826511979103, - "learning_rate": 2.1272385169272547e-05, - "loss": 0.0021, - "step": 4780 - }, - { - "epoch": 7.2637664676334, - "grad_norm": 0.012331496924161911, - "learning_rate": 2.1056021978890916e-05, - "loss": 0.0018, - "step": 4790 - }, - { - "epoch": 7.278930907023031, - "grad_norm": 0.0936025083065033, - "learning_rate": 2.0840470872785462e-05, - "loss": 0.0014, - "step": 4800 - }, - { - "epoch": 7.278930907023031, - "eval_loss": 0.08163706958293915, - "eval_runtime": 159.6689, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 4800 - }, - { - "epoch": 7.2940953464126626, - "grad_norm": 0.009723914787173271, - "learning_rate": 2.0625737898696552e-05, - "loss": 0.0009, - "step": 4810 - }, - { - "epoch": 7.309259785802293, - "grad_norm": 0.00484412582591176, - "learning_rate": 2.0411829081410084e-05, - "loss": 0.0002, - "step": 4820 - }, - { - "epoch": 7.324424225191925, - "grad_norm": 0.04556836187839508, - "learning_rate": 2.019875042258859e-05, - "loss": 0.0018, - "step": 4830 - }, - { - "epoch": 7.339588664581556, - "grad_norm": 0.003573835827410221, - "learning_rate": 1.9986507900602714e-05, - "loss": 0.0001, - "step": 4840 - }, - { - "epoch": 7.354753103971188, - "grad_norm": 0.005267801694571972, - "learning_rate": 1.9775107470363473e-05, - "loss": 0.0005, - "step": 4850 - }, - { - "epoch": 7.354753103971188, - "eval_loss": 0.08569210022687912, - "eval_runtime": 159.6477, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 4850 - }, - { - "epoch": 7.369917543360819, - "grad_norm": 0.08103930950164795, - "learning_rate": 1.9564555063155298e-05, - "loss": 0.0019, - "step": 4860 - }, - { - "epoch": 7.38508198275045, - "grad_norm": 0.00946909561753273, - "learning_rate": 1.9354856586469512e-05, - "loss": 0.0014, - "step": 4870 - }, - { - "epoch": 7.400246422140081, - "grad_norm": 0.002541696187108755, - "learning_rate": 1.914601792383862e-05, - "loss": 0.0007, - "step": 4880 - }, - { - "epoch": 7.415410861529713, - "grad_norm": 0.031486205756664276, - "learning_rate": 1.8938044934671266e-05, - "loss": 0.0023, - "step": 4890 - }, - { - "epoch": 7.430575300919344, - "grad_norm": 0.3101092278957367, - "learning_rate": 1.8730943454087714e-05, - "loss": 0.0026, - "step": 4900 - }, - { - "epoch": 7.430575300919344, - "eval_loss": 0.09210843592882156, - "eval_runtime": 159.7081, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 4900 - }, - { - "epoch": 7.445739740308976, - "grad_norm": 0.018890244886279106, - "learning_rate": 1.852471929275635e-05, - "loss": 0.0011, - "step": 4910 - }, - { - "epoch": 7.460904179698606, - "grad_norm": 0.05486918240785599, - "learning_rate": 1.831937823673039e-05, - "loss": 0.0014, - "step": 4920 - }, - { - "epoch": 7.476068619088238, - "grad_norm": 0.01624329388141632, - "learning_rate": 1.8114926047285753e-05, - "loss": 0.0012, - "step": 4930 - }, - { - "epoch": 7.491233058477869, - "grad_norm": 0.11589296162128448, - "learning_rate": 1.7911368460759305e-05, - "loss": 0.0015, - "step": 4940 - }, - { - "epoch": 7.506397497867501, - "grad_norm": 0.03767627850174904, - "learning_rate": 1.770871118838794e-05, - "loss": 0.0005, - "step": 4950 - }, - { - "epoch": 7.506397497867501, - "eval_loss": 0.08957915753126144, - "eval_runtime": 159.695, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 4950 - }, - { - "epoch": 7.5215619372571325, - "grad_norm": 0.0034159275237470865, - "learning_rate": 1.7506959916148374e-05, - "loss": 0.0008, - "step": 4960 - }, - { - "epoch": 7.536726376646763, - "grad_norm": 0.0016032177954912186, - "learning_rate": 1.730612030459752e-05, - "loss": 0.0021, - "step": 4970 - }, - { - "epoch": 7.551890816036394, - "grad_norm": 0.003403955604881048, - "learning_rate": 1.7106197988713764e-05, - "loss": 0.0015, - "step": 4980 - }, - { - "epoch": 7.567055255426026, - "grad_norm": 0.0031641479581594467, - "learning_rate": 1.6907198577738877e-05, - "loss": 0.0005, - "step": 4990 - }, - { - "epoch": 7.582219694815658, - "grad_norm": 0.018611367791891098, - "learning_rate": 1.6709127655020495e-05, - "loss": 0.0014, - "step": 5000 - }, - { - "epoch": 7.582219694815658, - "eval_loss": 0.0884106233716011, - "eval_runtime": 159.6976, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 5000 - }, - { - "epoch": 7.597384134205289, - "grad_norm": 0.05140351504087448, - "learning_rate": 1.6511990777855617e-05, - "loss": 0.0007, - "step": 5010 - }, - { - "epoch": 7.612548573594919, - "grad_norm": 0.01023286022245884, - "learning_rate": 1.631579347733465e-05, - "loss": 0.0039, - "step": 5020 - }, - { - "epoch": 7.627713012984551, - "grad_norm": 0.5153130292892456, - "learning_rate": 1.61205412581861e-05, - "loss": 0.0033, - "step": 5030 - }, - { - "epoch": 7.642877452374183, - "grad_norm": 0.04323890432715416, - "learning_rate": 1.5926239598622355e-05, - "loss": 0.001, - "step": 5040 - }, - { - "epoch": 7.658041891763814, - "grad_norm": 0.006052754819393158, - "learning_rate": 1.5732893950185752e-05, - "loss": 0.0004, - "step": 5050 - }, - { - "epoch": 7.658041891763814, - "eval_loss": 0.08111725747585297, - "eval_runtime": 159.6756, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 5050 - }, - { - "epoch": 7.6732063311534455, - "grad_norm": 0.0027039169799536467, - "learning_rate": 1.5540509737595754e-05, - "loss": 0.0015, - "step": 5060 - }, - { - "epoch": 7.688370770543076, - "grad_norm": 0.013370513916015625, - "learning_rate": 1.5349092358596728e-05, - "loss": 0.0005, - "step": 5070 - }, - { - "epoch": 7.703535209932708, - "grad_norm": 0.3145116865634918, - "learning_rate": 1.515864718380648e-05, - "loss": 0.0016, - "step": 5080 - }, - { - "epoch": 7.718699649322339, - "grad_norm": 0.018190694972872734, - "learning_rate": 1.4969179556565583e-05, - "loss": 0.0015, - "step": 5090 - }, - { - "epoch": 7.733864088711971, - "grad_norm": 0.012721856124699116, - "learning_rate": 1.4780694792787425e-05, - "loss": 0.0003, - "step": 5100 - }, - { - "epoch": 7.733864088711971, - "eval_loss": 0.08228595554828644, - "eval_runtime": 159.6773, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 5100 - }, - { - "epoch": 7.749028528101602, - "grad_norm": 0.19555045664310455, - "learning_rate": 1.4593198180809109e-05, - "loss": 0.0013, - "step": 5110 - }, - { - "epoch": 7.764192967491233, - "grad_norm": 0.021047789603471756, - "learning_rate": 1.4406694981243101e-05, - "loss": 0.0005, - "step": 5120 - }, - { - "epoch": 7.779357406880864, - "grad_norm": 0.0017976905219256878, - "learning_rate": 1.4221190426829512e-05, - "loss": 0.0012, - "step": 5130 - }, - { - "epoch": 7.794521846270496, - "grad_norm": 0.0040603443048894405, - "learning_rate": 1.403668972228941e-05, - "loss": 0.0016, - "step": 5140 - }, - { - "epoch": 7.809686285660127, - "grad_norm": 0.009725396521389484, - "learning_rate": 1.3853198044178722e-05, - "loss": 0.0015, - "step": 5150 - }, - { - "epoch": 7.809686285660127, - "eval_loss": 0.08893095701932907, - "eval_runtime": 159.6878, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 5150 - }, - { - "epoch": 7.8248507250497585, - "grad_norm": 0.015102843753993511, - "learning_rate": 1.367072054074302e-05, - "loss": 0.0006, - "step": 5160 - }, - { - "epoch": 7.840015164439389, - "grad_norm": 0.1920541226863861, - "learning_rate": 1.3489262331773079e-05, - "loss": 0.0046, - "step": 5170 - }, - { - "epoch": 7.855179603829021, - "grad_norm": 0.18488584458827972, - "learning_rate": 1.330882850846118e-05, - "loss": 0.0031, - "step": 5180 - }, - { - "epoch": 7.870344043218652, - "grad_norm": 0.012407021597027779, - "learning_rate": 1.312942413325835e-05, - "loss": 0.0006, - "step": 5190 - }, - { - "epoch": 7.885508482608284, - "grad_norm": 0.07114287465810776, - "learning_rate": 1.2951054239732263e-05, - "loss": 0.0007, - "step": 5200 - }, - { - "epoch": 7.885508482608284, - "eval_loss": 0.0819629430770874, - "eval_runtime": 159.6361, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 5200 - }, - { - "epoch": 7.900672921997915, - "grad_norm": 0.003206809749826789, - "learning_rate": 1.2773723832426032e-05, - "loss": 0.0011, - "step": 5210 - }, - { - "epoch": 7.915837361387546, - "grad_norm": 0.022074706852436066, - "learning_rate": 1.2597437886717805e-05, - "loss": 0.0013, - "step": 5220 - }, - { - "epoch": 7.931001800777177, - "grad_norm": 0.01131486240774393, - "learning_rate": 1.2422201348681134e-05, - "loss": 0.0017, - "step": 5230 - }, - { - "epoch": 7.946166240166809, - "grad_norm": 0.013740439899265766, - "learning_rate": 1.2248019134946225e-05, - "loss": 0.0016, - "step": 5240 - }, - { - "epoch": 7.96133067955644, - "grad_norm": 0.026393504813313484, - "learning_rate": 1.2074896132562075e-05, - "loss": 0.0001, - "step": 5250 - }, - { - "epoch": 7.96133067955644, - "eval_loss": 0.08446918427944183, - "eval_runtime": 159.6859, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 5250 - }, - { - "epoch": 7.9764951189460715, - "grad_norm": 0.025374166667461395, - "learning_rate": 1.1902837198859157e-05, - "loss": 0.0011, - "step": 5260 - }, - { - "epoch": 7.991659558335702, - "grad_norm": 0.007933364249765873, - "learning_rate": 1.1731847161313325e-05, - "loss": 0.0032, - "step": 5270 - }, - { - "epoch": 8.006823997725334, - "grad_norm": 0.02547082118690014, - "learning_rate": 1.15619308174103e-05, - "loss": 0.0002, - "step": 5280 - }, - { - "epoch": 8.021988437114965, - "grad_norm": 0.05732371658086777, - "learning_rate": 1.1393092934511035e-05, - "loss": 0.001, - "step": 5290 - }, - { - "epoch": 8.037152876504596, - "grad_norm": 0.021463660523295403, - "learning_rate": 1.1225338249718032e-05, - "loss": 0.0002, - "step": 5300 - }, - { - "epoch": 8.037152876504596, - "eval_loss": 0.08296515047550201, - "eval_runtime": 159.707, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 5300 - }, - { - "epoch": 8.052317315894228, - "grad_norm": 0.00827645044773817, - "learning_rate": 1.1058671469742331e-05, - "loss": 0.0002, - "step": 5310 - }, - { - "epoch": 8.06748175528386, - "grad_norm": 0.0017541047418490052, - "learning_rate": 1.0893097270771552e-05, - "loss": 0.0001, - "step": 5320 - }, - { - "epoch": 8.08264619467349, - "grad_norm": 0.006852227263152599, - "learning_rate": 1.0728620298338649e-05, - "loss": 0.0008, - "step": 5330 - }, - { - "epoch": 8.097810634063123, - "grad_norm": 0.12831629812717438, - "learning_rate": 1.0565245167191556e-05, - "loss": 0.0007, - "step": 5340 - }, - { - "epoch": 8.112975073452754, - "grad_norm": 0.005427788943052292, - "learning_rate": 1.040297646116376e-05, - "loss": 0.001, - "step": 5350 - }, - { - "epoch": 8.112975073452754, - "eval_loss": 0.08476079255342484, - "eval_runtime": 159.6884, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 5350 - }, - { - "epoch": 8.128139512842385, - "grad_norm": 0.0025401024613529444, - "learning_rate": 1.0241818733045604e-05, - "loss": 0.0004, - "step": 5360 - }, - { - "epoch": 8.143303952232015, - "grad_norm": 0.0015906417975202203, - "learning_rate": 1.008177650445669e-05, - "loss": 0.0011, - "step": 5370 - }, - { - "epoch": 8.158468391621648, - "grad_norm": 0.014389735646545887, - "learning_rate": 9.922854265718845e-06, - "loss": 0.0005, - "step": 5380 - }, - { - "epoch": 8.173632831011279, - "grad_norm": 0.014736047014594078, - "learning_rate": 9.765056475730272e-06, - "loss": 0.0002, - "step": 5390 - }, - { - "epoch": 8.18879727040091, - "grad_norm": 0.003618720918893814, - "learning_rate": 9.608387561840381e-06, - "loss": 0.0002, - "step": 5400 - }, - { - "epoch": 8.18879727040091, - "eval_loss": 0.08592947572469711, - "eval_runtime": 159.6334, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 5400 - }, - { - "epoch": 8.20396170979054, - "grad_norm": 0.008707334287464619, - "learning_rate": 9.45285191972559e-06, - "loss": 0.0026, - "step": 5410 - }, - { - "epoch": 8.219126149180173, - "grad_norm": 0.0448060967028141, - "learning_rate": 9.298453913265981e-06, - "loss": 0.0006, - "step": 5420 - }, - { - "epoch": 8.234290588569804, - "grad_norm": 0.10096914321184158, - "learning_rate": 9.145197874422901e-06, - "loss": 0.0011, - "step": 5430 - }, - { - "epoch": 8.249455027959435, - "grad_norm": 0.012872147373855114, - "learning_rate": 8.993088103117314e-06, - "loss": 0.0006, - "step": 5440 - }, - { - "epoch": 8.264619467349066, - "grad_norm": 0.010536997579038143, - "learning_rate": 8.842128867109345e-06, - "loss": 0.0005, - "step": 5450 - }, - { - "epoch": 8.264619467349066, - "eval_loss": 0.08735238015651703, - "eval_runtime": 159.5975, - "eval_samples_per_second": 7.35, - "eval_steps_per_second": 7.35, - "step": 5450 - }, - { - "epoch": 8.279783906738698, - "grad_norm": 0.014972977340221405, - "learning_rate": 8.692324401878326e-06, - "loss": 0.0005, - "step": 5460 - }, - { - "epoch": 8.29494834612833, - "grad_norm": 0.0037371802609413862, - "learning_rate": 8.543678910504122e-06, - "loss": 0.0004, - "step": 5470 - }, - { - "epoch": 8.31011278551796, - "grad_norm": 0.003980707842856646, - "learning_rate": 8.39619656354913e-06, - "loss": 0.0005, - "step": 5480 - }, - { - "epoch": 8.325277224907591, - "grad_norm": 0.47391098737716675, - "learning_rate": 8.249881498941248e-06, - "loss": 0.0009, - "step": 5490 - }, - { - "epoch": 8.340441664297224, - "grad_norm": 0.001838018069975078, - "learning_rate": 8.104737821857883e-06, - "loss": 0.0006, - "step": 5500 - }, - { - "epoch": 8.340441664297224, - "eval_loss": 0.08864803612232208, - "eval_runtime": 159.6959, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 5500 - }, - { - "epoch": 8.355606103686855, - "grad_norm": 0.005992853082716465, - "learning_rate": 7.960769604610619e-06, - "loss": 0.0, - "step": 5510 - }, - { - "epoch": 8.370770543076485, - "grad_norm": 0.06756465882062912, - "learning_rate": 7.817980886531078e-06, - "loss": 0.0002, - "step": 5520 - }, - { - "epoch": 8.385934982466116, - "grad_norm": 0.010373502038419247, - "learning_rate": 7.67637567385755e-06, - "loss": 0.0002, - "step": 5530 - }, - { - "epoch": 8.401099421855749, - "grad_norm": 0.0020394432358443737, - "learning_rate": 7.535957939622573e-06, - "loss": 0.0004, - "step": 5540 - }, - { - "epoch": 8.41626386124538, - "grad_norm": 0.002091666217893362, - "learning_rate": 7.396731623541481e-06, - "loss": 0.0006, - "step": 5550 - }, - { - "epoch": 8.41626386124538, - "eval_loss": 0.09102967381477356, - "eval_runtime": 159.6638, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 5550 - }, - { - "epoch": 8.43142830063501, - "grad_norm": 0.036747947335243225, - "learning_rate": 7.2587006319018766e-06, - "loss": 0.0008, - "step": 5560 - }, - { - "epoch": 8.446592740024641, - "grad_norm": 0.005755693186074495, - "learning_rate": 7.1218688374539765e-06, - "loss": 0.0001, - "step": 5570 - }, - { - "epoch": 8.461757179414274, - "grad_norm": 0.15107466280460358, - "learning_rate": 6.986240079302053e-06, - "loss": 0.0008, - "step": 5580 - }, - { - "epoch": 8.476921618803905, - "grad_norm": 0.0031072061974555254, - "learning_rate": 6.851818162796603e-06, - "loss": 0.0005, - "step": 5590 - }, - { - "epoch": 8.492086058193536, - "grad_norm": 0.00114593340549618, - "learning_rate": 6.718606859427673e-06, - "loss": 0.0024, - "step": 5600 - }, - { - "epoch": 8.492086058193536, - "eval_loss": 0.09284108132123947, - "eval_runtime": 159.6916, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 5600 - }, - { - "epoch": 8.507250497583167, - "grad_norm": 0.014525875449180603, - "learning_rate": 6.586609906719005e-06, - "loss": 0.0007, - "step": 5610 - }, - { - "epoch": 8.5224149369728, - "grad_norm": 0.008361021056771278, - "learning_rate": 6.4558310081231585e-06, - "loss": 0.0011, - "step": 5620 - }, - { - "epoch": 8.53757937636243, - "grad_norm": 0.0019996934570372105, - "learning_rate": 6.326273832917651e-06, - "loss": 0.0002, - "step": 5630 - }, - { - "epoch": 8.552743815752061, - "grad_norm": 0.008239475078880787, - "learning_rate": 6.197942016101932e-06, - "loss": 0.0001, - "step": 5640 - }, - { - "epoch": 8.567908255141692, - "grad_norm": 0.0012480788864195347, - "learning_rate": 6.070839158295455e-06, - "loss": 0.0001, - "step": 5650 - }, - { - "epoch": 8.567908255141692, - "eval_loss": 0.09329073876142502, - "eval_runtime": 159.632, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 5650 - }, - { - "epoch": 8.583072694531324, - "grad_norm": 0.0237286239862442, - "learning_rate": 5.944968825636687e-06, - "loss": 0.0001, - "step": 5660 - }, - { - "epoch": 8.598237133920955, - "grad_norm": 0.003595671383664012, - "learning_rate": 5.820334549682938e-06, - "loss": 0.0001, - "step": 5670 - }, - { - "epoch": 8.613401573310586, - "grad_norm": 0.001076442888006568, - "learning_rate": 5.696939827311387e-06, - "loss": 0.0004, - "step": 5680 - }, - { - "epoch": 8.628566012700219, - "grad_norm": 0.02274579554796219, - "learning_rate": 5.574788120620894e-06, - "loss": 0.0008, - "step": 5690 - }, - { - "epoch": 8.64373045208985, - "grad_norm": 0.010793941095471382, - "learning_rate": 5.453882856834908e-06, - "loss": 0.0008, - "step": 5700 - }, - { - "epoch": 8.64373045208985, - "eval_loss": 0.09383995085954666, - "eval_runtime": 159.647, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 5700 - }, - { - "epoch": 8.65889489147948, - "grad_norm": 0.0014880397357046604, - "learning_rate": 5.334227428205335e-06, - "loss": 0.0006, - "step": 5710 - }, - { - "epoch": 8.674059330869111, - "grad_norm": 0.005215889774262905, - "learning_rate": 5.215825191917256e-06, - "loss": 0.0002, - "step": 5720 - }, - { - "epoch": 8.689223770258744, - "grad_norm": 0.0017830703873187304, - "learning_rate": 5.0986794699948495e-06, - "loss": 0.0009, - "step": 5730 - }, - { - "epoch": 8.704388209648375, - "grad_norm": 0.0023380089551210403, - "learning_rate": 4.9827935492081145e-06, - "loss": 0.0007, - "step": 5740 - }, - { - "epoch": 8.719552649038006, - "grad_norm": 0.053907815366983414, - "learning_rate": 4.868170680980683e-06, - "loss": 0.001, - "step": 5750 - }, - { - "epoch": 8.719552649038006, - "eval_loss": 0.09339133650064468, - "eval_runtime": 159.6349, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 5750 - }, - { - "epoch": 8.734717088427637, - "grad_norm": 0.058813899755477905, - "learning_rate": 4.754814081298597e-06, - "loss": 0.0008, - "step": 5760 - }, - { - "epoch": 8.74988152781727, - "grad_norm": 0.03185298293828964, - "learning_rate": 4.642726930620034e-06, - "loss": 0.0011, - "step": 5770 - }, - { - "epoch": 8.7650459672069, - "grad_norm": 0.012449444271624088, - "learning_rate": 4.53191237378614e-06, - "loss": 0.0011, - "step": 5780 - }, - { - "epoch": 8.780210406596531, - "grad_norm": 0.015862831845879555, - "learning_rate": 4.422373519932743e-06, - "loss": 0.0001, - "step": 5790 - }, - { - "epoch": 8.795374845986162, - "grad_norm": 0.18638499081134796, - "learning_rate": 4.31411344240314e-06, - "loss": 0.0041, - "step": 5800 - }, - { - "epoch": 8.795374845986162, - "eval_loss": 0.09254682064056396, - "eval_runtime": 159.6913, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 5800 - }, - { - "epoch": 8.810539285375794, - "grad_norm": 0.01614658534526825, - "learning_rate": 4.207135178661875e-06, - "loss": 0.001, - "step": 5810 - }, - { - "epoch": 8.825703724765425, - "grad_norm": 0.008590052835643291, - "learning_rate": 4.101441730209471e-06, - "loss": 0.0009, - "step": 5820 - }, - { - "epoch": 8.840868164155056, - "grad_norm": 0.028087787330150604, - "learning_rate": 3.997036062498299e-06, - "loss": 0.0005, - "step": 5830 - }, - { - "epoch": 8.856032603544687, - "grad_norm": 0.0031377754639834166, - "learning_rate": 3.893921104849308e-06, - "loss": 0.0001, - "step": 5840 - }, - { - "epoch": 8.87119704293432, - "grad_norm": 0.096165731549263, - "learning_rate": 3.7920997503698318e-06, - "loss": 0.0006, - "step": 5850 - }, - { - "epoch": 8.87119704293432, - "eval_loss": 0.09359559416770935, - "eval_runtime": 159.7017, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 5850 - }, - { - "epoch": 8.88636148232395, - "grad_norm": 0.015856117010116577, - "learning_rate": 3.691574855872476e-06, - "loss": 0.0014, - "step": 5860 - }, - { - "epoch": 8.901525921713581, - "grad_norm": 0.0045267799869179726, - "learning_rate": 3.592349241794929e-06, - "loss": 0.0003, - "step": 5870 - }, - { - "epoch": 8.916690361103212, - "grad_norm": 0.12610861659049988, - "learning_rate": 3.4944256921208064e-06, - "loss": 0.0002, - "step": 5880 - }, - { - "epoch": 8.931854800492845, - "grad_norm": 0.003098243148997426, - "learning_rate": 3.3978069543015966e-06, - "loss": 0.0003, - "step": 5890 - }, - { - "epoch": 8.947019239882476, - "grad_norm": 0.0028010914102196693, - "learning_rate": 3.3024957391794897e-06, - "loss": 0.001, - "step": 5900 - }, - { - "epoch": 8.947019239882476, - "eval_loss": 0.09470677375793457, - "eval_runtime": 159.67, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 5900 - }, - { - "epoch": 8.962183679272107, - "grad_norm": 0.0004256203246768564, - "learning_rate": 3.208494720911448e-06, - "loss": 0.0009, - "step": 5910 - }, - { - "epoch": 8.977348118661737, - "grad_norm": 0.0006655902252532542, - "learning_rate": 3.115806536894034e-06, - "loss": 0.0012, - "step": 5920 - }, - { - "epoch": 8.99251255805137, - "grad_norm": 0.009475434198975563, - "learning_rate": 3.0244337876895124e-06, - "loss": 0.0004, - "step": 5930 - }, - { - "epoch": 9.007676997441001, - "grad_norm": 0.0019473334541544318, - "learning_rate": 2.9343790369528502e-06, - "loss": 0.0005, - "step": 5940 - }, - { - "epoch": 9.022841436830632, - "grad_norm": 0.010196613147854805, - "learning_rate": 2.8456448113597657e-06, - "loss": 0.0001, - "step": 5950 - }, - { - "epoch": 9.022841436830632, - "eval_loss": 0.09511291980743408, - "eval_runtime": 159.6625, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 5950 - }, - { - "epoch": 9.038005876220263, - "grad_norm": 0.0021949559450149536, - "learning_rate": 2.7582336005359144e-06, - "loss": 0.0006, - "step": 5960 - }, - { - "epoch": 9.053170315609895, - "grad_norm": 0.01409760769456625, - "learning_rate": 2.6721478569869397e-06, - "loss": 0.0001, - "step": 5970 - }, - { - "epoch": 9.068334754999526, - "grad_norm": 0.009901969693601131, - "learning_rate": 2.587389996029721e-06, - "loss": 0.0001, - "step": 5980 - }, - { - "epoch": 9.083499194389157, - "grad_norm": 0.002471962245181203, - "learning_rate": 2.503962395724596e-06, - "loss": 0.0008, - "step": 5990 - }, - { - "epoch": 9.09866363377879, - "grad_norm": 0.0007780774612911046, - "learning_rate": 2.421867396808647e-06, - "loss": 0.0001, - "step": 6000 - }, - { - "epoch": 9.09866363377879, - "eval_loss": 0.09577231109142303, - "eval_runtime": 159.7102, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 6000 - }, - { - "epoch": 9.11382807316842, - "grad_norm": 0.006762175355106592, - "learning_rate": 2.3411073026299968e-06, - "loss": 0.0001, - "step": 6010 - }, - { - "epoch": 9.128992512558051, - "grad_norm": 0.0012034398969262838, - "learning_rate": 2.2616843790832287e-06, - "loss": 0.0004, - "step": 6020 - }, - { - "epoch": 9.144156951947682, - "grad_norm": 0.005463158246129751, - "learning_rate": 2.1836008545457586e-06, - "loss": 0.0001, - "step": 6030 - }, - { - "epoch": 9.159321391337315, - "grad_norm": 0.003443205961957574, - "learning_rate": 2.1068589198153686e-06, - "loss": 0.0007, - "step": 6040 - }, - { - "epoch": 9.174485830726946, - "grad_norm": 0.004269744735211134, - "learning_rate": 2.0314607280486953e-06, - "loss": 0.001, - "step": 6050 - }, - { - "epoch": 9.174485830726946, - "eval_loss": 0.09658579528331757, - "eval_runtime": 159.7256, - "eval_samples_per_second": 7.344, - "eval_steps_per_second": 7.344, - "step": 6050 - }, - { - "epoch": 9.189650270116577, - "grad_norm": 0.0027989321388304234, - "learning_rate": 1.957408394700844e-06, - "loss": 0.0007, - "step": 6060 - }, - { - "epoch": 9.204814709506207, - "grad_norm": 0.01282491534948349, - "learning_rate": 1.8847039974660142e-06, - "loss": 0.0003, - "step": 6070 - }, - { - "epoch": 9.21997914889584, - "grad_norm": 0.002059659920632839, - "learning_rate": 1.8133495762192421e-06, - "loss": 0.0009, - "step": 6080 - }, - { - "epoch": 9.23514358828547, - "grad_norm": 0.20776422321796417, - "learning_rate": 1.7433471329591223e-06, - "loss": 0.001, - "step": 6090 - }, - { - "epoch": 9.250308027675102, - "grad_norm": 0.011187134310603142, - "learning_rate": 1.6746986317516645e-06, - "loss": 0.0001, - "step": 6100 - }, - { - "epoch": 9.250308027675102, - "eval_loss": 0.09666631370782852, - "eval_runtime": 159.6594, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 6100 - }, - { - "epoch": 9.265472467064733, - "grad_norm": 0.005988691467791796, - "learning_rate": 1.6074059986751765e-06, - "loss": 0.0002, - "step": 6110 - }, - { - "epoch": 9.280636906454365, - "grad_norm": 0.004186820704489946, - "learning_rate": 1.5414711217662581e-06, - "loss": 0.0009, - "step": 6120 - }, - { - "epoch": 9.295801345843996, - "grad_norm": 0.005222671665251255, - "learning_rate": 1.476895850966764e-06, - "loss": 0.0002, - "step": 6130 - }, - { - "epoch": 9.310965785233627, - "grad_norm": 0.019084980711340904, - "learning_rate": 1.4136819980719474e-06, - "loss": 0.0001, - "step": 6140 - }, - { - "epoch": 9.326130224623258, - "grad_norm": 0.0012924578040838242, - "learning_rate": 1.3518313366796265e-06, - "loss": 0.0002, - "step": 6150 - }, - { - "epoch": 9.326130224623258, - "eval_loss": 0.09691597521305084, - "eval_runtime": 159.6676, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 6150 - }, - { - "epoch": 9.34129466401289, - "grad_norm": 0.01494539249688387, - "learning_rate": 1.2913456021403814e-06, - "loss": 0.0001, - "step": 6160 - }, - { - "epoch": 9.356459103402521, - "grad_norm": 0.11517928540706635, - "learning_rate": 1.2322264915089365e-06, - "loss": 0.0007, - "step": 6170 - }, - { - "epoch": 9.371623542792152, - "grad_norm": 0.01227965485304594, - "learning_rate": 1.174475663496466e-06, - "loss": 0.0006, - "step": 6180 - }, - { - "epoch": 9.386787982181783, - "grad_norm": 0.0013123779790475965, - "learning_rate": 1.1180947384241025e-06, - "loss": 0.0004, - "step": 6190 - }, - { - "epoch": 9.401952421571416, - "grad_norm": 0.001861977274529636, - "learning_rate": 1.0630852981774797e-06, - "loss": 0.0008, - "step": 6200 - }, - { - "epoch": 9.401952421571416, - "eval_loss": 0.09742455184459686, - "eval_runtime": 159.661, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 6200 - }, - { - "epoch": 9.417116860961046, - "grad_norm": 0.001002249657176435, - "learning_rate": 1.0094488861623175e-06, - "loss": 0.0004, - "step": 6210 - }, - { - "epoch": 9.432281300350677, - "grad_norm": 0.0025871230755001307, - "learning_rate": 9.571870072611567e-07, - "loss": 0.0003, - "step": 6220 - }, - { - "epoch": 9.447445739740308, - "grad_norm": 0.015062691643834114, - "learning_rate": 9.063011277910871e-07, - "loss": 0.0003, - "step": 6230 - }, - { - "epoch": 9.46261017912994, - "grad_norm": 0.004613468889147043, - "learning_rate": 8.567926754626587e-07, - "loss": 0.0008, - "step": 6240 - }, - { - "epoch": 9.477774618519572, - "grad_norm": 0.0015271971933543682, - "learning_rate": 8.086630393398075e-07, - "loss": 0.0004, - "step": 6250 - }, - { - "epoch": 9.477774618519572, - "eval_loss": 0.09731167554855347, - "eval_runtime": 159.6209, - "eval_samples_per_second": 7.349, - "eval_steps_per_second": 7.349, - "step": 6250 - }, - { - "epoch": 9.492939057909203, - "grad_norm": 0.005590524524450302, - "learning_rate": 7.619135698008428e-07, - "loss": 0.0002, - "step": 6260 - }, - { - "epoch": 9.508103497298833, - "grad_norm": 0.004667097236961126, - "learning_rate": 7.165455785006326e-07, - "loss": 0.0004, - "step": 6270 - }, - { - "epoch": 9.523267936688466, - "grad_norm": 0.009542740881443024, - "learning_rate": 6.725603383337275e-07, - "loss": 0.001, - "step": 6280 - }, - { - "epoch": 9.538432376078097, - "grad_norm": 0.001641739159822464, - "learning_rate": 6.299590833987123e-07, - "loss": 0.0001, - "step": 6290 - }, - { - "epoch": 9.553596815467728, - "grad_norm": 0.009384028613567352, - "learning_rate": 5.887430089635382e-07, - "loss": 0.0005, - "step": 6300 - }, - { - "epoch": 9.553596815467728, - "eval_loss": 0.09749122709035873, - "eval_runtime": 159.6379, - "eval_samples_per_second": 7.348, - "eval_steps_per_second": 7.348, - "step": 6300 - }, - { - "epoch": 9.56876125485736, - "grad_norm": 0.0033854665234684944, - "learning_rate": 5.489132714319833e-07, - "loss": 0.0007, - "step": 6310 - }, - { - "epoch": 9.583925694246991, - "grad_norm": 0.0015902504092082381, - "learning_rate": 5.104709883112513e-07, - "loss": 0.0001, - "step": 6320 - }, - { - "epoch": 9.599090133636622, - "grad_norm": 0.1731133759021759, - "learning_rate": 4.7341723818056794e-07, - "loss": 0.0017, - "step": 6330 - }, - { - "epoch": 9.614254573026253, - "grad_norm": 0.010804546065628529, - "learning_rate": 4.3775306066096146e-07, - "loss": 0.0001, - "step": 6340 - }, - { - "epoch": 9.629419012415884, - "grad_norm": 0.0010842379415407777, - "learning_rate": 4.034794563860522e-07, - "loss": 0.0003, - "step": 6350 - }, - { - "epoch": 9.629419012415884, - "eval_loss": 0.09768021106719971, - "eval_runtime": 159.6473, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 6350 - }, - { - "epoch": 9.644583451805516, - "grad_norm": 0.0016621141694486141, - "learning_rate": 3.705973869740087e-07, - "loss": 0.0003, - "step": 6360 - }, - { - "epoch": 9.659747891195147, - "grad_norm": 0.003988614305853844, - "learning_rate": 3.3910777500056333e-07, - "loss": 0.0016, - "step": 6370 - }, - { - "epoch": 9.674912330584778, - "grad_norm": 0.004535555839538574, - "learning_rate": 3.090115039731112e-07, - "loss": 0.0005, - "step": 6380 - }, - { - "epoch": 9.69007676997441, - "grad_norm": 0.004972003400325775, - "learning_rate": 2.803094183059352e-07, - "loss": 0.0004, - "step": 6390 - }, - { - "epoch": 9.705241209364042, - "grad_norm": 0.0028819472063332796, - "learning_rate": 2.5300232329651395e-07, - "loss": 0.0009, - "step": 6400 - }, - { - "epoch": 9.705241209364042, - "eval_loss": 0.09791069477796555, - "eval_runtime": 159.6575, - "eval_samples_per_second": 7.347, - "eval_steps_per_second": 7.347, - "step": 6400 - }, - { - "epoch": 9.720405648753673, - "grad_norm": 0.0019556803163141012, - "learning_rate": 2.2709098510292348e-07, - "loss": 0.0001, - "step": 6410 - }, - { - "epoch": 9.735570088143303, - "grad_norm": 0.00658820616081357, - "learning_rate": 2.0257613072233728e-07, - "loss": 0.0003, - "step": 6420 - }, - { - "epoch": 9.750734527532936, - "grad_norm": 0.0036374719347804785, - "learning_rate": 1.7945844797063737e-07, - "loss": 0.0001, - "step": 6430 - }, - { - "epoch": 9.765898966922567, - "grad_norm": 0.008759425021708012, - "learning_rate": 1.5773858546311858e-07, - "loss": 0.0, - "step": 6440 - }, - { - "epoch": 9.781063406312198, - "grad_norm": 0.0006390007329173386, - "learning_rate": 1.374171525962753e-07, - "loss": 0.0001, - "step": 6450 - }, - { - "epoch": 9.781063406312198, - "eval_loss": 0.09774022549390793, - "eval_runtime": 159.6939, - "eval_samples_per_second": 7.345, - "eval_steps_per_second": 7.345, - "step": 6450 - }, - { - "epoch": 9.796227845701829, - "grad_norm": 0.002994662383571267, - "learning_rate": 1.1849471953070957e-07, - "loss": 0.0001, - "step": 6460 - }, - { - "epoch": 9.811392285091461, - "grad_norm": 0.07306841015815735, - "learning_rate": 1.0097181717514947e-07, - "loss": 0.0016, - "step": 6470 - }, - { - "epoch": 9.826556724481092, - "grad_norm": 0.021355099976062775, - "learning_rate": 8.484893717153331e-08, - "loss": 0.0002, - "step": 6480 - }, - { - "epoch": 9.841721163870723, - "grad_norm": 0.002837724518030882, - "learning_rate": 7.012653188122053e-08, - "loss": 0.0005, - "step": 6490 - }, - { - "epoch": 9.856885603260354, - "grad_norm": 0.0014836470363661647, - "learning_rate": 5.6805014372307564e-08, - "loss": 0.0001, - "step": 6500 - }, - { - "epoch": 9.856885603260354, - "eval_loss": 0.0978918969631195, - "eval_runtime": 159.6882, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 6500 - }, - { - "epoch": 9.872050042649986, - "grad_norm": 0.0076141091994941235, - "learning_rate": 4.488475840803141e-08, - "loss": 0.0001, - "step": 6510 - }, - { - "epoch": 9.887214482039617, - "grad_norm": 0.003527163527905941, - "learning_rate": 3.436609843628369e-08, - "loss": 0.0004, - "step": 6520 - }, - { - "epoch": 9.902378921429248, - "grad_norm": 0.19706645607948303, - "learning_rate": 2.5249329580229185e-08, - "loss": 0.0006, - "step": 6530 - }, - { - "epoch": 9.917543360818879, - "grad_norm": 0.005063205026090145, - "learning_rate": 1.753470763002363e-08, - "loss": 0.0001, - "step": 6540 - }, - { - "epoch": 9.932707800208512, - "grad_norm": 0.0015864692395552993, - "learning_rate": 1.1222449035630522e-08, - "loss": 0.0001, - "step": 6550 - }, - { - "epoch": 9.932707800208512, - "eval_loss": 0.09780346602201462, - "eval_runtime": 159.6726, - "eval_samples_per_second": 7.346, - "eval_steps_per_second": 7.346, - "step": 6550 - }, - { - "epoch": 9.947872239598142, - "grad_norm": 0.009153211489319801, - "learning_rate": 6.312730900770447e-09, - "loss": 0.0004, - "step": 6560 - }, - { - "epoch": 9.963036678987773, - "grad_norm": 0.10997940599918365, - "learning_rate": 2.8056909779250463e-09, - "loss": 0.0001, - "step": 6570 - }, - { - "epoch": 9.978201118377404, - "grad_norm": 0.012196751311421394, - "learning_rate": 7.014276644901063e-10, - "loss": 0.0014, - "step": 6580 - }, - { - "epoch": 9.993365557767037, - "grad_norm": 0.002295834943652153, - "learning_rate": 0.0, - "loss": 0.0009, - "step": 6590 - }, - { - "epoch": 9.993365557767037, - "step": 6590, - "total_flos": 1.6539435682200945e+18, - "train_loss": 0.029239414032335445, - "train_runtime": 72432.9733, - "train_samples_per_second": 1.457, + "epoch": 4.996682778883518, + "step": 3295, + "total_flos": 8.269502901505229e+17, + "train_loss": 0.04518177697310657, + "train_runtime": 36119.4942, + "train_samples_per_second": 1.461, "train_steps_per_second": 0.091 } ], "logging_steps": 10, - "max_steps": 6590, + "max_steps": 3295, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { @@ -5696,7 +2858,7 @@ "attributes": {} } }, - "total_flos": 1.6539435682200945e+18, + "total_flos": 8.269502901505229e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null