{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 6666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0144014401440144, "grad_norm": 10.98869514465332, "learning_rate": 4.347826086956522e-07, "loss": 0.6974, "step": 32 }, { "epoch": 0.0288028802880288, "grad_norm": 10.853715896606445, "learning_rate": 9.145427286356823e-07, "loss": 0.6333, "step": 64 }, { "epoch": 0.043204320432043204, "grad_norm": 13.330265045166016, "learning_rate": 1.3943028485757123e-06, "loss": 0.5058, "step": 96 }, { "epoch": 0.0576057605760576, "grad_norm": 11.676138877868652, "learning_rate": 1.8740629685157422e-06, "loss": 0.5039, "step": 128 }, { "epoch": 0.07200720072007201, "grad_norm": 11.024947166442871, "learning_rate": 2.3388305847076464e-06, "loss": 0.5272, "step": 160 }, { "epoch": 0.08640864086408641, "grad_norm": 11.500349998474121, "learning_rate": 2.8185907046476763e-06, "loss": 0.5414, "step": 192 }, { "epoch": 0.10081008100810081, "grad_norm": 14.144675254821777, "learning_rate": 3.2983508245877066e-06, "loss": 0.5167, "step": 224 }, { "epoch": 0.1152115211521152, "grad_norm": 15.339383125305176, "learning_rate": 3.763118440779611e-06, "loss": 0.4782, "step": 256 }, { "epoch": 0.12961296129612962, "grad_norm": 14.547229766845703, "learning_rate": 4.242878560719641e-06, "loss": 0.5071, "step": 288 }, { "epoch": 0.14401440144014402, "grad_norm": 16.514537811279297, "learning_rate": 4.722638680659671e-06, "loss": 0.5238, "step": 320 }, { "epoch": 0.15841584158415842, "grad_norm": 13.728464126586914, "learning_rate": 5.2023988005997004e-06, "loss": 0.6176, "step": 352 }, { "epoch": 0.17281728172817282, "grad_norm": 9.859703063964844, "learning_rate": 5.682158920539731e-06, "loss": 0.5022, "step": 384 }, { "epoch": 0.18721872187218722, "grad_norm": 16.529651641845703, "learning_rate": 6.16191904047976e-06, "loss": 0.5606, "step": 416 }, { "epoch": 0.20162016201620162, "grad_norm": 12.64522647857666, "learning_rate": 6.6416791604197905e-06, "loss": 0.5115, "step": 448 }, { "epoch": 0.21602160216021601, "grad_norm": 9.362441062927246, "learning_rate": 7.121439280359821e-06, "loss": 0.5371, "step": 480 }, { "epoch": 0.2304230423042304, "grad_norm": 12.86221694946289, "learning_rate": 7.60119940029985e-06, "loss": 0.5343, "step": 512 }, { "epoch": 0.2448244824482448, "grad_norm": 12.512419700622559, "learning_rate": 8.065967016491755e-06, "loss": 0.6191, "step": 544 }, { "epoch": 0.25922592259225924, "grad_norm": 14.35505199432373, "learning_rate": 8.53073463268366e-06, "loss": 0.5668, "step": 576 }, { "epoch": 0.27362736273627364, "grad_norm": 14.951635360717773, "learning_rate": 9.010494752623688e-06, "loss": 0.6024, "step": 608 }, { "epoch": 0.28802880288028804, "grad_norm": 11.853578567504883, "learning_rate": 9.490254872563718e-06, "loss": 0.6092, "step": 640 }, { "epoch": 0.30243024302430244, "grad_norm": 9.165077209472656, "learning_rate": 9.970014992503749e-06, "loss": 0.5587, "step": 672 }, { "epoch": 0.31683168316831684, "grad_norm": 12.137311935424805, "learning_rate": 9.999382956748588e-06, "loss": 0.5444, "step": 704 }, { "epoch": 0.33123312331233123, "grad_norm": 15.218328475952148, "learning_rate": 9.997364717027728e-06, "loss": 0.5909, "step": 736 }, { "epoch": 0.34563456345634563, "grad_norm": 13.624066352844238, "learning_rate": 9.993943105133823e-06, "loss": 0.6135, "step": 768 }, { "epoch": 0.36003600360036003, "grad_norm": 12.48714542388916, "learning_rate": 9.989119081932283e-06, "loss": 0.5868, "step": 800 }, { "epoch": 0.37443744374437443, "grad_norm": 20.65450668334961, "learning_rate": 9.9828940021171e-06, "loss": 0.6172, "step": 832 }, { "epoch": 0.38883888388838883, "grad_norm": 15.066054344177246, "learning_rate": 9.975269613830395e-06, "loss": 0.5682, "step": 864 }, { "epoch": 0.40324032403240323, "grad_norm": 10.448845863342285, "learning_rate": 9.966248058171527e-06, "loss": 0.6659, "step": 896 }, { "epoch": 0.41764176417641763, "grad_norm": 6.550926208496094, "learning_rate": 9.955831868595796e-06, "loss": 0.6053, "step": 928 }, { "epoch": 0.43204320432043203, "grad_norm": 18.164133071899414, "learning_rate": 9.94402397020302e-06, "loss": 0.5109, "step": 960 }, { "epoch": 0.4464446444644464, "grad_norm": 26.886669158935547, "learning_rate": 9.930827678916084e-06, "loss": 0.6006, "step": 992 }, { "epoch": 0.4608460846084608, "grad_norm": 9.457767486572266, "learning_rate": 9.916246700549754e-06, "loss": 0.5794, "step": 1024 }, { "epoch": 0.4752475247524752, "grad_norm": 15.5580472946167, "learning_rate": 9.900285129770016e-06, "loss": 0.5606, "step": 1056 }, { "epoch": 0.4896489648964896, "grad_norm": 9.84897232055664, "learning_rate": 9.882947448944177e-06, "loss": 0.6579, "step": 1088 }, { "epoch": 0.504050405040504, "grad_norm": 10.959239959716797, "learning_rate": 9.864238526882147e-06, "loss": 0.5455, "step": 1120 }, { "epoch": 0.5184518451845185, "grad_norm": 15.946135520935059, "learning_rate": 9.844163617469138e-06, "loss": 0.6002, "step": 1152 }, { "epoch": 0.5328532853285328, "grad_norm": 12.9840726852417, "learning_rate": 9.822728358190274e-06, "loss": 0.6541, "step": 1184 }, { "epoch": 0.5472547254725473, "grad_norm": 11.688780784606934, "learning_rate": 9.799938768547452e-06, "loss": 0.6294, "step": 1216 }, { "epoch": 0.5616561656165616, "grad_norm": 13.91913890838623, "learning_rate": 9.77580124836893e-06, "loss": 0.5856, "step": 1248 }, { "epoch": 0.5760576057605761, "grad_norm": 13.728696823120117, "learning_rate": 9.750322576012119e-06, "loss": 0.622, "step": 1280 }, { "epoch": 0.5904590459045904, "grad_norm": 8.887660026550293, "learning_rate": 9.723509906460054e-06, "loss": 0.6663, "step": 1312 }, { "epoch": 0.6048604860486049, "grad_norm": 12.27409839630127, "learning_rate": 9.69537076931213e-06, "loss": 0.5707, "step": 1344 }, { "epoch": 0.6192619261926192, "grad_norm": 23.663471221923828, "learning_rate": 9.665913066669608e-06, "loss": 0.6018, "step": 1376 }, { "epoch": 0.6336633663366337, "grad_norm": 14.472834587097168, "learning_rate": 9.635145070916541e-06, "loss": 0.5736, "step": 1408 }, { "epoch": 0.648064806480648, "grad_norm": 13.856266975402832, "learning_rate": 9.603075422396685e-06, "loss": 0.6117, "step": 1440 }, { "epoch": 0.6624662466246625, "grad_norm": 11.307412147521973, "learning_rate": 9.569713126987122e-06, "loss": 0.6123, "step": 1472 }, { "epoch": 0.6768676867686768, "grad_norm": 12.995174407958984, "learning_rate": 9.535067553569175e-06, "loss": 0.5784, "step": 1504 }, { "epoch": 0.6912691269126913, "grad_norm": 21.58072853088379, "learning_rate": 9.499148431397448e-06, "loss": 0.6081, "step": 1536 }, { "epoch": 0.7056705670567057, "grad_norm": 16.922595977783203, "learning_rate": 9.461965847367611e-06, "loss": 0.6303, "step": 1568 }, { "epoch": 0.7200720072007201, "grad_norm": 17.760387420654297, "learning_rate": 9.423530243183783e-06, "loss": 0.5761, "step": 1600 }, { "epoch": 0.7344734473447345, "grad_norm": 11.714215278625488, "learning_rate": 9.385111038077417e-06, "loss": 0.5852, "step": 1632 }, { "epoch": 0.7488748874887489, "grad_norm": 12.36257266998291, "learning_rate": 9.344240422449167e-06, "loss": 0.5965, "step": 1664 }, { "epoch": 0.7632763276327633, "grad_norm": 14.300825119018555, "learning_rate": 9.302149846608464e-06, "loss": 0.5316, "step": 1696 }, { "epoch": 0.7776777677767777, "grad_norm": 16.075668334960938, "learning_rate": 9.25885113053368e-06, "loss": 0.6228, "step": 1728 }, { "epoch": 0.7920792079207921, "grad_norm": 11.793745040893555, "learning_rate": 9.214356433476091e-06, "loss": 0.5644, "step": 1760 }, { "epoch": 0.8064806480648065, "grad_norm": 12.965546607971191, "learning_rate": 9.168678250545255e-06, "loss": 0.5646, "step": 1792 }, { "epoch": 0.8208820882088209, "grad_norm": 9.327117919921875, "learning_rate": 9.121829409200145e-06, "loss": 0.5961, "step": 1824 }, { "epoch": 0.8352835283528353, "grad_norm": 9.750507354736328, "learning_rate": 9.073823065646882e-06, "loss": 0.6158, "step": 1856 }, { "epoch": 0.8496849684968497, "grad_norm": 17.131214141845703, "learning_rate": 9.024672701144184e-06, "loss": 0.5422, "step": 1888 }, { "epoch": 0.8640864086408641, "grad_norm": 9.129823684692383, "learning_rate": 8.97439211821753e-06, "loss": 0.5846, "step": 1920 }, { "epoch": 0.8784878487848785, "grad_norm": 9.996623039245605, "learning_rate": 8.922995436783104e-06, "loss": 0.5824, "step": 1952 }, { "epoch": 0.8928892889288929, "grad_norm": 12.214370727539062, "learning_rate": 8.870497090182593e-06, "loss": 0.5921, "step": 1984 }, { "epoch": 0.9072907290729073, "grad_norm": 12.267783164978027, "learning_rate": 8.816911821129992e-06, "loss": 0.5496, "step": 2016 }, { "epoch": 0.9216921692169217, "grad_norm": 11.49367618560791, "learning_rate": 8.762254677571517e-06, "loss": 0.6221, "step": 2048 }, { "epoch": 0.9360936093609361, "grad_norm": 11.387689590454102, "learning_rate": 8.706541008459798e-06, "loss": 0.5685, "step": 2080 }, { "epoch": 0.9504950495049505, "grad_norm": 12.342247009277344, "learning_rate": 8.649786459443555e-06, "loss": 0.5922, "step": 2112 }, { "epoch": 0.9648964896489649, "grad_norm": 17.48115348815918, "learning_rate": 8.59200696847395e-06, "loss": 0.6309, "step": 2144 }, { "epoch": 0.9792979297929792, "grad_norm": 13.321036338806152, "learning_rate": 8.533218761328843e-06, "loss": 0.5513, "step": 2176 }, { "epoch": 0.9936993699369937, "grad_norm": 13.628349304199219, "learning_rate": 8.473438347056239e-06, "loss": 0.5934, "step": 2208 }, { "epoch": 1.008100810081008, "grad_norm": 6.996334075927734, "learning_rate": 8.412682513338176e-06, "loss": 0.4909, "step": 2240 }, { "epoch": 1.0225022502250225, "grad_norm": 10.135323524475098, "learning_rate": 8.35096832177636e-06, "loss": 0.3238, "step": 2272 }, { "epoch": 1.036903690369037, "grad_norm": 7.344214916229248, "learning_rate": 8.28831310310089e-06, "loss": 0.297, "step": 2304 }, { "epoch": 1.0513051305130514, "grad_norm": 8.131421089172363, "learning_rate": 8.224734452303397e-06, "loss": 0.3335, "step": 2336 }, { "epoch": 1.0657065706570656, "grad_norm": 8.626580238342285, "learning_rate": 8.160250223695987e-06, "loss": 0.3357, "step": 2368 }, { "epoch": 1.08010801080108, "grad_norm": 20.845579147338867, "learning_rate": 8.094878525897325e-06, "loss": 0.3235, "step": 2400 }, { "epoch": 1.0945094509450946, "grad_norm": 13.4508695602417, "learning_rate": 8.028637716747355e-06, "loss": 0.3331, "step": 2432 }, { "epoch": 1.108910891089109, "grad_norm": 11.288851737976074, "learning_rate": 7.961546398151988e-06, "loss": 0.3425, "step": 2464 }, { "epoch": 1.1233123312331232, "grad_norm": 12.70616626739502, "learning_rate": 7.893623410859282e-06, "loss": 0.343, "step": 2496 }, { "epoch": 1.1377137713771377, "grad_norm": 14.446159362792969, "learning_rate": 7.824887829168522e-06, "loss": 0.3426, "step": 2528 }, { "epoch": 1.1521152115211521, "grad_norm": 9.85571575164795, "learning_rate": 7.755358955573747e-06, "loss": 0.3229, "step": 2560 }, { "epoch": 1.1665166516651666, "grad_norm": 16.067195892333984, "learning_rate": 7.685056315343165e-06, "loss": 0.3246, "step": 2592 }, { "epoch": 1.1809180918091808, "grad_norm": 20.33487319946289, "learning_rate": 7.613999651036016e-06, "loss": 0.374, "step": 2624 }, { "epoch": 1.1953195319531953, "grad_norm": 12.800278663635254, "learning_rate": 7.542208916958433e-06, "loss": 0.3008, "step": 2656 }, { "epoch": 1.2097209720972097, "grad_norm": 10.589587211608887, "learning_rate": 7.469704273559807e-06, "loss": 0.3738, "step": 2688 }, { "epoch": 1.2241224122412242, "grad_norm": 9.341863632202148, "learning_rate": 7.396506081771295e-06, "loss": 0.3035, "step": 2720 }, { "epoch": 1.2385238523852384, "grad_norm": 8.244071960449219, "learning_rate": 7.322634897288008e-06, "loss": 0.3748, "step": 2752 }, { "epoch": 1.2529252925292529, "grad_norm": 6.00961971282959, "learning_rate": 7.248111464796508e-06, "loss": 0.2727, "step": 2784 }, { "epoch": 1.2673267326732673, "grad_norm": 9.179043769836426, "learning_rate": 7.172956712149234e-06, "loss": 0.3004, "step": 2816 }, { "epoch": 1.2817281728172818, "grad_norm": 9.150934219360352, "learning_rate": 7.0971917444875015e-06, "loss": 0.3635, "step": 2848 }, { "epoch": 1.296129612961296, "grad_norm": 9.036107063293457, "learning_rate": 7.020837838314691e-06, "loss": 0.2943, "step": 2880 }, { "epoch": 1.3105310531053105, "grad_norm": 9.950891494750977, "learning_rate": 6.94391643552134e-06, "loss": 0.3727, "step": 2912 }, { "epoch": 1.324932493249325, "grad_norm": 15.246482849121094, "learning_rate": 6.866449137363768e-06, "loss": 0.2992, "step": 2944 }, { "epoch": 1.3393339333933394, "grad_norm": 10.054420471191406, "learning_rate": 6.788457698397973e-06, "loss": 0.3322, "step": 2976 }, { "epoch": 1.3537353735373538, "grad_norm": 10.79068374633789, "learning_rate": 6.709964020370445e-06, "loss": 0.3245, "step": 3008 }, { "epoch": 1.368136813681368, "grad_norm": 10.584324836730957, "learning_rate": 6.630990146067687e-06, "loss": 0.3418, "step": 3040 }, { "epoch": 1.3825382538253825, "grad_norm": 16.110841751098633, "learning_rate": 6.554047211421132e-06, "loss": 0.3437, "step": 3072 }, { "epoch": 1.396939693969397, "grad_norm": 11.55324649810791, "learning_rate": 6.4741928832357855e-06, "loss": 0.3011, "step": 3104 }, { "epoch": 1.4113411341134112, "grad_norm": 15.144320487976074, "learning_rate": 6.393924568602145e-06, "loss": 0.337, "step": 3136 }, { "epoch": 1.4257425742574257, "grad_norm": 10.759925842285156, "learning_rate": 6.313264808664494e-06, "loss": 0.3149, "step": 3168 }, { "epoch": 1.4401440144014401, "grad_norm": 14.752134323120117, "learning_rate": 6.232236254493746e-06, "loss": 0.3622, "step": 3200 }, { "epoch": 1.4545454545454546, "grad_norm": 11.484366416931152, "learning_rate": 6.150861660726515e-06, "loss": 0.3112, "step": 3232 }, { "epoch": 1.468946894689469, "grad_norm": 13.652670860290527, "learning_rate": 6.069163879175092e-06, "loss": 0.3403, "step": 3264 }, { "epoch": 1.4833483348334833, "grad_norm": 9.8715181350708, "learning_rate": 5.9871658524101565e-06, "loss": 0.3205, "step": 3296 }, { "epoch": 1.4977497749774977, "grad_norm": 11.28494644165039, "learning_rate": 5.9048906073179824e-06, "loss": 0.304, "step": 3328 }, { "epoch": 1.5121512151215122, "grad_norm": 13.887341499328613, "learning_rate": 5.822361248633973e-06, "loss": 0.3027, "step": 3360 }, { "epoch": 1.5265526552655264, "grad_norm": 7.218605041503906, "learning_rate": 5.7396009524543274e-06, "loss": 0.3081, "step": 3392 }, { "epoch": 1.5409540954095409, "grad_norm": 7.5199503898620605, "learning_rate": 5.656632959727683e-06, "loss": 0.2977, "step": 3424 }, { "epoch": 1.5553555355535553, "grad_norm": 9.384359359741211, "learning_rate": 5.57348056972852e-06, "loss": 0.3032, "step": 3456 }, { "epoch": 1.5697569756975698, "grad_norm": 14.385334014892578, "learning_rate": 5.492772883806706e-06, "loss": 0.3244, "step": 3488 }, { "epoch": 1.5841584158415842, "grad_norm": 3.620002269744873, "learning_rate": 5.40932574467665e-06, "loss": 0.2992, "step": 3520 }, { "epoch": 1.5985598559855987, "grad_norm": 9.094705581665039, "learning_rate": 5.325763657690609e-06, "loss": 0.2942, "step": 3552 }, { "epoch": 1.612961296129613, "grad_norm": 10.146318435668945, "learning_rate": 5.242110088958073e-06, "loss": 0.3395, "step": 3584 }, { "epoch": 1.6273627362736274, "grad_norm": 13.460920333862305, "learning_rate": 5.158388530278656e-06, "loss": 0.3097, "step": 3616 }, { "epoch": 1.6417641764176416, "grad_norm": 11.407035827636719, "learning_rate": 5.074622492545074e-06, "loss": 0.3159, "step": 3648 }, { "epoch": 1.656165616561656, "grad_norm": 10.230562210083008, "learning_rate": 4.9908354991407666e-06, "loss": 0.327, "step": 3680 }, { "epoch": 1.6705670567056705, "grad_norm": 11.073262214660645, "learning_rate": 4.9070510793339835e-06, "loss": 0.2981, "step": 3712 }, { "epoch": 1.684968496849685, "grad_norm": 12.47003173828125, "learning_rate": 4.823292761670264e-06, "loss": 0.3065, "step": 3744 }, { "epoch": 1.6993699369936994, "grad_norm": 25.917009353637695, "learning_rate": 4.74219897937266e-06, "loss": 0.2992, "step": 3776 }, { "epoch": 1.7137713771377139, "grad_norm": 11.52535629272461, "learning_rate": 4.658560774737667e-06, "loss": 0.3474, "step": 3808 }, { "epoch": 1.7281728172817283, "grad_norm": 9.606013298034668, "learning_rate": 4.5750184539003665e-06, "loss": 0.3436, "step": 3840 }, { "epoch": 1.7425742574257426, "grad_norm": 15.367290496826172, "learning_rate": 4.4915954774194676e-06, "loss": 0.2932, "step": 3872 }, { "epoch": 1.756975697569757, "grad_norm": 7.993281364440918, "learning_rate": 4.408315272339104e-06, "loss": 0.3203, "step": 3904 }, { "epoch": 1.7713771377137713, "grad_norm": 8.027710914611816, "learning_rate": 4.325201225609999e-06, "loss": 0.3139, "step": 3936 }, { "epoch": 1.7857785778577857, "grad_norm": 10.957657814025879, "learning_rate": 4.242276677521877e-06, "loss": 0.3453, "step": 3968 }, { "epoch": 1.8001800180018002, "grad_norm": 10.544370651245117, "learning_rate": 4.159564915148997e-06, "loss": 0.2853, "step": 4000 }, { "epoch": 1.8145814581458146, "grad_norm": 12.427223205566406, "learning_rate": 4.077089165810611e-06, "loss": 0.3355, "step": 4032 }, { "epoch": 1.828982898289829, "grad_norm": 18.81423568725586, "learning_rate": 3.994872590548211e-06, "loss": 0.328, "step": 4064 }, { "epoch": 1.8433843384338435, "grad_norm": 9.081976890563965, "learning_rate": 3.9129382776213945e-06, "loss": 0.293, "step": 4096 }, { "epoch": 1.8577857785778578, "grad_norm": 8.164251327514648, "learning_rate": 3.831309236024159e-06, "loss": 0.2782, "step": 4128 }, { "epoch": 1.8721872187218722, "grad_norm": 9.661165237426758, "learning_rate": 3.7500083890234606e-06, "loss": 0.3296, "step": 4160 }, { "epoch": 1.8865886588658864, "grad_norm": 14.657001495361328, "learning_rate": 3.66905856772185e-06, "loss": 0.2631, "step": 4192 }, { "epoch": 1.900990099009901, "grad_norm": 16.178007125854492, "learning_rate": 3.5884825046459805e-06, "loss": 0.2765, "step": 4224 }, { "epoch": 1.9153915391539154, "grad_norm": 7.737805366516113, "learning_rate": 3.508302827362805e-06, "loss": 0.317, "step": 4256 }, { "epoch": 1.9297929792979298, "grad_norm": 9.958755493164062, "learning_rate": 3.4285420521252533e-06, "loss": 0.2975, "step": 4288 }, { "epoch": 1.9441944194419443, "grad_norm": 13.483292579650879, "learning_rate": 3.3492225775491582e-06, "loss": 0.2776, "step": 4320 }, { "epoch": 1.9585958595859587, "grad_norm": 11.092999458312988, "learning_rate": 3.270366678323219e-06, "loss": 0.3453, "step": 4352 }, { "epoch": 1.972997299729973, "grad_norm": 11.395092964172363, "learning_rate": 3.1919964989537755e-06, "loss": 0.2678, "step": 4384 }, { "epoch": 1.9873987398739874, "grad_norm": 10.83203411102295, "learning_rate": 3.1141340475461316e-06, "loss": 0.3074, "step": 4416 }, { "epoch": 2.0018001800180016, "grad_norm": 6.892433166503906, "learning_rate": 3.03680118962418e-06, "loss": 0.3064, "step": 4448 }, { "epoch": 2.016201620162016, "grad_norm": 6.377696514129639, "learning_rate": 2.9600196419900795e-06, "loss": 0.1179, "step": 4480 }, { "epoch": 2.0306030603060305, "grad_norm": 8.865036964416504, "learning_rate": 2.883810966625684e-06, "loss": 0.1444, "step": 4512 }, { "epoch": 2.045004500450045, "grad_norm": 7.50280237197876, "learning_rate": 2.8081965646374582e-06, "loss": 0.1138, "step": 4544 }, { "epoch": 2.0594059405940595, "grad_norm": 9.762062072753906, "learning_rate": 2.7331976702465647e-06, "loss": 0.1251, "step": 4576 }, { "epoch": 2.073807380738074, "grad_norm": 10.026853561401367, "learning_rate": 2.658835344825821e-06, "loss": 0.1239, "step": 4608 }, { "epoch": 2.0882088208820884, "grad_norm": 10.256850242614746, "learning_rate": 2.5851304709851855e-06, "loss": 0.1131, "step": 4640 }, { "epoch": 2.102610261026103, "grad_norm": 11.005268096923828, "learning_rate": 2.5121037467074596e-06, "loss": 0.1224, "step": 4672 }, { "epoch": 2.117011701170117, "grad_norm": 11.805505752563477, "learning_rate": 2.4397756795358287e-06, "loss": 0.1101, "step": 4704 }, { "epoch": 2.1314131413141313, "grad_norm": 7.897261142730713, "learning_rate": 2.3703932820444233e-06, "loss": 0.1238, "step": 4736 }, { "epoch": 2.1458145814581457, "grad_norm": 10.117232322692871, "learning_rate": 2.2994998626884623e-06, "loss": 0.1072, "step": 4768 }, { "epoch": 2.16021602160216, "grad_norm": 9.720394134521484, "learning_rate": 2.2293648043808946e-06, "loss": 0.1377, "step": 4800 }, { "epoch": 2.1746174617461747, "grad_norm": 9.53496265411377, "learning_rate": 2.1600078026201977e-06, "loss": 0.114, "step": 4832 }, { "epoch": 2.189018901890189, "grad_norm": 9.775762557983398, "learning_rate": 2.091448334409112e-06, "loss": 0.1053, "step": 4864 }, { "epoch": 2.2034203420342036, "grad_norm": 11.364226341247559, "learning_rate": 2.0237056527850555e-06, "loss": 0.0966, "step": 4896 }, { "epoch": 2.217821782178218, "grad_norm": 17.391111373901367, "learning_rate": 1.95679878141344e-06, "loss": 0.1016, "step": 4928 }, { "epoch": 2.232223222322232, "grad_norm": 10.15132999420166, "learning_rate": 1.8907465092453986e-06, "loss": 0.1184, "step": 4960 }, { "epoch": 2.2466246624662465, "grad_norm": 12.572965621948242, "learning_rate": 1.8255673852414274e-06, "loss": 0.1015, "step": 4992 }, { "epoch": 2.261026102610261, "grad_norm": 5.2437968254089355, "learning_rate": 1.7612797131624243e-06, "loss": 0.0993, "step": 5024 }, { "epoch": 2.2754275427542754, "grad_norm": 8.918633460998535, "learning_rate": 1.6979015464295785e-06, "loss": 0.1079, "step": 5056 }, { "epoch": 2.28982898289829, "grad_norm": 9.719175338745117, "learning_rate": 1.6354506830545625e-06, "loss": 0.1016, "step": 5088 }, { "epoch": 2.3042304230423043, "grad_norm": 24.36786460876465, "learning_rate": 1.5739446606414522e-06, "loss": 0.1342, "step": 5120 }, { "epoch": 2.3186318631863188, "grad_norm": 7.500924587249756, "learning_rate": 1.5134007514617827e-06, "loss": 0.0955, "step": 5152 }, { "epoch": 2.333033303330333, "grad_norm": 10.143550872802734, "learning_rate": 1.4538359576040923e-06, "loss": 0.1451, "step": 5184 }, { "epoch": 2.3474347434743477, "grad_norm": 15.87746524810791, "learning_rate": 1.395267006199363e-06, "loss": 0.1329, "step": 5216 }, { "epoch": 2.3618361836183617, "grad_norm": 12.571290016174316, "learning_rate": 1.33771034472367e-06, "loss": 0.0978, "step": 5248 }, { "epoch": 2.376237623762376, "grad_norm": 11.44887924194336, "learning_rate": 1.2811821363793497e-06, "loss": 0.1037, "step": 5280 }, { "epoch": 2.3906390639063906, "grad_norm": 11.863057136535645, "learning_rate": 1.2256982555560243e-06, "loss": 0.116, "step": 5312 }, { "epoch": 2.405040504050405, "grad_norm": 5.990699768066406, "learning_rate": 1.171274283372703e-06, "loss": 0.1037, "step": 5344 }, { "epoch": 2.4194419441944195, "grad_norm": 10.163458824157715, "learning_rate": 1.1179255033022624e-06, "loss": 0.1387, "step": 5376 }, { "epoch": 2.433843384338434, "grad_norm": 11.073272705078125, "learning_rate": 1.0672833301104142e-06, "loss": 0.12, "step": 5408 }, { "epoch": 2.4482448244824484, "grad_norm": 10.601714134216309, "learning_rate": 1.016094827126849e-06, "loss": 0.0957, "step": 5440 }, { "epoch": 2.4626462646264624, "grad_norm": 7.653828144073486, "learning_rate": 9.660250941303178e-07, "loss": 0.113, "step": 5472 }, { "epoch": 2.477047704770477, "grad_norm": 10.332706451416016, "learning_rate": 9.170881918256042e-07, "loss": 0.1001, "step": 5504 }, { "epoch": 2.4914491449144913, "grad_norm": 7.20733642578125, "learning_rate": 8.692978627932148e-07, "loss": 0.0998, "step": 5536 }, { "epoch": 2.5058505850585058, "grad_norm": 4.12467622756958, "learning_rate": 8.226675276301416e-07, "loss": 0.0855, "step": 5568 }, { "epoch": 2.5202520252025202, "grad_norm": 9.155281066894531, "learning_rate": 7.772102811810689e-07, "loss": 0.122, "step": 5600 }, { "epoch": 2.5346534653465347, "grad_norm": 5.481142997741699, "learning_rate": 7.329388888610384e-07, "loss": 0.0978, "step": 5632 }, { "epoch": 2.549054905490549, "grad_norm": 7.971097946166992, "learning_rate": 6.898657830706367e-07, "loss": 0.1157, "step": 5664 }, { "epoch": 2.5634563456345636, "grad_norm": 11.405919075012207, "learning_rate": 6.492928309381779e-07, "loss": 0.0979, "step": 5696 }, { "epoch": 2.577857785778578, "grad_norm": 13.61896800994873, "learning_rate": 6.08613879617217e-07, "loss": 0.1041, "step": 5728 }, { "epoch": 2.592259225922592, "grad_norm": 29.495868682861328, "learning_rate": 5.691681280788214e-07, "loss": 0.0992, "step": 5760 }, { "epoch": 2.6066606660666065, "grad_norm": 11.616131782531738, "learning_rate": 5.309666535753417e-07, "loss": 0.1051, "step": 5792 }, { "epoch": 2.621062106210621, "grad_norm": 8.599799156188965, "learning_rate": 4.940201839382114e-07, "loss": 0.1091, "step": 5824 }, { "epoch": 2.6354635463546354, "grad_norm": 12.70768928527832, "learning_rate": 4.5833909456532764e-07, "loss": 0.1111, "step": 5856 }, { "epoch": 2.64986498649865, "grad_norm": 11.202815055847168, "learning_rate": 4.2393340550740844e-07, "loss": 0.1046, "step": 5888 }, { "epoch": 2.6642664266426643, "grad_norm": 11.233574867248535, "learning_rate": 3.908127786541427e-07, "loss": 0.1022, "step": 5920 }, { "epoch": 2.678667866786679, "grad_norm": 9.141092300415039, "learning_rate": 3.589865150209071e-07, "loss": 0.0984, "step": 5952 }, { "epoch": 2.693069306930693, "grad_norm": 15.323395729064941, "learning_rate": 3.2846355213683456e-07, "loss": 0.1279, "step": 5984 }, { "epoch": 2.7074707470747077, "grad_norm": 10.427877426147461, "learning_rate": 2.9925246153496067e-07, "loss": 0.1169, "step": 6016 }, { "epoch": 2.7218721872187217, "grad_norm": 12.140647888183594, "learning_rate": 2.713614463451364e-07, "loss": 0.1267, "step": 6048 }, { "epoch": 2.736273627362736, "grad_norm": 13.221187591552734, "learning_rate": 2.4479833899041183e-07, "loss": 0.1198, "step": 6080 }, { "epoch": 2.7506750675067506, "grad_norm": 5.667657852172852, "learning_rate": 2.19570598987513e-07, "loss": 0.1312, "step": 6112 }, { "epoch": 2.765076507650765, "grad_norm": 0.735275149345398, "learning_rate": 1.9568531085204067e-07, "loss": 0.0699, "step": 6144 }, { "epoch": 2.7794779477947795, "grad_norm": 4.510382652282715, "learning_rate": 1.731491821089848e-07, "loss": 0.0965, "step": 6176 }, { "epoch": 2.793879387938794, "grad_norm": 13.006752967834473, "learning_rate": 1.5196854140909545e-07, "loss": 0.1026, "step": 6208 }, { "epoch": 2.8082808280828084, "grad_norm": 10.59677791595459, "learning_rate": 1.321493367516574e-07, "loss": 0.1093, "step": 6240 }, { "epoch": 2.8226822682268224, "grad_norm": 8.108485221862793, "learning_rate": 1.136971338141596e-07, "loss": 0.1191, "step": 6272 }, { "epoch": 2.8370837083708373, "grad_norm": 11.026812553405762, "learning_rate": 9.661711438932686e-08, "loss": 0.1008, "step": 6304 }, { "epoch": 2.8514851485148514, "grad_norm": 7.0519890785217285, "learning_rate": 8.09140749299564e-08, "loss": 0.1386, "step": 6336 }, { "epoch": 2.865886588658866, "grad_norm": 27.898834228515625, "learning_rate": 6.659242520196562e-08, "loss": 0.103, "step": 6368 }, { "epoch": 2.8802880288028803, "grad_norm": 7.882510185241699, "learning_rate": 5.365618704603392e-08, "loss": 0.0812, "step": 6400 }, { "epoch": 2.8946894689468947, "grad_norm": 15.030416488647461, "learning_rate": 4.2108993248173855e-08, "loss": 0.102, "step": 6432 }, { "epoch": 2.909090909090909, "grad_norm": 11.23343276977539, "learning_rate": 3.195408651956944e-08, "loss": 0.0976, "step": 6464 }, { "epoch": 2.9234923492349236, "grad_norm": 6.837776184082031, "learning_rate": 2.3194318585945673e-08, "loss": 0.0938, "step": 6496 }, { "epoch": 2.937893789378938, "grad_norm": 9.184088706970215, "learning_rate": 1.583214938674138e-08, "loss": 0.1087, "step": 6528 }, { "epoch": 2.952295229522952, "grad_norm": 7.366724967956543, "learning_rate": 1.0034770609533285e-08, "loss": 0.1139, "step": 6560 }, { "epoch": 2.9666966696669665, "grad_norm": 10.501226425170898, "learning_rate": 5.429794877803151e-09, "loss": 0.0928, "step": 6592 }, { "epoch": 2.981098109810981, "grad_norm": 7.6273908615112305, "learning_rate": 2.2274065574556804e-09, "loss": 0.1124, "step": 6624 }, { "epoch": 2.9954995499549955, "grad_norm": 5.487658977508545, "learning_rate": 4.2850495100610344e-10, "loss": 0.1158, "step": 6656 }, { "epoch": 3.0, "step": 6666, "total_flos": 9491636163499776.0, "train_loss": 0.3371730904088448, "train_runtime": 4274.2917, "train_samples_per_second": 1.56, "train_steps_per_second": 1.56 } ], "logging_steps": 32, "max_steps": 6666, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9491636163499776.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }