{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 13.215859030837004, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04405286343612335, "grad_norm": 32.503074645996094, "learning_rate": 0.0002992070484581498, "loss": 18.6527, "step": 10 }, { "epoch": 0.0881057268722467, "grad_norm": 14.659343719482422, "learning_rate": 0.0002983259911894273, "loss": 6.941, "step": 20 }, { "epoch": 0.13215859030837004, "grad_norm": 13.076040267944336, "learning_rate": 0.00029744493392070483, "loss": 4.3562, "step": 30 }, { "epoch": 0.1762114537444934, "grad_norm": 21.742950439453125, "learning_rate": 0.00029656387665198236, "loss": 3.5182, "step": 40 }, { "epoch": 0.22026431718061673, "grad_norm": 19.481538772583008, "learning_rate": 0.0002956828193832599, "loss": 2.6543, "step": 50 }, { "epoch": 0.2643171806167401, "grad_norm": 8.22918701171875, "learning_rate": 0.0002948017621145374, "loss": 1.7678, "step": 60 }, { "epoch": 0.30837004405286345, "grad_norm": 5.820700168609619, "learning_rate": 0.00029392070484581494, "loss": 1.2462, "step": 70 }, { "epoch": 0.3524229074889868, "grad_norm": 2.173941135406494, "learning_rate": 0.00029303964757709247, "loss": 1.3665, "step": 80 }, { "epoch": 0.3964757709251101, "grad_norm": 1.8419830799102783, "learning_rate": 0.00029215859030837, "loss": 1.1257, "step": 90 }, { "epoch": 0.44052863436123346, "grad_norm": 1.7700544595718384, "learning_rate": 0.0002912775330396475, "loss": 1.0317, "step": 100 }, { "epoch": 0.4845814977973568, "grad_norm": 1.2313016653060913, "learning_rate": 0.0002903964757709251, "loss": 0.8453, "step": 110 }, { "epoch": 0.5286343612334802, "grad_norm": 1.3880990743637085, "learning_rate": 0.00028951541850220263, "loss": 0.8675, "step": 120 }, { "epoch": 0.5726872246696035, "grad_norm": 2.3586199283599854, "learning_rate": 0.00028863436123348016, "loss": 0.8509, "step": 130 }, { "epoch": 0.6167400881057269, "grad_norm": 1.3902242183685303, "learning_rate": 0.0002877533039647577, "loss": 0.8528, "step": 140 }, { "epoch": 0.6607929515418502, "grad_norm": 1.0430246591567993, "learning_rate": 0.0002868722466960352, "loss": 0.8726, "step": 150 }, { "epoch": 0.7048458149779736, "grad_norm": 1.3441425561904907, "learning_rate": 0.00028599118942731274, "loss": 1.1029, "step": 160 }, { "epoch": 0.748898678414097, "grad_norm": 1.18771231174469, "learning_rate": 0.00028511013215859026, "loss": 0.9229, "step": 170 }, { "epoch": 0.7929515418502202, "grad_norm": 0.8050010800361633, "learning_rate": 0.0002842290748898678, "loss": 0.8363, "step": 180 }, { "epoch": 0.8370044052863436, "grad_norm": 1.1800554990768433, "learning_rate": 0.0002833480176211453, "loss": 0.8128, "step": 190 }, { "epoch": 0.8810572687224669, "grad_norm": 1.3873122930526733, "learning_rate": 0.00028246696035242285, "loss": 0.8275, "step": 200 }, { "epoch": 0.9251101321585903, "grad_norm": 1.1561434268951416, "learning_rate": 0.0002815859030837004, "loss": 0.8961, "step": 210 }, { "epoch": 0.9691629955947136, "grad_norm": 0.6641005277633667, "learning_rate": 0.00028070484581497795, "loss": 0.6599, "step": 220 }, { "epoch": 1.0, "eval_loss": 0.6834425330162048, "eval_runtime": 5.6075, "eval_samples_per_second": 36.023, "eval_steps_per_second": 4.637, "step": 227 }, { "epoch": 1.013215859030837, "grad_norm": 1.0736805200576782, "learning_rate": 0.0002798237885462555, "loss": 0.7283, "step": 230 }, { "epoch": 1.0572687224669604, "grad_norm": 1.069932460784912, "learning_rate": 0.000278942731277533, "loss": 0.825, "step": 240 }, { "epoch": 1.1013215859030836, "grad_norm": 1.3098456859588623, "learning_rate": 0.0002780616740088106, "loss": 0.7152, "step": 250 }, { "epoch": 1.145374449339207, "grad_norm": 1.0547797679901123, "learning_rate": 0.0002771806167400881, "loss": 0.746, "step": 260 }, { "epoch": 1.1894273127753303, "grad_norm": 1.58526611328125, "learning_rate": 0.00027629955947136564, "loss": 0.6655, "step": 270 }, { "epoch": 1.2334801762114538, "grad_norm": 1.4090569019317627, "learning_rate": 0.0002754185022026431, "loss": 0.7397, "step": 280 }, { "epoch": 1.277533039647577, "grad_norm": 1.3417810201644897, "learning_rate": 0.00027453744493392064, "loss": 0.6534, "step": 290 }, { "epoch": 1.3215859030837005, "grad_norm": 0.7320300936698914, "learning_rate": 0.0002736563876651982, "loss": 0.719, "step": 300 }, { "epoch": 1.3656387665198237, "grad_norm": 1.7867811918258667, "learning_rate": 0.00027277533039647575, "loss": 0.8878, "step": 310 }, { "epoch": 1.4096916299559472, "grad_norm": 1.0332417488098145, "learning_rate": 0.0002718942731277533, "loss": 0.7058, "step": 320 }, { "epoch": 1.4537444933920705, "grad_norm": 1.1942096948623657, "learning_rate": 0.0002710132158590308, "loss": 0.6599, "step": 330 }, { "epoch": 1.497797356828194, "grad_norm": 1.0352708101272583, "learning_rate": 0.00027013215859030833, "loss": 0.6487, "step": 340 }, { "epoch": 1.5418502202643172, "grad_norm": 1.2984694242477417, "learning_rate": 0.0002692511013215859, "loss": 0.7225, "step": 350 }, { "epoch": 1.5859030837004404, "grad_norm": 1.1419997215270996, "learning_rate": 0.00026837004405286344, "loss": 0.7233, "step": 360 }, { "epoch": 1.6299559471365639, "grad_norm": 1.5873011350631714, "learning_rate": 0.00026748898678414097, "loss": 0.5429, "step": 370 }, { "epoch": 1.6740088105726874, "grad_norm": 1.058026671409607, "learning_rate": 0.0002666079295154185, "loss": 0.7621, "step": 380 }, { "epoch": 1.7180616740088106, "grad_norm": 1.424886703491211, "learning_rate": 0.000265726872246696, "loss": 0.7103, "step": 390 }, { "epoch": 1.7621145374449338, "grad_norm": 0.9987337589263916, "learning_rate": 0.00026484581497797355, "loss": 0.6882, "step": 400 }, { "epoch": 1.8061674008810573, "grad_norm": 1.0241808891296387, "learning_rate": 0.0002639647577092511, "loss": 0.6754, "step": 410 }, { "epoch": 1.8502202643171806, "grad_norm": 0.7069824934005737, "learning_rate": 0.0002630837004405286, "loss": 0.6426, "step": 420 }, { "epoch": 1.894273127753304, "grad_norm": 1.18909752368927, "learning_rate": 0.00026220264317180613, "loss": 0.7879, "step": 430 }, { "epoch": 1.9383259911894273, "grad_norm": 0.8950007557868958, "learning_rate": 0.00026132158590308366, "loss": 0.7197, "step": 440 }, { "epoch": 1.9823788546255505, "grad_norm": 1.3497512340545654, "learning_rate": 0.00026044052863436124, "loss": 0.6892, "step": 450 }, { "epoch": 2.0, "eval_loss": 0.6154947876930237, "eval_runtime": 5.3228, "eval_samples_per_second": 37.95, "eval_steps_per_second": 4.885, "step": 454 }, { "epoch": 2.026431718061674, "grad_norm": 1.0325676202774048, "learning_rate": 0.00025955947136563877, "loss": 0.7104, "step": 460 }, { "epoch": 2.0704845814977975, "grad_norm": 1.4701021909713745, "learning_rate": 0.0002586784140969163, "loss": 0.6249, "step": 470 }, { "epoch": 2.1145374449339207, "grad_norm": 1.2472504377365112, "learning_rate": 0.0002577973568281938, "loss": 0.7115, "step": 480 }, { "epoch": 2.158590308370044, "grad_norm": 1.01516592502594, "learning_rate": 0.00025691629955947135, "loss": 0.6039, "step": 490 }, { "epoch": 2.202643171806167, "grad_norm": 1.3985668420791626, "learning_rate": 0.0002560352422907489, "loss": 0.5976, "step": 500 }, { "epoch": 2.246696035242291, "grad_norm": 0.6047684550285339, "learning_rate": 0.0002551541850220264, "loss": 0.5158, "step": 510 }, { "epoch": 2.290748898678414, "grad_norm": 0.8428493142127991, "learning_rate": 0.00025427312775330393, "loss": 0.6338, "step": 520 }, { "epoch": 2.3348017621145374, "grad_norm": 1.0199517011642456, "learning_rate": 0.00025339207048458146, "loss": 0.469, "step": 530 }, { "epoch": 2.3788546255506606, "grad_norm": 0.8641414642333984, "learning_rate": 0.000252511013215859, "loss": 0.583, "step": 540 }, { "epoch": 2.4229074889867843, "grad_norm": 0.8442863821983337, "learning_rate": 0.00025162995594713657, "loss": 0.6108, "step": 550 }, { "epoch": 2.4669603524229076, "grad_norm": 0.8864941000938416, "learning_rate": 0.0002507488986784141, "loss": 0.5572, "step": 560 }, { "epoch": 2.511013215859031, "grad_norm": 0.9025411605834961, "learning_rate": 0.0002498678414096916, "loss": 0.6174, "step": 570 }, { "epoch": 2.555066079295154, "grad_norm": 0.8481220602989197, "learning_rate": 0.00024898678414096915, "loss": 0.6118, "step": 580 }, { "epoch": 2.5991189427312777, "grad_norm": 0.9391738772392273, "learning_rate": 0.0002481057268722467, "loss": 0.569, "step": 590 }, { "epoch": 2.643171806167401, "grad_norm": 1.0381453037261963, "learning_rate": 0.0002472246696035242, "loss": 0.4904, "step": 600 }, { "epoch": 2.6872246696035242, "grad_norm": 1.023573398590088, "learning_rate": 0.00024634361233480173, "loss": 0.5969, "step": 610 }, { "epoch": 2.7312775330396475, "grad_norm": 1.2105042934417725, "learning_rate": 0.00024546255506607926, "loss": 0.6589, "step": 620 }, { "epoch": 2.7753303964757707, "grad_norm": 1.1160320043563843, "learning_rate": 0.0002445814977973568, "loss": 0.5242, "step": 630 }, { "epoch": 2.8193832599118944, "grad_norm": 1.1934391260147095, "learning_rate": 0.00024370044052863436, "loss": 0.6548, "step": 640 }, { "epoch": 2.8634361233480177, "grad_norm": 1.1788102388381958, "learning_rate": 0.0002428193832599119, "loss": 0.5828, "step": 650 }, { "epoch": 2.907488986784141, "grad_norm": 1.2889748811721802, "learning_rate": 0.00024193832599118942, "loss": 0.6468, "step": 660 }, { "epoch": 2.951541850220264, "grad_norm": 0.8837119340896606, "learning_rate": 0.00024105726872246695, "loss": 0.6694, "step": 670 }, { "epoch": 2.995594713656388, "grad_norm": 1.0216575860977173, "learning_rate": 0.00024017621145374447, "loss": 0.633, "step": 680 }, { "epoch": 3.0, "eval_loss": 0.592506468296051, "eval_runtime": 5.3852, "eval_samples_per_second": 37.51, "eval_steps_per_second": 4.828, "step": 681 }, { "epoch": 3.039647577092511, "grad_norm": 1.58785080909729, "learning_rate": 0.00023929515418502203, "loss": 0.4512, "step": 690 }, { "epoch": 3.0837004405286343, "grad_norm": 1.0036600828170776, "learning_rate": 0.00023841409691629955, "loss": 0.9613, "step": 700 }, { "epoch": 3.1277533039647576, "grad_norm": 0.9956134557723999, "learning_rate": 0.00023753303964757708, "loss": 0.479, "step": 710 }, { "epoch": 3.171806167400881, "grad_norm": 1.1154946088790894, "learning_rate": 0.0002366519823788546, "loss": 0.5444, "step": 720 }, { "epoch": 3.2158590308370045, "grad_norm": 1.3544610738754272, "learning_rate": 0.00023577092511013214, "loss": 0.5163, "step": 730 }, { "epoch": 3.2599118942731278, "grad_norm": 0.7720727920532227, "learning_rate": 0.0002348898678414097, "loss": 0.5317, "step": 740 }, { "epoch": 3.303964757709251, "grad_norm": 0.9804306030273438, "learning_rate": 0.00023400881057268722, "loss": 0.5179, "step": 750 }, { "epoch": 3.3480176211453743, "grad_norm": 1.0230934619903564, "learning_rate": 0.00023312775330396474, "loss": 0.5261, "step": 760 }, { "epoch": 3.392070484581498, "grad_norm": 0.8620821237564087, "learning_rate": 0.00023224669603524227, "loss": 0.4998, "step": 770 }, { "epoch": 3.436123348017621, "grad_norm": 0.8884461522102356, "learning_rate": 0.0002313656387665198, "loss": 0.5527, "step": 780 }, { "epoch": 3.4801762114537445, "grad_norm": 0.7721192836761475, "learning_rate": 0.00023048458149779735, "loss": 0.5279, "step": 790 }, { "epoch": 3.5242290748898677, "grad_norm": 1.0769802331924438, "learning_rate": 0.00022960352422907488, "loss": 0.5851, "step": 800 }, { "epoch": 3.568281938325991, "grad_norm": 1.3999199867248535, "learning_rate": 0.0002287224669603524, "loss": 0.44, "step": 810 }, { "epoch": 3.6123348017621146, "grad_norm": 0.9963156580924988, "learning_rate": 0.00022784140969162993, "loss": 0.6028, "step": 820 }, { "epoch": 3.656387665198238, "grad_norm": 0.9077759981155396, "learning_rate": 0.00022696035242290746, "loss": 0.6824, "step": 830 }, { "epoch": 3.700440528634361, "grad_norm": 0.9758647680282593, "learning_rate": 0.00022607929515418502, "loss": 0.5424, "step": 840 }, { "epoch": 3.744493392070485, "grad_norm": 0.9838646054267883, "learning_rate": 0.00022519823788546254, "loss": 0.5588, "step": 850 }, { "epoch": 3.788546255506608, "grad_norm": 1.1924773454666138, "learning_rate": 0.00022431718061674007, "loss": 0.6215, "step": 860 }, { "epoch": 3.8325991189427313, "grad_norm": 1.27988600730896, "learning_rate": 0.0002234361233480176, "loss": 0.5336, "step": 870 }, { "epoch": 3.8766519823788546, "grad_norm": 1.0098719596862793, "learning_rate": 0.00022255506607929512, "loss": 0.6623, "step": 880 }, { "epoch": 3.920704845814978, "grad_norm": 1.301437497138977, "learning_rate": 0.00022167400881057268, "loss": 0.4837, "step": 890 }, { "epoch": 3.964757709251101, "grad_norm": 1.3062794208526611, "learning_rate": 0.0002207929515418502, "loss": 0.4399, "step": 900 }, { "epoch": 4.0, "eval_loss": 0.5791140198707581, "eval_runtime": 5.2768, "eval_samples_per_second": 38.281, "eval_steps_per_second": 4.927, "step": 908 }, { "epoch": 4.008810572687224, "grad_norm": 1.2243441343307495, "learning_rate": 0.00021991189427312773, "loss": 0.5225, "step": 910 }, { "epoch": 4.052863436123348, "grad_norm": 1.0874862670898438, "learning_rate": 0.00021903083700440526, "loss": 0.5878, "step": 920 }, { "epoch": 4.096916299559472, "grad_norm": 1.1561787128448486, "learning_rate": 0.0002181497797356828, "loss": 0.4172, "step": 930 }, { "epoch": 4.140969162995595, "grad_norm": 0.9504215121269226, "learning_rate": 0.00021726872246696034, "loss": 0.454, "step": 940 }, { "epoch": 4.185022026431718, "grad_norm": 1.0901755094528198, "learning_rate": 0.00021638766519823787, "loss": 0.5222, "step": 950 }, { "epoch": 4.229074889867841, "grad_norm": 0.7518570423126221, "learning_rate": 0.0002155066079295154, "loss": 0.4048, "step": 960 }, { "epoch": 4.273127753303965, "grad_norm": 0.9933887720108032, "learning_rate": 0.00021462555066079292, "loss": 0.4545, "step": 970 }, { "epoch": 4.317180616740088, "grad_norm": 0.8956694006919861, "learning_rate": 0.00021374449339207048, "loss": 0.5703, "step": 980 }, { "epoch": 4.361233480176211, "grad_norm": 1.0768828392028809, "learning_rate": 0.000212863436123348, "loss": 0.411, "step": 990 }, { "epoch": 4.405286343612334, "grad_norm": 1.3219349384307861, "learning_rate": 0.00021198237885462553, "loss": 0.5096, "step": 1000 }, { "epoch": 4.4493392070484585, "grad_norm": 0.6028145551681519, "learning_rate": 0.00021110132158590306, "loss": 0.5427, "step": 1010 }, { "epoch": 4.493392070484582, "grad_norm": 0.6015641689300537, "learning_rate": 0.00021022026431718059, "loss": 0.4855, "step": 1020 }, { "epoch": 4.537444933920705, "grad_norm": 0.7184689044952393, "learning_rate": 0.00020933920704845814, "loss": 0.4893, "step": 1030 }, { "epoch": 4.581497797356828, "grad_norm": 1.445830225944519, "learning_rate": 0.00020845814977973567, "loss": 0.4412, "step": 1040 }, { "epoch": 4.6255506607929515, "grad_norm": 0.9506711959838867, "learning_rate": 0.0002075770925110132, "loss": 0.5596, "step": 1050 }, { "epoch": 4.669603524229075, "grad_norm": 0.9642265439033508, "learning_rate": 0.00020669603524229072, "loss": 0.3944, "step": 1060 }, { "epoch": 4.713656387665198, "grad_norm": 0.9548330307006836, "learning_rate": 0.00020581497797356825, "loss": 0.4925, "step": 1070 }, { "epoch": 4.757709251101321, "grad_norm": 1.5850030183792114, "learning_rate": 0.0002049339207048458, "loss": 0.5114, "step": 1080 }, { "epoch": 4.8017621145374445, "grad_norm": 0.7429970502853394, "learning_rate": 0.00020405286343612333, "loss": 0.5556, "step": 1090 }, { "epoch": 4.845814977973569, "grad_norm": 0.9865929484367371, "learning_rate": 0.00020317180616740086, "loss": 0.4612, "step": 1100 }, { "epoch": 4.889867841409692, "grad_norm": 0.8113177418708801, "learning_rate": 0.00020229074889867838, "loss": 0.5196, "step": 1110 }, { "epoch": 4.933920704845815, "grad_norm": 1.1767125129699707, "learning_rate": 0.0002014096916299559, "loss": 0.5321, "step": 1120 }, { "epoch": 4.977973568281938, "grad_norm": 0.8367587327957153, "learning_rate": 0.00020052863436123347, "loss": 0.506, "step": 1130 }, { "epoch": 5.0, "eval_loss": 0.5730367302894592, "eval_runtime": 6.1337, "eval_samples_per_second": 32.933, "eval_steps_per_second": 4.239, "step": 1135 }, { "epoch": 5.022026431718062, "grad_norm": 1.1242823600769043, "learning_rate": 0.000199647577092511, "loss": 0.4678, "step": 1140 }, { "epoch": 5.066079295154185, "grad_norm": 1.0385881662368774, "learning_rate": 0.00019876651982378852, "loss": 0.3968, "step": 1150 }, { "epoch": 5.110132158590308, "grad_norm": 0.9282165765762329, "learning_rate": 0.00019788546255506605, "loss": 0.5089, "step": 1160 }, { "epoch": 5.154185022026431, "grad_norm": 1.401548147201538, "learning_rate": 0.00019700440528634357, "loss": 0.4457, "step": 1170 }, { "epoch": 5.1982378854625555, "grad_norm": 0.6676862835884094, "learning_rate": 0.00019612334801762113, "loss": 0.3175, "step": 1180 }, { "epoch": 5.242290748898679, "grad_norm": 1.1318411827087402, "learning_rate": 0.00019524229074889866, "loss": 0.4468, "step": 1190 }, { "epoch": 5.286343612334802, "grad_norm": 0.706200361251831, "learning_rate": 0.00019436123348017618, "loss": 0.3954, "step": 1200 }, { "epoch": 5.330396475770925, "grad_norm": 0.6558952927589417, "learning_rate": 0.0001934801762114537, "loss": 0.4318, "step": 1210 }, { "epoch": 5.3744493392070485, "grad_norm": 0.59174644947052, "learning_rate": 0.00019259911894273124, "loss": 0.3962, "step": 1220 }, { "epoch": 5.418502202643172, "grad_norm": 0.9306423664093018, "learning_rate": 0.0001917180616740088, "loss": 0.4161, "step": 1230 }, { "epoch": 5.462555066079295, "grad_norm": 1.2412904500961304, "learning_rate": 0.00019083700440528632, "loss": 0.4259, "step": 1240 }, { "epoch": 5.506607929515418, "grad_norm": 0.8949795961380005, "learning_rate": 0.00018995594713656385, "loss": 0.5512, "step": 1250 }, { "epoch": 5.5506607929515415, "grad_norm": 0.9977787733078003, "learning_rate": 0.00018907488986784137, "loss": 0.4497, "step": 1260 }, { "epoch": 5.594713656387665, "grad_norm": 1.0676085948944092, "learning_rate": 0.0001881938325991189, "loss": 0.4344, "step": 1270 }, { "epoch": 5.638766519823789, "grad_norm": 0.6446275115013123, "learning_rate": 0.00018731277533039648, "loss": 0.4184, "step": 1280 }, { "epoch": 5.682819383259912, "grad_norm": 1.3255438804626465, "learning_rate": 0.000186431718061674, "loss": 0.5441, "step": 1290 }, { "epoch": 5.726872246696035, "grad_norm": 0.823581337928772, "learning_rate": 0.0001855506607929515, "loss": 0.5028, "step": 1300 }, { "epoch": 5.770925110132159, "grad_norm": 1.0471981763839722, "learning_rate": 0.00018466960352422904, "loss": 0.4407, "step": 1310 }, { "epoch": 5.814977973568282, "grad_norm": 1.0394315719604492, "learning_rate": 0.00018378854625550662, "loss": 0.4788, "step": 1320 }, { "epoch": 5.859030837004405, "grad_norm": 1.4738258123397827, "learning_rate": 0.00018290748898678414, "loss": 0.5495, "step": 1330 }, { "epoch": 5.903083700440528, "grad_norm": 1.2812182903289795, "learning_rate": 0.00018202643171806167, "loss": 0.4335, "step": 1340 }, { "epoch": 5.9471365638766525, "grad_norm": 1.4929533004760742, "learning_rate": 0.0001811453744493392, "loss": 0.5097, "step": 1350 }, { "epoch": 5.991189427312776, "grad_norm": 1.2788587808609009, "learning_rate": 0.00018026431718061673, "loss": 0.4702, "step": 1360 }, { "epoch": 6.0, "eval_loss": 0.5740869045257568, "eval_runtime": 4.9653, "eval_samples_per_second": 40.682, "eval_steps_per_second": 5.236, "step": 1362 }, { "epoch": 6.035242290748899, "grad_norm": 0.9543855786323547, "learning_rate": 0.00017938325991189428, "loss": 0.4232, "step": 1370 }, { "epoch": 6.079295154185022, "grad_norm": 1.0528812408447266, "learning_rate": 0.0001785022026431718, "loss": 0.4025, "step": 1380 }, { "epoch": 6.1233480176211454, "grad_norm": 0.9573265910148621, "learning_rate": 0.00017762114537444933, "loss": 0.4127, "step": 1390 }, { "epoch": 6.167400881057269, "grad_norm": 1.7806532382965088, "learning_rate": 0.00017674008810572686, "loss": 0.4646, "step": 1400 }, { "epoch": 6.211453744493392, "grad_norm": 1.0559179782867432, "learning_rate": 0.0001758590308370044, "loss": 0.3222, "step": 1410 }, { "epoch": 6.255506607929515, "grad_norm": 0.9502829313278198, "learning_rate": 0.00017497797356828194, "loss": 0.4697, "step": 1420 }, { "epoch": 6.299559471365638, "grad_norm": 0.6869007349014282, "learning_rate": 0.00017409691629955947, "loss": 0.4155, "step": 1430 }, { "epoch": 6.343612334801762, "grad_norm": 0.6793345808982849, "learning_rate": 0.000173215859030837, "loss": 0.4236, "step": 1440 }, { "epoch": 6.387665198237886, "grad_norm": 1.067975640296936, "learning_rate": 0.00017233480176211452, "loss": 0.3558, "step": 1450 }, { "epoch": 6.431718061674009, "grad_norm": 1.0968421697616577, "learning_rate": 0.00017145374449339205, "loss": 0.4453, "step": 1460 }, { "epoch": 6.475770925110132, "grad_norm": 1.1832313537597656, "learning_rate": 0.0001705726872246696, "loss": 0.5115, "step": 1470 }, { "epoch": 6.5198237885462555, "grad_norm": 0.9857836365699768, "learning_rate": 0.00016969162995594713, "loss": 0.4274, "step": 1480 }, { "epoch": 6.563876651982379, "grad_norm": 0.9006336331367493, "learning_rate": 0.00016881057268722466, "loss": 0.3865, "step": 1490 }, { "epoch": 6.607929515418502, "grad_norm": 1.1091986894607544, "learning_rate": 0.0001679295154185022, "loss": 0.3988, "step": 1500 }, { "epoch": 6.651982378854625, "grad_norm": 1.423886775970459, "learning_rate": 0.00016704845814977971, "loss": 0.5057, "step": 1510 }, { "epoch": 6.6960352422907485, "grad_norm": 0.9245197176933289, "learning_rate": 0.00016616740088105727, "loss": 0.3966, "step": 1520 }, { "epoch": 6.740088105726873, "grad_norm": 0.944870114326477, "learning_rate": 0.0001652863436123348, "loss": 0.4521, "step": 1530 }, { "epoch": 6.784140969162996, "grad_norm": 0.8870773315429688, "learning_rate": 0.00016440528634361232, "loss": 0.4425, "step": 1540 }, { "epoch": 6.828193832599119, "grad_norm": 0.7404115200042725, "learning_rate": 0.00016352422907488985, "loss": 0.3207, "step": 1550 }, { "epoch": 6.872246696035242, "grad_norm": 0.9958137273788452, "learning_rate": 0.00016264317180616738, "loss": 0.4244, "step": 1560 }, { "epoch": 6.916299559471366, "grad_norm": 1.0651079416275024, "learning_rate": 0.00016176211453744493, "loss": 0.4075, "step": 1570 }, { "epoch": 6.960352422907489, "grad_norm": 0.9528789520263672, "learning_rate": 0.00016088105726872246, "loss": 0.493, "step": 1580 }, { "epoch": 7.0, "eval_loss": 0.5702072381973267, "eval_runtime": 5.1787, "eval_samples_per_second": 39.006, "eval_steps_per_second": 5.021, "step": 1589 }, { "epoch": 7.004405286343612, "grad_norm": 1.0486853122711182, "learning_rate": 0.00015999999999999999, "loss": 0.4131, "step": 1590 }, { "epoch": 7.048458149779735, "grad_norm": 1.2176262140274048, "learning_rate": 0.0001591189427312775, "loss": 0.4417, "step": 1600 }, { "epoch": 7.092511013215859, "grad_norm": 1.187107801437378, "learning_rate": 0.00015823788546255504, "loss": 0.4372, "step": 1610 }, { "epoch": 7.136563876651983, "grad_norm": 0.9459372758865356, "learning_rate": 0.0001573568281938326, "loss": 0.356, "step": 1620 }, { "epoch": 7.180616740088106, "grad_norm": 0.8114103078842163, "learning_rate": 0.00015647577092511012, "loss": 0.308, "step": 1630 }, { "epoch": 7.224669603524229, "grad_norm": 1.035370945930481, "learning_rate": 0.00015559471365638765, "loss": 0.3738, "step": 1640 }, { "epoch": 7.2687224669603525, "grad_norm": 1.0260848999023438, "learning_rate": 0.00015471365638766518, "loss": 0.342, "step": 1650 }, { "epoch": 7.312775330396476, "grad_norm": 0.8079932928085327, "learning_rate": 0.00015383259911894273, "loss": 0.4381, "step": 1660 }, { "epoch": 7.356828193832599, "grad_norm": 1.318695068359375, "learning_rate": 0.00015295154185022026, "loss": 0.3685, "step": 1670 }, { "epoch": 7.400881057268722, "grad_norm": 1.3181859254837036, "learning_rate": 0.00015207048458149778, "loss": 0.3465, "step": 1680 }, { "epoch": 7.4449339207048455, "grad_norm": 1.0277948379516602, "learning_rate": 0.0001511894273127753, "loss": 0.3659, "step": 1690 }, { "epoch": 7.48898678414097, "grad_norm": 1.1619762182235718, "learning_rate": 0.00015030837004405284, "loss": 0.4304, "step": 1700 }, { "epoch": 7.533039647577093, "grad_norm": 1.2854048013687134, "learning_rate": 0.0001494273127753304, "loss": 0.4372, "step": 1710 }, { "epoch": 7.577092511013216, "grad_norm": 1.032459020614624, "learning_rate": 0.00014854625550660792, "loss": 0.3687, "step": 1720 }, { "epoch": 7.621145374449339, "grad_norm": 0.9430228471755981, "learning_rate": 0.00014766519823788545, "loss": 0.3967, "step": 1730 }, { "epoch": 7.665198237885463, "grad_norm": 1.2012503147125244, "learning_rate": 0.00014678414096916297, "loss": 0.4028, "step": 1740 }, { "epoch": 7.709251101321586, "grad_norm": 0.9703013896942139, "learning_rate": 0.00014590308370044053, "loss": 0.4037, "step": 1750 }, { "epoch": 7.753303964757709, "grad_norm": 1.2811229228973389, "learning_rate": 0.00014502202643171806, "loss": 0.3725, "step": 1760 }, { "epoch": 7.797356828193832, "grad_norm": 0.9879553914070129, "learning_rate": 0.00014414096916299558, "loss": 0.4385, "step": 1770 }, { "epoch": 7.841409691629956, "grad_norm": 1.4015151262283325, "learning_rate": 0.0001432599118942731, "loss": 0.4046, "step": 1780 }, { "epoch": 7.885462555066079, "grad_norm": 0.9369928240776062, "learning_rate": 0.00014237885462555064, "loss": 0.4232, "step": 1790 }, { "epoch": 7.929515418502203, "grad_norm": 0.7787442803382874, "learning_rate": 0.0001414977973568282, "loss": 0.3679, "step": 1800 }, { "epoch": 7.973568281938326, "grad_norm": 0.7212619781494141, "learning_rate": 0.00014061674008810572, "loss": 0.4299, "step": 1810 }, { "epoch": 8.0, "eval_loss": 0.57987380027771, "eval_runtime": 4.8534, "eval_samples_per_second": 41.62, "eval_steps_per_second": 5.357, "step": 1816 }, { "epoch": 8.017621145374449, "grad_norm": 1.1815301179885864, "learning_rate": 0.00013973568281938325, "loss": 0.4058, "step": 1820 }, { "epoch": 8.061674008810572, "grad_norm": 0.7913572192192078, "learning_rate": 0.00013885462555066077, "loss": 0.2876, "step": 1830 }, { "epoch": 8.105726872246697, "grad_norm": 0.9591747522354126, "learning_rate": 0.0001379735682819383, "loss": 0.2801, "step": 1840 }, { "epoch": 8.14977973568282, "grad_norm": 1.2883862257003784, "learning_rate": 0.00013709251101321585, "loss": 0.3435, "step": 1850 }, { "epoch": 8.193832599118943, "grad_norm": 1.2138097286224365, "learning_rate": 0.00013621145374449338, "loss": 0.4603, "step": 1860 }, { "epoch": 8.237885462555067, "grad_norm": 0.9017927050590515, "learning_rate": 0.0001353303964757709, "loss": 0.328, "step": 1870 }, { "epoch": 8.28193832599119, "grad_norm": 1.0213032960891724, "learning_rate": 0.00013444933920704844, "loss": 0.4241, "step": 1880 }, { "epoch": 8.325991189427313, "grad_norm": 0.782507598400116, "learning_rate": 0.00013356828193832596, "loss": 0.287, "step": 1890 }, { "epoch": 8.370044052863436, "grad_norm": 0.8239027261734009, "learning_rate": 0.00013268722466960352, "loss": 0.3471, "step": 1900 }, { "epoch": 8.41409691629956, "grad_norm": 0.9952473044395447, "learning_rate": 0.00013180616740088104, "loss": 0.325, "step": 1910 }, { "epoch": 8.458149779735683, "grad_norm": 0.7988440990447998, "learning_rate": 0.00013092511013215857, "loss": 0.3397, "step": 1920 }, { "epoch": 8.502202643171806, "grad_norm": 1.2881464958190918, "learning_rate": 0.0001300440528634361, "loss": 0.4655, "step": 1930 }, { "epoch": 8.54625550660793, "grad_norm": 0.9545268416404724, "learning_rate": 0.00012916299559471365, "loss": 0.4031, "step": 1940 }, { "epoch": 8.590308370044053, "grad_norm": 1.550424337387085, "learning_rate": 0.00012828193832599118, "loss": 0.3697, "step": 1950 }, { "epoch": 8.634361233480176, "grad_norm": 1.2041224241256714, "learning_rate": 0.0001274008810572687, "loss": 0.43, "step": 1960 }, { "epoch": 8.678414096916299, "grad_norm": 0.8280724287033081, "learning_rate": 0.00012651982378854626, "loss": 0.4045, "step": 1970 }, { "epoch": 8.722466960352422, "grad_norm": 0.8164283037185669, "learning_rate": 0.00012563876651982376, "loss": 0.4001, "step": 1980 }, { "epoch": 8.766519823788546, "grad_norm": 0.9470929503440857, "learning_rate": 0.00012475770925110132, "loss": 0.3767, "step": 1990 }, { "epoch": 8.810572687224669, "grad_norm": 0.7390472888946533, "learning_rate": 0.00012387665198237884, "loss": 0.4206, "step": 2000 }, { "epoch": 8.854625550660792, "grad_norm": 0.8382723927497864, "learning_rate": 0.00012299559471365637, "loss": 0.3061, "step": 2010 }, { "epoch": 8.898678414096917, "grad_norm": 1.060539722442627, "learning_rate": 0.00012211453744493392, "loss": 0.4921, "step": 2020 }, { "epoch": 8.94273127753304, "grad_norm": 0.6955994367599487, "learning_rate": 0.00012123348017621144, "loss": 0.4077, "step": 2030 }, { "epoch": 8.986784140969164, "grad_norm": 0.8158656358718872, "learning_rate": 0.00012035242290748898, "loss": 0.3759, "step": 2040 }, { "epoch": 9.0, "eval_loss": 0.582844614982605, "eval_runtime": 4.8405, "eval_samples_per_second": 41.732, "eval_steps_per_second": 5.371, "step": 2043 }, { "epoch": 9.030837004405287, "grad_norm": 0.9192315936088562, "learning_rate": 0.0001194713656387665, "loss": 0.3809, "step": 2050 }, { "epoch": 9.07488986784141, "grad_norm": 1.0536017417907715, "learning_rate": 0.00011859030837004403, "loss": 0.3321, "step": 2060 }, { "epoch": 9.118942731277533, "grad_norm": 1.1080108880996704, "learning_rate": 0.00011770925110132157, "loss": 0.407, "step": 2070 }, { "epoch": 9.162995594713657, "grad_norm": 0.9956775903701782, "learning_rate": 0.0001168281938325991, "loss": 0.3423, "step": 2080 }, { "epoch": 9.20704845814978, "grad_norm": 0.746013343334198, "learning_rate": 0.00011594713656387664, "loss": 0.3794, "step": 2090 }, { "epoch": 9.251101321585903, "grad_norm": 1.126372218132019, "learning_rate": 0.00011506607929515417, "loss": 0.4121, "step": 2100 }, { "epoch": 9.295154185022026, "grad_norm": 1.4978642463684082, "learning_rate": 0.00011418502202643172, "loss": 0.3358, "step": 2110 }, { "epoch": 9.33920704845815, "grad_norm": 0.7826859951019287, "learning_rate": 0.00011330396475770924, "loss": 0.2931, "step": 2120 }, { "epoch": 9.383259911894273, "grad_norm": 1.1644082069396973, "learning_rate": 0.00011242290748898676, "loss": 0.377, "step": 2130 }, { "epoch": 9.427312775330396, "grad_norm": 0.8106231093406677, "learning_rate": 0.00011154185022026432, "loss": 0.3562, "step": 2140 }, { "epoch": 9.47136563876652, "grad_norm": 1.162919282913208, "learning_rate": 0.00011066079295154183, "loss": 0.3441, "step": 2150 }, { "epoch": 9.515418502202643, "grad_norm": 0.7184136509895325, "learning_rate": 0.00010977973568281939, "loss": 0.3254, "step": 2160 }, { "epoch": 9.559471365638766, "grad_norm": 0.9587578177452087, "learning_rate": 0.00010889867841409691, "loss": 0.3533, "step": 2170 }, { "epoch": 9.603524229074889, "grad_norm": 0.8703950643539429, "learning_rate": 0.00010801762114537444, "loss": 0.3366, "step": 2180 }, { "epoch": 9.647577092511014, "grad_norm": 0.7304671406745911, "learning_rate": 0.00010713656387665198, "loss": 0.3608, "step": 2190 }, { "epoch": 9.691629955947137, "grad_norm": 1.1611542701721191, "learning_rate": 0.00010625550660792951, "loss": 0.3353, "step": 2200 }, { "epoch": 9.73568281938326, "grad_norm": 0.7281723022460938, "learning_rate": 0.00010537444933920705, "loss": 0.3082, "step": 2210 }, { "epoch": 9.779735682819384, "grad_norm": 1.1435456275939941, "learning_rate": 0.00010449339207048458, "loss": 0.4317, "step": 2220 }, { "epoch": 9.823788546255507, "grad_norm": 0.9928381443023682, "learning_rate": 0.0001036123348017621, "loss": 0.3564, "step": 2230 }, { "epoch": 9.86784140969163, "grad_norm": 0.8395977020263672, "learning_rate": 0.00010273127753303964, "loss": 0.3531, "step": 2240 }, { "epoch": 9.911894273127754, "grad_norm": 1.0142395496368408, "learning_rate": 0.00010185022026431717, "loss": 0.3896, "step": 2250 }, { "epoch": 9.955947136563877, "grad_norm": 0.6916971802711487, "learning_rate": 0.00010096916299559471, "loss": 0.3667, "step": 2260 }, { "epoch": 10.0, "grad_norm": 0.7665943503379822, "learning_rate": 0.00010008810572687224, "loss": 0.3075, "step": 2270 }, { "epoch": 10.0, "eval_loss": 0.579430878162384, "eval_runtime": 4.7891, "eval_samples_per_second": 42.179, "eval_steps_per_second": 5.429, "step": 2270 }, { "epoch": 10.044052863436123, "grad_norm": 1.4675018787384033, "learning_rate": 9.920704845814978e-05, "loss": 0.3251, "step": 2280 }, { "epoch": 10.088105726872246, "grad_norm": 0.6954736709594727, "learning_rate": 9.83259911894273e-05, "loss": 0.3656, "step": 2290 }, { "epoch": 10.13215859030837, "grad_norm": 1.4188182353973389, "learning_rate": 9.744493392070483e-05, "loss": 0.335, "step": 2300 }, { "epoch": 10.176211453744493, "grad_norm": 0.9333553910255432, "learning_rate": 9.656387665198237e-05, "loss": 0.2888, "step": 2310 }, { "epoch": 10.220264317180616, "grad_norm": 0.886482834815979, "learning_rate": 9.56828193832599e-05, "loss": 0.3122, "step": 2320 }, { "epoch": 10.26431718061674, "grad_norm": 0.6795399188995361, "learning_rate": 9.480176211453744e-05, "loss": 0.3765, "step": 2330 }, { "epoch": 10.308370044052863, "grad_norm": 1.3046603202819824, "learning_rate": 9.392070484581497e-05, "loss": 0.3316, "step": 2340 }, { "epoch": 10.352422907488986, "grad_norm": 1.0006519556045532, "learning_rate": 9.30396475770925e-05, "loss": 0.3659, "step": 2350 }, { "epoch": 10.396475770925111, "grad_norm": 1.1640467643737793, "learning_rate": 9.215859030837004e-05, "loss": 0.346, "step": 2360 }, { "epoch": 10.440528634361234, "grad_norm": 0.9744365811347961, "learning_rate": 9.127753303964756e-05, "loss": 0.3317, "step": 2370 }, { "epoch": 10.484581497797357, "grad_norm": 1.039802074432373, "learning_rate": 9.03964757709251e-05, "loss": 0.3162, "step": 2380 }, { "epoch": 10.52863436123348, "grad_norm": 0.9926576614379883, "learning_rate": 8.951541850220263e-05, "loss": 0.3559, "step": 2390 }, { "epoch": 10.572687224669604, "grad_norm": 1.0141366720199585, "learning_rate": 8.863436123348016e-05, "loss": 0.3196, "step": 2400 }, { "epoch": 10.616740088105727, "grad_norm": 0.5856879353523254, "learning_rate": 8.77533039647577e-05, "loss": 0.2919, "step": 2410 }, { "epoch": 10.66079295154185, "grad_norm": 0.9484356045722961, "learning_rate": 8.687224669603523e-05, "loss": 0.339, "step": 2420 }, { "epoch": 10.704845814977974, "grad_norm": 0.9014990925788879, "learning_rate": 8.599118942731277e-05, "loss": 0.3089, "step": 2430 }, { "epoch": 10.748898678414097, "grad_norm": 0.9830072522163391, "learning_rate": 8.51101321585903e-05, "loss": 0.3461, "step": 2440 }, { "epoch": 10.79295154185022, "grad_norm": 1.051647424697876, "learning_rate": 8.422907488986782e-05, "loss": 0.292, "step": 2450 }, { "epoch": 10.837004405286343, "grad_norm": 1.0580625534057617, "learning_rate": 8.334801762114536e-05, "loss": 0.4052, "step": 2460 }, { "epoch": 10.881057268722467, "grad_norm": 1.01996648311615, "learning_rate": 8.246696035242289e-05, "loss": 0.3927, "step": 2470 }, { "epoch": 10.92511013215859, "grad_norm": 0.6538860201835632, "learning_rate": 8.158590308370044e-05, "loss": 0.3451, "step": 2480 }, { "epoch": 10.969162995594713, "grad_norm": 0.9368380308151245, "learning_rate": 8.070484581497796e-05, "loss": 0.3932, "step": 2490 }, { "epoch": 11.0, "eval_loss": 0.5825287103652954, "eval_runtime": 4.811, "eval_samples_per_second": 41.987, "eval_steps_per_second": 5.404, "step": 2497 }, { "epoch": 11.013215859030836, "grad_norm": 0.9590967893600464, "learning_rate": 7.982378854625551e-05, "loss": 0.32, "step": 2500 }, { "epoch": 11.05726872246696, "grad_norm": 0.9905742406845093, "learning_rate": 7.894273127753304e-05, "loss": 0.3029, "step": 2510 }, { "epoch": 11.101321585903083, "grad_norm": 1.2009577751159668, "learning_rate": 7.806167400881057e-05, "loss": 0.3626, "step": 2520 }, { "epoch": 11.145374449339206, "grad_norm": 1.0607908964157104, "learning_rate": 7.718061674008811e-05, "loss": 0.314, "step": 2530 }, { "epoch": 11.189427312775331, "grad_norm": 1.1098504066467285, "learning_rate": 7.629955947136563e-05, "loss": 0.3062, "step": 2540 }, { "epoch": 11.233480176211454, "grad_norm": 0.6961995959281921, "learning_rate": 7.541850220264317e-05, "loss": 0.3499, "step": 2550 }, { "epoch": 11.277533039647578, "grad_norm": 1.0727498531341553, "learning_rate": 7.45374449339207e-05, "loss": 0.2559, "step": 2560 }, { "epoch": 11.321585903083701, "grad_norm": 1.064344048500061, "learning_rate": 7.365638766519823e-05, "loss": 0.3011, "step": 2570 }, { "epoch": 11.365638766519824, "grad_norm": 1.1059036254882812, "learning_rate": 7.277533039647577e-05, "loss": 0.3415, "step": 2580 }, { "epoch": 11.409691629955947, "grad_norm": 0.8815020322799683, "learning_rate": 7.18942731277533e-05, "loss": 0.3164, "step": 2590 }, { "epoch": 11.45374449339207, "grad_norm": 0.9667496085166931, "learning_rate": 7.101321585903082e-05, "loss": 0.3642, "step": 2600 }, { "epoch": 11.497797356828194, "grad_norm": 0.942876935005188, "learning_rate": 7.013215859030836e-05, "loss": 0.3624, "step": 2610 }, { "epoch": 11.541850220264317, "grad_norm": 1.022675633430481, "learning_rate": 6.925110132158589e-05, "loss": 0.3351, "step": 2620 }, { "epoch": 11.58590308370044, "grad_norm": 0.9919267892837524, "learning_rate": 6.837004405286343e-05, "loss": 0.3335, "step": 2630 }, { "epoch": 11.629955947136564, "grad_norm": 0.9724282026290894, "learning_rate": 6.748898678414096e-05, "loss": 0.3154, "step": 2640 }, { "epoch": 11.674008810572687, "grad_norm": 1.3246617317199707, "learning_rate": 6.660792951541849e-05, "loss": 0.4366, "step": 2650 }, { "epoch": 11.71806167400881, "grad_norm": 1.0111949443817139, "learning_rate": 6.572687224669603e-05, "loss": 0.3324, "step": 2660 }, { "epoch": 11.762114537444933, "grad_norm": 0.8399791717529297, "learning_rate": 6.484581497797357e-05, "loss": 0.2669, "step": 2670 }, { "epoch": 11.806167400881057, "grad_norm": 0.917736828327179, "learning_rate": 6.39647577092511e-05, "loss": 0.324, "step": 2680 }, { "epoch": 11.85022026431718, "grad_norm": 0.9939138293266296, "learning_rate": 6.308370044052864e-05, "loss": 0.2888, "step": 2690 }, { "epoch": 11.894273127753303, "grad_norm": 0.9510142803192139, "learning_rate": 6.220264317180616e-05, "loss": 0.3428, "step": 2700 }, { "epoch": 11.938325991189426, "grad_norm": 1.3216148614883423, "learning_rate": 6.132158590308369e-05, "loss": 0.3254, "step": 2710 }, { "epoch": 11.982378854625551, "grad_norm": 1.2755056619644165, "learning_rate": 6.0440528634361224e-05, "loss": 0.3188, "step": 2720 }, { "epoch": 12.0, "eval_loss": 0.5862967371940613, "eval_runtime": 5.4591, "eval_samples_per_second": 37.002, "eval_steps_per_second": 4.763, "step": 2724 }, { "epoch": 12.026431718061675, "grad_norm": 0.8617092967033386, "learning_rate": 5.955947136563876e-05, "loss": 0.2953, "step": 2730 }, { "epoch": 12.070484581497798, "grad_norm": 0.7434670329093933, "learning_rate": 5.86784140969163e-05, "loss": 0.312, "step": 2740 }, { "epoch": 12.114537444933921, "grad_norm": 0.9274753332138062, "learning_rate": 5.779735682819383e-05, "loss": 0.2664, "step": 2750 }, { "epoch": 12.158590308370044, "grad_norm": 1.058923363685608, "learning_rate": 5.691629955947135e-05, "loss": 0.3236, "step": 2760 }, { "epoch": 12.202643171806168, "grad_norm": 0.7601414918899536, "learning_rate": 5.6035242290748894e-05, "loss": 0.3076, "step": 2770 }, { "epoch": 12.246696035242291, "grad_norm": 0.7787047624588013, "learning_rate": 5.515418502202643e-05, "loss": 0.2618, "step": 2780 }, { "epoch": 12.290748898678414, "grad_norm": 0.9064326882362366, "learning_rate": 5.427312775330396e-05, "loss": 0.3305, "step": 2790 }, { "epoch": 12.334801762114537, "grad_norm": 1.0712478160858154, "learning_rate": 5.3392070484581496e-05, "loss": 0.341, "step": 2800 }, { "epoch": 12.37885462555066, "grad_norm": 0.6585920453071594, "learning_rate": 5.251101321585903e-05, "loss": 0.3476, "step": 2810 }, { "epoch": 12.422907488986784, "grad_norm": 1.1152169704437256, "learning_rate": 5.162995594713656e-05, "loss": 0.3418, "step": 2820 }, { "epoch": 12.466960352422907, "grad_norm": 0.926008403301239, "learning_rate": 5.074889867841409e-05, "loss": 0.2543, "step": 2830 }, { "epoch": 12.51101321585903, "grad_norm": 1.1506083011627197, "learning_rate": 4.9867841409691625e-05, "loss": 0.2895, "step": 2840 }, { "epoch": 12.555066079295154, "grad_norm": 0.8726121783256531, "learning_rate": 4.898678414096916e-05, "loss": 0.2917, "step": 2850 }, { "epoch": 12.599118942731277, "grad_norm": 1.1620839834213257, "learning_rate": 4.810572687224669e-05, "loss": 0.3585, "step": 2860 }, { "epoch": 12.6431718061674, "grad_norm": 1.1911215782165527, "learning_rate": 4.7224669603524226e-05, "loss": 0.3177, "step": 2870 }, { "epoch": 12.687224669603523, "grad_norm": 0.9236161708831787, "learning_rate": 4.6343612334801754e-05, "loss": 0.3203, "step": 2880 }, { "epoch": 12.731277533039648, "grad_norm": 1.0384935140609741, "learning_rate": 4.546255506607929e-05, "loss": 0.3264, "step": 2890 }, { "epoch": 12.775330396475772, "grad_norm": 1.3048256635665894, "learning_rate": 4.458149779735682e-05, "loss": 0.3544, "step": 2900 }, { "epoch": 12.819383259911895, "grad_norm": 1.127678394317627, "learning_rate": 4.370044052863436e-05, "loss": 0.3768, "step": 2910 }, { "epoch": 12.863436123348018, "grad_norm": 0.9425409436225891, "learning_rate": 4.2819383259911896e-05, "loss": 0.2778, "step": 2920 }, { "epoch": 12.907488986784141, "grad_norm": 1.2469598054885864, "learning_rate": 4.1938325991189416e-05, "loss": 0.3532, "step": 2930 }, { "epoch": 12.951541850220265, "grad_norm": 0.7975876927375793, "learning_rate": 4.105726872246696e-05, "loss": 0.3189, "step": 2940 }, { "epoch": 12.995594713656388, "grad_norm": 0.8869457840919495, "learning_rate": 4.017621145374449e-05, "loss": 0.3282, "step": 2950 }, { "epoch": 13.0, "eval_loss": 0.5989018678665161, "eval_runtime": 4.9822, "eval_samples_per_second": 40.544, "eval_steps_per_second": 5.219, "step": 2951 }, { "epoch": 13.039647577092511, "grad_norm": 1.1934689283370972, "learning_rate": 3.9295154185022025e-05, "loss": 0.2919, "step": 2960 }, { "epoch": 13.083700440528634, "grad_norm": 1.1812618970870972, "learning_rate": 3.841409691629956e-05, "loss": 0.3219, "step": 2970 }, { "epoch": 13.127753303964758, "grad_norm": 1.2065187692642212, "learning_rate": 3.753303964757709e-05, "loss": 0.29, "step": 2980 }, { "epoch": 13.17180616740088, "grad_norm": 0.8890476822853088, "learning_rate": 3.665198237885462e-05, "loss": 0.2462, "step": 2990 }, { "epoch": 13.215859030837004, "grad_norm": 1.2433491945266724, "learning_rate": 3.5770925110132154e-05, "loss": 0.2614, "step": 3000 } ], "logging_steps": 10, "max_steps": 3405, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4451705806651392.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }