{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022727272727272728, "grad_norm": 4.307063447378547, "learning_rate": 7.575757575757576e-07, "loss": 0.7126, "step": 10 }, { "epoch": 0.045454545454545456, "grad_norm": 2.478183802674108, "learning_rate": 1.5151515151515152e-06, "loss": 0.6907, "step": 20 }, { "epoch": 0.06818181818181818, "grad_norm": 2.093724232621029, "learning_rate": 2.2727272727272728e-06, "loss": 0.6475, "step": 30 }, { "epoch": 0.09090909090909091, "grad_norm": 1.988326655453792, "learning_rate": 3.0303030303030305e-06, "loss": 0.5679, "step": 40 }, { "epoch": 0.11363636363636363, "grad_norm": 1.5026333071399824, "learning_rate": 3.7878787878787882e-06, "loss": 0.5262, "step": 50 }, { "epoch": 0.13636363636363635, "grad_norm": 1.1747754528130179, "learning_rate": 4.5454545454545455e-06, "loss": 0.4857, "step": 60 }, { "epoch": 0.1590909090909091, "grad_norm": 1.3829995461023388, "learning_rate": 5.303030303030303e-06, "loss": 0.5007, "step": 70 }, { "epoch": 0.18181818181818182, "grad_norm": 1.1555847024592305, "learning_rate": 6.060606060606061e-06, "loss": 0.4704, "step": 80 }, { "epoch": 0.20454545454545456, "grad_norm": 1.530346299350765, "learning_rate": 6.818181818181818e-06, "loss": 0.4815, "step": 90 }, { "epoch": 0.22727272727272727, "grad_norm": 1.3258734821104459, "learning_rate": 7.5757575757575764e-06, "loss": 0.4453, "step": 100 }, { "epoch": 0.25, "grad_norm": 1.0866300791021735, "learning_rate": 8.333333333333334e-06, "loss": 0.479, "step": 110 }, { "epoch": 0.2727272727272727, "grad_norm": 1.1145634533248179, "learning_rate": 9.090909090909091e-06, "loss": 0.4551, "step": 120 }, { "epoch": 0.29545454545454547, "grad_norm": 1.2164089669910587, "learning_rate": 9.84848484848485e-06, "loss": 0.4506, "step": 130 }, { "epoch": 0.3181818181818182, "grad_norm": 1.4715102015058124, "learning_rate": 9.99888115313551e-06, "loss": 0.4689, "step": 140 }, { "epoch": 0.3409090909090909, "grad_norm": 1.25166857918504, "learning_rate": 9.994336695915041e-06, "loss": 0.4648, "step": 150 }, { "epoch": 0.36363636363636365, "grad_norm": 1.1482193515136296, "learning_rate": 9.986299875742612e-06, "loss": 0.4508, "step": 160 }, { "epoch": 0.38636363636363635, "grad_norm": 1.4115865068533915, "learning_rate": 9.97477631248223e-06, "loss": 0.468, "step": 170 }, { "epoch": 0.4090909090909091, "grad_norm": 1.2290742002804682, "learning_rate": 9.959774064153977e-06, "loss": 0.4541, "step": 180 }, { "epoch": 0.4318181818181818, "grad_norm": 1.1793845606021387, "learning_rate": 9.941303621299332e-06, "loss": 0.4278, "step": 190 }, { "epoch": 0.45454545454545453, "grad_norm": 0.9220892011963717, "learning_rate": 9.919377899645497e-06, "loss": 0.4379, "step": 200 }, { "epoch": 0.4772727272727273, "grad_norm": 1.0861369533049958, "learning_rate": 9.894012231073895e-06, "loss": 0.418, "step": 210 }, { "epoch": 0.5, "grad_norm": 1.0739119949772593, "learning_rate": 9.86522435289912e-06, "loss": 0.438, "step": 220 }, { "epoch": 0.5227272727272727, "grad_norm": 1.0583396452432017, "learning_rate": 9.833034395465866e-06, "loss": 0.4498, "step": 230 }, { "epoch": 0.5454545454545454, "grad_norm": 1.2558829873984296, "learning_rate": 9.797464868072489e-06, "loss": 0.4241, "step": 240 }, { "epoch": 0.5681818181818182, "grad_norm": 0.9274074484915623, "learning_rate": 9.758540643231041e-06, "loss": 0.4473, "step": 250 }, { "epoch": 0.5909090909090909, "grad_norm": 1.1351911624937234, "learning_rate": 9.716288939274818e-06, "loss": 0.4197, "step": 260 }, { "epoch": 0.6136363636363636, "grad_norm": 1.0488446178988242, "learning_rate": 9.670739301325534e-06, "loss": 0.4442, "step": 270 }, { "epoch": 0.6363636363636364, "grad_norm": 1.3143206978246442, "learning_rate": 9.621923580633462e-06, "loss": 0.4217, "step": 280 }, { "epoch": 0.6590909090909091, "grad_norm": 1.1646273970559764, "learning_rate": 9.56987591230498e-06, "loss": 0.4294, "step": 290 }, { "epoch": 0.6818181818181818, "grad_norm": 1.1283407631969753, "learning_rate": 9.514632691433108e-06, "loss": 0.4505, "step": 300 }, { "epoch": 0.7045454545454546, "grad_norm": 1.0905525560287967, "learning_rate": 9.456232547647695e-06, "loss": 0.4496, "step": 310 }, { "epoch": 0.7272727272727273, "grad_norm": 0.855761546485483, "learning_rate": 9.394716318103098e-06, "loss": 0.425, "step": 320 }, { "epoch": 0.75, "grad_norm": 1.1885215313833324, "learning_rate": 9.330127018922195e-06, "loss": 0.4691, "step": 330 }, { "epoch": 0.7727272727272727, "grad_norm": 0.9705633869137796, "learning_rate": 9.262509815116732e-06, "loss": 0.4266, "step": 340 }, { "epoch": 0.7954545454545454, "grad_norm": 0.9460295884946632, "learning_rate": 9.191911989005038e-06, "loss": 0.4334, "step": 350 }, { "epoch": 0.8181818181818182, "grad_norm": 0.8244479504968187, "learning_rate": 9.118382907149164e-06, "loss": 0.4354, "step": 360 }, { "epoch": 0.8409090909090909, "grad_norm": 1.0907034782243845, "learning_rate": 9.041973985834595e-06, "loss": 0.4409, "step": 370 }, { "epoch": 0.8636363636363636, "grad_norm": 0.9011547089995718, "learning_rate": 8.96273865511666e-06, "loss": 0.4272, "step": 380 }, { "epoch": 0.8863636363636364, "grad_norm": 1.2220001058375158, "learning_rate": 8.880732321458785e-06, "loss": 0.4325, "step": 390 }, { "epoch": 0.9090909090909091, "grad_norm": 1.0552888697398697, "learning_rate": 8.796012328988716e-06, "loss": 0.4069, "step": 400 }, { "epoch": 0.9318181818181818, "grad_norm": 1.307751530018105, "learning_rate": 8.708637919399798e-06, "loss": 0.4396, "step": 410 }, { "epoch": 0.9545454545454546, "grad_norm": 0.9812773481606568, "learning_rate": 8.61867019052535e-06, "loss": 0.4171, "step": 420 }, { "epoch": 0.9772727272727273, "grad_norm": 1.0623603631162184, "learning_rate": 8.526172053615122e-06, "loss": 0.4095, "step": 430 }, { "epoch": 1.0, "grad_norm": 1.0466603824404057, "learning_rate": 8.43120818934367e-06, "loss": 0.4303, "step": 440 }, { "epoch": 1.0227272727272727, "grad_norm": 0.9422037793723509, "learning_rate": 8.33384500258146e-06, "loss": 0.3291, "step": 450 }, { "epoch": 1.0454545454545454, "grad_norm": 1.1439839179446254, "learning_rate": 8.234150575960288e-06, "loss": 0.3231, "step": 460 }, { "epoch": 1.0681818181818181, "grad_norm": 0.9363363052426827, "learning_rate": 8.132194622265508e-06, "loss": 0.3321, "step": 470 }, { "epoch": 1.0909090909090908, "grad_norm": 0.8976501587025572, "learning_rate": 8.028048435688333e-06, "loss": 0.307, "step": 480 }, { "epoch": 1.1136363636363635, "grad_norm": 0.8926726629185868, "learning_rate": 7.921784841972355e-06, "loss": 0.2804, "step": 490 }, { "epoch": 1.1363636363636362, "grad_norm": 1.0791854561775653, "learning_rate": 7.813478147489052e-06, "loss": 0.3356, "step": 500 }, { "epoch": 1.1590909090909092, "grad_norm": 0.9510040199261199, "learning_rate": 7.703204087277989e-06, "loss": 0.3192, "step": 510 }, { "epoch": 1.1818181818181819, "grad_norm": 1.0208622518742, "learning_rate": 7.5910397720879785e-06, "loss": 0.3253, "step": 520 }, { "epoch": 1.2045454545454546, "grad_norm": 1.1056530204649813, "learning_rate": 7.477063634456263e-06, "loss": 0.2948, "step": 530 }, { "epoch": 1.2272727272727273, "grad_norm": 0.8907984718041057, "learning_rate": 7.361355373863415e-06, "loss": 0.3303, "step": 540 }, { "epoch": 1.25, "grad_norm": 0.9921150120480224, "learning_rate": 7.243995901002312e-06, "loss": 0.3281, "step": 550 }, { "epoch": 1.2727272727272727, "grad_norm": 1.1894145553342288, "learning_rate": 7.1250672812001505e-06, "loss": 0.3254, "step": 560 }, { "epoch": 1.2954545454545454, "grad_norm": 1.1062688043595825, "learning_rate": 7.004652677033069e-06, "loss": 0.328, "step": 570 }, { "epoch": 1.3181818181818181, "grad_norm": 0.8834038525733939, "learning_rate": 6.882836290173493e-06, "loss": 0.3242, "step": 580 }, { "epoch": 1.3409090909090908, "grad_norm": 1.032054335046655, "learning_rate": 6.759703302510898e-06, "loss": 0.3408, "step": 590 }, { "epoch": 1.3636363636363638, "grad_norm": 1.0032122077515986, "learning_rate": 6.635339816587109e-06, "loss": 0.3188, "step": 600 }, { "epoch": 1.3863636363636362, "grad_norm": 0.8472856067780782, "learning_rate": 6.5098327953878585e-06, "loss": 0.3108, "step": 610 }, { "epoch": 1.4090909090909092, "grad_norm": 0.9812797187008553, "learning_rate": 6.383270001532636e-06, "loss": 0.3192, "step": 620 }, { "epoch": 1.4318181818181819, "grad_norm": 0.982163512940909, "learning_rate": 6.255739935905396e-06, "loss": 0.3032, "step": 630 }, { "epoch": 1.4545454545454546, "grad_norm": 0.8121759235743116, "learning_rate": 6.127331775769023e-06, "loss": 0.328, "step": 640 }, { "epoch": 1.4772727272727273, "grad_norm": 0.8288842937391799, "learning_rate": 5.998135312406821e-06, "loss": 0.3241, "step": 650 }, { "epoch": 1.5, "grad_norm": 1.1705693195996676, "learning_rate": 5.8682408883346535e-06, "loss": 0.3097, "step": 660 }, { "epoch": 1.5227272727272727, "grad_norm": 1.1928631326301389, "learning_rate": 5.737739334127611e-06, "loss": 0.3306, "step": 670 }, { "epoch": 1.5454545454545454, "grad_norm": 0.8793536700861068, "learning_rate": 5.60672190490541e-06, "loss": 0.2965, "step": 680 }, { "epoch": 1.5681818181818183, "grad_norm": 1.1096056081901176, "learning_rate": 5.475280216520913e-06, "loss": 0.3149, "step": 690 }, { "epoch": 1.5909090909090908, "grad_norm": 1.0132771918230767, "learning_rate": 5.343506181496405e-06, "loss": 0.3376, "step": 700 }, { "epoch": 1.6136363636363638, "grad_norm": 1.0015800596494355, "learning_rate": 5.2114919447524155e-06, "loss": 0.3091, "step": 710 }, { "epoch": 1.6363636363636362, "grad_norm": 1.3217531318258682, "learning_rate": 5.07932981917404e-06, "loss": 0.3424, "step": 720 }, { "epoch": 1.6590909090909092, "grad_norm": 0.844031720581408, "learning_rate": 4.947112221059803e-06, "loss": 0.334, "step": 730 }, { "epoch": 1.6818181818181817, "grad_norm": 0.9811023453126005, "learning_rate": 4.81493160549821e-06, "loss": 0.3149, "step": 740 }, { "epoch": 1.7045454545454546, "grad_norm": 0.9120047314542666, "learning_rate": 4.682880401717178e-06, "loss": 0.3387, "step": 750 }, { "epoch": 1.7272727272727273, "grad_norm": 0.9645251862151868, "learning_rate": 4.551050948451542e-06, "loss": 0.3075, "step": 760 }, { "epoch": 1.75, "grad_norm": 1.029589258346577, "learning_rate": 4.4195354293738484e-06, "loss": 0.2997, "step": 770 }, { "epoch": 1.7727272727272727, "grad_norm": 0.8289069672393073, "learning_rate": 4.2884258086335755e-06, "loss": 0.3114, "step": 780 }, { "epoch": 1.7954545454545454, "grad_norm": 0.8269221372870464, "learning_rate": 4.1578137665498485e-06, "loss": 0.3162, "step": 790 }, { "epoch": 1.8181818181818183, "grad_norm": 1.0419412973934015, "learning_rate": 4.027790635502646e-06, "loss": 0.3022, "step": 800 }, { "epoch": 1.8409090909090908, "grad_norm": 0.828576197701733, "learning_rate": 3.898447336067297e-06, "loss": 0.3181, "step": 810 }, { "epoch": 1.8636363636363638, "grad_norm": 0.7902710878002056, "learning_rate": 3.769874313436933e-06, "loss": 0.2964, "step": 820 }, { "epoch": 1.8863636363636362, "grad_norm": 0.8650766226460292, "learning_rate": 3.6421614741773702e-06, "loss": 0.3083, "step": 830 }, { "epoch": 1.9090909090909092, "grad_norm": 0.911674397252804, "learning_rate": 3.5153981233586277e-06, "loss": 0.3278, "step": 840 }, { "epoch": 1.9318181818181817, "grad_norm": 1.0190901522385132, "learning_rate": 3.389672902107044e-06, "loss": 0.2963, "step": 850 }, { "epoch": 1.9545454545454546, "grad_norm": 0.9093931759022098, "learning_rate": 3.2650737256216885e-06, "loss": 0.3143, "step": 860 }, { "epoch": 1.9772727272727273, "grad_norm": 0.9726770446051795, "learning_rate": 3.141687721698363e-06, "loss": 0.3025, "step": 870 }, { "epoch": 2.0, "grad_norm": 0.8959220677622172, "learning_rate": 3.019601169804216e-06, "loss": 0.3343, "step": 880 }, { "epoch": 2.022727272727273, "grad_norm": 1.0245721987690635, "learning_rate": 2.898899440745569e-06, "loss": 0.2299, "step": 890 }, { "epoch": 2.0454545454545454, "grad_norm": 0.7717882058531099, "learning_rate": 2.7796669369711294e-06, "loss": 0.2129, "step": 900 }, { "epoch": 2.0681818181818183, "grad_norm": 1.4121404315256307, "learning_rate": 2.6619870335523434e-06, "loss": 0.1988, "step": 910 }, { "epoch": 2.090909090909091, "grad_norm": 0.8228740125650449, "learning_rate": 2.5459420198821604e-06, "loss": 0.2377, "step": 920 }, { "epoch": 2.1136363636363638, "grad_norm": 1.0688882544650145, "learning_rate": 2.4316130421329696e-06, "loss": 0.243, "step": 930 }, { "epoch": 2.1363636363636362, "grad_norm": 0.6981423061851445, "learning_rate": 2.319080046513954e-06, "loss": 0.1995, "step": 940 }, { "epoch": 2.159090909090909, "grad_norm": 0.7538445094011031, "learning_rate": 2.2084217233675386e-06, "loss": 0.2313, "step": 950 }, { "epoch": 2.1818181818181817, "grad_norm": 0.7057622922962025, "learning_rate": 2.09971545214401e-06, "loss": 0.2037, "step": 960 }, { "epoch": 2.2045454545454546, "grad_norm": 0.8953813377341792, "learning_rate": 1.9930372472928095e-06, "loss": 0.2385, "step": 970 }, { "epoch": 2.227272727272727, "grad_norm": 0.8650152971442162, "learning_rate": 1.8884617051083183e-06, "loss": 0.2231, "step": 980 }, { "epoch": 2.25, "grad_norm": 0.7372775145267629, "learning_rate": 1.7860619515673034e-06, "loss": 0.2233, "step": 990 }, { "epoch": 2.2727272727272725, "grad_norm": 1.1713711786724677, "learning_rate": 1.68590959119452e-06, "loss": 0.2302, "step": 1000 }, { "epoch": 2.2954545454545454, "grad_norm": 0.8830634163052888, "learning_rate": 1.5880746569921867e-06, "loss": 0.213, "step": 1010 }, { "epoch": 2.3181818181818183, "grad_norm": 0.7920929754273773, "learning_rate": 1.4926255614683931e-06, "loss": 0.2231, "step": 1020 }, { "epoch": 2.340909090909091, "grad_norm": 0.9483810580401165, "learning_rate": 1.3996290487986568e-06, "loss": 0.2046, "step": 1030 }, { "epoch": 2.3636363636363638, "grad_norm": 0.9124193486122323, "learning_rate": 1.3091501481540676e-06, "loss": 0.1922, "step": 1040 }, { "epoch": 2.3863636363636362, "grad_norm": 0.9345460355742555, "learning_rate": 1.2212521282287093e-06, "loss": 0.2193, "step": 1050 }, { "epoch": 2.409090909090909, "grad_norm": 0.7947710523061967, "learning_rate": 1.135996452998085e-06, "loss": 0.2185, "step": 1060 }, { "epoch": 2.4318181818181817, "grad_norm": 0.8478563530801623, "learning_rate": 1.0534427387395391e-06, "loss": 0.2014, "step": 1070 }, { "epoch": 2.4545454545454546, "grad_norm": 0.847550358331553, "learning_rate": 9.73648712344707e-07, "loss": 0.2197, "step": 1080 }, { "epoch": 2.4772727272727275, "grad_norm": 0.9568357719105378, "learning_rate": 8.966701709531344e-07, "loss": 0.2239, "step": 1090 }, { "epoch": 2.5, "grad_norm": 0.8646126050539757, "learning_rate": 8.225609429353187e-07, "loss": 0.2054, "step": 1100 }, { "epoch": 2.5227272727272725, "grad_norm": 0.8831276830491341, "learning_rate": 7.513728502524286e-07, "loss": 0.1895, "step": 1110 }, { "epoch": 2.5454545454545454, "grad_norm": 1.1049972930051686, "learning_rate": 6.831556722190453e-07, "loss": 0.2187, "step": 1120 }, { "epoch": 2.5681818181818183, "grad_norm": 0.819930369422315, "learning_rate": 6.179571106942466e-07, "loss": 0.2059, "step": 1130 }, { "epoch": 2.590909090909091, "grad_norm": 0.8068522453155903, "learning_rate": 5.558227567253832e-07, "loss": 0.1974, "step": 1140 }, { "epoch": 2.6136363636363638, "grad_norm": 0.8627975251731292, "learning_rate": 4.967960586678722e-07, "loss": 0.215, "step": 1150 }, { "epoch": 2.6363636363636362, "grad_norm": 0.7571397611134698, "learning_rate": 4.4091829180330503e-07, "loss": 0.2067, "step": 1160 }, { "epoch": 2.659090909090909, "grad_norm": 0.8163877858341596, "learning_rate": 3.882285294770938e-07, "loss": 0.2185, "step": 1170 }, { "epoch": 2.6818181818181817, "grad_norm": 0.847944124408921, "learning_rate": 3.3876361577587115e-07, "loss": 0.2126, "step": 1180 }, { "epoch": 2.7045454545454546, "grad_norm": 0.8609864011531457, "learning_rate": 2.9255813976372227e-07, "loss": 0.2376, "step": 1190 }, { "epoch": 2.7272727272727275, "grad_norm": 0.8142965279668429, "learning_rate": 2.4964441129527337e-07, "loss": 0.1971, "step": 1200 }, { "epoch": 2.75, "grad_norm": 0.8085909064873231, "learning_rate": 2.1005243842255552e-07, "loss": 0.2023, "step": 1210 }, { "epoch": 2.7727272727272725, "grad_norm": 0.7715524471203066, "learning_rate": 1.738099064114368e-07, "loss": 0.2122, "step": 1220 }, { "epoch": 2.7954545454545454, "grad_norm": 0.7256919162493282, "learning_rate": 1.4094215838229176e-07, "loss": 0.241, "step": 1230 }, { "epoch": 2.8181818181818183, "grad_norm": 0.7761363095194573, "learning_rate": 1.1147217758845752e-07, "loss": 0.2025, "step": 1240 }, { "epoch": 2.840909090909091, "grad_norm": 0.7416748760573418, "learning_rate": 8.542057134485638e-08, "loss": 0.219, "step": 1250 }, { "epoch": 2.8636363636363638, "grad_norm": 0.8096267555239283, "learning_rate": 6.280555661802857e-08, "loss": 0.2208, "step": 1260 }, { "epoch": 2.8863636363636362, "grad_norm": 0.9092729323776195, "learning_rate": 4.3642947287654284e-08, "loss": 0.239, "step": 1270 }, { "epoch": 2.909090909090909, "grad_norm": 0.7872817827279941, "learning_rate": 2.7946143088466437e-08, "loss": 0.236, "step": 1280 }, { "epoch": 2.9318181818181817, "grad_norm": 0.9061945080911546, "learning_rate": 1.5726120240288632e-08, "loss": 0.1971, "step": 1290 }, { "epoch": 2.9545454545454546, "grad_norm": 0.9803191049167996, "learning_rate": 6.991423772753636e-09, "loss": 0.2157, "step": 1300 }, { "epoch": 2.9772727272727275, "grad_norm": 0.9109787805486621, "learning_rate": 1.7481615500691829e-09, "loss": 0.2044, "step": 1310 }, { "epoch": 3.0, "grad_norm": 0.7033421688883779, "learning_rate": 0.0, "loss": 0.2047, "step": 1320 }, { "epoch": 3.0, "step": 1320, "total_flos": 148330505830400.0, "train_loss": 0.33294462082964, "train_runtime": 5380.5167, "train_samples_per_second": 1.959, "train_steps_per_second": 0.245 } ], "logging_steps": 10, "max_steps": 1320, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 148330505830400.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }