{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.9816091954022985, "eval_steps": 500, "global_step": 1736, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04597701149425287, "grad_norm": 19.236724341828868, "learning_rate": 2.8735632183908047e-07, "loss": 0.9813, "step": 10 }, { "epoch": 0.09195402298850575, "grad_norm": 22.46256737380087, "learning_rate": 5.747126436781609e-07, "loss": 0.9645, "step": 20 }, { "epoch": 0.13793103448275862, "grad_norm": 7.740951848649134, "learning_rate": 8.620689655172415e-07, "loss": 0.8355, "step": 30 }, { "epoch": 0.1839080459770115, "grad_norm": 6.373158348765807, "learning_rate": 1.1494252873563219e-06, "loss": 0.7287, "step": 40 }, { "epoch": 0.22988505747126436, "grad_norm": 4.553132194954343, "learning_rate": 1.4367816091954023e-06, "loss": 0.6077, "step": 50 }, { "epoch": 0.27586206896551724, "grad_norm": 4.217339152427201, "learning_rate": 1.724137931034483e-06, "loss": 0.5534, "step": 60 }, { "epoch": 0.3218390804597701, "grad_norm": 4.295308772185913, "learning_rate": 2.0114942528735633e-06, "loss": 0.5322, "step": 70 }, { "epoch": 0.367816091954023, "grad_norm": 4.0641475269520155, "learning_rate": 2.2988505747126437e-06, "loss": 0.5003, "step": 80 }, { "epoch": 0.41379310344827586, "grad_norm": 13.886675789706416, "learning_rate": 2.5862068965517246e-06, "loss": 0.4883, "step": 90 }, { "epoch": 0.45977011494252873, "grad_norm": 4.673192920611484, "learning_rate": 2.8735632183908046e-06, "loss": 0.4584, "step": 100 }, { "epoch": 0.5057471264367817, "grad_norm": 3.674033307691337, "learning_rate": 3.1609195402298854e-06, "loss": 0.447, "step": 110 }, { "epoch": 0.5517241379310345, "grad_norm": 3.919248171882691, "learning_rate": 3.448275862068966e-06, "loss": 0.4556, "step": 120 }, { "epoch": 0.5977011494252874, "grad_norm": 3.5061949892253623, "learning_rate": 3.7356321839080462e-06, "loss": 0.4354, "step": 130 }, { "epoch": 0.6436781609195402, "grad_norm": 3.7489900095069073, "learning_rate": 4.022988505747127e-06, "loss": 0.4228, "step": 140 }, { "epoch": 0.6896551724137931, "grad_norm": 3.6759677856673103, "learning_rate": 4.310344827586207e-06, "loss": 0.4299, "step": 150 }, { "epoch": 0.735632183908046, "grad_norm": 3.6923883409669456, "learning_rate": 4.5977011494252875e-06, "loss": 0.4193, "step": 160 }, { "epoch": 0.7816091954022989, "grad_norm": 3.6202856036202427, "learning_rate": 4.885057471264369e-06, "loss": 0.4155, "step": 170 }, { "epoch": 0.8275862068965517, "grad_norm": 3.6300778901007864, "learning_rate": 4.999817969178238e-06, "loss": 0.4083, "step": 180 }, { "epoch": 0.8735632183908046, "grad_norm": 3.587161182258164, "learning_rate": 4.998705654596035e-06, "loss": 0.4225, "step": 190 }, { "epoch": 0.9195402298850575, "grad_norm": 3.3270245057939376, "learning_rate": 4.996582603056429e-06, "loss": 0.4019, "step": 200 }, { "epoch": 0.9655172413793104, "grad_norm": 3.8111046706601623, "learning_rate": 4.9934496733427066e-06, "loss": 0.3965, "step": 210 }, { "epoch": 1.0114942528735633, "grad_norm": 3.3652791442634187, "learning_rate": 4.989308132738127e-06, "loss": 0.3966, "step": 220 }, { "epoch": 1.0574712643678161, "grad_norm": 3.1325910691416747, "learning_rate": 4.9841596565133e-06, "loss": 0.3407, "step": 230 }, { "epoch": 1.103448275862069, "grad_norm": 3.167830493874506, "learning_rate": 4.978006327248537e-06, "loss": 0.3519, "step": 240 }, { "epoch": 1.1494252873563218, "grad_norm": 3.2716290396557377, "learning_rate": 4.970850633991432e-06, "loss": 0.3596, "step": 250 }, { "epoch": 1.1954022988505748, "grad_norm": 3.1954803751416962, "learning_rate": 4.962695471250033e-06, "loss": 0.343, "step": 260 }, { "epoch": 1.2413793103448276, "grad_norm": 3.479098208871561, "learning_rate": 4.953544137822006e-06, "loss": 0.3369, "step": 270 }, { "epoch": 1.2873563218390804, "grad_norm": 3.3457647349425104, "learning_rate": 4.9434003354602515e-06, "loss": 0.3627, "step": 280 }, { "epoch": 1.3333333333333333, "grad_norm": 3.282832685188253, "learning_rate": 4.932268167375532e-06, "loss": 0.3528, "step": 290 }, { "epoch": 1.3793103448275863, "grad_norm": 2.6789350907177822, "learning_rate": 4.920152136576706e-06, "loss": 0.3406, "step": 300 }, { "epoch": 1.4252873563218391, "grad_norm": 3.3658438451363577, "learning_rate": 4.9070571440492435e-06, "loss": 0.3643, "step": 310 }, { "epoch": 1.471264367816092, "grad_norm": 3.103704408419176, "learning_rate": 4.892988486772756e-06, "loss": 0.3434, "step": 320 }, { "epoch": 1.5172413793103448, "grad_norm": 2.9669630699191405, "learning_rate": 4.877951855578342e-06, "loss": 0.3322, "step": 330 }, { "epoch": 1.5632183908045976, "grad_norm": 3.486652210473939, "learning_rate": 4.86195333284663e-06, "loss": 0.3529, "step": 340 }, { "epoch": 1.6091954022988506, "grad_norm": 3.3652573778754253, "learning_rate": 4.844999390047419e-06, "loss": 0.3547, "step": 350 }, { "epoch": 1.6551724137931034, "grad_norm": 3.5532322465589385, "learning_rate": 4.827096885121954e-06, "loss": 0.3408, "step": 360 }, { "epoch": 1.7011494252873565, "grad_norm": 3.143444966292411, "learning_rate": 4.808253059708849e-06, "loss": 0.3506, "step": 370 }, { "epoch": 1.7471264367816093, "grad_norm": 2.8493219432318875, "learning_rate": 4.788475536214822e-06, "loss": 0.3368, "step": 380 }, { "epoch": 1.793103448275862, "grad_norm": 2.8628419910614316, "learning_rate": 4.767772314731394e-06, "loss": 0.3424, "step": 390 }, { "epoch": 1.839080459770115, "grad_norm": 3.0446997203794344, "learning_rate": 4.746151769798818e-06, "loss": 0.3549, "step": 400 }, { "epoch": 1.8850574712643677, "grad_norm": 2.7372586769671337, "learning_rate": 4.7236226470185505e-06, "loss": 0.3247, "step": 410 }, { "epoch": 1.9310344827586206, "grad_norm": 3.044988365851536, "learning_rate": 4.700194059515606e-06, "loss": 0.3373, "step": 420 }, { "epoch": 1.9770114942528736, "grad_norm": 2.9343723129428643, "learning_rate": 4.67587548425227e-06, "loss": 0.3393, "step": 430 }, { "epoch": 2.0229885057471266, "grad_norm": 2.4482722232022764, "learning_rate": 4.650676758194624e-06, "loss": 0.3095, "step": 440 }, { "epoch": 2.0689655172413794, "grad_norm": 2.9276614481022127, "learning_rate": 4.624608074333448e-06, "loss": 0.256, "step": 450 }, { "epoch": 2.1149425287356323, "grad_norm": 2.6846087529478955, "learning_rate": 4.597679977561122e-06, "loss": 0.2471, "step": 460 }, { "epoch": 2.160919540229885, "grad_norm": 2.610365319899241, "learning_rate": 4.569903360406163e-06, "loss": 0.2554, "step": 470 }, { "epoch": 2.206896551724138, "grad_norm": 2.948862836398902, "learning_rate": 4.541289458627155e-06, "loss": 0.2527, "step": 480 }, { "epoch": 2.2528735632183907, "grad_norm": 2.637472882710502, "learning_rate": 4.511849846667839e-06, "loss": 0.2504, "step": 490 }, { "epoch": 2.2988505747126435, "grad_norm": 2.8982721464809016, "learning_rate": 4.481596432975202e-06, "loss": 0.2608, "step": 500 }, { "epoch": 2.344827586206897, "grad_norm": 2.852823004859079, "learning_rate": 4.4505414551824536e-06, "loss": 0.2606, "step": 510 }, { "epoch": 2.3908045977011496, "grad_norm": 3.197376130493875, "learning_rate": 4.418697475158861e-06, "loss": 0.2677, "step": 520 }, { "epoch": 2.4367816091954024, "grad_norm": 2.9868581965061494, "learning_rate": 4.386077373928413e-06, "loss": 0.2628, "step": 530 }, { "epoch": 2.4827586206896552, "grad_norm": 2.5165223021619756, "learning_rate": 4.352694346459397e-06, "loss": 0.2739, "step": 540 }, { "epoch": 2.528735632183908, "grad_norm": 2.5532695939024723, "learning_rate": 4.318561896326973e-06, "loss": 0.2587, "step": 550 }, { "epoch": 2.574712643678161, "grad_norm": 2.80784537094022, "learning_rate": 4.283693830250926e-06, "loss": 0.271, "step": 560 }, { "epoch": 2.6206896551724137, "grad_norm": 2.834008814039304, "learning_rate": 4.248104252510786e-06, "loss": 0.2596, "step": 570 }, { "epoch": 2.6666666666666665, "grad_norm": 2.856340507047602, "learning_rate": 4.211807559240588e-06, "loss": 0.2607, "step": 580 }, { "epoch": 2.7126436781609193, "grad_norm": 2.400155413142861, "learning_rate": 4.174818432605579e-06, "loss": 0.2714, "step": 590 }, { "epoch": 2.7586206896551726, "grad_norm": 2.9121726603653446, "learning_rate": 4.137151834863213e-06, "loss": 0.267, "step": 600 }, { "epoch": 2.8045977011494254, "grad_norm": 2.4158062858535243, "learning_rate": 4.098823002310864e-06, "loss": 0.2637, "step": 610 }, { "epoch": 2.8505747126436782, "grad_norm": 2.710372669455237, "learning_rate": 4.059847439122672e-06, "loss": 0.2591, "step": 620 }, { "epoch": 2.896551724137931, "grad_norm": 2.694528049244444, "learning_rate": 4.020240911078041e-06, "loss": 0.2597, "step": 630 }, { "epoch": 2.942528735632184, "grad_norm": 2.7003321508943077, "learning_rate": 3.98001943918432e-06, "loss": 0.2694, "step": 640 }, { "epoch": 2.9885057471264367, "grad_norm": 2.9274287857580212, "learning_rate": 3.939199293196231e-06, "loss": 0.2704, "step": 650 }, { "epoch": 3.0344827586206895, "grad_norm": 2.232026779235866, "learning_rate": 3.897796985034687e-06, "loss": 0.1997, "step": 660 }, { "epoch": 3.0804597701149423, "grad_norm": 2.7014305023370415, "learning_rate": 3.855829262107653e-06, "loss": 0.1716, "step": 670 }, { "epoch": 3.1264367816091956, "grad_norm": 2.4848170071887035, "learning_rate": 3.813313100535747e-06, "loss": 0.1803, "step": 680 }, { "epoch": 3.1724137931034484, "grad_norm": 2.7346292529745027, "learning_rate": 3.770265698285328e-06, "loss": 0.1754, "step": 690 }, { "epoch": 3.218390804597701, "grad_norm": 2.705054730805242, "learning_rate": 3.726704468211844e-06, "loss": 0.1835, "step": 700 }, { "epoch": 3.264367816091954, "grad_norm": 2.6584866622780603, "learning_rate": 3.6826470310162645e-06, "loss": 0.1792, "step": 710 }, { "epoch": 3.310344827586207, "grad_norm": 2.6514881876130225, "learning_rate": 3.6381112081174254e-06, "loss": 0.1765, "step": 720 }, { "epoch": 3.3563218390804597, "grad_norm": 2.516694208524121, "learning_rate": 3.593115014443195e-06, "loss": 0.1817, "step": 730 }, { "epoch": 3.4022988505747125, "grad_norm": 2.4579414740197065, "learning_rate": 3.547676651143361e-06, "loss": 0.1849, "step": 740 }, { "epoch": 3.4482758620689653, "grad_norm": 2.6311823995451364, "learning_rate": 3.5018144982271814e-06, "loss": 0.1769, "step": 750 }, { "epoch": 3.4942528735632186, "grad_norm": 2.3725394123707026, "learning_rate": 3.455547107128602e-06, "loss": 0.1848, "step": 760 }, { "epoch": 3.5402298850574714, "grad_norm": 2.4539717154314933, "learning_rate": 3.4088931932021193e-06, "loss": 0.1892, "step": 770 }, { "epoch": 3.586206896551724, "grad_norm": 2.4899658946067933, "learning_rate": 3.3618716281523384e-06, "loss": 0.1807, "step": 780 }, { "epoch": 3.632183908045977, "grad_norm": 2.550071627944622, "learning_rate": 3.3145014324002945e-06, "loss": 0.1852, "step": 790 }, { "epoch": 3.67816091954023, "grad_norm": 2.675827164092041, "learning_rate": 3.266801767389608e-06, "loss": 0.1885, "step": 800 }, { "epoch": 3.7241379310344827, "grad_norm": 2.5251713436347227, "learning_rate": 3.2187919278356027e-06, "loss": 0.1835, "step": 810 }, { "epoch": 3.7701149425287355, "grad_norm": 2.7315545503128638, "learning_rate": 3.1704913339205107e-06, "loss": 0.1863, "step": 820 }, { "epoch": 3.8160919540229887, "grad_norm": 2.7536607881256665, "learning_rate": 3.121919523437927e-06, "loss": 0.1921, "step": 830 }, { "epoch": 3.862068965517241, "grad_norm": 2.598851663508124, "learning_rate": 3.073096143889689e-06, "loss": 0.1868, "step": 840 }, { "epoch": 3.9080459770114944, "grad_norm": 2.523389034722214, "learning_rate": 3.0240409445383835e-06, "loss": 0.1855, "step": 850 }, { "epoch": 3.954022988505747, "grad_norm": 2.7756855701720404, "learning_rate": 2.97477376841868e-06, "loss": 0.1791, "step": 860 }, { "epoch": 4.0, "grad_norm": 2.489766938283244, "learning_rate": 2.9253145443107455e-06, "loss": 0.1756, "step": 870 }, { "epoch": 4.045977011494253, "grad_norm": 2.7317574118504044, "learning_rate": 2.8756832786789667e-06, "loss": 0.1081, "step": 880 }, { "epoch": 4.091954022988506, "grad_norm": 2.1106005588504626, "learning_rate": 2.825900047579251e-06, "loss": 0.1052, "step": 890 }, { "epoch": 4.137931034482759, "grad_norm": 2.40365919767509, "learning_rate": 2.775984988538175e-06, "loss": 0.1032, "step": 900 }, { "epoch": 4.183908045977011, "grad_norm": 2.3074631869946507, "learning_rate": 2.725958292407276e-06, "loss": 0.1049, "step": 910 }, { "epoch": 4.2298850574712645, "grad_norm": 2.5166276182004714, "learning_rate": 2.6758401951957625e-06, "loss": 0.1051, "step": 920 }, { "epoch": 4.275862068965517, "grad_norm": 2.3612573716815093, "learning_rate": 2.6256509698849652e-06, "loss": 0.1071, "step": 930 }, { "epoch": 4.32183908045977, "grad_norm": 2.377295836801757, "learning_rate": 2.5754109182278298e-06, "loss": 0.1077, "step": 940 }, { "epoch": 4.3678160919540225, "grad_norm": 2.5390227361500823, "learning_rate": 2.525140362536775e-06, "loss": 0.1085, "step": 950 }, { "epoch": 4.413793103448276, "grad_norm": 2.233377317311712, "learning_rate": 2.474859637463226e-06, "loss": 0.1082, "step": 960 }, { "epoch": 4.459770114942529, "grad_norm": 2.551052323312396, "learning_rate": 2.42458908177217e-06, "loss": 0.1102, "step": 970 }, { "epoch": 4.505747126436781, "grad_norm": 2.6483774815598804, "learning_rate": 2.374349030115036e-06, "loss": 0.1105, "step": 980 }, { "epoch": 4.551724137931035, "grad_norm": 2.429849103852719, "learning_rate": 2.3241598048042383e-06, "loss": 0.1082, "step": 990 }, { "epoch": 4.597701149425287, "grad_norm": 2.486141654635446, "learning_rate": 2.2740417075927244e-06, "loss": 0.109, "step": 1000 }, { "epoch": 4.64367816091954, "grad_norm": 2.5918016937723527, "learning_rate": 2.2240150114618262e-06, "loss": 0.1068, "step": 1010 }, { "epoch": 4.689655172413794, "grad_norm": 2.663853206431205, "learning_rate": 2.17409995242075e-06, "loss": 0.1058, "step": 1020 }, { "epoch": 4.735632183908046, "grad_norm": 2.657379315743761, "learning_rate": 2.1243167213210337e-06, "loss": 0.1015, "step": 1030 }, { "epoch": 4.781609195402299, "grad_norm": 2.357887989241257, "learning_rate": 2.0746854556892545e-06, "loss": 0.1032, "step": 1040 }, { "epoch": 4.827586206896552, "grad_norm": 2.1971439440232134, "learning_rate": 2.0252262315813213e-06, "loss": 0.1033, "step": 1050 }, { "epoch": 4.873563218390805, "grad_norm": 2.3384701132936683, "learning_rate": 1.9759590554616177e-06, "loss": 0.1075, "step": 1060 }, { "epoch": 4.919540229885057, "grad_norm": 2.611584410768705, "learning_rate": 1.9269038561103114e-06, "loss": 0.1075, "step": 1070 }, { "epoch": 4.9655172413793105, "grad_norm": 2.679036350683068, "learning_rate": 1.8780804765620747e-06, "loss": 0.1033, "step": 1080 }, { "epoch": 5.011494252873563, "grad_norm": 1.5576689531522379, "learning_rate": 1.8295086660794903e-06, "loss": 0.0939, "step": 1090 }, { "epoch": 5.057471264367816, "grad_norm": 2.4254209848134027, "learning_rate": 1.7812080721643977e-06, "loss": 0.0524, "step": 1100 }, { "epoch": 5.103448275862069, "grad_norm": 2.2936105257318675, "learning_rate": 1.7331982326103922e-06, "loss": 0.0531, "step": 1110 }, { "epoch": 5.149425287356322, "grad_norm": 2.2180450680938013, "learning_rate": 1.6854985675997065e-06, "loss": 0.0518, "step": 1120 }, { "epoch": 5.195402298850575, "grad_norm": 1.9813446486569681, "learning_rate": 1.6381283718476622e-06, "loss": 0.0521, "step": 1130 }, { "epoch": 5.241379310344827, "grad_norm": 1.9952422258717124, "learning_rate": 1.591106806797882e-06, "loss": 0.0534, "step": 1140 }, { "epoch": 5.287356321839081, "grad_norm": 1.910457789638185, "learning_rate": 1.5444528928713987e-06, "loss": 0.0529, "step": 1150 }, { "epoch": 5.333333333333333, "grad_norm": 2.247598889328078, "learning_rate": 1.4981855017728197e-06, "loss": 0.054, "step": 1160 }, { "epoch": 5.379310344827586, "grad_norm": 2.355104769702966, "learning_rate": 1.4523233488566394e-06, "loss": 0.0583, "step": 1170 }, { "epoch": 5.425287356321839, "grad_norm": 2.2631373957215732, "learning_rate": 1.4068849855568042e-06, "loss": 0.0513, "step": 1180 }, { "epoch": 5.471264367816092, "grad_norm": 4.183885604965178, "learning_rate": 1.3618887918825752e-06, "loss": 0.0547, "step": 1190 }, { "epoch": 5.517241379310345, "grad_norm": 2.149593926911987, "learning_rate": 1.3173529689837355e-06, "loss": 0.0543, "step": 1200 }, { "epoch": 5.563218390804598, "grad_norm": 2.17719523603525, "learning_rate": 1.2732955317881563e-06, "loss": 0.0544, "step": 1210 }, { "epoch": 5.609195402298851, "grad_norm": 2.247756297696003, "learning_rate": 1.2297343017146727e-06, "loss": 0.0529, "step": 1220 }, { "epoch": 5.655172413793103, "grad_norm": 1.7927074781175658, "learning_rate": 1.1866868994642535e-06, "loss": 0.0514, "step": 1230 }, { "epoch": 5.7011494252873565, "grad_norm": 2.4235617167647816, "learning_rate": 1.1441707378923475e-06, "loss": 0.0529, "step": 1240 }, { "epoch": 5.747126436781609, "grad_norm": 1.7438808513899655, "learning_rate": 1.1022030149653134e-06, "loss": 0.0524, "step": 1250 }, { "epoch": 5.793103448275862, "grad_norm": 2.080026881638302, "learning_rate": 1.0608007068037702e-06, "loss": 0.0528, "step": 1260 }, { "epoch": 5.8390804597701145, "grad_norm": 1.9309457262833183, "learning_rate": 1.0199805608156802e-06, "loss": 0.0504, "step": 1270 }, { "epoch": 5.885057471264368, "grad_norm": 2.1120867508468493, "learning_rate": 9.79759088921959e-07, "loss": 0.0524, "step": 1280 }, { "epoch": 5.931034482758621, "grad_norm": 1.803440728603622, "learning_rate": 9.401525608773293e-07, "loss": 0.0453, "step": 1290 }, { "epoch": 5.977011494252873, "grad_norm": 1.756476820082346, "learning_rate": 9.011769976891368e-07, "loss": 0.0483, "step": 1300 }, { "epoch": 6.022988505747127, "grad_norm": 1.14195571634091, "learning_rate": 8.628481651367876e-07, "loss": 0.0373, "step": 1310 }, { "epoch": 6.068965517241379, "grad_norm": 1.2823038089726142, "learning_rate": 8.25181567394422e-07, "loss": 0.0257, "step": 1320 }, { "epoch": 6.114942528735632, "grad_norm": 1.9765342255678002, "learning_rate": 7.88192440759413e-07, "loss": 0.0243, "step": 1330 }, { "epoch": 6.160919540229885, "grad_norm": 1.4924174584280798, "learning_rate": 7.51895747489215e-07, "loss": 0.0244, "step": 1340 }, { "epoch": 6.206896551724138, "grad_norm": 1.4740924405566593, "learning_rate": 7.163061697490742e-07, "loss": 0.0253, "step": 1350 }, { "epoch": 6.252873563218391, "grad_norm": 1.2039837293309412, "learning_rate": 6.814381036730275e-07, "loss": 0.0237, "step": 1360 }, { "epoch": 6.2988505747126435, "grad_norm": 1.5435771614300633, "learning_rate": 6.473056535406036e-07, "loss": 0.0247, "step": 1370 }, { "epoch": 6.344827586206897, "grad_norm": 1.6972735405981059, "learning_rate": 6.139226260715872e-07, "loss": 0.0232, "step": 1380 }, { "epoch": 6.390804597701149, "grad_norm": 1.7603825909556183, "learning_rate": 5.813025248411397e-07, "loss": 0.0209, "step": 1390 }, { "epoch": 6.436781609195402, "grad_norm": 1.7734840776368426, "learning_rate": 5.494585448175474e-07, "loss": 0.0248, "step": 1400 }, { "epoch": 6.482758620689655, "grad_norm": 2.155524841148084, "learning_rate": 5.184035670247989e-07, "loss": 0.0251, "step": 1410 }, { "epoch": 6.528735632183908, "grad_norm": 2.1248857415061315, "learning_rate": 4.881501533321605e-07, "loss": 0.0238, "step": 1420 }, { "epoch": 6.574712643678161, "grad_norm": 1.2638686532353145, "learning_rate": 4.587105413728457e-07, "loss": 0.0227, "step": 1430 }, { "epoch": 6.620689655172414, "grad_norm": 1.3539825062665019, "learning_rate": 4.3009663959383776e-07, "loss": 0.0229, "step": 1440 }, { "epoch": 6.666666666666667, "grad_norm": 1.2318013380818635, "learning_rate": 4.0232002243887873e-07, "loss": 0.0244, "step": 1450 }, { "epoch": 6.712643678160919, "grad_norm": 2.1272364662366887, "learning_rate": 3.7539192566655254e-07, "loss": 0.0234, "step": 1460 }, { "epoch": 6.758620689655173, "grad_norm": 1.4894449105223861, "learning_rate": 3.493232418053774e-07, "loss": 0.0223, "step": 1470 }, { "epoch": 6.804597701149425, "grad_norm": 1.5981961713791006, "learning_rate": 3.24124515747731e-07, "loss": 0.0218, "step": 1480 }, { "epoch": 6.850574712643678, "grad_norm": 1.6361128000262934, "learning_rate": 2.9980594048439477e-07, "loss": 0.0226, "step": 1490 }, { "epoch": 6.896551724137931, "grad_norm": 2.3404616936998455, "learning_rate": 2.7637735298145064e-07, "loss": 0.0212, "step": 1500 }, { "epoch": 6.942528735632184, "grad_norm": 1.7100444245019304, "learning_rate": 2.538482302011822e-07, "loss": 0.0225, "step": 1510 }, { "epoch": 6.988505747126437, "grad_norm": 1.4325323759545483, "learning_rate": 2.3222768526860701e-07, "loss": 0.0227, "step": 1520 }, { "epoch": 7.0344827586206895, "grad_norm": 0.8626586832251956, "learning_rate": 2.115244637851782e-07, "loss": 0.0151, "step": 1530 }, { "epoch": 7.080459770114943, "grad_norm": 0.650488570824418, "learning_rate": 1.9174694029115148e-07, "loss": 0.0123, "step": 1540 }, { "epoch": 7.126436781609195, "grad_norm": 2.091704655001972, "learning_rate": 1.7290311487804689e-07, "loss": 0.0126, "step": 1550 }, { "epoch": 7.172413793103448, "grad_norm": 0.8790217013924919, "learning_rate": 1.5500060995258136e-07, "loss": 0.0128, "step": 1560 }, { "epoch": 7.218390804597701, "grad_norm": 0.7303846115721658, "learning_rate": 1.3804666715337117e-07, "loss": 0.0122, "step": 1570 }, { "epoch": 7.264367816091954, "grad_norm": 1.0662215851718453, "learning_rate": 1.2204814442165814e-07, "loss": 0.0124, "step": 1580 }, { "epoch": 7.310344827586207, "grad_norm": 0.8739271592863969, "learning_rate": 1.0701151322724451e-07, "loss": 0.0117, "step": 1590 }, { "epoch": 7.35632183908046, "grad_norm": 0.7737781041692757, "learning_rate": 9.294285595075669e-08, "loss": 0.0122, "step": 1600 }, { "epoch": 7.402298850574713, "grad_norm": 0.7170467686655235, "learning_rate": 7.984786342329493e-08, "loss": 0.0133, "step": 1610 }, { "epoch": 7.448275862068965, "grad_norm": 0.6837027766335966, "learning_rate": 6.773183262446914e-08, "loss": 0.0112, "step": 1620 }, { "epoch": 7.494252873563219, "grad_norm": 0.9449591802350379, "learning_rate": 5.65996645397493e-08, "loss": 0.0139, "step": 1630 }, { "epoch": 7.540229885057471, "grad_norm": 1.999894266793329, "learning_rate": 4.645586217799453e-08, "loss": 0.0131, "step": 1640 }, { "epoch": 7.586206896551724, "grad_norm": 0.8351980509392402, "learning_rate": 3.730452874996737e-08, "loss": 0.0124, "step": 1650 }, { "epoch": 7.6321839080459775, "grad_norm": 0.7193434252067797, "learning_rate": 2.914936600856899e-08, "loss": 0.0112, "step": 1660 }, { "epoch": 7.67816091954023, "grad_norm": 0.8238436973186618, "learning_rate": 2.199367275146358e-08, "loss": 0.0112, "step": 1670 }, { "epoch": 7.724137931034483, "grad_norm": 1.05150261931409, "learning_rate": 1.5840343486700216e-08, "loss": 0.0133, "step": 1680 }, { "epoch": 7.7701149425287355, "grad_norm": 0.8116973247797554, "learning_rate": 1.0691867261874155e-08, "loss": 0.0116, "step": 1690 }, { "epoch": 7.816091954022989, "grad_norm": 0.8080179165425967, "learning_rate": 6.550326657293882e-09, "loss": 0.0111, "step": 1700 }, { "epoch": 7.862068965517241, "grad_norm": 0.8576475297914047, "learning_rate": 3.4173969435710717e-09, "loss": 0.0121, "step": 1710 }, { "epoch": 7.908045977011494, "grad_norm": 1.1647467814066779, "learning_rate": 1.2943454039654467e-09, "loss": 0.0122, "step": 1720 }, { "epoch": 7.954022988505747, "grad_norm": 0.6589417835112291, "learning_rate": 1.8203082176287967e-10, "loss": 0.0125, "step": 1730 }, { "epoch": 7.9816091954022985, "step": 1736, "total_flos": 118981985435648.0, "train_loss": 0.19023791693305503, "train_runtime": 9688.5007, "train_samples_per_second": 1.436, "train_steps_per_second": 0.179 } ], "logging_steps": 10, "max_steps": 1736, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 118981985435648.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }