| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 1000, | |
| "global_step": 2975, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01680672268907563, | |
| "grad_norm": 1.6710422039031982, | |
| "learning_rate": 6.7114093959731546e-06, | |
| "loss": 1.2952, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03361344537815126, | |
| "grad_norm": 1.708608865737915, | |
| "learning_rate": 1.3422818791946309e-05, | |
| "loss": 1.4041, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05042016806722689, | |
| "grad_norm": 1.9668169021606445, | |
| "learning_rate": 2.013422818791946e-05, | |
| "loss": 1.2144, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06722689075630252, | |
| "grad_norm": 1.0957667827606201, | |
| "learning_rate": 2.6845637583892618e-05, | |
| "loss": 0.8496, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08403361344537816, | |
| "grad_norm": 0.7974053621292114, | |
| "learning_rate": 3.3557046979865775e-05, | |
| "loss": 0.642, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10084033613445378, | |
| "grad_norm": 0.6723673939704895, | |
| "learning_rate": 4.026845637583892e-05, | |
| "loss": 0.5978, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11764705882352941, | |
| "grad_norm": 0.7411757707595825, | |
| "learning_rate": 4.697986577181208e-05, | |
| "loss": 0.5342, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.13445378151260504, | |
| "grad_norm": 0.7030589580535889, | |
| "learning_rate": 5.3691275167785237e-05, | |
| "loss": 0.536, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.15126050420168066, | |
| "grad_norm": 0.5388085842132568, | |
| "learning_rate": 6.04026845637584e-05, | |
| "loss": 0.5294, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.16806722689075632, | |
| "grad_norm": 0.5648273229598999, | |
| "learning_rate": 6.711409395973155e-05, | |
| "loss": 0.5288, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.18487394957983194, | |
| "grad_norm": 0.5652063488960266, | |
| "learning_rate": 7.382550335570471e-05, | |
| "loss": 0.5467, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.20168067226890757, | |
| "grad_norm": 0.5379688739776611, | |
| "learning_rate": 8.053691275167784e-05, | |
| "loss": 0.5057, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2184873949579832, | |
| "grad_norm": 0.7668149471282959, | |
| "learning_rate": 8.7248322147651e-05, | |
| "loss": 0.5094, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 0.5256843566894531, | |
| "learning_rate": 9.395973154362417e-05, | |
| "loss": 0.5195, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.25210084033613445, | |
| "grad_norm": 0.5416532754898071, | |
| "learning_rate": 0.00010067114093959733, | |
| "loss": 0.4653, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2689075630252101, | |
| "grad_norm": 0.474119633436203, | |
| "learning_rate": 0.00010738255033557047, | |
| "loss": 0.4979, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.5296077132225037, | |
| "learning_rate": 0.00011409395973154362, | |
| "loss": 0.5004, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3025210084033613, | |
| "grad_norm": 0.4529200494289398, | |
| "learning_rate": 0.0001208053691275168, | |
| "loss": 0.4975, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.31932773109243695, | |
| "grad_norm": 0.4488178491592407, | |
| "learning_rate": 0.00012751677852348994, | |
| "loss": 0.4779, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.33613445378151263, | |
| "grad_norm": 0.4906866252422333, | |
| "learning_rate": 0.0001342281879194631, | |
| "loss": 0.5172, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.35294117647058826, | |
| "grad_norm": 0.647227942943573, | |
| "learning_rate": 0.00014093959731543624, | |
| "loss": 0.4858, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3697478991596639, | |
| "grad_norm": 0.5192333459854126, | |
| "learning_rate": 0.00014765100671140942, | |
| "loss": 0.5518, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3865546218487395, | |
| "grad_norm": 0.4241911470890045, | |
| "learning_rate": 0.00015436241610738256, | |
| "loss": 0.4668, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.40336134453781514, | |
| "grad_norm": 0.49162909388542175, | |
| "learning_rate": 0.0001610738255033557, | |
| "loss": 0.5524, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.42016806722689076, | |
| "grad_norm": 0.45716652274131775, | |
| "learning_rate": 0.00016778523489932888, | |
| "loss": 0.4681, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4369747899159664, | |
| "grad_norm": 0.46797800064086914, | |
| "learning_rate": 0.000174496644295302, | |
| "loss": 0.4922, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.453781512605042, | |
| "grad_norm": 0.5456430315971375, | |
| "learning_rate": 0.00018120805369127517, | |
| "loss": 0.4656, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 0.41116565465927124, | |
| "learning_rate": 0.00018791946308724833, | |
| "loss": 0.4973, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.48739495798319327, | |
| "grad_norm": 0.47659677267074585, | |
| "learning_rate": 0.0001946308724832215, | |
| "loss": 0.4703, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5042016806722689, | |
| "grad_norm": 0.4758327007293701, | |
| "learning_rate": 0.00019999972455632807, | |
| "loss": 0.4574, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5210084033613446, | |
| "grad_norm": 0.38325995206832886, | |
| "learning_rate": 0.00019999008418713482, | |
| "loss": 0.4596, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5378151260504201, | |
| "grad_norm": 0.5056403875350952, | |
| "learning_rate": 0.00019996667315169163, | |
| "loss": 0.4853, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5546218487394958, | |
| "grad_norm": 0.49656251072883606, | |
| "learning_rate": 0.00019992949467417378, | |
| "loss": 0.4407, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.49380919337272644, | |
| "learning_rate": 0.00019987855387481305, | |
| "loss": 0.4614, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 0.47169238328933716, | |
| "learning_rate": 0.0001998138577691925, | |
| "loss": 0.4619, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6050420168067226, | |
| "grad_norm": 0.34966766834259033, | |
| "learning_rate": 0.00019973541526728048, | |
| "loss": 0.4828, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6218487394957983, | |
| "grad_norm": 0.4971473217010498, | |
| "learning_rate": 0.0001996432371722033, | |
| "loss": 0.4406, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6386554621848739, | |
| "grad_norm": 0.3743559420108795, | |
| "learning_rate": 0.00019953733617875759, | |
| "loss": 0.4854, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6554621848739496, | |
| "grad_norm": 0.3752104341983795, | |
| "learning_rate": 0.00019941772687166192, | |
| "loss": 0.4921, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6722689075630253, | |
| "grad_norm": 0.4422326982021332, | |
| "learning_rate": 0.00019928442572354814, | |
| "loss": 0.4802, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6890756302521008, | |
| "grad_norm": 0.4718156158924103, | |
| "learning_rate": 0.00019913745109269288, | |
| "loss": 0.5042, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 0.4682941734790802, | |
| "learning_rate": 0.00019897682322048908, | |
| "loss": 0.4865, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7226890756302521, | |
| "grad_norm": 0.5025270581245422, | |
| "learning_rate": 0.0001988025642286585, | |
| "loss": 0.4781, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7394957983193278, | |
| "grad_norm": 0.6233184337615967, | |
| "learning_rate": 0.000198614698116205, | |
| "loss": 0.5091, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7563025210084033, | |
| "grad_norm": 0.354615718126297, | |
| "learning_rate": 0.00019841325075610944, | |
| "loss": 0.5004, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.773109243697479, | |
| "grad_norm": 0.42839425802230835, | |
| "learning_rate": 0.00019819824989176645, | |
| "loss": 0.4652, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.7899159663865546, | |
| "grad_norm": 0.379213809967041, | |
| "learning_rate": 0.00019796972513316355, | |
| "loss": 0.4739, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8067226890756303, | |
| "grad_norm": 0.4357176423072815, | |
| "learning_rate": 0.00019772770795280329, | |
| "loss": 0.5072, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8235294117647058, | |
| "grad_norm": 0.4875657856464386, | |
| "learning_rate": 0.00019747223168136886, | |
| "loss": 0.5094, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8403361344537815, | |
| "grad_norm": 0.4749436676502228, | |
| "learning_rate": 0.00019720333150313377, | |
| "loss": 0.4812, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 0.446424663066864, | |
| "learning_rate": 0.0001969210444511162, | |
| "loss": 0.4823, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.8739495798319328, | |
| "grad_norm": 0.4699801802635193, | |
| "learning_rate": 0.00019662540940197892, | |
| "loss": 0.4559, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.8907563025210085, | |
| "grad_norm": 0.4818078875541687, | |
| "learning_rate": 0.00019631646707067511, | |
| "loss": 0.4803, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.907563025210084, | |
| "grad_norm": 0.4135447144508362, | |
| "learning_rate": 0.000195994260004841, | |
| "loss": 0.4481, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9243697478991597, | |
| "grad_norm": 0.3625709116458893, | |
| "learning_rate": 0.00019565883257893635, | |
| "loss": 0.4606, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 0.6398768424987793, | |
| "learning_rate": 0.00019531023098813316, | |
| "loss": 0.4819, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.957983193277311, | |
| "grad_norm": 0.40339210629463196, | |
| "learning_rate": 0.00019494850324195353, | |
| "loss": 0.4742, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.9747899159663865, | |
| "grad_norm": 0.37650129199028015, | |
| "learning_rate": 0.00019457369915765798, | |
| "loss": 0.4514, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.9915966386554622, | |
| "grad_norm": 0.41677525639533997, | |
| "learning_rate": 0.00019418587035338432, | |
| "loss": 0.4849, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.0084033613445378, | |
| "grad_norm": 0.44225984811782837, | |
| "learning_rate": 0.0001937850702410391, | |
| "loss": 0.4626, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0252100840336134, | |
| "grad_norm": 0.4943791627883911, | |
| "learning_rate": 0.00019337135401894146, | |
| "loss": 0.4441, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.0420168067226891, | |
| "grad_norm": 0.38774925470352173, | |
| "learning_rate": 0.0001929447786642213, | |
| "loss": 0.4287, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.0588235294117647, | |
| "grad_norm": 0.4631214737892151, | |
| "learning_rate": 0.00019250540292497235, | |
| "loss": 0.4369, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.0756302521008403, | |
| "grad_norm": 0.5732137560844421, | |
| "learning_rate": 0.00019205328731216147, | |
| "loss": 0.4119, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.092436974789916, | |
| "grad_norm": 0.5002462863922119, | |
| "learning_rate": 0.00019158849409129484, | |
| "loss": 0.4172, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1092436974789917, | |
| "grad_norm": 0.6402649879455566, | |
| "learning_rate": 0.00019111108727384296, | |
| "loss": 0.3912, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.1260504201680672, | |
| "grad_norm": 0.5174903869628906, | |
| "learning_rate": 0.00019062113260842476, | |
| "loss": 0.4656, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.1428571428571428, | |
| "grad_norm": 0.4554598033428192, | |
| "learning_rate": 0.0001901186975717529, | |
| "loss": 0.4059, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.1596638655462184, | |
| "grad_norm": 0.480656236410141, | |
| "learning_rate": 0.0001896038513593407, | |
| "loss": 0.4013, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.1764705882352942, | |
| "grad_norm": 0.40827691555023193, | |
| "learning_rate": 0.00018907666487597257, | |
| "loss": 0.4278, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.1932773109243697, | |
| "grad_norm": 0.4276060163974762, | |
| "learning_rate": 0.00018853721072593894, | |
| "loss": 0.4112, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.2100840336134453, | |
| "grad_norm": 0.5786320567131042, | |
| "learning_rate": 0.00018798556320303718, | |
| "loss": 0.4518, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.226890756302521, | |
| "grad_norm": 0.37131550908088684, | |
| "learning_rate": 0.00018742179828033992, | |
| "loss": 0.4132, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.2436974789915967, | |
| "grad_norm": 0.4482327401638031, | |
| "learning_rate": 0.00018684599359973194, | |
| "loss": 0.4196, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.2605042016806722, | |
| "grad_norm": 0.6399847269058228, | |
| "learning_rate": 0.00018625822846121727, | |
| "loss": 0.427, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.2773109243697478, | |
| "grad_norm": 0.5624536871910095, | |
| "learning_rate": 0.0001856585838119981, | |
| "loss": 0.3879, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.2941176470588236, | |
| "grad_norm": 0.4050937592983246, | |
| "learning_rate": 0.0001850471422353265, | |
| "loss": 0.44, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.3109243697478992, | |
| "grad_norm": 0.4883362054824829, | |
| "learning_rate": 0.00018442398793913118, | |
| "loss": 0.416, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.3277310924369747, | |
| "grad_norm": 0.5154753923416138, | |
| "learning_rate": 0.00018378920674442045, | |
| "loss": 0.4266, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.3445378151260505, | |
| "grad_norm": 0.5751391649246216, | |
| "learning_rate": 0.00018314288607346258, | |
| "loss": 0.4331, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.361344537815126, | |
| "grad_norm": 0.5200420618057251, | |
| "learning_rate": 0.0001824851149377463, | |
| "loss": 0.4275, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.3781512605042017, | |
| "grad_norm": 0.5671929121017456, | |
| "learning_rate": 0.00018181598392572203, | |
| "loss": 0.4285, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.3949579831932772, | |
| "grad_norm": 0.5295830368995667, | |
| "learning_rate": 0.0001811355851903259, | |
| "loss": 0.3888, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.4117647058823528, | |
| "grad_norm": 0.4832970201969147, | |
| "learning_rate": 0.0001804440124362885, | |
| "loss": 0.4299, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.4614173173904419, | |
| "learning_rate": 0.00017974136090722985, | |
| "loss": 0.4012, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.4453781512605042, | |
| "grad_norm": 0.5086454153060913, | |
| "learning_rate": 0.00017902772737254238, | |
| "loss": 0.4103, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.46218487394958, | |
| "grad_norm": 0.6145858764648438, | |
| "learning_rate": 0.00017830321011406386, | |
| "loss": 0.4378, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.4789915966386555, | |
| "grad_norm": 0.4497447907924652, | |
| "learning_rate": 0.00017756790891254194, | |
| "loss": 0.4021, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.495798319327731, | |
| "grad_norm": 0.4918227195739746, | |
| "learning_rate": 0.00017682192503389236, | |
| "loss": 0.4364, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.5126050420168067, | |
| "grad_norm": 0.7057837247848511, | |
| "learning_rate": 0.00017606536121525255, | |
| "loss": 0.3721, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.5294117647058822, | |
| "grad_norm": 0.5684515237808228, | |
| "learning_rate": 0.00017529832165083265, | |
| "loss": 0.4458, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.5462184873949578, | |
| "grad_norm": 0.6396010518074036, | |
| "learning_rate": 0.0001745209119775658, | |
| "loss": 0.4156, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.5630252100840336, | |
| "grad_norm": 0.5182276368141174, | |
| "learning_rate": 0.0001737332392605599, | |
| "loss": 0.4167, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.5798319327731094, | |
| "grad_norm": 0.4409577548503876, | |
| "learning_rate": 0.0001729354119783525, | |
| "loss": 0.4272, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.596638655462185, | |
| "grad_norm": 0.4636439085006714, | |
| "learning_rate": 0.00017212754000797107, | |
| "loss": 0.3888, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.6134453781512605, | |
| "grad_norm": 0.5916410684585571, | |
| "learning_rate": 0.00017130973460980077, | |
| "loss": 0.4319, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.6302521008403361, | |
| "grad_norm": 0.48982053995132446, | |
| "learning_rate": 0.00017048210841226155, | |
| "loss": 0.4168, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.6470588235294117, | |
| "grad_norm": 0.8442604541778564, | |
| "learning_rate": 0.00016964477539629702, | |
| "loss": 0.427, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.6638655462184873, | |
| "grad_norm": 0.5364897847175598, | |
| "learning_rate": 0.0001687978508796769, | |
| "loss": 0.4284, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.680672268907563, | |
| "grad_norm": 0.5252152681350708, | |
| "learning_rate": 0.00016794145150111542, | |
| "loss": 0.4081, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.680672268907563, | |
| "eval_loss": 0.469873309135437, | |
| "eval_runtime": 8.5876, | |
| "eval_samples_per_second": 116.564, | |
| "eval_steps_per_second": 14.672, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6974789915966386, | |
| "grad_norm": 0.5419629812240601, | |
| "learning_rate": 0.00016707569520420785, | |
| "loss": 0.4571, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.7142857142857144, | |
| "grad_norm": 0.447343647480011, | |
| "learning_rate": 0.00016620070122118724, | |
| "loss": 0.417, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.73109243697479, | |
| "grad_norm": 0.7582902908325195, | |
| "learning_rate": 0.00016531659005650367, | |
| "loss": 0.4514, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.7478991596638656, | |
| "grad_norm": 0.5247509479522705, | |
| "learning_rate": 0.00016442348347022842, | |
| "loss": 0.4377, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.7647058823529411, | |
| "grad_norm": 0.525404155254364, | |
| "learning_rate": 0.0001635215044612851, | |
| "loss": 0.4159, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.7815126050420167, | |
| "grad_norm": 0.5584174990653992, | |
| "learning_rate": 0.00016261077725051015, | |
| "loss": 0.4243, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.7983193277310925, | |
| "grad_norm": 0.4921722114086151, | |
| "learning_rate": 0.00016169142726354515, | |
| "loss": 0.3873, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.815126050420168, | |
| "grad_norm": 0.5573268532752991, | |
| "learning_rate": 0.00016076358111356332, | |
| "loss": 0.418, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.8319327731092439, | |
| "grad_norm": 0.5113047361373901, | |
| "learning_rate": 0.0001598273665838319, | |
| "loss": 0.4322, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.8487394957983194, | |
| "grad_norm": 0.48914453387260437, | |
| "learning_rate": 0.0001588829126101143, | |
| "loss": 0.431, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.865546218487395, | |
| "grad_norm": 0.4903518259525299, | |
| "learning_rate": 0.0001579303492629126, | |
| "loss": 0.4301, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.8823529411764706, | |
| "grad_norm": 0.6053085923194885, | |
| "learning_rate": 0.0001569698077295545, | |
| "loss": 0.4376, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.8991596638655461, | |
| "grad_norm": 0.6364733576774597, | |
| "learning_rate": 0.00015600142029612596, | |
| "loss": 0.4321, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.9159663865546217, | |
| "grad_norm": 0.5780381560325623, | |
| "learning_rate": 0.00015502532032925283, | |
| "loss": 0.4931, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.9327731092436975, | |
| "grad_norm": 0.5068004131317139, | |
| "learning_rate": 0.0001540416422577335, | |
| "loss": 0.4143, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.949579831932773, | |
| "grad_norm": 0.44138628244400024, | |
| "learning_rate": 0.00015305052155402545, | |
| "loss": 0.4577, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.9663865546218489, | |
| "grad_norm": 0.542020857334137, | |
| "learning_rate": 0.0001520520947155877, | |
| "loss": 0.4184, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.9831932773109244, | |
| "grad_norm": 0.38625264167785645, | |
| "learning_rate": 0.00015104649924608252, | |
| "loss": 0.3933, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.42412513494491577, | |
| "learning_rate": 0.00015003387363643836, | |
| "loss": 0.3885, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.0168067226890756, | |
| "grad_norm": 0.4694175720214844, | |
| "learning_rate": 0.00014901435734577677, | |
| "loss": 0.3581, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.033613445378151, | |
| "grad_norm": 0.5906610488891602, | |
| "learning_rate": 0.0001479880907822061, | |
| "loss": 0.3139, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.0504201680672267, | |
| "grad_norm": 0.6226663589477539, | |
| "learning_rate": 0.00014695521528348447, | |
| "loss": 0.3298, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.0672268907563027, | |
| "grad_norm": 0.5035540461540222, | |
| "learning_rate": 0.00014591587309755464, | |
| "loss": 0.3193, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.0840336134453783, | |
| "grad_norm": 0.5115870237350464, | |
| "learning_rate": 0.00014487020736295354, | |
| "loss": 0.3035, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.100840336134454, | |
| "grad_norm": 0.5900698304176331, | |
| "learning_rate": 0.00014381836208909927, | |
| "loss": 0.3432, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.1176470588235294, | |
| "grad_norm": 0.4748307466506958, | |
| "learning_rate": 0.00014276048213645809, | |
| "loss": 0.3214, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.134453781512605, | |
| "grad_norm": 0.9723687767982483, | |
| "learning_rate": 0.0001416967131965941, | |
| "loss": 0.3571, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.1512605042016806, | |
| "grad_norm": 0.46582403779029846, | |
| "learning_rate": 0.00014062720177210455, | |
| "loss": 0.3357, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.168067226890756, | |
| "grad_norm": 0.6172323226928711, | |
| "learning_rate": 0.00013955209515644363, | |
| "loss": 0.3249, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.184873949579832, | |
| "grad_norm": 0.6465519070625305, | |
| "learning_rate": 0.00013847154141363691, | |
| "loss": 0.3485, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.2016806722689077, | |
| "grad_norm": 0.560625433921814, | |
| "learning_rate": 0.00013738568935789015, | |
| "loss": 0.3311, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.2184873949579833, | |
| "grad_norm": 0.6312292218208313, | |
| "learning_rate": 0.00013629468853309433, | |
| "loss": 0.3137, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.235294117647059, | |
| "grad_norm": 0.7188405990600586, | |
| "learning_rate": 0.00013519868919223069, | |
| "loss": 0.3484, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.2521008403361344, | |
| "grad_norm": 0.4797142446041107, | |
| "learning_rate": 0.00013409784227667762, | |
| "loss": 0.3316, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.26890756302521, | |
| "grad_norm": 0.5650676488876343, | |
| "learning_rate": 0.00013299229939542302, | |
| "loss": 0.3169, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.2857142857142856, | |
| "grad_norm": 0.7440696358680725, | |
| "learning_rate": 0.00013188221280418472, | |
| "loss": 0.3573, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.302521008403361, | |
| "grad_norm": 0.5263564586639404, | |
| "learning_rate": 0.0001307677353844417, | |
| "loss": 0.3157, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.3193277310924367, | |
| "grad_norm": 0.8205832839012146, | |
| "learning_rate": 0.0001296490206223793, | |
| "loss": 0.3192, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.3361344537815127, | |
| "grad_norm": 0.6581016778945923, | |
| "learning_rate": 0.000128526222587751, | |
| "loss": 0.3136, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.3529411764705883, | |
| "grad_norm": 0.6751677393913269, | |
| "learning_rate": 0.00012739949591265975, | |
| "loss": 0.3157, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.369747899159664, | |
| "grad_norm": 0.7323719263076782, | |
| "learning_rate": 0.0001262689957702623, | |
| "loss": 0.3448, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.3865546218487395, | |
| "grad_norm": 0.6688811182975769, | |
| "learning_rate": 0.00012513487785339832, | |
| "loss": 0.3492, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.403361344537815, | |
| "grad_norm": 0.7764780521392822, | |
| "learning_rate": 0.0001239972983531487, | |
| "loss": 0.3318, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.4201680672268906, | |
| "grad_norm": 0.7399850487709045, | |
| "learning_rate": 0.00012285641393732462, | |
| "loss": 0.3171, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.4369747899159666, | |
| "grad_norm": 0.9961610436439514, | |
| "learning_rate": 0.00012171238172889135, | |
| "loss": 0.3426, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.453781512605042, | |
| "grad_norm": 0.6966540813446045, | |
| "learning_rate": 0.00012056535928432902, | |
| "loss": 0.3198, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.4705882352941178, | |
| "grad_norm": 0.5827167630195618, | |
| "learning_rate": 0.00011941550457193418, | |
| "loss": 0.3535, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.4873949579831933, | |
| "grad_norm": 0.6033148765563965, | |
| "learning_rate": 0.00011826297595006417, | |
| "loss": 0.3141, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.504201680672269, | |
| "grad_norm": 0.7176485061645508, | |
| "learning_rate": 0.00011710793214532791, | |
| "loss": 0.3435, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.5210084033613445, | |
| "grad_norm": 0.626823365688324, | |
| "learning_rate": 0.00011595053223072623, | |
| "loss": 0.3532, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.53781512605042, | |
| "grad_norm": 0.6843551993370056, | |
| "learning_rate": 0.00011479093560374406, | |
| "loss": 0.327, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.5546218487394956, | |
| "grad_norm": 0.7965649962425232, | |
| "learning_rate": 0.00011362930196439832, | |
| "loss": 0.3564, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 0.5511319041252136, | |
| "learning_rate": 0.00011246579129324393, | |
| "loss": 0.3281, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.588235294117647, | |
| "grad_norm": 0.6023262143135071, | |
| "learning_rate": 0.00011130056382934115, | |
| "loss": 0.3361, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.6050420168067228, | |
| "grad_norm": 0.7548812627792358, | |
| "learning_rate": 0.00011013378004818762, | |
| "loss": 0.3317, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.6218487394957983, | |
| "grad_norm": 0.5712910294532776, | |
| "learning_rate": 0.00010896560063961732, | |
| "loss": 0.3243, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.638655462184874, | |
| "grad_norm": 0.686442494392395, | |
| "learning_rate": 0.00010779618648567058, | |
| "loss": 0.3443, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.6554621848739495, | |
| "grad_norm": 0.6805222034454346, | |
| "learning_rate": 0.00010662569863843725, | |
| "loss": 0.3314, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.6722689075630255, | |
| "grad_norm": 0.776661217212677, | |
| "learning_rate": 0.00010545429829787659, | |
| "loss": 0.3219, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.689075630252101, | |
| "grad_norm": 0.5905969142913818, | |
| "learning_rate": 0.00010428214678961668, | |
| "loss": 0.3445, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.7058823529411766, | |
| "grad_norm": 0.5671495199203491, | |
| "learning_rate": 0.00010310940554273663, | |
| "loss": 0.3459, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.722689075630252, | |
| "grad_norm": 0.7796810269355774, | |
| "learning_rate": 0.00010193623606753464, | |
| "loss": 0.3121, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.7394957983193278, | |
| "grad_norm": 0.6266621947288513, | |
| "learning_rate": 0.00010076279993328455, | |
| "loss": 0.34, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.7563025210084033, | |
| "grad_norm": 0.6675502061843872, | |
| "learning_rate": 9.958925874598464e-05, | |
| "loss": 0.298, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.773109243697479, | |
| "grad_norm": 0.6532730460166931, | |
| "learning_rate": 9.841577412610108e-05, | |
| "loss": 0.3519, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.7899159663865545, | |
| "grad_norm": 0.8185803890228271, | |
| "learning_rate": 9.724250768630959e-05, | |
| "loss": 0.3373, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.80672268907563, | |
| "grad_norm": 0.6204152703285217, | |
| "learning_rate": 9.606962100923807e-05, | |
| "loss": 0.3061, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.8235294117647056, | |
| "grad_norm": 0.5997849702835083, | |
| "learning_rate": 9.489727562521332e-05, | |
| "loss": 0.3478, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.8403361344537816, | |
| "grad_norm": 0.6466342806816101, | |
| "learning_rate": 9.372563299001514e-05, | |
| "loss": 0.3242, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.8826155662536621, | |
| "learning_rate": 9.255485446264046e-05, | |
| "loss": 0.3093, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.8739495798319328, | |
| "grad_norm": 0.9787838459014893, | |
| "learning_rate": 9.138510128308117e-05, | |
| "loss": 0.3608, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.8907563025210083, | |
| "grad_norm": 0.7216944098472595, | |
| "learning_rate": 9.021653455011778e-05, | |
| "loss": 0.3247, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.907563025210084, | |
| "grad_norm": 0.7170887589454651, | |
| "learning_rate": 8.904931519913304e-05, | |
| "loss": 0.354, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.92436974789916, | |
| "grad_norm": 0.6556657552719116, | |
| "learning_rate": 8.788360397994785e-05, | |
| "loss": 0.3406, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.9411764705882355, | |
| "grad_norm": 0.7959017157554626, | |
| "learning_rate": 8.671956143468267e-05, | |
| "loss": 0.3388, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.957983193277311, | |
| "grad_norm": 0.7491987347602844, | |
| "learning_rate": 8.55573478756477e-05, | |
| "loss": 0.3676, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.9747899159663866, | |
| "grad_norm": 0.7010266780853271, | |
| "learning_rate": 8.439712336326452e-05, | |
| "loss": 0.3152, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.991596638655462, | |
| "grad_norm": 0.7219695448875427, | |
| "learning_rate": 8.32390476840227e-05, | |
| "loss": 0.3575, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.008403361344538, | |
| "grad_norm": 0.8448328971862793, | |
| "learning_rate": 8.208328032847366e-05, | |
| "loss": 0.2936, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.0252100840336134, | |
| "grad_norm": 0.575706958770752, | |
| "learning_rate": 8.092998046926583e-05, | |
| "loss": 0.2582, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.042016806722689, | |
| "grad_norm": 0.8433834910392761, | |
| "learning_rate": 7.977930693922329e-05, | |
| "loss": 0.2324, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.0588235294117645, | |
| "grad_norm": 0.8530866503715515, | |
| "learning_rate": 7.863141820947114e-05, | |
| "loss": 0.2219, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.0756302521008405, | |
| "grad_norm": 0.7315991520881653, | |
| "learning_rate": 7.748647236761094e-05, | |
| "loss": 0.2201, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.092436974789916, | |
| "grad_norm": 0.7668396234512329, | |
| "learning_rate": 7.63446270959488e-05, | |
| "loss": 0.2664, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.1092436974789917, | |
| "grad_norm": 0.7540835738182068, | |
| "learning_rate": 7.520603964977939e-05, | |
| "loss": 0.2426, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.1260504201680672, | |
| "grad_norm": 0.756682276725769, | |
| "learning_rate": 7.407086683572846e-05, | |
| "loss": 0.2297, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.142857142857143, | |
| "grad_norm": 0.7847204804420471, | |
| "learning_rate": 7.293926499015763e-05, | |
| "loss": 0.257, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.1596638655462184, | |
| "grad_norm": 0.909298837184906, | |
| "learning_rate": 7.181138995763362e-05, | |
| "loss": 0.2374, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.176470588235294, | |
| "grad_norm": 1.1145635843276978, | |
| "learning_rate": 7.068739706946517e-05, | |
| "loss": 0.2389, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.19327731092437, | |
| "grad_norm": 0.7492973804473877, | |
| "learning_rate": 6.956744112231095e-05, | |
| "loss": 0.2466, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.2100840336134455, | |
| "grad_norm": 0.7049612998962402, | |
| "learning_rate": 6.845167635686083e-05, | |
| "loss": 0.2369, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.226890756302521, | |
| "grad_norm": 0.6115100979804993, | |
| "learning_rate": 6.73402564365938e-05, | |
| "loss": 0.216, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.2436974789915967, | |
| "grad_norm": 0.7355627417564392, | |
| "learning_rate": 6.623333442661532e-05, | |
| "loss": 0.2449, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.2605042016806722, | |
| "grad_norm": 0.8105019927024841, | |
| "learning_rate": 6.513106277257728e-05, | |
| "loss": 0.2361, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.277310924369748, | |
| "grad_norm": 0.6543210744857788, | |
| "learning_rate": 6.403359327968303e-05, | |
| "loss": 0.2136, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.2941176470588234, | |
| "grad_norm": 1.191417932510376, | |
| "learning_rate": 6.294107709178079e-05, | |
| "loss": 0.2457, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.310924369747899, | |
| "grad_norm": 0.8825222849845886, | |
| "learning_rate": 6.185366467054799e-05, | |
| "loss": 0.2184, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.327731092436975, | |
| "grad_norm": 0.8879269361495972, | |
| "learning_rate": 6.0771505774769734e-05, | |
| "loss": 0.2419, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.3445378151260505, | |
| "grad_norm": 0.692824125289917, | |
| "learning_rate": 5.9694749439713936e-05, | |
| "loss": 0.2559, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.361344537815126, | |
| "grad_norm": 0.7757511138916016, | |
| "learning_rate": 5.8623543956606074e-05, | |
| "loss": 0.2383, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.361344537815126, | |
| "eval_loss": 0.5869996547698975, | |
| "eval_runtime": 8.7439, | |
| "eval_samples_per_second": 114.479, | |
| "eval_steps_per_second": 14.41, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.3781512605042017, | |
| "grad_norm": 0.8735210299491882, | |
| "learning_rate": 5.7558036852206596e-05, | |
| "loss": 0.2554, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.3949579831932772, | |
| "grad_norm": 0.9418761134147644, | |
| "learning_rate": 5.6498374868493296e-05, | |
| "loss": 0.2682, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.411764705882353, | |
| "grad_norm": 0.7112303972244263, | |
| "learning_rate": 5.544470394245225e-05, | |
| "loss": 0.2347, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.4285714285714284, | |
| "grad_norm": 1.082740068435669, | |
| "learning_rate": 5.439716918597906e-05, | |
| "loss": 0.2561, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 3.4453781512605044, | |
| "grad_norm": 0.7755303382873535, | |
| "learning_rate": 5.335591486589406e-05, | |
| "loss": 0.2087, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.46218487394958, | |
| "grad_norm": 0.8446845412254333, | |
| "learning_rate": 5.232108438407399e-05, | |
| "loss": 0.2323, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 3.4789915966386555, | |
| "grad_norm": 0.8153941035270691, | |
| "learning_rate": 5.129282025770252e-05, | |
| "loss": 0.2275, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 3.495798319327731, | |
| "grad_norm": 0.7065542340278625, | |
| "learning_rate": 5.027126409964271e-05, | |
| "loss": 0.2136, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 3.5126050420168067, | |
| "grad_norm": 1.1057276725769043, | |
| "learning_rate": 4.9256556598934054e-05, | |
| "loss": 0.2381, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.5294117647058822, | |
| "grad_norm": 0.6242520809173584, | |
| "learning_rate": 4.82488375014169e-05, | |
| "loss": 0.2394, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.546218487394958, | |
| "grad_norm": 0.6137174367904663, | |
| "learning_rate": 4.7248245590486375e-05, | |
| "loss": 0.2411, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.5630252100840334, | |
| "grad_norm": 0.8595492839813232, | |
| "learning_rate": 4.625491866797917e-05, | |
| "loss": 0.2383, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.5798319327731094, | |
| "grad_norm": 0.7664365768432617, | |
| "learning_rate": 4.526899353519557e-05, | |
| "loss": 0.264, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.596638655462185, | |
| "grad_norm": 1.2114841938018799, | |
| "learning_rate": 4.4290605974058995e-05, | |
| "loss": 0.2412, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.6134453781512605, | |
| "grad_norm": 0.8489940166473389, | |
| "learning_rate": 4.331989072841604e-05, | |
| "loss": 0.2408, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.630252100840336, | |
| "grad_norm": 0.7826299071311951, | |
| "learning_rate": 4.235698148547955e-05, | |
| "loss": 0.2347, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.6470588235294117, | |
| "grad_norm": 0.9073792099952698, | |
| "learning_rate": 4.140201085741731e-05, | |
| "loss": 0.2454, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.6638655462184873, | |
| "grad_norm": 0.8302606344223022, | |
| "learning_rate": 4.0455110363088424e-05, | |
| "loss": 0.2381, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.6806722689075633, | |
| "grad_norm": 0.7296493649482727, | |
| "learning_rate": 3.9516410409930594e-05, | |
| "loss": 0.2432, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.697478991596639, | |
| "grad_norm": 0.8790242075920105, | |
| "learning_rate": 3.858604027600049e-05, | |
| "loss": 0.2387, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.7142857142857144, | |
| "grad_norm": 0.8583736419677734, | |
| "learning_rate": 3.766412809216937e-05, | |
| "loss": 0.2482, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.73109243697479, | |
| "grad_norm": 0.8858778476715088, | |
| "learning_rate": 3.675080082447688e-05, | |
| "loss": 0.2524, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.7478991596638656, | |
| "grad_norm": 0.896706759929657, | |
| "learning_rate": 3.584618425664526e-05, | |
| "loss": 0.2477, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.764705882352941, | |
| "grad_norm": 0.7617205381393433, | |
| "learning_rate": 3.495040297275651e-05, | |
| "loss": 0.2194, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 3.7815126050420167, | |
| "grad_norm": 0.7257958650588989, | |
| "learning_rate": 3.40635803400944e-05, | |
| "loss": 0.2414, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 3.7983193277310923, | |
| "grad_norm": 0.83749920129776, | |
| "learning_rate": 3.3185838492154386e-05, | |
| "loss": 0.2218, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 3.815126050420168, | |
| "grad_norm": 1.028795599937439, | |
| "learning_rate": 3.231729831182344e-05, | |
| "loss": 0.2312, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 3.831932773109244, | |
| "grad_norm": 0.8044925332069397, | |
| "learning_rate": 3.1458079414731955e-05, | |
| "loss": 0.2317, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 3.8487394957983194, | |
| "grad_norm": 0.7760542035102844, | |
| "learning_rate": 3.060830013278012e-05, | |
| "loss": 0.2222, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 3.865546218487395, | |
| "grad_norm": 0.7686411738395691, | |
| "learning_rate": 2.9768077497841352e-05, | |
| "loss": 0.2313, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.8823529411764706, | |
| "grad_norm": 0.700995147228241, | |
| "learning_rate": 2.8937527225644677e-05, | |
| "loss": 0.231, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 3.899159663865546, | |
| "grad_norm": 0.9482907652854919, | |
| "learning_rate": 2.8116763699838167e-05, | |
| "loss": 0.2504, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 3.9159663865546217, | |
| "grad_norm": 0.9015063047409058, | |
| "learning_rate": 2.7305899956236046e-05, | |
| "loss": 0.24, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 3.9327731092436977, | |
| "grad_norm": 0.9054192900657654, | |
| "learning_rate": 2.6505047667251437e-05, | |
| "loss": 0.2478, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 3.9495798319327733, | |
| "grad_norm": 0.9170212745666504, | |
| "learning_rate": 2.5714317126516763e-05, | |
| "loss": 0.2419, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 3.966386554621849, | |
| "grad_norm": 1.029768466949463, | |
| "learning_rate": 2.493381723369398e-05, | |
| "loss": 0.2341, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 3.9831932773109244, | |
| "grad_norm": 0.6440130472183228, | |
| "learning_rate": 2.4163655479476943e-05, | |
| "loss": 0.213, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.8131651282310486, | |
| "learning_rate": 2.340393793078789e-05, | |
| "loss": 0.2302, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 4.016806722689076, | |
| "grad_norm": 0.7478677034378052, | |
| "learning_rate": 2.2654769216169602e-05, | |
| "loss": 0.1944, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 4.033613445378151, | |
| "grad_norm": 0.7543561458587646, | |
| "learning_rate": 2.1916252511376133e-05, | |
| "loss": 0.172, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.050420168067227, | |
| "grad_norm": 0.7355443239212036, | |
| "learning_rate": 2.1188489525163367e-05, | |
| "loss": 0.166, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 4.067226890756302, | |
| "grad_norm": 0.7719018459320068, | |
| "learning_rate": 2.047158048528165e-05, | |
| "loss": 0.1421, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 4.084033613445378, | |
| "grad_norm": 0.7346205115318298, | |
| "learning_rate": 1.976562412467232e-05, | |
| "loss": 0.174, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 4.100840336134453, | |
| "grad_norm": 0.960121214389801, | |
| "learning_rate": 1.9070717667870263e-05, | |
| "loss": 0.1725, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 4.117647058823529, | |
| "grad_norm": 0.8343680500984192, | |
| "learning_rate": 1.838695681761412e-05, | |
| "loss": 0.1777, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 4.1344537815126055, | |
| "grad_norm": 0.9134616851806641, | |
| "learning_rate": 1.7714435741666037e-05, | |
| "loss": 0.1541, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 4.151260504201681, | |
| "grad_norm": 0.9840583801269531, | |
| "learning_rate": 1.7053247059842793e-05, | |
| "loss": 0.1905, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 4.168067226890757, | |
| "grad_norm": 0.7404395341873169, | |
| "learning_rate": 1.6403481831260338e-05, | |
| "loss": 0.1803, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 4.184873949579832, | |
| "grad_norm": 1.0390299558639526, | |
| "learning_rate": 1.5765229541792982e-05, | |
| "loss": 0.1691, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 4.201680672268908, | |
| "grad_norm": 0.8568964600563049, | |
| "learning_rate": 1.5138578091749334e-05, | |
| "loss": 0.1706, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.218487394957983, | |
| "grad_norm": 0.9740147590637207, | |
| "learning_rate": 1.4523613783766732e-05, | |
| "loss": 0.1795, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 4.235294117647059, | |
| "grad_norm": 0.9405394792556763, | |
| "learning_rate": 1.3920421310925636e-05, | |
| "loss": 0.1614, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 4.2521008403361344, | |
| "grad_norm": 0.8562027215957642, | |
| "learning_rate": 1.3329083745085547e-05, | |
| "loss": 0.1621, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 4.26890756302521, | |
| "grad_norm": 0.8962126970291138, | |
| "learning_rate": 1.2749682525444407e-05, | |
| "loss": 0.1642, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 4.285714285714286, | |
| "grad_norm": 0.9526483416557312, | |
| "learning_rate": 1.2182297447322787e-05, | |
| "loss": 0.1581, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 4.302521008403361, | |
| "grad_norm": 0.7188050746917725, | |
| "learning_rate": 1.1627006651174444e-05, | |
| "loss": 0.1861, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 4.319327731092437, | |
| "grad_norm": 0.8526328206062317, | |
| "learning_rate": 1.10838866118247e-05, | |
| "loss": 0.1589, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 4.336134453781512, | |
| "grad_norm": 0.845211923122406, | |
| "learning_rate": 1.0553012127938389e-05, | |
| "loss": 0.1958, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 4.352941176470588, | |
| "grad_norm": 0.9076491594314575, | |
| "learning_rate": 1.0034456311718609e-05, | |
| "loss": 0.1664, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 4.369747899159664, | |
| "grad_norm": 1.0046247243881226, | |
| "learning_rate": 9.528290578837551e-06, | |
| "loss": 0.1713, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 4.38655462184874, | |
| "grad_norm": 0.7860477566719055, | |
| "learning_rate": 9.034584638601196e-06, | |
| "loss": 0.1494, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 4.4033613445378155, | |
| "grad_norm": 1.0020995140075684, | |
| "learning_rate": 8.553406484348914e-06, | |
| "loss": 0.1823, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 4.420168067226891, | |
| "grad_norm": 1.2964270114898682, | |
| "learning_rate": 8.084822384089453e-06, | |
| "loss": 0.1825, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 4.436974789915967, | |
| "grad_norm": 1.0893545150756836, | |
| "learning_rate": 7.628896871374358e-06, | |
| "loss": 0.1799, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 4.453781512605042, | |
| "grad_norm": 0.8512413501739502, | |
| "learning_rate": 7.185692736410477e-06, | |
| "loss": 0.1707, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 4.470588235294118, | |
| "grad_norm": 1.3223856687545776, | |
| "learning_rate": 6.755271017412501e-06, | |
| "loss": 0.1748, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 4.487394957983193, | |
| "grad_norm": 1.0431852340698242, | |
| "learning_rate": 6.337690992196688e-06, | |
| "loss": 0.1703, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 4.504201680672269, | |
| "grad_norm": 1.3509666919708252, | |
| "learning_rate": 5.933010170017139e-06, | |
| "loss": 0.1717, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 4.5210084033613445, | |
| "grad_norm": 0.9030164480209351, | |
| "learning_rate": 5.54128428364562e-06, | |
| "loss": 0.1866, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 4.53781512605042, | |
| "grad_norm": 0.9294901490211487, | |
| "learning_rate": 5.162567281696018e-06, | |
| "loss": 0.1659, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.554621848739496, | |
| "grad_norm": 0.887606680393219, | |
| "learning_rate": 4.796911321194475e-06, | |
| "loss": 0.166, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 4.571428571428571, | |
| "grad_norm": 0.8213235139846802, | |
| "learning_rate": 4.444366760396357e-06, | |
| "loss": 0.1768, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 4.588235294117647, | |
| "grad_norm": 0.9339560270309448, | |
| "learning_rate": 4.104982151850911e-06, | |
| "loss": 0.1769, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 4.605042016806722, | |
| "grad_norm": 0.8878592252731323, | |
| "learning_rate": 3.7788042357145616e-06, | |
| "loss": 0.1708, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 4.621848739495798, | |
| "grad_norm": 0.8466598987579346, | |
| "learning_rate": 3.465877933313799e-06, | |
| "loss": 0.176, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.6386554621848735, | |
| "grad_norm": 0.8101462721824646, | |
| "learning_rate": 3.166246340958712e-06, | |
| "loss": 0.1722, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 4.65546218487395, | |
| "grad_norm": 1.1340394020080566, | |
| "learning_rate": 2.8799507240076406e-06, | |
| "loss": 0.1851, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 4.6722689075630255, | |
| "grad_norm": 1.2405331134796143, | |
| "learning_rate": 2.607030511184183e-06, | |
| "loss": 0.1671, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 4.689075630252101, | |
| "grad_norm": 0.8664318323135376, | |
| "learning_rate": 2.347523289146991e-06, | |
| "loss": 0.1728, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 4.705882352941177, | |
| "grad_norm": 0.9817457795143127, | |
| "learning_rate": 2.101464797313357e-06, | |
| "loss": 0.1549, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.722689075630252, | |
| "grad_norm": 0.8827335834503174, | |
| "learning_rate": 1.868888922937162e-06, | |
| "loss": 0.1423, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 4.739495798319328, | |
| "grad_norm": 0.7880733609199524, | |
| "learning_rate": 1.649827696441919e-06, | |
| "loss": 0.1571, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 4.756302521008403, | |
| "grad_norm": 1.198318600654602, | |
| "learning_rate": 1.4443112870095588e-06, | |
| "loss": 0.1624, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 4.773109243697479, | |
| "grad_norm": 0.7685941457748413, | |
| "learning_rate": 1.25236799842543e-06, | |
| "loss": 0.1609, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 4.7899159663865545, | |
| "grad_norm": 0.929218053817749, | |
| "learning_rate": 1.0740242651803956e-06, | |
| "loss": 0.1664, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 4.80672268907563, | |
| "grad_norm": 0.9220144152641296, | |
| "learning_rate": 9.093046488302337e-07, | |
| "loss": 0.1793, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 4.823529411764706, | |
| "grad_norm": 0.8192142844200134, | |
| "learning_rate": 7.582318346129657e-07, | |
| "loss": 0.1641, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 4.840336134453781, | |
| "grad_norm": 0.8484081029891968, | |
| "learning_rate": 6.20826628324711e-07, | |
| "loss": 0.1508, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 4.857142857142857, | |
| "grad_norm": 1.0260757207870483, | |
| "learning_rate": 4.971079534542345e-07, | |
| "loss": 0.1628, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 4.873949579831933, | |
| "grad_norm": 0.9038840532302856, | |
| "learning_rate": 3.8709284857687546e-07, | |
| "loss": 0.1684, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 4.890756302521009, | |
| "grad_norm": 0.7963437438011169, | |
| "learning_rate": 2.9079646500792534e-07, | |
| "loss": 0.1619, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 4.907563025210084, | |
| "grad_norm": 0.8787494897842407, | |
| "learning_rate": 2.0823206471599632e-07, | |
| "loss": 0.1612, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 4.92436974789916, | |
| "grad_norm": 1.0699881315231323, | |
| "learning_rate": 1.3941101849660466e-07, | |
| "loss": 0.1647, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 4.9411764705882355, | |
| "grad_norm": 0.6501001715660095, | |
| "learning_rate": 8.434280440616782e-08, | |
| "loss": 0.1611, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 4.957983193277311, | |
| "grad_norm": 0.9485645890235901, | |
| "learning_rate": 4.3035006456670915e-08, | |
| "loss": 0.1694, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 4.974789915966387, | |
| "grad_norm": 0.8494912385940552, | |
| "learning_rate": 1.5493313571202093e-08, | |
| "loss": 0.1695, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 4.991596638655462, | |
| "grad_norm": 0.9211341738700867, | |
| "learning_rate": 1.7215188004904293e-09, | |
| "loss": 0.155, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 2975, | |
| "total_flos": 2.1667263896407245e+17, | |
| "train_loss": 0.34065224741687294, | |
| "train_runtime": 1015.5372, | |
| "train_samples_per_second": 93.62, | |
| "train_steps_per_second": 2.929 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2975, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.1667263896407245e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |