{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.105875, "eval_steps": 500, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00125, "grad_norm": 0.32223036885261536, "learning_rate": 9.890999999999999e-06, "loss": 2.9122833251953124, "step": 10 }, { "epoch": 0.0025, "grad_norm": 0.3097129762172699, "learning_rate": 2.0881000000000002e-05, "loss": 2.881389617919922, "step": 20 }, { "epoch": 0.00375, "grad_norm": 0.30452761054039, "learning_rate": 3.1871e-05, "loss": 2.8967803955078124, "step": 30 }, { "epoch": 0.005, "grad_norm": 0.2955208420753479, "learning_rate": 4.2861e-05, "loss": 2.8681930541992187, "step": 40 }, { "epoch": 0.00625, "grad_norm": 0.303114652633667, "learning_rate": 5.3850999999999997e-05, "loss": 2.8751144409179688, "step": 50 }, { "epoch": 0.0075, "grad_norm": 0.299868106842041, "learning_rate": 6.4841e-05, "loss": 2.8749458312988283, "step": 60 }, { "epoch": 0.00875, "grad_norm": 0.31019559502601624, "learning_rate": 7.5831e-05, "loss": 2.860850524902344, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.30683860182762146, "learning_rate": 8.6821e-05, "loss": 2.8604888916015625, "step": 80 }, { "epoch": 0.01125, "grad_norm": 0.30718618631362915, "learning_rate": 9.7811e-05, "loss": 2.8761545181274415, "step": 90 }, { "epoch": 0.0125, "grad_norm": 0.32211124897003174, "learning_rate": 0.000108801, "loss": 2.8468536376953124, "step": 100 }, { "epoch": 0.01375, "grad_norm": 0.31109386682510376, "learning_rate": 0.000119791, "loss": 2.8445552825927733, "step": 110 }, { "epoch": 0.015, "grad_norm": 0.3102831542491913, "learning_rate": 0.000130781, "loss": 2.864554214477539, "step": 120 }, { "epoch": 0.01625, "grad_norm": 0.3220812976360321, "learning_rate": 0.000141771, "loss": 2.8806329727172852, "step": 130 }, { "epoch": 0.0175, "grad_norm": 0.30876684188842773, "learning_rate": 0.00015276099999999998, "loss": 2.8452987670898438, "step": 140 }, { "epoch": 0.01875, "grad_norm": 0.31868258118629456, "learning_rate": 0.000163751, "loss": 2.8517589569091797, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.3087399899959564, "learning_rate": 0.000174741, "loss": 2.8347522735595705, "step": 160 }, { "epoch": 0.02125, "grad_norm": 0.3106062710285187, "learning_rate": 0.000185731, "loss": 2.85534553527832, "step": 170 }, { "epoch": 0.0225, "grad_norm": 0.32315531373023987, "learning_rate": 0.00019672100000000002, "loss": 2.858936309814453, "step": 180 }, { "epoch": 0.02375, "grad_norm": 0.3293415307998657, "learning_rate": 0.000207711, "loss": 2.8992713928222655, "step": 190 }, { "epoch": 0.025, "grad_norm": 0.3309278190135956, "learning_rate": 0.000218701, "loss": 2.863359832763672, "step": 200 }, { "epoch": 0.02625, "grad_norm": 0.3089866638183594, "learning_rate": 0.0002197992779574687, "loss": 2.8769275665283205, "step": 210 }, { "epoch": 0.0275, "grad_norm": 0.32934558391571045, "learning_rate": 0.0002197967820201583, "loss": 2.8595829010009766, "step": 220 }, { "epoch": 0.02875, "grad_norm": 0.3154727518558502, "learning_rate": 0.00021979250331444358, "loss": 2.8655704498291015, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.31462976336479187, "learning_rate": 0.0002197864419097345, "loss": 2.8554920196533202, "step": 240 }, { "epoch": 0.03125, "grad_norm": 0.3277083933353424, "learning_rate": 0.00021977859790436047, "loss": 2.896647262573242, "step": 250 }, { "epoch": 0.0325, "grad_norm": 0.3230266571044922, "learning_rate": 0.00021976897142556858, "loss": 2.8914859771728514, "step": 260 }, { "epoch": 0.03375, "grad_norm": 0.3097373843193054, "learning_rate": 0.00021975756262952153, "loss": 2.867509460449219, "step": 270 }, { "epoch": 0.035, "grad_norm": 0.31526896357536316, "learning_rate": 0.00021974437170129525, "loss": 2.861627388000488, "step": 280 }, { "epoch": 0.03625, "grad_norm": 0.3264971077442169, "learning_rate": 0.0002197293988548756, "loss": 2.8434619903564453, "step": 290 }, { "epoch": 0.0375, "grad_norm": 0.31985947489738464, "learning_rate": 0.00021971264433315533, "loss": 2.858683776855469, "step": 300 }, { "epoch": 0.03875, "grad_norm": 0.3007575571537018, "learning_rate": 0.00021969410840792965, "loss": 2.856831359863281, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.32181960344314575, "learning_rate": 0.00021967379137989224, "loss": 2.8669090270996094, "step": 320 }, { "epoch": 0.04125, "grad_norm": 0.3142366111278534, "learning_rate": 0.00021965169357863014, "loss": 2.864155578613281, "step": 330 }, { "epoch": 0.0425, "grad_norm": 0.31411442160606384, "learning_rate": 0.00021962781536261853, "loss": 2.8719043731689453, "step": 340 }, { "epoch": 0.04375, "grad_norm": 0.3069416880607605, "learning_rate": 0.00021960215711921467, "loss": 2.8788784027099608, "step": 350 }, { "epoch": 0.045, "grad_norm": 0.3287704586982727, "learning_rate": 0.00021957471926465198, "loss": 2.8686893463134764, "step": 360 }, { "epoch": 0.04625, "grad_norm": 0.32815802097320557, "learning_rate": 0.00021954550224403304, "loss": 2.872859573364258, "step": 370 }, { "epoch": 0.0475, "grad_norm": 0.3123241066932678, "learning_rate": 0.0002195145065313224, "loss": 2.861919975280762, "step": 380 }, { "epoch": 0.04875, "grad_norm": 0.3143039643764496, "learning_rate": 0.0002194817326293389, "loss": 2.8754358291625977, "step": 390 }, { "epoch": 0.05, "grad_norm": 0.32305410504341125, "learning_rate": 0.00021944718106974763, "loss": 2.830820083618164, "step": 400 }, { "epoch": 0.05125, "grad_norm": 0.3187738060951233, "learning_rate": 0.00021941085241305118, "loss": 2.8469779968261717, "step": 410 }, { "epoch": 0.0525, "grad_norm": 0.3240358829498291, "learning_rate": 0.00021937274724858052, "loss": 2.872676467895508, "step": 420 }, { "epoch": 0.05375, "grad_norm": 0.3307654857635498, "learning_rate": 0.00021933286619448556, "loss": 2.868929862976074, "step": 430 }, { "epoch": 0.055, "grad_norm": 0.31867194175720215, "learning_rate": 0.00021929120989772503, "loss": 2.837067794799805, "step": 440 }, { "epoch": 0.05625, "grad_norm": 0.3109733760356903, "learning_rate": 0.00021924777903405596, "loss": 2.8356159210205076, "step": 450 }, { "epoch": 0.0575, "grad_norm": 0.33047595620155334, "learning_rate": 0.00021920257430802295, "loss": 2.859963226318359, "step": 460 }, { "epoch": 0.05875, "grad_norm": 0.3140341341495514, "learning_rate": 0.00021915559645294634, "loss": 2.864061737060547, "step": 470 }, { "epoch": 0.06, "grad_norm": 0.30880865454673767, "learning_rate": 0.0002191068462309107, "loss": 2.8523515701293944, "step": 480 }, { "epoch": 0.06125, "grad_norm": 0.3137487769126892, "learning_rate": 0.00021905632443275225, "loss": 2.8639093399047852, "step": 490 }, { "epoch": 0.0625, "grad_norm": 0.340537965297699, "learning_rate": 0.00021900403187804607, "loss": 2.8927494049072267, "step": 500 }, { "epoch": 0.06375, "grad_norm": 0.31051260232925415, "learning_rate": 0.00021894996941509282, "loss": 2.840711212158203, "step": 510 }, { "epoch": 0.065, "grad_norm": 0.3152431547641754, "learning_rate": 0.00021889413792090502, "loss": 2.862700653076172, "step": 520 }, { "epoch": 0.06625, "grad_norm": 0.3119368553161621, "learning_rate": 0.00021883653830119274, "loss": 2.8526124954223633, "step": 530 }, { "epoch": 0.0675, "grad_norm": 0.31616318225860596, "learning_rate": 0.00021877717149034896, "loss": 2.855159378051758, "step": 540 }, { "epoch": 0.06875, "grad_norm": 0.30254286527633667, "learning_rate": 0.00021871603845143443, "loss": 2.854717254638672, "step": 550 }, { "epoch": 0.07, "grad_norm": 0.3120061159133911, "learning_rate": 0.000218653140176162, "loss": 2.850946044921875, "step": 560 }, { "epoch": 0.07125, "grad_norm": 0.30754292011260986, "learning_rate": 0.00021858847768488048, "loss": 2.8386112213134767, "step": 570 }, { "epoch": 0.0725, "grad_norm": 0.3003309667110443, "learning_rate": 0.0002185220520265583, "loss": 2.858784294128418, "step": 580 }, { "epoch": 0.07375, "grad_norm": 0.31817367672920227, "learning_rate": 0.00021845386427876622, "loss": 2.8400810241699217, "step": 590 }, { "epoch": 0.075, "grad_norm": 0.31158024072647095, "learning_rate": 0.00021838391554766004, "loss": 2.8315425872802735, "step": 600 }, { "epoch": 0.07625, "grad_norm": 0.31356877088546753, "learning_rate": 0.00021831220696796264, "loss": 2.85643310546875, "step": 610 }, { "epoch": 0.0775, "grad_norm": 0.3057396411895752, "learning_rate": 0.00021823873970294543, "loss": 2.8644752502441406, "step": 620 }, { "epoch": 0.07875, "grad_norm": 0.30540961027145386, "learning_rate": 0.00021816351494440965, "loss": 2.840130615234375, "step": 630 }, { "epoch": 0.08, "grad_norm": 0.3201405107975006, "learning_rate": 0.00021808653391266697, "loss": 2.81726016998291, "step": 640 }, { "epoch": 0.08125, "grad_norm": 0.31356149911880493, "learning_rate": 0.0002180077978565196, "loss": 2.841321563720703, "step": 650 }, { "epoch": 0.0825, "grad_norm": 0.3322322368621826, "learning_rate": 0.00021792730805324023, "loss": 2.833037185668945, "step": 660 }, { "epoch": 0.08375, "grad_norm": 0.3101900517940521, "learning_rate": 0.0002178450658085511, "loss": 2.8306228637695314, "step": 670 }, { "epoch": 0.085, "grad_norm": 0.31162750720977783, "learning_rate": 0.00021776107245660307, "loss": 2.849654769897461, "step": 680 }, { "epoch": 0.08625, "grad_norm": 0.3168909251689911, "learning_rate": 0.00021767532935995366, "loss": 2.882074737548828, "step": 690 }, { "epoch": 0.0875, "grad_norm": 0.2994805574417114, "learning_rate": 0.00021758783790954515, "loss": 2.834335517883301, "step": 700 }, { "epoch": 0.08875, "grad_norm": 0.3097037672996521, "learning_rate": 0.0002174985995246821, "loss": 2.8143672943115234, "step": 710 }, { "epoch": 0.09, "grad_norm": 0.32182541489601135, "learning_rate": 0.00021740761565300799, "loss": 2.845683288574219, "step": 720 }, { "epoch": 0.09125, "grad_norm": 0.32514718174934387, "learning_rate": 0.00021731488777048213, "loss": 2.8221324920654296, "step": 730 }, { "epoch": 0.0925, "grad_norm": 0.3028743267059326, "learning_rate": 0.0002172204173813555, "loss": 2.8356349945068358, "step": 740 }, { "epoch": 0.09375, "grad_norm": 0.3133573830127716, "learning_rate": 0.0002171242060181463, "loss": 2.838234710693359, "step": 750 }, { "epoch": 0.095, "grad_norm": 0.3107962906360626, "learning_rate": 0.00021702625524161527, "loss": 2.8331020355224608, "step": 760 }, { "epoch": 0.09625, "grad_norm": 0.3266642987728119, "learning_rate": 0.00021692656664074023, "loss": 2.847811698913574, "step": 770 }, { "epoch": 0.0975, "grad_norm": 0.3073740303516388, "learning_rate": 0.00021682514183269034, "loss": 2.8351299285888674, "step": 780 }, { "epoch": 0.09875, "grad_norm": 0.3130224645137787, "learning_rate": 0.00021672198246279985, "loss": 2.7890214920043945, "step": 790 }, { "epoch": 0.1, "grad_norm": 0.3218679130077362, "learning_rate": 0.00021661709020454157, "loss": 2.8209762573242188, "step": 800 }, { "epoch": 0.10125, "grad_norm": 0.2967888414859772, "learning_rate": 0.00021651046675949938, "loss": 2.819289207458496, "step": 810 }, { "epoch": 0.1025, "grad_norm": 0.32564085721969604, "learning_rate": 0.000216402113857341, "loss": 2.8148468017578123, "step": 820 }, { "epoch": 0.10375, "grad_norm": 0.30720430612564087, "learning_rate": 0.00021629203325578962, "loss": 2.832720947265625, "step": 830 }, { "epoch": 0.105, "grad_norm": 0.31553003191947937, "learning_rate": 0.00021618022674059568, "loss": 2.8313037872314455, "step": 840 }, { "epoch": 0.10625, "grad_norm": 0.2927679121494293, "learning_rate": 0.0002160666961255076, "loss": 2.822229766845703, "step": 850 }, { "epoch": 0.1075, "grad_norm": 0.3168841302394867, "learning_rate": 0.00021595144325224264, "loss": 2.8234331130981447, "step": 860 }, { "epoch": 0.10875, "grad_norm": 0.3195788860321045, "learning_rate": 0.0002158344699904568, "loss": 2.8171760559082033, "step": 870 }, { "epoch": 0.11, "grad_norm": 0.30483055114746094, "learning_rate": 0.00021571577823771462, "loss": 2.82617244720459, "step": 880 }, { "epoch": 0.11125, "grad_norm": 0.31678906083106995, "learning_rate": 0.00021559536991945833, "loss": 2.8162193298339844, "step": 890 }, { "epoch": 0.1125, "grad_norm": 0.30715152621269226, "learning_rate": 0.00021547324698897665, "loss": 2.8252620697021484, "step": 900 }, { "epoch": 0.11375, "grad_norm": 0.303281307220459, "learning_rate": 0.00021534941142737314, "loss": 2.8220481872558594, "step": 910 }, { "epoch": 0.115, "grad_norm": 0.3042793571949005, "learning_rate": 0.00021522386524353395, "loss": 2.825517272949219, "step": 920 }, { "epoch": 0.11625, "grad_norm": 0.328135222196579, "learning_rate": 0.00021509661047409534, "loss": 2.806531524658203, "step": 930 }, { "epoch": 0.1175, "grad_norm": 0.30471575260162354, "learning_rate": 0.00021496764918341058, "loss": 2.8206180572509765, "step": 940 }, { "epoch": 0.11875, "grad_norm": 0.3096025884151459, "learning_rate": 0.0002148369834635165, "loss": 2.8001310348510744, "step": 950 }, { "epoch": 0.12, "grad_norm": 0.30915719270706177, "learning_rate": 0.0002147046154340995, "loss": 2.838936996459961, "step": 960 }, { "epoch": 0.12125, "grad_norm": 0.30633190274238586, "learning_rate": 0.00021457054724246125, "loss": 2.8280914306640623, "step": 970 }, { "epoch": 0.1225, "grad_norm": 0.3169943392276764, "learning_rate": 0.00021443478106348375, "loss": 2.8208492279052733, "step": 980 }, { "epoch": 0.12375, "grad_norm": 0.31402623653411865, "learning_rate": 0.00021429731909959417, "loss": 2.803514099121094, "step": 990 }, { "epoch": 0.125, "grad_norm": 0.31064271926879883, "learning_rate": 0.00021415816358072898, "loss": 2.828254508972168, "step": 1000 }, { "epoch": 0.12625, "grad_norm": 0.3190893530845642, "learning_rate": 0.00021401731676429792, "loss": 2.814365196228027, "step": 1010 }, { "epoch": 0.1275, "grad_norm": 0.3164026141166687, "learning_rate": 0.00021387478093514724, "loss": 2.803851509094238, "step": 1020 }, { "epoch": 0.12875, "grad_norm": 0.3159414529800415, "learning_rate": 0.00021373055840552275, "loss": 2.8509082794189453, "step": 1030 }, { "epoch": 0.13, "grad_norm": 0.3224294185638428, "learning_rate": 0.00021358465151503225, "loss": 2.789044952392578, "step": 1040 }, { "epoch": 0.13125, "grad_norm": 0.31033849716186523, "learning_rate": 0.00021343706263060765, "loss": 2.8226268768310545, "step": 1050 }, { "epoch": 0.1325, "grad_norm": 0.3086431622505188, "learning_rate": 0.00021328779414646635, "loss": 2.8077007293701173, "step": 1060 }, { "epoch": 0.13375, "grad_norm": 0.3155769109725952, "learning_rate": 0.00021313684848407282, "loss": 2.8190916061401365, "step": 1070 }, { "epoch": 0.135, "grad_norm": 0.3062079846858978, "learning_rate": 0.0002129842280920988, "loss": 2.8035049438476562, "step": 1080 }, { "epoch": 0.13625, "grad_norm": 0.3113609552383423, "learning_rate": 0.000212829935446384, "loss": 2.808064842224121, "step": 1090 }, { "epoch": 0.1375, "grad_norm": 0.3248916566371918, "learning_rate": 0.0002126739730498958, "loss": 2.8036418914794923, "step": 1100 }, { "epoch": 0.13875, "grad_norm": 0.314177930355072, "learning_rate": 0.00021251634343268845, "loss": 2.8073291778564453, "step": 1110 }, { "epoch": 0.14, "grad_norm": 0.31667032837867737, "learning_rate": 0.00021235704915186242, "loss": 2.8247406005859377, "step": 1120 }, { "epoch": 0.14125, "grad_norm": 0.32587730884552, "learning_rate": 0.0002121960927915225, "loss": 2.81424560546875, "step": 1130 }, { "epoch": 0.1425, "grad_norm": 0.3099067509174347, "learning_rate": 0.00021203347696273621, "loss": 2.833042526245117, "step": 1140 }, { "epoch": 0.14375, "grad_norm": 0.3176534175872803, "learning_rate": 0.0002118692043034913, "loss": 2.8056007385253907, "step": 1150 }, { "epoch": 0.145, "grad_norm": 0.32910725474357605, "learning_rate": 0.00021170327747865292, "loss": 2.791951370239258, "step": 1160 }, { "epoch": 0.14625, "grad_norm": 0.31169673800468445, "learning_rate": 0.00021153569917992042, "loss": 2.809808540344238, "step": 1170 }, { "epoch": 0.1475, "grad_norm": 0.31293970346450806, "learning_rate": 0.00021136647212578378, "loss": 2.7925342559814452, "step": 1180 }, { "epoch": 0.14875, "grad_norm": 0.3170998990535736, "learning_rate": 0.00021119559906147942, "loss": 2.809326934814453, "step": 1190 }, { "epoch": 0.15, "grad_norm": 0.30116304755210876, "learning_rate": 0.00021102308275894555, "loss": 2.7981502532958986, "step": 1200 }, { "epoch": 0.15125, "grad_norm": 0.30669230222702026, "learning_rate": 0.0002108489260167775, "loss": 2.7857837677001953, "step": 1210 }, { "epoch": 0.1525, "grad_norm": 0.30800774693489075, "learning_rate": 0.00021067313166018209, "loss": 2.806937408447266, "step": 1220 }, { "epoch": 0.15375, "grad_norm": 0.3087230622768402, "learning_rate": 0.00021049570254093184, "loss": 2.8145347595214845, "step": 1230 }, { "epoch": 0.155, "grad_norm": 0.30576276779174805, "learning_rate": 0.00021031664153731874, "loss": 2.806387710571289, "step": 1240 }, { "epoch": 0.15625, "grad_norm": 0.3263702392578125, "learning_rate": 0.00021013595155410756, "loss": 2.773836135864258, "step": 1250 }, { "epoch": 0.1575, "grad_norm": 0.3177431523799896, "learning_rate": 0.00020995363552248867, "loss": 2.7588844299316406, "step": 1260 }, { "epoch": 0.15875, "grad_norm": 0.30336225032806396, "learning_rate": 0.00020976969640003064, "loss": 2.8113712310791015, "step": 1270 }, { "epoch": 0.16, "grad_norm": 0.32169830799102783, "learning_rate": 0.000209584137170632, "loss": 2.788315773010254, "step": 1280 }, { "epoch": 0.16125, "grad_norm": 0.30413737893104553, "learning_rate": 0.00020939696084447314, "loss": 2.7458065032958983, "step": 1290 }, { "epoch": 0.1625, "grad_norm": 0.3089154064655304, "learning_rate": 0.00020920817045796727, "loss": 2.7877056121826174, "step": 1300 }, { "epoch": 0.16375, "grad_norm": 0.30705851316452026, "learning_rate": 0.00020901776907371116, "loss": 2.773893356323242, "step": 1310 }, { "epoch": 0.165, "grad_norm": 0.3133976459503174, "learning_rate": 0.00020882575978043566, "loss": 2.784181594848633, "step": 1320 }, { "epoch": 0.16625, "grad_norm": 0.31440430879592896, "learning_rate": 0.00020863214569295533, "loss": 2.8143083572387697, "step": 1330 }, { "epoch": 0.1675, "grad_norm": 0.29583054780960083, "learning_rate": 0.00020843692995211805, "loss": 2.7985980987548826, "step": 1340 }, { "epoch": 0.16875, "grad_norm": 0.3040190637111664, "learning_rate": 0.0002082401157247541, "loss": 2.774214744567871, "step": 1350 }, { "epoch": 0.17, "grad_norm": 0.30737006664276123, "learning_rate": 0.00020804170620362475, "loss": 2.803047943115234, "step": 1360 }, { "epoch": 0.17125, "grad_norm": 0.30594661831855774, "learning_rate": 0.0002078417046073704, "loss": 2.7990367889404295, "step": 1370 }, { "epoch": 0.1725, "grad_norm": 0.3074641823768616, "learning_rate": 0.00020764011418045845, "loss": 2.770071792602539, "step": 1380 }, { "epoch": 0.17375, "grad_norm": 0.304598331451416, "learning_rate": 0.00020743693819313063, "loss": 2.7667999267578125, "step": 1390 }, { "epoch": 0.175, "grad_norm": 0.32464832067489624, "learning_rate": 0.00020723217994135003, "loss": 2.8097129821777345, "step": 1400 }, { "epoch": 0.17625, "grad_norm": 0.3164089620113373, "learning_rate": 0.00020702584274674742, "loss": 2.7955820083618166, "step": 1410 }, { "epoch": 0.1775, "grad_norm": 0.310068279504776, "learning_rate": 0.00020681792995656763, "loss": 2.7704933166503904, "step": 1420 }, { "epoch": 0.17875, "grad_norm": 0.3000030219554901, "learning_rate": 0.00020660844494361513, "loss": 2.8106201171875, "step": 1430 }, { "epoch": 0.18, "grad_norm": 0.3196377456188202, "learning_rate": 0.00020639739110619917, "loss": 2.7796897888183594, "step": 1440 }, { "epoch": 0.18125, "grad_norm": 0.3006730079650879, "learning_rate": 0.000206184771868079, "loss": 2.791950225830078, "step": 1450 }, { "epoch": 0.1825, "grad_norm": 0.3123355805873871, "learning_rate": 0.000205970590678408, "loss": 2.7847476959228517, "step": 1460 }, { "epoch": 0.18375, "grad_norm": 0.31853538751602173, "learning_rate": 0.00020575485101167782, "loss": 2.7865251541137694, "step": 1470 }, { "epoch": 0.185, "grad_norm": 0.30936041474342346, "learning_rate": 0.0002055375563676622, "loss": 2.7906095504760744, "step": 1480 }, { "epoch": 0.18625, "grad_norm": 0.30842670798301697, "learning_rate": 0.0002053187102713599, "loss": 2.7754417419433595, "step": 1490 }, { "epoch": 0.1875, "grad_norm": 0.3201216757297516, "learning_rate": 0.00020509831627293766, "loss": 2.796547698974609, "step": 1500 }, { "epoch": 0.18875, "grad_norm": 0.3134450316429138, "learning_rate": 0.00020487637794767275, "loss": 2.7649627685546876, "step": 1510 }, { "epoch": 0.19, "grad_norm": 0.3114699721336365, "learning_rate": 0.00020465289889589467, "loss": 2.8279897689819338, "step": 1520 }, { "epoch": 0.19125, "grad_norm": 0.3171784281730652, "learning_rate": 0.00020442788274292704, "loss": 2.776567840576172, "step": 1530 }, { "epoch": 0.1925, "grad_norm": 0.30708587169647217, "learning_rate": 0.00020420133313902856, "loss": 2.786650466918945, "step": 1540 }, { "epoch": 0.19375, "grad_norm": 0.3005415201187134, "learning_rate": 0.00020397325375933387, "loss": 2.7795650482177736, "step": 1550 }, { "epoch": 0.195, "grad_norm": 0.30447477102279663, "learning_rate": 0.0002037436483037941, "loss": 2.7910282135009767, "step": 1560 }, { "epoch": 0.19625, "grad_norm": 0.308108389377594, "learning_rate": 0.0002035125204971165, "loss": 2.7864933013916016, "step": 1570 }, { "epoch": 0.1975, "grad_norm": 0.31156831979751587, "learning_rate": 0.00020327987408870436, "loss": 2.77624397277832, "step": 1580 }, { "epoch": 0.19875, "grad_norm": 0.30407053232192993, "learning_rate": 0.00020304571285259602, "loss": 2.786225509643555, "step": 1590 }, { "epoch": 0.2, "grad_norm": 0.30873724818229675, "learning_rate": 0.0002028100405874036, "loss": 2.7831089019775392, "step": 1600 }, { "epoch": 0.20125, "grad_norm": 0.305469274520874, "learning_rate": 0.00020257286111625156, "loss": 2.770510673522949, "step": 1610 }, { "epoch": 0.2025, "grad_norm": 0.3133813440799713, "learning_rate": 0.00020233417828671444, "loss": 2.7937782287597654, "step": 1620 }, { "epoch": 0.20375, "grad_norm": 0.3113247752189636, "learning_rate": 0.00020209399597075463, "loss": 2.811221694946289, "step": 1630 }, { "epoch": 0.205, "grad_norm": 0.29653653502464294, "learning_rate": 0.00020185231806465958, "loss": 2.736056900024414, "step": 1640 }, { "epoch": 0.20625, "grad_norm": 0.296674519777298, "learning_rate": 0.00020160914848897833, "loss": 2.773727035522461, "step": 1650 }, { "epoch": 0.2075, "grad_norm": 0.3117091953754425, "learning_rate": 0.00020136449118845828, "loss": 2.7696605682373048, "step": 1660 }, { "epoch": 0.20875, "grad_norm": 0.3065008819103241, "learning_rate": 0.00020111835013198088, "loss": 2.7859319686889648, "step": 1670 }, { "epoch": 0.21, "grad_norm": 0.30614563822746277, "learning_rate": 0.00020087072931249746, "loss": 2.761496734619141, "step": 1680 }, { "epoch": 0.21125, "grad_norm": 0.3214632272720337, "learning_rate": 0.0002006216327469644, "loss": 2.795328140258789, "step": 1690 }, { "epoch": 0.2125, "grad_norm": 0.3141666054725647, "learning_rate": 0.00020037106447627772, "loss": 2.7613990783691404, "step": 1700 }, { "epoch": 0.21375, "grad_norm": 0.32107681035995483, "learning_rate": 0.00020011902856520807, "loss": 2.7515789031982423, "step": 1710 }, { "epoch": 0.215, "grad_norm": 0.3231985867023468, "learning_rate": 0.00019986552910233424, "loss": 2.7852977752685546, "step": 1720 }, { "epoch": 0.21625, "grad_norm": 0.3149876892566681, "learning_rate": 0.00019961057019997707, "loss": 2.754520225524902, "step": 1730 }, { "epoch": 0.2175, "grad_norm": 0.31885862350463867, "learning_rate": 0.00019935415599413287, "loss": 2.7804901123046877, "step": 1740 }, { "epoch": 0.21875, "grad_norm": 0.30009323358535767, "learning_rate": 0.0001990962906444061, "loss": 2.766156005859375, "step": 1750 }, { "epoch": 0.22, "grad_norm": 0.31249675154685974, "learning_rate": 0.00019883697833394186, "loss": 2.779193878173828, "step": 1760 }, { "epoch": 0.22125, "grad_norm": 0.30822932720184326, "learning_rate": 0.0001985762232693584, "loss": 2.7579469680786133, "step": 1770 }, { "epoch": 0.2225, "grad_norm": 0.3053094446659088, "learning_rate": 0.00019831402968067843, "loss": 2.76893310546875, "step": 1780 }, { "epoch": 0.22375, "grad_norm": 0.31457704305648804, "learning_rate": 0.00019805040182126077, "loss": 2.781879425048828, "step": 1790 }, { "epoch": 0.225, "grad_norm": 0.30379778146743774, "learning_rate": 0.00019778534396773127, "loss": 2.783489799499512, "step": 1800 }, { "epoch": 0.22625, "grad_norm": 0.31210359930992126, "learning_rate": 0.0001975188604199134, "loss": 2.7574298858642576, "step": 1810 }, { "epoch": 0.2275, "grad_norm": 0.3024740219116211, "learning_rate": 0.00019725095550075862, "loss": 2.7888748168945314, "step": 1820 }, { "epoch": 0.22875, "grad_norm": 0.3073548376560211, "learning_rate": 0.0001969816335562761, "loss": 2.7340553283691404, "step": 1830 }, { "epoch": 0.23, "grad_norm": 0.31958791613578796, "learning_rate": 0.00019671089895546232, "loss": 2.804524230957031, "step": 1840 }, { "epoch": 0.23125, "grad_norm": 0.3051760196685791, "learning_rate": 0.00019643875609023017, "loss": 2.775598907470703, "step": 1850 }, { "epoch": 0.2325, "grad_norm": 0.3086925148963928, "learning_rate": 0.0001961652093753377, "loss": 2.7774431228637697, "step": 1860 }, { "epoch": 0.23375, "grad_norm": 0.3144133388996124, "learning_rate": 0.00019589026324831643, "loss": 2.7702011108398437, "step": 1870 }, { "epoch": 0.235, "grad_norm": 0.3036665916442871, "learning_rate": 0.00019561392216939954, "loss": 2.7927045822143555, "step": 1880 }, { "epoch": 0.23625, "grad_norm": 0.30784451961517334, "learning_rate": 0.00019533619062144934, "loss": 2.741124725341797, "step": 1890 }, { "epoch": 0.2375, "grad_norm": 0.29786407947540283, "learning_rate": 0.00019505707310988463, "loss": 2.748614501953125, "step": 1900 }, { "epoch": 0.23875, "grad_norm": 0.30479830503463745, "learning_rate": 0.00019477657416260764, "loss": 2.7626161575317383, "step": 1910 }, { "epoch": 0.24, "grad_norm": 0.30530789494514465, "learning_rate": 0.0001944946983299305, "loss": 2.7705900192260744, "step": 1920 }, { "epoch": 0.24125, "grad_norm": 0.30881696939468384, "learning_rate": 0.00019421145018450145, "loss": 2.7753509521484374, "step": 1930 }, { "epoch": 0.2425, "grad_norm": 0.30990368127822876, "learning_rate": 0.00019392683432123065, "loss": 2.7618339538574217, "step": 1940 }, { "epoch": 0.24375, "grad_norm": 0.30068239569664, "learning_rate": 0.00019364085535721574, "loss": 2.751456451416016, "step": 1950 }, { "epoch": 1.000875, "grad_norm": 0.32766178250312805, "learning_rate": 0.00019335351793166682, "loss": 2.9953849792480467, "step": 1960 }, { "epoch": 1.002125, "grad_norm": 0.31653207540512085, "learning_rate": 0.00019306482670583127, "loss": 2.7172924041748048, "step": 1970 }, { "epoch": 1.003375, "grad_norm": 0.30188634991645813, "learning_rate": 0.000192774786362918, "loss": 2.718875503540039, "step": 1980 }, { "epoch": 1.004625, "grad_norm": 0.3092830777168274, "learning_rate": 0.00019248340160802165, "loss": 2.6953250885009767, "step": 1990 }, { "epoch": 1.005875, "grad_norm": 0.3100144863128662, "learning_rate": 0.00019219067716804626, "loss": 2.7128387451171876, "step": 2000 }, { "epoch": 1.007125, "grad_norm": 0.32156386971473694, "learning_rate": 0.00019189661779162834, "loss": 2.7038270950317385, "step": 2010 }, { "epoch": 1.008375, "grad_norm": 0.3106272518634796, "learning_rate": 0.00019160122824906018, "loss": 2.7032100677490236, "step": 2020 }, { "epoch": 1.009625, "grad_norm": 0.3121194541454315, "learning_rate": 0.00019130451333221226, "loss": 2.6769741058349608, "step": 2030 }, { "epoch": 1.010875, "grad_norm": 0.31094688177108765, "learning_rate": 0.0001910064778544555, "loss": 2.6934465408325194, "step": 2040 }, { "epoch": 1.012125, "grad_norm": 0.3150351941585541, "learning_rate": 0.00019070712665058325, "loss": 2.674116325378418, "step": 2050 }, { "epoch": 1.013375, "grad_norm": 0.3132378160953522, "learning_rate": 0.00019040646457673294, "loss": 2.667017936706543, "step": 2060 }, { "epoch": 1.014625, "grad_norm": 0.30859819054603577, "learning_rate": 0.000190104496510307, "loss": 2.6529170989990236, "step": 2070 }, { "epoch": 1.015875, "grad_norm": 0.3140536844730377, "learning_rate": 0.00018980122734989425, "loss": 2.649005889892578, "step": 2080 }, { "epoch": 1.017125, "grad_norm": 0.3163485825061798, "learning_rate": 0.00018949666201518978, "loss": 2.658115005493164, "step": 2090 }, { "epoch": 1.018375, "grad_norm": 0.3046296536922455, "learning_rate": 0.00018919080544691573, "loss": 2.637746238708496, "step": 2100 }, { "epoch": 1.019625, "grad_norm": 0.30639058351516724, "learning_rate": 0.00018888366260674078, "loss": 2.6267181396484376, "step": 2110 }, { "epoch": 1.020875, "grad_norm": 0.3216869831085205, "learning_rate": 0.00018857523847719992, "loss": 2.6571407318115234, "step": 2120 }, { "epoch": 1.022125, "grad_norm": 0.32431310415267944, "learning_rate": 0.0001882655380616133, "loss": 2.6225955963134764, "step": 2130 }, { "epoch": 1.023375, "grad_norm": 0.3109528720378876, "learning_rate": 0.0001879545663840053, "loss": 2.633950042724609, "step": 2140 }, { "epoch": 1.024625, "grad_norm": 0.32065126299858093, "learning_rate": 0.00018764232848902314, "loss": 2.602225494384766, "step": 2150 }, { "epoch": 1.025875, "grad_norm": 0.32300078868865967, "learning_rate": 0.00018732882944185462, "loss": 2.615239715576172, "step": 2160 }, { "epoch": 1.027125, "grad_norm": 0.3188120424747467, "learning_rate": 0.00018701407432814644, "loss": 2.594603157043457, "step": 2170 }, { "epoch": 1.028375, "grad_norm": 0.3217035233974457, "learning_rate": 0.00018669806825392132, "loss": 2.601702117919922, "step": 2180 }, { "epoch": 1.029625, "grad_norm": 0.322839617729187, "learning_rate": 0.00018638081634549534, "loss": 2.597119903564453, "step": 2190 }, { "epoch": 1.030875, "grad_norm": 0.33341312408447266, "learning_rate": 0.00018606232374939488, "loss": 2.604803466796875, "step": 2200 }, { "epoch": 1.032125, "grad_norm": 0.32422640919685364, "learning_rate": 0.00018574259563227289, "loss": 2.622762107849121, "step": 2210 }, { "epoch": 1.033375, "grad_norm": 0.3312685787677765, "learning_rate": 0.00018542163718082523, "loss": 2.623911666870117, "step": 2220 }, { "epoch": 1.034625, "grad_norm": 0.3332018256187439, "learning_rate": 0.0001850994536017065, "loss": 2.5997699737548827, "step": 2230 }, { "epoch": 1.035875, "grad_norm": 0.32356560230255127, "learning_rate": 0.00018477605012144564, "loss": 2.59320182800293, "step": 2240 }, { "epoch": 1.037125, "grad_norm": 0.30938515067100525, "learning_rate": 0.00018445143198636093, "loss": 2.5783287048339845, "step": 2250 }, { "epoch": 1.038375, "grad_norm": 0.33119791746139526, "learning_rate": 0.0001841256044624752, "loss": 2.6023700714111326, "step": 2260 }, { "epoch": 1.039625, "grad_norm": 0.32936912775039673, "learning_rate": 0.00018379857283543015, "loss": 2.595666694641113, "step": 2270 }, { "epoch": 1.040875, "grad_norm": 0.34784626960754395, "learning_rate": 0.00018347034241040066, "loss": 2.6071990966796874, "step": 2280 }, { "epoch": 1.042125, "grad_norm": 0.3317442238330841, "learning_rate": 0.00018314091851200881, "loss": 2.5899078369140627, "step": 2290 }, { "epoch": 1.043375, "grad_norm": 0.3433104157447815, "learning_rate": 0.0001828103064842375, "loss": 2.6167388916015626, "step": 2300 }, { "epoch": 1.044625, "grad_norm": 0.3177641034126282, "learning_rate": 0.00018247851169034358, "loss": 2.5915859222412108, "step": 2310 }, { "epoch": 1.045875, "grad_norm": 0.33989644050598145, "learning_rate": 0.00018214553951277114, "loss": 2.5995319366455076, "step": 2320 }, { "epoch": 1.047125, "grad_norm": 0.3309226930141449, "learning_rate": 0.00018181139535306383, "loss": 2.5778053283691404, "step": 2330 }, { "epoch": 1.048375, "grad_norm": 0.33091750741004944, "learning_rate": 0.00018147608463177768, "loss": 2.6125743865966795, "step": 2340 }, { "epoch": 1.049625, "grad_norm": 0.32603928446769714, "learning_rate": 0.00018113961278839268, "loss": 2.5618928909301757, "step": 2350 }, { "epoch": 1.050875, "grad_norm": 0.3253335654735565, "learning_rate": 0.00018080198528122495, "loss": 2.592588424682617, "step": 2360 }, { "epoch": 1.052125, "grad_norm": 0.3284412622451782, "learning_rate": 0.000180463207587338, "loss": 2.568330764770508, "step": 2370 }, { "epoch": 1.053375, "grad_norm": 0.32107362151145935, "learning_rate": 0.00018012328520245385, "loss": 2.5809921264648437, "step": 2380 }, { "epoch": 1.054625, "grad_norm": 0.3348993957042694, "learning_rate": 0.000179782223640864, "loss": 2.5713642120361326, "step": 2390 }, { "epoch": 1.055875, "grad_norm": 0.3235042095184326, "learning_rate": 0.00017944002843533986, "loss": 2.608296203613281, "step": 2400 }, { "epoch": 1.057125, "grad_norm": 0.33322450518608093, "learning_rate": 0.00017909670513704306, "loss": 2.587118911743164, "step": 2410 }, { "epoch": 1.058375, "grad_norm": 0.32530325651168823, "learning_rate": 0.00017875225931543543, "loss": 2.5887866973876954, "step": 2420 }, { "epoch": 1.059625, "grad_norm": 0.3360804319381714, "learning_rate": 0.00017840669655818856, "loss": 2.598593902587891, "step": 2430 }, { "epoch": 1.060875, "grad_norm": 0.3203558921813965, "learning_rate": 0.00017806002247109317, "loss": 2.5644474029541016, "step": 2440 }, { "epoch": 1.062125, "grad_norm": 0.34525611996650696, "learning_rate": 0.00017771224267796828, "loss": 2.5811479568481444, "step": 2450 }, { "epoch": 1.063375, "grad_norm": 0.33284473419189453, "learning_rate": 0.00017736336282056986, "loss": 2.5817935943603514, "step": 2460 }, { "epoch": 1.064625, "grad_norm": 0.34238749742507935, "learning_rate": 0.00017701338855849938, "loss": 2.570195770263672, "step": 2470 }, { "epoch": 1.065875, "grad_norm": 0.330721378326416, "learning_rate": 0.0001766623255691119, "loss": 2.5676502227783202, "step": 2480 }, { "epoch": 1.067125, "grad_norm": 0.33618465065956116, "learning_rate": 0.00017631017954742415, "loss": 2.581513595581055, "step": 2490 }, { "epoch": 1.068375, "grad_norm": 0.3335385322570801, "learning_rate": 0.00017595695620602192, "loss": 2.6056888580322264, "step": 2500 }, { "epoch": 1.069625, "grad_norm": 0.3303595781326294, "learning_rate": 0.00017560266127496753, "loss": 2.5539363861083983, "step": 2510 }, { "epoch": 1.070875, "grad_norm": 0.32198089361190796, "learning_rate": 0.00017524730050170697, "loss": 2.569991683959961, "step": 2520 }, { "epoch": 1.072125, "grad_norm": 0.3235105872154236, "learning_rate": 0.0001748908796509764, "loss": 2.5943014144897463, "step": 2530 }, { "epoch": 1.073375, "grad_norm": 0.3448514938354492, "learning_rate": 0.00017453340450470885, "loss": 2.5967823028564454, "step": 2540 }, { "epoch": 1.074625, "grad_norm": 0.32868677377700806, "learning_rate": 0.00017417488086194028, "loss": 2.5600149154663088, "step": 2550 }, { "epoch": 1.075875, "grad_norm": 0.3214341104030609, "learning_rate": 0.00017381531453871567, "loss": 2.5800102233886717, "step": 2560 }, { "epoch": 1.077125, "grad_norm": 0.33103859424591064, "learning_rate": 0.00017345471136799454, "loss": 2.568808364868164, "step": 2570 }, { "epoch": 1.078375, "grad_norm": 0.31372782588005066, "learning_rate": 0.00017309307719955632, "loss": 2.554553413391113, "step": 2580 }, { "epoch": 1.079625, "grad_norm": 0.3474419116973877, "learning_rate": 0.00017273041789990558, "loss": 2.540375900268555, "step": 2590 }, { "epoch": 1.080875, "grad_norm": 0.3302421569824219, "learning_rate": 0.0001723667393521767, "loss": 2.5536571502685548, "step": 2600 }, { "epoch": 1.082125, "grad_norm": 0.3372521996498108, "learning_rate": 0.00017200204745603854, "loss": 2.5786903381347654, "step": 2610 }, { "epoch": 1.083375, "grad_norm": 0.34310850501060486, "learning_rate": 0.00017163634812759882, "loss": 2.56533203125, "step": 2620 }, { "epoch": 1.084625, "grad_norm": 0.3463296890258789, "learning_rate": 0.00017126964729930784, "loss": 2.5742265701293947, "step": 2630 }, { "epoch": 1.085875, "grad_norm": 0.3372081220149994, "learning_rate": 0.00017090195091986254, "loss": 2.5609130859375, "step": 2640 }, { "epoch": 1.087125, "grad_norm": 0.33471760153770447, "learning_rate": 0.00017053326495410998, "loss": 2.570426177978516, "step": 2650 }, { "epoch": 1.088375, "grad_norm": 0.3420524299144745, "learning_rate": 0.0001701635953829503, "loss": 2.5492122650146483, "step": 2660 }, { "epoch": 1.089625, "grad_norm": 0.33050400018692017, "learning_rate": 0.0001697929482032401, "loss": 2.5594730377197266, "step": 2670 }, { "epoch": 1.090875, "grad_norm": 0.33682385087013245, "learning_rate": 0.00016942132942769476, "loss": 2.560088348388672, "step": 2680 }, { "epoch": 1.092125, "grad_norm": 0.34267619252204895, "learning_rate": 0.00016904874508479127, "loss": 2.5474054336547853, "step": 2690 }, { "epoch": 1.093375, "grad_norm": 0.33607542514801025, "learning_rate": 0.00016867520121867006, "loss": 2.5770172119140624, "step": 2700 }, { "epoch": 1.094625, "grad_norm": 0.3332061171531677, "learning_rate": 0.0001683007038890373, "loss": 2.5588443756103514, "step": 2710 }, { "epoch": 1.095875, "grad_norm": 0.34043437242507935, "learning_rate": 0.00016792525917106642, "loss": 2.5765233993530274, "step": 2720 }, { "epoch": 1.097125, "grad_norm": 0.3437064290046692, "learning_rate": 0.00016754887315529948, "loss": 2.598227691650391, "step": 2730 }, { "epoch": 1.098375, "grad_norm": 0.3502216935157776, "learning_rate": 0.0001671715519475486, "loss": 2.5620880126953125, "step": 2740 }, { "epoch": 1.099625, "grad_norm": 0.32694822549819946, "learning_rate": 0.00016679330166879665, "loss": 2.5393630981445314, "step": 2750 }, { "epoch": 1.100875, "grad_norm": 0.3365384042263031, "learning_rate": 0.00016641412845509818, "loss": 2.5454193115234376, "step": 2760 }, { "epoch": 1.102125, "grad_norm": 0.3421364426612854, "learning_rate": 0.00016603403845747984, "loss": 2.5687324523925783, "step": 2770 }, { "epoch": 1.103375, "grad_norm": 0.32685622572898865, "learning_rate": 0.0001656530378418403, "loss": 2.564802551269531, "step": 2780 }, { "epoch": 1.104625, "grad_norm": 0.32674023509025574, "learning_rate": 0.0001652711327888507, "loss": 2.5603107452392577, "step": 2790 }, { "epoch": 1.105875, "grad_norm": 0.3370579481124878, "learning_rate": 0.00016488832949385402, "loss": 2.537816619873047, "step": 2800 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.7508190343633306e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }