{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999162712810494,
  "eval_steps": 500,
  "global_step": 895,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0011163829193413341,
      "grad_norm": 0.3974737008844776,
      "learning_rate": 2.2222222222222225e-06,
      "loss": 1.607,
      "step": 1
    },
    {
      "epoch": 0.0055819145967066705,
      "grad_norm": 0.4252789938746273,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 1.5942,
      "step": 5
    },
    {
      "epoch": 0.011163829193413341,
      "grad_norm": 0.4658525758416883,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 1.5877,
      "step": 10
    },
    {
      "epoch": 0.01674574379012001,
      "grad_norm": 0.27282017063503095,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.5695,
      "step": 15
    },
    {
      "epoch": 0.022327658386826682,
      "grad_norm": 0.24165395076839943,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 1.558,
      "step": 20
    },
    {
      "epoch": 0.027909572983533353,
      "grad_norm": 0.1767403193777301,
      "learning_rate": 5.555555555555556e-05,
      "loss": 1.4678,
      "step": 25
    },
    {
      "epoch": 0.03349148758024002,
      "grad_norm": 0.16356442786177314,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.467,
      "step": 30
    },
    {
      "epoch": 0.039073402176946694,
      "grad_norm": 0.15556520577978836,
      "learning_rate": 7.777777777777778e-05,
      "loss": 1.429,
      "step": 35
    },
    {
      "epoch": 0.044655316773653364,
      "grad_norm": 0.1263609432879071,
      "learning_rate": 8.888888888888889e-05,
      "loss": 1.4253,
      "step": 40
    },
    {
      "epoch": 0.050237231370360035,
      "grad_norm": 0.1696978939183065,
      "learning_rate": 0.0001,
      "loss": 1.3895,
      "step": 45
    },
    {
      "epoch": 0.055819145967066705,
      "grad_norm": 0.10830406775154863,
      "learning_rate": 0.00011111111111111112,
      "loss": 1.3645,
      "step": 50
    },
    {
      "epoch": 0.061401060563773376,
      "grad_norm": 0.08414898733986972,
      "learning_rate": 0.00012222222222222224,
      "loss": 1.3082,
      "step": 55
    },
    {
      "epoch": 0.06698297516048005,
      "grad_norm": 0.07973185533121883,
      "learning_rate": 0.00013333333333333334,
      "loss": 1.2962,
      "step": 60
    },
    {
      "epoch": 0.07256488975718671,
      "grad_norm": 0.09811845100733502,
      "learning_rate": 0.00014444444444444444,
      "loss": 1.3061,
      "step": 65
    },
    {
      "epoch": 0.07814680435389339,
      "grad_norm": 0.08298371354138047,
      "learning_rate": 0.00015555555555555556,
      "loss": 1.3017,
      "step": 70
    },
    {
      "epoch": 0.08372871895060005,
      "grad_norm": 0.07510078793315819,
      "learning_rate": 0.0001666666666666667,
      "loss": 1.2989,
      "step": 75
    },
    {
      "epoch": 0.08931063354730673,
      "grad_norm": 0.07085309149624731,
      "learning_rate": 0.00017777777777777779,
      "loss": 1.2787,
      "step": 80
    },
    {
      "epoch": 0.09489254814401339,
      "grad_norm": 0.09400917029194135,
      "learning_rate": 0.00018888888888888888,
      "loss": 1.2843,
      "step": 85
    },
    {
      "epoch": 0.10047446274072007,
      "grad_norm": 0.09230059652672952,
      "learning_rate": 0.0002,
      "loss": 1.262,
      "step": 90
    },
    {
      "epoch": 0.10605637733742673,
      "grad_norm": 0.10009657676945562,
      "learning_rate": 0.00019998096274980728,
      "loss": 1.2821,
      "step": 95
    },
    {
      "epoch": 0.11163829193413341,
      "grad_norm": 0.12201167887174731,
      "learning_rate": 0.000199923858247567,
      "loss": 1.2668,
      "step": 100
    },
    {
      "epoch": 0.11722020653084007,
      "grad_norm": 0.09628889966493127,
      "learning_rate": 0.00019982870823553308,
      "loss": 1.2503,
      "step": 105
    },
    {
      "epoch": 0.12280212112754675,
      "grad_norm": 0.10028621820088561,
      "learning_rate": 0.00019969554894159723,
      "loss": 1.2632,
      "step": 110
    },
    {
      "epoch": 0.12838403572425341,
      "grad_norm": 0.08593461106683208,
      "learning_rate": 0.00019952443106549533,
      "loss": 1.2396,
      "step": 115
    },
    {
      "epoch": 0.1339659503209601,
      "grad_norm": 0.08827739693201113,
      "learning_rate": 0.00019931541975950378,
      "loss": 1.2784,
      "step": 120
    },
    {
      "epoch": 0.13954786491766677,
      "grad_norm": 0.0911508607290428,
      "learning_rate": 0.00019906859460363307,
      "loss": 1.2689,
      "step": 125
    },
    {
      "epoch": 0.14512977951437342,
      "grad_norm": 0.12157025851983183,
      "learning_rate": 0.00019878404957532814,
      "loss": 1.2563,
      "step": 130
    },
    {
      "epoch": 0.1507116941110801,
      "grad_norm": 0.10772740664174668,
      "learning_rate": 0.0001984618930136869,
      "loss": 1.2853,
      "step": 135
    },
    {
      "epoch": 0.15629360870778677,
      "grad_norm": 0.09940063564218579,
      "learning_rate": 0.00019810224757821064,
      "loss": 1.241,
      "step": 140
    },
    {
      "epoch": 0.16187552330449345,
      "grad_norm": 0.09118466185918958,
      "learning_rate": 0.00019770525020210204,
      "loss": 1.2746,
      "step": 145
    },
    {
      "epoch": 0.1674574379012001,
      "grad_norm": 0.09674538853934604,
      "learning_rate": 0.0001972710520401287,
      "loss": 1.2561,
      "step": 150
    },
    {
      "epoch": 0.17303935249790678,
      "grad_norm": 0.1126652956332537,
      "learning_rate": 0.0001967998184110713,
      "loss": 1.257,
      "step": 155
    },
    {
      "epoch": 0.17862126709461346,
      "grad_norm": 0.0869341846350413,
      "learning_rate": 0.00019629172873477995,
      "loss": 1.2529,
      "step": 160
    },
    {
      "epoch": 0.18420318169132013,
      "grad_norm": 0.09888626799953022,
      "learning_rate": 0.00019574697646386027,
      "loss": 1.244,
      "step": 165
    },
    {
      "epoch": 0.18978509628802678,
      "grad_norm": 0.09785278620381999,
      "learning_rate": 0.0001951657690100178,
      "loss": 1.2334,
      "step": 170
    },
    {
      "epoch": 0.19536701088473346,
      "grad_norm": 0.07378537831469305,
      "learning_rate": 0.0001945483276650868,
      "loss": 1.2415,
      "step": 175
    },
    {
      "epoch": 0.20094892548144014,
      "grad_norm": 0.08814263560160436,
      "learning_rate": 0.0001938948875167745,
      "loss": 1.2512,
      "step": 180
    },
    {
      "epoch": 0.20653084007814682,
      "grad_norm": 0.09775538276417937,
      "learning_rate": 0.00019320569735915271,
      "loss": 1.2213,
      "step": 185
    },
    {
      "epoch": 0.21211275467485347,
      "grad_norm": 0.09538626874304115,
      "learning_rate": 0.00019248101959793066,
      "loss": 1.2354,
      "step": 190
    },
    {
      "epoch": 0.21769466927156014,
      "grad_norm": 0.08332625788355251,
      "learning_rate": 0.00019172113015054532,
      "loss": 1.2444,
      "step": 195
    },
    {
      "epoch": 0.22327658386826682,
      "grad_norm": 0.08309090570657847,
      "learning_rate": 0.00019092631834110723,
      "loss": 1.2316,
      "step": 200
    },
    {
      "epoch": 0.2288584984649735,
      "grad_norm": 0.09054323693110126,
      "learning_rate": 0.0001900968867902419,
      "loss": 1.27,
      "step": 205
    },
    {
      "epoch": 0.23444041306168015,
      "grad_norm": 0.08549436898181585,
      "learning_rate": 0.00018923315129986835,
      "loss": 1.2348,
      "step": 210
    },
    {
      "epoch": 0.24002232765838682,
      "grad_norm": 0.086610993256363,
      "learning_rate": 0.00018833544073295917,
      "loss": 1.2461,
      "step": 215
    },
    {
      "epoch": 0.2456042422550935,
      "grad_norm": 0.08146109722648563,
      "learning_rate": 0.00018740409688832764,
      "loss": 1.2431,
      "step": 220
    },
    {
      "epoch": 0.2511861568518002,
      "grad_norm": 0.08232534290451142,
      "learning_rate": 0.00018643947437048944,
      "loss": 1.2408,
      "step": 225
    },
    {
      "epoch": 0.25676807144850683,
      "grad_norm": 0.08507739560575232,
      "learning_rate": 0.00018544194045464886,
      "loss": 1.243,
      "step": 230
    },
    {
      "epoch": 0.26234998604521353,
      "grad_norm": 0.09782665661618925,
      "learning_rate": 0.00018441187494686053,
      "loss": 1.2426,
      "step": 235
    },
    {
      "epoch": 0.2679319006419202,
      "grad_norm": 0.0809973818897895,
      "learning_rate": 0.0001833496700394202,
      "loss": 1.2345,
      "step": 240
    },
    {
      "epoch": 0.27351381523862683,
      "grad_norm": 0.09269081567542259,
      "learning_rate": 0.00018225573016153945,
      "loss": 1.2343,
      "step": 245
    },
    {
      "epoch": 0.27909572983533354,
      "grad_norm": 0.09671785308848269,
      "learning_rate": 0.00018113047182536127,
      "loss": 1.2327,
      "step": 250
    },
    {
      "epoch": 0.2846776444320402,
      "grad_norm": 0.0906432644454991,
      "learning_rate": 0.00017997432346737524,
      "loss": 1.2532,
      "step": 255
    },
    {
      "epoch": 0.29025955902874684,
      "grad_norm": 0.08371586611488784,
      "learning_rate": 0.00017878772528529232,
      "loss": 1.2384,
      "step": 260
    },
    {
      "epoch": 0.29584147362545354,
      "grad_norm": 0.08640773776491195,
      "learning_rate": 0.000177571129070442,
      "loss": 1.2193,
      "step": 265
    },
    {
      "epoch": 0.3014233882221602,
      "grad_norm": 0.08164649256677078,
      "learning_rate": 0.00017632499803575474,
      "loss": 1.2327,
      "step": 270
    },
    {
      "epoch": 0.3070053028188669,
      "grad_norm": 0.09156690890905773,
      "learning_rate": 0.00017504980663939613,
      "loss": 1.2534,
      "step": 275
    },
    {
      "epoch": 0.31258721741557355,
      "grad_norm": 0.08393163680296412,
      "learning_rate": 0.00017374604040411935,
      "loss": 1.2411,
      "step": 280
    },
    {
      "epoch": 0.3181691320122802,
      "grad_norm": 0.08340859881557235,
      "learning_rate": 0.00017241419573240462,
      "loss": 1.2398,
      "step": 285
    },
    {
      "epoch": 0.3237510466089869,
      "grad_norm": 0.08622506272483123,
      "learning_rate": 0.00017105477971745666,
      "loss": 1.2321,
      "step": 290
    },
    {
      "epoch": 0.32933296120569355,
      "grad_norm": 0.08338497396964428,
      "learning_rate": 0.00016966830995013133,
      "loss": 1.2453,
      "step": 295
    },
    {
      "epoch": 0.3349148758024002,
      "grad_norm": 0.08718794446584939,
      "learning_rate": 0.00016825531432186543,
      "loss": 1.2134,
      "step": 300
    },
    {
      "epoch": 0.3404967903991069,
      "grad_norm": 0.09158015865602193,
      "learning_rate": 0.00016681633082368498,
      "loss": 1.223,
      "step": 305
    },
    {
      "epoch": 0.34607870499581356,
      "grad_norm": 0.08768121171152027,
      "learning_rate": 0.0001653519073413675,
      "loss": 1.235,
      "step": 310
    },
    {
      "epoch": 0.3516606195925202,
      "grad_norm": 0.08907125432704804,
      "learning_rate": 0.00016386260144683745,
      "loss": 1.2169,
      "step": 315
    },
    {
      "epoch": 0.3572425341892269,
      "grad_norm": 0.08767993008424768,
      "learning_rate": 0.00016234898018587337,
      "loss": 1.2435,
      "step": 320
    },
    {
      "epoch": 0.36282444878593356,
      "grad_norm": 0.08991663909567185,
      "learning_rate": 0.00016081161986220807,
      "loss": 1.2371,
      "step": 325
    },
    {
      "epoch": 0.36840636338264027,
      "grad_norm": 0.07876061570647706,
      "learning_rate": 0.00015925110581810394,
      "loss": 1.2118,
      "step": 330
    },
    {
      "epoch": 0.3739882779793469,
      "grad_norm": 0.09088539514665886,
      "learning_rate": 0.00015766803221148673,
      "loss": 1.2333,
      "step": 335
    },
    {
      "epoch": 0.37957019257605357,
      "grad_norm": 0.09371191064756335,
      "learning_rate": 0.00015606300178972287,
      "loss": 1.2192,
      "step": 340
    },
    {
      "epoch": 0.38515210717276027,
      "grad_norm": 0.0988524027231739,
      "learning_rate": 0.00015443662566012645,
      "loss": 1.2201,
      "step": 345
    },
    {
      "epoch": 0.3907340217694669,
      "grad_norm": 0.08068655015289312,
      "learning_rate": 0.00015278952305728324,
      "loss": 1.2312,
      "step": 350
    },
    {
      "epoch": 0.39631593636617357,
      "grad_norm": 0.08530580419429784,
      "learning_rate": 0.00015112232110728015,
      "loss": 1.2103,
      "step": 355
    },
    {
      "epoch": 0.4018978509628803,
      "grad_norm": 0.0832856621155852,
      "learning_rate": 0.00014943565458893,
      "loss": 1.2049,
      "step": 360
    },
    {
      "epoch": 0.4074797655595869,
      "grad_norm": 0.10112900442930213,
      "learning_rate": 0.00014773016569208283,
      "loss": 1.2381,
      "step": 365
    },
    {
      "epoch": 0.41306168015629363,
      "grad_norm": 0.08250019530921109,
      "learning_rate": 0.00014600650377311522,
      "loss": 1.2185,
      "step": 370
    },
    {
      "epoch": 0.4186435947530003,
      "grad_norm": 0.0987578329954232,
      "learning_rate": 0.0001442653251076912,
      "loss": 1.2222,
      "step": 375
    },
    {
      "epoch": 0.42422550934970693,
      "grad_norm": 0.08530899013880136,
      "learning_rate": 0.00014250729264088843,
      "loss": 1.2556,
      "step": 380
    },
    {
      "epoch": 0.42980742394641364,
      "grad_norm": 0.10267562745822716,
      "learning_rate": 0.00014073307573478526,
      "loss": 1.2146,
      "step": 385
    },
    {
      "epoch": 0.4353893385431203,
      "grad_norm": 0.09189285950155643,
      "learning_rate": 0.00013894334991360448,
      "loss": 1.2206,
      "step": 390
    },
    {
      "epoch": 0.44097125313982694,
      "grad_norm": 0.08370196846674145,
      "learning_rate": 0.00013713879660651068,
      "loss": 1.2076,
      "step": 395
    },
    {
      "epoch": 0.44655316773653364,
      "grad_norm": 0.08423557906306067,
      "learning_rate": 0.0001353201028881598,
      "loss": 1.2223,
      "step": 400
    },
    {
      "epoch": 0.4521350823332403,
      "grad_norm": 0.08292081122541138,
      "learning_rate": 0.00013348796121709862,
      "loss": 1.2294,
      "step": 405
    },
    {
      "epoch": 0.457716996929947,
      "grad_norm": 0.08767079524531268,
      "learning_rate": 0.00013164306917211476,
      "loss": 1.2229,
      "step": 410
    },
    {
      "epoch": 0.46329891152665365,
      "grad_norm": 0.0865942463810843,
      "learning_rate": 0.000129786129186637,
      "loss": 1.2163,
      "step": 415
    },
    {
      "epoch": 0.4688808261233603,
      "grad_norm": 0.08101515714055764,
      "learning_rate": 0.00012791784828128724,
      "loss": 1.2337,
      "step": 420
    },
    {
      "epoch": 0.474462740720067,
      "grad_norm": 0.09009147490161429,
      "learning_rate": 0.00012603893779468604,
      "loss": 1.2148,
      "step": 425
    },
    {
      "epoch": 0.48004465531677365,
      "grad_norm": 0.08757351279515291,
      "learning_rate": 0.0001241501131126138,
      "loss": 1.2056,
      "step": 430
    },
    {
      "epoch": 0.4856265699134803,
      "grad_norm": 0.08418609867162384,
      "learning_rate": 0.00012225209339563145,
      "loss": 1.2419,
      "step": 435
    },
    {
      "epoch": 0.491208484510187,
      "grad_norm": 0.08790367723325618,
      "learning_rate": 0.0001203456013052634,
      "loss": 1.2115,
      "step": 440
    },
    {
      "epoch": 0.49679039910689365,
      "grad_norm": 0.08071789319204539,
      "learning_rate": 0.00011843136272884794,
      "loss": 1.2072,
      "step": 445
    },
    {
      "epoch": 0.5023723137036004,
      "grad_norm": 0.0879278395825441,
      "learning_rate": 0.00011651010650315923,
      "loss": 1.2194,
      "step": 450
    },
    {
      "epoch": 0.507954228300307,
      "grad_norm": 0.08506166782358492,
      "learning_rate": 0.00011458256413690633,
      "loss": 1.2077,
      "step": 455
    },
    {
      "epoch": 0.5135361428970137,
      "grad_norm": 0.08984730610411729,
      "learning_rate": 0.00011264946953221496,
      "loss": 1.2484,
      "step": 460
    },
    {
      "epoch": 0.5191180574937203,
      "grad_norm": 0.2978083078661545,
      "learning_rate": 0.00011071155870519777,
      "loss": 1.2491,
      "step": 465
    },
    {
      "epoch": 0.5246999720904271,
      "grad_norm": 0.08504227931172395,
      "learning_rate": 0.00010876956950572006,
      "loss": 1.2268,
      "step": 470
    },
    {
      "epoch": 0.5302818866871337,
      "grad_norm": 0.08620167875904892,
      "learning_rate": 0.0001068242413364671,
      "loss": 1.2252,
      "step": 475
    },
    {
      "epoch": 0.5358638012838404,
      "grad_norm": 0.08669957736640198,
      "learning_rate": 0.00010487631487142017,
      "loss": 1.217,
      "step": 480
    },
    {
      "epoch": 0.541445715880547,
      "grad_norm": 0.08577871896034497,
      "learning_rate": 0.00010292653177384876,
      "loss": 1.2169,
      "step": 485
    },
    {
      "epoch": 0.5470276304772537,
      "grad_norm": 0.08417260057895289,
      "learning_rate": 0.00010097563441392581,
      "loss": 1.2354,
      "step": 490
    },
    {
      "epoch": 0.5526095450739603,
      "grad_norm": 0.08676422431924583,
      "learning_rate": 9.90243655860742e-05,
      "loss": 1.2039,
      "step": 495
    },
    {
      "epoch": 0.5581914596706671,
      "grad_norm": 0.09103906295111437,
      "learning_rate": 9.707346822615128e-05,
      "loss": 1.2194,
      "step": 500
    },
    {
      "epoch": 0.5637733742673737,
      "grad_norm": 0.08594537537719427,
      "learning_rate": 9.512368512857984e-05,
      "loss": 1.1949,
      "step": 505
    },
    {
      "epoch": 0.5693552888640804,
      "grad_norm": 0.08392759057088481,
      "learning_rate": 9.317575866353292e-05,
      "loss": 1.2196,
      "step": 510
    },
    {
      "epoch": 0.574937203460787,
      "grad_norm": 0.08201912454761111,
      "learning_rate": 9.123043049427995e-05,
      "loss": 1.2131,
      "step": 515
    },
    {
      "epoch": 0.5805191180574937,
      "grad_norm": 0.08925291750313868,
      "learning_rate": 8.928844129480227e-05,
      "loss": 1.2369,
      "step": 520
    },
    {
      "epoch": 0.5861010326542004,
      "grad_norm": 0.08954980070951671,
      "learning_rate": 8.735053046778506e-05,
      "loss": 1.2175,
      "step": 525
    },
    {
      "epoch": 0.5916829472509071,
      "grad_norm": 0.08574100993825345,
      "learning_rate": 8.541743586309365e-05,
      "loss": 1.2166,
      "step": 530
    },
    {
      "epoch": 0.5972648618476137,
      "grad_norm": 0.08840883290578404,
      "learning_rate": 8.348989349684076e-05,
      "loss": 1.2271,
      "step": 535
    },
    {
      "epoch": 0.6028467764443204,
      "grad_norm": 0.08443946017557556,
      "learning_rate": 8.156863727115211e-05,
      "loss": 1.2329,
      "step": 540
    },
    {
      "epoch": 0.608428691041027,
      "grad_norm": 0.0902640782545258,
      "learning_rate": 7.965439869473664e-05,
      "loss": 1.2253,
      "step": 545
    },
    {
      "epoch": 0.6140106056377338,
      "grad_norm": 0.08988630625422679,
      "learning_rate": 7.774790660436858e-05,
      "loss": 1.1785,
      "step": 550
    },
    {
      "epoch": 0.6195925202344404,
      "grad_norm": 0.08134808753957644,
      "learning_rate": 7.584988688738622e-05,
      "loss": 1.2261,
      "step": 555
    },
    {
      "epoch": 0.6251744348311471,
      "grad_norm": 0.08768193779762151,
      "learning_rate": 7.396106220531398e-05,
      "loss": 1.2463,
      "step": 560
    },
    {
      "epoch": 0.6307563494278537,
      "grad_norm": 0.0885816930556393,
      "learning_rate": 7.208215171871277e-05,
      "loss": 1.2141,
      "step": 565
    },
    {
      "epoch": 0.6363382640245604,
      "grad_norm": 0.08553683878588977,
      "learning_rate": 7.021387081336301e-05,
      "loss": 1.2026,
      "step": 570
    },
    {
      "epoch": 0.641920178621267,
      "grad_norm": 0.09505838067263224,
      "learning_rate": 6.835693082788525e-05,
      "loss": 1.2168,
      "step": 575
    },
    {
      "epoch": 0.6475020932179738,
      "grad_norm": 0.08769224685329463,
      "learning_rate": 6.651203878290139e-05,
      "loss": 1.2493,
      "step": 580
    },
    {
      "epoch": 0.6530840078146805,
      "grad_norm": 0.07990213288377576,
      "learning_rate": 6.46798971118402e-05,
      "loss": 1.2308,
      "step": 585
    },
    {
      "epoch": 0.6586659224113871,
      "grad_norm": 0.08133261350163556,
      "learning_rate": 6.286120339348935e-05,
      "loss": 1.2014,
      "step": 590
    },
    {
      "epoch": 0.6642478370080938,
      "grad_norm": 0.09363089434544866,
      "learning_rate": 6.105665008639557e-05,
      "loss": 1.2238,
      "step": 595
    },
    {
      "epoch": 0.6698297516048004,
      "grad_norm": 0.07910287951552411,
      "learning_rate": 5.926692426521474e-05,
      "loss": 1.2473,
      "step": 600
    },
    {
      "epoch": 0.6754116662015072,
      "grad_norm": 0.0801209902764544,
      "learning_rate": 5.749270735911158e-05,
      "loss": 1.1975,
      "step": 605
    },
    {
      "epoch": 0.6809935807982138,
      "grad_norm": 0.08087293360533905,
      "learning_rate": 5.573467489230879e-05,
      "loss": 1.1966,
      "step": 610
    },
    {
      "epoch": 0.6865754953949205,
      "grad_norm": 0.08220997258417966,
      "learning_rate": 5.399349622688479e-05,
      "loss": 1.2345,
      "step": 615
    },
    {
      "epoch": 0.6921574099916271,
      "grad_norm": 0.0825575277760057,
      "learning_rate": 5.226983430791722e-05,
      "loss": 1.2289,
      "step": 620
    },
    {
      "epoch": 0.6977393245883338,
      "grad_norm": 0.08305460425818378,
      "learning_rate": 5.0564345411070025e-05,
      "loss": 1.204,
      "step": 625
    },
    {
      "epoch": 0.7033212391850404,
      "grad_norm": 0.08011105262542664,
      "learning_rate": 4.8877678892719866e-05,
      "loss": 1.1946,
      "step": 630
    },
    {
      "epoch": 0.7089031537817472,
      "grad_norm": 0.08686069747720479,
      "learning_rate": 4.721047694271676e-05,
      "loss": 1.2,
      "step": 635
    },
    {
      "epoch": 0.7144850683784538,
      "grad_norm": 0.08537977661965272,
      "learning_rate": 4.556337433987359e-05,
      "loss": 1.2054,
      "step": 640
    },
    {
      "epoch": 0.7200669829751605,
      "grad_norm": 0.08857193949478791,
      "learning_rate": 4.393699821027716e-05,
      "loss": 1.1988,
      "step": 645
    },
    {
      "epoch": 0.7256488975718671,
      "grad_norm": 0.09608004999262602,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 1.2226,
      "step": 650
    },
    {
      "epoch": 0.7312308121685738,
      "grad_norm": 0.08235757922811432,
      "learning_rate": 4.074889418189608e-05,
      "loss": 1.2202,
      "step": 655
    },
    {
      "epoch": 0.7368127267652805,
      "grad_norm": 0.08660069823512372,
      "learning_rate": 3.9188380137791936e-05,
      "loss": 1.215,
      "step": 660
    },
    {
      "epoch": 0.7423946413619872,
      "grad_norm": 0.08090639704744831,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 1.2255,
      "step": 665
    },
    {
      "epoch": 0.7479765559586938,
      "grad_norm": 0.08082821477995833,
      "learning_rate": 3.613739855316257e-05,
      "loss": 1.2176,
      "step": 670
    },
    {
      "epoch": 0.7535584705554005,
      "grad_norm": 0.08469395080984878,
      "learning_rate": 3.46480926586325e-05,
      "loss": 1.2275,
      "step": 675
    },
    {
      "epoch": 0.7591403851521071,
      "grad_norm": 0.0871555466504494,
      "learning_rate": 3.3183669176315045e-05,
      "loss": 1.2351,
      "step": 680
    },
    {
      "epoch": 0.7647222997488139,
      "grad_norm": 0.08170223557553191,
      "learning_rate": 3.174468567813461e-05,
      "loss": 1.2074,
      "step": 685
    },
    {
      "epoch": 0.7703042143455205,
      "grad_norm": 0.0838318843856818,
      "learning_rate": 3.033169004986873e-05,
      "loss": 1.2396,
      "step": 690
    },
    {
      "epoch": 0.7758861289422272,
      "grad_norm": 0.08831381148889993,
      "learning_rate": 2.894522028254334e-05,
      "loss": 1.1947,
      "step": 695
    },
    {
      "epoch": 0.7814680435389338,
      "grad_norm": 0.08158536981215994,
      "learning_rate": 2.7585804267595384e-05,
      "loss": 1.208,
      "step": 700
    },
    {
      "epoch": 0.7870499581356405,
      "grad_norm": 0.08116519613000232,
      "learning_rate": 2.6253959595880673e-05,
      "loss": 1.2191,
      "step": 705
    },
    {
      "epoch": 0.7926318727323471,
      "grad_norm": 0.08294169676184929,
      "learning_rate": 2.495019336060387e-05,
      "loss": 1.195,
      "step": 710
    },
    {
      "epoch": 0.7982137873290539,
      "grad_norm": 0.08406756837278591,
      "learning_rate": 2.367500196424529e-05,
      "loss": 1.2203,
      "step": 715
    },
    {
      "epoch": 0.8037957019257606,
      "grad_norm": 0.08211403607563178,
      "learning_rate": 2.242887092955801e-05,
      "loss": 1.2041,
      "step": 720
    },
    {
      "epoch": 0.8093776165224672,
      "grad_norm": 0.07980978787138238,
      "learning_rate": 2.121227471470768e-05,
      "loss": 1.2394,
      "step": 725
    },
    {
      "epoch": 0.8149595311191739,
      "grad_norm": 0.08416184610807921,
      "learning_rate": 2.002567653262479e-05,
      "loss": 1.2228,
      "step": 730
    },
    {
      "epoch": 0.8205414457158805,
      "grad_norm": 0.08256062792318115,
      "learning_rate": 1.8869528174638752e-05,
      "loss": 1.203,
      "step": 735
    },
    {
      "epoch": 0.8261233603125873,
      "grad_norm": 0.09043351264554417,
      "learning_rate": 1.774426983846058e-05,
      "loss": 1.2275,
      "step": 740
    },
    {
      "epoch": 0.8317052749092939,
      "grad_norm": 0.08486147964302236,
      "learning_rate": 1.6650329960579792e-05,
      "loss": 1.2208,
      "step": 745
    },
    {
      "epoch": 0.8372871895060006,
      "grad_norm": 0.0935945466460169,
      "learning_rate": 1.5588125053139468e-05,
      "loss": 1.2131,
      "step": 750
    },
    {
      "epoch": 0.8428691041027072,
      "grad_norm": 0.08282716353976063,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 1.2284,
      "step": 755
    },
    {
      "epoch": 0.8484510186994139,
      "grad_norm": 0.08286515378820142,
      "learning_rate": 1.3560525629510568e-05,
      "loss": 1.2086,
      "step": 760
    },
    {
      "epoch": 0.8540329332961206,
      "grad_norm": 0.08295259360853054,
      "learning_rate": 1.259590311167238e-05,
      "loss": 1.2061,
      "step": 765
    },
    {
      "epoch": 0.8596148478928273,
      "grad_norm": 0.08358389042910293,
      "learning_rate": 1.166455926704082e-05,
      "loss": 1.222,
      "step": 770
    },
    {
      "epoch": 0.8651967624895339,
      "grad_norm": 0.08388863476839661,
      "learning_rate": 1.0766848700131648e-05,
      "loss": 1.2143,
      "step": 775
    },
    {
      "epoch": 0.8707786770862406,
      "grad_norm": 0.08277339984932784,
      "learning_rate": 9.903113209758096e-06,
      "loss": 1.2192,
      "step": 780
    },
    {
      "epoch": 0.8763605916829472,
      "grad_norm": 0.08938310164317657,
      "learning_rate": 9.073681658892775e-06,
      "loss": 1.2191,
      "step": 785
    },
    {
      "epoch": 0.8819425062796539,
      "grad_norm": 0.07910593096708422,
      "learning_rate": 8.278869849454718e-06,
      "loss": 1.2269,
      "step": 790
    },
    {
      "epoch": 0.8875244208763606,
      "grad_norm": 0.08295037453317607,
      "learning_rate": 7.5189804020693536e-06,
      "loss": 1.2021,
      "step": 795
    },
    {
      "epoch": 0.8931063354730673,
      "grad_norm": 0.08199446080472911,
      "learning_rate": 6.794302640847294e-06,
      "loss": 1.1961,
      "step": 800
    },
    {
      "epoch": 0.8986882500697739,
      "grad_norm": 0.08481342663212112,
      "learning_rate": 6.1051124832254944e-06,
      "loss": 1.2069,
      "step": 805
    },
    {
      "epoch": 0.9042701646664806,
      "grad_norm": 0.08217551850800063,
      "learning_rate": 5.451672334913216e-06,
      "loss": 1.2055,
      "step": 810
    },
    {
      "epoch": 0.9098520792631872,
      "grad_norm": 0.08322503504827561,
      "learning_rate": 4.834230989982213e-06,
      "loss": 1.2156,
      "step": 815
    },
    {
      "epoch": 0.915433993859894,
      "grad_norm": 0.08125961805104615,
      "learning_rate": 4.253023536139733e-06,
      "loss": 1.2005,
      "step": 820
    },
    {
      "epoch": 0.9210159084566006,
      "grad_norm": 0.09037682759604541,
      "learning_rate": 3.7082712652200867e-06,
      "loss": 1.2079,
      "step": 825
    },
    {
      "epoch": 0.9265978230533073,
      "grad_norm": 0.08711894287392291,
      "learning_rate": 3.2001815889286856e-06,
      "loss": 1.232,
      "step": 830
    },
    {
      "epoch": 0.9321797376500139,
      "grad_norm": 0.08367132801462379,
      "learning_rate": 2.728947959871353e-06,
      "loss": 1.1858,
      "step": 835
    },
    {
      "epoch": 0.9377616522467206,
      "grad_norm": 0.0809801248589102,
      "learning_rate": 2.294749797897955e-06,
      "loss": 1.1871,
      "step": 840
    },
    {
      "epoch": 0.9433435668434274,
      "grad_norm": 0.08412969109149288,
      "learning_rate": 1.8977524217893783e-06,
      "loss": 1.2248,
      "step": 845
    },
    {
      "epoch": 0.948925481440134,
      "grad_norm": 0.08014128153610968,
      "learning_rate": 1.5381069863131037e-06,
      "loss": 1.2312,
      "step": 850
    },
    {
      "epoch": 0.9545073960368406,
      "grad_norm": 0.08040835492341503,
      "learning_rate": 1.2159504246718522e-06,
      "loss": 1.2213,
      "step": 855
    },
    {
      "epoch": 0.9600893106335473,
      "grad_norm": 0.08170226749481643,
      "learning_rate": 9.314053963669245e-07,
      "loss": 1.2114,
      "step": 860
    },
    {
      "epoch": 0.965671225230254,
      "grad_norm": 0.08123838559159317,
      "learning_rate": 6.845802404962243e-07,
      "loss": 1.2455,
      "step": 865
    },
    {
      "epoch": 0.9712531398269606,
      "grad_norm": 0.08532355248950987,
      "learning_rate": 4.7556893450466653e-07,
      "loss": 1.2017,
      "step": 870
    },
    {
      "epoch": 0.9768350544236674,
      "grad_norm": 0.07935413274906811,
      "learning_rate": 3.044510584027771e-07,
      "loss": 1.203,
      "step": 875
    },
    {
      "epoch": 0.982416969020374,
      "grad_norm": 0.07922680701516337,
      "learning_rate": 1.7129176446692984e-07,
      "loss": 1.1993,
      "step": 880
    },
    {
      "epoch": 0.9879988836170807,
      "grad_norm": 0.08007277288266887,
      "learning_rate": 7.614175243301213e-08,
      "loss": 1.221,
      "step": 885
    },
    {
      "epoch": 0.9935807982137873,
      "grad_norm": 0.08190648675567455,
      "learning_rate": 1.9037250192732726e-08,
      "loss": 1.2245,
      "step": 890
    },
    {
      "epoch": 0.999162712810494,
      "grad_norm": 0.07884795604109555,
      "learning_rate": 0.0,
      "loss": 1.2359,
      "step": 895
    },
    {
      "epoch": 0.999162712810494,
      "eval_loss": 1.1748292446136475,
      "eval_runtime": 1569.4225,
      "eval_samples_per_second": 8.524,
      "eval_steps_per_second": 0.533,
      "step": 895
    },
    {
      "epoch": 0.999162712810494,
      "step": 895,
      "total_flos": 1.1254972268150784e+16,
      "train_loss": 1.2433469767011078,
      "train_runtime": 20318.3129,
      "train_samples_per_second": 2.821,
      "train_steps_per_second": 0.044
    }
  ],
  "logging_steps": 5,
  "max_steps": 895,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1254972268150784e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}