{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07476915024860742, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007476915024860742, "grad_norm": 58.74357986450195, "learning_rate": 2.4919013207077e-08, "loss": 2.1224, "step": 10 }, { "epoch": 0.0014953830049721484, "grad_norm": 52.66537094116211, "learning_rate": 4.9838026414154e-08, "loss": 2.0535, "step": 20 }, { "epoch": 0.002243074507458223, "grad_norm": 64.12317657470703, "learning_rate": 7.4757039621231e-08, "loss": 2.225, "step": 30 }, { "epoch": 0.002990766009944297, "grad_norm": 53.08683776855469, "learning_rate": 9.9676052828308e-08, "loss": 2.2341, "step": 40 }, { "epoch": 0.0037384575124303713, "grad_norm": 60.34415054321289, "learning_rate": 1.24595066035385e-07, "loss": 2.1499, "step": 50 }, { "epoch": 0.004486149014916446, "grad_norm": 59.9915657043457, "learning_rate": 1.49514079242462e-07, "loss": 2.1315, "step": 60 }, { "epoch": 0.00523384051740252, "grad_norm": 53.04794692993164, "learning_rate": 1.7443309244953902e-07, "loss": 1.8725, "step": 70 }, { "epoch": 0.005981532019888594, "grad_norm": 44.168540954589844, "learning_rate": 1.99352105656616e-07, "loss": 1.778, "step": 80 }, { "epoch": 0.006729223522374668, "grad_norm": 42.75059509277344, "learning_rate": 2.2427111886369301e-07, "loss": 1.4302, "step": 90 }, { "epoch": 0.007476915024860743, "grad_norm": 27.886425018310547, "learning_rate": 2.4919013207077e-07, "loss": 1.2755, "step": 100 }, { "epoch": 0.008224606527346816, "grad_norm": 26.356143951416016, "learning_rate": 2.74109145277847e-07, "loss": 0.9265, "step": 110 }, { "epoch": 0.008972298029832891, "grad_norm": 23.42557144165039, "learning_rate": 2.99028158484924e-07, "loss": 0.7193, "step": 120 }, { "epoch": 0.009719989532318965, "grad_norm": 20.218666076660156, "learning_rate": 3.2394717169200103e-07, "loss": 0.5458, "step": 130 }, { "epoch": 0.01046768103480504, "grad_norm": 15.449305534362793, "learning_rate": 3.4886618489907804e-07, "loss": 0.531, "step": 140 }, { "epoch": 0.011215372537291114, "grad_norm": 14.092931747436523, "learning_rate": 3.73785198106155e-07, "loss": 0.3885, "step": 150 }, { "epoch": 0.011963064039777187, "grad_norm": 14.002432823181152, "learning_rate": 3.98704211313232e-07, "loss": 0.4171, "step": 160 }, { "epoch": 0.012710755542263263, "grad_norm": 14.803778648376465, "learning_rate": 4.23623224520309e-07, "loss": 0.2906, "step": 170 }, { "epoch": 0.013458447044749336, "grad_norm": 12.300663948059082, "learning_rate": 4.4854223772738603e-07, "loss": 0.303, "step": 180 }, { "epoch": 0.01420613854723541, "grad_norm": 11.724754333496094, "learning_rate": 4.7346125093446304e-07, "loss": 0.2489, "step": 190 }, { "epoch": 0.014953830049721485, "grad_norm": 11.619375228881836, "learning_rate": 4.9838026414154e-07, "loss": 0.2303, "step": 200 }, { "epoch": 0.01570152155220756, "grad_norm": 11.649147987365723, "learning_rate": 5.23299277348617e-07, "loss": 0.296, "step": 210 }, { "epoch": 0.016449213054693632, "grad_norm": 9.424432754516602, "learning_rate": 5.48218290555694e-07, "loss": 0.2946, "step": 220 }, { "epoch": 0.017196904557179708, "grad_norm": 8.53671646118164, "learning_rate": 5.73137303762771e-07, "loss": 0.3388, "step": 230 }, { "epoch": 0.017944596059665783, "grad_norm": 7.045456886291504, "learning_rate": 5.98056316969848e-07, "loss": 0.2974, "step": 240 }, { "epoch": 0.018692287562151855, "grad_norm": 6.465863227844238, "learning_rate": 6.22975330176925e-07, "loss": 0.2693, "step": 250 }, { "epoch": 0.01943997906463793, "grad_norm": 10.001205444335938, "learning_rate": 6.478943433840021e-07, "loss": 0.252, "step": 260 }, { "epoch": 0.020187670567124005, "grad_norm": 11.637846946716309, "learning_rate": 6.728133565910791e-07, "loss": 0.2691, "step": 270 }, { "epoch": 0.02093536206961008, "grad_norm": 4.656905651092529, "learning_rate": 6.977323697981561e-07, "loss": 0.1984, "step": 280 }, { "epoch": 0.021683053572096152, "grad_norm": 5.980122089385986, "learning_rate": 7.22651383005233e-07, "loss": 0.2565, "step": 290 }, { "epoch": 0.022430745074582228, "grad_norm": 5.372318267822266, "learning_rate": 7.4757039621231e-07, "loss": 0.2042, "step": 300 }, { "epoch": 0.023178436577068303, "grad_norm": 5.082016944885254, "learning_rate": 7.72489409419387e-07, "loss": 0.1794, "step": 310 }, { "epoch": 0.023926128079554375, "grad_norm": 4.287816047668457, "learning_rate": 7.97408422626464e-07, "loss": 0.2197, "step": 320 }, { "epoch": 0.02467381958204045, "grad_norm": 5.412531852722168, "learning_rate": 8.22327435833541e-07, "loss": 0.2527, "step": 330 }, { "epoch": 0.025421511084526525, "grad_norm": 5.210222244262695, "learning_rate": 8.47246449040618e-07, "loss": 0.2304, "step": 340 }, { "epoch": 0.026169202587012597, "grad_norm": 6.744383335113525, "learning_rate": 8.72165462247695e-07, "loss": 0.2496, "step": 350 }, { "epoch": 0.026916894089498673, "grad_norm": 4.84708309173584, "learning_rate": 8.970844754547721e-07, "loss": 0.2347, "step": 360 }, { "epoch": 0.027664585591984748, "grad_norm": 4.0091633796691895, "learning_rate": 9.220034886618491e-07, "loss": 0.2366, "step": 370 }, { "epoch": 0.02841227709447082, "grad_norm": 4.7725348472595215, "learning_rate": 9.469225018689261e-07, "loss": 0.2645, "step": 380 }, { "epoch": 0.029159968596956895, "grad_norm": 6.1945414543151855, "learning_rate": 9.71841515076003e-07, "loss": 0.2052, "step": 390 }, { "epoch": 0.02990766009944297, "grad_norm": 3.8042452335357666, "learning_rate": 9.9676052828308e-07, "loss": 0.1775, "step": 400 }, { "epoch": 0.030655351601929046, "grad_norm": 5.524064540863037, "learning_rate": 1.0216795414901571e-06, "loss": 0.2121, "step": 410 }, { "epoch": 0.03140304310441512, "grad_norm": 3.3003029823303223, "learning_rate": 1.046598554697234e-06, "loss": 0.1952, "step": 420 }, { "epoch": 0.03215073460690119, "grad_norm": 5.770596981048584, "learning_rate": 1.0715175679043111e-06, "loss": 0.2236, "step": 430 }, { "epoch": 0.032898426109387265, "grad_norm": 6.644259929656982, "learning_rate": 1.096436581111388e-06, "loss": 0.2303, "step": 440 }, { "epoch": 0.03364611761187334, "grad_norm": 2.906667470932007, "learning_rate": 1.1213555943184652e-06, "loss": 0.2226, "step": 450 }, { "epoch": 0.034393809114359415, "grad_norm": 4.669227600097656, "learning_rate": 1.146274607525542e-06, "loss": 0.2409, "step": 460 }, { "epoch": 0.03514150061684549, "grad_norm": 2.6083242893218994, "learning_rate": 1.1711936207326192e-06, "loss": 0.2273, "step": 470 }, { "epoch": 0.035889192119331566, "grad_norm": 6.719790458679199, "learning_rate": 1.196112633939696e-06, "loss": 0.2079, "step": 480 }, { "epoch": 0.03663688362181764, "grad_norm": 4.7992167472839355, "learning_rate": 1.2210316471467732e-06, "loss": 0.2181, "step": 490 }, { "epoch": 0.03738457512430371, "grad_norm": 3.172945261001587, "learning_rate": 1.24595066035385e-06, "loss": 0.2258, "step": 500 }, { "epoch": 0.038132266626789785, "grad_norm": 3.8808019161224365, "learning_rate": 1.2708696735609272e-06, "loss": 0.2436, "step": 510 }, { "epoch": 0.03887995812927586, "grad_norm": 4.4543256759643555, "learning_rate": 1.2957886867680041e-06, "loss": 0.2319, "step": 520 }, { "epoch": 0.039627649631761935, "grad_norm": 5.0955491065979, "learning_rate": 1.3207076999750812e-06, "loss": 0.2302, "step": 530 }, { "epoch": 0.04037534113424801, "grad_norm": 3.1149497032165527, "learning_rate": 1.3456267131821581e-06, "loss": 0.2158, "step": 540 }, { "epoch": 0.041123032636734086, "grad_norm": 3.299131155014038, "learning_rate": 1.3705457263892353e-06, "loss": 0.2355, "step": 550 }, { "epoch": 0.04187072413922016, "grad_norm": 6.149620532989502, "learning_rate": 1.3954647395963122e-06, "loss": 0.243, "step": 560 }, { "epoch": 0.04261841564170623, "grad_norm": 3.9118595123291016, "learning_rate": 1.420383752803389e-06, "loss": 0.2242, "step": 570 }, { "epoch": 0.043366107144192305, "grad_norm": 2.8375048637390137, "learning_rate": 1.445302766010466e-06, "loss": 0.2153, "step": 580 }, { "epoch": 0.04411379864667838, "grad_norm": 3.6035165786743164, "learning_rate": 1.470221779217543e-06, "loss": 0.2533, "step": 590 }, { "epoch": 0.044861490149164455, "grad_norm": 2.9740023612976074, "learning_rate": 1.49514079242462e-06, "loss": 0.2069, "step": 600 }, { "epoch": 0.04560918165165053, "grad_norm": 1.984499454498291, "learning_rate": 1.5200598056316971e-06, "loss": 0.2035, "step": 610 }, { "epoch": 0.046356873154136606, "grad_norm": 3.3487439155578613, "learning_rate": 1.544978818838774e-06, "loss": 0.1891, "step": 620 }, { "epoch": 0.047104564656622674, "grad_norm": 3.0334084033966064, "learning_rate": 1.5698978320458511e-06, "loss": 0.2165, "step": 630 }, { "epoch": 0.04785225615910875, "grad_norm": 2.650395631790161, "learning_rate": 1.594816845252928e-06, "loss": 0.242, "step": 640 }, { "epoch": 0.048599947661594825, "grad_norm": 4.511347770690918, "learning_rate": 1.6197358584600052e-06, "loss": 0.2383, "step": 650 }, { "epoch": 0.0493476391640809, "grad_norm": 3.0503461360931396, "learning_rate": 1.644654871667082e-06, "loss": 0.2059, "step": 660 }, { "epoch": 0.050095330666566976, "grad_norm": 3.2818543910980225, "learning_rate": 1.6695738848741592e-06, "loss": 0.2051, "step": 670 }, { "epoch": 0.05084302216905305, "grad_norm": 3.457136392593384, "learning_rate": 1.694492898081236e-06, "loss": 0.2139, "step": 680 }, { "epoch": 0.051590713671539126, "grad_norm": 2.4822287559509277, "learning_rate": 1.7194119112883132e-06, "loss": 0.2211, "step": 690 }, { "epoch": 0.052338405174025195, "grad_norm": 3.445197820663452, "learning_rate": 1.74433092449539e-06, "loss": 0.2233, "step": 700 }, { "epoch": 0.05308609667651127, "grad_norm": 2.7770423889160156, "learning_rate": 1.7692499377024672e-06, "loss": 0.2509, "step": 710 }, { "epoch": 0.053833788178997345, "grad_norm": 5.7018513679504395, "learning_rate": 1.7941689509095441e-06, "loss": 0.2181, "step": 720 }, { "epoch": 0.05458147968148342, "grad_norm": 2.4717321395874023, "learning_rate": 1.8190879641166212e-06, "loss": 0.2241, "step": 730 }, { "epoch": 0.055329171183969496, "grad_norm": 3.148643732070923, "learning_rate": 1.8440069773236981e-06, "loss": 0.2172, "step": 740 }, { "epoch": 0.05607686268645557, "grad_norm": 2.6590192317962646, "learning_rate": 1.8689259905307753e-06, "loss": 0.1997, "step": 750 }, { "epoch": 0.05682455418894164, "grad_norm": 3.944225311279297, "learning_rate": 1.8938450037378522e-06, "loss": 0.2584, "step": 760 }, { "epoch": 0.057572245691427715, "grad_norm": 2.639666795730591, "learning_rate": 1.9187640169449293e-06, "loss": 0.2518, "step": 770 }, { "epoch": 0.05831993719391379, "grad_norm": 2.579602003097534, "learning_rate": 1.943683030152006e-06, "loss": 0.2181, "step": 780 }, { "epoch": 0.059067628696399865, "grad_norm": 3.810966968536377, "learning_rate": 1.968602043359083e-06, "loss": 0.2164, "step": 790 }, { "epoch": 0.05981532019888594, "grad_norm": 2.8656558990478516, "learning_rate": 1.99352105656616e-06, "loss": 0.258, "step": 800 }, { "epoch": 0.060563011701372016, "grad_norm": 2.6037025451660156, "learning_rate": 2.018440069773237e-06, "loss": 0.2307, "step": 810 }, { "epoch": 0.06131070320385809, "grad_norm": 2.0483126640319824, "learning_rate": 2.0433590829803142e-06, "loss": 0.2399, "step": 820 }, { "epoch": 0.06205839470634416, "grad_norm": 2.1348931789398193, "learning_rate": 2.068278096187391e-06, "loss": 0.2341, "step": 830 }, { "epoch": 0.06280608620883024, "grad_norm": 2.779850959777832, "learning_rate": 2.093197109394468e-06, "loss": 0.2077, "step": 840 }, { "epoch": 0.06355377771131632, "grad_norm": 3.0291380882263184, "learning_rate": 2.118116122601545e-06, "loss": 0.2576, "step": 850 }, { "epoch": 0.06430146921380238, "grad_norm": 2.3847033977508545, "learning_rate": 2.1430351358086223e-06, "loss": 0.2174, "step": 860 }, { "epoch": 0.06504916071628845, "grad_norm": 2.7788398265838623, "learning_rate": 2.167954149015699e-06, "loss": 0.313, "step": 870 }, { "epoch": 0.06579685221877453, "grad_norm": 3.205575466156006, "learning_rate": 2.192873162222776e-06, "loss": 0.2412, "step": 880 }, { "epoch": 0.0665445437212606, "grad_norm": 3.200623035430908, "learning_rate": 2.217792175429853e-06, "loss": 0.1836, "step": 890 }, { "epoch": 0.06729223522374668, "grad_norm": 4.147164344787598, "learning_rate": 2.2427111886369303e-06, "loss": 0.1856, "step": 900 }, { "epoch": 0.06803992672623276, "grad_norm": 2.339167833328247, "learning_rate": 2.267630201844007e-06, "loss": 0.2139, "step": 910 }, { "epoch": 0.06878761822871883, "grad_norm": 2.250478744506836, "learning_rate": 2.292549215051084e-06, "loss": 0.2164, "step": 920 }, { "epoch": 0.0695353097312049, "grad_norm": 2.227017641067505, "learning_rate": 2.317468228258161e-06, "loss": 0.2217, "step": 930 }, { "epoch": 0.07028300123369098, "grad_norm": 2.296231269836426, "learning_rate": 2.3423872414652383e-06, "loss": 0.1947, "step": 940 }, { "epoch": 0.07103069273617706, "grad_norm": 1.9884122610092163, "learning_rate": 2.3673062546723153e-06, "loss": 0.2421, "step": 950 }, { "epoch": 0.07177838423866313, "grad_norm": 2.3243603706359863, "learning_rate": 2.392225267879392e-06, "loss": 0.1988, "step": 960 }, { "epoch": 0.0725260757411492, "grad_norm": 2.440160036087036, "learning_rate": 2.417144281086469e-06, "loss": 0.2126, "step": 970 }, { "epoch": 0.07327376724363528, "grad_norm": 2.4434638023376465, "learning_rate": 2.4420632942935464e-06, "loss": 0.219, "step": 980 }, { "epoch": 0.07402145874612134, "grad_norm": 12.352116584777832, "learning_rate": 2.4669823075006233e-06, "loss": 0.2414, "step": 990 }, { "epoch": 0.07476915024860742, "grad_norm": 2.0996179580688477, "learning_rate": 2.4919013207077e-06, "loss": 0.2139, "step": 1000 } ], "logging_steps": 10, "max_steps": 40122, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5826787355262976e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }