{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996770721205597, "eval_steps": 500, "global_step": 1392, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021528525296017224, "grad_norm": 1.8190886974334717, "learning_rate": 2.9996179993481906e-05, "loss": 0.264, "step": 10 }, { "epoch": 0.04305705059203445, "grad_norm": 4.23043966293335, "learning_rate": 2.9984721919587606e-05, "loss": 0.1028, "step": 20 }, { "epoch": 0.06458557588805167, "grad_norm": 1.842679738998413, "learning_rate": 2.996563161430602e-05, "loss": 0.114, "step": 30 }, { "epoch": 0.0861141011840689, "grad_norm": 4.223649978637695, "learning_rate": 2.9938918800982563e-05, "loss": 0.0948, "step": 40 }, { "epoch": 0.10764262648008611, "grad_norm": 2.1200666427612305, "learning_rate": 2.9904597085366708e-05, "loss": 0.1096, "step": 50 }, { "epoch": 0.12917115177610333, "grad_norm": 2.793856143951416, "learning_rate": 2.9862683948682103e-05, "loss": 0.0956, "step": 60 }, { "epoch": 0.15069967707212056, "grad_norm": 1.9462778568267822, "learning_rate": 2.9813200738722784e-05, "loss": 0.1017, "step": 70 }, { "epoch": 0.1722282023681378, "grad_norm": 2.255049228668213, "learning_rate": 2.975617265898004e-05, "loss": 0.0694, "step": 80 }, { "epoch": 0.193756727664155, "grad_norm": 1.4251642227172852, "learning_rate": 2.9691628755805377e-05, "loss": 0.069, "step": 90 }, { "epoch": 0.21528525296017223, "grad_norm": 1.512846827507019, "learning_rate": 2.961960190361624e-05, "loss": 0.0861, "step": 100 }, { "epoch": 0.23681377825618946, "grad_norm": 1.1422572135925293, "learning_rate": 2.9540128788151935e-05, "loss": 0.0829, "step": 110 }, { "epoch": 0.25834230355220666, "grad_norm": 3.0731289386749268, "learning_rate": 2.9453249887788343e-05, "loss": 0.0811, "step": 120 }, { "epoch": 0.2798708288482239, "grad_norm": 3.031052350997925, "learning_rate": 2.9359009452920893e-05, "loss": 0.0762, "step": 130 }, { "epoch": 0.3013993541442411, "grad_norm": 2.248966932296753, "learning_rate": 2.925745548342631e-05, "loss": 0.0835, "step": 140 }, { "epoch": 0.32292787944025836, "grad_norm": 0.9142462611198425, "learning_rate": 2.9148639704214645e-05, "loss": 0.074, "step": 150 }, { "epoch": 0.3444564047362756, "grad_norm": 2.3527843952178955, "learning_rate": 2.9032617538884018e-05, "loss": 0.0674, "step": 160 }, { "epoch": 0.36598493003229277, "grad_norm": 2.349313259124756, "learning_rate": 2.890944808149146e-05, "loss": 0.0934, "step": 170 }, { "epoch": 0.38751345532831, "grad_norm": 0.6645804643630981, "learning_rate": 2.877919406645433e-05, "loss": 0.0759, "step": 180 }, { "epoch": 0.40904198062432723, "grad_norm": 1.5764023065567017, "learning_rate": 2.864192183659747e-05, "loss": 0.0725, "step": 190 }, { "epoch": 0.43057050592034446, "grad_norm": 2.184178590774536, "learning_rate": 2.84977013093626e-05, "loss": 0.0542, "step": 200 }, { "epoch": 0.4520990312163617, "grad_norm": 1.8497698307037354, "learning_rate": 2.8346605941196927e-05, "loss": 0.0837, "step": 210 }, { "epoch": 0.4736275565123789, "grad_norm": 1.5373315811157227, "learning_rate": 2.818871269013928e-05, "loss": 0.0717, "step": 220 }, { "epoch": 0.4951560818083961, "grad_norm": 1.3783589601516724, "learning_rate": 2.8024101976622762e-05, "loss": 0.0577, "step": 230 }, { "epoch": 0.5166846071044133, "grad_norm": 4.914410591125488, "learning_rate": 2.7852857642513838e-05, "loss": 0.0705, "step": 240 }, { "epoch": 0.5382131324004306, "grad_norm": 0.8398504853248596, "learning_rate": 2.7675066908408852e-05, "loss": 0.0716, "step": 250 }, { "epoch": 0.5597416576964478, "grad_norm": 1.0903675556182861, "learning_rate": 2.7490820329209546e-05, "loss": 0.08, "step": 260 }, { "epoch": 0.581270182992465, "grad_norm": 1.7572460174560547, "learning_rate": 2.7300211748000386e-05, "loss": 0.0741, "step": 270 }, { "epoch": 0.6027987082884823, "grad_norm": 1.668867588043213, "learning_rate": 2.7103338248251055e-05, "loss": 0.0631, "step": 280 }, { "epoch": 0.6243272335844995, "grad_norm": 1.9639641046524048, "learning_rate": 2.6900300104368527e-05, "loss": 0.0802, "step": 290 }, { "epoch": 0.6458557588805167, "grad_norm": 1.3819113969802856, "learning_rate": 2.6691200730623874e-05, "loss": 0.0647, "step": 300 }, { "epoch": 0.667384284176534, "grad_norm": 1.6586377620697021, "learning_rate": 2.6476146628479847e-05, "loss": 0.0626, "step": 310 }, { "epoch": 0.6889128094725512, "grad_norm": 0.7640856504440308, "learning_rate": 2.6255247332346036e-05, "loss": 0.0717, "step": 320 }, { "epoch": 0.7104413347685683, "grad_norm": 0.8930771350860596, "learning_rate": 2.602861535378925e-05, "loss": 0.0617, "step": 330 }, { "epoch": 0.7319698600645855, "grad_norm": 0.9496339559555054, "learning_rate": 2.5796366124227532e-05, "loss": 0.0672, "step": 340 }, { "epoch": 0.7534983853606028, "grad_norm": 3.019853115081787, "learning_rate": 2.5558617936136984e-05, "loss": 0.0702, "step": 350 }, { "epoch": 0.77502691065662, "grad_norm": 0.9336963295936584, "learning_rate": 2.531549188280135e-05, "loss": 0.0697, "step": 360 }, { "epoch": 0.7965554359526372, "grad_norm": 0.7075727581977844, "learning_rate": 2.50671117966351e-05, "loss": 0.074, "step": 370 }, { "epoch": 0.8180839612486545, "grad_norm": 0.5153305530548096, "learning_rate": 2.481360418611132e-05, "loss": 0.0566, "step": 380 }, { "epoch": 0.8396124865446717, "grad_norm": 0.5062828660011292, "learning_rate": 2.4555098171326616e-05, "loss": 0.0792, "step": 390 }, { "epoch": 0.8611410118406889, "grad_norm": 1.255761742591858, "learning_rate": 2.4291725418235848e-05, "loss": 0.0445, "step": 400 }, { "epoch": 0.8826695371367062, "grad_norm": 0.9719372391700745, "learning_rate": 2.4023620071590147e-05, "loss": 0.0553, "step": 410 }, { "epoch": 0.9041980624327234, "grad_norm": 1.868668794631958, "learning_rate": 2.3750918686612414e-05, "loss": 0.0555, "step": 420 }, { "epoch": 0.9257265877287406, "grad_norm": 0.34430617094039917, "learning_rate": 2.3473760159445058e-05, "loss": 0.0611, "step": 430 }, { "epoch": 0.9472551130247578, "grad_norm": 1.189942717552185, "learning_rate": 2.3192285656405456e-05, "loss": 0.0571, "step": 440 }, { "epoch": 0.9687836383207751, "grad_norm": 0.5107014179229736, "learning_rate": 2.2906638542085117e-05, "loss": 0.0635, "step": 450 }, { "epoch": 0.9903121636167922, "grad_norm": 0.685809850692749, "learning_rate": 2.2616964306329183e-05, "loss": 0.0584, "step": 460 }, { "epoch": 1.0118406889128095, "grad_norm": 3.305742025375366, "learning_rate": 2.2323410490133485e-05, "loss": 0.0569, "step": 470 }, { "epoch": 1.0333692142088267, "grad_norm": 1.87465500831604, "learning_rate": 2.2026126610496852e-05, "loss": 0.0481, "step": 480 }, { "epoch": 1.054897739504844, "grad_norm": 0.7248936295509338, "learning_rate": 2.172526408426702e-05, "loss": 0.0295, "step": 490 }, { "epoch": 1.0764262648008611, "grad_norm": 0.670519232749939, "learning_rate": 2.1420976151018813e-05, "loss": 0.0385, "step": 500 }, { "epoch": 1.0979547900968785, "grad_norm": 1.4730095863342285, "learning_rate": 2.1113417795004016e-05, "loss": 0.063, "step": 510 }, { "epoch": 1.1194833153928956, "grad_norm": 1.3478758335113525, "learning_rate": 2.0802745666212592e-05, "loss": 0.0528, "step": 520 }, { "epoch": 1.141011840688913, "grad_norm": 0.6316215991973877, "learning_rate": 2.048911800058546e-05, "loss": 0.0347, "step": 530 }, { "epoch": 1.16254036598493, "grad_norm": 1.4956326484680176, "learning_rate": 2.0172694539419557e-05, "loss": 0.049, "step": 540 }, { "epoch": 1.1840688912809472, "grad_norm": 1.1988089084625244, "learning_rate": 1.9853636448006094e-05, "loss": 0.0471, "step": 550 }, { "epoch": 1.2055974165769645, "grad_norm": 1.2572044134140015, "learning_rate": 1.953210623354359e-05, "loss": 0.06, "step": 560 }, { "epoch": 1.2271259418729816, "grad_norm": 0.7759698033332825, "learning_rate": 1.9208267662367378e-05, "loss": 0.043, "step": 570 }, { "epoch": 1.248654467168999, "grad_norm": 1.9407209157943726, "learning_rate": 1.888228567653781e-05, "loss": 0.051, "step": 580 }, { "epoch": 1.270182992465016, "grad_norm": 1.0966278314590454, "learning_rate": 1.8554326309829654e-05, "loss": 0.0359, "step": 590 }, { "epoch": 1.2917115177610334, "grad_norm": 2.063629150390625, "learning_rate": 1.8224556603165363e-05, "loss": 0.0484, "step": 600 }, { "epoch": 1.3132400430570506, "grad_norm": 1.6178653240203857, "learning_rate": 1.7893144519535468e-05, "loss": 0.045, "step": 610 }, { "epoch": 1.334768568353068, "grad_norm": 0.26466497778892517, "learning_rate": 1.7560258858449248e-05, "loss": 0.0528, "step": 620 }, { "epoch": 1.356297093649085, "grad_norm": 1.890158772468567, "learning_rate": 1.7226069169959393e-05, "loss": 0.0527, "step": 630 }, { "epoch": 1.3778256189451024, "grad_norm": 1.3726129531860352, "learning_rate": 1.689074566830434e-05, "loss": 0.0389, "step": 640 }, { "epoch": 1.3993541442411195, "grad_norm": 1.0230239629745483, "learning_rate": 1.655445914521236e-05, "loss": 0.0506, "step": 650 }, { "epoch": 1.4208826695371366, "grad_norm": 0.8005169630050659, "learning_rate": 1.621738088291147e-05, "loss": 0.0455, "step": 660 }, { "epoch": 1.442411194833154, "grad_norm": 1.1895893812179565, "learning_rate": 1.587968256688955e-05, "loss": 0.039, "step": 670 }, { "epoch": 1.4639397201291713, "grad_norm": 1.9981929063796997, "learning_rate": 1.5541536198449044e-05, "loss": 0.0512, "step": 680 }, { "epoch": 1.4854682454251884, "grad_norm": 1.5658233165740967, "learning_rate": 1.5203114007100828e-05, "loss": 0.0263, "step": 690 }, { "epoch": 1.5069967707212055, "grad_norm": 2.838642120361328, "learning_rate": 1.4864588362841808e-05, "loss": 0.0481, "step": 700 }, { "epoch": 1.5285252960172229, "grad_norm": 0.6982723474502563, "learning_rate": 1.4526131688360996e-05, "loss": 0.0417, "step": 710 }, { "epoch": 1.55005382131324, "grad_norm": 1.7505388259887695, "learning_rate": 1.4187916371218739e-05, "loss": 0.0486, "step": 720 }, { "epoch": 1.571582346609257, "grad_norm": 2.41610050201416, "learning_rate": 1.3850114676043837e-05, "loss": 0.0249, "step": 730 }, { "epoch": 1.5931108719052745, "grad_norm": 1.3201218843460083, "learning_rate": 1.3512898656793283e-05, "loss": 0.042, "step": 740 }, { "epoch": 1.6146393972012918, "grad_norm": 0.9440786838531494, "learning_rate": 1.3176440069119275e-05, "loss": 0.0592, "step": 750 }, { "epoch": 1.636167922497309, "grad_norm": 0.5338843464851379, "learning_rate": 1.2840910282888211e-05, "loss": 0.0405, "step": 760 }, { "epoch": 1.657696447793326, "grad_norm": 1.0818413496017456, "learning_rate": 1.2506480194896155e-05, "loss": 0.0508, "step": 770 }, { "epoch": 1.6792249730893434, "grad_norm": 1.209283471107483, "learning_rate": 1.2173320141825232e-05, "loss": 0.0342, "step": 780 }, { "epoch": 1.7007534983853607, "grad_norm": 2.5324923992156982, "learning_rate": 1.1841599813485341e-05, "loss": 0.046, "step": 790 }, { "epoch": 1.7222820236813778, "grad_norm": 1.514676809310913, "learning_rate": 1.1511488166385349e-05, "loss": 0.0348, "step": 800 }, { "epoch": 1.743810548977395, "grad_norm": 1.4090155363082886, "learning_rate": 1.1183153337677734e-05, "loss": 0.0455, "step": 810 }, { "epoch": 1.7653390742734123, "grad_norm": 2.2600796222686768, "learning_rate": 1.0856762559520605e-05, "loss": 0.0542, "step": 820 }, { "epoch": 1.7868675995694296, "grad_norm": 1.2120071649551392, "learning_rate": 1.0532482073900628e-05, "loss": 0.0323, "step": 830 }, { "epoch": 1.8083961248654468, "grad_norm": 1.3877032995224, "learning_rate": 1.0210477047960303e-05, "loss": 0.0456, "step": 840 }, { "epoch": 1.8299246501614639, "grad_norm": 0.9278028607368469, "learning_rate": 9.89091148987269e-06, "loss": 0.037, "step": 850 }, { "epoch": 1.8514531754574812, "grad_norm": 2.1230030059814453, "learning_rate": 9.573948165306438e-06, "loss": 0.0452, "step": 860 }, { "epoch": 1.8729817007534983, "grad_norm": 0.6858197450637817, "learning_rate": 9.259748514523654e-06, "loss": 0.0536, "step": 870 }, { "epoch": 1.8945102260495155, "grad_norm": 1.1023917198181152, "learning_rate": 8.948472570152874e-06, "loss": 0.0553, "step": 880 }, { "epoch": 1.9160387513455328, "grad_norm": 0.5614004731178284, "learning_rate": 8.64027887567895e-06, "loss": 0.0479, "step": 890 }, { "epoch": 1.9375672766415502, "grad_norm": 1.0492910146713257, "learning_rate": 8.33532440469145e-06, "loss": 0.0438, "step": 900 }, { "epoch": 1.9590958019375673, "grad_norm": 0.30423790216445923, "learning_rate": 8.033764480932616e-06, "loss": 0.028, "step": 910 }, { "epoch": 1.9806243272335844, "grad_norm": 1.6426568031311035, "learning_rate": 7.735752699185711e-06, "loss": 0.0574, "step": 920 }, { "epoch": 2.0021528525296017, "grad_norm": 1.1288621425628662, "learning_rate": 7.441440847043883e-06, "loss": 0.0255, "step": 930 }, { "epoch": 2.023681377825619, "grad_norm": 0.26666760444641113, "learning_rate": 7.150978827599619e-06, "loss": 0.028, "step": 940 }, { "epoch": 2.045209903121636, "grad_norm": 0.33629775047302246, "learning_rate": 6.864514583093911e-06, "loss": 0.0178, "step": 950 }, { "epoch": 2.0667384284176533, "grad_norm": 0.4371579885482788, "learning_rate": 6.582194019564266e-06, "loss": 0.0197, "step": 960 }, { "epoch": 2.0882669537136707, "grad_norm": 1.305396318435669, "learning_rate": 6.304160932529721e-06, "loss": 0.03, "step": 970 }, { "epoch": 2.109795479009688, "grad_norm": 6.668363571166992, "learning_rate": 6.0305569337509225e-06, "loss": 0.0309, "step": 980 }, { "epoch": 2.131324004305705, "grad_norm": 1.8910939693450928, "learning_rate": 5.761521379102343e-06, "loss": 0.0262, "step": 990 }, { "epoch": 2.1528525296017222, "grad_norm": 1.481408953666687, "learning_rate": 5.497191297593647e-06, "loss": 0.0337, "step": 1000 }, { "epoch": 2.1743810548977396, "grad_norm": 1.0818077325820923, "learning_rate": 5.237701321576063e-06, "loss": 0.0365, "step": 1010 }, { "epoch": 2.195909580193757, "grad_norm": 1.0381739139556885, "learning_rate": 4.98318361816957e-06, "loss": 0.0228, "step": 1020 }, { "epoch": 2.217438105489774, "grad_norm": 0.31783393025398254, "learning_rate": 4.733767821945621e-06, "loss": 0.0278, "step": 1030 }, { "epoch": 2.238966630785791, "grad_norm": 2.5186619758605957, "learning_rate": 4.4895809688998655e-06, "loss": 0.0302, "step": 1040 }, { "epoch": 2.2604951560818085, "grad_norm": 0.6198469400405884, "learning_rate": 4.25074743174833e-06, "loss": 0.0138, "step": 1050 }, { "epoch": 2.282023681377826, "grad_norm": 0.8775982856750488, "learning_rate": 4.017388856580178e-06, "loss": 0.0218, "step": 1060 }, { "epoch": 2.3035522066738428, "grad_norm": 0.4356814920902252, "learning_rate": 3.7896241008991596e-06, "loss": 0.0284, "step": 1070 }, { "epoch": 2.32508073196986, "grad_norm": 1.0270265340805054, "learning_rate": 3.567569173085455e-06, "loss": 0.0169, "step": 1080 }, { "epoch": 2.3466092572658774, "grad_norm": 1.2356810569763184, "learning_rate": 3.351337173308607e-06, "loss": 0.0145, "step": 1090 }, { "epoch": 2.3681377825618943, "grad_norm": 0.17152564227581024, "learning_rate": 3.1410382359217645e-06, "loss": 0.0249, "step": 1100 }, { "epoch": 2.3896663078579117, "grad_norm": 0.13272231817245483, "learning_rate": 2.9367794733664637e-06, "loss": 0.0296, "step": 1110 }, { "epoch": 2.411194833153929, "grad_norm": 2.4926042556762695, "learning_rate": 2.7386649216166233e-06, "loss": 0.031, "step": 1120 }, { "epoch": 2.4327233584499464, "grad_norm": 0.5246890783309937, "learning_rate": 2.546795487189436e-06, "loss": 0.0294, "step": 1130 }, { "epoch": 2.4542518837459633, "grad_norm": 1.739809513092041, "learning_rate": 2.361268895750264e-06, "loss": 0.0352, "step": 1140 }, { "epoch": 2.4757804090419806, "grad_norm": 0.07230955362319946, "learning_rate": 2.1821796423375766e-06, "loss": 0.0177, "step": 1150 }, { "epoch": 2.497308934337998, "grad_norm": 1.795920491218567, "learning_rate": 2.0096189432334194e-06, "loss": 0.032, "step": 1160 }, { "epoch": 2.518837459634015, "grad_norm": 0.4120383560657501, "learning_rate": 1.843674689503846e-06, "loss": 0.0244, "step": 1170 }, { "epoch": 2.540365984930032, "grad_norm": 1.3315762281417847, "learning_rate": 1.6844314022329676e-06, "loss": 0.0126, "step": 1180 }, { "epoch": 2.5618945102260495, "grad_norm": 0.9914199709892273, "learning_rate": 1.5319701894735023e-06, "loss": 0.022, "step": 1190 }, { "epoch": 2.583423035522067, "grad_norm": 0.9357948303222656, "learning_rate": 1.3863687049356465e-06, "loss": 0.0181, "step": 1200 }, { "epoch": 2.604951560818084, "grad_norm": 2.104593515396118, "learning_rate": 1.247701108435394e-06, "loss": 0.0241, "step": 1210 }, { "epoch": 2.626480086114101, "grad_norm": 1.0621205568313599, "learning_rate": 1.116038028122413e-06, "loss": 0.0292, "step": 1220 }, { "epoch": 2.6480086114101185, "grad_norm": 1.7859629392623901, "learning_rate": 9.914465245067022e-07, "loss": 0.0201, "step": 1230 }, { "epoch": 2.669537136706136, "grad_norm": 1.8932825326919556, "learning_rate": 8.7399005630238e-07, "loss": 0.0313, "step": 1240 }, { "epoch": 2.6910656620021527, "grad_norm": 1.2083765268325806, "learning_rate": 7.637284481059998e-07, "loss": 0.0311, "step": 1250 }, { "epoch": 2.71259418729817, "grad_norm": 0.1731128990650177, "learning_rate": 6.607178599258268e-07, "loss": 0.0134, "step": 1260 }, { "epoch": 2.7341227125941874, "grad_norm": 1.8263607025146484, "learning_rate": 5.650107585776348e-07, "loss": 0.0348, "step": 1270 }, { "epoch": 2.7556512378902047, "grad_norm": 1.52913498878479, "learning_rate": 4.766558909615504e-07, "loss": 0.0238, "step": 1280 }, { "epoch": 2.7771797631862216, "grad_norm": 1.0334974527359009, "learning_rate": 3.9569825923360503e-07, "loss": 0.0285, "step": 1290 }, { "epoch": 2.798708288482239, "grad_norm": 0.5131074786186218, "learning_rate": 3.22179097884579e-07, "loss": 0.0284, "step": 1300 }, { "epoch": 2.8202368137782563, "grad_norm": 0.869399905204773, "learning_rate": 2.5613585273788264e-07, "loss": 0.0312, "step": 1310 }, { "epoch": 2.841765339074273, "grad_norm": 1.1290533542633057, "learning_rate": 1.9760216187710788e-07, "loss": 0.0259, "step": 1320 }, { "epoch": 2.8632938643702905, "grad_norm": 0.23688088357448578, "learning_rate": 1.4660783851300318e-07, "loss": 0.0263, "step": 1330 }, { "epoch": 2.884822389666308, "grad_norm": 1.1585010290145874, "learning_rate": 1.0317885579858522e-07, "loss": 0.0175, "step": 1340 }, { "epoch": 2.9063509149623252, "grad_norm": 0.5305848717689514, "learning_rate": 6.733733360012761e-08, "loss": 0.0379, "step": 1350 }, { "epoch": 2.9278794402583426, "grad_norm": 1.1688823699951172, "learning_rate": 3.910152723075322e-08, "loss": 0.0401, "step": 1360 }, { "epoch": 2.9494079655543595, "grad_norm": 1.1842941045761108, "learning_rate": 1.848581815237671e-08, "loss": 0.0174, "step": 1370 }, { "epoch": 2.970936490850377, "grad_norm": 0.9176095724105835, "learning_rate": 5.50070665074065e-09, "loss": 0.0218, "step": 1380 }, { "epoch": 2.9924650161463937, "grad_norm": 2.5070579051971436, "learning_rate": 1.5280648725357615e-10, "loss": 0.0288, "step": 1390 }, { "epoch": 2.996770721205597, "step": 1392, "total_flos": 2.2176668825577062e+17, "train_loss": 0.04948104658677917, "train_runtime": 1703.925, "train_samples_per_second": 6.543, "train_steps_per_second": 0.817 } ], "logging_steps": 10, "max_steps": 1392, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2176668825577062e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }