{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996770721205597,
  "eval_steps": 500,
  "global_step": 1392,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.021528525296017224,
      "grad_norm": 1.8190886974334717,
      "learning_rate": 2.9996179993481906e-05,
      "loss": 0.264,
      "step": 10
    },
    {
      "epoch": 0.04305705059203445,
      "grad_norm": 4.23043966293335,
      "learning_rate": 2.9984721919587606e-05,
      "loss": 0.1028,
      "step": 20
    },
    {
      "epoch": 0.06458557588805167,
      "grad_norm": 1.842679738998413,
      "learning_rate": 2.996563161430602e-05,
      "loss": 0.114,
      "step": 30
    },
    {
      "epoch": 0.0861141011840689,
      "grad_norm": 4.223649978637695,
      "learning_rate": 2.9938918800982563e-05,
      "loss": 0.0948,
      "step": 40
    },
    {
      "epoch": 0.10764262648008611,
      "grad_norm": 2.1200666427612305,
      "learning_rate": 2.9904597085366708e-05,
      "loss": 0.1096,
      "step": 50
    },
    {
      "epoch": 0.12917115177610333,
      "grad_norm": 2.793856143951416,
      "learning_rate": 2.9862683948682103e-05,
      "loss": 0.0956,
      "step": 60
    },
    {
      "epoch": 0.15069967707212056,
      "grad_norm": 1.9462778568267822,
      "learning_rate": 2.9813200738722784e-05,
      "loss": 0.1017,
      "step": 70
    },
    {
      "epoch": 0.1722282023681378,
      "grad_norm": 2.255049228668213,
      "learning_rate": 2.975617265898004e-05,
      "loss": 0.0694,
      "step": 80
    },
    {
      "epoch": 0.193756727664155,
      "grad_norm": 1.4251642227172852,
      "learning_rate": 2.9691628755805377e-05,
      "loss": 0.069,
      "step": 90
    },
    {
      "epoch": 0.21528525296017223,
      "grad_norm": 1.512846827507019,
      "learning_rate": 2.961960190361624e-05,
      "loss": 0.0861,
      "step": 100
    },
    {
      "epoch": 0.23681377825618946,
      "grad_norm": 1.1422572135925293,
      "learning_rate": 2.9540128788151935e-05,
      "loss": 0.0829,
      "step": 110
    },
    {
      "epoch": 0.25834230355220666,
      "grad_norm": 3.0731289386749268,
      "learning_rate": 2.9453249887788343e-05,
      "loss": 0.0811,
      "step": 120
    },
    {
      "epoch": 0.2798708288482239,
      "grad_norm": 3.031052350997925,
      "learning_rate": 2.9359009452920893e-05,
      "loss": 0.0762,
      "step": 130
    },
    {
      "epoch": 0.3013993541442411,
      "grad_norm": 2.248966932296753,
      "learning_rate": 2.925745548342631e-05,
      "loss": 0.0835,
      "step": 140
    },
    {
      "epoch": 0.32292787944025836,
      "grad_norm": 0.9142462611198425,
      "learning_rate": 2.9148639704214645e-05,
      "loss": 0.074,
      "step": 150
    },
    {
      "epoch": 0.3444564047362756,
      "grad_norm": 2.3527843952178955,
      "learning_rate": 2.9032617538884018e-05,
      "loss": 0.0674,
      "step": 160
    },
    {
      "epoch": 0.36598493003229277,
      "grad_norm": 2.349313259124756,
      "learning_rate": 2.890944808149146e-05,
      "loss": 0.0934,
      "step": 170
    },
    {
      "epoch": 0.38751345532831,
      "grad_norm": 0.6645804643630981,
      "learning_rate": 2.877919406645433e-05,
      "loss": 0.0759,
      "step": 180
    },
    {
      "epoch": 0.40904198062432723,
      "grad_norm": 1.5764023065567017,
      "learning_rate": 2.864192183659747e-05,
      "loss": 0.0725,
      "step": 190
    },
    {
      "epoch": 0.43057050592034446,
      "grad_norm": 2.184178590774536,
      "learning_rate": 2.84977013093626e-05,
      "loss": 0.0542,
      "step": 200
    },
    {
      "epoch": 0.4520990312163617,
      "grad_norm": 1.8497698307037354,
      "learning_rate": 2.8346605941196927e-05,
      "loss": 0.0837,
      "step": 210
    },
    {
      "epoch": 0.4736275565123789,
      "grad_norm": 1.5373315811157227,
      "learning_rate": 2.818871269013928e-05,
      "loss": 0.0717,
      "step": 220
    },
    {
      "epoch": 0.4951560818083961,
      "grad_norm": 1.3783589601516724,
      "learning_rate": 2.8024101976622762e-05,
      "loss": 0.0577,
      "step": 230
    },
    {
      "epoch": 0.5166846071044133,
      "grad_norm": 4.914410591125488,
      "learning_rate": 2.7852857642513838e-05,
      "loss": 0.0705,
      "step": 240
    },
    {
      "epoch": 0.5382131324004306,
      "grad_norm": 0.8398504853248596,
      "learning_rate": 2.7675066908408852e-05,
      "loss": 0.0716,
      "step": 250
    },
    {
      "epoch": 0.5597416576964478,
      "grad_norm": 1.0903675556182861,
      "learning_rate": 2.7490820329209546e-05,
      "loss": 0.08,
      "step": 260
    },
    {
      "epoch": 0.581270182992465,
      "grad_norm": 1.7572460174560547,
      "learning_rate": 2.7300211748000386e-05,
      "loss": 0.0741,
      "step": 270
    },
    {
      "epoch": 0.6027987082884823,
      "grad_norm": 1.668867588043213,
      "learning_rate": 2.7103338248251055e-05,
      "loss": 0.0631,
      "step": 280
    },
    {
      "epoch": 0.6243272335844995,
      "grad_norm": 1.9639641046524048,
      "learning_rate": 2.6900300104368527e-05,
      "loss": 0.0802,
      "step": 290
    },
    {
      "epoch": 0.6458557588805167,
      "grad_norm": 1.3819113969802856,
      "learning_rate": 2.6691200730623874e-05,
      "loss": 0.0647,
      "step": 300
    },
    {
      "epoch": 0.667384284176534,
      "grad_norm": 1.6586377620697021,
      "learning_rate": 2.6476146628479847e-05,
      "loss": 0.0626,
      "step": 310
    },
    {
      "epoch": 0.6889128094725512,
      "grad_norm": 0.7640856504440308,
      "learning_rate": 2.6255247332346036e-05,
      "loss": 0.0717,
      "step": 320
    },
    {
      "epoch": 0.7104413347685683,
      "grad_norm": 0.8930771350860596,
      "learning_rate": 2.602861535378925e-05,
      "loss": 0.0617,
      "step": 330
    },
    {
      "epoch": 0.7319698600645855,
      "grad_norm": 0.9496339559555054,
      "learning_rate": 2.5796366124227532e-05,
      "loss": 0.0672,
      "step": 340
    },
    {
      "epoch": 0.7534983853606028,
      "grad_norm": 3.019853115081787,
      "learning_rate": 2.5558617936136984e-05,
      "loss": 0.0702,
      "step": 350
    },
    {
      "epoch": 0.77502691065662,
      "grad_norm": 0.9336963295936584,
      "learning_rate": 2.531549188280135e-05,
      "loss": 0.0697,
      "step": 360
    },
    {
      "epoch": 0.7965554359526372,
      "grad_norm": 0.7075727581977844,
      "learning_rate": 2.50671117966351e-05,
      "loss": 0.074,
      "step": 370
    },
    {
      "epoch": 0.8180839612486545,
      "grad_norm": 0.5153305530548096,
      "learning_rate": 2.481360418611132e-05,
      "loss": 0.0566,
      "step": 380
    },
    {
      "epoch": 0.8396124865446717,
      "grad_norm": 0.5062828660011292,
      "learning_rate": 2.4555098171326616e-05,
      "loss": 0.0792,
      "step": 390
    },
    {
      "epoch": 0.8611410118406889,
      "grad_norm": 1.255761742591858,
      "learning_rate": 2.4291725418235848e-05,
      "loss": 0.0445,
      "step": 400
    },
    {
      "epoch": 0.8826695371367062,
      "grad_norm": 0.9719372391700745,
      "learning_rate": 2.4023620071590147e-05,
      "loss": 0.0553,
      "step": 410
    },
    {
      "epoch": 0.9041980624327234,
      "grad_norm": 1.868668794631958,
      "learning_rate": 2.3750918686612414e-05,
      "loss": 0.0555,
      "step": 420
    },
    {
      "epoch": 0.9257265877287406,
      "grad_norm": 0.34430617094039917,
      "learning_rate": 2.3473760159445058e-05,
      "loss": 0.0611,
      "step": 430
    },
    {
      "epoch": 0.9472551130247578,
      "grad_norm": 1.189942717552185,
      "learning_rate": 2.3192285656405456e-05,
      "loss": 0.0571,
      "step": 440
    },
    {
      "epoch": 0.9687836383207751,
      "grad_norm": 0.5107014179229736,
      "learning_rate": 2.2906638542085117e-05,
      "loss": 0.0635,
      "step": 450
    },
    {
      "epoch": 0.9903121636167922,
      "grad_norm": 0.685809850692749,
      "learning_rate": 2.2616964306329183e-05,
      "loss": 0.0584,
      "step": 460
    },
    {
      "epoch": 1.0118406889128095,
      "grad_norm": 3.305742025375366,
      "learning_rate": 2.2323410490133485e-05,
      "loss": 0.0569,
      "step": 470
    },
    {
      "epoch": 1.0333692142088267,
      "grad_norm": 1.87465500831604,
      "learning_rate": 2.2026126610496852e-05,
      "loss": 0.0481,
      "step": 480
    },
    {
      "epoch": 1.054897739504844,
      "grad_norm": 0.7248936295509338,
      "learning_rate": 2.172526408426702e-05,
      "loss": 0.0295,
      "step": 490
    },
    {
      "epoch": 1.0764262648008611,
      "grad_norm": 0.670519232749939,
      "learning_rate": 2.1420976151018813e-05,
      "loss": 0.0385,
      "step": 500
    },
    {
      "epoch": 1.0979547900968785,
      "grad_norm": 1.4730095863342285,
      "learning_rate": 2.1113417795004016e-05,
      "loss": 0.063,
      "step": 510
    },
    {
      "epoch": 1.1194833153928956,
      "grad_norm": 1.3478758335113525,
      "learning_rate": 2.0802745666212592e-05,
      "loss": 0.0528,
      "step": 520
    },
    {
      "epoch": 1.141011840688913,
      "grad_norm": 0.6316215991973877,
      "learning_rate": 2.048911800058546e-05,
      "loss": 0.0347,
      "step": 530
    },
    {
      "epoch": 1.16254036598493,
      "grad_norm": 1.4956326484680176,
      "learning_rate": 2.0172694539419557e-05,
      "loss": 0.049,
      "step": 540
    },
    {
      "epoch": 1.1840688912809472,
      "grad_norm": 1.1988089084625244,
      "learning_rate": 1.9853636448006094e-05,
      "loss": 0.0471,
      "step": 550
    },
    {
      "epoch": 1.2055974165769645,
      "grad_norm": 1.2572044134140015,
      "learning_rate": 1.953210623354359e-05,
      "loss": 0.06,
      "step": 560
    },
    {
      "epoch": 1.2271259418729816,
      "grad_norm": 0.7759698033332825,
      "learning_rate": 1.9208267662367378e-05,
      "loss": 0.043,
      "step": 570
    },
    {
      "epoch": 1.248654467168999,
      "grad_norm": 1.9407209157943726,
      "learning_rate": 1.888228567653781e-05,
      "loss": 0.051,
      "step": 580
    },
    {
      "epoch": 1.270182992465016,
      "grad_norm": 1.0966278314590454,
      "learning_rate": 1.8554326309829654e-05,
      "loss": 0.0359,
      "step": 590
    },
    {
      "epoch": 1.2917115177610334,
      "grad_norm": 2.063629150390625,
      "learning_rate": 1.8224556603165363e-05,
      "loss": 0.0484,
      "step": 600
    },
    {
      "epoch": 1.3132400430570506,
      "grad_norm": 1.6178653240203857,
      "learning_rate": 1.7893144519535468e-05,
      "loss": 0.045,
      "step": 610
    },
    {
      "epoch": 1.334768568353068,
      "grad_norm": 0.26466497778892517,
      "learning_rate": 1.7560258858449248e-05,
      "loss": 0.0528,
      "step": 620
    },
    {
      "epoch": 1.356297093649085,
      "grad_norm": 1.890158772468567,
      "learning_rate": 1.7226069169959393e-05,
      "loss": 0.0527,
      "step": 630
    },
    {
      "epoch": 1.3778256189451024,
      "grad_norm": 1.3726129531860352,
      "learning_rate": 1.689074566830434e-05,
      "loss": 0.0389,
      "step": 640
    },
    {
      "epoch": 1.3993541442411195,
      "grad_norm": 1.0230239629745483,
      "learning_rate": 1.655445914521236e-05,
      "loss": 0.0506,
      "step": 650
    },
    {
      "epoch": 1.4208826695371366,
      "grad_norm": 0.8005169630050659,
      "learning_rate": 1.621738088291147e-05,
      "loss": 0.0455,
      "step": 660
    },
    {
      "epoch": 1.442411194833154,
      "grad_norm": 1.1895893812179565,
      "learning_rate": 1.587968256688955e-05,
      "loss": 0.039,
      "step": 670
    },
    {
      "epoch": 1.4639397201291713,
      "grad_norm": 1.9981929063796997,
      "learning_rate": 1.5541536198449044e-05,
      "loss": 0.0512,
      "step": 680
    },
    {
      "epoch": 1.4854682454251884,
      "grad_norm": 1.5658233165740967,
      "learning_rate": 1.5203114007100828e-05,
      "loss": 0.0263,
      "step": 690
    },
    {
      "epoch": 1.5069967707212055,
      "grad_norm": 2.838642120361328,
      "learning_rate": 1.4864588362841808e-05,
      "loss": 0.0481,
      "step": 700
    },
    {
      "epoch": 1.5285252960172229,
      "grad_norm": 0.6982723474502563,
      "learning_rate": 1.4526131688360996e-05,
      "loss": 0.0417,
      "step": 710
    },
    {
      "epoch": 1.55005382131324,
      "grad_norm": 1.7505388259887695,
      "learning_rate": 1.4187916371218739e-05,
      "loss": 0.0486,
      "step": 720
    },
    {
      "epoch": 1.571582346609257,
      "grad_norm": 2.41610050201416,
      "learning_rate": 1.3850114676043837e-05,
      "loss": 0.0249,
      "step": 730
    },
    {
      "epoch": 1.5931108719052745,
      "grad_norm": 1.3201218843460083,
      "learning_rate": 1.3512898656793283e-05,
      "loss": 0.042,
      "step": 740
    },
    {
      "epoch": 1.6146393972012918,
      "grad_norm": 0.9440786838531494,
      "learning_rate": 1.3176440069119275e-05,
      "loss": 0.0592,
      "step": 750
    },
    {
      "epoch": 1.636167922497309,
      "grad_norm": 0.5338843464851379,
      "learning_rate": 1.2840910282888211e-05,
      "loss": 0.0405,
      "step": 760
    },
    {
      "epoch": 1.657696447793326,
      "grad_norm": 1.0818413496017456,
      "learning_rate": 1.2506480194896155e-05,
      "loss": 0.0508,
      "step": 770
    },
    {
      "epoch": 1.6792249730893434,
      "grad_norm": 1.209283471107483,
      "learning_rate": 1.2173320141825232e-05,
      "loss": 0.0342,
      "step": 780
    },
    {
      "epoch": 1.7007534983853607,
      "grad_norm": 2.5324923992156982,
      "learning_rate": 1.1841599813485341e-05,
      "loss": 0.046,
      "step": 790
    },
    {
      "epoch": 1.7222820236813778,
      "grad_norm": 1.514676809310913,
      "learning_rate": 1.1511488166385349e-05,
      "loss": 0.0348,
      "step": 800
    },
    {
      "epoch": 1.743810548977395,
      "grad_norm": 1.4090155363082886,
      "learning_rate": 1.1183153337677734e-05,
      "loss": 0.0455,
      "step": 810
    },
    {
      "epoch": 1.7653390742734123,
      "grad_norm": 2.2600796222686768,
      "learning_rate": 1.0856762559520605e-05,
      "loss": 0.0542,
      "step": 820
    },
    {
      "epoch": 1.7868675995694296,
      "grad_norm": 1.2120071649551392,
      "learning_rate": 1.0532482073900628e-05,
      "loss": 0.0323,
      "step": 830
    },
    {
      "epoch": 1.8083961248654468,
      "grad_norm": 1.3877032995224,
      "learning_rate": 1.0210477047960303e-05,
      "loss": 0.0456,
      "step": 840
    },
    {
      "epoch": 1.8299246501614639,
      "grad_norm": 0.9278028607368469,
      "learning_rate": 9.89091148987269e-06,
      "loss": 0.037,
      "step": 850
    },
    {
      "epoch": 1.8514531754574812,
      "grad_norm": 2.1230030059814453,
      "learning_rate": 9.573948165306438e-06,
      "loss": 0.0452,
      "step": 860
    },
    {
      "epoch": 1.8729817007534983,
      "grad_norm": 0.6858197450637817,
      "learning_rate": 9.259748514523654e-06,
      "loss": 0.0536,
      "step": 870
    },
    {
      "epoch": 1.8945102260495155,
      "grad_norm": 1.1023917198181152,
      "learning_rate": 8.948472570152874e-06,
      "loss": 0.0553,
      "step": 880
    },
    {
      "epoch": 1.9160387513455328,
      "grad_norm": 0.5614004731178284,
      "learning_rate": 8.64027887567895e-06,
      "loss": 0.0479,
      "step": 890
    },
    {
      "epoch": 1.9375672766415502,
      "grad_norm": 1.0492910146713257,
      "learning_rate": 8.33532440469145e-06,
      "loss": 0.0438,
      "step": 900
    },
    {
      "epoch": 1.9590958019375673,
      "grad_norm": 0.30423790216445923,
      "learning_rate": 8.033764480932616e-06,
      "loss": 0.028,
      "step": 910
    },
    {
      "epoch": 1.9806243272335844,
      "grad_norm": 1.6426568031311035,
      "learning_rate": 7.735752699185711e-06,
      "loss": 0.0574,
      "step": 920
    },
    {
      "epoch": 2.0021528525296017,
      "grad_norm": 1.1288621425628662,
      "learning_rate": 7.441440847043883e-06,
      "loss": 0.0255,
      "step": 930
    },
    {
      "epoch": 2.023681377825619,
      "grad_norm": 0.26666760444641113,
      "learning_rate": 7.150978827599619e-06,
      "loss": 0.028,
      "step": 940
    },
    {
      "epoch": 2.045209903121636,
      "grad_norm": 0.33629775047302246,
      "learning_rate": 6.864514583093911e-06,
      "loss": 0.0178,
      "step": 950
    },
    {
      "epoch": 2.0667384284176533,
      "grad_norm": 0.4371579885482788,
      "learning_rate": 6.582194019564266e-06,
      "loss": 0.0197,
      "step": 960
    },
    {
      "epoch": 2.0882669537136707,
      "grad_norm": 1.305396318435669,
      "learning_rate": 6.304160932529721e-06,
      "loss": 0.03,
      "step": 970
    },
    {
      "epoch": 2.109795479009688,
      "grad_norm": 6.668363571166992,
      "learning_rate": 6.0305569337509225e-06,
      "loss": 0.0309,
      "step": 980
    },
    {
      "epoch": 2.131324004305705,
      "grad_norm": 1.8910939693450928,
      "learning_rate": 5.761521379102343e-06,
      "loss": 0.0262,
      "step": 990
    },
    {
      "epoch": 2.1528525296017222,
      "grad_norm": 1.481408953666687,
      "learning_rate": 5.497191297593647e-06,
      "loss": 0.0337,
      "step": 1000
    },
    {
      "epoch": 2.1743810548977396,
      "grad_norm": 1.0818077325820923,
      "learning_rate": 5.237701321576063e-06,
      "loss": 0.0365,
      "step": 1010
    },
    {
      "epoch": 2.195909580193757,
      "grad_norm": 1.0381739139556885,
      "learning_rate": 4.98318361816957e-06,
      "loss": 0.0228,
      "step": 1020
    },
    {
      "epoch": 2.217438105489774,
      "grad_norm": 0.31783393025398254,
      "learning_rate": 4.733767821945621e-06,
      "loss": 0.0278,
      "step": 1030
    },
    {
      "epoch": 2.238966630785791,
      "grad_norm": 2.5186619758605957,
      "learning_rate": 4.4895809688998655e-06,
      "loss": 0.0302,
      "step": 1040
    },
    {
      "epoch": 2.2604951560818085,
      "grad_norm": 0.6198469400405884,
      "learning_rate": 4.25074743174833e-06,
      "loss": 0.0138,
      "step": 1050
    },
    {
      "epoch": 2.282023681377826,
      "grad_norm": 0.8775982856750488,
      "learning_rate": 4.017388856580178e-06,
      "loss": 0.0218,
      "step": 1060
    },
    {
      "epoch": 2.3035522066738428,
      "grad_norm": 0.4356814920902252,
      "learning_rate": 3.7896241008991596e-06,
      "loss": 0.0284,
      "step": 1070
    },
    {
      "epoch": 2.32508073196986,
      "grad_norm": 1.0270265340805054,
      "learning_rate": 3.567569173085455e-06,
      "loss": 0.0169,
      "step": 1080
    },
    {
      "epoch": 2.3466092572658774,
      "grad_norm": 1.2356810569763184,
      "learning_rate": 3.351337173308607e-06,
      "loss": 0.0145,
      "step": 1090
    },
    {
      "epoch": 2.3681377825618943,
      "grad_norm": 0.17152564227581024,
      "learning_rate": 3.1410382359217645e-06,
      "loss": 0.0249,
      "step": 1100
    },
    {
      "epoch": 2.3896663078579117,
      "grad_norm": 0.13272231817245483,
      "learning_rate": 2.9367794733664637e-06,
      "loss": 0.0296,
      "step": 1110
    },
    {
      "epoch": 2.411194833153929,
      "grad_norm": 2.4926042556762695,
      "learning_rate": 2.7386649216166233e-06,
      "loss": 0.031,
      "step": 1120
    },
    {
      "epoch": 2.4327233584499464,
      "grad_norm": 0.5246890783309937,
      "learning_rate": 2.546795487189436e-06,
      "loss": 0.0294,
      "step": 1130
    },
    {
      "epoch": 2.4542518837459633,
      "grad_norm": 1.739809513092041,
      "learning_rate": 2.361268895750264e-06,
      "loss": 0.0352,
      "step": 1140
    },
    {
      "epoch": 2.4757804090419806,
      "grad_norm": 0.07230955362319946,
      "learning_rate": 2.1821796423375766e-06,
      "loss": 0.0177,
      "step": 1150
    },
    {
      "epoch": 2.497308934337998,
      "grad_norm": 1.795920491218567,
      "learning_rate": 2.0096189432334194e-06,
      "loss": 0.032,
      "step": 1160
    },
    {
      "epoch": 2.518837459634015,
      "grad_norm": 0.4120383560657501,
      "learning_rate": 1.843674689503846e-06,
      "loss": 0.0244,
      "step": 1170
    },
    {
      "epoch": 2.540365984930032,
      "grad_norm": 1.3315762281417847,
      "learning_rate": 1.6844314022329676e-06,
      "loss": 0.0126,
      "step": 1180
    },
    {
      "epoch": 2.5618945102260495,
      "grad_norm": 0.9914199709892273,
      "learning_rate": 1.5319701894735023e-06,
      "loss": 0.022,
      "step": 1190
    },
    {
      "epoch": 2.583423035522067,
      "grad_norm": 0.9357948303222656,
      "learning_rate": 1.3863687049356465e-06,
      "loss": 0.0181,
      "step": 1200
    },
    {
      "epoch": 2.604951560818084,
      "grad_norm": 2.104593515396118,
      "learning_rate": 1.247701108435394e-06,
      "loss": 0.0241,
      "step": 1210
    },
    {
      "epoch": 2.626480086114101,
      "grad_norm": 1.0621205568313599,
      "learning_rate": 1.116038028122413e-06,
      "loss": 0.0292,
      "step": 1220
    },
    {
      "epoch": 2.6480086114101185,
      "grad_norm": 1.7859629392623901,
      "learning_rate": 9.914465245067022e-07,
      "loss": 0.0201,
      "step": 1230
    },
    {
      "epoch": 2.669537136706136,
      "grad_norm": 1.8932825326919556,
      "learning_rate": 8.7399005630238e-07,
      "loss": 0.0313,
      "step": 1240
    },
    {
      "epoch": 2.6910656620021527,
      "grad_norm": 1.2083765268325806,
      "learning_rate": 7.637284481059998e-07,
      "loss": 0.0311,
      "step": 1250
    },
    {
      "epoch": 2.71259418729817,
      "grad_norm": 0.1731128990650177,
      "learning_rate": 6.607178599258268e-07,
      "loss": 0.0134,
      "step": 1260
    },
    {
      "epoch": 2.7341227125941874,
      "grad_norm": 1.8263607025146484,
      "learning_rate": 5.650107585776348e-07,
      "loss": 0.0348,
      "step": 1270
    },
    {
      "epoch": 2.7556512378902047,
      "grad_norm": 1.52913498878479,
      "learning_rate": 4.766558909615504e-07,
      "loss": 0.0238,
      "step": 1280
    },
    {
      "epoch": 2.7771797631862216,
      "grad_norm": 1.0334974527359009,
      "learning_rate": 3.9569825923360503e-07,
      "loss": 0.0285,
      "step": 1290
    },
    {
      "epoch": 2.798708288482239,
      "grad_norm": 0.5131074786186218,
      "learning_rate": 3.22179097884579e-07,
      "loss": 0.0284,
      "step": 1300
    },
    {
      "epoch": 2.8202368137782563,
      "grad_norm": 0.869399905204773,
      "learning_rate": 2.5613585273788264e-07,
      "loss": 0.0312,
      "step": 1310
    },
    {
      "epoch": 2.841765339074273,
      "grad_norm": 1.1290533542633057,
      "learning_rate": 1.9760216187710788e-07,
      "loss": 0.0259,
      "step": 1320
    },
    {
      "epoch": 2.8632938643702905,
      "grad_norm": 0.23688088357448578,
      "learning_rate": 1.4660783851300318e-07,
      "loss": 0.0263,
      "step": 1330
    },
    {
      "epoch": 2.884822389666308,
      "grad_norm": 1.1585010290145874,
      "learning_rate": 1.0317885579858522e-07,
      "loss": 0.0175,
      "step": 1340
    },
    {
      "epoch": 2.9063509149623252,
      "grad_norm": 0.5305848717689514,
      "learning_rate": 6.733733360012761e-08,
      "loss": 0.0379,
      "step": 1350
    },
    {
      "epoch": 2.9278794402583426,
      "grad_norm": 1.1688823699951172,
      "learning_rate": 3.910152723075322e-08,
      "loss": 0.0401,
      "step": 1360
    },
    {
      "epoch": 2.9494079655543595,
      "grad_norm": 1.1842941045761108,
      "learning_rate": 1.848581815237671e-08,
      "loss": 0.0174,
      "step": 1370
    },
    {
      "epoch": 2.970936490850377,
      "grad_norm": 0.9176095724105835,
      "learning_rate": 5.50070665074065e-09,
      "loss": 0.0218,
      "step": 1380
    },
    {
      "epoch": 2.9924650161463937,
      "grad_norm": 2.5070579051971436,
      "learning_rate": 1.5280648725357615e-10,
      "loss": 0.0288,
      "step": 1390
    },
    {
      "epoch": 2.996770721205597,
      "step": 1392,
      "total_flos": 2.2176668825577062e+17,
      "train_loss": 0.04948104658677917,
      "train_runtime": 1703.925,
      "train_samples_per_second": 6.543,
      "train_steps_per_second": 0.817
    }
  ],
  "logging_steps": 10,
  "max_steps": 1392,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.2176668825577062e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}