{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3876, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015479876160990712, "grad_norm": 1.2312519550323486, "learning_rate": 1.9587628865979382e-05, "loss": 35.56, "step": 20 }, { "epoch": 0.030959752321981424, "grad_norm": 3.618231773376465, "learning_rate": 4.020618556701031e-05, "loss": 35.4506, "step": 40 }, { "epoch": 0.04643962848297214, "grad_norm": 7.444742679595947, "learning_rate": 6.0824742268041234e-05, "loss": 33.9665, "step": 60 }, { "epoch": 0.06191950464396285, "grad_norm": 10.143885612487793, "learning_rate": 8.144329896907217e-05, "loss": 30.007, "step": 80 }, { "epoch": 0.07739938080495357, "grad_norm": 8.995733261108398, "learning_rate": 0.00010206185567010309, "loss": 22.9033, "step": 100 }, { "epoch": 0.09287925696594428, "grad_norm": 6.135839939117432, "learning_rate": 0.00012268041237113402, "loss": 14.5223, "step": 120 }, { "epoch": 0.10835913312693499, "grad_norm": 1.3609894514083862, "learning_rate": 0.00014329896907216494, "loss": 8.502, "step": 140 }, { "epoch": 0.1238390092879257, "grad_norm": 0.6270231604576111, "learning_rate": 0.0001639175257731959, "loss": 6.1607, "step": 160 }, { "epoch": 0.1393188854489164, "grad_norm": 1.1116787195205688, "learning_rate": 0.0001845360824742268, "loss": 5.2469, "step": 180 }, { "epoch": 0.15479876160990713, "grad_norm": 19.375608444213867, "learning_rate": 0.00019999908999970863, "loss": 4.6304, "step": 200 }, { "epoch": 0.17027863777089783, "grad_norm": 2.622767925262451, "learning_rate": 0.0001999772508208056, "loss": 4.0212, "step": 220 }, { "epoch": 0.18575851393188855, "grad_norm": 1.9569060802459717, "learning_rate": 0.00019992629891946655, "loss": 3.5883, "step": 240 }, { "epoch": 0.20123839009287925, "grad_norm": 2.5778486728668213, "learning_rate": 0.00019984624913255234, "loss": 3.2739, "step": 260 }, { "epoch": 0.21671826625386997, "grad_norm": 3.097757339477539, "learning_rate": 0.00019973712477003812, "loss": 3.2694, "step": 280 }, { "epoch": 0.23219814241486067, "grad_norm": 4.823343753814697, "learning_rate": 0.00019959895760822546, "loss": 2.9338, "step": 300 }, { "epoch": 0.2476780185758514, "grad_norm": 4.312209606170654, "learning_rate": 0.00019943178788048947, "loss": 2.8247, "step": 320 }, { "epoch": 0.2631578947368421, "grad_norm": 4.690278053283691, "learning_rate": 0.00019923566426556296, "loss": 2.7036, "step": 340 }, { "epoch": 0.2786377708978328, "grad_norm": 3.4156758785247803, "learning_rate": 0.00019901064387336164, "loss": 2.6405, "step": 360 }, { "epoch": 0.29411764705882354, "grad_norm": 15.009350776672363, "learning_rate": 0.00019875679222835398, "loss": 2.7523, "step": 380 }, { "epoch": 0.30959752321981426, "grad_norm": 3.018602132797241, "learning_rate": 0.000198474183250481, "loss": 2.5967, "step": 400 }, { "epoch": 0.32507739938080493, "grad_norm": 61.5975227355957, "learning_rate": 0.00019816289923363115, "loss": 2.4831, "step": 420 }, { "epoch": 0.34055727554179566, "grad_norm": 5.313195705413818, "learning_rate": 0.00019782303082167704, "loss": 2.5759, "step": 440 }, { "epoch": 0.3560371517027864, "grad_norm": 7.81691312789917, "learning_rate": 0.0001974546769820803, "loss": 2.4488, "step": 460 }, { "epoch": 0.3715170278637771, "grad_norm": 2.6861789226531982, "learning_rate": 0.00019705794497707312, "loss": 2.5337, "step": 480 }, { "epoch": 0.38699690402476783, "grad_norm": 2.6320855617523193, "learning_rate": 0.00019663295033242416, "loss": 2.4594, "step": 500 }, { "epoch": 0.4024767801857585, "grad_norm": 4.0807294845581055, "learning_rate": 0.00019617981680379804, "loss": 2.5519, "step": 520 }, { "epoch": 0.4179566563467492, "grad_norm": 1.9054477214813232, "learning_rate": 0.00019569867634071866, "loss": 2.5392, "step": 540 }, { "epoch": 0.43343653250773995, "grad_norm": 3.3869524002075195, "learning_rate": 0.00019518966904814625, "loss": 2.4402, "step": 560 }, { "epoch": 0.44891640866873067, "grad_norm": 3.94587779045105, "learning_rate": 0.00019465294314567987, "loss": 2.3743, "step": 580 }, { "epoch": 0.46439628482972134, "grad_norm": 1.9025462865829468, "learning_rate": 0.00019408865492439667, "loss": 2.4514, "step": 600 }, { "epoch": 0.47987616099071206, "grad_norm": 2.714090585708618, "learning_rate": 0.00019349696870134104, "loss": 2.4145, "step": 620 }, { "epoch": 0.4953560371517028, "grad_norm": 1.989125370979309, "learning_rate": 0.0001928780567716765, "loss": 2.4118, "step": 640 }, { "epoch": 0.5108359133126935, "grad_norm": 1.9810141324996948, "learning_rate": 0.00019223209935851455, "loss": 2.3477, "step": 660 }, { "epoch": 0.5263157894736842, "grad_norm": 2.902862310409546, "learning_rate": 0.0001915592845604348, "loss": 2.3803, "step": 680 }, { "epoch": 0.541795665634675, "grad_norm": 4.101681232452393, "learning_rate": 0.00019085980829671202, "loss": 2.2595, "step": 700 }, { "epoch": 0.5572755417956656, "grad_norm": 3.3641412258148193, "learning_rate": 0.0001901338742502655, "loss": 2.1746, "step": 720 }, { "epoch": 0.5727554179566563, "grad_norm": 4.526272296905518, "learning_rate": 0.0001893816938083481, "loss": 2.2116, "step": 740 }, { "epoch": 0.5882352941176471, "grad_norm": 2.6758034229278564, "learning_rate": 0.00018860348600099167, "loss": 2.2205, "step": 760 }, { "epoch": 0.6037151702786377, "grad_norm": 2.3024699687957764, "learning_rate": 0.00018779947743722685, "loss": 2.2901, "step": 780 }, { "epoch": 0.6191950464396285, "grad_norm": 3.218477487564087, "learning_rate": 0.00018696990223909595, "loss": 2.3473, "step": 800 }, { "epoch": 0.6346749226006192, "grad_norm": 2.0516302585601807, "learning_rate": 0.00018611500197347836, "loss": 2.3375, "step": 820 }, { "epoch": 0.6501547987616099, "grad_norm": 2.359009265899658, "learning_rate": 0.0001852350255817476, "loss": 2.2232, "step": 840 }, { "epoch": 0.6656346749226006, "grad_norm": 2.1015849113464355, "learning_rate": 0.00018433022930728133, "loss": 2.0797, "step": 860 }, { "epoch": 0.6811145510835913, "grad_norm": 2.1213488578796387, "learning_rate": 0.000183400876620845, "loss": 2.0405, "step": 880 }, { "epoch": 0.6965944272445821, "grad_norm": 3.6181552410125732, "learning_rate": 0.00018244723814387083, "loss": 2.283, "step": 900 }, { "epoch": 0.7120743034055728, "grad_norm": 7.133749961853027, "learning_rate": 0.0001814695915696546, "loss": 2.2591, "step": 920 }, { "epoch": 0.7275541795665634, "grad_norm": 5.1819987297058105, "learning_rate": 0.00018046822158249325, "loss": 2.2727, "step": 940 }, { "epoch": 0.7430340557275542, "grad_norm": 1.5060796737670898, "learning_rate": 0.00017944341977478654, "loss": 2.1029, "step": 960 }, { "epoch": 0.7585139318885449, "grad_norm": 1.9794375896453857, "learning_rate": 0.00017839548456212735, "loss": 2.126, "step": 980 }, { "epoch": 0.7739938080495357, "grad_norm": 5.905936241149902, "learning_rate": 0.00017732472109640503, "loss": 2.0345, "step": 1000 }, { "epoch": 0.7894736842105263, "grad_norm": 1.8466005325317383, "learning_rate": 0.00017623144117694708, "loss": 2.0384, "step": 1020 }, { "epoch": 0.804953560371517, "grad_norm": 1.9260845184326172, "learning_rate": 0.00017511596315972525, "loss": 2.0789, "step": 1040 }, { "epoch": 0.8204334365325078, "grad_norm": 39.37140655517578, "learning_rate": 0.00017397861186465243, "loss": 2.1383, "step": 1060 }, { "epoch": 0.8359133126934984, "grad_norm": 2.01326322555542, "learning_rate": 0.00017281971848099708, "loss": 2.0823, "step": 1080 }, { "epoch": 0.8513931888544891, "grad_norm": 6.53413724899292, "learning_rate": 0.00017163962047094328, "loss": 2.0473, "step": 1100 }, { "epoch": 0.8668730650154799, "grad_norm": 2.0091307163238525, "learning_rate": 0.0001704386614713236, "loss": 2.0851, "step": 1120 }, { "epoch": 0.8823529411764706, "grad_norm": 2.5082335472106934, "learning_rate": 0.00016921719119355468, "loss": 1.9664, "step": 1140 }, { "epoch": 0.8978328173374613, "grad_norm": 2.0674970149993896, "learning_rate": 0.0001679755653218034, "loss": 2.053, "step": 1160 }, { "epoch": 0.913312693498452, "grad_norm": 1.7843290567398071, "learning_rate": 0.0001667141454094139, "loss": 2.0598, "step": 1180 }, { "epoch": 0.9287925696594427, "grad_norm": 3.198160171508789, "learning_rate": 0.00016543329877362567, "loss": 2.0459, "step": 1200 }, { "epoch": 0.9442724458204335, "grad_norm": 2.996542453765869, "learning_rate": 0.0001641333983886132, "loss": 1.9699, "step": 1220 }, { "epoch": 0.9597523219814241, "grad_norm": 1.5705294609069824, "learning_rate": 0.00016281482277687826, "loss": 1.9142, "step": 1240 }, { "epoch": 0.9752321981424149, "grad_norm": 2.064488172531128, "learning_rate": 0.00016147795589902675, "loss": 2.0485, "step": 1260 }, { "epoch": 0.9907120743034056, "grad_norm": 3.425995111465454, "learning_rate": 0.00016012318704196164, "loss": 2.0223, "step": 1280 }, { "epoch": 1.0061919504643964, "grad_norm": 3.16005539894104, "learning_rate": 0.0001587509107055255, "loss": 2.1468, "step": 1300 }, { "epoch": 1.021671826625387, "grad_norm": 2.574629545211792, "learning_rate": 0.00015736152648762434, "loss": 1.9561, "step": 1320 }, { "epoch": 1.0371517027863777, "grad_norm": 2.4031307697296143, "learning_rate": 0.00015595543896786777, "loss": 2.0086, "step": 1340 }, { "epoch": 1.0526315789473684, "grad_norm": 2.7825675010681152, "learning_rate": 0.00015453305758975758, "loss": 1.9428, "step": 1360 }, { "epoch": 1.068111455108359, "grad_norm": 2.7439022064208984, "learning_rate": 0.0001530947965414608, "loss": 1.9599, "step": 1380 }, { "epoch": 1.08359133126935, "grad_norm": 3.3258135318756104, "learning_rate": 0.0001516410746352006, "loss": 1.9358, "step": 1400 }, { "epoch": 1.0990712074303406, "grad_norm": 2.5647904872894287, "learning_rate": 0.00015017231518530118, "loss": 1.9641, "step": 1420 }, { "epoch": 1.1145510835913313, "grad_norm": 2.749521017074585, "learning_rate": 0.00014868894588492104, "loss": 2.0731, "step": 1440 }, { "epoch": 1.130030959752322, "grad_norm": 4.515990257263184, "learning_rate": 0.00014719139868151184, "loss": 2.0112, "step": 1460 }, { "epoch": 1.1455108359133126, "grad_norm": 2.183593273162842, "learning_rate": 0.00014568010965103795, "loss": 1.8446, "step": 1480 }, { "epoch": 1.1609907120743035, "grad_norm": 6.230432033538818, "learning_rate": 0.00014415551887099405, "loss": 1.8628, "step": 1500 }, { "epoch": 1.1764705882352942, "grad_norm": 2.1607792377471924, "learning_rate": 0.0001426180702922574, "loss": 1.9779, "step": 1520 }, { "epoch": 1.1919504643962848, "grad_norm": 2.69480299949646, "learning_rate": 0.00014106821160981222, "loss": 2.0428, "step": 1540 }, { "epoch": 1.2074303405572755, "grad_norm": 2.5969278812408447, "learning_rate": 0.00013950639413238394, "loss": 1.9777, "step": 1560 }, { "epoch": 1.2229102167182662, "grad_norm": 2.709635019302368, "learning_rate": 0.00013793307265102096, "loss": 1.8938, "step": 1580 }, { "epoch": 1.238390092879257, "grad_norm": 2.4569203853607178, "learning_rate": 0.00013634870530666247, "loss": 1.7336, "step": 1600 }, { "epoch": 1.2538699690402477, "grad_norm": 2.850924253463745, "learning_rate": 0.00013475375345673083, "loss": 1.8305, "step": 1620 }, { "epoch": 1.2693498452012384, "grad_norm": 3.0461461544036865, "learning_rate": 0.00013314868154078725, "loss": 1.8139, "step": 1640 }, { "epoch": 1.284829721362229, "grad_norm": 2.8259775638580322, "learning_rate": 0.00013153395694529016, "loss": 1.8382, "step": 1660 }, { "epoch": 1.3003095975232197, "grad_norm": 2.21964693069458, "learning_rate": 0.00012991004986749515, "loss": 1.7279, "step": 1680 }, { "epoch": 1.3157894736842106, "grad_norm": 3.952878713607788, "learning_rate": 0.00012827743317853665, "loss": 1.8069, "step": 1700 }, { "epoch": 1.3312693498452013, "grad_norm": 2.4886014461517334, "learning_rate": 0.00012663658228573112, "loss": 1.8024, "step": 1720 }, { "epoch": 1.346749226006192, "grad_norm": 2.1773338317871094, "learning_rate": 0.0001249879749941412, "loss": 1.8586, "step": 1740 }, { "epoch": 1.3622291021671826, "grad_norm": 2.7643444538116455, "learning_rate": 0.00012333209136744237, "loss": 1.7066, "step": 1760 }, { "epoch": 1.3777089783281733, "grad_norm": 2.801255702972412, "learning_rate": 0.00012166941358813125, "loss": 1.8759, "step": 1780 }, { "epoch": 1.3931888544891642, "grad_norm": 3.2398030757904053, "learning_rate": 0.00012000042581711737, "loss": 1.8216, "step": 1800 }, { "epoch": 1.4086687306501549, "grad_norm": 2.008368492126465, "learning_rate": 0.00011832561405273867, "loss": 1.634, "step": 1820 }, { "epoch": 1.4241486068111455, "grad_norm": 2.6406748294830322, "learning_rate": 0.00011664546598924184, "loss": 1.8042, "step": 1840 }, { "epoch": 1.4396284829721362, "grad_norm": 1.6120656728744507, "learning_rate": 0.00011496047087476906, "loss": 1.7964, "step": 1860 }, { "epoch": 1.4551083591331269, "grad_norm": 3.240208864212036, "learning_rate": 0.00011327111936889212, "loss": 1.7457, "step": 1880 }, { "epoch": 1.4705882352941178, "grad_norm": 2.6874380111694336, "learning_rate": 0.00011157790339973546, "loss": 1.7428, "step": 1900 }, { "epoch": 1.4860681114551084, "grad_norm": 3.631455421447754, "learning_rate": 0.00010988131602073008, "loss": 1.8673, "step": 1920 }, { "epoch": 1.501547987616099, "grad_norm": 5.774441242218018, "learning_rate": 0.00010818185126703943, "loss": 1.7561, "step": 1940 }, { "epoch": 1.5170278637770898, "grad_norm": 2.388491153717041, "learning_rate": 0.0001064800040116997, "loss": 1.7853, "step": 1960 }, { "epoch": 1.5325077399380804, "grad_norm": 3.2117481231689453, "learning_rate": 0.00010477626982151603, "loss": 1.8202, "step": 1980 }, { "epoch": 1.5479876160990713, "grad_norm": 4.212379455566406, "learning_rate": 0.0001030711448127566, "loss": 1.7177, "step": 2000 }, { "epoch": 1.5634674922600618, "grad_norm": 1.7437323331832886, "learning_rate": 0.00010136512550668693, "loss": 1.5671, "step": 2020 }, { "epoch": 1.5789473684210527, "grad_norm": 5.586548805236816, "learning_rate": 9.965870868498605e-05, "loss": 1.7472, "step": 2040 }, { "epoch": 1.5944272445820433, "grad_norm": 4.029956340789795, "learning_rate": 9.795239124508695e-05, "loss": 1.8359, "step": 2060 }, { "epoch": 1.609907120743034, "grad_norm": 3.604818105697632, "learning_rate": 9.62466700554833e-05, "loss": 1.7598, "step": 2080 }, { "epoch": 1.6253869969040249, "grad_norm": 2.304302215576172, "learning_rate": 9.454204181104455e-05, "loss": 1.747, "step": 2100 }, { "epoch": 1.6408668730650153, "grad_norm": 3.2332301139831543, "learning_rate": 9.28390028883817e-05, "loss": 1.7388, "step": 2120 }, { "epoch": 1.6563467492260062, "grad_norm": 4.4512763023376465, "learning_rate": 9.113804920130558e-05, "loss": 1.6419, "step": 2140 }, { "epoch": 1.671826625386997, "grad_norm": 4.963044166564941, "learning_rate": 8.943967605642006e-05, "loss": 1.6978, "step": 2160 }, { "epoch": 1.6873065015479876, "grad_norm": 24.24378776550293, "learning_rate": 8.774437800889198e-05, "loss": 1.7875, "step": 2180 }, { "epoch": 1.7027863777089784, "grad_norm": 2.131967306137085, "learning_rate": 8.605264871843994e-05, "loss": 1.8406, "step": 2200 }, { "epoch": 1.718266253869969, "grad_norm": 2.9940407276153564, "learning_rate": 8.436498080558373e-05, "loss": 1.7843, "step": 2220 }, { "epoch": 1.7337461300309598, "grad_norm": 2.6848549842834473, "learning_rate": 8.268186570819657e-05, "loss": 1.7385, "step": 2240 }, { "epoch": 1.7492260061919505, "grad_norm": 2.7113239765167236, "learning_rate": 8.10037935384015e-05, "loss": 1.6555, "step": 2260 }, { "epoch": 1.7647058823529411, "grad_norm": 2.9977896213531494, "learning_rate": 7.933125293985404e-05, "loss": 1.7137, "step": 2280 }, { "epoch": 1.780185758513932, "grad_norm": 2.588730573654175, "learning_rate": 7.766473094545223e-05, "loss": 1.6741, "step": 2300 }, { "epoch": 1.7956656346749225, "grad_norm": 2.706368923187256, "learning_rate": 7.600471283551596e-05, "loss": 1.6268, "step": 2320 }, { "epoch": 1.8111455108359134, "grad_norm": 12.232383728027344, "learning_rate": 7.435168199647638e-05, "loss": 1.6722, "step": 2340 }, { "epoch": 1.826625386996904, "grad_norm": 2.679602861404419, "learning_rate": 7.270611978011702e-05, "loss": 1.6203, "step": 2360 }, { "epoch": 1.8421052631578947, "grad_norm": 2.306474208831787, "learning_rate": 7.10685053634073e-05, "loss": 1.6872, "step": 2380 }, { "epoch": 1.8575851393188856, "grad_norm": 1.9884538650512695, "learning_rate": 6.943931560896921e-05, "loss": 1.6097, "step": 2400 }, { "epoch": 1.873065015479876, "grad_norm": 2.9608068466186523, "learning_rate": 6.781902492621822e-05, "loss": 1.6861, "step": 2420 }, { "epoch": 1.888544891640867, "grad_norm": 2.4801182746887207, "learning_rate": 6.620810513321816e-05, "loss": 1.5144, "step": 2440 }, { "epoch": 1.9040247678018576, "grad_norm": 2.943125009536743, "learning_rate": 6.460702531929099e-05, "loss": 1.5874, "step": 2460 }, { "epoch": 1.9195046439628483, "grad_norm": 2.4978630542755127, "learning_rate": 6.30162517084211e-05, "loss": 1.6379, "step": 2480 }, { "epoch": 1.9349845201238391, "grad_norm": 2.3578176498413086, "learning_rate": 6.143624752349373e-05, "loss": 1.5288, "step": 2500 }, { "epoch": 1.9504643962848296, "grad_norm": 2.534557819366455, "learning_rate": 5.986747285140779e-05, "loss": 1.5616, "step": 2520 }, { "epoch": 1.9659442724458205, "grad_norm": 2.966055154800415, "learning_rate": 5.83103845091013e-05, "loss": 1.5537, "step": 2540 }, { "epoch": 1.9814241486068112, "grad_norm": 2.5695040225982666, "learning_rate": 5.676543591052934e-05, "loss": 1.6248, "step": 2560 }, { "epoch": 1.9969040247678018, "grad_norm": 2.0038466453552246, "learning_rate": 5.523307693463303e-05, "loss": 1.5773, "step": 2580 }, { "epoch": 2.0123839009287927, "grad_norm": 2.372335195541382, "learning_rate": 5.3713753794337454e-05, "loss": 1.4961, "step": 2600 }, { "epoch": 2.027863777089783, "grad_norm": 2.7436954975128174, "learning_rate": 5.2207908906617596e-05, "loss": 1.5137, "step": 2620 }, { "epoch": 2.043343653250774, "grad_norm": 1.831878423690796, "learning_rate": 5.0715980763669346e-05, "loss": 1.5333, "step": 2640 }, { "epoch": 2.0588235294117645, "grad_norm": 2.895763635635376, "learning_rate": 4.923840380522341e-05, "loss": 1.59, "step": 2660 }, { "epoch": 2.0743034055727554, "grad_norm": 2.4093143939971924, "learning_rate": 4.777560829203918e-05, "loss": 1.4739, "step": 2680 }, { "epoch": 2.0897832817337463, "grad_norm": 2.4006857872009277, "learning_rate": 4.632802018061588e-05, "loss": 1.4223, "step": 2700 }, { "epoch": 2.1052631578947367, "grad_norm": 2.8316566944122314, "learning_rate": 4.4896060999156584e-05, "loss": 1.5238, "step": 2720 }, { "epoch": 2.1207430340557276, "grad_norm": 2.2058160305023193, "learning_rate": 4.348014772482212e-05, "loss": 1.5024, "step": 2740 }, { "epoch": 2.136222910216718, "grad_norm": 3.6439034938812256, "learning_rate": 4.208069266230983e-05, "loss": 1.6302, "step": 2760 }, { "epoch": 2.151702786377709, "grad_norm": 2.221247911453247, "learning_rate": 4.069810332379343e-05, "loss": 1.4838, "step": 2780 }, { "epoch": 2.1671826625387, "grad_norm": 2.868905544281006, "learning_rate": 3.933278231025784e-05, "loss": 1.3948, "step": 2800 }, { "epoch": 2.1826625386996903, "grad_norm": 3.4799671173095703, "learning_rate": 3.7985127194264645e-05, "loss": 1.4766, "step": 2820 }, { "epoch": 2.198142414860681, "grad_norm": 2.0027518272399902, "learning_rate": 3.665553040418132e-05, "loss": 1.477, "step": 2840 }, { "epoch": 2.2136222910216716, "grad_norm": 2.957260847091675, "learning_rate": 3.534437910990891e-05, "loss": 1.5907, "step": 2860 }, { "epoch": 2.2291021671826625, "grad_norm": 2.046623468399048, "learning_rate": 3.4052055110140455e-05, "loss": 1.5678, "step": 2880 }, { "epoch": 2.2445820433436534, "grad_norm": 2.445232391357422, "learning_rate": 3.277893472118392e-05, "loss": 1.5119, "step": 2900 }, { "epoch": 2.260061919504644, "grad_norm": 3.0307981967926025, "learning_rate": 3.152538866738108e-05, "loss": 1.418, "step": 2920 }, { "epoch": 2.2755417956656347, "grad_norm": 2.7826693058013916, "learning_rate": 3.029178197315533e-05, "loss": 1.561, "step": 2940 }, { "epoch": 2.291021671826625, "grad_norm": 3.2509384155273438, "learning_rate": 2.9078473856718636e-05, "loss": 1.639, "step": 2960 }, { "epoch": 2.306501547987616, "grad_norm": 1.799000859260559, "learning_rate": 2.7885817625469813e-05, "loss": 1.6101, "step": 2980 }, { "epoch": 2.321981424148607, "grad_norm": 2.011575222015381, "learning_rate": 2.67141605731135e-05, "loss": 1.4611, "step": 3000 }, { "epoch": 2.3374613003095974, "grad_norm": 2.495401382446289, "learning_rate": 2.5563843878530713e-05, "loss": 1.5235, "step": 3020 }, { "epoch": 2.3529411764705883, "grad_norm": 2.8658440113067627, "learning_rate": 2.4435202506429522e-05, "loss": 1.4874, "step": 3040 }, { "epoch": 2.3684210526315788, "grad_norm": 2.25388240814209, "learning_rate": 2.332856510980582e-05, "loss": 1.5087, "step": 3060 }, { "epoch": 2.3839009287925697, "grad_norm": 1.8696151971817017, "learning_rate": 2.224425393424142e-05, "loss": 1.4936, "step": 3080 }, { "epoch": 2.3993808049535605, "grad_norm": 1.8870784044265747, "learning_rate": 2.118258472406851e-05, "loss": 1.5062, "step": 3100 }, { "epoch": 2.414860681114551, "grad_norm": 2.0219244956970215, "learning_rate": 2.0143866630426733e-05, "loss": 1.5437, "step": 3120 }, { "epoch": 2.430340557275542, "grad_norm": 3.397005558013916, "learning_rate": 1.9128402121240586e-05, "loss": 1.4137, "step": 3140 }, { "epoch": 2.4458204334365323, "grad_norm": 6.5130133628845215, "learning_rate": 1.8136486893142592e-05, "loss": 1.4625, "step": 3160 }, { "epoch": 2.461300309597523, "grad_norm": 2.9240591526031494, "learning_rate": 1.7168409785368513e-05, "loss": 1.4246, "step": 3180 }, { "epoch": 2.476780185758514, "grad_norm": 2.079099655151367, "learning_rate": 1.622445269564905e-05, "loss": 1.4998, "step": 3200 }, { "epoch": 2.4922600619195046, "grad_norm": 2.2761752605438232, "learning_rate": 1.5304890498123338e-05, "loss": 1.4858, "step": 3220 }, { "epoch": 2.5077399380804954, "grad_norm": 3.006927728652954, "learning_rate": 1.4409990963297093e-05, "loss": 1.4152, "step": 3240 }, { "epoch": 2.523219814241486, "grad_norm": 2.3741161823272705, "learning_rate": 1.3540014680069857e-05, "loss": 1.5054, "step": 3260 }, { "epoch": 2.538699690402477, "grad_norm": 3.471731424331665, "learning_rate": 1.2695214979852987e-05, "loss": 1.4098, "step": 3280 }, { "epoch": 2.5541795665634677, "grad_norm": 2.505537509918213, "learning_rate": 1.1875837862801431e-05, "loss": 1.4786, "step": 3300 }, { "epoch": 2.569659442724458, "grad_norm": 2.0300590991973877, "learning_rate": 1.1082121926179844e-05, "loss": 1.4354, "step": 3320 }, { "epoch": 2.585139318885449, "grad_norm": 2.6981465816497803, "learning_rate": 1.0314298294884839e-05, "loss": 1.4713, "step": 3340 }, { "epoch": 2.6006191950464395, "grad_norm": 2.780489444732666, "learning_rate": 9.572590554142757e-06, "loss": 1.4439, "step": 3360 }, { "epoch": 2.6160990712074303, "grad_norm": 2.1285483837127686, "learning_rate": 8.85721468440327e-06, "loss": 1.5715, "step": 3380 }, { "epoch": 2.6315789473684212, "grad_norm": 2.0249032974243164, "learning_rate": 8.168378998447123e-06, "loss": 1.3899, "step": 3400 }, { "epoch": 2.6470588235294117, "grad_norm": 2.097409725189209, "learning_rate": 7.506284080726955e-06, "loss": 1.5343, "step": 3420 }, { "epoch": 2.6625386996904026, "grad_norm": 1.9598788022994995, "learning_rate": 6.87112272895829e-06, "loss": 1.4559, "step": 3440 }, { "epoch": 2.678018575851393, "grad_norm": 2.149622917175293, "learning_rate": 6.26307989797823e-06, "loss": 1.5203, "step": 3460 }, { "epoch": 2.693498452012384, "grad_norm": 1.9326039552688599, "learning_rate": 5.682332645887689e-06, "loss": 1.373, "step": 3480 }, { "epoch": 2.708978328173375, "grad_norm": 2.489595890045166, "learning_rate": 5.129050082493336e-06, "loss": 1.4674, "step": 3500 }, { "epoch": 2.7244582043343653, "grad_norm": 2.2312111854553223, "learning_rate": 4.603393320063831e-06, "loss": 1.4345, "step": 3520 }, { "epoch": 2.739938080495356, "grad_norm": 2.5505943298339844, "learning_rate": 4.105515426415074e-06, "loss": 1.3763, "step": 3540 }, { "epoch": 2.7554179566563466, "grad_norm": 2.39302659034729, "learning_rate": 3.6355613803378154e-06, "loss": 1.4481, "step": 3560 }, { "epoch": 2.7708978328173375, "grad_norm": 2.956669569015503, "learning_rate": 3.193668029380725e-06, "loss": 1.3423, "step": 3580 }, { "epoch": 2.7863777089783284, "grad_norm": 2.6478095054626465, "learning_rate": 2.7799640500014047e-06, "loss": 1.5363, "step": 3600 }, { "epoch": 2.801857585139319, "grad_norm": 2.2021002769470215, "learning_rate": 2.3945699100965e-06, "loss": 1.4281, "step": 3620 }, { "epoch": 2.8173374613003097, "grad_norm": 2.756779909133911, "learning_rate": 2.0375978339223776e-06, "loss": 1.3747, "step": 3640 }, { "epoch": 2.8328173374613, "grad_norm": 2.004383087158203, "learning_rate": 1.7091517694160286e-06, "loss": 1.3276, "step": 3660 }, { "epoch": 2.848297213622291, "grad_norm": 2.8317830562591553, "learning_rate": 1.4093273579261935e-06, "loss": 1.3893, "step": 3680 }, { "epoch": 2.863777089783282, "grad_norm": 2.2364342212677, "learning_rate": 1.1382119063631736e-06, "loss": 1.4623, "step": 3700 }, { "epoch": 2.8792569659442724, "grad_norm": 2.155151128768921, "learning_rate": 8.958843617757007e-07, "loss": 1.4898, "step": 3720 }, { "epoch": 2.8947368421052633, "grad_norm": 2.1057004928588867, "learning_rate": 6.824152883619705e-07, "loss": 1.4417, "step": 3740 }, { "epoch": 2.9102167182662537, "grad_norm": 2.36039662361145, "learning_rate": 4.978668469218906e-07, "loss": 1.4101, "step": 3760 }, { "epoch": 2.9256965944272446, "grad_norm": 2.6042068004608154, "learning_rate": 3.422927767562256e-07, "loss": 1.3303, "step": 3780 }, { "epoch": 2.9411764705882355, "grad_norm": 2.136204957962036, "learning_rate": 2.1573838001808232e-07, "loss": 1.4015, "step": 3800 }, { "epoch": 2.956656346749226, "grad_norm": 2.2292158603668213, "learning_rate": 1.182405085211724e-07, "loss": 1.4547, "step": 3820 }, { "epoch": 2.972136222910217, "grad_norm": 2.508124589920044, "learning_rate": 4.982755300889652e-08, "loss": 1.3741, "step": 3840 }, { "epoch": 2.9876160990712073, "grad_norm": 2.2655694484710693, "learning_rate": 1.0519434887057422e-08, "loss": 1.4247, "step": 3860 } ], "logging_steps": 20, "max_steps": 3876, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.4761450366959616e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }