{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 5, "global_step": 201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014925373134328358, "grad_norm": 7.200786590576172, "learning_rate": 4.5454545454545457e-07, "loss": 0.1719, "step": 1 }, { "epoch": 0.029850746268656716, "grad_norm": 7.886128902435303, "learning_rate": 9.090909090909091e-07, "loss": 0.1847, "step": 2 }, { "epoch": 0.04477611940298507, "grad_norm": 8.476142883300781, "learning_rate": 1.3636363636363636e-06, "loss": 0.2012, "step": 3 }, { "epoch": 0.05970149253731343, "grad_norm": 4.055701732635498, "learning_rate": 1.8181818181818183e-06, "loss": 0.1325, "step": 4 }, { "epoch": 0.07462686567164178, "grad_norm": 3.249504566192627, "learning_rate": 2.2727272727272728e-06, "loss": 0.0968, "step": 5 }, { "epoch": 0.07462686567164178, "eval_loss": 0.06899096071720123, "eval_runtime": 14.448, "eval_samples_per_second": 8.236, "eval_steps_per_second": 0.277, "step": 5 }, { "epoch": 0.08955223880597014, "grad_norm": 1.5543451309204102, "learning_rate": 2.7272727272727272e-06, "loss": 0.0721, "step": 6 }, { "epoch": 0.1044776119402985, "grad_norm": 1.0983480215072632, "learning_rate": 3.181818181818182e-06, "loss": 0.0506, "step": 7 }, { "epoch": 0.11940298507462686, "grad_norm": 0.9362279176712036, "learning_rate": 3.6363636363636366e-06, "loss": 0.0397, "step": 8 }, { "epoch": 0.13432835820895522, "grad_norm": 0.84356290102005, "learning_rate": 4.0909090909090915e-06, "loss": 0.0427, "step": 9 }, { "epoch": 0.14925373134328357, "grad_norm": 0.7953129410743713, "learning_rate": 4.5454545454545455e-06, "loss": 0.0346, "step": 10 }, { "epoch": 0.14925373134328357, "eval_loss": 0.03658153489232063, "eval_runtime": 8.2566, "eval_samples_per_second": 14.413, "eval_steps_per_second": 0.484, "step": 10 }, { "epoch": 0.16417910447761194, "grad_norm": 0.6031929850578308, "learning_rate": 5e-06, "loss": 0.0325, "step": 11 }, { "epoch": 0.1791044776119403, "grad_norm": 0.6844140291213989, "learning_rate": 4.999658262481173e-06, "loss": 0.0372, "step": 12 }, { "epoch": 0.19402985074626866, "grad_norm": 0.6683032512664795, "learning_rate": 4.998633143352315e-06, "loss": 0.0309, "step": 13 }, { "epoch": 0.208955223880597, "grad_norm": 0.5927382111549377, "learning_rate": 4.9969249228707625e-06, "loss": 0.0265, "step": 14 }, { "epoch": 0.22388059701492538, "grad_norm": 0.6329669952392578, "learning_rate": 4.994534068046936e-06, "loss": 0.0256, "step": 15 }, { "epoch": 0.22388059701492538, "eval_loss": 0.03213270381093025, "eval_runtime": 8.2934, "eval_samples_per_second": 14.349, "eval_steps_per_second": 0.482, "step": 15 }, { "epoch": 0.23880597014925373, "grad_norm": 0.689909040927887, "learning_rate": 4.991461232516675e-06, "loss": 0.0393, "step": 16 }, { "epoch": 0.2537313432835821, "grad_norm": 0.623458743095398, "learning_rate": 4.987707256362529e-06, "loss": 0.0329, "step": 17 }, { "epoch": 0.26865671641791045, "grad_norm": 0.622241735458374, "learning_rate": 4.983273165884096e-06, "loss": 0.0311, "step": 18 }, { "epoch": 0.2835820895522388, "grad_norm": 0.5380865335464478, "learning_rate": 4.978160173317439e-06, "loss": 0.0272, "step": 19 }, { "epoch": 0.29850746268656714, "grad_norm": 0.6085941791534424, "learning_rate": 4.972369676503672e-06, "loss": 0.0328, "step": 20 }, { "epoch": 0.29850746268656714, "eval_loss": 0.030199836939573288, "eval_runtime": 8.2944, "eval_samples_per_second": 14.347, "eval_steps_per_second": 0.482, "step": 20 }, { "epoch": 0.31343283582089554, "grad_norm": 0.5220608711242676, "learning_rate": 4.965903258506806e-06, "loss": 0.0286, "step": 21 }, { "epoch": 0.3283582089552239, "grad_norm": 0.5969937443733215, "learning_rate": 4.9587626871809564e-06, "loss": 0.0294, "step": 22 }, { "epoch": 0.34328358208955223, "grad_norm": 0.4778025448322296, "learning_rate": 4.950949914687024e-06, "loss": 0.0238, "step": 23 }, { "epoch": 0.3582089552238806, "grad_norm": 0.41100114583969116, "learning_rate": 4.942467076958999e-06, "loss": 0.022, "step": 24 }, { "epoch": 0.373134328358209, "grad_norm": 0.4831908941268921, "learning_rate": 4.933316493120015e-06, "loss": 0.0287, "step": 25 }, { "epoch": 0.373134328358209, "eval_loss": 0.02796478196978569, "eval_runtime": 8.2814, "eval_samples_per_second": 14.37, "eval_steps_per_second": 0.483, "step": 25 }, { "epoch": 0.3880597014925373, "grad_norm": 0.5058245062828064, "learning_rate": 4.923500664848327e-06, "loss": 0.0262, "step": 26 }, { "epoch": 0.40298507462686567, "grad_norm": 0.5201866626739502, "learning_rate": 4.913022275693372e-06, "loss": 0.029, "step": 27 }, { "epoch": 0.417910447761194, "grad_norm": 0.46352681517601013, "learning_rate": 4.901884190342121e-06, "loss": 0.0261, "step": 28 }, { "epoch": 0.43283582089552236, "grad_norm": 0.5337287187576294, "learning_rate": 4.890089453835894e-06, "loss": 0.0258, "step": 29 }, { "epoch": 0.44776119402985076, "grad_norm": 0.43212640285491943, "learning_rate": 4.8776412907378845e-06, "loss": 0.0221, "step": 30 }, { "epoch": 0.44776119402985076, "eval_loss": 0.027510978281497955, "eval_runtime": 8.3069, "eval_samples_per_second": 14.325, "eval_steps_per_second": 0.482, "step": 30 }, { "epoch": 0.4626865671641791, "grad_norm": 0.5198773145675659, "learning_rate": 4.864543104251587e-06, "loss": 0.0261, "step": 31 }, { "epoch": 0.47761194029850745, "grad_norm": 0.46835553646087646, "learning_rate": 4.850798475290403e-06, "loss": 0.0238, "step": 32 }, { "epoch": 0.4925373134328358, "grad_norm": 0.521562397480011, "learning_rate": 4.836411161498653e-06, "loss": 0.0311, "step": 33 }, { "epoch": 0.5074626865671642, "grad_norm": 0.3529301881790161, "learning_rate": 4.821385096224268e-06, "loss": 0.0216, "step": 34 }, { "epoch": 0.5223880597014925, "grad_norm": 0.559899091720581, "learning_rate": 4.8057243874434625e-06, "loss": 0.0347, "step": 35 }, { "epoch": 0.5223880597014925, "eval_loss": 0.026492305099964142, "eval_runtime": 8.3008, "eval_samples_per_second": 14.336, "eval_steps_per_second": 0.482, "step": 35 }, { "epoch": 0.5373134328358209, "grad_norm": 0.42942824959754944, "learning_rate": 4.789433316637644e-06, "loss": 0.0253, "step": 36 }, { "epoch": 0.5522388059701493, "grad_norm": 0.432647168636322, "learning_rate": 4.772516337622907e-06, "loss": 0.0209, "step": 37 }, { "epoch": 0.5671641791044776, "grad_norm": 0.47124215960502625, "learning_rate": 4.754978075332398e-06, "loss": 0.0213, "step": 38 }, { "epoch": 0.582089552238806, "grad_norm": 0.41911983489990234, "learning_rate": 4.736823324551909e-06, "loss": 0.0209, "step": 39 }, { "epoch": 0.5970149253731343, "grad_norm": 0.3859071731567383, "learning_rate": 4.71805704860903e-06, "loss": 0.0212, "step": 40 }, { "epoch": 0.5970149253731343, "eval_loss": 0.02548597753047943, "eval_runtime": 8.3185, "eval_samples_per_second": 14.306, "eval_steps_per_second": 0.481, "step": 40 }, { "epoch": 0.6119402985074627, "grad_norm": 0.5006839036941528, "learning_rate": 4.698684378016223e-06, "loss": 0.0237, "step": 41 }, { "epoch": 0.6268656716417911, "grad_norm": 0.35737383365631104, "learning_rate": 4.678710609068193e-06, "loss": 0.0218, "step": 42 }, { "epoch": 0.6417910447761194, "grad_norm": 0.41281658411026, "learning_rate": 4.658141202393935e-06, "loss": 0.0193, "step": 43 }, { "epoch": 0.6567164179104478, "grad_norm": 0.4524123966693878, "learning_rate": 4.636981781463848e-06, "loss": 0.0314, "step": 44 }, { "epoch": 0.6716417910447762, "grad_norm": 0.4154703617095947, "learning_rate": 4.615238131052339e-06, "loss": 0.0245, "step": 45 }, { "epoch": 0.6716417910447762, "eval_loss": 0.025405339896678925, "eval_runtime": 8.3109, "eval_samples_per_second": 14.318, "eval_steps_per_second": 0.481, "step": 45 }, { "epoch": 0.6865671641791045, "grad_norm": 0.45151591300964355, "learning_rate": 4.592916195656322e-06, "loss": 0.0273, "step": 46 }, { "epoch": 0.7014925373134329, "grad_norm": 0.3298991918563843, "learning_rate": 4.570022077870051e-06, "loss": 0.0204, "step": 47 }, { "epoch": 0.7164179104477612, "grad_norm": 0.4888248145580292, "learning_rate": 4.546562036716732e-06, "loss": 0.0269, "step": 48 }, { "epoch": 0.7313432835820896, "grad_norm": 0.3940542936325073, "learning_rate": 4.522542485937369e-06, "loss": 0.0235, "step": 49 }, { "epoch": 0.746268656716418, "grad_norm": 0.3737621307373047, "learning_rate": 4.497969992237312e-06, "loss": 0.0209, "step": 50 }, { "epoch": 0.746268656716418, "eval_loss": 0.024418316781520844, "eval_runtime": 8.3141, "eval_samples_per_second": 14.313, "eval_steps_per_second": 0.481, "step": 50 }, { "epoch": 0.7611940298507462, "grad_norm": 0.32089656591415405, "learning_rate": 4.472851273490985e-06, "loss": 0.0182, "step": 51 }, { "epoch": 0.7761194029850746, "grad_norm": 0.4259447455406189, "learning_rate": 4.4471931969052816e-06, "loss": 0.0229, "step": 52 }, { "epoch": 0.7910447761194029, "grad_norm": 0.33431047201156616, "learning_rate": 4.421002777142148e-06, "loss": 0.0191, "step": 53 }, { "epoch": 0.8059701492537313, "grad_norm": 0.3420027196407318, "learning_rate": 4.394287174400838e-06, "loss": 0.0187, "step": 54 }, { "epoch": 0.8208955223880597, "grad_norm": 0.4479522407054901, "learning_rate": 4.3670536924603855e-06, "loss": 0.0242, "step": 55 }, { "epoch": 0.8208955223880597, "eval_loss": 0.023885194212198257, "eval_runtime": 8.3377, "eval_samples_per_second": 14.273, "eval_steps_per_second": 0.48, "step": 55 }, { "epoch": 0.835820895522388, "grad_norm": 0.46404945850372314, "learning_rate": 4.33930977668283e-06, "loss": 0.0226, "step": 56 }, { "epoch": 0.8507462686567164, "grad_norm": 0.49134090542793274, "learning_rate": 4.311063011977723e-06, "loss": 0.0277, "step": 57 }, { "epoch": 0.8656716417910447, "grad_norm": 0.373234361410141, "learning_rate": 4.282321120728493e-06, "loss": 0.0199, "step": 58 }, { "epoch": 0.8805970149253731, "grad_norm": 0.31845158338546753, "learning_rate": 4.253091960681222e-06, "loss": 0.0196, "step": 59 }, { "epoch": 0.8955223880597015, "grad_norm": 0.4006720781326294, "learning_rate": 4.2233835227964145e-06, "loss": 0.0226, "step": 60 }, { "epoch": 0.8955223880597015, "eval_loss": 0.023553457111120224, "eval_runtime": 8.2827, "eval_samples_per_second": 14.367, "eval_steps_per_second": 0.483, "step": 60 }, { "epoch": 0.9104477611940298, "grad_norm": 0.395221084356308, "learning_rate": 4.1932039290643534e-06, "loss": 0.0238, "step": 61 }, { "epoch": 0.9253731343283582, "grad_norm": 0.40594613552093506, "learning_rate": 4.162561430284621e-06, "loss": 0.0235, "step": 62 }, { "epoch": 0.9402985074626866, "grad_norm": 0.32194435596466064, "learning_rate": 4.1314644038104215e-06, "loss": 0.0188, "step": 63 }, { "epoch": 0.9552238805970149, "grad_norm": 0.36028629541397095, "learning_rate": 4.099921351258292e-06, "loss": 0.0207, "step": 64 }, { "epoch": 0.9701492537313433, "grad_norm": 0.3143925368785858, "learning_rate": 4.067940896183843e-06, "loss": 0.0208, "step": 65 }, { "epoch": 0.9701492537313433, "eval_loss": 0.023211363703012466, "eval_runtime": 8.272, "eval_samples_per_second": 14.386, "eval_steps_per_second": 0.484, "step": 65 }, { "epoch": 0.9850746268656716, "grad_norm": 0.4360805153846741, "learning_rate": 4.0355317817241705e-06, "loss": 0.0195, "step": 66 }, { "epoch": 1.0, "grad_norm": 0.424087256193161, "learning_rate": 4.002702868207563e-06, "loss": 0.0243, "step": 67 }, { "epoch": 1.0149253731343284, "grad_norm": 0.3290930986404419, "learning_rate": 3.969463130731183e-06, "loss": 0.0169, "step": 68 }, { "epoch": 1.0298507462686568, "grad_norm": 0.34683364629745483, "learning_rate": 3.935821656707359e-06, "loss": 0.0188, "step": 69 }, { "epoch": 1.044776119402985, "grad_norm": 0.32021239399909973, "learning_rate": 3.901787643379183e-06, "loss": 0.0135, "step": 70 }, { "epoch": 1.044776119402985, "eval_loss": 0.022930506616830826, "eval_runtime": 8.303, "eval_samples_per_second": 14.332, "eval_steps_per_second": 0.482, "step": 70 }, { "epoch": 1.0597014925373134, "grad_norm": 0.2900172472000122, "learning_rate": 3.8673703953060685e-06, "loss": 0.0169, "step": 71 }, { "epoch": 1.0746268656716418, "grad_norm": 0.35844284296035767, "learning_rate": 3.832579321819985e-06, "loss": 0.0152, "step": 72 }, { "epoch": 1.0895522388059702, "grad_norm": 0.352897584438324, "learning_rate": 3.797423934453038e-06, "loss": 0.0168, "step": 73 }, { "epoch": 1.1044776119402986, "grad_norm": 0.35191500186920166, "learning_rate": 3.76191384433711e-06, "loss": 0.0173, "step": 74 }, { "epoch": 1.1194029850746268, "grad_norm": 0.28631043434143066, "learning_rate": 3.726058759576271e-06, "loss": 0.0141, "step": 75 }, { "epoch": 1.1194029850746268, "eval_loss": 0.023363711312413216, "eval_runtime": 8.2771, "eval_samples_per_second": 14.377, "eval_steps_per_second": 0.483, "step": 75 }, { "epoch": 1.1343283582089552, "grad_norm": 0.36245712637901306, "learning_rate": 3.6898684825926845e-06, "loss": 0.0133, "step": 76 }, { "epoch": 1.1492537313432836, "grad_norm": 0.3419128954410553, "learning_rate": 3.65335290744672e-06, "loss": 0.0139, "step": 77 }, { "epoch": 1.164179104477612, "grad_norm": 0.3986120820045471, "learning_rate": 3.616522017132017e-06, "loss": 0.0168, "step": 78 }, { "epoch": 1.1791044776119404, "grad_norm": 0.3793441951274872, "learning_rate": 3.579385880846232e-06, "loss": 0.0165, "step": 79 }, { "epoch": 1.1940298507462686, "grad_norm": 0.36774227023124695, "learning_rate": 3.5419546512382264e-06, "loss": 0.0165, "step": 80 }, { "epoch": 1.1940298507462686, "eval_loss": 0.023542851209640503, "eval_runtime": 8.316, "eval_samples_per_second": 14.31, "eval_steps_per_second": 0.481, "step": 80 }, { "epoch": 1.208955223880597, "grad_norm": 0.36220625042915344, "learning_rate": 3.5042385616324243e-06, "loss": 0.0189, "step": 81 }, { "epoch": 1.2238805970149254, "grad_norm": 0.3013781011104584, "learning_rate": 3.466247923231131e-06, "loss": 0.0141, "step": 82 }, { "epoch": 1.2388059701492538, "grad_norm": 0.359733521938324, "learning_rate": 3.427993122295552e-06, "loss": 0.0161, "step": 83 }, { "epoch": 1.2537313432835822, "grad_norm": 0.39510107040405273, "learning_rate": 3.3894846173062917e-06, "loss": 0.0153, "step": 84 }, { "epoch": 1.2686567164179103, "grad_norm": 0.38427668809890747, "learning_rate": 3.350732936104108e-06, "loss": 0.0173, "step": 85 }, { "epoch": 1.2686567164179103, "eval_loss": 0.023341603577136993, "eval_runtime": 8.5378, "eval_samples_per_second": 13.938, "eval_steps_per_second": 0.469, "step": 85 }, { "epoch": 1.2835820895522387, "grad_norm": 0.30994802713394165, "learning_rate": 3.3117486730117092e-06, "loss": 0.0134, "step": 86 }, { "epoch": 1.2985074626865671, "grad_norm": 0.3489951193332672, "learning_rate": 3.272542485937369e-06, "loss": 0.0169, "step": 87 }, { "epoch": 1.3134328358208955, "grad_norm": 0.31990131735801697, "learning_rate": 3.2331250934611623e-06, "loss": 0.0169, "step": 88 }, { "epoch": 1.328358208955224, "grad_norm": 0.29082977771759033, "learning_rate": 3.193507271904612e-06, "loss": 0.0121, "step": 89 }, { "epoch": 1.3432835820895521, "grad_norm": 0.3279978334903717, "learning_rate": 3.15369985238455e-06, "loss": 0.0123, "step": 90 }, { "epoch": 1.3432835820895521, "eval_loss": 0.02307475358247757, "eval_runtime": 8.3253, "eval_samples_per_second": 14.294, "eval_steps_per_second": 0.48, "step": 90 }, { "epoch": 1.3582089552238805, "grad_norm": 0.30484575033187866, "learning_rate": 3.1137137178519983e-06, "loss": 0.0153, "step": 91 }, { "epoch": 1.373134328358209, "grad_norm": 0.37242934107780457, "learning_rate": 3.073559800116879e-06, "loss": 0.0189, "step": 92 }, { "epoch": 1.3880597014925373, "grad_norm": 0.33749932050704956, "learning_rate": 3.0332490768593676e-06, "loss": 0.02, "step": 93 }, { "epoch": 1.4029850746268657, "grad_norm": 0.322444349527359, "learning_rate": 2.9927925686287006e-06, "loss": 0.0135, "step": 94 }, { "epoch": 1.417910447761194, "grad_norm": 0.3586093783378601, "learning_rate": 2.9522013358302754e-06, "loss": 0.0145, "step": 95 }, { "epoch": 1.417910447761194, "eval_loss": 0.02322915382683277, "eval_runtime": 8.3124, "eval_samples_per_second": 14.316, "eval_steps_per_second": 0.481, "step": 95 }, { "epoch": 1.4328358208955223, "grad_norm": 0.29217156767845154, "learning_rate": 2.911486475701835e-06, "loss": 0.0132, "step": 96 }, { "epoch": 1.4477611940298507, "grad_norm": 0.36368077993392944, "learning_rate": 2.870659119279605e-06, "loss": 0.0157, "step": 97 }, { "epoch": 1.462686567164179, "grad_norm": 0.44833701848983765, "learning_rate": 2.829730428355173e-06, "loss": 0.0163, "step": 98 }, { "epoch": 1.4776119402985075, "grad_norm": 0.3234724700450897, "learning_rate": 2.788711592423966e-06, "loss": 0.0126, "step": 99 }, { "epoch": 1.4925373134328357, "grad_norm": 0.3331272006034851, "learning_rate": 2.7476138256261575e-06, "loss": 0.0154, "step": 100 }, { "epoch": 1.4925373134328357, "eval_loss": 0.02257104031741619, "eval_runtime": 8.3116, "eval_samples_per_second": 14.317, "eval_steps_per_second": 0.481, "step": 100 }, { "epoch": 1.5074626865671643, "grad_norm": 0.2923891544342041, "learning_rate": 2.7064483636808314e-06, "loss": 0.012, "step": 101 }, { "epoch": 1.5223880597014925, "grad_norm": 0.4166359007358551, "learning_rate": 2.6652264608142487e-06, "loss": 0.0207, "step": 102 }, { "epoch": 1.537313432835821, "grad_norm": 0.3134080469608307, "learning_rate": 2.623959386683056e-06, "loss": 0.0129, "step": 103 }, { "epoch": 1.5522388059701493, "grad_norm": 0.33056458830833435, "learning_rate": 2.5826584232932707e-06, "loss": 0.0141, "step": 104 }, { "epoch": 1.5671641791044775, "grad_norm": 0.36771681904792786, "learning_rate": 2.5413348619158966e-06, "loss": 0.0147, "step": 105 }, { "epoch": 1.5671641791044775, "eval_loss": 0.022433940321207047, "eval_runtime": 8.2998, "eval_samples_per_second": 14.338, "eval_steps_per_second": 0.482, "step": 105 }, { "epoch": 1.582089552238806, "grad_norm": 0.36414942145347595, "learning_rate": 2.5e-06, "loss": 0.017, "step": 106 }, { "epoch": 1.5970149253731343, "grad_norm": 0.37462717294692993, "learning_rate": 2.458665138084104e-06, "loss": 0.0152, "step": 107 }, { "epoch": 1.6119402985074627, "grad_norm": 0.3305724263191223, "learning_rate": 2.4173415767067297e-06, "loss": 0.0147, "step": 108 }, { "epoch": 1.626865671641791, "grad_norm": 0.3597583472728729, "learning_rate": 2.376040613316944e-06, "loss": 0.0152, "step": 109 }, { "epoch": 1.6417910447761193, "grad_norm": 0.32973572611808777, "learning_rate": 2.3347735391857517e-06, "loss": 0.0132, "step": 110 }, { "epoch": 1.6417910447761193, "eval_loss": 0.022819483652710915, "eval_runtime": 8.3196, "eval_samples_per_second": 14.304, "eval_steps_per_second": 0.481, "step": 110 }, { "epoch": 1.6567164179104479, "grad_norm": 0.28718486428260803, "learning_rate": 2.2935516363191695e-06, "loss": 0.0115, "step": 111 }, { "epoch": 1.671641791044776, "grad_norm": 0.3861285150051117, "learning_rate": 2.2523861743738433e-06, "loss": 0.0159, "step": 112 }, { "epoch": 1.6865671641791045, "grad_norm": 0.465282678604126, "learning_rate": 2.211288407576035e-06, "loss": 0.0187, "step": 113 }, { "epoch": 1.7014925373134329, "grad_norm": 0.31902506947517395, "learning_rate": 2.1702695716448276e-06, "loss": 0.0142, "step": 114 }, { "epoch": 1.716417910447761, "grad_norm": 0.3674122989177704, "learning_rate": 2.129340880720395e-06, "loss": 0.0155, "step": 115 }, { "epoch": 1.716417910447761, "eval_loss": 0.022671934217214584, "eval_runtime": 8.2807, "eval_samples_per_second": 14.371, "eval_steps_per_second": 0.483, "step": 115 }, { "epoch": 1.7313432835820897, "grad_norm": 0.4284082353115082, "learning_rate": 2.088513524298165e-06, "loss": 0.0185, "step": 116 }, { "epoch": 1.7462686567164178, "grad_norm": 0.3359587490558624, "learning_rate": 2.0477986641697263e-06, "loss": 0.0161, "step": 117 }, { "epoch": 1.7611940298507462, "grad_norm": 0.33396944403648376, "learning_rate": 2.0072074313713e-06, "loss": 0.0109, "step": 118 }, { "epoch": 1.7761194029850746, "grad_norm": 0.3261159658432007, "learning_rate": 1.9667509231406332e-06, "loss": 0.0142, "step": 119 }, { "epoch": 1.7910447761194028, "grad_norm": 0.35204392671585083, "learning_rate": 1.9264401998831213e-06, "loss": 0.0149, "step": 120 }, { "epoch": 1.7910447761194028, "eval_loss": 0.022122090682387352, "eval_runtime": 8.2768, "eval_samples_per_second": 14.378, "eval_steps_per_second": 0.483, "step": 120 }, { "epoch": 1.8059701492537314, "grad_norm": 0.320769727230072, "learning_rate": 1.8862862821480023e-06, "loss": 0.0145, "step": 121 }, { "epoch": 1.8208955223880596, "grad_norm": 0.34036341309547424, "learning_rate": 1.8463001476154508e-06, "loss": 0.0142, "step": 122 }, { "epoch": 1.835820895522388, "grad_norm": 0.2602291405200958, "learning_rate": 1.8064927280953893e-06, "loss": 0.0117, "step": 123 }, { "epoch": 1.8507462686567164, "grad_norm": 0.3351423442363739, "learning_rate": 1.7668749065388385e-06, "loss": 0.0132, "step": 124 }, { "epoch": 1.8656716417910446, "grad_norm": 0.4113386869430542, "learning_rate": 1.7274575140626318e-06, "loss": 0.0169, "step": 125 }, { "epoch": 1.8656716417910446, "eval_loss": 0.02193240076303482, "eval_runtime": 8.2973, "eval_samples_per_second": 14.342, "eval_steps_per_second": 0.482, "step": 125 }, { "epoch": 1.8805970149253732, "grad_norm": 0.3114112615585327, "learning_rate": 1.6882513269882916e-06, "loss": 0.0134, "step": 126 }, { "epoch": 1.8955223880597014, "grad_norm": 0.39908191561698914, "learning_rate": 1.6492670638958924e-06, "loss": 0.0195, "step": 127 }, { "epoch": 1.9104477611940298, "grad_norm": 0.3014167845249176, "learning_rate": 1.6105153826937087e-06, "loss": 0.0147, "step": 128 }, { "epoch": 1.9253731343283582, "grad_norm": 0.3079487681388855, "learning_rate": 1.5720068777044479e-06, "loss": 0.0134, "step": 129 }, { "epoch": 1.9402985074626866, "grad_norm": 0.28381192684173584, "learning_rate": 1.53375207676887e-06, "loss": 0.0136, "step": 130 }, { "epoch": 1.9402985074626866, "eval_loss": 0.02178417146205902, "eval_runtime": 8.276, "eval_samples_per_second": 14.379, "eval_steps_per_second": 0.483, "step": 130 }, { "epoch": 1.955223880597015, "grad_norm": 0.344938188791275, "learning_rate": 1.495761438367577e-06, "loss": 0.0137, "step": 131 }, { "epoch": 1.9701492537313432, "grad_norm": 0.3104920983314514, "learning_rate": 1.4580453487617747e-06, "loss": 0.0146, "step": 132 }, { "epoch": 1.9850746268656716, "grad_norm": 0.3058537244796753, "learning_rate": 1.4206141191537681e-06, "loss": 0.0141, "step": 133 }, { "epoch": 2.0, "grad_norm": 0.311489999294281, "learning_rate": 1.383477982867984e-06, "loss": 0.0127, "step": 134 }, { "epoch": 2.014925373134328, "grad_norm": 0.35948511958122253, "learning_rate": 1.346647092553281e-06, "loss": 0.0139, "step": 135 }, { "epoch": 2.014925373134328, "eval_loss": 0.02187744900584221, "eval_runtime": 8.2762, "eval_samples_per_second": 14.379, "eval_steps_per_second": 0.483, "step": 135 }, { "epoch": 2.029850746268657, "grad_norm": 0.24142009019851685, "learning_rate": 1.3101315174073162e-06, "loss": 0.011, "step": 136 }, { "epoch": 2.044776119402985, "grad_norm": 0.2628946602344513, "learning_rate": 1.2739412404237306e-06, "loss": 0.0107, "step": 137 }, { "epoch": 2.0597014925373136, "grad_norm": 0.2656816244125366, "learning_rate": 1.2380861556628915e-06, "loss": 0.0087, "step": 138 }, { "epoch": 2.074626865671642, "grad_norm": 0.23779849708080292, "learning_rate": 1.2025760655469629e-06, "loss": 0.0096, "step": 139 }, { "epoch": 2.08955223880597, "grad_norm": 0.2537413537502289, "learning_rate": 1.1674206781800162e-06, "loss": 0.0101, "step": 140 }, { "epoch": 2.08955223880597, "eval_loss": 0.02204073593020439, "eval_runtime": 8.28, "eval_samples_per_second": 14.372, "eval_steps_per_second": 0.483, "step": 140 }, { "epoch": 2.1044776119402986, "grad_norm": 0.22632823884487152, "learning_rate": 1.1326296046939334e-06, "loss": 0.0097, "step": 141 }, { "epoch": 2.1194029850746268, "grad_norm": 0.2632769048213959, "learning_rate": 1.0982123566208187e-06, "loss": 0.012, "step": 142 }, { "epoch": 2.1343283582089554, "grad_norm": 0.2738955616950989, "learning_rate": 1.0641783432926412e-06, "loss": 0.0109, "step": 143 }, { "epoch": 2.1492537313432836, "grad_norm": 0.30985602736473083, "learning_rate": 1.0305368692688175e-06, "loss": 0.011, "step": 144 }, { "epoch": 2.1641791044776117, "grad_norm": 0.2866548001766205, "learning_rate": 9.972971317924373e-07, "loss": 0.0087, "step": 145 }, { "epoch": 2.1641791044776117, "eval_loss": 0.02224661409854889, "eval_runtime": 8.27, "eval_samples_per_second": 14.389, "eval_steps_per_second": 0.484, "step": 145 }, { "epoch": 2.1791044776119404, "grad_norm": 0.29876846075057983, "learning_rate": 9.644682182758305e-07, "loss": 0.0114, "step": 146 }, { "epoch": 2.1940298507462686, "grad_norm": 0.2641301453113556, "learning_rate": 9.320591038161575e-07, "loss": 0.0099, "step": 147 }, { "epoch": 2.208955223880597, "grad_norm": 0.3317098915576935, "learning_rate": 9.000786487417084e-07, "loss": 0.0115, "step": 148 }, { "epoch": 2.2238805970149254, "grad_norm": 0.22708454728126526, "learning_rate": 8.685355961895783e-07, "loss": 0.0088, "step": 149 }, { "epoch": 2.2388059701492535, "grad_norm": 0.24335134029388428, "learning_rate": 8.374385697153792e-07, "loss": 0.0089, "step": 150 }, { "epoch": 2.2388059701492535, "eval_loss": 0.022272665053606033, "eval_runtime": 8.2936, "eval_samples_per_second": 14.348, "eval_steps_per_second": 0.482, "step": 150 }, { "epoch": 2.253731343283582, "grad_norm": 0.26413631439208984, "learning_rate": 8.067960709356479e-07, "loss": 0.0101, "step": 151 }, { "epoch": 2.2686567164179103, "grad_norm": 0.24371112883090973, "learning_rate": 7.766164772035856e-07, "loss": 0.0079, "step": 152 }, { "epoch": 2.283582089552239, "grad_norm": 0.25570055842399597, "learning_rate": 7.469080393187786e-07, "loss": 0.0089, "step": 153 }, { "epoch": 2.298507462686567, "grad_norm": 0.26083242893218994, "learning_rate": 7.176788792715076e-07, "loss": 0.008, "step": 154 }, { "epoch": 2.3134328358208958, "grad_norm": 0.36927682161331177, "learning_rate": 6.889369880222776e-07, "loss": 0.0112, "step": 155 }, { "epoch": 2.3134328358208958, "eval_loss": 0.0225172471255064, "eval_runtime": 8.2668, "eval_samples_per_second": 14.395, "eval_steps_per_second": 0.484, "step": 155 }, { "epoch": 2.328358208955224, "grad_norm": 0.2831665873527527, "learning_rate": 6.60690223317171e-07, "loss": 0.0093, "step": 156 }, { "epoch": 2.343283582089552, "grad_norm": 0.2767029106616974, "learning_rate": 6.329463075396161e-07, "loss": 0.0093, "step": 157 }, { "epoch": 2.3582089552238807, "grad_norm": 0.22897273302078247, "learning_rate": 6.057128255991637e-07, "loss": 0.007, "step": 158 }, { "epoch": 2.373134328358209, "grad_norm": 0.2247919887304306, "learning_rate": 5.78997222857853e-07, "loss": 0.0081, "step": 159 }, { "epoch": 2.388059701492537, "grad_norm": 0.25745531916618347, "learning_rate": 5.528068030947193e-07, "loss": 0.0083, "step": 160 }, { "epoch": 2.388059701492537, "eval_loss": 0.022666901350021362, "eval_runtime": 8.287, "eval_samples_per_second": 14.36, "eval_steps_per_second": 0.483, "step": 160 }, { "epoch": 2.4029850746268657, "grad_norm": 0.2648358643054962, "learning_rate": 5.271487265090163e-07, "loss": 0.009, "step": 161 }, { "epoch": 2.417910447761194, "grad_norm": 0.28286054730415344, "learning_rate": 5.020300077626883e-07, "loss": 0.0101, "step": 162 }, { "epoch": 2.4328358208955225, "grad_norm": 0.28459128737449646, "learning_rate": 4.774575140626317e-07, "loss": 0.0087, "step": 163 }, { "epoch": 2.4477611940298507, "grad_norm": 0.21343541145324707, "learning_rate": 4.534379632832692e-07, "loss": 0.0079, "step": 164 }, { "epoch": 2.4626865671641793, "grad_norm": 0.2659505009651184, "learning_rate": 4.299779221299499e-07, "loss": 0.008, "step": 165 }, { "epoch": 2.4626865671641793, "eval_loss": 0.022702785208821297, "eval_runtime": 8.2796, "eval_samples_per_second": 14.373, "eval_steps_per_second": 0.483, "step": 165 }, { "epoch": 2.4776119402985075, "grad_norm": 0.29091840982437134, "learning_rate": 4.070838043436787e-07, "loss": 0.0093, "step": 166 }, { "epoch": 2.4925373134328357, "grad_norm": 0.24148190021514893, "learning_rate": 3.847618689476612e-07, "loss": 0.0083, "step": 167 }, { "epoch": 2.5074626865671643, "grad_norm": 0.33075305819511414, "learning_rate": 3.630182185361522e-07, "loss": 0.0081, "step": 168 }, { "epoch": 2.5223880597014925, "grad_norm": 0.24389711022377014, "learning_rate": 3.4185879760606525e-07, "loss": 0.0079, "step": 169 }, { "epoch": 2.5373134328358207, "grad_norm": 0.32192865014076233, "learning_rate": 3.2128939093180654e-07, "loss": 0.0109, "step": 170 }, { "epoch": 2.5373134328358207, "eval_loss": 0.022760972380638123, "eval_runtime": 8.2746, "eval_samples_per_second": 14.381, "eval_steps_per_second": 0.483, "step": 170 }, { "epoch": 2.5522388059701493, "grad_norm": 0.3323243260383606, "learning_rate": 3.0131562198377763e-07, "loss": 0.0074, "step": 171 }, { "epoch": 2.5671641791044775, "grad_norm": 0.209702268242836, "learning_rate": 2.819429513909705e-07, "loss": 0.006, "step": 172 }, { "epoch": 2.582089552238806, "grad_norm": 0.29595962166786194, "learning_rate": 2.6317667544809135e-07, "loss": 0.0102, "step": 173 }, { "epoch": 2.5970149253731343, "grad_norm": 0.23989610373973846, "learning_rate": 2.450219246676028e-07, "loss": 0.0073, "step": 174 }, { "epoch": 2.611940298507463, "grad_norm": 0.3013235330581665, "learning_rate": 2.2748366237709374e-07, "loss": 0.0103, "step": 175 }, { "epoch": 2.611940298507463, "eval_loss": 0.02282480150461197, "eval_runtime": 8.2894, "eval_samples_per_second": 14.356, "eval_steps_per_second": 0.483, "step": 175 }, { "epoch": 2.626865671641791, "grad_norm": 0.23502513766288757, "learning_rate": 2.1056668336235624e-07, "loss": 0.0089, "step": 176 }, { "epoch": 2.6417910447761193, "grad_norm": 0.26030927896499634, "learning_rate": 1.9427561255653816e-07, "loss": 0.0082, "step": 177 }, { "epoch": 2.656716417910448, "grad_norm": 0.27521783113479614, "learning_rate": 1.786149037757326e-07, "loss": 0.0089, "step": 178 }, { "epoch": 2.671641791044776, "grad_norm": 0.3519136905670166, "learning_rate": 1.6358883850134815e-07, "loss": 0.0121, "step": 179 }, { "epoch": 2.6865671641791042, "grad_norm": 0.25255823135375977, "learning_rate": 1.492015247095971e-07, "loss": 0.008, "step": 180 }, { "epoch": 2.6865671641791042, "eval_loss": 0.02286040224134922, "eval_runtime": 8.2767, "eval_samples_per_second": 14.378, "eval_steps_per_second": 0.483, "step": 180 }, { "epoch": 2.701492537313433, "grad_norm": 0.24869883060455322, "learning_rate": 1.3545689574841341e-07, "loss": 0.0072, "step": 181 }, { "epoch": 2.716417910447761, "grad_norm": 0.3076685070991516, "learning_rate": 1.223587092621162e-07, "loss": 0.009, "step": 182 }, { "epoch": 2.7313432835820897, "grad_norm": 0.2468510866165161, "learning_rate": 1.099105461641059e-07, "loss": 0.0091, "step": 183 }, { "epoch": 2.746268656716418, "grad_norm": 0.4652602970600128, "learning_rate": 9.811580965787965e-08, "loss": 0.0078, "step": 184 }, { "epoch": 2.7611940298507465, "grad_norm": 0.30688929557800293, "learning_rate": 8.697772430662859e-08, "loss": 0.0103, "step": 185 }, { "epoch": 2.7611940298507465, "eval_loss": 0.022866524755954742, "eval_runtime": 8.2907, "eval_samples_per_second": 14.353, "eval_steps_per_second": 0.482, "step": 185 }, { "epoch": 2.7761194029850746, "grad_norm": 0.3009447753429413, "learning_rate": 7.649933515167407e-08, "loss": 0.0077, "step": 186 }, { "epoch": 2.791044776119403, "grad_norm": 0.3035335838794708, "learning_rate": 6.668350687998565e-08, "loss": 0.0104, "step": 187 }, { "epoch": 2.8059701492537314, "grad_norm": 0.2949415445327759, "learning_rate": 5.753292304100183e-08, "loss": 0.0084, "step": 188 }, { "epoch": 2.8208955223880596, "grad_norm": 0.26432371139526367, "learning_rate": 4.905008531297661e-08, "loss": 0.0085, "step": 189 }, { "epoch": 2.835820895522388, "grad_norm": 0.25077345967292786, "learning_rate": 4.123731281904408e-08, "loss": 0.008, "step": 190 }, { "epoch": 2.835820895522388, "eval_loss": 0.02286113053560257, "eval_runtime": 8.2705, "eval_samples_per_second": 14.389, "eval_steps_per_second": 0.484, "step": 190 }, { "epoch": 2.8507462686567164, "grad_norm": 0.2702929675579071, "learning_rate": 3.4096741493194196e-08, "loss": 0.0096, "step": 191 }, { "epoch": 2.8656716417910446, "grad_norm": 0.3009881377220154, "learning_rate": 2.763032349632877e-08, "loss": 0.0098, "step": 192 }, { "epoch": 2.8805970149253732, "grad_norm": 0.24812233448028564, "learning_rate": 2.1839826682562015e-08, "loss": 0.0081, "step": 193 }, { "epoch": 2.8955223880597014, "grad_norm": 0.2779428958892822, "learning_rate": 1.6726834115904645e-08, "loss": 0.008, "step": 194 }, { "epoch": 2.91044776119403, "grad_norm": 0.3340453803539276, "learning_rate": 1.2292743637471461e-08, "loss": 0.0109, "step": 195 }, { "epoch": 2.91044776119403, "eval_loss": 0.022852875292301178, "eval_runtime": 8.2644, "eval_samples_per_second": 14.399, "eval_steps_per_second": 0.484, "step": 195 }, { "epoch": 2.925373134328358, "grad_norm": 0.2553161382675171, "learning_rate": 8.538767483325384e-09, "loss": 0.0068, "step": 196 }, { "epoch": 2.9402985074626864, "grad_norm": 0.2572329044342041, "learning_rate": 5.465931953063663e-09, "loss": 0.0097, "step": 197 }, { "epoch": 2.955223880597015, "grad_norm": 0.25870975852012634, "learning_rate": 3.0750771292381575e-09, "loss": 0.0093, "step": 198 }, { "epoch": 2.970149253731343, "grad_norm": 0.21340855956077576, "learning_rate": 1.3668566476848777e-09, "loss": 0.0075, "step": 199 }, { "epoch": 2.9850746268656714, "grad_norm": 0.3591207265853882, "learning_rate": 3.4173751882748964e-10, "loss": 0.009, "step": 200 }, { "epoch": 2.9850746268656714, "eval_loss": 0.022861387580633163, "eval_runtime": 8.2928, "eval_samples_per_second": 14.35, "eval_steps_per_second": 0.482, "step": 200 }, { "epoch": 3.0, "grad_norm": 0.2385999709367752, "learning_rate": 0.0, "loss": 0.0082, "step": 201 }, { "epoch": 3.0, "step": 201, "total_flos": 21438131871744.0, "train_loss": 0.020334236270548842, "train_runtime": 4074.9852, "train_samples_per_second": 0.786, "train_steps_per_second": 0.049 } ], "logging_steps": 1, "max_steps": 201, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 81, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 21438131871744.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }