| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 5, |
| "global_step": 201, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014925373134328358, |
| "grad_norm": 7.200786590576172, |
| "learning_rate": 4.5454545454545457e-07, |
| "loss": 0.1719, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.029850746268656716, |
| "grad_norm": 7.886128902435303, |
| "learning_rate": 9.090909090909091e-07, |
| "loss": 0.1847, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.04477611940298507, |
| "grad_norm": 8.476142883300781, |
| "learning_rate": 1.3636363636363636e-06, |
| "loss": 0.2012, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.05970149253731343, |
| "grad_norm": 4.055701732635498, |
| "learning_rate": 1.8181818181818183e-06, |
| "loss": 0.1325, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 3.249504566192627, |
| "learning_rate": 2.2727272727272728e-06, |
| "loss": 0.0968, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "eval_loss": 0.06899096071720123, |
| "eval_runtime": 14.448, |
| "eval_samples_per_second": 8.236, |
| "eval_steps_per_second": 0.277, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.08955223880597014, |
| "grad_norm": 1.5543451309204102, |
| "learning_rate": 2.7272727272727272e-06, |
| "loss": 0.0721, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.1044776119402985, |
| "grad_norm": 1.0983480215072632, |
| "learning_rate": 3.181818181818182e-06, |
| "loss": 0.0506, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.11940298507462686, |
| "grad_norm": 0.9362279176712036, |
| "learning_rate": 3.6363636363636366e-06, |
| "loss": 0.0397, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.13432835820895522, |
| "grad_norm": 0.84356290102005, |
| "learning_rate": 4.0909090909090915e-06, |
| "loss": 0.0427, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "grad_norm": 0.7953129410743713, |
| "learning_rate": 4.5454545454545455e-06, |
| "loss": 0.0346, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "eval_loss": 0.03658153489232063, |
| "eval_runtime": 8.2566, |
| "eval_samples_per_second": 14.413, |
| "eval_steps_per_second": 0.484, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.16417910447761194, |
| "grad_norm": 0.6031929850578308, |
| "learning_rate": 5e-06, |
| "loss": 0.0325, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.1791044776119403, |
| "grad_norm": 0.6844140291213989, |
| "learning_rate": 4.999658262481173e-06, |
| "loss": 0.0372, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.19402985074626866, |
| "grad_norm": 0.6683032512664795, |
| "learning_rate": 4.998633143352315e-06, |
| "loss": 0.0309, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.208955223880597, |
| "grad_norm": 0.5927382111549377, |
| "learning_rate": 4.9969249228707625e-06, |
| "loss": 0.0265, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "grad_norm": 0.6329669952392578, |
| "learning_rate": 4.994534068046936e-06, |
| "loss": 0.0256, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "eval_loss": 0.03213270381093025, |
| "eval_runtime": 8.2934, |
| "eval_samples_per_second": 14.349, |
| "eval_steps_per_second": 0.482, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.23880597014925373, |
| "grad_norm": 0.689909040927887, |
| "learning_rate": 4.991461232516675e-06, |
| "loss": 0.0393, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2537313432835821, |
| "grad_norm": 0.623458743095398, |
| "learning_rate": 4.987707256362529e-06, |
| "loss": 0.0329, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.26865671641791045, |
| "grad_norm": 0.622241735458374, |
| "learning_rate": 4.983273165884096e-06, |
| "loss": 0.0311, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.2835820895522388, |
| "grad_norm": 0.5380865335464478, |
| "learning_rate": 4.978160173317439e-06, |
| "loss": 0.0272, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "grad_norm": 0.6085941791534424, |
| "learning_rate": 4.972369676503672e-06, |
| "loss": 0.0328, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "eval_loss": 0.030199836939573288, |
| "eval_runtime": 8.2944, |
| "eval_samples_per_second": 14.347, |
| "eval_steps_per_second": 0.482, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.31343283582089554, |
| "grad_norm": 0.5220608711242676, |
| "learning_rate": 4.965903258506806e-06, |
| "loss": 0.0286, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.3283582089552239, |
| "grad_norm": 0.5969937443733215, |
| "learning_rate": 4.9587626871809564e-06, |
| "loss": 0.0294, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.34328358208955223, |
| "grad_norm": 0.4778025448322296, |
| "learning_rate": 4.950949914687024e-06, |
| "loss": 0.0238, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.3582089552238806, |
| "grad_norm": 0.41100114583969116, |
| "learning_rate": 4.942467076958999e-06, |
| "loss": 0.022, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "grad_norm": 0.4831908941268921, |
| "learning_rate": 4.933316493120015e-06, |
| "loss": 0.0287, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "eval_loss": 0.02796478196978569, |
| "eval_runtime": 8.2814, |
| "eval_samples_per_second": 14.37, |
| "eval_steps_per_second": 0.483, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.3880597014925373, |
| "grad_norm": 0.5058245062828064, |
| "learning_rate": 4.923500664848327e-06, |
| "loss": 0.0262, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.40298507462686567, |
| "grad_norm": 0.5201866626739502, |
| "learning_rate": 4.913022275693372e-06, |
| "loss": 0.029, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.417910447761194, |
| "grad_norm": 0.46352681517601013, |
| "learning_rate": 4.901884190342121e-06, |
| "loss": 0.0261, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.43283582089552236, |
| "grad_norm": 0.5337287187576294, |
| "learning_rate": 4.890089453835894e-06, |
| "loss": 0.0258, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "grad_norm": 0.43212640285491943, |
| "learning_rate": 4.8776412907378845e-06, |
| "loss": 0.0221, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "eval_loss": 0.027510978281497955, |
| "eval_runtime": 8.3069, |
| "eval_samples_per_second": 14.325, |
| "eval_steps_per_second": 0.482, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.4626865671641791, |
| "grad_norm": 0.5198773145675659, |
| "learning_rate": 4.864543104251587e-06, |
| "loss": 0.0261, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.47761194029850745, |
| "grad_norm": 0.46835553646087646, |
| "learning_rate": 4.850798475290403e-06, |
| "loss": 0.0238, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.4925373134328358, |
| "grad_norm": 0.521562397480011, |
| "learning_rate": 4.836411161498653e-06, |
| "loss": 0.0311, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.5074626865671642, |
| "grad_norm": 0.3529301881790161, |
| "learning_rate": 4.821385096224268e-06, |
| "loss": 0.0216, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "grad_norm": 0.559899091720581, |
| "learning_rate": 4.8057243874434625e-06, |
| "loss": 0.0347, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "eval_loss": 0.026492305099964142, |
| "eval_runtime": 8.3008, |
| "eval_samples_per_second": 14.336, |
| "eval_steps_per_second": 0.482, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.5373134328358209, |
| "grad_norm": 0.42942824959754944, |
| "learning_rate": 4.789433316637644e-06, |
| "loss": 0.0253, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.5522388059701493, |
| "grad_norm": 0.432647168636322, |
| "learning_rate": 4.772516337622907e-06, |
| "loss": 0.0209, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.5671641791044776, |
| "grad_norm": 0.47124215960502625, |
| "learning_rate": 4.754978075332398e-06, |
| "loss": 0.0213, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.582089552238806, |
| "grad_norm": 0.41911983489990234, |
| "learning_rate": 4.736823324551909e-06, |
| "loss": 0.0209, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "grad_norm": 0.3859071731567383, |
| "learning_rate": 4.71805704860903e-06, |
| "loss": 0.0212, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "eval_loss": 0.02548597753047943, |
| "eval_runtime": 8.3185, |
| "eval_samples_per_second": 14.306, |
| "eval_steps_per_second": 0.481, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.6119402985074627, |
| "grad_norm": 0.5006839036941528, |
| "learning_rate": 4.698684378016223e-06, |
| "loss": 0.0237, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.6268656716417911, |
| "grad_norm": 0.35737383365631104, |
| "learning_rate": 4.678710609068193e-06, |
| "loss": 0.0218, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.6417910447761194, |
| "grad_norm": 0.41281658411026, |
| "learning_rate": 4.658141202393935e-06, |
| "loss": 0.0193, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.6567164179104478, |
| "grad_norm": 0.4524123966693878, |
| "learning_rate": 4.636981781463848e-06, |
| "loss": 0.0314, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "grad_norm": 0.4154703617095947, |
| "learning_rate": 4.615238131052339e-06, |
| "loss": 0.0245, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "eval_loss": 0.025405339896678925, |
| "eval_runtime": 8.3109, |
| "eval_samples_per_second": 14.318, |
| "eval_steps_per_second": 0.481, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.6865671641791045, |
| "grad_norm": 0.45151591300964355, |
| "learning_rate": 4.592916195656322e-06, |
| "loss": 0.0273, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.7014925373134329, |
| "grad_norm": 0.3298991918563843, |
| "learning_rate": 4.570022077870051e-06, |
| "loss": 0.0204, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.7164179104477612, |
| "grad_norm": 0.4888248145580292, |
| "learning_rate": 4.546562036716732e-06, |
| "loss": 0.0269, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.7313432835820896, |
| "grad_norm": 0.3940542936325073, |
| "learning_rate": 4.522542485937369e-06, |
| "loss": 0.0235, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "grad_norm": 0.3737621307373047, |
| "learning_rate": 4.497969992237312e-06, |
| "loss": 0.0209, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "eval_loss": 0.024418316781520844, |
| "eval_runtime": 8.3141, |
| "eval_samples_per_second": 14.313, |
| "eval_steps_per_second": 0.481, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.7611940298507462, |
| "grad_norm": 0.32089656591415405, |
| "learning_rate": 4.472851273490985e-06, |
| "loss": 0.0182, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.7761194029850746, |
| "grad_norm": 0.4259447455406189, |
| "learning_rate": 4.4471931969052816e-06, |
| "loss": 0.0229, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.7910447761194029, |
| "grad_norm": 0.33431047201156616, |
| "learning_rate": 4.421002777142148e-06, |
| "loss": 0.0191, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.8059701492537313, |
| "grad_norm": 0.3420027196407318, |
| "learning_rate": 4.394287174400838e-06, |
| "loss": 0.0187, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "grad_norm": 0.4479522407054901, |
| "learning_rate": 4.3670536924603855e-06, |
| "loss": 0.0242, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "eval_loss": 0.023885194212198257, |
| "eval_runtime": 8.3377, |
| "eval_samples_per_second": 14.273, |
| "eval_steps_per_second": 0.48, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.835820895522388, |
| "grad_norm": 0.46404945850372314, |
| "learning_rate": 4.33930977668283e-06, |
| "loss": 0.0226, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.8507462686567164, |
| "grad_norm": 0.49134090542793274, |
| "learning_rate": 4.311063011977723e-06, |
| "loss": 0.0277, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.8656716417910447, |
| "grad_norm": 0.373234361410141, |
| "learning_rate": 4.282321120728493e-06, |
| "loss": 0.0199, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.8805970149253731, |
| "grad_norm": 0.31845158338546753, |
| "learning_rate": 4.253091960681222e-06, |
| "loss": 0.0196, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "grad_norm": 0.4006720781326294, |
| "learning_rate": 4.2233835227964145e-06, |
| "loss": 0.0226, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "eval_loss": 0.023553457111120224, |
| "eval_runtime": 8.2827, |
| "eval_samples_per_second": 14.367, |
| "eval_steps_per_second": 0.483, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.9104477611940298, |
| "grad_norm": 0.395221084356308, |
| "learning_rate": 4.1932039290643534e-06, |
| "loss": 0.0238, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.9253731343283582, |
| "grad_norm": 0.40594613552093506, |
| "learning_rate": 4.162561430284621e-06, |
| "loss": 0.0235, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.9402985074626866, |
| "grad_norm": 0.32194435596466064, |
| "learning_rate": 4.1314644038104215e-06, |
| "loss": 0.0188, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.9552238805970149, |
| "grad_norm": 0.36028629541397095, |
| "learning_rate": 4.099921351258292e-06, |
| "loss": 0.0207, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "grad_norm": 0.3143925368785858, |
| "learning_rate": 4.067940896183843e-06, |
| "loss": 0.0208, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "eval_loss": 0.023211363703012466, |
| "eval_runtime": 8.272, |
| "eval_samples_per_second": 14.386, |
| "eval_steps_per_second": 0.484, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.9850746268656716, |
| "grad_norm": 0.4360805153846741, |
| "learning_rate": 4.0355317817241705e-06, |
| "loss": 0.0195, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.424087256193161, |
| "learning_rate": 4.002702868207563e-06, |
| "loss": 0.0243, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.0149253731343284, |
| "grad_norm": 0.3290930986404419, |
| "learning_rate": 3.969463130731183e-06, |
| "loss": 0.0169, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.0298507462686568, |
| "grad_norm": 0.34683364629745483, |
| "learning_rate": 3.935821656707359e-06, |
| "loss": 0.0188, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.044776119402985, |
| "grad_norm": 0.32021239399909973, |
| "learning_rate": 3.901787643379183e-06, |
| "loss": 0.0135, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.044776119402985, |
| "eval_loss": 0.022930506616830826, |
| "eval_runtime": 8.303, |
| "eval_samples_per_second": 14.332, |
| "eval_steps_per_second": 0.482, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.0597014925373134, |
| "grad_norm": 0.2900172472000122, |
| "learning_rate": 3.8673703953060685e-06, |
| "loss": 0.0169, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.0746268656716418, |
| "grad_norm": 0.35844284296035767, |
| "learning_rate": 3.832579321819985e-06, |
| "loss": 0.0152, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.0895522388059702, |
| "grad_norm": 0.352897584438324, |
| "learning_rate": 3.797423934453038e-06, |
| "loss": 0.0168, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.1044776119402986, |
| "grad_norm": 0.35191500186920166, |
| "learning_rate": 3.76191384433711e-06, |
| "loss": 0.0173, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.1194029850746268, |
| "grad_norm": 0.28631043434143066, |
| "learning_rate": 3.726058759576271e-06, |
| "loss": 0.0141, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.1194029850746268, |
| "eval_loss": 0.023363711312413216, |
| "eval_runtime": 8.2771, |
| "eval_samples_per_second": 14.377, |
| "eval_steps_per_second": 0.483, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.1343283582089552, |
| "grad_norm": 0.36245712637901306, |
| "learning_rate": 3.6898684825926845e-06, |
| "loss": 0.0133, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.1492537313432836, |
| "grad_norm": 0.3419128954410553, |
| "learning_rate": 3.65335290744672e-06, |
| "loss": 0.0139, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.164179104477612, |
| "grad_norm": 0.3986120820045471, |
| "learning_rate": 3.616522017132017e-06, |
| "loss": 0.0168, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.1791044776119404, |
| "grad_norm": 0.3793441951274872, |
| "learning_rate": 3.579385880846232e-06, |
| "loss": 0.0165, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.1940298507462686, |
| "grad_norm": 0.36774227023124695, |
| "learning_rate": 3.5419546512382264e-06, |
| "loss": 0.0165, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.1940298507462686, |
| "eval_loss": 0.023542851209640503, |
| "eval_runtime": 8.316, |
| "eval_samples_per_second": 14.31, |
| "eval_steps_per_second": 0.481, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.208955223880597, |
| "grad_norm": 0.36220625042915344, |
| "learning_rate": 3.5042385616324243e-06, |
| "loss": 0.0189, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.2238805970149254, |
| "grad_norm": 0.3013781011104584, |
| "learning_rate": 3.466247923231131e-06, |
| "loss": 0.0141, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.2388059701492538, |
| "grad_norm": 0.359733521938324, |
| "learning_rate": 3.427993122295552e-06, |
| "loss": 0.0161, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.2537313432835822, |
| "grad_norm": 0.39510107040405273, |
| "learning_rate": 3.3894846173062917e-06, |
| "loss": 0.0153, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.2686567164179103, |
| "grad_norm": 0.38427668809890747, |
| "learning_rate": 3.350732936104108e-06, |
| "loss": 0.0173, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.2686567164179103, |
| "eval_loss": 0.023341603577136993, |
| "eval_runtime": 8.5378, |
| "eval_samples_per_second": 13.938, |
| "eval_steps_per_second": 0.469, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.2835820895522387, |
| "grad_norm": 0.30994802713394165, |
| "learning_rate": 3.3117486730117092e-06, |
| "loss": 0.0134, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.2985074626865671, |
| "grad_norm": 0.3489951193332672, |
| "learning_rate": 3.272542485937369e-06, |
| "loss": 0.0169, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.3134328358208955, |
| "grad_norm": 0.31990131735801697, |
| "learning_rate": 3.2331250934611623e-06, |
| "loss": 0.0169, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.328358208955224, |
| "grad_norm": 0.29082977771759033, |
| "learning_rate": 3.193507271904612e-06, |
| "loss": 0.0121, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.3432835820895521, |
| "grad_norm": 0.3279978334903717, |
| "learning_rate": 3.15369985238455e-06, |
| "loss": 0.0123, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.3432835820895521, |
| "eval_loss": 0.02307475358247757, |
| "eval_runtime": 8.3253, |
| "eval_samples_per_second": 14.294, |
| "eval_steps_per_second": 0.48, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.3582089552238805, |
| "grad_norm": 0.30484575033187866, |
| "learning_rate": 3.1137137178519983e-06, |
| "loss": 0.0153, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.373134328358209, |
| "grad_norm": 0.37242934107780457, |
| "learning_rate": 3.073559800116879e-06, |
| "loss": 0.0189, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.3880597014925373, |
| "grad_norm": 0.33749932050704956, |
| "learning_rate": 3.0332490768593676e-06, |
| "loss": 0.02, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.4029850746268657, |
| "grad_norm": 0.322444349527359, |
| "learning_rate": 2.9927925686287006e-06, |
| "loss": 0.0135, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.417910447761194, |
| "grad_norm": 0.3586093783378601, |
| "learning_rate": 2.9522013358302754e-06, |
| "loss": 0.0145, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.417910447761194, |
| "eval_loss": 0.02322915382683277, |
| "eval_runtime": 8.3124, |
| "eval_samples_per_second": 14.316, |
| "eval_steps_per_second": 0.481, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.4328358208955223, |
| "grad_norm": 0.29217156767845154, |
| "learning_rate": 2.911486475701835e-06, |
| "loss": 0.0132, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.4477611940298507, |
| "grad_norm": 0.36368077993392944, |
| "learning_rate": 2.870659119279605e-06, |
| "loss": 0.0157, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.462686567164179, |
| "grad_norm": 0.44833701848983765, |
| "learning_rate": 2.829730428355173e-06, |
| "loss": 0.0163, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.4776119402985075, |
| "grad_norm": 0.3234724700450897, |
| "learning_rate": 2.788711592423966e-06, |
| "loss": 0.0126, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.4925373134328357, |
| "grad_norm": 0.3331272006034851, |
| "learning_rate": 2.7476138256261575e-06, |
| "loss": 0.0154, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.4925373134328357, |
| "eval_loss": 0.02257104031741619, |
| "eval_runtime": 8.3116, |
| "eval_samples_per_second": 14.317, |
| "eval_steps_per_second": 0.481, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.5074626865671643, |
| "grad_norm": 0.2923891544342041, |
| "learning_rate": 2.7064483636808314e-06, |
| "loss": 0.012, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.5223880597014925, |
| "grad_norm": 0.4166359007358551, |
| "learning_rate": 2.6652264608142487e-06, |
| "loss": 0.0207, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.537313432835821, |
| "grad_norm": 0.3134080469608307, |
| "learning_rate": 2.623959386683056e-06, |
| "loss": 0.0129, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.5522388059701493, |
| "grad_norm": 0.33056458830833435, |
| "learning_rate": 2.5826584232932707e-06, |
| "loss": 0.0141, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.5671641791044775, |
| "grad_norm": 0.36771681904792786, |
| "learning_rate": 2.5413348619158966e-06, |
| "loss": 0.0147, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.5671641791044775, |
| "eval_loss": 0.022433940321207047, |
| "eval_runtime": 8.2998, |
| "eval_samples_per_second": 14.338, |
| "eval_steps_per_second": 0.482, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.582089552238806, |
| "grad_norm": 0.36414942145347595, |
| "learning_rate": 2.5e-06, |
| "loss": 0.017, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.5970149253731343, |
| "grad_norm": 0.37462717294692993, |
| "learning_rate": 2.458665138084104e-06, |
| "loss": 0.0152, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.6119402985074627, |
| "grad_norm": 0.3305724263191223, |
| "learning_rate": 2.4173415767067297e-06, |
| "loss": 0.0147, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.626865671641791, |
| "grad_norm": 0.3597583472728729, |
| "learning_rate": 2.376040613316944e-06, |
| "loss": 0.0152, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.6417910447761193, |
| "grad_norm": 0.32973572611808777, |
| "learning_rate": 2.3347735391857517e-06, |
| "loss": 0.0132, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.6417910447761193, |
| "eval_loss": 0.022819483652710915, |
| "eval_runtime": 8.3196, |
| "eval_samples_per_second": 14.304, |
| "eval_steps_per_second": 0.481, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.6567164179104479, |
| "grad_norm": 0.28718486428260803, |
| "learning_rate": 2.2935516363191695e-06, |
| "loss": 0.0115, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.671641791044776, |
| "grad_norm": 0.3861285150051117, |
| "learning_rate": 2.2523861743738433e-06, |
| "loss": 0.0159, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.6865671641791045, |
| "grad_norm": 0.465282678604126, |
| "learning_rate": 2.211288407576035e-06, |
| "loss": 0.0187, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.7014925373134329, |
| "grad_norm": 0.31902506947517395, |
| "learning_rate": 2.1702695716448276e-06, |
| "loss": 0.0142, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.716417910447761, |
| "grad_norm": 0.3674122989177704, |
| "learning_rate": 2.129340880720395e-06, |
| "loss": 0.0155, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.716417910447761, |
| "eval_loss": 0.022671934217214584, |
| "eval_runtime": 8.2807, |
| "eval_samples_per_second": 14.371, |
| "eval_steps_per_second": 0.483, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.7313432835820897, |
| "grad_norm": 0.4284082353115082, |
| "learning_rate": 2.088513524298165e-06, |
| "loss": 0.0185, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.7462686567164178, |
| "grad_norm": 0.3359587490558624, |
| "learning_rate": 2.0477986641697263e-06, |
| "loss": 0.0161, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.7611940298507462, |
| "grad_norm": 0.33396944403648376, |
| "learning_rate": 2.0072074313713e-06, |
| "loss": 0.0109, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.7761194029850746, |
| "grad_norm": 0.3261159658432007, |
| "learning_rate": 1.9667509231406332e-06, |
| "loss": 0.0142, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.7910447761194028, |
| "grad_norm": 0.35204392671585083, |
| "learning_rate": 1.9264401998831213e-06, |
| "loss": 0.0149, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.7910447761194028, |
| "eval_loss": 0.022122090682387352, |
| "eval_runtime": 8.2768, |
| "eval_samples_per_second": 14.378, |
| "eval_steps_per_second": 0.483, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.8059701492537314, |
| "grad_norm": 0.320769727230072, |
| "learning_rate": 1.8862862821480023e-06, |
| "loss": 0.0145, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.8208955223880596, |
| "grad_norm": 0.34036341309547424, |
| "learning_rate": 1.8463001476154508e-06, |
| "loss": 0.0142, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.835820895522388, |
| "grad_norm": 0.2602291405200958, |
| "learning_rate": 1.8064927280953893e-06, |
| "loss": 0.0117, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.8507462686567164, |
| "grad_norm": 0.3351423442363739, |
| "learning_rate": 1.7668749065388385e-06, |
| "loss": 0.0132, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.8656716417910446, |
| "grad_norm": 0.4113386869430542, |
| "learning_rate": 1.7274575140626318e-06, |
| "loss": 0.0169, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.8656716417910446, |
| "eval_loss": 0.02193240076303482, |
| "eval_runtime": 8.2973, |
| "eval_samples_per_second": 14.342, |
| "eval_steps_per_second": 0.482, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.8805970149253732, |
| "grad_norm": 0.3114112615585327, |
| "learning_rate": 1.6882513269882916e-06, |
| "loss": 0.0134, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.8955223880597014, |
| "grad_norm": 0.39908191561698914, |
| "learning_rate": 1.6492670638958924e-06, |
| "loss": 0.0195, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.9104477611940298, |
| "grad_norm": 0.3014167845249176, |
| "learning_rate": 1.6105153826937087e-06, |
| "loss": 0.0147, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.9253731343283582, |
| "grad_norm": 0.3079487681388855, |
| "learning_rate": 1.5720068777044479e-06, |
| "loss": 0.0134, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.9402985074626866, |
| "grad_norm": 0.28381192684173584, |
| "learning_rate": 1.53375207676887e-06, |
| "loss": 0.0136, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.9402985074626866, |
| "eval_loss": 0.02178417146205902, |
| "eval_runtime": 8.276, |
| "eval_samples_per_second": 14.379, |
| "eval_steps_per_second": 0.483, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.955223880597015, |
| "grad_norm": 0.344938188791275, |
| "learning_rate": 1.495761438367577e-06, |
| "loss": 0.0137, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.9701492537313432, |
| "grad_norm": 0.3104920983314514, |
| "learning_rate": 1.4580453487617747e-06, |
| "loss": 0.0146, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.9850746268656716, |
| "grad_norm": 0.3058537244796753, |
| "learning_rate": 1.4206141191537681e-06, |
| "loss": 0.0141, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.311489999294281, |
| "learning_rate": 1.383477982867984e-06, |
| "loss": 0.0127, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.014925373134328, |
| "grad_norm": 0.35948511958122253, |
| "learning_rate": 1.346647092553281e-06, |
| "loss": 0.0139, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.014925373134328, |
| "eval_loss": 0.02187744900584221, |
| "eval_runtime": 8.2762, |
| "eval_samples_per_second": 14.379, |
| "eval_steps_per_second": 0.483, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.029850746268657, |
| "grad_norm": 0.24142009019851685, |
| "learning_rate": 1.3101315174073162e-06, |
| "loss": 0.011, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.044776119402985, |
| "grad_norm": 0.2628946602344513, |
| "learning_rate": 1.2739412404237306e-06, |
| "loss": 0.0107, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.0597014925373136, |
| "grad_norm": 0.2656816244125366, |
| "learning_rate": 1.2380861556628915e-06, |
| "loss": 0.0087, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.074626865671642, |
| "grad_norm": 0.23779849708080292, |
| "learning_rate": 1.2025760655469629e-06, |
| "loss": 0.0096, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.08955223880597, |
| "grad_norm": 0.2537413537502289, |
| "learning_rate": 1.1674206781800162e-06, |
| "loss": 0.0101, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.08955223880597, |
| "eval_loss": 0.02204073593020439, |
| "eval_runtime": 8.28, |
| "eval_samples_per_second": 14.372, |
| "eval_steps_per_second": 0.483, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.1044776119402986, |
| "grad_norm": 0.22632823884487152, |
| "learning_rate": 1.1326296046939334e-06, |
| "loss": 0.0097, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.1194029850746268, |
| "grad_norm": 0.2632769048213959, |
| "learning_rate": 1.0982123566208187e-06, |
| "loss": 0.012, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.1343283582089554, |
| "grad_norm": 0.2738955616950989, |
| "learning_rate": 1.0641783432926412e-06, |
| "loss": 0.0109, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.1492537313432836, |
| "grad_norm": 0.30985602736473083, |
| "learning_rate": 1.0305368692688175e-06, |
| "loss": 0.011, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.1641791044776117, |
| "grad_norm": 0.2866548001766205, |
| "learning_rate": 9.972971317924373e-07, |
| "loss": 0.0087, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.1641791044776117, |
| "eval_loss": 0.02224661409854889, |
| "eval_runtime": 8.27, |
| "eval_samples_per_second": 14.389, |
| "eval_steps_per_second": 0.484, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.1791044776119404, |
| "grad_norm": 0.29876846075057983, |
| "learning_rate": 9.644682182758305e-07, |
| "loss": 0.0114, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.1940298507462686, |
| "grad_norm": 0.2641301453113556, |
| "learning_rate": 9.320591038161575e-07, |
| "loss": 0.0099, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.208955223880597, |
| "grad_norm": 0.3317098915576935, |
| "learning_rate": 9.000786487417084e-07, |
| "loss": 0.0115, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.2238805970149254, |
| "grad_norm": 0.22708454728126526, |
| "learning_rate": 8.685355961895783e-07, |
| "loss": 0.0088, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.2388059701492535, |
| "grad_norm": 0.24335134029388428, |
| "learning_rate": 8.374385697153792e-07, |
| "loss": 0.0089, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.2388059701492535, |
| "eval_loss": 0.022272665053606033, |
| "eval_runtime": 8.2936, |
| "eval_samples_per_second": 14.348, |
| "eval_steps_per_second": 0.482, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.253731343283582, |
| "grad_norm": 0.26413631439208984, |
| "learning_rate": 8.067960709356479e-07, |
| "loss": 0.0101, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.2686567164179103, |
| "grad_norm": 0.24371112883090973, |
| "learning_rate": 7.766164772035856e-07, |
| "loss": 0.0079, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.283582089552239, |
| "grad_norm": 0.25570055842399597, |
| "learning_rate": 7.469080393187786e-07, |
| "loss": 0.0089, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.298507462686567, |
| "grad_norm": 0.26083242893218994, |
| "learning_rate": 7.176788792715076e-07, |
| "loss": 0.008, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.3134328358208958, |
| "grad_norm": 0.36927682161331177, |
| "learning_rate": 6.889369880222776e-07, |
| "loss": 0.0112, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.3134328358208958, |
| "eval_loss": 0.0225172471255064, |
| "eval_runtime": 8.2668, |
| "eval_samples_per_second": 14.395, |
| "eval_steps_per_second": 0.484, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.328358208955224, |
| "grad_norm": 0.2831665873527527, |
| "learning_rate": 6.60690223317171e-07, |
| "loss": 0.0093, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.343283582089552, |
| "grad_norm": 0.2767029106616974, |
| "learning_rate": 6.329463075396161e-07, |
| "loss": 0.0093, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.3582089552238807, |
| "grad_norm": 0.22897273302078247, |
| "learning_rate": 6.057128255991637e-07, |
| "loss": 0.007, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.373134328358209, |
| "grad_norm": 0.2247919887304306, |
| "learning_rate": 5.78997222857853e-07, |
| "loss": 0.0081, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.388059701492537, |
| "grad_norm": 0.25745531916618347, |
| "learning_rate": 5.528068030947193e-07, |
| "loss": 0.0083, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.388059701492537, |
| "eval_loss": 0.022666901350021362, |
| "eval_runtime": 8.287, |
| "eval_samples_per_second": 14.36, |
| "eval_steps_per_second": 0.483, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.4029850746268657, |
| "grad_norm": 0.2648358643054962, |
| "learning_rate": 5.271487265090163e-07, |
| "loss": 0.009, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.417910447761194, |
| "grad_norm": 0.28286054730415344, |
| "learning_rate": 5.020300077626883e-07, |
| "loss": 0.0101, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.4328358208955225, |
| "grad_norm": 0.28459128737449646, |
| "learning_rate": 4.774575140626317e-07, |
| "loss": 0.0087, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.4477611940298507, |
| "grad_norm": 0.21343541145324707, |
| "learning_rate": 4.534379632832692e-07, |
| "loss": 0.0079, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.4626865671641793, |
| "grad_norm": 0.2659505009651184, |
| "learning_rate": 4.299779221299499e-07, |
| "loss": 0.008, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.4626865671641793, |
| "eval_loss": 0.022702785208821297, |
| "eval_runtime": 8.2796, |
| "eval_samples_per_second": 14.373, |
| "eval_steps_per_second": 0.483, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.4776119402985075, |
| "grad_norm": 0.29091840982437134, |
| "learning_rate": 4.070838043436787e-07, |
| "loss": 0.0093, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.4925373134328357, |
| "grad_norm": 0.24148190021514893, |
| "learning_rate": 3.847618689476612e-07, |
| "loss": 0.0083, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.5074626865671643, |
| "grad_norm": 0.33075305819511414, |
| "learning_rate": 3.630182185361522e-07, |
| "loss": 0.0081, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.5223880597014925, |
| "grad_norm": 0.24389711022377014, |
| "learning_rate": 3.4185879760606525e-07, |
| "loss": 0.0079, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.5373134328358207, |
| "grad_norm": 0.32192865014076233, |
| "learning_rate": 3.2128939093180654e-07, |
| "loss": 0.0109, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.5373134328358207, |
| "eval_loss": 0.022760972380638123, |
| "eval_runtime": 8.2746, |
| "eval_samples_per_second": 14.381, |
| "eval_steps_per_second": 0.483, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.5522388059701493, |
| "grad_norm": 0.3323243260383606, |
| "learning_rate": 3.0131562198377763e-07, |
| "loss": 0.0074, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.5671641791044775, |
| "grad_norm": 0.209702268242836, |
| "learning_rate": 2.819429513909705e-07, |
| "loss": 0.006, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.582089552238806, |
| "grad_norm": 0.29595962166786194, |
| "learning_rate": 2.6317667544809135e-07, |
| "loss": 0.0102, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.5970149253731343, |
| "grad_norm": 0.23989610373973846, |
| "learning_rate": 2.450219246676028e-07, |
| "loss": 0.0073, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.611940298507463, |
| "grad_norm": 0.3013235330581665, |
| "learning_rate": 2.2748366237709374e-07, |
| "loss": 0.0103, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.611940298507463, |
| "eval_loss": 0.02282480150461197, |
| "eval_runtime": 8.2894, |
| "eval_samples_per_second": 14.356, |
| "eval_steps_per_second": 0.483, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.626865671641791, |
| "grad_norm": 0.23502513766288757, |
| "learning_rate": 2.1056668336235624e-07, |
| "loss": 0.0089, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.6417910447761193, |
| "grad_norm": 0.26030927896499634, |
| "learning_rate": 1.9427561255653816e-07, |
| "loss": 0.0082, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.656716417910448, |
| "grad_norm": 0.27521783113479614, |
| "learning_rate": 1.786149037757326e-07, |
| "loss": 0.0089, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.671641791044776, |
| "grad_norm": 0.3519136905670166, |
| "learning_rate": 1.6358883850134815e-07, |
| "loss": 0.0121, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.6865671641791042, |
| "grad_norm": 0.25255823135375977, |
| "learning_rate": 1.492015247095971e-07, |
| "loss": 0.008, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.6865671641791042, |
| "eval_loss": 0.02286040224134922, |
| "eval_runtime": 8.2767, |
| "eval_samples_per_second": 14.378, |
| "eval_steps_per_second": 0.483, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.701492537313433, |
| "grad_norm": 0.24869883060455322, |
| "learning_rate": 1.3545689574841341e-07, |
| "loss": 0.0072, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.716417910447761, |
| "grad_norm": 0.3076685070991516, |
| "learning_rate": 1.223587092621162e-07, |
| "loss": 0.009, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.7313432835820897, |
| "grad_norm": 0.2468510866165161, |
| "learning_rate": 1.099105461641059e-07, |
| "loss": 0.0091, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.746268656716418, |
| "grad_norm": 0.4652602970600128, |
| "learning_rate": 9.811580965787965e-08, |
| "loss": 0.0078, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.7611940298507465, |
| "grad_norm": 0.30688929557800293, |
| "learning_rate": 8.697772430662859e-08, |
| "loss": 0.0103, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.7611940298507465, |
| "eval_loss": 0.022866524755954742, |
| "eval_runtime": 8.2907, |
| "eval_samples_per_second": 14.353, |
| "eval_steps_per_second": 0.482, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.7761194029850746, |
| "grad_norm": 0.3009447753429413, |
| "learning_rate": 7.649933515167407e-08, |
| "loss": 0.0077, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.791044776119403, |
| "grad_norm": 0.3035335838794708, |
| "learning_rate": 6.668350687998565e-08, |
| "loss": 0.0104, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.8059701492537314, |
| "grad_norm": 0.2949415445327759, |
| "learning_rate": 5.753292304100183e-08, |
| "loss": 0.0084, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.8208955223880596, |
| "grad_norm": 0.26432371139526367, |
| "learning_rate": 4.905008531297661e-08, |
| "loss": 0.0085, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.835820895522388, |
| "grad_norm": 0.25077345967292786, |
| "learning_rate": 4.123731281904408e-08, |
| "loss": 0.008, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.835820895522388, |
| "eval_loss": 0.02286113053560257, |
| "eval_runtime": 8.2705, |
| "eval_samples_per_second": 14.389, |
| "eval_steps_per_second": 0.484, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.8507462686567164, |
| "grad_norm": 0.2702929675579071, |
| "learning_rate": 3.4096741493194196e-08, |
| "loss": 0.0096, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.8656716417910446, |
| "grad_norm": 0.3009881377220154, |
| "learning_rate": 2.763032349632877e-08, |
| "loss": 0.0098, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.8805970149253732, |
| "grad_norm": 0.24812233448028564, |
| "learning_rate": 2.1839826682562015e-08, |
| "loss": 0.0081, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.8955223880597014, |
| "grad_norm": 0.2779428958892822, |
| "learning_rate": 1.6726834115904645e-08, |
| "loss": 0.008, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.91044776119403, |
| "grad_norm": 0.3340453803539276, |
| "learning_rate": 1.2292743637471461e-08, |
| "loss": 0.0109, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.91044776119403, |
| "eval_loss": 0.022852875292301178, |
| "eval_runtime": 8.2644, |
| "eval_samples_per_second": 14.399, |
| "eval_steps_per_second": 0.484, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.925373134328358, |
| "grad_norm": 0.2553161382675171, |
| "learning_rate": 8.538767483325384e-09, |
| "loss": 0.0068, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.9402985074626864, |
| "grad_norm": 0.2572329044342041, |
| "learning_rate": 5.465931953063663e-09, |
| "loss": 0.0097, |
| "step": 197 |
| }, |
| { |
| "epoch": 2.955223880597015, |
| "grad_norm": 0.25870975852012634, |
| "learning_rate": 3.0750771292381575e-09, |
| "loss": 0.0093, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.970149253731343, |
| "grad_norm": 0.21340855956077576, |
| "learning_rate": 1.3668566476848777e-09, |
| "loss": 0.0075, |
| "step": 199 |
| }, |
| { |
| "epoch": 2.9850746268656714, |
| "grad_norm": 0.3591207265853882, |
| "learning_rate": 3.4173751882748964e-10, |
| "loss": 0.009, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.9850746268656714, |
| "eval_loss": 0.022861387580633163, |
| "eval_runtime": 8.2928, |
| "eval_samples_per_second": 14.35, |
| "eval_steps_per_second": 0.482, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.2385999709367752, |
| "learning_rate": 0.0, |
| "loss": 0.0082, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 201, |
| "total_flos": 21438131871744.0, |
| "train_loss": 0.020334236270548842, |
| "train_runtime": 4074.9852, |
| "train_samples_per_second": 0.786, |
| "train_steps_per_second": 0.049 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 201, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 81, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 21438131871744.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|