diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4528 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 555, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009029345372460496, + "grad_norm": 1.3914246657628389, + "learning_rate": 0.0, + "loss": 0.1984, + "num_tokens": 417963.0, + "step": 1 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 1.339086973600259, + "learning_rate": 5.882352941176471e-07, + "loss": 0.1899, + "num_tokens": 856730.0, + "step": 2 + }, + { + "epoch": 0.02708803611738149, + "grad_norm": 1.3503271124381317, + "learning_rate": 1.1764705882352942e-06, + "loss": 0.1916, + "num_tokens": 1283679.0, + "step": 3 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 1.2668656784826644, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.179, + "num_tokens": 1722056.0, + "step": 4 + }, + { + "epoch": 0.045146726862302484, + "grad_norm": 1.3465170325986022, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.186, + "num_tokens": 2150137.0, + "step": 5 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 1.4599535688641787, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.1924, + "num_tokens": 2564649.0, + "step": 6 + }, + { + "epoch": 0.06320541760722348, + "grad_norm": 1.13869223395712, + "learning_rate": 3.529411764705883e-06, + "loss": 0.1414, + "num_tokens": 2985977.0, + "step": 7 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 1.0293594203382637, + "learning_rate": 4.11764705882353e-06, + "loss": 0.1255, + "num_tokens": 3413212.0, + "step": 8 + }, + { + "epoch": 0.08126410835214447, + "grad_norm": 0.8541647677228765, + "learning_rate": 4.705882352941177e-06, + "loss": 0.0858, + "num_tokens": 3843515.0, + "step": 9 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 0.5327508702110814, + "learning_rate": 5.294117647058824e-06, + "loss": 0.079, + "num_tokens": 4289732.0, + "step": 10 + }, + { + "epoch": 0.09932279909706546, + "grad_norm": 0.3858281500923405, + "learning_rate": 5.882352941176471e-06, + "loss": 0.0716, + "num_tokens": 4715096.0, + "step": 11 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 0.4004953099648253, + "learning_rate": 6.470588235294119e-06, + "loss": 0.0627, + "num_tokens": 5128419.0, + "step": 12 + }, + { + "epoch": 0.11738148984198646, + "grad_norm": 0.575735632330429, + "learning_rate": 7.058823529411766e-06, + "loss": 0.0728, + "num_tokens": 5540634.0, + "step": 13 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 0.44840163608890915, + "learning_rate": 7.647058823529411e-06, + "loss": 0.0741, + "num_tokens": 5968005.0, + "step": 14 + }, + { + "epoch": 0.13544018058690746, + "grad_norm": 0.36039035768163974, + "learning_rate": 8.23529411764706e-06, + "loss": 0.0728, + "num_tokens": 6397897.0, + "step": 15 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 0.37417428321398716, + "learning_rate": 8.823529411764707e-06, + "loss": 0.0726, + "num_tokens": 6815448.0, + "step": 16 + }, + { + "epoch": 0.15349887133182843, + "grad_norm": 0.34977303624046474, + "learning_rate": 9.411764705882354e-06, + "loss": 0.0693, + "num_tokens": 7230528.0, + "step": 17 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 0.37502450603843024, + "learning_rate": 1e-05, + "loss": 0.0744, + "num_tokens": 7649044.0, + "step": 18 + }, + { + "epoch": 0.17155756207674944, + "grad_norm": 0.3344300437685339, + "learning_rate": 9.999923278607256e-06, + "loss": 0.0674, + "num_tokens": 8067225.0, + "step": 19 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 0.32033300587618835, + "learning_rate": 9.999693117045099e-06, + "loss": 0.0682, + "num_tokens": 8500120.0, + "step": 20 + }, + { + "epoch": 0.18961625282167044, + "grad_norm": 0.3147578686726087, + "learning_rate": 9.99930952316167e-06, + "loss": 0.0675, + "num_tokens": 8925262.0, + "step": 21 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 0.31927214718417934, + "learning_rate": 9.998772510036905e-06, + "loss": 0.0738, + "num_tokens": 9354973.0, + "step": 22 + }, + { + "epoch": 0.2076749435665914, + "grad_norm": 0.30036097867968276, + "learning_rate": 9.998082095982091e-06, + "loss": 0.0657, + "num_tokens": 9795967.0, + "step": 23 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 0.29108545548309744, + "learning_rate": 9.997238304539241e-06, + "loss": 0.0712, + "num_tokens": 10217971.0, + "step": 24 + }, + { + "epoch": 0.22573363431151242, + "grad_norm": 0.2962492684664264, + "learning_rate": 9.99624116448029e-06, + "loss": 0.067, + "num_tokens": 10675155.0, + "step": 25 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 0.28415485362226156, + "learning_rate": 9.995090709806113e-06, + "loss": 0.0678, + "num_tokens": 11119872.0, + "step": 26 + }, + { + "epoch": 0.24379232505643342, + "grad_norm": 0.28714836481663, + "learning_rate": 9.993786979745374e-06, + "loss": 0.0686, + "num_tokens": 11538902.0, + "step": 27 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 0.2973434769464962, + "learning_rate": 9.992330018753175e-06, + "loss": 0.07, + "num_tokens": 11961057.0, + "step": 28 + }, + { + "epoch": 0.2618510158013544, + "grad_norm": 0.27796324829673646, + "learning_rate": 9.990719876509551e-06, + "loss": 0.0679, + "num_tokens": 12406878.0, + "step": 29 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 0.2973715544939514, + "learning_rate": 9.98895660791777e-06, + "loss": 0.0723, + "num_tokens": 12819692.0, + "step": 30 + }, + { + "epoch": 0.2799097065462754, + "grad_norm": 0.2536514988485092, + "learning_rate": 9.987040273102466e-06, + "loss": 0.0631, + "num_tokens": 13240915.0, + "step": 31 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 0.25966438801261443, + "learning_rate": 9.984970937407583e-06, + "loss": 0.069, + "num_tokens": 13660214.0, + "step": 32 + }, + { + "epoch": 0.2979683972911964, + "grad_norm": 0.24985113708738513, + "learning_rate": 9.98274867139415e-06, + "loss": 0.0623, + "num_tokens": 14114321.0, + "step": 33 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 0.24501577709376796, + "learning_rate": 9.980373550837877e-06, + "loss": 0.0659, + "num_tokens": 14548964.0, + "step": 34 + }, + { + "epoch": 0.3160270880361174, + "grad_norm": 0.2822656623922199, + "learning_rate": 9.977845656726565e-06, + "loss": 0.0719, + "num_tokens": 14981191.0, + "step": 35 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 0.25411206934047426, + "learning_rate": 9.975165075257351e-06, + "loss": 0.0656, + "num_tokens": 15397252.0, + "step": 36 + }, + { + "epoch": 0.3340857787810384, + "grad_norm": 0.25663363207527506, + "learning_rate": 9.972331897833766e-06, + "loss": 0.0689, + "num_tokens": 15825845.0, + "step": 37 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 0.252884566038925, + "learning_rate": 9.96934622106262e-06, + "loss": 0.0665, + "num_tokens": 16263335.0, + "step": 38 + }, + { + "epoch": 0.35214446952595935, + "grad_norm": 0.2493475121033915, + "learning_rate": 9.966208146750697e-06, + "loss": 0.0654, + "num_tokens": 16685043.0, + "step": 39 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 0.25233444360367147, + "learning_rate": 9.962917781901308e-06, + "loss": 0.0693, + "num_tokens": 17112536.0, + "step": 40 + }, + { + "epoch": 0.37020316027088035, + "grad_norm": 0.2484084225551676, + "learning_rate": 9.959475238710617e-06, + "loss": 0.0645, + "num_tokens": 17536800.0, + "step": 41 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 0.25439675549328833, + "learning_rate": 9.955880634563825e-06, + "loss": 0.0627, + "num_tokens": 17972150.0, + "step": 42 + }, + { + "epoch": 0.38826185101580135, + "grad_norm": 0.2599652954625858, + "learning_rate": 9.952134092031174e-06, + "loss": 0.0706, + "num_tokens": 18396216.0, + "step": 43 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 0.2804405270409319, + "learning_rate": 9.948235738863755e-06, + "loss": 0.072, + "num_tokens": 18821149.0, + "step": 44 + }, + { + "epoch": 0.40632054176072235, + "grad_norm": 0.27092643382226195, + "learning_rate": 9.944185707989163e-06, + "loss": 0.0658, + "num_tokens": 19245794.0, + "step": 45 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 0.2348633197698994, + "learning_rate": 9.939984137506954e-06, + "loss": 0.0643, + "num_tokens": 19678417.0, + "step": 46 + }, + { + "epoch": 0.42437923250564336, + "grad_norm": 0.26649416385103647, + "learning_rate": 9.93563117068395e-06, + "loss": 0.068, + "num_tokens": 20092889.0, + "step": 47 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 0.2627484204559895, + "learning_rate": 9.93112695594934e-06, + "loss": 0.0682, + "num_tokens": 20547069.0, + "step": 48 + }, + { + "epoch": 0.44243792325056436, + "grad_norm": 0.23093350356017434, + "learning_rate": 9.926471646889624e-06, + "loss": 0.0662, + "num_tokens": 20966538.0, + "step": 49 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 0.24873538134134296, + "learning_rate": 9.921665402243376e-06, + "loss": 0.0659, + "num_tokens": 21400755.0, + "step": 50 + }, + { + "epoch": 0.4604966139954853, + "grad_norm": 0.25670304075865563, + "learning_rate": 9.916708385895837e-06, + "loss": 0.069, + "num_tokens": 21840775.0, + "step": 51 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 0.23852734804403594, + "learning_rate": 9.911600766873312e-06, + "loss": 0.0646, + "num_tokens": 22278786.0, + "step": 52 + }, + { + "epoch": 0.4785553047404063, + "grad_norm": 0.26468569171434464, + "learning_rate": 9.906342719337427e-06, + "loss": 0.0686, + "num_tokens": 22693466.0, + "step": 53 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 0.26075883147034323, + "learning_rate": 9.900934422579167e-06, + "loss": 0.0713, + "num_tokens": 23112540.0, + "step": 54 + }, + { + "epoch": 0.4966139954853273, + "grad_norm": 0.21680495749095652, + "learning_rate": 9.895376061012786e-06, + "loss": 0.062, + "num_tokens": 23564619.0, + "step": 55 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 0.25134919798396327, + "learning_rate": 9.889667824169498e-06, + "loss": 0.0729, + "num_tokens": 23967845.0, + "step": 56 + }, + { + "epoch": 0.5146726862302483, + "grad_norm": 0.24246445889404877, + "learning_rate": 9.883809906691031e-06, + "loss": 0.0672, + "num_tokens": 24391020.0, + "step": 57 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 0.24375821508004908, + "learning_rate": 9.877802508322977e-06, + "loss": 0.068, + "num_tokens": 24825527.0, + "step": 58 + }, + { + "epoch": 0.5327313769751693, + "grad_norm": 0.2337180979027223, + "learning_rate": 9.871645833907992e-06, + "loss": 0.0722, + "num_tokens": 25263167.0, + "step": 59 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 0.21116615072438671, + "learning_rate": 9.865340093378799e-06, + "loss": 0.0605, + "num_tokens": 25693750.0, + "step": 60 + }, + { + "epoch": 0.5507900677200903, + "grad_norm": 0.211026011866818, + "learning_rate": 9.858885501751044e-06, + "loss": 0.0594, + "num_tokens": 26127066.0, + "step": 61 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 0.234373955359756, + "learning_rate": 9.852282279115951e-06, + "loss": 0.0637, + "num_tokens": 26553930.0, + "step": 62 + }, + { + "epoch": 0.5688487584650113, + "grad_norm": 0.2384193322542726, + "learning_rate": 9.84553065063283e-06, + "loss": 0.0666, + "num_tokens": 26997155.0, + "step": 63 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 0.22021676361133122, + "learning_rate": 9.838630846521381e-06, + "loss": 0.0624, + "num_tokens": 27416930.0, + "step": 64 + }, + { + "epoch": 0.5869074492099323, + "grad_norm": 0.23846790572516682, + "learning_rate": 9.831583102053868e-06, + "loss": 0.0675, + "num_tokens": 27849198.0, + "step": 65 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 0.24188858035608773, + "learning_rate": 9.824387657547074e-06, + "loss": 0.0659, + "num_tokens": 28281608.0, + "step": 66 + }, + { + "epoch": 0.6049661399548533, + "grad_norm": 0.248691810960184, + "learning_rate": 9.817044758354123e-06, + "loss": 0.0652, + "num_tokens": 28703014.0, + "step": 67 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 0.22632767975144005, + "learning_rate": 9.809554654856106e-06, + "loss": 0.0601, + "num_tokens": 29154694.0, + "step": 68 + }, + { + "epoch": 0.6230248306997742, + "grad_norm": 0.21538186785258703, + "learning_rate": 9.80191760245354e-06, + "loss": 0.0606, + "num_tokens": 29581312.0, + "step": 69 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 0.24239636168958872, + "learning_rate": 9.794133861557674e-06, + "loss": 0.0668, + "num_tokens": 30010189.0, + "step": 70 + }, + { + "epoch": 0.6410835214446953, + "grad_norm": 0.2509091810657565, + "learning_rate": 9.78620369758159e-06, + "loss": 0.0676, + "num_tokens": 30423268.0, + "step": 71 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 0.260100268673901, + "learning_rate": 9.778127380931165e-06, + "loss": 0.069, + "num_tokens": 30854686.0, + "step": 72 + }, + { + "epoch": 0.6591422121896162, + "grad_norm": 0.24677960863356, + "learning_rate": 9.76990518699585e-06, + "loss": 0.064, + "num_tokens": 31260959.0, + "step": 73 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 0.25012179245813304, + "learning_rate": 9.761537396139277e-06, + "loss": 0.0683, + "num_tokens": 31676000.0, + "step": 74 + }, + { + "epoch": 0.6772009029345373, + "grad_norm": 0.24632349085635613, + "learning_rate": 9.753024293689696e-06, + "loss": 0.0665, + "num_tokens": 32086486.0, + "step": 75 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 0.23507648655681773, + "learning_rate": 9.744366169930254e-06, + "loss": 0.0648, + "num_tokens": 32498590.0, + "step": 76 + }, + { + "epoch": 0.6952595936794582, + "grad_norm": 0.24286901371912054, + "learning_rate": 9.735563320089088e-06, + "loss": 0.0678, + "num_tokens": 32917173.0, + "step": 77 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 0.2356753847064963, + "learning_rate": 9.72661604432927e-06, + "loss": 0.0647, + "num_tokens": 33349789.0, + "step": 78 + }, + { + "epoch": 0.7133182844243793, + "grad_norm": 0.23719586126347172, + "learning_rate": 9.717524647738553e-06, + "loss": 0.0667, + "num_tokens": 33785863.0, + "step": 79 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 0.22633262027361056, + "learning_rate": 9.70828944031899e-06, + "loss": 0.0683, + "num_tokens": 34216063.0, + "step": 80 + }, + { + "epoch": 0.7313769751693002, + "grad_norm": 0.2529669405234778, + "learning_rate": 9.698910736976344e-06, + "loss": 0.0701, + "num_tokens": 34655546.0, + "step": 81 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 0.23041720583093084, + "learning_rate": 9.689388857509365e-06, + "loss": 0.0601, + "num_tokens": 35085479.0, + "step": 82 + }, + { + "epoch": 0.7494356659142212, + "grad_norm": 0.22002320085345356, + "learning_rate": 9.679724126598878e-06, + "loss": 0.0654, + "num_tokens": 35510555.0, + "step": 83 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 0.23670432469046537, + "learning_rate": 9.669916873796709e-06, + "loss": 0.0648, + "num_tokens": 35921862.0, + "step": 84 + }, + { + "epoch": 0.7674943566591422, + "grad_norm": 0.2510888617498103, + "learning_rate": 9.659967433514458e-06, + "loss": 0.0646, + "num_tokens": 36328530.0, + "step": 85 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 0.2208580474860745, + "learning_rate": 9.649876145012085e-06, + "loss": 0.0633, + "num_tokens": 36772328.0, + "step": 86 + }, + { + "epoch": 0.7855530474040632, + "grad_norm": 0.2436329150985237, + "learning_rate": 9.639643352386353e-06, + "loss": 0.0616, + "num_tokens": 37186946.0, + "step": 87 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 0.2391380002554407, + "learning_rate": 9.629269404559081e-06, + "loss": 0.069, + "num_tokens": 37591509.0, + "step": 88 + }, + { + "epoch": 0.8036117381489842, + "grad_norm": 0.2223290940119957, + "learning_rate": 9.618754655265262e-06, + "loss": 0.0623, + "num_tokens": 38045842.0, + "step": 89 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 0.24573707394801297, + "learning_rate": 9.608099463040989e-06, + "loss": 0.0651, + "num_tokens": 38465378.0, + "step": 90 + }, + { + "epoch": 0.8216704288939052, + "grad_norm": 0.21431626210226137, + "learning_rate": 9.597304191211228e-06, + "loss": 0.0621, + "num_tokens": 38900287.0, + "step": 91 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 0.23972294907192276, + "learning_rate": 9.586369207877449e-06, + "loss": 0.0645, + "num_tokens": 39323143.0, + "step": 92 + }, + { + "epoch": 0.8397291196388262, + "grad_norm": 0.24251755300004105, + "learning_rate": 9.575294885905051e-06, + "loss": 0.0674, + "num_tokens": 39745833.0, + "step": 93 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 0.23780867445242304, + "learning_rate": 9.564081602910654e-06, + "loss": 0.0657, + "num_tokens": 40155224.0, + "step": 94 + }, + { + "epoch": 0.8577878103837472, + "grad_norm": 0.21274447402055477, + "learning_rate": 9.552729741249235e-06, + "loss": 0.0636, + "num_tokens": 40600001.0, + "step": 95 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 0.2509929337786159, + "learning_rate": 9.541239688001076e-06, + "loss": 0.0693, + "num_tokens": 41034208.0, + "step": 96 + }, + { + "epoch": 0.8758465011286681, + "grad_norm": 0.24871928630001877, + "learning_rate": 9.52961183495857e-06, + "loss": 0.0631, + "num_tokens": 41463499.0, + "step": 97 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 0.23826717579099585, + "learning_rate": 9.517846578612866e-06, + "loss": 0.0669, + "num_tokens": 41905599.0, + "step": 98 + }, + { + "epoch": 0.8939051918735892, + "grad_norm": 0.25233509675129634, + "learning_rate": 9.505944320140343e-06, + "loss": 0.07, + "num_tokens": 42324749.0, + "step": 99 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 0.22011156834652607, + "learning_rate": 9.49390546538893e-06, + "loss": 0.0675, + "num_tokens": 42745369.0, + "step": 100 + }, + { + "epoch": 0.9119638826185101, + "grad_norm": 0.21997168172500522, + "learning_rate": 9.481730424864276e-06, + "loss": 0.0622, + "num_tokens": 43186876.0, + "step": 101 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 0.21886410671701703, + "learning_rate": 9.469419613715743e-06, + "loss": 0.0662, + "num_tokens": 43605408.0, + "step": 102 + }, + { + "epoch": 0.9300225733634312, + "grad_norm": 0.21945876108560552, + "learning_rate": 9.456973451722255e-06, + "loss": 0.0645, + "num_tokens": 44020619.0, + "step": 103 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 0.22294938602794231, + "learning_rate": 9.44439236327798e-06, + "loss": 0.0635, + "num_tokens": 44482117.0, + "step": 104 + }, + { + "epoch": 0.9480812641083521, + "grad_norm": 0.21920116608256485, + "learning_rate": 9.431676777377865e-06, + "loss": 0.0621, + "num_tokens": 44899252.0, + "step": 105 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 0.21987734978187456, + "learning_rate": 9.418827127603e-06, + "loss": 0.0644, + "num_tokens": 45332722.0, + "step": 106 + }, + { + "epoch": 0.9661399548532731, + "grad_norm": 0.2291873095394738, + "learning_rate": 9.405843852105846e-06, + "loss": 0.0645, + "num_tokens": 45764262.0, + "step": 107 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 0.2212097604792261, + "learning_rate": 9.392727393595278e-06, + "loss": 0.0652, + "num_tokens": 46186006.0, + "step": 108 + }, + { + "epoch": 0.9841986455981941, + "grad_norm": 0.2202752169168232, + "learning_rate": 9.379478199321508e-06, + "loss": 0.0642, + "num_tokens": 46639566.0, + "step": 109 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 0.2325232017430746, + "learning_rate": 9.366096721060817e-06, + "loss": 0.0654, + "num_tokens": 47064244.0, + "step": 110 + }, + { + "epoch": 1.0, + "grad_norm": 0.24740140314097056, + "learning_rate": 9.352583415100157e-06, + "loss": 0.0652, + "num_tokens": 47393855.0, + "step": 111 + }, + { + "epoch": 1.0, + "eval_loss": 0.07556881755590439, + "eval_num_tokens": 47393855.0, + "eval_runtime": 54.1659, + "eval_samples_per_second": 46.247, + "eval_steps_per_second": 5.797, + "step": 111 + }, + { + "epoch": 1.0090293453724606, + "grad_norm": 0.20591105395295933, + "learning_rate": 9.3389387422216e-06, + "loss": 0.0494, + "num_tokens": 47806383.0, + "step": 112 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 0.2001019939737505, + "learning_rate": 9.325163167686615e-06, + "loss": 0.0482, + "num_tokens": 48241862.0, + "step": 113 + }, + { + "epoch": 1.0270880361173815, + "grad_norm": 0.19836065917732984, + "learning_rate": 9.311257161220207e-06, + "loss": 0.0481, + "num_tokens": 48651747.0, + "step": 114 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 0.22925008294650578, + "learning_rate": 9.297221196994904e-06, + "loss": 0.0496, + "num_tokens": 49088950.0, + "step": 115 + }, + { + "epoch": 1.0451467268623025, + "grad_norm": 0.20127235857740663, + "learning_rate": 9.283055753614581e-06, + "loss": 0.0454, + "num_tokens": 49528452.0, + "step": 116 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 0.22530638037839829, + "learning_rate": 9.268761314098148e-06, + "loss": 0.0472, + "num_tokens": 49951741.0, + "step": 117 + }, + { + "epoch": 1.0632054176072234, + "grad_norm": 0.2572759457221762, + "learning_rate": 9.254338365863079e-06, + "loss": 0.0516, + "num_tokens": 50388295.0, + "step": 118 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 0.23447648702544158, + "learning_rate": 9.239787400708779e-06, + "loss": 0.0475, + "num_tokens": 50829191.0, + "step": 119 + }, + { + "epoch": 1.0812641083521444, + "grad_norm": 0.21824788821792024, + "learning_rate": 9.225108914799833e-06, + "loss": 0.0449, + "num_tokens": 51266209.0, + "step": 120 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 0.2541567123911248, + "learning_rate": 9.21030340864908e-06, + "loss": 0.0515, + "num_tokens": 51687096.0, + "step": 121 + }, + { + "epoch": 1.0993227990970655, + "grad_norm": 0.23388058304332865, + "learning_rate": 9.195371387100544e-06, + "loss": 0.0468, + "num_tokens": 52114071.0, + "step": 122 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 0.2616033969024054, + "learning_rate": 9.180313359312218e-06, + "loss": 0.052, + "num_tokens": 52541213.0, + "step": 123 + }, + { + "epoch": 1.1173814898419865, + "grad_norm": 0.22075934495085656, + "learning_rate": 9.165129838738706e-06, + "loss": 0.0464, + "num_tokens": 52966113.0, + "step": 124 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 0.2344205168060999, + "learning_rate": 9.14982134311372e-06, + "loss": 0.0461, + "num_tokens": 53418877.0, + "step": 125 + }, + { + "epoch": 1.1354401805869074, + "grad_norm": 0.21717296092556684, + "learning_rate": 9.13438839443242e-06, + "loss": 0.048, + "num_tokens": 53839935.0, + "step": 126 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 0.2263063656296933, + "learning_rate": 9.11883151893361e-06, + "loss": 0.0479, + "num_tokens": 54258590.0, + "step": 127 + }, + { + "epoch": 1.1534988713318284, + "grad_norm": 0.2416238290536927, + "learning_rate": 9.103151247081803e-06, + "loss": 0.0538, + "num_tokens": 54666548.0, + "step": 128 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 0.2193846002679233, + "learning_rate": 9.087348113549134e-06, + "loss": 0.0496, + "num_tokens": 55091316.0, + "step": 129 + }, + { + "epoch": 1.1715575620767495, + "grad_norm": 0.22367873943524713, + "learning_rate": 9.071422657197117e-06, + "loss": 0.0477, + "num_tokens": 55507416.0, + "step": 130 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 0.21440076203806707, + "learning_rate": 9.05537542105828e-06, + "loss": 0.0504, + "num_tokens": 55947804.0, + "step": 131 + }, + { + "epoch": 1.1896162528216705, + "grad_norm": 0.21726216888223565, + "learning_rate": 9.039206952317655e-06, + "loss": 0.0503, + "num_tokens": 56371231.0, + "step": 132 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 0.2381106628947952, + "learning_rate": 9.022917802294098e-06, + "loss": 0.0513, + "num_tokens": 56783891.0, + "step": 133 + }, + { + "epoch": 1.2076749435665914, + "grad_norm": 0.20469900468699612, + "learning_rate": 9.006508526421511e-06, + "loss": 0.0488, + "num_tokens": 57202386.0, + "step": 134 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 0.21383039576945337, + "learning_rate": 8.989979684229894e-06, + "loss": 0.0491, + "num_tokens": 57631769.0, + "step": 135 + }, + { + "epoch": 1.2257336343115124, + "grad_norm": 0.22010872699572298, + "learning_rate": 8.973331839326266e-06, + "loss": 0.0484, + "num_tokens": 58062896.0, + "step": 136 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 0.22210574973561567, + "learning_rate": 8.956565559375452e-06, + "loss": 0.0487, + "num_tokens": 58487448.0, + "step": 137 + }, + { + "epoch": 1.2437923250564333, + "grad_norm": 0.2243098284698005, + "learning_rate": 8.93968141608071e-06, + "loss": 0.0508, + "num_tokens": 58922017.0, + "step": 138 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 0.22583392389127852, + "learning_rate": 8.922679985164262e-06, + "loss": 0.0482, + "num_tokens": 59367436.0, + "step": 139 + }, + { + "epoch": 1.2618510158013545, + "grad_norm": 0.2791152692330447, + "learning_rate": 8.905561846347648e-06, + "loss": 0.0498, + "num_tokens": 59795521.0, + "step": 140 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 0.23985016219520722, + "learning_rate": 8.888327583331953e-06, + "loss": 0.052, + "num_tokens": 60200448.0, + "step": 141 + }, + { + "epoch": 1.2799097065462754, + "grad_norm": 0.2303471135535924, + "learning_rate": 8.870977783777917e-06, + "loss": 0.052, + "num_tokens": 60630275.0, + "step": 142 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 0.2438969499554439, + "learning_rate": 8.853513039285888e-06, + "loss": 0.0538, + "num_tokens": 61053633.0, + "step": 143 + }, + { + "epoch": 1.2979683972911964, + "grad_norm": 0.231724280764138, + "learning_rate": 8.835933945375654e-06, + "loss": 0.0514, + "num_tokens": 61498301.0, + "step": 144 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 0.23664150633259673, + "learning_rate": 8.818241101466135e-06, + "loss": 0.0508, + "num_tokens": 61937666.0, + "step": 145 + }, + { + "epoch": 1.3160270880361173, + "grad_norm": 0.21466298141293827, + "learning_rate": 8.800435110854943e-06, + "loss": 0.048, + "num_tokens": 62349356.0, + "step": 146 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 0.2166692164183172, + "learning_rate": 8.78251658069781e-06, + "loss": 0.0479, + "num_tokens": 62773669.0, + "step": 147 + }, + { + "epoch": 1.3340857787810383, + "grad_norm": 0.2294319074746598, + "learning_rate": 8.764486121987885e-06, + "loss": 0.053, + "num_tokens": 63184808.0, + "step": 148 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 0.21755069135940322, + "learning_rate": 8.746344349534905e-06, + "loss": 0.0489, + "num_tokens": 63626245.0, + "step": 149 + }, + { + "epoch": 1.3521444695259595, + "grad_norm": 0.21320430410721747, + "learning_rate": 8.728091881944226e-06, + "loss": 0.0465, + "num_tokens": 64068177.0, + "step": 150 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 0.22547729814180828, + "learning_rate": 8.70972934159573e-06, + "loss": 0.0514, + "num_tokens": 64496374.0, + "step": 151 + }, + { + "epoch": 1.3702031602708804, + "grad_norm": 0.22148701677648516, + "learning_rate": 8.691257354622602e-06, + "loss": 0.0499, + "num_tokens": 64924919.0, + "step": 152 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 0.22449512565195465, + "learning_rate": 8.672676550889985e-06, + "loss": 0.0515, + "num_tokens": 65360268.0, + "step": 153 + }, + { + "epoch": 1.3882618510158014, + "grad_norm": 0.23853074268781047, + "learning_rate": 8.653987563973494e-06, + "loss": 0.0515, + "num_tokens": 65781587.0, + "step": 154 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 0.2184573125950705, + "learning_rate": 8.635191031137624e-06, + "loss": 0.049, + "num_tokens": 66191116.0, + "step": 155 + }, + { + "epoch": 1.4063205417607223, + "grad_norm": 0.2141254012612582, + "learning_rate": 8.616287593314006e-06, + "loss": 0.0495, + "num_tokens": 66617225.0, + "step": 156 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 0.23414954280312816, + "learning_rate": 8.597277895079568e-06, + "loss": 0.0532, + "num_tokens": 67046148.0, + "step": 157 + }, + { + "epoch": 1.4243792325056432, + "grad_norm": 0.2570508287707051, + "learning_rate": 8.578162584634537e-06, + "loss": 0.0539, + "num_tokens": 67476904.0, + "step": 158 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 0.2313656170230832, + "learning_rate": 8.558942313780357e-06, + "loss": 0.0514, + "num_tokens": 67946595.0, + "step": 159 + }, + { + "epoch": 1.4424379232505644, + "grad_norm": 0.2058518948189237, + "learning_rate": 8.539617737897452e-06, + "loss": 0.0485, + "num_tokens": 68359880.0, + "step": 160 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 0.21340907414958138, + "learning_rate": 8.520189515922872e-06, + "loss": 0.0504, + "num_tokens": 68795114.0, + "step": 161 + }, + { + "epoch": 1.4604966139954854, + "grad_norm": 0.22214435682460587, + "learning_rate": 8.500658310327842e-06, + "loss": 0.0533, + "num_tokens": 69225525.0, + "step": 162 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 0.2310171721998677, + "learning_rate": 8.48102478709516e-06, + "loss": 0.0485, + "num_tokens": 69645836.0, + "step": 163 + }, + { + "epoch": 1.4785553047404063, + "grad_norm": 0.21061407662189377, + "learning_rate": 8.461289615696489e-06, + "loss": 0.0488, + "num_tokens": 70067001.0, + "step": 164 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 0.23510691853366508, + "learning_rate": 8.441453469069536e-06, + "loss": 0.0539, + "num_tokens": 70492907.0, + "step": 165 + }, + { + "epoch": 1.4966139954853273, + "grad_norm": 0.23273278092391025, + "learning_rate": 8.4215170235951e-06, + "loss": 0.0511, + "num_tokens": 70942223.0, + "step": 166 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 0.22520685014111042, + "learning_rate": 8.401480959074006e-06, + "loss": 0.0524, + "num_tokens": 71371574.0, + "step": 167 + }, + { + "epoch": 1.5146726862302482, + "grad_norm": 0.20160754999400804, + "learning_rate": 8.381345958703933e-06, + "loss": 0.0494, + "num_tokens": 71822208.0, + "step": 168 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 0.2177895336364214, + "learning_rate": 8.361112709056115e-06, + "loss": 0.0519, + "num_tokens": 72234675.0, + "step": 169 + }, + { + "epoch": 1.5327313769751694, + "grad_norm": 0.22117053168073747, + "learning_rate": 8.340781900051924e-06, + "loss": 0.05, + "num_tokens": 72652286.0, + "step": 170 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 0.213258957751641, + "learning_rate": 8.32035422493935e-06, + "loss": 0.0498, + "num_tokens": 73068061.0, + "step": 171 + }, + { + "epoch": 1.5507900677200903, + "grad_norm": 0.21293842064376717, + "learning_rate": 8.299830380269372e-06, + "loss": 0.0487, + "num_tokens": 73496804.0, + "step": 172 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 0.23438530858353465, + "learning_rate": 8.27921106587218e-06, + "loss": 0.0547, + "num_tokens": 73913610.0, + "step": 173 + }, + { + "epoch": 1.5688487584650113, + "grad_norm": 0.20441443723236968, + "learning_rate": 8.258496984833344e-06, + "loss": 0.0482, + "num_tokens": 74340086.0, + "step": 174 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 0.21850806555742064, + "learning_rate": 8.237688843469815e-06, + "loss": 0.0527, + "num_tokens": 74761072.0, + "step": 175 + }, + { + "epoch": 1.5869074492099324, + "grad_norm": 0.21956456248799458, + "learning_rate": 8.216787351305854e-06, + "loss": 0.0496, + "num_tokens": 75200788.0, + "step": 176 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 0.21029589650099137, + "learning_rate": 8.195793221048834e-06, + "loss": 0.0495, + "num_tokens": 75626000.0, + "step": 177 + }, + { + "epoch": 1.6049661399548532, + "grad_norm": 0.22896876495177707, + "learning_rate": 8.17470716856494e-06, + "loss": 0.0507, + "num_tokens": 76049835.0, + "step": 178 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 0.21982178793915255, + "learning_rate": 8.153529912854751e-06, + "loss": 0.0495, + "num_tokens": 76485166.0, + "step": 179 + }, + { + "epoch": 1.6230248306997743, + "grad_norm": 0.21633630779428678, + "learning_rate": 8.13226217602874e-06, + "loss": 0.0522, + "num_tokens": 76889061.0, + "step": 180 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 0.2121233643480878, + "learning_rate": 8.110904683282635e-06, + "loss": 0.0505, + "num_tokens": 77325361.0, + "step": 181 + }, + { + "epoch": 1.6410835214446953, + "grad_norm": 0.2044061100864184, + "learning_rate": 8.089458162872697e-06, + "loss": 0.0514, + "num_tokens": 77759756.0, + "step": 182 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 0.21313712904385135, + "learning_rate": 8.067923346090888e-06, + "loss": 0.051, + "num_tokens": 78187190.0, + "step": 183 + }, + { + "epoch": 1.6591422121896162, + "grad_norm": 0.21968087345369283, + "learning_rate": 8.046300967239934e-06, + "loss": 0.052, + "num_tokens": 78598237.0, + "step": 184 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 0.21250534465548385, + "learning_rate": 8.024591763608291e-06, + "loss": 0.05, + "num_tokens": 79028466.0, + "step": 185 + }, + { + "epoch": 1.6772009029345374, + "grad_norm": 0.2420054483079227, + "learning_rate": 8.002796475444995e-06, + "loss": 0.0537, + "num_tokens": 79444122.0, + "step": 186 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 0.22857774340429907, + "learning_rate": 7.980915845934433e-06, + "loss": 0.0511, + "num_tokens": 79857405.0, + "step": 187 + }, + { + "epoch": 1.695259593679458, + "grad_norm": 0.191057370379495, + "learning_rate": 7.95895062117099e-06, + "loss": 0.0445, + "num_tokens": 80293603.0, + "step": 188 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 0.21038670633613107, + "learning_rate": 7.936901550133616e-06, + "loss": 0.0514, + "num_tokens": 80728108.0, + "step": 189 + }, + { + "epoch": 1.7133182844243793, + "grad_norm": 0.20395471051828254, + "learning_rate": 7.914769384660283e-06, + "loss": 0.0494, + "num_tokens": 81173331.0, + "step": 190 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 0.20619798381449547, + "learning_rate": 7.892554879422351e-06, + "loss": 0.0491, + "num_tokens": 81579387.0, + "step": 191 + }, + { + "epoch": 1.7313769751693002, + "grad_norm": 0.2395907020159803, + "learning_rate": 7.870258791898832e-06, + "loss": 0.052, + "num_tokens": 82013021.0, + "step": 192 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 0.21845951904203598, + "learning_rate": 7.847881882350568e-06, + "loss": 0.0497, + "num_tokens": 82440490.0, + "step": 193 + }, + { + "epoch": 1.7494356659142212, + "grad_norm": 0.2103047340245338, + "learning_rate": 7.825424913794299e-06, + "loss": 0.0482, + "num_tokens": 82879243.0, + "step": 194 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 0.214499085976654, + "learning_rate": 7.802888651976647e-06, + "loss": 0.049, + "num_tokens": 83320595.0, + "step": 195 + }, + { + "epoch": 1.7674943566591423, + "grad_norm": 0.22016761611149513, + "learning_rate": 7.78027386534801e-06, + "loss": 0.0498, + "num_tokens": 83736643.0, + "step": 196 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 0.22369190257229843, + "learning_rate": 7.757581325036357e-06, + "loss": 0.0518, + "num_tokens": 84180990.0, + "step": 197 + }, + { + "epoch": 1.785553047404063, + "grad_norm": 0.2145060731820242, + "learning_rate": 7.73481180482093e-06, + "loss": 0.0491, + "num_tokens": 84610497.0, + "step": 198 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 0.21994613233427202, + "learning_rate": 7.711966081105863e-06, + "loss": 0.0487, + "num_tokens": 85060786.0, + "step": 199 + }, + { + "epoch": 1.8036117381489842, + "grad_norm": 0.22601411161861185, + "learning_rate": 7.68904493289371e-06, + "loss": 0.0507, + "num_tokens": 85477616.0, + "step": 200 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 0.22793524159791131, + "learning_rate": 7.666049141758878e-06, + "loss": 0.0492, + "num_tokens": 85930357.0, + "step": 201 + }, + { + "epoch": 1.8216704288939052, + "grad_norm": 0.2314883873272596, + "learning_rate": 7.642979491820974e-06, + "loss": 0.0502, + "num_tokens": 86344995.0, + "step": 202 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 0.21771251333503014, + "learning_rate": 7.619836769718075e-06, + "loss": 0.0489, + "num_tokens": 86757489.0, + "step": 203 + }, + { + "epoch": 1.8397291196388261, + "grad_norm": 0.22137764784319228, + "learning_rate": 7.596621764579904e-06, + "loss": 0.0494, + "num_tokens": 87187973.0, + "step": 204 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 0.21416818398569845, + "learning_rate": 7.573335268000918e-06, + "loss": 0.0491, + "num_tokens": 87614553.0, + "step": 205 + }, + { + "epoch": 1.8577878103837473, + "grad_norm": 0.21327049321789548, + "learning_rate": 7.549978074013314e-06, + "loss": 0.0484, + "num_tokens": 88048570.0, + "step": 206 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 0.2380813909577303, + "learning_rate": 7.5265509790599625e-06, + "loss": 0.0561, + "num_tokens": 88466238.0, + "step": 207 + }, + { + "epoch": 1.875846501128668, + "grad_norm": 0.21670816556729486, + "learning_rate": 7.503054781967241e-06, + "loss": 0.0514, + "num_tokens": 88895453.0, + "step": 208 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 0.20391244445083417, + "learning_rate": 7.479490283917802e-06, + "loss": 0.0468, + "num_tokens": 89349875.0, + "step": 209 + }, + { + "epoch": 1.8939051918735892, + "grad_norm": 0.21945815106167021, + "learning_rate": 7.455858288423249e-06, + "loss": 0.0497, + "num_tokens": 89792563.0, + "step": 210 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 0.20764294614028975, + "learning_rate": 7.43215960129674e-06, + "loss": 0.0486, + "num_tokens": 90234898.0, + "step": 211 + }, + { + "epoch": 1.9119638826185101, + "grad_norm": 0.22128274677327442, + "learning_rate": 7.408395030625513e-06, + "loss": 0.0526, + "num_tokens": 90648682.0, + "step": 212 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 0.21500957606497606, + "learning_rate": 7.384565386743327e-06, + "loss": 0.0519, + "num_tokens": 91063468.0, + "step": 213 + }, + { + "epoch": 1.930022573363431, + "grad_norm": 0.2235525798171971, + "learning_rate": 7.360671482202838e-06, + "loss": 0.0492, + "num_tokens": 91494817.0, + "step": 214 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 0.20565927837619943, + "learning_rate": 7.336714131747878e-06, + "loss": 0.047, + "num_tokens": 91925573.0, + "step": 215 + }, + { + "epoch": 1.9480812641083523, + "grad_norm": 0.20696367158705464, + "learning_rate": 7.312694152285691e-06, + "loss": 0.0496, + "num_tokens": 92350433.0, + "step": 216 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 0.22250013802120688, + "learning_rate": 7.288612362859066e-06, + "loss": 0.0524, + "num_tokens": 92765584.0, + "step": 217 + }, + { + "epoch": 1.966139954853273, + "grad_norm": 0.19659013359962887, + "learning_rate": 7.2644695846184165e-06, + "loss": 0.0467, + "num_tokens": 93229192.0, + "step": 218 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 0.22586633183246826, + "learning_rate": 7.240266640793774e-06, + "loss": 0.0519, + "num_tokens": 93636026.0, + "step": 219 + }, + { + "epoch": 1.9841986455981941, + "grad_norm": 0.21487097770206617, + "learning_rate": 7.216004356666717e-06, + "loss": 0.0506, + "num_tokens": 94060859.0, + "step": 220 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 0.22030382046071045, + "learning_rate": 7.191683559542238e-06, + "loss": 0.05, + "num_tokens": 94478604.0, + "step": 221 + }, + { + "epoch": 2.0, + "grad_norm": 0.23484302283694436, + "learning_rate": 7.167305078720527e-06, + "loss": 0.0503, + "num_tokens": 94789780.0, + "step": 222 + }, + { + "epoch": 2.0, + "eval_loss": 0.073208749294281, + "eval_num_tokens": 94789780.0, + "eval_runtime": 53.1184, + "eval_samples_per_second": 47.159, + "eval_steps_per_second": 5.911, + "step": 222 + }, + { + "epoch": 2.0090293453724604, + "grad_norm": 0.21248855714425735, + "learning_rate": 7.142869745468697e-06, + "loss": 0.0396, + "num_tokens": 95206304.0, + "step": 223 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 0.18949973202007156, + "learning_rate": 7.118378392992436e-06, + "loss": 0.0387, + "num_tokens": 95629591.0, + "step": 224 + }, + { + "epoch": 2.0270880361173815, + "grad_norm": 0.18919961721375084, + "learning_rate": 7.093831856407599e-06, + "loss": 0.0407, + "num_tokens": 96043349.0, + "step": 225 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 0.1933524321048653, + "learning_rate": 7.069230972711727e-06, + "loss": 0.037, + "num_tokens": 96461415.0, + "step": 226 + }, + { + "epoch": 2.0451467268623027, + "grad_norm": 0.18664281752580714, + "learning_rate": 7.044576580755517e-06, + "loss": 0.0365, + "num_tokens": 96920134.0, + "step": 227 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 0.2084034802481695, + "learning_rate": 7.019869521214206e-06, + "loss": 0.0359, + "num_tokens": 97365121.0, + "step": 228 + }, + { + "epoch": 2.0632054176072234, + "grad_norm": 0.19849982442558636, + "learning_rate": 6.995110636558916e-06, + "loss": 0.0358, + "num_tokens": 97777134.0, + "step": 229 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 0.19484720349355802, + "learning_rate": 6.970300771027914e-06, + "loss": 0.0356, + "num_tokens": 98209859.0, + "step": 230 + }, + { + "epoch": 2.0812641083521446, + "grad_norm": 0.23343603397155127, + "learning_rate": 6.945440770597845e-06, + "loss": 0.038, + "num_tokens": 98622007.0, + "step": 231 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 0.21507174934816595, + "learning_rate": 6.920531482954863e-06, + "loss": 0.0359, + "num_tokens": 99052847.0, + "step": 232 + }, + { + "epoch": 2.0993227990970653, + "grad_norm": 0.22684265463739195, + "learning_rate": 6.895573757465745e-06, + "loss": 0.0372, + "num_tokens": 99471292.0, + "step": 233 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 0.2407028739910598, + "learning_rate": 6.870568445148915e-06, + "loss": 0.0363, + "num_tokens": 99901083.0, + "step": 234 + }, + { + "epoch": 2.1173814898419865, + "grad_norm": 0.2461836300717414, + "learning_rate": 6.845516398645434e-06, + "loss": 0.0394, + "num_tokens": 100323717.0, + "step": 235 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 0.2262513973226894, + "learning_rate": 6.820418472189926e-06, + "loss": 0.0359, + "num_tokens": 100746774.0, + "step": 236 + }, + { + "epoch": 2.1354401805869077, + "grad_norm": 0.22078451589374182, + "learning_rate": 6.795275521581443e-06, + "loss": 0.0355, + "num_tokens": 101186553.0, + "step": 237 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 0.22801664131817995, + "learning_rate": 6.770088404154293e-06, + "loss": 0.0383, + "num_tokens": 101595403.0, + "step": 238 + }, + { + "epoch": 2.1534988713318284, + "grad_norm": 0.21698654328654343, + "learning_rate": 6.744857978748795e-06, + "loss": 0.038, + "num_tokens": 102015527.0, + "step": 239 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 0.20951063858222352, + "learning_rate": 6.719585105682012e-06, + "loss": 0.0374, + "num_tokens": 102435412.0, + "step": 240 + }, + { + "epoch": 2.1715575620767495, + "grad_norm": 0.22433468300343645, + "learning_rate": 6.6942706467183916e-06, + "loss": 0.0389, + "num_tokens": 102851570.0, + "step": 241 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 0.21525942756147845, + "learning_rate": 6.668915465040403e-06, + "loss": 0.0395, + "num_tokens": 103270322.0, + "step": 242 + }, + { + "epoch": 2.1896162528216703, + "grad_norm": 0.20431160356810532, + "learning_rate": 6.643520425219093e-06, + "loss": 0.0361, + "num_tokens": 103685477.0, + "step": 243 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 0.20072891154576847, + "learning_rate": 6.618086393184601e-06, + "loss": 0.0344, + "num_tokens": 104120983.0, + "step": 244 + }, + { + "epoch": 2.2076749435665914, + "grad_norm": 0.21540951085890264, + "learning_rate": 6.592614236196646e-06, + "loss": 0.0383, + "num_tokens": 104536337.0, + "step": 245 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 0.19977963167191465, + "learning_rate": 6.567104822814942e-06, + "loss": 0.038, + "num_tokens": 104955016.0, + "step": 246 + }, + { + "epoch": 2.2257336343115126, + "grad_norm": 0.2084210770456705, + "learning_rate": 6.541559022869589e-06, + "loss": 0.0369, + "num_tokens": 105388070.0, + "step": 247 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 0.21203785986583049, + "learning_rate": 6.515977707431411e-06, + "loss": 0.0381, + "num_tokens": 105818809.0, + "step": 248 + }, + { + "epoch": 2.2437923250564333, + "grad_norm": 0.21579098885902015, + "learning_rate": 6.490361748782248e-06, + "loss": 0.0385, + "num_tokens": 106245854.0, + "step": 249 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 0.20266425695993848, + "learning_rate": 6.464712020385223e-06, + "loss": 0.0361, + "num_tokens": 106670473.0, + "step": 250 + }, + { + "epoch": 2.2618510158013545, + "grad_norm": 0.21843258020038847, + "learning_rate": 6.439029396854955e-06, + "loss": 0.039, + "num_tokens": 107095545.0, + "step": 251 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 0.23206865464143536, + "learning_rate": 6.4133147539277295e-06, + "loss": 0.0391, + "num_tokens": 107520274.0, + "step": 252 + }, + { + "epoch": 2.2799097065462752, + "grad_norm": 0.23252896606332119, + "learning_rate": 6.3875689684316435e-06, + "loss": 0.0372, + "num_tokens": 107953982.0, + "step": 253 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 0.20022551864259014, + "learning_rate": 6.361792918256705e-06, + "loss": 0.0345, + "num_tokens": 108388280.0, + "step": 254 + }, + { + "epoch": 2.2979683972911964, + "grad_norm": 0.19924409717583683, + "learning_rate": 6.335987482324904e-06, + "loss": 0.0349, + "num_tokens": 108824002.0, + "step": 255 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 0.23853649269606275, + "learning_rate": 6.310153540560229e-06, + "loss": 0.0398, + "num_tokens": 109244245.0, + "step": 256 + }, + { + "epoch": 2.3160270880361176, + "grad_norm": 0.22180700831499442, + "learning_rate": 6.284291973858682e-06, + "loss": 0.0379, + "num_tokens": 109673712.0, + "step": 257 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 0.20726563160007283, + "learning_rate": 6.25840366405822e-06, + "loss": 0.0377, + "num_tokens": 110102425.0, + "step": 258 + }, + { + "epoch": 2.3340857787810383, + "grad_norm": 0.2186490036483062, + "learning_rate": 6.232489493908706e-06, + "loss": 0.036, + "num_tokens": 110528736.0, + "step": 259 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 0.20973239551170453, + "learning_rate": 6.2065503470417956e-06, + "loss": 0.0382, + "num_tokens": 110947222.0, + "step": 260 + }, + { + "epoch": 2.3521444695259595, + "grad_norm": 0.21473802593618457, + "learning_rate": 6.180587107940809e-06, + "loss": 0.0373, + "num_tokens": 111396704.0, + "step": 261 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 0.2131166956078576, + "learning_rate": 6.154600661910577e-06, + "loss": 0.036, + "num_tokens": 111831800.0, + "step": 262 + }, + { + "epoch": 2.37020316027088, + "grad_norm": 0.22692021680225197, + "learning_rate": 6.128591895047243e-06, + "loss": 0.0382, + "num_tokens": 112263654.0, + "step": 263 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 0.2120741047416527, + "learning_rate": 6.102561694208064e-06, + "loss": 0.0367, + "num_tokens": 112691946.0, + "step": 264 + }, + { + "epoch": 2.3882618510158014, + "grad_norm": 0.20089102240650572, + "learning_rate": 6.076510946981155e-06, + "loss": 0.0344, + "num_tokens": 113132551.0, + "step": 265 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 0.21672559849433848, + "learning_rate": 6.05044054165523e-06, + "loss": 0.0368, + "num_tokens": 113569796.0, + "step": 266 + }, + { + "epoch": 2.4063205417607225, + "grad_norm": 0.20487030347656493, + "learning_rate": 6.024351367189314e-06, + "loss": 0.0352, + "num_tokens": 114001801.0, + "step": 267 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 0.21779989020405227, + "learning_rate": 5.998244313182431e-06, + "loss": 0.0366, + "num_tokens": 114434376.0, + "step": 268 + }, + { + "epoch": 2.4243792325056432, + "grad_norm": 0.21358590635231028, + "learning_rate": 5.972120269843263e-06, + "loss": 0.0374, + "num_tokens": 114881898.0, + "step": 269 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 0.202495451961395, + "learning_rate": 5.945980127959812e-06, + "loss": 0.0354, + "num_tokens": 115303494.0, + "step": 270 + }, + { + "epoch": 2.4424379232505644, + "grad_norm": 0.21325612101148764, + "learning_rate": 5.919824778869002e-06, + "loss": 0.0404, + "num_tokens": 115722287.0, + "step": 271 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 0.21373291723694496, + "learning_rate": 5.893655114426306e-06, + "loss": 0.0373, + "num_tokens": 116161502.0, + "step": 272 + }, + { + "epoch": 2.460496613995485, + "grad_norm": 0.2093748759101363, + "learning_rate": 5.867472026975326e-06, + "loss": 0.0364, + "num_tokens": 116581091.0, + "step": 273 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 0.19427275270977723, + "learning_rate": 5.841276409317366e-06, + "loss": 0.0338, + "num_tokens": 117012555.0, + "step": 274 + }, + { + "epoch": 2.4785553047404063, + "grad_norm": 0.21119803869757173, + "learning_rate": 5.815069154680991e-06, + "loss": 0.0368, + "num_tokens": 117435175.0, + "step": 275 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 0.2119478066972626, + "learning_rate": 5.788851156691569e-06, + "loss": 0.0379, + "num_tokens": 117873663.0, + "step": 276 + }, + { + "epoch": 2.4966139954853275, + "grad_norm": 0.20555721393930926, + "learning_rate": 5.7626233093407955e-06, + "loss": 0.0387, + "num_tokens": 118286469.0, + "step": 277 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 0.20228991356399106, + "learning_rate": 5.7363865069562195e-06, + "loss": 0.035, + "num_tokens": 118713374.0, + "step": 278 + }, + { + "epoch": 2.514672686230248, + "grad_norm": 0.2099195561883515, + "learning_rate": 5.710141644170734e-06, + "loss": 0.0372, + "num_tokens": 119126272.0, + "step": 279 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 0.21910187440390475, + "learning_rate": 5.683889615892091e-06, + "loss": 0.0389, + "num_tokens": 119554963.0, + "step": 280 + }, + { + "epoch": 2.5327313769751694, + "grad_norm": 0.20516565303632384, + "learning_rate": 5.65763131727236e-06, + "loss": 0.0374, + "num_tokens": 119977012.0, + "step": 281 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 0.2028306018637831, + "learning_rate": 5.631367643677428e-06, + "loss": 0.0378, + "num_tokens": 120396254.0, + "step": 282 + }, + { + "epoch": 2.55079006772009, + "grad_norm": 0.21099179044346147, + "learning_rate": 5.605099490656459e-06, + "loss": 0.039, + "num_tokens": 120811188.0, + "step": 283 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 0.22320146886168057, + "learning_rate": 5.578827753911357e-06, + "loss": 0.038, + "num_tokens": 121249730.0, + "step": 284 + }, + { + "epoch": 2.5688487584650113, + "grad_norm": 0.2063664234760257, + "learning_rate": 5.5525533292662246e-06, + "loss": 0.0419, + "num_tokens": 121653968.0, + "step": 285 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 0.19883311700782794, + "learning_rate": 5.52627711263682e-06, + "loss": 0.0361, + "num_tokens": 122090338.0, + "step": 286 + }, + { + "epoch": 2.5869074492099324, + "grad_norm": 0.22065662379925538, + "learning_rate": 5.500000000000001e-06, + "loss": 0.0398, + "num_tokens": 122507656.0, + "step": 287 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 0.19384766634526454, + "learning_rate": 5.4737228873631835e-06, + "loss": 0.0354, + "num_tokens": 122940825.0, + "step": 288 + }, + { + "epoch": 2.604966139954853, + "grad_norm": 0.20902484615096364, + "learning_rate": 5.447446670733777e-06, + "loss": 0.0387, + "num_tokens": 123349895.0, + "step": 289 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 0.20488648661316763, + "learning_rate": 5.421172246088645e-06, + "loss": 0.0359, + "num_tokens": 123771878.0, + "step": 290 + }, + { + "epoch": 2.6230248306997743, + "grad_norm": 0.2131322969720244, + "learning_rate": 5.394900509343543e-06, + "loss": 0.0383, + "num_tokens": 124201272.0, + "step": 291 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 0.20558306032851048, + "learning_rate": 5.368632356322574e-06, + "loss": 0.0381, + "num_tokens": 124649372.0, + "step": 292 + }, + { + "epoch": 2.6410835214446955, + "grad_norm": 0.20122173044539496, + "learning_rate": 5.342368682727641e-06, + "loss": 0.039, + "num_tokens": 125066840.0, + "step": 293 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 0.20898347814556756, + "learning_rate": 5.3161103841079105e-06, + "loss": 0.0382, + "num_tokens": 125494250.0, + "step": 294 + }, + { + "epoch": 2.659142212189616, + "grad_norm": 0.2149621835847897, + "learning_rate": 5.2898583558292645e-06, + "loss": 0.037, + "num_tokens": 125918388.0, + "step": 295 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 0.19573725513598847, + "learning_rate": 5.2636134930437836e-06, + "loss": 0.034, + "num_tokens": 126360809.0, + "step": 296 + }, + { + "epoch": 2.6772009029345374, + "grad_norm": 0.20232408470142185, + "learning_rate": 5.237376690659206e-06, + "loss": 0.0379, + "num_tokens": 126789778.0, + "step": 297 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 0.20229323446104955, + "learning_rate": 5.211148843308432e-06, + "loss": 0.0366, + "num_tokens": 127228131.0, + "step": 298 + }, + { + "epoch": 2.695259593679458, + "grad_norm": 0.20734934646345476, + "learning_rate": 5.1849308453190105e-06, + "loss": 0.0372, + "num_tokens": 127662535.0, + "step": 299 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 0.22394881072549053, + "learning_rate": 5.158723590682636e-06, + "loss": 0.0361, + "num_tokens": 128099563.0, + "step": 300 + }, + { + "epoch": 2.7133182844243793, + "grad_norm": 0.216504229797845, + "learning_rate": 5.132527973024677e-06, + "loss": 0.038, + "num_tokens": 128526843.0, + "step": 301 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 0.2106031427111874, + "learning_rate": 5.106344885573695e-06, + "loss": 0.0361, + "num_tokens": 128943620.0, + "step": 302 + }, + { + "epoch": 2.7313769751693, + "grad_norm": 0.21589134891749512, + "learning_rate": 5.0801752211309995e-06, + "loss": 0.037, + "num_tokens": 129365819.0, + "step": 303 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 0.21568768479197367, + "learning_rate": 5.05401987204019e-06, + "loss": 0.0394, + "num_tokens": 129798803.0, + "step": 304 + }, + { + "epoch": 2.749435665914221, + "grad_norm": 0.19584433893876846, + "learning_rate": 5.027879730156738e-06, + "loss": 0.0357, + "num_tokens": 130218483.0, + "step": 305 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 0.2022753347596587, + "learning_rate": 5.001755686817573e-06, + "loss": 0.0357, + "num_tokens": 130649485.0, + "step": 306 + }, + { + "epoch": 2.7674943566591423, + "grad_norm": 0.20612921212382007, + "learning_rate": 4.975648632810686e-06, + "loss": 0.0335, + "num_tokens": 131078439.0, + "step": 307 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 0.19059751665241148, + "learning_rate": 4.949559458344771e-06, + "loss": 0.0346, + "num_tokens": 131532361.0, + "step": 308 + }, + { + "epoch": 2.785553047404063, + "grad_norm": 0.2063194600288434, + "learning_rate": 4.923489053018846e-06, + "loss": 0.0366, + "num_tokens": 131952787.0, + "step": 309 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 0.1919757058007069, + "learning_rate": 4.897438305791937e-06, + "loss": 0.0341, + "num_tokens": 132386508.0, + "step": 310 + }, + { + "epoch": 2.8036117381489842, + "grad_norm": 0.21144295382676526, + "learning_rate": 4.8714081049527565e-06, + "loss": 0.0378, + "num_tokens": 132820484.0, + "step": 311 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 0.2018034624028397, + "learning_rate": 4.845399338089425e-06, + "loss": 0.0361, + "num_tokens": 133254008.0, + "step": 312 + }, + { + "epoch": 2.8216704288939054, + "grad_norm": 0.20336193793898835, + "learning_rate": 4.819412892059192e-06, + "loss": 0.0356, + "num_tokens": 133686531.0, + "step": 313 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 0.19875832774739335, + "learning_rate": 4.793449652958207e-06, + "loss": 0.0372, + "num_tokens": 134117853.0, + "step": 314 + }, + { + "epoch": 2.839729119638826, + "grad_norm": 0.2025777185015454, + "learning_rate": 4.767510506091296e-06, + "loss": 0.0341, + "num_tokens": 134542656.0, + "step": 315 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 0.19450955808527112, + "learning_rate": 4.741596335941782e-06, + "loss": 0.0348, + "num_tokens": 134983147.0, + "step": 316 + }, + { + "epoch": 2.8577878103837473, + "grad_norm": 0.20212383951065388, + "learning_rate": 4.715708026141321e-06, + "loss": 0.0357, + "num_tokens": 135415172.0, + "step": 317 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 0.2108452808754182, + "learning_rate": 4.6898464594397715e-06, + "loss": 0.037, + "num_tokens": 135853491.0, + "step": 318 + }, + { + "epoch": 2.875846501128668, + "grad_norm": 0.21880269458357277, + "learning_rate": 4.664012517675098e-06, + "loss": 0.0371, + "num_tokens": 136297933.0, + "step": 319 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 0.19667310255554504, + "learning_rate": 4.638207081743295e-06, + "loss": 0.0356, + "num_tokens": 136746456.0, + "step": 320 + }, + { + "epoch": 2.893905191873589, + "grad_norm": 0.21229188587702424, + "learning_rate": 4.612431031568359e-06, + "loss": 0.0352, + "num_tokens": 137185823.0, + "step": 321 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 0.21507887445809104, + "learning_rate": 4.586685246072272e-06, + "loss": 0.0378, + "num_tokens": 137610632.0, + "step": 322 + }, + { + "epoch": 2.91196388261851, + "grad_norm": 0.20290807539222847, + "learning_rate": 4.560970603145046e-06, + "loss": 0.0359, + "num_tokens": 138036129.0, + "step": 323 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 0.20255682716813245, + "learning_rate": 4.535287979614777e-06, + "loss": 0.037, + "num_tokens": 138460480.0, + "step": 324 + }, + { + "epoch": 2.930022573363431, + "grad_norm": 0.20190662248464478, + "learning_rate": 4.5096382512177535e-06, + "loss": 0.0377, + "num_tokens": 138890848.0, + "step": 325 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 0.20842777705510368, + "learning_rate": 4.484022292568593e-06, + "loss": 0.038, + "num_tokens": 139300121.0, + "step": 326 + }, + { + "epoch": 2.9480812641083523, + "grad_norm": 0.19497433716766685, + "learning_rate": 4.458440977130413e-06, + "loss": 0.0362, + "num_tokens": 139730061.0, + "step": 327 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 0.21534603212911488, + "learning_rate": 4.432895177185061e-06, + "loss": 0.0362, + "num_tokens": 140161307.0, + "step": 328 + }, + { + "epoch": 2.966139954853273, + "grad_norm": 0.20085062285575225, + "learning_rate": 4.407385763803355e-06, + "loss": 0.036, + "num_tokens": 140580963.0, + "step": 329 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 0.1793943869493084, + "learning_rate": 4.381913606815401e-06, + "loss": 0.0349, + "num_tokens": 140998977.0, + "step": 330 + }, + { + "epoch": 2.984198645598194, + "grad_norm": 0.2210026230905219, + "learning_rate": 4.356479574780909e-06, + "loss": 0.0388, + "num_tokens": 141427751.0, + "step": 331 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 0.20535055908069738, + "learning_rate": 4.331084534959598e-06, + "loss": 0.036, + "num_tokens": 141871576.0, + "step": 332 + }, + { + "epoch": 3.0, + "grad_norm": 0.22935623936313615, + "learning_rate": 4.305729353281608e-06, + "loss": 0.0368, + "num_tokens": 142185499.0, + "step": 333 + }, + { + "epoch": 3.0, + "eval_loss": 0.07833831012248993, + "eval_num_tokens": 142185499.0, + "eval_runtime": 53.1378, + "eval_samples_per_second": 47.142, + "eval_steps_per_second": 5.909, + "step": 333 + }, + { + "epoch": 3.0090293453724604, + "grad_norm": 0.17615032965237978, + "learning_rate": 4.28041489431799e-06, + "loss": 0.0297, + "num_tokens": 142609048.0, + "step": 334 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 0.16768317651959314, + "learning_rate": 4.255142021251206e-06, + "loss": 0.0273, + "num_tokens": 143034905.0, + "step": 335 + }, + { + "epoch": 3.0270880361173815, + "grad_norm": 0.178513727958651, + "learning_rate": 4.22991159584571e-06, + "loss": 0.0296, + "num_tokens": 143441637.0, + "step": 336 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 0.16362797899107945, + "learning_rate": 4.204724478418558e-06, + "loss": 0.0274, + "num_tokens": 143864737.0, + "step": 337 + }, + { + "epoch": 3.0451467268623027, + "grad_norm": 0.1779262670205207, + "learning_rate": 4.1795815278100746e-06, + "loss": 0.0273, + "num_tokens": 144287169.0, + "step": 338 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 0.16868922203126024, + "learning_rate": 4.154483601354566e-06, + "loss": 0.0239, + "num_tokens": 144729276.0, + "step": 339 + }, + { + "epoch": 3.0632054176072234, + "grad_norm": 0.22120615468029714, + "learning_rate": 4.129431554851086e-06, + "loss": 0.0265, + "num_tokens": 145152649.0, + "step": 340 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 0.18082572504463415, + "learning_rate": 4.104426242534256e-06, + "loss": 0.024, + "num_tokens": 145603813.0, + "step": 341 + }, + { + "epoch": 3.0812641083521446, + "grad_norm": 0.19046184814783876, + "learning_rate": 4.079468517045136e-06, + "loss": 0.0251, + "num_tokens": 146042430.0, + "step": 342 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 0.21388772894027044, + "learning_rate": 4.054559229402157e-06, + "loss": 0.0269, + "num_tokens": 146486450.0, + "step": 343 + }, + { + "epoch": 3.0993227990970653, + "grad_norm": 0.19902520307706537, + "learning_rate": 4.029699228972087e-06, + "loss": 0.0265, + "num_tokens": 146918914.0, + "step": 344 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 0.1909885678030147, + "learning_rate": 4.0048893634410865e-06, + "loss": 0.0267, + "num_tokens": 147357955.0, + "step": 345 + }, + { + "epoch": 3.1173814898419865, + "grad_norm": 0.22571001846271635, + "learning_rate": 3.980130478785794e-06, + "loss": 0.0277, + "num_tokens": 147800926.0, + "step": 346 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 0.2244932380987677, + "learning_rate": 3.955423419244484e-06, + "loss": 0.0272, + "num_tokens": 148218943.0, + "step": 347 + }, + { + "epoch": 3.1354401805869077, + "grad_norm": 0.19836449340403237, + "learning_rate": 3.930769027288273e-06, + "loss": 0.0262, + "num_tokens": 148653844.0, + "step": 348 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 0.20152748515944557, + "learning_rate": 3.9061681435924014e-06, + "loss": 0.0264, + "num_tokens": 149091233.0, + "step": 349 + }, + { + "epoch": 3.1534988713318284, + "grad_norm": 0.2156215142828321, + "learning_rate": 3.881621607007565e-06, + "loss": 0.0275, + "num_tokens": 149519351.0, + "step": 350 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 0.1978178076499358, + "learning_rate": 3.857130254531303e-06, + "loss": 0.0261, + "num_tokens": 149963954.0, + "step": 351 + }, + { + "epoch": 3.1715575620767495, + "grad_norm": 0.18301437385970923, + "learning_rate": 3.832694921279474e-06, + "loss": 0.0251, + "num_tokens": 150388491.0, + "step": 352 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 0.19261702489891372, + "learning_rate": 3.8083164404577654e-06, + "loss": 0.0245, + "num_tokens": 150847232.0, + "step": 353 + }, + { + "epoch": 3.1896162528216703, + "grad_norm": 0.19975473265538973, + "learning_rate": 3.7839956433332847e-06, + "loss": 0.0271, + "num_tokens": 151284392.0, + "step": 354 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 0.1742588267326132, + "learning_rate": 3.759733359206229e-06, + "loss": 0.0236, + "num_tokens": 151731845.0, + "step": 355 + }, + { + "epoch": 3.2076749435665914, + "grad_norm": 0.19189393212971345, + "learning_rate": 3.735530415381584e-06, + "loss": 0.0233, + "num_tokens": 152170900.0, + "step": 356 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 0.18276155403004796, + "learning_rate": 3.7113876371409354e-06, + "loss": 0.025, + "num_tokens": 152607162.0, + "step": 357 + }, + { + "epoch": 3.2257336343115126, + "grad_norm": 0.2126183191680057, + "learning_rate": 3.687305847714311e-06, + "loss": 0.029, + "num_tokens": 153028418.0, + "step": 358 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 0.2079094513048321, + "learning_rate": 3.6632858682521233e-06, + "loss": 0.0303, + "num_tokens": 153425933.0, + "step": 359 + }, + { + "epoch": 3.2437923250564333, + "grad_norm": 0.19603992522830643, + "learning_rate": 3.639328517797164e-06, + "loss": 0.0276, + "num_tokens": 153862042.0, + "step": 360 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 0.2041051335322153, + "learning_rate": 3.6154346132566732e-06, + "loss": 0.0274, + "num_tokens": 154292718.0, + "step": 361 + }, + { + "epoch": 3.2618510158013545, + "grad_norm": 0.16965524174312757, + "learning_rate": 3.5916049693744883e-06, + "loss": 0.0243, + "num_tokens": 154735278.0, + "step": 362 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 0.20318480450631826, + "learning_rate": 3.5678403987032616e-06, + "loss": 0.0278, + "num_tokens": 155179493.0, + "step": 363 + }, + { + "epoch": 3.2799097065462752, + "grad_norm": 0.19444987774849537, + "learning_rate": 3.544141711576754e-06, + "loss": 0.0276, + "num_tokens": 155601091.0, + "step": 364 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 0.20536549600417725, + "learning_rate": 3.5205097160821987e-06, + "loss": 0.0282, + "num_tokens": 156037078.0, + "step": 365 + }, + { + "epoch": 3.2979683972911964, + "grad_norm": 0.18858279405516468, + "learning_rate": 3.4969452180327614e-06, + "loss": 0.0267, + "num_tokens": 156459746.0, + "step": 366 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 0.18623646350532908, + "learning_rate": 3.4734490209400397e-06, + "loss": 0.0265, + "num_tokens": 156901429.0, + "step": 367 + }, + { + "epoch": 3.3160270880361176, + "grad_norm": 0.1901505375936395, + "learning_rate": 3.450021925986687e-06, + "loss": 0.025, + "num_tokens": 157331010.0, + "step": 368 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 0.21345722769478626, + "learning_rate": 3.4266647319990832e-06, + "loss": 0.0263, + "num_tokens": 157772602.0, + "step": 369 + }, + { + "epoch": 3.3340857787810383, + "grad_norm": 0.1958238651899149, + "learning_rate": 3.403378235420096e-06, + "loss": 0.0292, + "num_tokens": 158189457.0, + "step": 370 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 0.18727031340684455, + "learning_rate": 3.380163230281928e-06, + "loss": 0.0246, + "num_tokens": 158626349.0, + "step": 371 + }, + { + "epoch": 3.3521444695259595, + "grad_norm": 0.20757335733838128, + "learning_rate": 3.3570205081790285e-06, + "loss": 0.0281, + "num_tokens": 159042993.0, + "step": 372 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 0.19878621042154004, + "learning_rate": 3.3339508582411245e-06, + "loss": 0.027, + "num_tokens": 159461779.0, + "step": 373 + }, + { + "epoch": 3.37020316027088, + "grad_norm": 0.2030492855138732, + "learning_rate": 3.3109550671062907e-06, + "loss": 0.0271, + "num_tokens": 159889249.0, + "step": 374 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 0.22579464893889872, + "learning_rate": 3.288033918894137e-06, + "loss": 0.0286, + "num_tokens": 160318538.0, + "step": 375 + }, + { + "epoch": 3.3882618510158014, + "grad_norm": 0.20788827702420026, + "learning_rate": 3.265188195179071e-06, + "loss": 0.026, + "num_tokens": 160741586.0, + "step": 376 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 0.19448390888085226, + "learning_rate": 3.2424186749636455e-06, + "loss": 0.0277, + "num_tokens": 161157068.0, + "step": 377 + }, + { + "epoch": 3.4063205417607225, + "grad_norm": 0.20401661403345364, + "learning_rate": 3.2197261346519905e-06, + "loss": 0.0268, + "num_tokens": 161591338.0, + "step": 378 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 0.21813887981650087, + "learning_rate": 3.1971113480233556e-06, + "loss": 0.0286, + "num_tokens": 162030602.0, + "step": 379 + }, + { + "epoch": 3.4243792325056432, + "grad_norm": 0.18506918547250012, + "learning_rate": 3.1745750862057033e-06, + "loss": 0.0255, + "num_tokens": 162466006.0, + "step": 380 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 0.17245637994545798, + "learning_rate": 3.152118117649433e-06, + "loss": 0.0236, + "num_tokens": 162897575.0, + "step": 381 + }, + { + "epoch": 3.4424379232505644, + "grad_norm": 0.19229533800125223, + "learning_rate": 3.1297412081011686e-06, + "loss": 0.0273, + "num_tokens": 163303958.0, + "step": 382 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 0.18764583187050646, + "learning_rate": 3.1074451205776505e-06, + "loss": 0.0288, + "num_tokens": 163713904.0, + "step": 383 + }, + { + "epoch": 3.460496613995485, + "grad_norm": 0.1959737816129912, + "learning_rate": 3.0852306153397194e-06, + "loss": 0.028, + "num_tokens": 164137525.0, + "step": 384 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 0.19142757019118462, + "learning_rate": 3.063098449866384e-06, + "loss": 0.0255, + "num_tokens": 164577195.0, + "step": 385 + }, + { + "epoch": 3.4785553047404063, + "grad_norm": 0.20734003681544289, + "learning_rate": 3.0410493788290114e-06, + "loss": 0.0273, + "num_tokens": 165007790.0, + "step": 386 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 0.1966194914501041, + "learning_rate": 3.019084154065568e-06, + "loss": 0.0267, + "num_tokens": 165448590.0, + "step": 387 + }, + { + "epoch": 3.4966139954853275, + "grad_norm": 0.19765550724465786, + "learning_rate": 2.997203524555005e-06, + "loss": 0.0261, + "num_tokens": 165878673.0, + "step": 388 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 0.20386357030127975, + "learning_rate": 2.97540823639171e-06, + "loss": 0.0288, + "num_tokens": 166305211.0, + "step": 389 + }, + { + "epoch": 3.514672686230248, + "grad_norm": 0.21520042611672668, + "learning_rate": 2.953699032760067e-06, + "loss": 0.028, + "num_tokens": 166738863.0, + "step": 390 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 0.194197513003074, + "learning_rate": 2.932076653909115e-06, + "loss": 0.0267, + "num_tokens": 167150386.0, + "step": 391 + }, + { + "epoch": 3.5327313769751694, + "grad_norm": 0.21794710700752784, + "learning_rate": 2.910541837127305e-06, + "loss": 0.0308, + "num_tokens": 167571071.0, + "step": 392 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 0.20084089240718633, + "learning_rate": 2.889095316717366e-06, + "loss": 0.0298, + "num_tokens": 167985607.0, + "step": 393 + }, + { + "epoch": 3.55079006772009, + "grad_norm": 0.1873256285610558, + "learning_rate": 2.8677378239712607e-06, + "loss": 0.0264, + "num_tokens": 168420127.0, + "step": 394 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 0.19057620256847735, + "learning_rate": 2.846470087145249e-06, + "loss": 0.0254, + "num_tokens": 168853493.0, + "step": 395 + }, + { + "epoch": 3.5688487584650113, + "grad_norm": 0.19413012990199557, + "learning_rate": 2.8252928314350626e-06, + "loss": 0.0267, + "num_tokens": 169267253.0, + "step": 396 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 0.18472207554532166, + "learning_rate": 2.804206778951168e-06, + "loss": 0.0268, + "num_tokens": 169692290.0, + "step": 397 + }, + { + "epoch": 3.5869074492099324, + "grad_norm": 0.1947047194991214, + "learning_rate": 2.7832126486941456e-06, + "loss": 0.0272, + "num_tokens": 170118921.0, + "step": 398 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 0.19622906603016707, + "learning_rate": 2.7623111565301863e-06, + "loss": 0.0256, + "num_tokens": 170541906.0, + "step": 399 + }, + { + "epoch": 3.604966139954853, + "grad_norm": 0.20226034798427545, + "learning_rate": 2.7415030151666567e-06, + "loss": 0.0288, + "num_tokens": 170969534.0, + "step": 400 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 0.1936308496497604, + "learning_rate": 2.720788934127819e-06, + "loss": 0.0289, + "num_tokens": 171393139.0, + "step": 401 + }, + { + "epoch": 3.6230248306997743, + "grad_norm": 0.19668592329870727, + "learning_rate": 2.700169619730631e-06, + "loss": 0.0281, + "num_tokens": 171822275.0, + "step": 402 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 0.1991700741524214, + "learning_rate": 2.6796457750606487e-06, + "loss": 0.0267, + "num_tokens": 172269333.0, + "step": 403 + }, + { + "epoch": 3.6410835214446955, + "grad_norm": 0.1986610830135047, + "learning_rate": 2.659218099948079e-06, + "loss": 0.0299, + "num_tokens": 172666432.0, + "step": 404 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 0.23751564872208886, + "learning_rate": 2.6388872909438875e-06, + "loss": 0.0259, + "num_tokens": 173085250.0, + "step": 405 + }, + { + "epoch": 3.659142212189616, + "grad_norm": 0.1981989081945087, + "learning_rate": 2.618654041296068e-06, + "loss": 0.0293, + "num_tokens": 173503425.0, + "step": 406 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 0.18712382263484367, + "learning_rate": 2.5985190409259957e-06, + "loss": 0.0281, + "num_tokens": 173915204.0, + "step": 407 + }, + { + "epoch": 3.6772009029345374, + "grad_norm": 0.17357413799427998, + "learning_rate": 2.5784829764049013e-06, + "loss": 0.0226, + "num_tokens": 174374964.0, + "step": 408 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 0.2123281451587569, + "learning_rate": 2.558546530930466e-06, + "loss": 0.0293, + "num_tokens": 174779332.0, + "step": 409 + }, + { + "epoch": 3.695259593679458, + "grad_norm": 0.19167695565069123, + "learning_rate": 2.5387103843035126e-06, + "loss": 0.028, + "num_tokens": 175200386.0, + "step": 410 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 0.19744183578883434, + "learning_rate": 2.5189752129048428e-06, + "loss": 0.0284, + "num_tokens": 175605971.0, + "step": 411 + }, + { + "epoch": 3.7133182844243793, + "grad_norm": 0.19382775119683274, + "learning_rate": 2.49934168967216e-06, + "loss": 0.0284, + "num_tokens": 176029017.0, + "step": 412 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 0.20108444144557538, + "learning_rate": 2.4798104840771294e-06, + "loss": 0.0286, + "num_tokens": 176454891.0, + "step": 413 + }, + { + "epoch": 3.7313769751693, + "grad_norm": 0.18135344649440682, + "learning_rate": 2.46038226210255e-06, + "loss": 0.0264, + "num_tokens": 176885789.0, + "step": 414 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 0.18906487985763146, + "learning_rate": 2.4410576862196435e-06, + "loss": 0.029, + "num_tokens": 177301433.0, + "step": 415 + }, + { + "epoch": 3.749435665914221, + "grad_norm": 0.2016510773057012, + "learning_rate": 2.4218374153654627e-06, + "loss": 0.0264, + "num_tokens": 177722579.0, + "step": 416 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 0.18779427160793136, + "learning_rate": 2.4027221049204347e-06, + "loss": 0.0266, + "num_tokens": 178144731.0, + "step": 417 + }, + { + "epoch": 3.7674943566591423, + "grad_norm": 0.18909569178817748, + "learning_rate": 2.383712406685995e-06, + "loss": 0.0266, + "num_tokens": 178574868.0, + "step": 418 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 0.1950155026411044, + "learning_rate": 2.364808968862378e-06, + "loss": 0.0271, + "num_tokens": 179005190.0, + "step": 419 + }, + { + "epoch": 3.785553047404063, + "grad_norm": 0.2100631689602541, + "learning_rate": 2.346012436026508e-06, + "loss": 0.0294, + "num_tokens": 179421217.0, + "step": 420 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 0.20136435725376678, + "learning_rate": 2.327323449110017e-06, + "loss": 0.0284, + "num_tokens": 179838546.0, + "step": 421 + }, + { + "epoch": 3.8036117381489842, + "grad_norm": 0.21336207628857884, + "learning_rate": 2.3087426453774002e-06, + "loss": 0.0309, + "num_tokens": 180250539.0, + "step": 422 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 0.19556920726792512, + "learning_rate": 2.290270658404271e-06, + "loss": 0.0278, + "num_tokens": 180672562.0, + "step": 423 + }, + { + "epoch": 3.8216704288939054, + "grad_norm": 0.18128988764785925, + "learning_rate": 2.2719081180557757e-06, + "loss": 0.0256, + "num_tokens": 181091537.0, + "step": 424 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 0.19817788412029655, + "learning_rate": 2.253655650465096e-06, + "loss": 0.0283, + "num_tokens": 181506969.0, + "step": 425 + }, + { + "epoch": 3.839729119638826, + "grad_norm": 0.202018039299863, + "learning_rate": 2.2355138780121166e-06, + "loss": 0.0251, + "num_tokens": 181969132.0, + "step": 426 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 0.2125254335557616, + "learning_rate": 2.2174834193021934e-06, + "loss": 0.0288, + "num_tokens": 182397825.0, + "step": 427 + }, + { + "epoch": 3.8577878103837473, + "grad_norm": 0.194037802262133, + "learning_rate": 2.199564889145058e-06, + "loss": 0.0252, + "num_tokens": 182846790.0, + "step": 428 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 0.19456825460267352, + "learning_rate": 2.181758898533866e-06, + "loss": 0.0282, + "num_tokens": 183262540.0, + "step": 429 + }, + { + "epoch": 3.875846501128668, + "grad_norm": 0.19670892133494983, + "learning_rate": 2.164066054624347e-06, + "loss": 0.0277, + "num_tokens": 183697485.0, + "step": 430 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 0.18889408722383957, + "learning_rate": 2.146486960714114e-06, + "loss": 0.0254, + "num_tokens": 184125479.0, + "step": 431 + }, + { + "epoch": 3.893905191873589, + "grad_norm": 0.2038533249002068, + "learning_rate": 2.129022216222085e-06, + "loss": 0.0271, + "num_tokens": 184544539.0, + "step": 432 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 0.21297764765106292, + "learning_rate": 2.111672416668048e-06, + "loss": 0.0288, + "num_tokens": 184949325.0, + "step": 433 + }, + { + "epoch": 3.91196388261851, + "grad_norm": 0.2054399947006486, + "learning_rate": 2.0944381536523526e-06, + "loss": 0.0281, + "num_tokens": 185353902.0, + "step": 434 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 0.19836086810457373, + "learning_rate": 2.077320014835738e-06, + "loss": 0.0263, + "num_tokens": 185775888.0, + "step": 435 + }, + { + "epoch": 3.930022573363431, + "grad_norm": 0.1881013038356834, + "learning_rate": 2.0603185839192914e-06, + "loss": 0.0259, + "num_tokens": 186203975.0, + "step": 436 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 0.20693333346367762, + "learning_rate": 2.043434440624551e-06, + "loss": 0.0282, + "num_tokens": 186639222.0, + "step": 437 + }, + { + "epoch": 3.9480812641083523, + "grad_norm": 0.1954652710366924, + "learning_rate": 2.0266681606737335e-06, + "loss": 0.0258, + "num_tokens": 187080093.0, + "step": 438 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 0.20223843483684298, + "learning_rate": 2.0100203157701066e-06, + "loss": 0.0263, + "num_tokens": 187510794.0, + "step": 439 + }, + { + "epoch": 3.966139954853273, + "grad_norm": 0.19525308559707952, + "learning_rate": 1.993491473578491e-06, + "loss": 0.0259, + "num_tokens": 187954041.0, + "step": 440 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 0.21128072662634845, + "learning_rate": 1.9770821977059026e-06, + "loss": 0.0258, + "num_tokens": 188383605.0, + "step": 441 + }, + { + "epoch": 3.984198645598194, + "grad_norm": 0.18577142836651914, + "learning_rate": 1.9607930476823467e-06, + "loss": 0.0255, + "num_tokens": 188821681.0, + "step": 442 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 0.1982522512127324, + "learning_rate": 1.9446245789417194e-06, + "loss": 0.0263, + "num_tokens": 189274421.0, + "step": 443 + }, + { + "epoch": 4.0, + "grad_norm": 0.21396738069349763, + "learning_rate": 1.928577342802885e-06, + "loss": 0.0265, + "num_tokens": 189586064.0, + "step": 444 + }, + { + "epoch": 4.0, + "eval_loss": 0.09049772471189499, + "eval_num_tokens": 189586064.0, + "eval_runtime": 53.1262, + "eval_samples_per_second": 47.152, + "eval_steps_per_second": 5.91, + "step": 444 + }, + { + "epoch": 4.00902934537246, + "grad_norm": 0.15418645017329702, + "learning_rate": 1.9126518864508685e-06, + "loss": 0.0214, + "num_tokens": 190012482.0, + "step": 445 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 0.14100499317108237, + "learning_rate": 1.8968487529181967e-06, + "loss": 0.0188, + "num_tokens": 190450990.0, + "step": 446 + }, + { + "epoch": 4.027088036117381, + "grad_norm": 0.16697626697347936, + "learning_rate": 1.8811684810663915e-06, + "loss": 0.0221, + "num_tokens": 190867207.0, + "step": 447 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 0.15622351235420653, + "learning_rate": 1.8656116055675816e-06, + "loss": 0.0203, + "num_tokens": 191302488.0, + "step": 448 + }, + { + "epoch": 4.045146726862303, + "grad_norm": 0.16643081164859824, + "learning_rate": 1.85017865688628e-06, + "loss": 0.0215, + "num_tokens": 191733936.0, + "step": 449 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 0.15821211882565347, + "learning_rate": 1.8348701612612951e-06, + "loss": 0.0205, + "num_tokens": 192167735.0, + "step": 450 + }, + { + "epoch": 4.063205417607223, + "grad_norm": 0.15274737321085244, + "learning_rate": 1.819686640687785e-06, + "loss": 0.0222, + "num_tokens": 192575157.0, + "step": 451 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 0.14926458971205808, + "learning_rate": 1.8046286128994578e-06, + "loss": 0.0196, + "num_tokens": 193021330.0, + "step": 452 + }, + { + "epoch": 4.081264108352144, + "grad_norm": 0.17212275585617468, + "learning_rate": 1.7896965913509213e-06, + "loss": 0.0189, + "num_tokens": 193464296.0, + "step": 453 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 0.156919720923235, + "learning_rate": 1.7748910852001684e-06, + "loss": 0.0189, + "num_tokens": 193896598.0, + "step": 454 + }, + { + "epoch": 4.099322799097066, + "grad_norm": 0.17124727090479228, + "learning_rate": 1.7602125992912239e-06, + "loss": 0.0203, + "num_tokens": 194322822.0, + "step": 455 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 0.18510272198987668, + "learning_rate": 1.7456616341369237e-06, + "loss": 0.0203, + "num_tokens": 194756207.0, + "step": 456 + }, + { + "epoch": 4.1173814898419865, + "grad_norm": 0.19407435700708406, + "learning_rate": 1.7312386859018517e-06, + "loss": 0.0238, + "num_tokens": 195185054.0, + "step": 457 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 0.1948796133416149, + "learning_rate": 1.7169442463854208e-06, + "loss": 0.0233, + "num_tokens": 195599914.0, + "step": 458 + }, + { + "epoch": 4.135440180586907, + "grad_norm": 0.18649333768434115, + "learning_rate": 1.7027788030050967e-06, + "loss": 0.0209, + "num_tokens": 196011879.0, + "step": 459 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 0.18356123055194207, + "learning_rate": 1.6887428387797942e-06, + "loss": 0.0203, + "num_tokens": 196428514.0, + "step": 460 + }, + { + "epoch": 4.153498871331829, + "grad_norm": 0.1879983996789742, + "learning_rate": 1.6748368323133868e-06, + "loss": 0.0224, + "num_tokens": 196847106.0, + "step": 461 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 0.17315203900440446, + "learning_rate": 1.6610612577784009e-06, + "loss": 0.0191, + "num_tokens": 197290686.0, + "step": 462 + }, + { + "epoch": 4.1715575620767495, + "grad_norm": 0.170525796804323, + "learning_rate": 1.6474165848998439e-06, + "loss": 0.0178, + "num_tokens": 197727840.0, + "step": 463 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 0.18053575499024638, + "learning_rate": 1.633903278939185e-06, + "loss": 0.0206, + "num_tokens": 198145792.0, + "step": 464 + }, + { + "epoch": 4.18961625282167, + "grad_norm": 0.17038024710969035, + "learning_rate": 1.6205218006784934e-06, + "loss": 0.0198, + "num_tokens": 198577289.0, + "step": 465 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 0.1818920344481054, + "learning_rate": 1.6072726064047212e-06, + "loss": 0.0192, + "num_tokens": 199021897.0, + "step": 466 + }, + { + "epoch": 4.207674943566591, + "grad_norm": 0.2133634307635849, + "learning_rate": 1.5941561478941563e-06, + "loss": 0.0235, + "num_tokens": 199437131.0, + "step": 467 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 0.1842557897841013, + "learning_rate": 1.5811728723970019e-06, + "loss": 0.0198, + "num_tokens": 199863916.0, + "step": 468 + }, + { + "epoch": 4.225733634311513, + "grad_norm": 0.17446476857607085, + "learning_rate": 1.568323222622138e-06, + "loss": 0.0197, + "num_tokens": 200298569.0, + "step": 469 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 0.16733228471185502, + "learning_rate": 1.5556076367220218e-06, + "loss": 0.018, + "num_tokens": 200746877.0, + "step": 470 + }, + { + "epoch": 4.243792325056433, + "grad_norm": 0.179628482242242, + "learning_rate": 1.543026548277746e-06, + "loss": 0.0207, + "num_tokens": 201152308.0, + "step": 471 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 0.19151302252076471, + "learning_rate": 1.5305803862842569e-06, + "loss": 0.0219, + "num_tokens": 201575609.0, + "step": 472 + }, + { + "epoch": 4.261851015801354, + "grad_norm": 0.18919554634088748, + "learning_rate": 1.5182695751357245e-06, + "loss": 0.0211, + "num_tokens": 201998154.0, + "step": 473 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 0.17747943624556425, + "learning_rate": 1.5060945346110707e-06, + "loss": 0.0201, + "num_tokens": 202418768.0, + "step": 474 + }, + { + "epoch": 4.279909706546276, + "grad_norm": 0.17001099961527702, + "learning_rate": 1.4940556798596585e-06, + "loss": 0.0212, + "num_tokens": 202847509.0, + "step": 475 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 0.1736578759132605, + "learning_rate": 1.4821534213871344e-06, + "loss": 0.0194, + "num_tokens": 203293633.0, + "step": 476 + }, + { + "epoch": 4.297968397291196, + "grad_norm": 0.19406265996719613, + "learning_rate": 1.4703881650414304e-06, + "loss": 0.0221, + "num_tokens": 203726576.0, + "step": 477 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 0.19260019267218204, + "learning_rate": 1.4587603119989263e-06, + "loss": 0.0205, + "num_tokens": 204143709.0, + "step": 478 + }, + { + "epoch": 4.316027088036117, + "grad_norm": 0.1873666100999453, + "learning_rate": 1.4472702587507655e-06, + "loss": 0.0203, + "num_tokens": 204573732.0, + "step": 479 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 0.1897998928167092, + "learning_rate": 1.435918397089347e-06, + "loss": 0.0206, + "num_tokens": 205005583.0, + "step": 480 + }, + { + "epoch": 4.334085778781039, + "grad_norm": 0.18057188660990434, + "learning_rate": 1.4247051140949513e-06, + "loss": 0.0195, + "num_tokens": 205437157.0, + "step": 481 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 0.18651736387988782, + "learning_rate": 1.4136307921225513e-06, + "loss": 0.0222, + "num_tokens": 205850496.0, + "step": 482 + }, + { + "epoch": 4.3521444695259595, + "grad_norm": 0.17170882688956685, + "learning_rate": 1.4026958087887723e-06, + "loss": 0.0214, + "num_tokens": 206281600.0, + "step": 483 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 0.19127737771243922, + "learning_rate": 1.3919005369590132e-06, + "loss": 0.0238, + "num_tokens": 206692358.0, + "step": 484 + }, + { + "epoch": 4.37020316027088, + "grad_norm": 0.17790295399194844, + "learning_rate": 1.381245344734739e-06, + "loss": 0.0203, + "num_tokens": 207116740.0, + "step": 485 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 0.1663213943446299, + "learning_rate": 1.3707305954409194e-06, + "loss": 0.0199, + "num_tokens": 207550858.0, + "step": 486 + }, + { + "epoch": 4.388261851015802, + "grad_norm": 0.17560067742983293, + "learning_rate": 1.3603566476136488e-06, + "loss": 0.0218, + "num_tokens": 207978354.0, + "step": 487 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 0.1868314805293888, + "learning_rate": 1.3501238549879156e-06, + "loss": 0.0199, + "num_tokens": 208410302.0, + "step": 488 + }, + { + "epoch": 4.4063205417607225, + "grad_norm": 0.1856195164389372, + "learning_rate": 1.3400325664855437e-06, + "loss": 0.0204, + "num_tokens": 208812870.0, + "step": 489 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 0.17420216191906296, + "learning_rate": 1.3300831262032925e-06, + "loss": 0.0191, + "num_tokens": 209254543.0, + "step": 490 + }, + { + "epoch": 4.424379232505643, + "grad_norm": 0.19181688301449054, + "learning_rate": 1.3202758734011244e-06, + "loss": 0.0223, + "num_tokens": 209669464.0, + "step": 491 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 0.17557989751907357, + "learning_rate": 1.3106111424906355e-06, + "loss": 0.0217, + "num_tokens": 210092796.0, + "step": 492 + }, + { + "epoch": 4.442437923250564, + "grad_norm": 0.1975584628146046, + "learning_rate": 1.3010892630236568e-06, + "loss": 0.0202, + "num_tokens": 210539090.0, + "step": 493 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 0.17617895289518623, + "learning_rate": 1.2917105596810112e-06, + "loss": 0.02, + "num_tokens": 210962860.0, + "step": 494 + }, + { + "epoch": 4.460496613995486, + "grad_norm": 0.18368645306428724, + "learning_rate": 1.2824753522614473e-06, + "loss": 0.0204, + "num_tokens": 211389239.0, + "step": 495 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 0.16350779571812799, + "learning_rate": 1.273383955670732e-06, + "loss": 0.0186, + "num_tokens": 211848433.0, + "step": 496 + }, + { + "epoch": 4.478555304740406, + "grad_norm": 0.16374756174145066, + "learning_rate": 1.2644366799109118e-06, + "loss": 0.0195, + "num_tokens": 212286944.0, + "step": 497 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 0.16562432322005863, + "learning_rate": 1.2556338300697485e-06, + "loss": 0.0204, + "num_tokens": 212714521.0, + "step": 498 + }, + { + "epoch": 4.496613995485327, + "grad_norm": 0.17526601821860907, + "learning_rate": 1.2469757063103061e-06, + "loss": 0.0192, + "num_tokens": 213148368.0, + "step": 499 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 0.16593144111118333, + "learning_rate": 1.2384626038607255e-06, + "loss": 0.0212, + "num_tokens": 213576096.0, + "step": 500 + }, + { + "epoch": 4.514672686230249, + "grad_norm": 0.17175436256108279, + "learning_rate": 1.2300948130041515e-06, + "loss": 0.0198, + "num_tokens": 213992716.0, + "step": 501 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 0.18448107353481236, + "learning_rate": 1.2218726190688356e-06, + "loss": 0.0206, + "num_tokens": 214418244.0, + "step": 502 + }, + { + "epoch": 4.532731376975169, + "grad_norm": 0.17500609094863792, + "learning_rate": 1.2137963024184115e-06, + "loss": 0.0207, + "num_tokens": 214857117.0, + "step": 503 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 0.17488600150980588, + "learning_rate": 1.2058661384423267e-06, + "loss": 0.0202, + "num_tokens": 215275416.0, + "step": 504 + }, + { + "epoch": 4.55079006772009, + "grad_norm": 0.17628502659157805, + "learning_rate": 1.1980823975464593e-06, + "loss": 0.0207, + "num_tokens": 215695066.0, + "step": 505 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 0.16652382229346105, + "learning_rate": 1.1904453451438951e-06, + "loss": 0.0199, + "num_tokens": 216118838.0, + "step": 506 + }, + { + "epoch": 4.568848758465011, + "grad_norm": 0.17416258725445835, + "learning_rate": 1.1829552416458775e-06, + "loss": 0.0206, + "num_tokens": 216542622.0, + "step": 507 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 0.17503846455221217, + "learning_rate": 1.1756123424529266e-06, + "loss": 0.0195, + "num_tokens": 216996879.0, + "step": 508 + }, + { + "epoch": 4.586907449209932, + "grad_norm": 0.1854930469776265, + "learning_rate": 1.1684168979461336e-06, + "loss": 0.0196, + "num_tokens": 217423398.0, + "step": 509 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 0.18437347020344613, + "learning_rate": 1.1613691534786196e-06, + "loss": 0.0204, + "num_tokens": 217840389.0, + "step": 510 + }, + { + "epoch": 4.604966139954853, + "grad_norm": 0.16995963371542427, + "learning_rate": 1.1544693493671712e-06, + "loss": 0.0204, + "num_tokens": 218270640.0, + "step": 511 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 0.17476739578001602, + "learning_rate": 1.1477177208840482e-06, + "loss": 0.0199, + "num_tokens": 218713265.0, + "step": 512 + }, + { + "epoch": 4.623024830699774, + "grad_norm": 0.17680781235832188, + "learning_rate": 1.1411144982489562e-06, + "loss": 0.0194, + "num_tokens": 219137609.0, + "step": 513 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 0.19266113865684587, + "learning_rate": 1.1346599066212008e-06, + "loss": 0.0214, + "num_tokens": 219578547.0, + "step": 514 + }, + { + "epoch": 4.6410835214446955, + "grad_norm": 0.1712887481277758, + "learning_rate": 1.128354166092009e-06, + "loss": 0.0179, + "num_tokens": 220026371.0, + "step": 515 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 0.19407164153838413, + "learning_rate": 1.1221974916770236e-06, + "loss": 0.022, + "num_tokens": 220433942.0, + "step": 516 + }, + { + "epoch": 4.659142212189616, + "grad_norm": 0.16510220010100865, + "learning_rate": 1.11619009330897e-06, + "loss": 0.0183, + "num_tokens": 220867424.0, + "step": 517 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 0.17296028711993447, + "learning_rate": 1.1103321758305028e-06, + "loss": 0.0194, + "num_tokens": 221290561.0, + "step": 518 + }, + { + "epoch": 4.677200902934537, + "grad_norm": 0.18098815155223497, + "learning_rate": 1.104623938987216e-06, + "loss": 0.0213, + "num_tokens": 221699415.0, + "step": 519 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 0.14565155092583376, + "learning_rate": 1.0990655774208339e-06, + "loss": 0.0172, + "num_tokens": 222150407.0, + "step": 520 + }, + { + "epoch": 4.6952595936794586, + "grad_norm": 0.19331583845274686, + "learning_rate": 1.0936572806625755e-06, + "loss": 0.0222, + "num_tokens": 222572283.0, + "step": 521 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 0.17126454258481613, + "learning_rate": 1.0883992331266883e-06, + "loss": 0.0216, + "num_tokens": 222986703.0, + "step": 522 + }, + { + "epoch": 4.713318284424379, + "grad_norm": 0.18948406487563782, + "learning_rate": 1.0832916141041655e-06, + "loss": 0.0205, + "num_tokens": 223408969.0, + "step": 523 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 0.19297817875902212, + "learning_rate": 1.078334597756625e-06, + "loss": 0.0212, + "num_tokens": 223835474.0, + "step": 524 + }, + { + "epoch": 4.7313769751693, + "grad_norm": 0.18397839966809734, + "learning_rate": 1.0735283531103781e-06, + "loss": 0.0207, + "num_tokens": 224259986.0, + "step": 525 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 0.17467329511775723, + "learning_rate": 1.0688730440506611e-06, + "loss": 0.0193, + "num_tokens": 224699745.0, + "step": 526 + }, + { + "epoch": 4.749435665914222, + "grad_norm": 0.17902996216481953, + "learning_rate": 1.0643688293160503e-06, + "loss": 0.0177, + "num_tokens": 225145143.0, + "step": 527 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 0.17228792421467223, + "learning_rate": 1.0600158624930462e-06, + "loss": 0.019, + "num_tokens": 225578145.0, + "step": 528 + }, + { + "epoch": 4.767494356659142, + "grad_norm": 0.17797664151251616, + "learning_rate": 1.0558142920108394e-06, + "loss": 0.0212, + "num_tokens": 226003984.0, + "step": 529 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 0.18684000945026896, + "learning_rate": 1.0517642611362464e-06, + "loss": 0.021, + "num_tokens": 226426088.0, + "step": 530 + }, + { + "epoch": 4.785553047404063, + "grad_norm": 0.17968004856043132, + "learning_rate": 1.047865907968827e-06, + "loss": 0.0196, + "num_tokens": 226858806.0, + "step": 531 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 0.17275763170461778, + "learning_rate": 1.0441193654361755e-06, + "loss": 0.0207, + "num_tokens": 227291465.0, + "step": 532 + }, + { + "epoch": 4.803611738148984, + "grad_norm": 0.17520212708362584, + "learning_rate": 1.0405247612893841e-06, + "loss": 0.0199, + "num_tokens": 227721309.0, + "step": 533 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 0.18975116685538082, + "learning_rate": 1.037082218098692e-06, + "loss": 0.021, + "num_tokens": 228142508.0, + "step": 534 + }, + { + "epoch": 4.821670428893905, + "grad_norm": 0.19153487232812502, + "learning_rate": 1.0337918532493027e-06, + "loss": 0.0215, + "num_tokens": 228560446.0, + "step": 535 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 0.20996193220725706, + "learning_rate": 1.0306537789373832e-06, + "loss": 0.022, + "num_tokens": 228983150.0, + "step": 536 + }, + { + "epoch": 4.839729119638826, + "grad_norm": 0.20126150758242514, + "learning_rate": 1.027668102166235e-06, + "loss": 0.0226, + "num_tokens": 229412211.0, + "step": 537 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 0.1573000482505692, + "learning_rate": 1.02483492474265e-06, + "loss": 0.0195, + "num_tokens": 229854229.0, + "step": 538 + }, + { + "epoch": 4.857787810383747, + "grad_norm": 0.17919483489104251, + "learning_rate": 1.0221543432734369e-06, + "loss": 0.0208, + "num_tokens": 230272495.0, + "step": 539 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 0.17970864259308456, + "learning_rate": 1.0196264491621247e-06, + "loss": 0.0215, + "num_tokens": 230670995.0, + "step": 540 + }, + { + "epoch": 4.8758465011286685, + "grad_norm": 0.17424710710496075, + "learning_rate": 1.0172513286058505e-06, + "loss": 0.0207, + "num_tokens": 231097273.0, + "step": 541 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 0.17129200768842348, + "learning_rate": 1.015029062592418e-06, + "loss": 0.0196, + "num_tokens": 231529230.0, + "step": 542 + }, + { + "epoch": 4.893905191873589, + "grad_norm": 0.2086105902915424, + "learning_rate": 1.012959726897535e-06, + "loss": 0.0242, + "num_tokens": 231936065.0, + "step": 543 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 0.18905729339176797, + "learning_rate": 1.0110433920822306e-06, + "loss": 0.0201, + "num_tokens": 232357557.0, + "step": 544 + }, + { + "epoch": 4.91196388261851, + "grad_norm": 0.1922786676997605, + "learning_rate": 1.009280123490451e-06, + "loss": 0.0207, + "num_tokens": 232776441.0, + "step": 545 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 0.19087910354270035, + "learning_rate": 1.0076699812468264e-06, + "loss": 0.021, + "num_tokens": 233194997.0, + "step": 546 + }, + { + "epoch": 4.9300225733634315, + "grad_norm": 0.18786258054280564, + "learning_rate": 1.0062130202546278e-06, + "loss": 0.0201, + "num_tokens": 233614735.0, + "step": 547 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 0.1702857862148093, + "learning_rate": 1.0049092901938875e-06, + "loss": 0.0183, + "num_tokens": 234054550.0, + "step": 548 + }, + { + "epoch": 4.948081264108352, + "grad_norm": 0.19503784356256793, + "learning_rate": 1.0037588355197116e-06, + "loss": 0.0207, + "num_tokens": 234481341.0, + "step": 549 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 0.18748199022689804, + "learning_rate": 1.00276169546076e-06, + "loss": 0.0218, + "num_tokens": 234907817.0, + "step": 550 + }, + { + "epoch": 4.966139954853273, + "grad_norm": 0.16534471540720577, + "learning_rate": 1.0019179040179093e-06, + "loss": 0.0171, + "num_tokens": 235357272.0, + "step": 551 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 0.182782300003444, + "learning_rate": 1.0012274899630954e-06, + "loss": 0.0201, + "num_tokens": 235805693.0, + "step": 552 + }, + { + "epoch": 4.984198645598195, + "grad_norm": 0.18688268444184455, + "learning_rate": 1.0006904768383305e-06, + "loss": 0.0212, + "num_tokens": 236221003.0, + "step": 553 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 0.18372657341195586, + "learning_rate": 1.0003068829549017e-06, + "loss": 0.0204, + "num_tokens": 236660222.0, + "step": 554 + }, + { + "epoch": 5.0, + "grad_norm": 0.21976081180864657, + "learning_rate": 1.0000767213927445e-06, + "loss": 0.0218, + "num_tokens": 236984937.0, + "step": 555 + }, + { + "epoch": 5.0, + "eval_loss": 0.10813906788825989, + "eval_num_tokens": 236984937.0, + "eval_runtime": 53.1583, + "eval_samples_per_second": 47.123, + "eval_steps_per_second": 5.907, + "step": 555 + }, + { + "epoch": 5.0, + "step": 555, + "total_flos": 7.792360049627628e+17, + "train_loss": 0.04186144905114496, + "train_runtime": 7507.8207, + "train_samples_per_second": 9.436, + "train_steps_per_second": 0.074 + } + ], + "logging_steps": 1, + "max_steps": 555, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.792360049627628e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}