diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -4,1282 +4,10207 @@ "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 51, - "global_step": 154, + "global_step": 1246, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.013018714401952807, - "grad_norm": 210.16377525400654, + "epoch": 0.0016073940124573037, + "grad_norm": 158.2639349654095, "learning_rate": 0.0, - "loss": 68.6058, - "num_tokens": 450093.0, + "loss": 38.4221, + "num_tokens": 429697.0, "step": 1 }, { - "epoch": 0.026037428803905614, - "grad_norm": 213.21647342533257, - "learning_rate": 2.0000000000000003e-06, - "loss": 69.1255, - "num_tokens": 887137.0, + "epoch": 0.0032147880249146074, + "grad_norm": 145.4995154142515, + "learning_rate": 2.6315789473684213e-07, + "loss": 36.4452, + "num_tokens": 834234.0, "step": 2 }, { - "epoch": 0.03905614320585842, - "grad_norm": 195.68816189219862, - "learning_rate": 4.000000000000001e-06, - "loss": 68.5814, - "num_tokens": 1328102.0, + "epoch": 0.004822182037371911, + "grad_norm": 186.23616936341938, + "learning_rate": 5.263157894736843e-07, + "loss": 37.315, + "num_tokens": 1213950.0, "step": 3 }, { - "epoch": 0.05207485760781123, - "grad_norm": 173.50104843578222, - "learning_rate": 6e-06, - "loss": 66.4735, - "num_tokens": 1755609.0, + "epoch": 0.006429576049829215, + "grad_norm": 180.6052116526557, + "learning_rate": 7.894736842105263e-07, + "loss": 37.7042, + "num_tokens": 1582170.0, "step": 4 }, { - "epoch": 0.06509357200976404, - "grad_norm": 117.42225250215388, - "learning_rate": 8.000000000000001e-06, - "loss": 62.9682, - "num_tokens": 2198437.0, + "epoch": 0.008036970062286517, + "grad_norm": 169.51269224738985, + "learning_rate": 1.0526315789473685e-06, + "loss": 38.6927, + "num_tokens": 1950850.0, "step": 5 }, { - "epoch": 0.07811228641171684, - "grad_norm": 81.06684565183237, - "learning_rate": 1e-05, - "loss": 59.2928, - "num_tokens": 2599764.0, + "epoch": 0.009644364074743821, + "grad_norm": 161.12136422037688, + "learning_rate": 1.3157894736842106e-06, + "loss": 38.695, + "num_tokens": 2310664.0, "step": 6 }, { - "epoch": 0.09113100081366965, - "grad_norm": 79.2655000627535, - "learning_rate": 9.998999784368282e-06, - "loss": 57.5591, - "num_tokens": 2996134.0, + "epoch": 0.011251758087201125, + "grad_norm": 98.65878517351578, + "learning_rate": 1.5789473684210526e-06, + "loss": 34.9695, + "num_tokens": 2775781.0, "step": 7 }, { - "epoch": 0.10414971521562245, - "grad_norm": 230.40557467759052, - "learning_rate": 9.995999582109266e-06, - "loss": 55.484, - "num_tokens": 3399656.0, + "epoch": 0.01285915209965843, + "grad_norm": 95.54406385402243, + "learning_rate": 1.8421052631578948e-06, + "loss": 34.3435, + "num_tokens": 3218804.0, "step": 8 }, { - "epoch": 0.11716842961757526, - "grad_norm": 138.19582101554622, - "learning_rate": 9.991000726933702e-06, - "loss": 54.3088, - "num_tokens": 3801972.0, + "epoch": 0.014466546112115732, + "grad_norm": 105.73320283163156, + "learning_rate": 2.105263157894737e-06, + "loss": 36.1955, + "num_tokens": 3583954.0, "step": 9 }, { - "epoch": 0.13018714401952808, - "grad_norm": 103.41981563367382, - "learning_rate": 9.984005441034079e-06, - "loss": 51.8492, - "num_tokens": 4164196.0, + "epoch": 0.016073940124573034, + "grad_norm": 66.95293937847792, + "learning_rate": 2.368421052631579e-06, + "loss": 33.9094, + "num_tokens": 4012626.0, "step": 10 }, { - "epoch": 0.14320585842148087, - "grad_norm": 135.95872356237243, - "learning_rate": 9.97501683409675e-06, - "loss": 52.1257, - "num_tokens": 4570280.0, + "epoch": 0.01768133413703034, + "grad_norm": 55.58271226047935, + "learning_rate": 2.631578947368421e-06, + "loss": 33.0959, + "num_tokens": 4445446.0, "step": 11 }, { - "epoch": 0.1562245728234337, - "grad_norm": 84.52254968743094, - "learning_rate": 9.964038901919573e-06, - "loss": 51.4563, - "num_tokens": 5020942.0, + "epoch": 0.019288728149487643, + "grad_norm": 61.05024480855454, + "learning_rate": 2.8947368421052634e-06, + "loss": 31.8251, + "num_tokens": 4844317.0, "step": 12 }, { - "epoch": 0.16924328722538648, - "grad_norm": 65.86930480360061, - "learning_rate": 9.951076524635593e-06, - "loss": 51.3736, - "num_tokens": 5425967.0, + "epoch": 0.020896122161944947, + "grad_norm": 48.862403429053636, + "learning_rate": 3.157894736842105e-06, + "loss": 32.1968, + "num_tokens": 5253670.0, "step": 13 }, { - "epoch": 0.1822620016273393, - "grad_norm": 46.06798519871614, - "learning_rate": 9.936135464543652e-06, - "loss": 50.5119, - "num_tokens": 5851273.0, + "epoch": 0.02250351617440225, + "grad_norm": 34.36514937704088, + "learning_rate": 3.421052631578948e-06, + "loss": 32.211, + "num_tokens": 5729512.0, "step": 14 }, { - "epoch": 0.19528071602929212, - "grad_norm": 44.27329631066422, - "learning_rate": 9.919222363546797e-06, - "loss": 49.1355, - "num_tokens": 6239473.0, + "epoch": 0.024110910186859555, + "grad_norm": 49.30488083250465, + "learning_rate": 3.6842105263157896e-06, + "loss": 28.194, + "num_tokens": 6100540.0, "step": 15 }, { - "epoch": 0.2082994304312449, - "grad_norm": 40.91130953224105, - "learning_rate": 9.900344740199691e-06, - "loss": 48.6006, - "num_tokens": 6643168.0, + "epoch": 0.02571830419931686, + "grad_norm": 38.449649511690374, + "learning_rate": 3.947368421052632e-06, + "loss": 28.8826, + "num_tokens": 6511297.0, "step": 16 }, { - "epoch": 0.22131814483319773, - "grad_norm": 51.01053647447493, - "learning_rate": 9.879510986366321e-06, - "loss": 49.2755, - "num_tokens": 7067385.0, + "epoch": 0.02732569821177416, + "grad_norm": 54.732441828875096, + "learning_rate": 4.210526315789474e-06, + "loss": 27.2258, + "num_tokens": 6871630.0, "step": 17 }, { - "epoch": 0.23433685923515052, - "grad_norm": 37.04542472189647, - "learning_rate": 9.856730363489465e-06, - "loss": 49.4077, - "num_tokens": 7490781.0, + "epoch": 0.028933092224231464, + "grad_norm": 54.3702700040615, + "learning_rate": 4.473684210526316e-06, + "loss": 29.0712, + "num_tokens": 7267796.0, "step": 18 }, { - "epoch": 0.24735557363710334, - "grad_norm": 31.253768022566376, - "learning_rate": 9.832012998473612e-06, - "loss": 49.1242, - "num_tokens": 7915658.0, + "epoch": 0.030540486236688768, + "grad_norm": 44.32741982843244, + "learning_rate": 4.736842105263158e-06, + "loss": 29.7555, + "num_tokens": 7726887.0, "step": 19 }, { - "epoch": 0.26037428803905616, - "grad_norm": 37.545554529167326, - "learning_rate": 9.805369879183143e-06, - "loss": 47.399, - "num_tokens": 8326723.0, + "epoch": 0.03214788024914607, + "grad_norm": 36.82706808218641, + "learning_rate": 5e-06, + "loss": 28.568, + "num_tokens": 8175090.0, "step": 20 }, { - "epoch": 0.27339300244100895, - "grad_norm": 25.04275025437533, - "learning_rate": 9.77681284955779e-06, - "loss": 47.4615, - "num_tokens": 8707910.0, + "epoch": 0.03375527426160337, + "grad_norm": 29.77153235210757, + "learning_rate": 5.263157894736842e-06, + "loss": 27.0873, + "num_tokens": 8586423.0, "step": 21 }, { - "epoch": 0.28641171684296174, - "grad_norm": 24.37445600089813, - "learning_rate": 9.74635460434752e-06, - "loss": 46.6855, - "num_tokens": 9137522.0, + "epoch": 0.03536266827406068, + "grad_norm": 22.058553049885592, + "learning_rate": 5.526315789473685e-06, + "loss": 28.311, + "num_tokens": 9009584.0, "step": 22 }, { - "epoch": 0.2994304312449146, - "grad_norm": 17.823377115061227, - "learning_rate": 9.714008683469212e-06, - "loss": 47.1495, - "num_tokens": 9532748.0, + "epoch": 0.03697006228651798, + "grad_norm": 17.651713527060586, + "learning_rate": 5.789473684210527e-06, + "loss": 24.8678, + "num_tokens": 9402644.0, "step": 23 }, { - "epoch": 0.3124491456468674, - "grad_norm": 22.71868864466224, - "learning_rate": 9.679789465987614e-06, - "loss": 46.9846, - "num_tokens": 9984555.0, + "epoch": 0.038577456298975285, + "grad_norm": 23.650041658817525, + "learning_rate": 6.0526315789473685e-06, + "loss": 25.5893, + "num_tokens": 9829075.0, "step": 24 }, { - "epoch": 0.32546786004882017, - "grad_norm": 13.991596255713164, - "learning_rate": 9.643712163723271e-06, - "loss": 47.7722, - "num_tokens": 10416508.0, + "epoch": 0.04018485031143259, + "grad_norm": 25.074620673028285, + "learning_rate": 6.31578947368421e-06, + "loss": 27.5734, + "num_tokens": 10285438.0, "step": 25 }, { - "epoch": 0.33848657445077296, - "grad_norm": 16.613683272470166, - "learning_rate": 9.605792814490263e-06, - "loss": 46.8332, - "num_tokens": 10884479.0, + "epoch": 0.04179224432388989, + "grad_norm": 19.067605149682883, + "learning_rate": 6.578947368421054e-06, + "loss": 25.2745, + "num_tokens": 10698507.0, "step": 26 }, { - "epoch": 0.3515052888527258, - "grad_norm": 11.616610179074936, - "learning_rate": 9.566048274966745e-06, - "loss": 46.1208, - "num_tokens": 11292312.0, + "epoch": 0.0433996383363472, + "grad_norm": 15.577052560784281, + "learning_rate": 6.842105263157896e-06, + "loss": 25.3718, + "num_tokens": 11130895.0, "step": 27 }, { - "epoch": 0.3645240032546786, - "grad_norm": 15.943420560796007, - "learning_rate": 9.524496213201473e-06, - "loss": 45.1858, - "num_tokens": 11705689.0, + "epoch": 0.0450070323488045, + "grad_norm": 13.717551354593585, + "learning_rate": 7.1052631578947375e-06, + "loss": 24.8468, + "num_tokens": 11558592.0, "step": 28 }, { - "epoch": 0.3775427176566314, - "grad_norm": 15.401279875526852, - "learning_rate": 9.481155100759651e-06, - "loss": 47.0287, - "num_tokens": 12138908.0, + "epoch": 0.046614426361261806, + "grad_norm": 11.784650539336601, + "learning_rate": 7.368421052631579e-06, + "loss": 25.7097, + "num_tokens": 11965372.0, "step": 29 }, { - "epoch": 0.39056143205858423, - "grad_norm": 16.562354774139983, - "learning_rate": 9.436044204511575e-06, - "loss": 45.992, - "num_tokens": 12591230.0, + "epoch": 0.04822182037371911, + "grad_norm": 11.285355526500709, + "learning_rate": 7.631578947368423e-06, + "loss": 26.3512, + "num_tokens": 12402138.0, "step": 30 }, { - "epoch": 0.403580146460537, - "grad_norm": 17.165245041950023, - "learning_rate": 9.389183578067725e-06, - "loss": 45.325, - "num_tokens": 13003475.0, + "epoch": 0.049829214386176414, + "grad_norm": 14.306809053788204, + "learning_rate": 7.894736842105265e-06, + "loss": 25.5087, + "num_tokens": 12829377.0, "step": 31 }, { - "epoch": 0.4165988608624898, - "grad_norm": 12.373994254226883, - "learning_rate": 9.34059405286414e-06, - "loss": 46.4383, - "num_tokens": 13445762.0, + "epoch": 0.05143660839863372, + "grad_norm": 12.313343949505173, + "learning_rate": 8.157894736842106e-06, + "loss": 23.2879, + "num_tokens": 13178988.0, "step": 32 }, { - "epoch": 0.42961757526444266, - "grad_norm": 12.407033062851411, - "learning_rate": 9.290297228901994e-06, - "loss": 46.6786, - "num_tokens": 13878827.0, + "epoch": 0.05304400241109102, + "grad_norm": 11.770437016515455, + "learning_rate": 8.421052631578948e-06, + "loss": 22.3224, + "num_tokens": 13669987.0, "step": 33 }, { - "epoch": 0.44263628966639545, - "grad_norm": 12.403609207306305, - "learning_rate": 9.238315465145536e-06, - "loss": 46.8816, - "num_tokens": 14290143.0, + "epoch": 0.05465139642354832, + "grad_norm": 10.146974056064717, + "learning_rate": 8.68421052631579e-06, + "loss": 22.6575, + "num_tokens": 14083529.0, "step": 34 }, { - "epoch": 0.45565500406834825, - "grad_norm": 9.657929640852593, - "learning_rate": 9.184671869582617e-06, - "loss": 44.9468, - "num_tokens": 14659938.0, + "epoch": 0.05625879043600562, + "grad_norm": 11.234451832845155, + "learning_rate": 8.947368421052632e-06, + "loss": 22.6257, + "num_tokens": 14466619.0, "step": 35 }, { - "epoch": 0.46867371847030104, - "grad_norm": 9.1496954355886, - "learning_rate": 9.129390288952273e-06, - "loss": 47.3524, - "num_tokens": 15145254.0, + "epoch": 0.05786618444846293, + "grad_norm": 12.446328236886412, + "learning_rate": 9.210526315789474e-06, + "loss": 22.1305, + "num_tokens": 14838407.0, "step": 36 }, { - "epoch": 0.4816924328722539, - "grad_norm": 8.205712275414193, - "learning_rate": 9.072495298143876e-06, - "loss": 46.4164, - "num_tokens": 15603970.0, + "epoch": 0.05947357846092023, + "grad_norm": 14.929889570838686, + "learning_rate": 9.473684210526315e-06, + "loss": 24.8194, + "num_tokens": 15243968.0, "step": 37 }, { - "epoch": 0.4947111472742067, - "grad_norm": 9.964637928182375, - "learning_rate": 9.014012189272612e-06, - "loss": 46.9795, - "num_tokens": 16082088.0, + "epoch": 0.061080972473377536, + "grad_norm": 10.639120595053091, + "learning_rate": 9.736842105263159e-06, + "loss": 23.1306, + "num_tokens": 15594653.0, "step": 38 }, { - "epoch": 0.5077298616761595, - "grad_norm": 8.823549641220852, - "learning_rate": 8.953966960436125e-06, - "loss": 46.5117, - "num_tokens": 16544185.0, + "epoch": 0.06268836648583484, + "grad_norm": 10.787429509562896, + "learning_rate": 1e-05, + "loss": 23.9048, + "num_tokens": 16080401.0, "step": 39 }, { - "epoch": 0.5207485760781123, - "grad_norm": 11.388353330888256, - "learning_rate": 8.892386304157297e-06, - "loss": 45.316, - "num_tokens": 16968984.0, + "epoch": 0.06429576049829214, + "grad_norm": 9.474274523001743, + "learning_rate": 9.999984782330418e-06, + "loss": 22.3131, + "num_tokens": 16474437.0, "step": 40 }, { - "epoch": 0.5337672904800651, - "grad_norm": 10.310453379038181, - "learning_rate": 8.829297595518357e-06, - "loss": 45.9562, - "num_tokens": 17429415.0, + "epoch": 0.06590315451074945, + "grad_norm": 9.597567668205404, + "learning_rate": 9.999939129424594e-06, + "loss": 22.184, + "num_tokens": 16865300.0, "step": 41 }, { - "epoch": 0.5467860048820179, - "grad_norm": 9.482611851815543, - "learning_rate": 8.764728879991563e-06, - "loss": 44.9445, - "num_tokens": 17823109.0, + "epoch": 0.06751054852320675, + "grad_norm": 9.092870239068711, + "learning_rate": 9.999863041591292e-06, + "loss": 21.8109, + "num_tokens": 17245976.0, "step": 42 }, { - "epoch": 0.5598047192839707, - "grad_norm": 9.770004640780463, - "learning_rate": 8.698708860971837e-06, - "loss": 47.0569, - "num_tokens": 18287782.0, + "epoch": 0.06911794253566406, + "grad_norm": 9.195902541989362, + "learning_rate": 9.999756519345133e-06, + "loss": 20.4745, + "num_tokens": 17685985.0, "step": 43 }, { - "epoch": 0.5728234336859235, - "grad_norm": 9.047986541645175, - "learning_rate": 8.631266887016973e-06, - "loss": 42.3776, - "num_tokens": 18660336.0, + "epoch": 0.07072533654812135, + "grad_norm": 10.313867436771632, + "learning_rate": 9.999619563406567e-06, + "loss": 18.9403, + "num_tokens": 18127984.0, "step": 44 }, { - "epoch": 0.5858421480878763, - "grad_norm": 9.246260416027958, - "learning_rate": 8.56243293880101e-06, - "loss": 43.6804, - "num_tokens": 19032455.0, + "epoch": 0.07233273056057866, + "grad_norm": 8.421421460008881, + "learning_rate": 9.999452174701882e-06, + "loss": 21.3797, + "num_tokens": 18579086.0, "step": 45 }, { - "epoch": 0.5988608624898292, - "grad_norm": 9.03221371104339, - "learning_rate": 8.492237615786613e-06, - "loss": 45.493, - "num_tokens": 19422873.0, + "epoch": 0.07394012457303596, + "grad_norm": 9.089103053635672, + "learning_rate": 9.999254354363198e-06, + "loss": 19.2341, + "num_tokens": 18970588.0, "step": 46 }, { - "epoch": 0.611879576891782, - "grad_norm": 10.259229118131245, - "learning_rate": 8.42071212262238e-06, - "loss": 44.7961, - "num_tokens": 19816975.0, + "epoch": 0.07554751858549327, + "grad_norm": 8.350839874863304, + "learning_rate": 9.999026103728454e-06, + "loss": 18.8585, + "num_tokens": 19371063.0, "step": 47 }, { - "epoch": 0.6248982912937348, - "grad_norm": 9.826273442928137, - "learning_rate": 8.347888255271126e-06, - "loss": 45.7773, - "num_tokens": 20228714.0, + "epoch": 0.07715491259795057, + "grad_norm": 9.008540733933607, + "learning_rate": 9.998767424341402e-06, + "loss": 21.2206, + "num_tokens": 19800781.0, "step": 48 }, { - "epoch": 0.6379170056956875, - "grad_norm": 10.251197430046966, - "learning_rate": 8.273798386875292e-06, - "loss": 45.7943, - "num_tokens": 20664940.0, + "epoch": 0.07876230661040788, + "grad_norm": 8.748330756075763, + "learning_rate": 9.998478317951598e-06, + "loss": 19.913, + "num_tokens": 20230564.0, "step": 49 }, { - "epoch": 0.6509357200976403, - "grad_norm": 7.785523578312322, - "learning_rate": 8.198475453365772e-06, - "loss": 44.5926, - "num_tokens": 21080357.0, + "epoch": 0.08036970062286518, + "grad_norm": 8.699534420253277, + "learning_rate": 9.998158786514385e-06, + "loss": 22.724, + "num_tokens": 20631221.0, "step": 50 }, { - "epoch": 0.6639544344995931, - "grad_norm": 8.698660029347833, - "learning_rate": 8.12195293882058e-06, - "loss": 44.9233, - "num_tokens": 21500833.0, + "epoch": 0.08197709463532249, + "grad_norm": 9.358794928817712, + "learning_rate": 9.997808832190885e-06, + "loss": 23.2029, + "num_tokens": 21009167.0, "step": 51 }, { - "epoch": 0.6639544344995931, - "eval_loss": 2.61832857131958, - "eval_num_tokens": 21500833.0, - "eval_runtime": 47.4552, - "eval_samples_per_second": 23.032, - "eval_steps_per_second": 5.774, + "epoch": 0.08197709463532249, + "eval_loss": 1.192024827003479, + "eval_num_tokens": 21009167.0, + "eval_runtime": 376.6651, + "eval_samples_per_second": 23.49, + "eval_steps_per_second": 5.873, "step": 51 }, { - "epoch": 0.6769731489015459, - "grad_norm": 9.758397732795995, - "learning_rate": 8.044264860579816e-06, - "loss": 45.3207, - "num_tokens": 21919765.0, + "epoch": 0.08358448864777979, + "grad_norm": 8.376767182239066, + "learning_rate": 9.997428457347981e-06, + "loss": 20.269, + "num_tokens": 21400785.0, "step": 52 }, { - "epoch": 0.6899918633034988, - "grad_norm": 8.077248759791804, - "learning_rate": 7.965445754123592e-06, - "loss": 45.2383, - "num_tokens": 22343752.0, + "epoch": 0.08519188266023708, + "grad_norm": 8.65478590665576, + "learning_rate": 9.997017664558307e-06, + "loss": 21.1242, + "num_tokens": 21801417.0, "step": 53 }, { - "epoch": 0.7030105777054516, - "grad_norm": 9.123735388233518, - "learning_rate": 7.885530657719623e-06, - "loss": 45.4359, - "num_tokens": 22777046.0, + "epoch": 0.0867992766726944, + "grad_norm": 7.669336957790533, + "learning_rate": 9.996576456600221e-06, + "loss": 20.2614, + "num_tokens": 22260021.0, "step": 54 }, { - "epoch": 0.7160292921074044, - "grad_norm": 8.349001629818781, - "learning_rate": 7.804555096847298e-06, - "loss": 44.4336, - "num_tokens": 23155237.0, + "epoch": 0.08840667068515169, + "grad_norm": 8.308209132043027, + "learning_rate": 9.99610483645779e-06, + "loss": 23.028, + "num_tokens": 22702533.0, "step": 55 }, { - "epoch": 0.7290480065093572, - "grad_norm": 7.168111229963278, - "learning_rate": 7.722555068405186e-06, - "loss": 46.3786, - "num_tokens": 23624239.0, + "epoch": 0.090014064697609, + "grad_norm": 8.519898217044366, + "learning_rate": 9.995602807320778e-06, + "loss": 18.6799, + "num_tokens": 23083303.0, "step": 56 }, { - "epoch": 0.74206672091131, - "grad_norm": 10.792218043037206, - "learning_rate": 7.639567024708953e-06, - "loss": 43.4316, - "num_tokens": 24021803.0, + "epoch": 0.0916214587100663, + "grad_norm": 7.980262100758574, + "learning_rate": 9.99507037258461e-06, + "loss": 18.7573, + "num_tokens": 23464179.0, "step": 57 }, { - "epoch": 0.7550854353132628, - "grad_norm": 8.00166752620633, - "learning_rate": 7.555627857286843e-06, - "loss": 44.6596, - "num_tokens": 24444197.0, + "epoch": 0.09322885272252361, + "grad_norm": 9.247725832927081, + "learning_rate": 9.99450753585036e-06, + "loss": 19.0532, + "num_tokens": 23865289.0, "step": 58 }, { - "epoch": 0.7681041497152156, - "grad_norm": 8.131030639637084, - "learning_rate": 7.470774880479909e-06, - "loss": 44.3341, - "num_tokens": 24905731.0, + "epoch": 0.09483624673498091, + "grad_norm": 8.6608246078619, + "learning_rate": 9.993914300924725e-06, + "loss": 22.4071, + "num_tokens": 24331373.0, "step": 59 }, { - "epoch": 0.7811228641171685, - "grad_norm": 8.155734436567066, - "learning_rate": 7.3850458148542835e-06, - "loss": 46.5438, - "num_tokens": 25351543.0, + "epoch": 0.09644364074743822, + "grad_norm": 8.438587692130351, + "learning_rate": 9.993290671819993e-06, + "loss": 19.6278, + "num_tokens": 24747706.0, "step": 60 }, { - "epoch": 0.7941415785191213, - "grad_norm": 7.91228100142201, - "learning_rate": 7.29847877043287e-06, - "loss": 43.1953, - "num_tokens": 25738964.0, + "epoch": 0.09805103475989552, + "grad_norm": 8.394041803586587, + "learning_rate": 9.992636652754025e-06, + "loss": 20.0768, + "num_tokens": 25122467.0, "step": 61 }, { - "epoch": 0.807160292921074, - "grad_norm": 10.022290878004368, - "learning_rate": 7.211112229753901e-06, - "loss": 44.9749, - "num_tokens": 26143447.0, + "epoch": 0.09965842877235283, + "grad_norm": 7.824563650783021, + "learning_rate": 9.991952248150219e-06, + "loss": 20.1924, + "num_tokens": 25568954.0, "step": 62 }, { - "epoch": 0.8201790073230268, - "grad_norm": 8.527753040925967, - "learning_rate": 7.122985030763901e-06, - "loss": 45.1693, - "num_tokens": 26549557.0, + "epoch": 0.10126582278481013, + "grad_norm": 7.59321212586289, + "learning_rate": 9.99123746263748e-06, + "loss": 17.9264, + "num_tokens": 25962654.0, "step": 63 }, { - "epoch": 0.8331977217249796, - "grad_norm": 7.693711258947606, - "learning_rate": 7.034136349552647e-06, - "loss": 44.2133, - "num_tokens": 26970439.0, + "epoch": 0.10287321679726744, + "grad_norm": 7.682899801469829, + "learning_rate": 9.990492301050195e-06, + "loss": 21.5541, + "num_tokens": 26357204.0, "step": 64 }, { - "epoch": 0.8462164361269324, - "grad_norm": 8.905244694330412, - "learning_rate": 6.944605682937834e-06, - "loss": 44.5088, - "num_tokens": 27407828.0, + "epoch": 0.10448061080972473, + "grad_norm": 8.304738004322473, + "learning_rate": 9.989716768428199e-06, + "loss": 20.724, + "num_tokens": 26801835.0, "step": 65 }, { - "epoch": 0.8592351505288853, - "grad_norm": 7.75924825182322, - "learning_rate": 6.854432830907135e-06, - "loss": 44.1882, - "num_tokens": 27857236.0, + "epoch": 0.10608800482218204, + "grad_norm": 8.33624559062188, + "learning_rate": 9.988910870016735e-06, + "loss": 22.0426, + "num_tokens": 27226911.0, "step": 66 }, { - "epoch": 0.8722538649308381, - "grad_norm": 8.443509083770014, - "learning_rate": 6.763657878925508e-06, - "loss": 44.4298, - "num_tokens": 28238704.0, + "epoch": 0.10769539883463934, + "grad_norm": 7.789540800817722, + "learning_rate": 9.988074611266423e-06, + "loss": 19.8135, + "num_tokens": 27647839.0, "step": 67 }, { - "epoch": 0.8852725793327909, - "grad_norm": 8.344119552778967, - "learning_rate": 6.672321180115595e-06, - "loss": 43.7134, - "num_tokens": 28674245.0, + "epoch": 0.10930279284709664, + "grad_norm": 8.450884047880315, + "learning_rate": 9.987207997833222e-06, + "loss": 20.5095, + "num_tokens": 28025969.0, "step": 68 }, { - "epoch": 0.8982912937347437, - "grad_norm": 6.681640241465122, - "learning_rate": 6.580463337319128e-06, - "loss": 44.147, - "num_tokens": 29100995.0, + "epoch": 0.11091018685955395, + "grad_norm": 7.390105559330802, + "learning_rate": 9.986311035578397e-06, + "loss": 20.4413, + "num_tokens": 28473518.0, "step": 69 }, { - "epoch": 0.9113100081366965, - "grad_norm": 8.380228266696482, - "learning_rate": 6.488125185047334e-06, - "loss": 44.3873, - "num_tokens": 29503685.0, + "epoch": 0.11251758087201125, + "grad_norm": 8.229556959544075, + "learning_rate": 9.985383730568463e-06, + "loss": 20.5499, + "num_tokens": 28879552.0, "step": 70 }, { - "epoch": 0.9243287225386493, - "grad_norm": 8.107177192819389, - "learning_rate": 6.39534777132835e-06, - "loss": 44.2031, - "num_tokens": 29913149.0, + "epoch": 0.11412497488446856, + "grad_norm": 7.345745078389397, + "learning_rate": 9.984426089075169e-06, + "loss": 19.4937, + "num_tokens": 29289758.0, "step": 71 }, { - "epoch": 0.9373474369406021, - "grad_norm": 7.194486869317143, - "learning_rate": 6.302172339459717e-06, - "loss": 43.1973, - "num_tokens": 30347513.0, + "epoch": 0.11573236889692586, + "grad_norm": 7.256452679337097, + "learning_rate": 9.983438117575433e-06, + "loss": 19.5966, + "num_tokens": 29711995.0, "step": 72 }, { - "epoch": 0.950366151342555, - "grad_norm": 7.572279636499541, - "learning_rate": 6.208640309674081e-06, - "loss": 44.4992, - "num_tokens": 30741683.0, + "epoch": 0.11733976290938317, + "grad_norm": 6.977070892264344, + "learning_rate": 9.982419822751311e-06, + "loss": 17.5831, + "num_tokens": 30147493.0, "step": 73 }, { - "epoch": 0.9633848657445078, - "grad_norm": 6.89343522088949, - "learning_rate": 6.1147932607262215e-06, - "loss": 44.4793, - "num_tokens": 31135907.0, + "epoch": 0.11894715692184046, + "grad_norm": 7.578693108676886, + "learning_rate": 9.981371211489947e-06, + "loss": 20.826, + "num_tokens": 30587748.0, "step": 74 }, { - "epoch": 0.9764035801464606, - "grad_norm": 6.848414483469928, - "learning_rate": 6.020672911409626e-06, - "loss": 45.7695, - "num_tokens": 31576198.0, + "epoch": 0.12055455093429777, + "grad_norm": 7.845373437703446, + "learning_rate": 9.980292290883526e-06, + "loss": 20.2453, + "num_tokens": 30979362.0, "step": 75 }, { - "epoch": 0.9894222945484134, - "grad_norm": 7.920660451528225, - "learning_rate": 5.926321102010808e-06, - "loss": 43.4168, - "num_tokens": 31989137.0, + "epoch": 0.12216194494675507, + "grad_norm": 7.7532723439239515, + "learning_rate": 9.979183068229232e-06, + "loss": 20.4564, + "num_tokens": 31409239.0, "step": 76 }, { - "epoch": 1.0, - "grad_norm": 7.920660451528225, - "learning_rate": 5.831779775709606e-06, - "loss": 36.3341, - "num_tokens": 32342065.0, + "epoch": 0.12376933895921238, + "grad_norm": 7.75917944425188, + "learning_rate": 9.978043551029189e-06, + "loss": 18.1905, + "num_tokens": 31774437.0, "step": 77 }, { - "epoch": 1.013018714401953, - "grad_norm": 7.026772342911652, - "learning_rate": 5.7370909599337585e-06, - "loss": 44.053, - "num_tokens": 32736956.0, + "epoch": 0.12537673297166968, + "grad_norm": 7.6709602087801, + "learning_rate": 9.976873746990418e-06, + "loss": 19.9238, + "num_tokens": 32188445.0, "step": 78 }, { - "epoch": 1.0260374288039056, - "grad_norm": 7.13063711727869, - "learning_rate": 5.642296747676016e-06, - "loss": 42.8494, - "num_tokens": 33129107.0, + "epoch": 0.12698412698412698, + "grad_norm": 7.417802445436351, + "learning_rate": 9.97567366402478e-06, + "loss": 19.1635, + "num_tokens": 32593801.0, "step": 79 }, { - "epoch": 1.0390561432058585, - "grad_norm": 7.522335357396645, - "learning_rate": 5.5474392787821096e-06, - "loss": 44.3732, - "num_tokens": 33573060.0, + "epoch": 0.12859152099658427, + "grad_norm": 7.309069122863378, + "learning_rate": 9.97444331024893e-06, + "loss": 18.6514, + "num_tokens": 32989758.0, "step": 80 }, { - "epoch": 1.0520748576078112, - "grad_norm": 7.142205127462634, - "learning_rate": 5.452560721217892e-06, - "loss": 43.607, - "num_tokens": 34004346.0, + "epoch": 0.1301989150090416, + "grad_norm": 7.270793757712772, + "learning_rate": 9.973182693984252e-06, + "loss": 18.9029, + "num_tokens": 33388980.0, "step": 81 }, { - "epoch": 1.065093572009764, - "grad_norm": 7.322712166714905, - "learning_rate": 5.357703252323985e-06, - "loss": 42.6721, - "num_tokens": 34402162.0, + "epoch": 0.1318063090214989, + "grad_norm": 7.973524985480283, + "learning_rate": 9.971891823756808e-06, + "loss": 20.9425, + "num_tokens": 33808924.0, "step": 82 }, { - "epoch": 1.0781122864117167, - "grad_norm": 8.613960422054696, - "learning_rate": 5.262909040066243e-06, - "loss": 44.4329, - "num_tokens": 34814255.0, + "epoch": 0.1334137030339562, + "grad_norm": 7.976476824466409, + "learning_rate": 9.970570708297282e-06, + "loss": 19.548, + "num_tokens": 34238899.0, "step": 83 }, { - "epoch": 1.0911310008136696, - "grad_norm": 7.534461032710289, - "learning_rate": 5.168220224290395e-06, - "loss": 43.2475, - "num_tokens": 35196606.0, + "epoch": 0.1350210970464135, + "grad_norm": 7.265809213418535, + "learning_rate": 9.96921935654092e-06, + "loss": 20.5002, + "num_tokens": 34673373.0, "step": 84 }, { - "epoch": 1.1041497152156226, - "grad_norm": 8.589611408708793, - "learning_rate": 5.073678897989194e-06, - "loss": 41.9812, - "num_tokens": 35645317.0, + "epoch": 0.13662849105887082, + "grad_norm": 7.07632227070082, + "learning_rate": 9.96783777762746e-06, + "loss": 19.1249, + "num_tokens": 35101851.0, "step": 85 }, { - "epoch": 1.1171684296175752, - "grad_norm": 7.7364283872737705, - "learning_rate": 4.979327088590375e-06, - "loss": 42.2822, - "num_tokens": 36028691.0, + "epoch": 0.1382358850713281, + "grad_norm": 6.488565524276084, + "learning_rate": 9.96642598090109e-06, + "loss": 19.2345, + "num_tokens": 35555852.0, "step": 86 }, { - "epoch": 1.1301871440195281, - "grad_norm": 9.502669113584114, - "learning_rate": 4.88520673927378e-06, - "loss": 44.3749, - "num_tokens": 36381326.0, + "epoch": 0.1398432790837854, + "grad_norm": 6.9639107519244, + "learning_rate": 9.964983975910368e-06, + "loss": 20.6444, + "num_tokens": 35976659.0, "step": 87 }, { - "epoch": 1.1432058584214808, - "grad_norm": 7.913550650142695, - "learning_rate": 4.791359690325921e-06, - "loss": 44.5475, - "num_tokens": 36815592.0, + "epoch": 0.1414506730962427, + "grad_norm": 7.438955356563644, + "learning_rate": 9.963511772408163e-06, + "loss": 20.8598, + "num_tokens": 36414846.0, "step": 88 }, { - "epoch": 1.1562245728234337, - "grad_norm": 8.070651997614794, - "learning_rate": 4.697827660540285e-06, - "loss": 43.7263, - "num_tokens": 37243221.0, + "epoch": 0.14305806710870003, + "grad_norm": 7.732911674346702, + "learning_rate": 9.962009380351585e-06, + "loss": 20.0468, + "num_tokens": 36871242.0, "step": 89 }, { - "epoch": 1.1692432872253864, - "grad_norm": 6.398006197653046, - "learning_rate": 4.604652228671653e-06, - "loss": 45.8467, - "num_tokens": 37654139.0, + "epoch": 0.14466546112115733, + "grad_norm": 7.000595355480363, + "learning_rate": 9.96047680990193e-06, + "loss": 18.8454, + "num_tokens": 37288272.0, "step": 90 }, { - "epoch": 1.1822620016273393, - "grad_norm": 7.607234591260942, - "learning_rate": 4.511874814952668e-06, - "loss": 41.4379, - "num_tokens": 38068131.0, + "epoch": 0.14627285513361463, + "grad_norm": 7.479551999447466, + "learning_rate": 9.958914071424597e-06, + "loss": 19.4362, + "num_tokens": 37746099.0, "step": 91 }, { - "epoch": 1.1952807160292922, - "grad_norm": 7.23721210237536, - "learning_rate": 4.419536662680873e-06, - "loss": 42.9813, - "num_tokens": 38517307.0, + "epoch": 0.14788024914607192, + "grad_norm": 6.803597898075917, + "learning_rate": 9.957321175489026e-06, + "loss": 18.3783, + "num_tokens": 38184064.0, "step": 92 }, { - "epoch": 1.2082994304312449, - "grad_norm": 6.048665613151334, - "learning_rate": 4.327678819884405e-06, - "loss": 45.077, - "num_tokens": 38947370.0, + "epoch": 0.14948764315852922, + "grad_norm": 7.462881265596427, + "learning_rate": 9.955698132868619e-06, + "loss": 20.1376, + "num_tokens": 38583322.0, "step": 93 }, { - "epoch": 1.2213181448331978, - "grad_norm": 6.506737582549749, - "learning_rate": 4.2363421210744925e-06, - "loss": 43.0281, - "num_tokens": 39335829.0, + "epoch": 0.15109503717098655, + "grad_norm": 7.561252874679805, + "learning_rate": 9.954044954540682e-06, + "loss": 19.7285, + "num_tokens": 39017031.0, "step": 94 }, { - "epoch": 1.2343368592351505, - "grad_norm": 8.090655774335538, - "learning_rate": 4.1455671690928666e-06, - "loss": 43.6451, - "num_tokens": 39766866.0, + "epoch": 0.15270243118344384, + "grad_norm": 7.868488233115406, + "learning_rate": 9.952361651686333e-06, + "loss": 22.7743, + "num_tokens": 39466601.0, "step": 95 }, { - "epoch": 1.2473555736371034, - "grad_norm": 6.9083925427883, - "learning_rate": 4.055394317062168e-06, - "loss": 45.0136, - "num_tokens": 40186245.0, + "epoch": 0.15430982519590114, + "grad_norm": 7.9965341467621895, + "learning_rate": 9.950648235690437e-06, + "loss": 21.3307, + "num_tokens": 39904372.0, "step": 96 }, { - "epoch": 1.2603742880390563, - "grad_norm": 7.758040299312007, - "learning_rate": 3.965863650447355e-06, - "loss": 43.1958, - "num_tokens": 40614768.0, + "epoch": 0.15591721920835844, + "grad_norm": 7.730953213379648, + "learning_rate": 9.948904718141527e-06, + "loss": 18.8578, + "num_tokens": 40312911.0, "step": 97 }, { - "epoch": 1.273393002441009, - "grad_norm": 7.378522384645766, - "learning_rate": 3.877014969236102e-06, - "loss": 43.0779, - "num_tokens": 41046509.0, + "epoch": 0.15752461322081576, + "grad_norm": 7.340086859749351, + "learning_rate": 9.947131110831725e-06, + "loss": 20.5623, + "num_tokens": 40724694.0, "step": 98 }, { - "epoch": 1.2864117168429616, - "grad_norm": 6.24890924117296, - "learning_rate": 3.7888877702460992e-06, - "loss": 44.3709, - "num_tokens": 41467121.0, + "epoch": 0.15913200723327306, + "grad_norm": 7.364185547667675, + "learning_rate": 9.945327425756662e-06, + "loss": 21.6948, + "num_tokens": 41178982.0, "step": 99 }, { - "epoch": 1.2994304312449145, - "grad_norm": 6.908300936733955, - "learning_rate": 3.701521229567131e-06, - "loss": 42.5868, - "num_tokens": 41877993.0, + "epoch": 0.16073940124573036, + "grad_norm": 6.935550185441897, + "learning_rate": 9.9434936751154e-06, + "loss": 18.1277, + "num_tokens": 41595935.0, "step": 100 }, { - "epoch": 1.3124491456468674, - "grad_norm": 6.889494074464293, - "learning_rate": 3.6149541851457183e-06, - "loss": 43.2867, - "num_tokens": 42283854.0, + "epoch": 0.16234679525818765, + "grad_norm": 6.9233453329159635, + "learning_rate": 9.941629871310337e-06, + "loss": 19.6283, + "num_tokens": 41990929.0, "step": 101 }, { - "epoch": 1.3254678600488201, - "grad_norm": 6.9617082920781215, - "learning_rate": 3.5292251195200932e-06, - "loss": 43.1462, - "num_tokens": 42679878.0, + "epoch": 0.16395418927064498, + "grad_norm": 7.609394581459471, + "learning_rate": 9.939736026947146e-06, + "loss": 23.2651, + "num_tokens": 42410254.0, "step": 102 }, { - "epoch": 1.3254678600488201, - "eval_loss": 2.5341105461120605, - "eval_num_tokens": 42679878.0, - "eval_runtime": 46.8916, - "eval_samples_per_second": 23.309, - "eval_steps_per_second": 5.843, + "epoch": 0.16395418927064498, + "eval_loss": 1.1346635818481445, + "eval_num_tokens": 42410254.0, + "eval_runtime": 375.3658, + "eval_samples_per_second": 23.572, + "eval_steps_per_second": 5.893, "step": 102 }, { - "epoch": 1.338486574450773, - "grad_norm": 7.143998567958961, - "learning_rate": 3.4443721427131593e-06, - "loss": 42.6499, - "num_tokens": 43104741.0, + "epoch": 0.16556158328310228, + "grad_norm": 7.0865981806855745, + "learning_rate": 9.937812154834671e-06, + "loss": 19.05, + "num_tokens": 42817534.0, "step": 103 }, { - "epoch": 1.3515052888527257, - "grad_norm": 8.217416950723322, - "learning_rate": 3.3604329752910468e-06, - "loss": 45.2888, - "num_tokens": 43520945.0, + "epoch": 0.16716897729555957, + "grad_norm": 6.948781400680116, + "learning_rate": 9.935858267984842e-06, + "loss": 18.6357, + "num_tokens": 43232271.0, "step": 104 }, { - "epoch": 1.3645240032546786, - "grad_norm": 7.2343900687446565, - "learning_rate": 3.2774449315948147e-06, - "loss": 43.9364, - "num_tokens": 43894234.0, + "epoch": 0.16877637130801687, + "grad_norm": 7.841635910969654, + "learning_rate": 9.933874379612595e-06, + "loss": 17.9843, + "num_tokens": 43624150.0, "step": 105 }, { - "epoch": 1.3775427176566315, - "grad_norm": 7.484532382523991, - "learning_rate": 3.195444903152703e-06, - "loss": 43.635, - "num_tokens": 44341295.0, + "epoch": 0.17038376532047417, + "grad_norm": 6.8850851562446795, + "learning_rate": 9.93186050313578e-06, + "loss": 20.4715, + "num_tokens": 44042404.0, "step": 106 }, { - "epoch": 1.3905614320585842, - "grad_norm": 6.952533441170867, - "learning_rate": 3.114469342280379e-06, - "loss": 43.5226, - "num_tokens": 44742146.0, + "epoch": 0.1719911593329315, + "grad_norm": 7.323685588001377, + "learning_rate": 9.929816652175062e-06, + "loss": 19.3193, + "num_tokens": 44427648.0, "step": 107 }, { - "epoch": 1.403580146460537, - "grad_norm": 6.418144240712638, - "learning_rate": 3.03455424587641e-06, - "loss": 42.3867, - "num_tokens": 45141359.0, + "epoch": 0.1735985533453888, + "grad_norm": 7.176452121711359, + "learning_rate": 9.927742840553846e-06, + "loss": 21.5694, + "num_tokens": 44856500.0, "step": 108 }, { - "epoch": 1.4165988608624898, - "grad_norm": 6.634723205633215, - "learning_rate": 2.9557351394201855e-06, - "loss": 44.929, - "num_tokens": 45592572.0, + "epoch": 0.1752059473578461, + "grad_norm": 7.387654850668608, + "learning_rate": 9.925639082298164e-06, + "loss": 18.5719, + "num_tokens": 45280657.0, "step": 109 }, { - "epoch": 1.4296175752644427, - "grad_norm": 6.6582337559408336, - "learning_rate": 2.878047061179422e-06, - "loss": 43.1964, - "num_tokens": 46004559.0, + "epoch": 0.17681334137030338, + "grad_norm": 7.158566694004073, + "learning_rate": 9.923505391636592e-06, + "loss": 20.143, + "num_tokens": 45696584.0, "step": 110 }, { - "epoch": 1.4426362896663956, - "grad_norm": 6.434721726836162, - "learning_rate": 2.8015245466342287e-06, - "loss": 41.3969, - "num_tokens": 46396302.0, + "epoch": 0.1784207353827607, + "grad_norm": 7.5473210717863966, + "learning_rate": 9.921341783000158e-06, + "loss": 21.7447, + "num_tokens": 46077559.0, "step": 111 }, { - "epoch": 1.4556550040683482, - "grad_norm": 6.951462322705828, - "learning_rate": 2.72620161312471e-06, - "loss": 43.8865, - "num_tokens": 46861541.0, + "epoch": 0.180028129395218, + "grad_norm": 6.560182124029635, + "learning_rate": 9.919148271022224e-06, + "loss": 21.3122, + "num_tokens": 46538123.0, "step": 112 }, { - "epoch": 1.468673718470301, - "grad_norm": 5.685510951219279, - "learning_rate": 2.652111744728876e-06, - "loss": 44.2875, - "num_tokens": 47324903.0, + "epoch": 0.1816355234076753, + "grad_norm": 7.392870133552766, + "learning_rate": 9.916924870538414e-06, + "loss": 20.7176, + "num_tokens": 46909974.0, "step": 113 }, { - "epoch": 1.4816924328722538, - "grad_norm": 6.965889081794389, - "learning_rate": 2.5792878773776225e-06, - "loss": 43.9692, - "num_tokens": 47748381.0, + "epoch": 0.1832429174201326, + "grad_norm": 8.1408777327834, + "learning_rate": 9.91467159658649e-06, + "loss": 21.1519, + "num_tokens": 47293261.0, "step": 114 }, { - "epoch": 1.4947111472742067, - "grad_norm": 6.2816062141399955, - "learning_rate": 2.5077623842133895e-06, - "loss": 43.2887, - "num_tokens": 48167679.0, + "epoch": 0.18485031143258993, + "grad_norm": 6.898775293631946, + "learning_rate": 9.912388464406266e-06, + "loss": 20.2515, + "num_tokens": 47710026.0, "step": 115 }, { - "epoch": 1.5077298616761596, - "grad_norm": 6.073748086459051, - "learning_rate": 2.437567061198991e-06, - "loss": 43.5468, - "num_tokens": 48610957.0, + "epoch": 0.18645770544504722, + "grad_norm": 7.268195755560978, + "learning_rate": 9.9100754894395e-06, + "loss": 20.991, + "num_tokens": 48163851.0, "step": 116 }, { - "epoch": 1.5207485760781123, - "grad_norm": 6.58074685060065, - "learning_rate": 2.3687331129830276e-06, - "loss": 43.1444, - "num_tokens": 48999358.0, + "epoch": 0.18806509945750452, + "grad_norm": 6.960585626646239, + "learning_rate": 9.907732687329784e-06, + "loss": 18.436, + "num_tokens": 48570622.0, "step": 117 }, { - "epoch": 1.533767290480065, - "grad_norm": 5.832121444000036, - "learning_rate": 2.301291139028164e-06, - "loss": 44.5786, - "num_tokens": 49474744.0, + "epoch": 0.18967249346996182, + "grad_norm": 7.497353714177138, + "learning_rate": 9.905360073922448e-06, + "loss": 21.0898, + "num_tokens": 49049880.0, "step": 118 }, { - "epoch": 1.546786004882018, - "grad_norm": 5.514788518783401, - "learning_rate": 2.235271120008439e-06, - "loss": 42.7051, - "num_tokens": 49916397.0, + "epoch": 0.19127988748241911, + "grad_norm": 7.307210603915192, + "learning_rate": 9.902957665264444e-06, + "loss": 20.0271, + "num_tokens": 49485938.0, "step": 119 }, { - "epoch": 1.5598047192839708, - "grad_norm": 5.764365579739233, - "learning_rate": 2.1707024044816433e-06, - "loss": 41.8974, - "num_tokens": 50318342.0, + "epoch": 0.19288728149487644, + "grad_norm": 7.4874976371057125, + "learning_rate": 9.900525477604248e-06, + "loss": 20.2776, + "num_tokens": 49877687.0, "step": 120 }, { - "epoch": 1.5728234336859235, - "grad_norm": 6.724066411874533, - "learning_rate": 2.107613695842705e-06, - "loss": 44.3077, - "num_tokens": 50779853.0, + "epoch": 0.19449467550733374, + "grad_norm": 7.350801953981672, + "learning_rate": 9.898063527391735e-06, + "loss": 21.6464, + "num_tokens": 50343977.0, "step": 121 }, { - "epoch": 1.5858421480878762, - "grad_norm": 7.32779608228551, - "learning_rate": 2.0460330395638754e-06, - "loss": 41.9387, - "num_tokens": 51190349.0, + "epoch": 0.19610206951979103, + "grad_norm": 7.858504044305824, + "learning_rate": 9.895571831278082e-06, + "loss": 19.6147, + "num_tokens": 50760445.0, "step": 122 }, { - "epoch": 1.598860862489829, - "grad_norm": 6.478047613041632, - "learning_rate": 1.9859878107273884e-06, - "loss": 44.5776, - "num_tokens": 51590613.0, + "epoch": 0.19770946353224833, + "grad_norm": 7.285321892355277, + "learning_rate": 9.893050406115649e-06, + "loss": 18.9395, + "num_tokens": 51149439.0, "step": 123 }, { - "epoch": 1.611879576891782, - "grad_norm": 6.334803828357373, - "learning_rate": 1.9275047018561265e-06, - "loss": 42.2772, - "num_tokens": 52009243.0, + "epoch": 0.19931685754470566, + "grad_norm": 7.33788293530188, + "learning_rate": 9.890499268957862e-06, + "loss": 20.4198, + "num_tokens": 51558687.0, "step": 124 }, { - "epoch": 1.6248982912937349, - "grad_norm": 5.844141203701462, - "learning_rate": 1.8706097110477298e-06, - "loss": 42.7055, - "num_tokens": 52421566.0, + "epoch": 0.20092425155716295, + "grad_norm": 6.56586633002251, + "learning_rate": 9.887918437059109e-06, + "loss": 18.3692, + "num_tokens": 52000068.0, "step": 125 }, { - "epoch": 1.6379170056956875, - "grad_norm": 6.6426407833922605, - "learning_rate": 1.8153281304173842e-06, - "loss": 42.5965, - "num_tokens": 52821727.0, + "epoch": 0.20253164556962025, + "grad_norm": 7.506826707578428, + "learning_rate": 9.885307927874606e-06, + "loss": 18.5089, + "num_tokens": 52413025.0, "step": 126 }, { - "epoch": 1.6509357200976402, - "grad_norm": 6.099039875821687, - "learning_rate": 1.7616845348544657e-06, - "loss": 43.3713, - "num_tokens": 53230417.0, + "epoch": 0.20413903958207755, + "grad_norm": 6.687311008794421, + "learning_rate": 9.882667759060298e-06, + "loss": 19.058, + "num_tokens": 52890512.0, "step": 127 }, { - "epoch": 1.6639544344995931, - "grad_norm": 6.66283986780362, - "learning_rate": 1.7097027710980059e-06, - "loss": 43.7114, - "num_tokens": 53676950.0, + "epoch": 0.20574643359453487, + "grad_norm": 7.8795732840943415, + "learning_rate": 9.879997948472722e-06, + "loss": 18.9731, + "num_tokens": 53244993.0, "step": 128 }, { - "epoch": 1.676973148901546, - "grad_norm": 6.316978695831662, - "learning_rate": 1.6594059471358603e-06, - "loss": 44.7969, - "num_tokens": 54125941.0, + "epoch": 0.20735382760699217, + "grad_norm": 6.797184921630517, + "learning_rate": 9.8772985141689e-06, + "loss": 19.1772, + "num_tokens": 53661685.0, "step": 129 }, { - "epoch": 1.689991863303499, - "grad_norm": 5.849607231665509, - "learning_rate": 1.6108164219322759e-06, - "loss": 44.0519, - "num_tokens": 54545731.0, + "epoch": 0.20896122161944947, + "grad_norm": 6.936550895921515, + "learning_rate": 9.874569474406209e-06, + "loss": 21.2073, + "num_tokens": 54087376.0, "step": 130 }, { - "epoch": 1.7030105777054516, - "grad_norm": 6.302486561228913, - "learning_rate": 1.5639557954884263e-06, - "loss": 43.0819, - "num_tokens": 54955550.0, + "epoch": 0.21056861563190676, + "grad_norm": 6.501832427479631, + "learning_rate": 9.871810847642259e-06, + "loss": 18.2027, + "num_tokens": 54494553.0, "step": 131 }, { - "epoch": 1.7160292921074043, - "grad_norm": 6.432081148310679, - "learning_rate": 1.5188448992403504e-06, - "loss": 43.9355, - "num_tokens": 55421291.0, + "epoch": 0.2121760096443641, + "grad_norm": 6.377405026503369, + "learning_rate": 9.869022652534774e-06, + "loss": 20.1071, + "num_tokens": 54917212.0, "step": 132 }, { - "epoch": 1.7290480065093572, - "grad_norm": 5.932351378662439, - "learning_rate": 1.4755037867985285e-06, - "loss": 44.3439, - "num_tokens": 55850758.0, + "epoch": 0.21378340365682139, + "grad_norm": 6.493469276911309, + "learning_rate": 9.866204907941453e-06, + "loss": 19.5453, + "num_tokens": 55341080.0, "step": 133 }, { - "epoch": 1.74206672091131, - "grad_norm": 5.942324039041276, - "learning_rate": 1.4339517250332565e-06, - "loss": 42.125, - "num_tokens": 56277015.0, + "epoch": 0.21539079766927868, + "grad_norm": 8.250017461256439, + "learning_rate": 9.863357632919857e-06, + "loss": 19.895, + "num_tokens": 55724847.0, "step": 134 }, { - "epoch": 1.7550854353132628, - "grad_norm": 5.67220550149327, - "learning_rate": 1.3942071855097381e-06, - "loss": 43.3022, - "num_tokens": 56696422.0, + "epoch": 0.21699819168173598, + "grad_norm": 7.2706505916624335, + "learning_rate": 9.86048084672727e-06, + "loss": 17.8357, + "num_tokens": 56114516.0, "step": 135 }, { - "epoch": 1.7681041497152155, - "grad_norm": 6.2372919539117575, - "learning_rate": 1.3562878362767296e-06, - "loss": 42.9393, - "num_tokens": 57105415.0, + "epoch": 0.21860558569419328, + "grad_norm": 7.50712008698208, + "learning_rate": 9.857574568820572e-06, + "loss": 19.4889, + "num_tokens": 56496956.0, "step": 136 }, { - "epoch": 1.7811228641171684, - "grad_norm": 6.494582997926307, - "learning_rate": 1.320210534012388e-06, - "loss": 42.9416, - "num_tokens": 57560199.0, + "epoch": 0.2202129797066506, + "grad_norm": 6.738123604557156, + "learning_rate": 9.85463881885611e-06, + "loss": 19.9032, + "num_tokens": 56927580.0, "step": 137 }, { - "epoch": 1.7941415785191213, - "grad_norm": 6.467178887755779, - "learning_rate": 1.2859913165307886e-06, - "loss": 44.7506, - "num_tokens": 57984581.0, + "epoch": 0.2218203737191079, + "grad_norm": 7.029955001141702, + "learning_rate": 9.851673616689558e-06, + "loss": 18.1296, + "num_tokens": 57322615.0, "step": 138 }, { - "epoch": 1.8071602929210742, - "grad_norm": 5.734142472530536, - "learning_rate": 1.253645395652481e-06, - "loss": 40.9494, - "num_tokens": 58362140.0, + "epoch": 0.2234277677315652, + "grad_norm": 6.94481863754793, + "learning_rate": 9.848678982375793e-06, + "loss": 16.4731, + "num_tokens": 57721842.0, "step": 139 }, { - "epoch": 1.8201790073230268, - "grad_norm": 6.129574816007133, - "learning_rate": 1.2231871504422117e-06, - "loss": 42.7382, - "num_tokens": 58786078.0, + "epoch": 0.2250351617440225, + "grad_norm": 6.5767369509570575, + "learning_rate": 9.845654936168749e-06, + "loss": 17.2809, + "num_tokens": 58133079.0, "step": 140 }, { - "epoch": 1.8331977217249795, - "grad_norm": 6.0677503343186405, - "learning_rate": 1.1946301208168593e-06, - "loss": 42.8528, - "num_tokens": 59224097.0, + "epoch": 0.22664255575647982, + "grad_norm": 8.218211027280454, + "learning_rate": 9.842601498521288e-06, + "loss": 20.541, + "num_tokens": 58538284.0, "step": 141 }, { - "epoch": 1.8462164361269324, - "grad_norm": 6.0308908457130475, - "learning_rate": 1.1679870015263908e-06, - "loss": 43.6563, - "num_tokens": 59672097.0, + "epoch": 0.22824994976893712, + "grad_norm": 6.995536783180637, + "learning_rate": 9.839518690085054e-06, + "loss": 19.399, + "num_tokens": 58959915.0, "step": 142 }, { - "epoch": 1.8592351505288853, - "grad_norm": 5.197524830163719, - "learning_rate": 1.143269636510536e-06, - "loss": 43.3286, - "num_tokens": 60096419.0, + "epoch": 0.2298573437813944, + "grad_norm": 6.697383807807822, + "learning_rate": 9.836406531710343e-06, + "loss": 20.2204, + "num_tokens": 59422814.0, "step": 143 }, { - "epoch": 1.8722538649308382, - "grad_norm": 6.375136810581173, - "learning_rate": 1.1204890136336784e-06, - "loss": 43.9128, - "num_tokens": 60545197.0, + "epoch": 0.2314647377938517, + "grad_norm": 7.048107782937074, + "learning_rate": 9.833265044445952e-06, + "loss": 18.3961, + "num_tokens": 59859503.0, "step": 144 }, { - "epoch": 1.885272579332791, - "grad_norm": 5.792583564479622, - "learning_rate": 1.0996552598003088e-06, - "loss": 43.6939, - "num_tokens": 60969707.0, + "epoch": 0.23307213180630904, + "grad_norm": 7.400488133109635, + "learning_rate": 9.830094249539045e-06, + "loss": 19.0135, + "num_tokens": 60248023.0, "step": 145 }, { - "epoch": 1.8982912937347436, - "grad_norm": 5.864100035826922, - "learning_rate": 1.0807776364532044e-06, - "loss": 43.3276, - "num_tokens": 61352863.0, + "epoch": 0.23467952581876633, + "grad_norm": 6.324047537818344, + "learning_rate": 9.826894168435005e-06, + "loss": 21.1763, + "num_tokens": 60693073.0, "step": 146 }, { - "epoch": 1.9113100081366965, - "grad_norm": 6.83293846085596, - "learning_rate": 1.0638645354563488e-06, - "loss": 44.4588, - "num_tokens": 61789668.0, + "epoch": 0.23628691983122363, + "grad_norm": 7.408143330106155, + "learning_rate": 9.823664822777285e-06, + "loss": 22.2096, + "num_tokens": 61133042.0, "step": 147 }, { - "epoch": 1.9243287225386494, - "grad_norm": 5.128151994194966, - "learning_rate": 1.0489234753644075e-06, - "loss": 44.8288, - "num_tokens": 62243941.0, + "epoch": 0.23789431384368093, + "grad_norm": 7.104234073026265, + "learning_rate": 9.820406234407274e-06, + "loss": 18.7393, + "num_tokens": 61536239.0, "step": 148 }, { - "epoch": 1.937347436940602, - "grad_norm": 5.965405751872391, - "learning_rate": 1.0359610980804286e-06, - "loss": 39.983, - "num_tokens": 62617374.0, + "epoch": 0.23950170785613822, + "grad_norm": 6.750164181863352, + "learning_rate": 9.81711842536413e-06, + "loss": 20.0071, + "num_tokens": 61979134.0, "step": 149 }, { - "epoch": 1.950366151342555, - "grad_norm": 6.32081508468605, - "learning_rate": 1.0249831659032494e-06, - "loss": 43.1486, - "num_tokens": 63083789.0, + "epoch": 0.24110910186859555, + "grad_norm": 7.177483872054614, + "learning_rate": 9.813801417884657e-06, + "loss": 18.2248, + "num_tokens": 62374068.0, "step": 150 }, { - "epoch": 1.9633848657445077, - "grad_norm": 5.626093946285433, - "learning_rate": 1.0159945589659223e-06, - "loss": 42.1506, - "num_tokens": 63480048.0, + "epoch": 0.24271649588105285, + "grad_norm": 7.131908982472468, + "learning_rate": 9.810455234403128e-06, + "loss": 19.4317, + "num_tokens": 62759525.0, "step": 151 }, { - "epoch": 1.9764035801464606, - "grad_norm": 6.197437659342925, - "learning_rate": 1.0089992730662983e-06, - "loss": 43.5875, - "num_tokens": 63917281.0, + "epoch": 0.24432388989351014, + "grad_norm": 8.388002338429523, + "learning_rate": 9.80707989755115e-06, + "loss": 19.9746, + "num_tokens": 63171704.0, "step": 152 }, { - "epoch": 1.9894222945484135, - "grad_norm": 5.865978038155229, - "learning_rate": 1.0040004178907364e-06, - "loss": 43.5149, - "num_tokens": 64360496.0, + "epoch": 0.24593128390596744, + "grad_norm": 7.087909065885135, + "learning_rate": 9.803675430157507e-06, + "loss": 19.5263, + "num_tokens": 63647314.0, "step": 153 }, { - "epoch": 1.9894222945484135, - "eval_loss": 2.50262451171875, - "eval_num_tokens": 64360496.0, - "eval_runtime": 46.915, - "eval_samples_per_second": 23.297, - "eval_steps_per_second": 5.84, + "epoch": 0.24593128390596744, + "eval_loss": 1.1032651662826538, + "eval_num_tokens": 63647314.0, + "eval_runtime": 375.3236, + "eval_samples_per_second": 23.574, + "eval_steps_per_second": 5.894, "step": 153 }, { - "epoch": 2.0, - "grad_norm": 5.664155871790813, - "learning_rate": 1.0010002156317187e-06, - "loss": 34.7068, - "num_tokens": 64683379.0, + "epoch": 0.24753867791842477, + "grad_norm": 6.691797561193694, + "learning_rate": 9.800241855248e-06, + "loss": 20.8051, + "num_tokens": 64119499.0, "step": 154 }, + { + "epoch": 0.24914607193088206, + "grad_norm": 6.807806963660468, + "learning_rate": 9.796779196045305e-06, + "loss": 20.0641, + "num_tokens": 64587660.0, + "step": 155 + }, + { + "epoch": 0.25075346594333936, + "grad_norm": 7.402294270357361, + "learning_rate": 9.793287475968795e-06, + "loss": 19.5129, + "num_tokens": 65007373.0, + "step": 156 + }, + { + "epoch": 0.25236085995579666, + "grad_norm": 6.369472351599343, + "learning_rate": 9.789766718634404e-06, + "loss": 17.6997, + "num_tokens": 65433956.0, + "step": 157 + }, + { + "epoch": 0.25396825396825395, + "grad_norm": 7.335879461866313, + "learning_rate": 9.786216947854452e-06, + "loss": 20.8764, + "num_tokens": 65856713.0, + "step": 158 + }, + { + "epoch": 0.25557564798071125, + "grad_norm": 7.124097288545834, + "learning_rate": 9.78263818763749e-06, + "loss": 20.0208, + "num_tokens": 66263715.0, + "step": 159 + }, + { + "epoch": 0.25718304199316855, + "grad_norm": 7.3500439402471205, + "learning_rate": 9.779030462188135e-06, + "loss": 19.047, + "num_tokens": 66702200.0, + "step": 160 + }, + { + "epoch": 0.2587904360056259, + "grad_norm": 6.380709759892032, + "learning_rate": 9.77539379590691e-06, + "loss": 20.038, + "num_tokens": 67176185.0, + "step": 161 + }, + { + "epoch": 0.2603978300180832, + "grad_norm": 7.519807444644578, + "learning_rate": 9.771728213390073e-06, + "loss": 20.2859, + "num_tokens": 67594499.0, + "step": 162 + }, + { + "epoch": 0.2620052240305405, + "grad_norm": 6.785046269699577, + "learning_rate": 9.76803373942946e-06, + "loss": 19.2146, + "num_tokens": 68027629.0, + "step": 163 + }, + { + "epoch": 0.2636126180429978, + "grad_norm": 7.845757156596308, + "learning_rate": 9.764310399012305e-06, + "loss": 18.2296, + "num_tokens": 68445250.0, + "step": 164 + }, + { + "epoch": 0.2652200120554551, + "grad_norm": 8.347987430227425, + "learning_rate": 9.760558217321083e-06, + "loss": 19.0917, + "num_tokens": 68830552.0, + "step": 165 + }, + { + "epoch": 0.2668274060679124, + "grad_norm": 6.218469388457667, + "learning_rate": 9.756777219733333e-06, + "loss": 18.5327, + "num_tokens": 69245032.0, + "step": 166 + }, + { + "epoch": 0.2684348000803697, + "grad_norm": 7.171779514896403, + "learning_rate": 9.752967431821485e-06, + "loss": 18.4782, + "num_tokens": 69654881.0, + "step": 167 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 7.971928509511999, + "learning_rate": 9.749128879352693e-06, + "loss": 19.8063, + "num_tokens": 70143909.0, + "step": 168 + }, + { + "epoch": 0.27164958810528433, + "grad_norm": 8.266621156848895, + "learning_rate": 9.745261588288654e-06, + "loss": 19.688, + "num_tokens": 70511588.0, + "step": 169 + }, + { + "epoch": 0.27325698211774163, + "grad_norm": 7.469375512374517, + "learning_rate": 9.741365584785442e-06, + "loss": 18.8919, + "num_tokens": 70920896.0, + "step": 170 + }, + { + "epoch": 0.27486437613019893, + "grad_norm": 7.012058268745888, + "learning_rate": 9.737440895193318e-06, + "loss": 19.951, + "num_tokens": 71311351.0, + "step": 171 + }, + { + "epoch": 0.2764717701426562, + "grad_norm": 7.220475603412168, + "learning_rate": 9.733487546056564e-06, + "loss": 20.7273, + "num_tokens": 71765297.0, + "step": 172 + }, + { + "epoch": 0.2780791641551135, + "grad_norm": 7.078330190185484, + "learning_rate": 9.729505564113293e-06, + "loss": 19.7626, + "num_tokens": 72220690.0, + "step": 173 + }, + { + "epoch": 0.2796865581675708, + "grad_norm": 7.431322826847637, + "learning_rate": 9.725494976295278e-06, + "loss": 19.7024, + "num_tokens": 72625810.0, + "step": 174 + }, + { + "epoch": 0.2812939521800281, + "grad_norm": 7.584274311665085, + "learning_rate": 9.721455809727765e-06, + "loss": 17.4695, + "num_tokens": 73029978.0, + "step": 175 + }, + { + "epoch": 0.2829013461924854, + "grad_norm": 7.697886707731134, + "learning_rate": 9.717388091729284e-06, + "loss": 18.4573, + "num_tokens": 73425163.0, + "step": 176 + }, + { + "epoch": 0.2845087402049427, + "grad_norm": 7.346010692585071, + "learning_rate": 9.713291849811479e-06, + "loss": 20.8384, + "num_tokens": 73890759.0, + "step": 177 + }, + { + "epoch": 0.28611613421740006, + "grad_norm": 7.447209559779286, + "learning_rate": 9.709167111678907e-06, + "loss": 22.2751, + "num_tokens": 74306863.0, + "step": 178 + }, + { + "epoch": 0.28772352822985736, + "grad_norm": 8.162673514699478, + "learning_rate": 9.705013905228856e-06, + "loss": 17.2907, + "num_tokens": 74704214.0, + "step": 179 + }, + { + "epoch": 0.28933092224231466, + "grad_norm": 7.417415319554917, + "learning_rate": 9.700832258551159e-06, + "loss": 19.4978, + "num_tokens": 75117773.0, + "step": 180 + }, + { + "epoch": 0.29093831625477196, + "grad_norm": 7.447408494041398, + "learning_rate": 9.696622199928004e-06, + "loss": 18.3166, + "num_tokens": 75518720.0, + "step": 181 + }, + { + "epoch": 0.29254571026722925, + "grad_norm": 6.514743424439934, + "learning_rate": 9.692383757833735e-06, + "loss": 17.7785, + "num_tokens": 75916954.0, + "step": 182 + }, + { + "epoch": 0.29415310427968655, + "grad_norm": 7.461830865292821, + "learning_rate": 9.688116960934669e-06, + "loss": 20.6671, + "num_tokens": 76332264.0, + "step": 183 + }, + { + "epoch": 0.29576049829214385, + "grad_norm": 7.197610309871883, + "learning_rate": 9.683821838088899e-06, + "loss": 20.8839, + "num_tokens": 76784257.0, + "step": 184 + }, + { + "epoch": 0.29736789230460114, + "grad_norm": 8.41450318063421, + "learning_rate": 9.679498418346095e-06, + "loss": 21.2885, + "num_tokens": 77213436.0, + "step": 185 + }, + { + "epoch": 0.29897528631705844, + "grad_norm": 6.0923115962126735, + "learning_rate": 9.675146730947313e-06, + "loss": 18.2654, + "num_tokens": 77676398.0, + "step": 186 + }, + { + "epoch": 0.3005826803295158, + "grad_norm": 7.880304102845722, + "learning_rate": 9.67076680532479e-06, + "loss": 18.0728, + "num_tokens": 78046171.0, + "step": 187 + }, + { + "epoch": 0.3021900743419731, + "grad_norm": 6.562947544605057, + "learning_rate": 9.666358671101756e-06, + "loss": 18.0592, + "num_tokens": 78472781.0, + "step": 188 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 6.562642770325094, + "learning_rate": 9.661922358092225e-06, + "loss": 17.3253, + "num_tokens": 78849837.0, + "step": 189 + }, + { + "epoch": 0.3054048623668877, + "grad_norm": 7.541085882754768, + "learning_rate": 9.65745789630079e-06, + "loss": 20.1758, + "num_tokens": 79258318.0, + "step": 190 + }, + { + "epoch": 0.307012256379345, + "grad_norm": 6.8851240974984185, + "learning_rate": 9.652965315922439e-06, + "loss": 17.139, + "num_tokens": 79651227.0, + "step": 191 + }, + { + "epoch": 0.3086196503918023, + "grad_norm": 8.639358927353467, + "learning_rate": 9.648444647342322e-06, + "loss": 20.8575, + "num_tokens": 80099527.0, + "step": 192 + }, + { + "epoch": 0.3102270444042596, + "grad_norm": 7.1457137240101325, + "learning_rate": 9.643895921135573e-06, + "loss": 20.7304, + "num_tokens": 80506635.0, + "step": 193 + }, + { + "epoch": 0.3118344384167169, + "grad_norm": 6.912939313920804, + "learning_rate": 9.639319168067081e-06, + "loss": 19.2992, + "num_tokens": 80894502.0, + "step": 194 + }, + { + "epoch": 0.3134418324291742, + "grad_norm": 7.375881499528258, + "learning_rate": 9.634714419091302e-06, + "loss": 19.8441, + "num_tokens": 81288890.0, + "step": 195 + }, + { + "epoch": 0.3150492264416315, + "grad_norm": 7.9360770994041, + "learning_rate": 9.630081705352036e-06, + "loss": 17.0689, + "num_tokens": 81676277.0, + "step": 196 + }, + { + "epoch": 0.3166566204540888, + "grad_norm": 6.57596066613604, + "learning_rate": 9.625421058182217e-06, + "loss": 16.9561, + "num_tokens": 82111818.0, + "step": 197 + }, + { + "epoch": 0.3182640144665461, + "grad_norm": 7.0609841984405355, + "learning_rate": 9.620732509103705e-06, + "loss": 18.8317, + "num_tokens": 82505285.0, + "step": 198 + }, + { + "epoch": 0.3198714084790034, + "grad_norm": 7.250033653992877, + "learning_rate": 9.616016089827078e-06, + "loss": 19.6542, + "num_tokens": 82939903.0, + "step": 199 + }, + { + "epoch": 0.3214788024914607, + "grad_norm": 6.488828376879765, + "learning_rate": 9.611271832251404e-06, + "loss": 18.148, + "num_tokens": 83384349.0, + "step": 200 + }, + { + "epoch": 0.323086196503918, + "grad_norm": 6.872717204086992, + "learning_rate": 9.606499768464039e-06, + "loss": 17.6826, + "num_tokens": 83768254.0, + "step": 201 + }, + { + "epoch": 0.3246935905163753, + "grad_norm": 5.983730177844249, + "learning_rate": 9.601699930740396e-06, + "loss": 19.7038, + "num_tokens": 84222793.0, + "step": 202 + }, + { + "epoch": 0.3263009845288326, + "grad_norm": 6.956279028300537, + "learning_rate": 9.596872351543743e-06, + "loss": 18.6472, + "num_tokens": 84612372.0, + "step": 203 + }, + { + "epoch": 0.32790837854128996, + "grad_norm": 6.69380172514063, + "learning_rate": 9.592017063524971e-06, + "loss": 16.7571, + "num_tokens": 85013797.0, + "step": 204 + }, + { + "epoch": 0.32790837854128996, + "eval_loss": 1.0769612789154053, + "eval_num_tokens": 85013797.0, + "eval_runtime": 375.2004, + "eval_samples_per_second": 23.582, + "eval_steps_per_second": 5.896, + "step": 204 + }, + { + "epoch": 0.32951577255374725, + "grad_norm": 7.154299196749382, + "learning_rate": 9.587134099522375e-06, + "loss": 16.2566, + "num_tokens": 85432680.0, + "step": 205 + }, + { + "epoch": 0.33112316656620455, + "grad_norm": 7.371547167611947, + "learning_rate": 9.58222349256144e-06, + "loss": 18.5354, + "num_tokens": 85869670.0, + "step": 206 + }, + { + "epoch": 0.33273056057866185, + "grad_norm": 7.752263487419277, + "learning_rate": 9.577285275854602e-06, + "loss": 20.4482, + "num_tokens": 86281682.0, + "step": 207 + }, + { + "epoch": 0.33433795459111915, + "grad_norm": 7.196454751417397, + "learning_rate": 9.572319482801045e-06, + "loss": 18.612, + "num_tokens": 86695185.0, + "step": 208 + }, + { + "epoch": 0.33594534860357644, + "grad_norm": 7.044888565962969, + "learning_rate": 9.567326146986454e-06, + "loss": 19.4107, + "num_tokens": 87127148.0, + "step": 209 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 7.960426673656389, + "learning_rate": 9.562305302182799e-06, + "loss": 18.8625, + "num_tokens": 87502060.0, + "step": 210 + }, + { + "epoch": 0.33916013662849104, + "grad_norm": 7.452168075928625, + "learning_rate": 9.557256982348108e-06, + "loss": 20.5907, + "num_tokens": 87936764.0, + "step": 211 + }, + { + "epoch": 0.34076753064094834, + "grad_norm": 6.677183329558479, + "learning_rate": 9.55218122162623e-06, + "loss": 18.8752, + "num_tokens": 88358884.0, + "step": 212 + }, + { + "epoch": 0.3423749246534057, + "grad_norm": 7.634969629165308, + "learning_rate": 9.547078054346608e-06, + "loss": 18.471, + "num_tokens": 88804068.0, + "step": 213 + }, + { + "epoch": 0.343982318665863, + "grad_norm": 6.852457807898474, + "learning_rate": 9.54194751502405e-06, + "loss": 18.069, + "num_tokens": 89187281.0, + "step": 214 + }, + { + "epoch": 0.3455897126783203, + "grad_norm": 7.264410137049914, + "learning_rate": 9.53678963835849e-06, + "loss": 19.5953, + "num_tokens": 89635682.0, + "step": 215 + }, + { + "epoch": 0.3471971066907776, + "grad_norm": 7.813468748372866, + "learning_rate": 9.531604459234753e-06, + "loss": 18.4266, + "num_tokens": 90057815.0, + "step": 216 + }, + { + "epoch": 0.3488045007032349, + "grad_norm": 6.5184247889637374, + "learning_rate": 9.52639201272233e-06, + "loss": 16.9564, + "num_tokens": 90463302.0, + "step": 217 + }, + { + "epoch": 0.3504118947156922, + "grad_norm": 6.158294005351242, + "learning_rate": 9.521152334075122e-06, + "loss": 19.9221, + "num_tokens": 90908813.0, + "step": 218 + }, + { + "epoch": 0.35201928872814947, + "grad_norm": 6.8086131086160675, + "learning_rate": 9.515885458731222e-06, + "loss": 17.979, + "num_tokens": 91369479.0, + "step": 219 + }, + { + "epoch": 0.35362668274060677, + "grad_norm": 7.226086313992003, + "learning_rate": 9.510591422312656e-06, + "loss": 20.4586, + "num_tokens": 91808335.0, + "step": 220 + }, + { + "epoch": 0.3552340767530641, + "grad_norm": 7.8942269775312, + "learning_rate": 9.505270260625156e-06, + "loss": 18.5366, + "num_tokens": 92186814.0, + "step": 221 + }, + { + "epoch": 0.3568414707655214, + "grad_norm": 7.16916362192126, + "learning_rate": 9.499922009657918e-06, + "loss": 19.6004, + "num_tokens": 92562034.0, + "step": 222 + }, + { + "epoch": 0.3584488647779787, + "grad_norm": 7.910843576820844, + "learning_rate": 9.494546705583345e-06, + "loss": 20.1452, + "num_tokens": 92942825.0, + "step": 223 + }, + { + "epoch": 0.360056258790436, + "grad_norm": 6.123281830335913, + "learning_rate": 9.489144384756818e-06, + "loss": 18.5464, + "num_tokens": 93401780.0, + "step": 224 + }, + { + "epoch": 0.3616636528028933, + "grad_norm": 7.320127099168028, + "learning_rate": 9.483715083716437e-06, + "loss": 16.5952, + "num_tokens": 93790948.0, + "step": 225 + }, + { + "epoch": 0.3632710468153506, + "grad_norm": 6.7270726500691005, + "learning_rate": 9.47825883918279e-06, + "loss": 18.6214, + "num_tokens": 94223464.0, + "step": 226 + }, + { + "epoch": 0.3648784408278079, + "grad_norm": 6.558343699610045, + "learning_rate": 9.472775688058682e-06, + "loss": 20.1198, + "num_tokens": 94649721.0, + "step": 227 + }, + { + "epoch": 0.3664858348402652, + "grad_norm": 8.05506781014377, + "learning_rate": 9.467265667428907e-06, + "loss": 20.1726, + "num_tokens": 95045568.0, + "step": 228 + }, + { + "epoch": 0.3680932288527225, + "grad_norm": 7.244435880559629, + "learning_rate": 9.461728814559992e-06, + "loss": 18.2383, + "num_tokens": 95487076.0, + "step": 229 + }, + { + "epoch": 0.36970062286517985, + "grad_norm": 6.879561508738424, + "learning_rate": 9.456165166899926e-06, + "loss": 18.4286, + "num_tokens": 95900328.0, + "step": 230 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 6.793333498833771, + "learning_rate": 9.450574762077941e-06, + "loss": 17.74, + "num_tokens": 96312955.0, + "step": 231 + }, + { + "epoch": 0.37291541089009445, + "grad_norm": 7.097234265203366, + "learning_rate": 9.444957637904225e-06, + "loss": 19.8148, + "num_tokens": 96730746.0, + "step": 232 + }, + { + "epoch": 0.37452280490255174, + "grad_norm": 7.6161035672407875, + "learning_rate": 9.439313832369687e-06, + "loss": 17.4529, + "num_tokens": 97131314.0, + "step": 233 + }, + { + "epoch": 0.37613019891500904, + "grad_norm": 6.422099777836712, + "learning_rate": 9.433643383645687e-06, + "loss": 18.7281, + "num_tokens": 97610362.0, + "step": 234 + }, + { + "epoch": 0.37773759292746634, + "grad_norm": 6.803522737908095, + "learning_rate": 9.427946330083792e-06, + "loss": 20.1289, + "num_tokens": 98053126.0, + "step": 235 + }, + { + "epoch": 0.37934498693992363, + "grad_norm": 7.291122097729075, + "learning_rate": 9.422222710215498e-06, + "loss": 20.2248, + "num_tokens": 98486690.0, + "step": 236 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 6.895875762992409, + "learning_rate": 9.416472562751989e-06, + "loss": 20.0881, + "num_tokens": 98951232.0, + "step": 237 + }, + { + "epoch": 0.38255977496483823, + "grad_norm": 6.988555912006623, + "learning_rate": 9.410695926583863e-06, + "loss": 18.4754, + "num_tokens": 99360561.0, + "step": 238 + }, + { + "epoch": 0.3841671689772956, + "grad_norm": 7.495262644214178, + "learning_rate": 9.404892840780868e-06, + "loss": 19.7175, + "num_tokens": 99730797.0, + "step": 239 + }, + { + "epoch": 0.3857745629897529, + "grad_norm": 7.335735889253107, + "learning_rate": 9.39906334459165e-06, + "loss": 17.5894, + "num_tokens": 100160618.0, + "step": 240 + }, + { + "epoch": 0.3873819570022102, + "grad_norm": 7.365728812014083, + "learning_rate": 9.39320747744347e-06, + "loss": 17.1801, + "num_tokens": 100540884.0, + "step": 241 + }, + { + "epoch": 0.3889893510146675, + "grad_norm": 6.624559622989891, + "learning_rate": 9.387325278941952e-06, + "loss": 17.0029, + "num_tokens": 100947113.0, + "step": 242 + }, + { + "epoch": 0.39059674502712477, + "grad_norm": 7.31248518908536, + "learning_rate": 9.381416788870808e-06, + "loss": 18.2215, + "num_tokens": 101368621.0, + "step": 243 + }, + { + "epoch": 0.39220413903958207, + "grad_norm": 7.935011378128639, + "learning_rate": 9.375482047191574e-06, + "loss": 20.4694, + "num_tokens": 101769806.0, + "step": 244 + }, + { + "epoch": 0.39381153305203936, + "grad_norm": 6.272025753522187, + "learning_rate": 9.36952109404333e-06, + "loss": 20.1013, + "num_tokens": 102197728.0, + "step": 245 + }, + { + "epoch": 0.39541892706449666, + "grad_norm": 6.3201156161526475, + "learning_rate": 9.36353396974244e-06, + "loss": 17.4853, + "num_tokens": 102675009.0, + "step": 246 + }, + { + "epoch": 0.397026321076954, + "grad_norm": 6.902133443706521, + "learning_rate": 9.357520714782274e-06, + "loss": 16.9004, + "num_tokens": 103080176.0, + "step": 247 + }, + { + "epoch": 0.3986337150894113, + "grad_norm": 6.3677319865738395, + "learning_rate": 9.35148136983293e-06, + "loss": 19.1657, + "num_tokens": 103518516.0, + "step": 248 + }, + { + "epoch": 0.4002411091018686, + "grad_norm": 7.112622457343078, + "learning_rate": 9.345415975740967e-06, + "loss": 16.6237, + "num_tokens": 103890184.0, + "step": 249 + }, + { + "epoch": 0.4018485031143259, + "grad_norm": 6.680133928403349, + "learning_rate": 9.339324573529127e-06, + "loss": 18.3627, + "num_tokens": 104353765.0, + "step": 250 + }, + { + "epoch": 0.4034558971267832, + "grad_norm": 7.252376447814255, + "learning_rate": 9.333207204396048e-06, + "loss": 17.5351, + "num_tokens": 104741138.0, + "step": 251 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 6.91995372431353, + "learning_rate": 9.327063909716002e-06, + "loss": 19.2967, + "num_tokens": 105168371.0, + "step": 252 + }, + { + "epoch": 0.4066706851516978, + "grad_norm": 6.3327985557660735, + "learning_rate": 9.320894731038598e-06, + "loss": 17.5822, + "num_tokens": 105566942.0, + "step": 253 + }, + { + "epoch": 0.4082780791641551, + "grad_norm": 6.674083555809126, + "learning_rate": 9.314699710088517e-06, + "loss": 19.3624, + "num_tokens": 106003766.0, + "step": 254 + }, + { + "epoch": 0.4098854731766124, + "grad_norm": 7.057311823109091, + "learning_rate": 9.308478888765214e-06, + "loss": 18.5495, + "num_tokens": 106425189.0, + "step": 255 + }, + { + "epoch": 0.4098854731766124, + "eval_loss": 1.0559697151184082, + "eval_num_tokens": 106425189.0, + "eval_runtime": 375.0895, + "eval_samples_per_second": 23.589, + "eval_steps_per_second": 5.897, + "step": 255 + }, + { + "epoch": 0.41149286718906974, + "grad_norm": 7.2460359095743385, + "learning_rate": 9.30223230914265e-06, + "loss": 18.3957, + "num_tokens": 106841038.0, + "step": 256 + }, + { + "epoch": 0.41310026120152704, + "grad_norm": 8.044100798334306, + "learning_rate": 9.29596001346899e-06, + "loss": 20.2878, + "num_tokens": 107242296.0, + "step": 257 + }, + { + "epoch": 0.41470765521398434, + "grad_norm": 7.449695563310002, + "learning_rate": 9.289662044166337e-06, + "loss": 17.8153, + "num_tokens": 107659159.0, + "step": 258 + }, + { + "epoch": 0.41631504922644164, + "grad_norm": 6.8217596596960774, + "learning_rate": 9.283338443830433e-06, + "loss": 17.328, + "num_tokens": 108102454.0, + "step": 259 + }, + { + "epoch": 0.41792244323889893, + "grad_norm": 6.307416432136381, + "learning_rate": 9.276989255230368e-06, + "loss": 22.172, + "num_tokens": 108585780.0, + "step": 260 + }, + { + "epoch": 0.41952983725135623, + "grad_norm": 7.160952271686585, + "learning_rate": 9.2706145213083e-06, + "loss": 19.3086, + "num_tokens": 109027474.0, + "step": 261 + }, + { + "epoch": 0.4211372312638135, + "grad_norm": 6.8310241770486515, + "learning_rate": 9.264214285179161e-06, + "loss": 18.4956, + "num_tokens": 109484031.0, + "step": 262 + }, + { + "epoch": 0.4227446252762708, + "grad_norm": 7.034832857504743, + "learning_rate": 9.257788590130365e-06, + "loss": 18.339, + "num_tokens": 109900207.0, + "step": 263 + }, + { + "epoch": 0.4243520192887282, + "grad_norm": 6.688989423505785, + "learning_rate": 9.251337479621512e-06, + "loss": 17.1802, + "num_tokens": 110291460.0, + "step": 264 + }, + { + "epoch": 0.4259594133011855, + "grad_norm": 6.619417608108093, + "learning_rate": 9.244860997284097e-06, + "loss": 19.3084, + "num_tokens": 110699704.0, + "step": 265 + }, + { + "epoch": 0.42756680731364277, + "grad_norm": 7.056189887025502, + "learning_rate": 9.238359186921222e-06, + "loss": 17.4952, + "num_tokens": 111094610.0, + "step": 266 + }, + { + "epoch": 0.42917420132610007, + "grad_norm": 7.880307587915931, + "learning_rate": 9.231832092507283e-06, + "loss": 17.2421, + "num_tokens": 111473881.0, + "step": 267 + }, + { + "epoch": 0.43078159533855737, + "grad_norm": 7.462496868670285, + "learning_rate": 9.22527975818769e-06, + "loss": 18.4837, + "num_tokens": 111892626.0, + "step": 268 + }, + { + "epoch": 0.43238898935101466, + "grad_norm": 6.934754093281141, + "learning_rate": 9.21870222827856e-06, + "loss": 17.7667, + "num_tokens": 112303268.0, + "step": 269 + }, + { + "epoch": 0.43399638336347196, + "grad_norm": 7.518017206851831, + "learning_rate": 9.21209954726641e-06, + "loss": 19.428, + "num_tokens": 112707646.0, + "step": 270 + }, + { + "epoch": 0.43560377737592926, + "grad_norm": 7.050655510301953, + "learning_rate": 9.205471759807875e-06, + "loss": 16.6039, + "num_tokens": 113109058.0, + "step": 271 + }, + { + "epoch": 0.43721117138838655, + "grad_norm": 6.957698291243739, + "learning_rate": 9.198818910729388e-06, + "loss": 18.8574, + "num_tokens": 113527822.0, + "step": 272 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 7.209626495421374, + "learning_rate": 9.192141045026888e-06, + "loss": 19.0127, + "num_tokens": 113910126.0, + "step": 273 + }, + { + "epoch": 0.4404259594133012, + "grad_norm": 7.313001822737551, + "learning_rate": 9.18543820786551e-06, + "loss": 18.3323, + "num_tokens": 114341936.0, + "step": 274 + }, + { + "epoch": 0.4420333534257585, + "grad_norm": 7.757139211998271, + "learning_rate": 9.178710444579277e-06, + "loss": 17.5366, + "num_tokens": 114753547.0, + "step": 275 + }, + { + "epoch": 0.4436407474382158, + "grad_norm": 7.35912287113386, + "learning_rate": 9.17195780067081e-06, + "loss": 15.2613, + "num_tokens": 115124756.0, + "step": 276 + }, + { + "epoch": 0.4452481414506731, + "grad_norm": 7.765786932004798, + "learning_rate": 9.165180321810992e-06, + "loss": 21.3476, + "num_tokens": 115544617.0, + "step": 277 + }, + { + "epoch": 0.4468555354631304, + "grad_norm": 6.436029792568038, + "learning_rate": 9.158378053838685e-06, + "loss": 16.9316, + "num_tokens": 115961832.0, + "step": 278 + }, + { + "epoch": 0.4484629294755877, + "grad_norm": 7.168381322194868, + "learning_rate": 9.151551042760408e-06, + "loss": 19.9864, + "num_tokens": 116365769.0, + "step": 279 + }, + { + "epoch": 0.450070323488045, + "grad_norm": 6.940173594967811, + "learning_rate": 9.144699334750028e-06, + "loss": 16.9343, + "num_tokens": 116808705.0, + "step": 280 + }, + { + "epoch": 0.4516777175005023, + "grad_norm": 6.93926501541659, + "learning_rate": 9.137822976148444e-06, + "loss": 17.2703, + "num_tokens": 117238476.0, + "step": 281 + }, + { + "epoch": 0.45328511151295964, + "grad_norm": 8.35625139268027, + "learning_rate": 9.13092201346328e-06, + "loss": 17.7834, + "num_tokens": 117596498.0, + "step": 282 + }, + { + "epoch": 0.45489250552541693, + "grad_norm": 6.821144223162512, + "learning_rate": 9.12399649336857e-06, + "loss": 18.0928, + "num_tokens": 118009049.0, + "step": 283 + }, + { + "epoch": 0.45649989953787423, + "grad_norm": 7.843401598656151, + "learning_rate": 9.117046462704436e-06, + "loss": 18.7504, + "num_tokens": 118390987.0, + "step": 284 + }, + { + "epoch": 0.45810729355033153, + "grad_norm": 6.816180213268656, + "learning_rate": 9.110071968476774e-06, + "loss": 15.0723, + "num_tokens": 118768352.0, + "step": 285 + }, + { + "epoch": 0.4597146875627888, + "grad_norm": 7.900695725507185, + "learning_rate": 9.10307305785694e-06, + "loss": 19.0988, + "num_tokens": 119162382.0, + "step": 286 + }, + { + "epoch": 0.4613220815752461, + "grad_norm": 8.05653941875474, + "learning_rate": 9.096049778181427e-06, + "loss": 16.9535, + "num_tokens": 119548757.0, + "step": 287 + }, + { + "epoch": 0.4629294755877034, + "grad_norm": 8.50077883070157, + "learning_rate": 9.089002176951545e-06, + "loss": 18.7442, + "num_tokens": 119936224.0, + "step": 288 + }, + { + "epoch": 0.4645368696001607, + "grad_norm": 8.176178108742084, + "learning_rate": 9.081930301833102e-06, + "loss": 18.135, + "num_tokens": 120294879.0, + "step": 289 + }, + { + "epoch": 0.46614426361261807, + "grad_norm": 8.139290862663955, + "learning_rate": 9.07483420065608e-06, + "loss": 18.1435, + "num_tokens": 120686511.0, + "step": 290 + }, + { + "epoch": 0.46775165762507537, + "grad_norm": 8.261957084757622, + "learning_rate": 9.067713921414314e-06, + "loss": 17.5761, + "num_tokens": 121102037.0, + "step": 291 + }, + { + "epoch": 0.46935905163753266, + "grad_norm": 8.243793152644082, + "learning_rate": 9.060569512265157e-06, + "loss": 19.812, + "num_tokens": 121542139.0, + "step": 292 + }, + { + "epoch": 0.47096644564998996, + "grad_norm": 6.69877159524607, + "learning_rate": 9.053401021529172e-06, + "loss": 18.9699, + "num_tokens": 121917347.0, + "step": 293 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 6.458637401485486, + "learning_rate": 9.046208497689792e-06, + "loss": 17.0525, + "num_tokens": 122320053.0, + "step": 294 + }, + { + "epoch": 0.47418123367490456, + "grad_norm": 6.905817375644184, + "learning_rate": 9.038991989392992e-06, + "loss": 18.9022, + "num_tokens": 122735508.0, + "step": 295 + }, + { + "epoch": 0.47578862768736185, + "grad_norm": 7.1759616747180655, + "learning_rate": 9.031751545446971e-06, + "loss": 18.4757, + "num_tokens": 123168338.0, + "step": 296 + }, + { + "epoch": 0.47739602169981915, + "grad_norm": 7.981631162216954, + "learning_rate": 9.024487214821807e-06, + "loss": 19.1257, + "num_tokens": 123610625.0, + "step": 297 + }, + { + "epoch": 0.47900341571227645, + "grad_norm": 8.367696357993415, + "learning_rate": 9.01719904664914e-06, + "loss": 19.1291, + "num_tokens": 124021285.0, + "step": 298 + }, + { + "epoch": 0.4806108097247338, + "grad_norm": 6.98931112297909, + "learning_rate": 9.009887090221829e-06, + "loss": 19.2753, + "num_tokens": 124470027.0, + "step": 299 + }, + { + "epoch": 0.4822182037371911, + "grad_norm": 5.8576978423122945, + "learning_rate": 9.002551394993622e-06, + "loss": 16.9104, + "num_tokens": 124874169.0, + "step": 300 + }, + { + "epoch": 0.4838255977496484, + "grad_norm": 6.703086687180103, + "learning_rate": 8.995192010578826e-06, + "loss": 17.509, + "num_tokens": 125277701.0, + "step": 301 + }, + { + "epoch": 0.4854329917621057, + "grad_norm": 7.1696037116910905, + "learning_rate": 8.987808986751964e-06, + "loss": 19.7357, + "num_tokens": 125713976.0, + "step": 302 + }, + { + "epoch": 0.487040385774563, + "grad_norm": 7.223007452575657, + "learning_rate": 8.980402373447446e-06, + "loss": 19.4791, + "num_tokens": 126184993.0, + "step": 303 + }, + { + "epoch": 0.4886477797870203, + "grad_norm": 6.4091544592310985, + "learning_rate": 8.972972220759224e-06, + "loss": 18.3364, + "num_tokens": 126619669.0, + "step": 304 + }, + { + "epoch": 0.4902551737994776, + "grad_norm": 6.378348403507937, + "learning_rate": 8.965518578940457e-06, + "loss": 19.5084, + "num_tokens": 127028017.0, + "step": 305 + }, + { + "epoch": 0.4918625678119349, + "grad_norm": 6.463856813882992, + "learning_rate": 8.958041498403168e-06, + "loss": 16.9806, + "num_tokens": 127448351.0, + "step": 306 + }, + { + "epoch": 0.4918625678119349, + "eval_loss": 1.0364066362380981, + "eval_num_tokens": 127448351.0, + "eval_runtime": 375.1984, + "eval_samples_per_second": 23.582, + "eval_steps_per_second": 5.896, + "step": 306 + }, + { + "epoch": 0.4934699618243922, + "grad_norm": 7.443739145802105, + "learning_rate": 8.950541029717913e-06, + "loss": 19.1327, + "num_tokens": 127879531.0, + "step": 307 + }, + { + "epoch": 0.49507735583684953, + "grad_norm": 8.013368118310533, + "learning_rate": 8.943017223613426e-06, + "loss": 16.6552, + "num_tokens": 128208302.0, + "step": 308 + }, + { + "epoch": 0.49668474984930683, + "grad_norm": 7.323746122785035, + "learning_rate": 8.935470130976282e-06, + "loss": 17.9683, + "num_tokens": 128644232.0, + "step": 309 + }, + { + "epoch": 0.4982921438617641, + "grad_norm": 6.903188449101386, + "learning_rate": 8.927899802850552e-06, + "loss": 16.9984, + "num_tokens": 129034843.0, + "step": 310 + }, + { + "epoch": 0.4998995378742214, + "grad_norm": 7.191829600830773, + "learning_rate": 8.920306290437463e-06, + "loss": 18.2013, + "num_tokens": 129423951.0, + "step": 311 + }, + { + "epoch": 0.5015069318866787, + "grad_norm": 6.829266444459179, + "learning_rate": 8.91268964509504e-06, + "loss": 20.492, + "num_tokens": 129825111.0, + "step": 312 + }, + { + "epoch": 0.5031143258991361, + "grad_norm": 7.120552045796696, + "learning_rate": 8.905049918337767e-06, + "loss": 20.3761, + "num_tokens": 130238330.0, + "step": 313 + }, + { + "epoch": 0.5047217199115933, + "grad_norm": 6.251980949336462, + "learning_rate": 8.897387161836241e-06, + "loss": 17.385, + "num_tokens": 130711423.0, + "step": 314 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 7.0860028810232505, + "learning_rate": 8.889701427416816e-06, + "loss": 21.7024, + "num_tokens": 131144059.0, + "step": 315 + }, + { + "epoch": 0.5079365079365079, + "grad_norm": 7.809882082078462, + "learning_rate": 8.881992767061252e-06, + "loss": 17.6591, + "num_tokens": 131540140.0, + "step": 316 + }, + { + "epoch": 0.5095439019489653, + "grad_norm": 6.286794090533395, + "learning_rate": 8.874261232906375e-06, + "loss": 18.5354, + "num_tokens": 131963901.0, + "step": 317 + }, + { + "epoch": 0.5111512959614225, + "grad_norm": 7.189507987810574, + "learning_rate": 8.866506877243706e-06, + "loss": 16.453, + "num_tokens": 132358532.0, + "step": 318 + }, + { + "epoch": 0.5127586899738799, + "grad_norm": 7.041714274912887, + "learning_rate": 8.858729752519122e-06, + "loss": 18.4971, + "num_tokens": 132744264.0, + "step": 319 + }, + { + "epoch": 0.5143660839863371, + "grad_norm": 7.57234603252408, + "learning_rate": 8.850929911332497e-06, + "loss": 17.8847, + "num_tokens": 133139458.0, + "step": 320 + }, + { + "epoch": 0.5159734779987944, + "grad_norm": 6.61715798097087, + "learning_rate": 8.843107406437344e-06, + "loss": 20.3427, + "num_tokens": 133570738.0, + "step": 321 + }, + { + "epoch": 0.5175808720112518, + "grad_norm": 6.493519522441878, + "learning_rate": 8.835262290740464e-06, + "loss": 17.3093, + "num_tokens": 134001828.0, + "step": 322 + }, + { + "epoch": 0.519188266023709, + "grad_norm": 7.5117135390138, + "learning_rate": 8.827394617301577e-06, + "loss": 21.0569, + "num_tokens": 134440213.0, + "step": 323 + }, + { + "epoch": 0.5207956600361664, + "grad_norm": 7.881194562171503, + "learning_rate": 8.819504439332974e-06, + "loss": 15.0986, + "num_tokens": 134767371.0, + "step": 324 + }, + { + "epoch": 0.5224030540486236, + "grad_norm": 7.008493886688543, + "learning_rate": 8.811591810199156e-06, + "loss": 16.9336, + "num_tokens": 135186817.0, + "step": 325 + }, + { + "epoch": 0.524010448061081, + "grad_norm": 7.176035537885473, + "learning_rate": 8.803656783416465e-06, + "loss": 20.4995, + "num_tokens": 135577779.0, + "step": 326 + }, + { + "epoch": 0.5256178420735382, + "grad_norm": 7.441146233333041, + "learning_rate": 8.795699412652732e-06, + "loss": 17.9565, + "num_tokens": 135985434.0, + "step": 327 + }, + { + "epoch": 0.5272252360859956, + "grad_norm": 7.910904475660442, + "learning_rate": 8.787719751726907e-06, + "loss": 19.158, + "num_tokens": 136402897.0, + "step": 328 + }, + { + "epoch": 0.5288326300984529, + "grad_norm": 7.7098330013459, + "learning_rate": 8.779717854608696e-06, + "loss": 20.1018, + "num_tokens": 136781029.0, + "step": 329 + }, + { + "epoch": 0.5304400241109102, + "grad_norm": 7.925190671601508, + "learning_rate": 8.771693775418206e-06, + "loss": 18.1571, + "num_tokens": 137163591.0, + "step": 330 + }, + { + "epoch": 0.5320474181233675, + "grad_norm": 6.641577730288724, + "learning_rate": 8.763647568425558e-06, + "loss": 16.1438, + "num_tokens": 137576805.0, + "step": 331 + }, + { + "epoch": 0.5336548121358248, + "grad_norm": 7.201035246002275, + "learning_rate": 8.75557928805054e-06, + "loss": 14.6034, + "num_tokens": 137959576.0, + "step": 332 + }, + { + "epoch": 0.5352622061482821, + "grad_norm": 6.006007754299917, + "learning_rate": 8.747488988862232e-06, + "loss": 17.1609, + "num_tokens": 138364019.0, + "step": 333 + }, + { + "epoch": 0.5368696001607394, + "grad_norm": 6.897108023495821, + "learning_rate": 8.73937672557863e-06, + "loss": 18.3156, + "num_tokens": 138797654.0, + "step": 334 + }, + { + "epoch": 0.5384769941731967, + "grad_norm": 7.292824707276435, + "learning_rate": 8.731242553066287e-06, + "loss": 17.9008, + "num_tokens": 139182693.0, + "step": 335 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 7.517185968179967, + "learning_rate": 8.723086526339939e-06, + "loss": 19.8292, + "num_tokens": 139621700.0, + "step": 336 + }, + { + "epoch": 0.5416917821981113, + "grad_norm": 6.955623557938774, + "learning_rate": 8.714908700562127e-06, + "loss": 18.7442, + "num_tokens": 140042668.0, + "step": 337 + }, + { + "epoch": 0.5432991762105687, + "grad_norm": 7.128576459466886, + "learning_rate": 8.706709131042828e-06, + "loss": 16.9151, + "num_tokens": 140419502.0, + "step": 338 + }, + { + "epoch": 0.5449065702230259, + "grad_norm": 6.2932130943417395, + "learning_rate": 8.69848787323908e-06, + "loss": 17.5391, + "num_tokens": 140852433.0, + "step": 339 + }, + { + "epoch": 0.5465139642354833, + "grad_norm": 6.320249471771769, + "learning_rate": 8.690244982754614e-06, + "loss": 19.6873, + "num_tokens": 141315581.0, + "step": 340 + }, + { + "epoch": 0.5481213582479405, + "grad_norm": 7.2966606400391925, + "learning_rate": 8.681980515339464e-06, + "loss": 16.579, + "num_tokens": 141721754.0, + "step": 341 + }, + { + "epoch": 0.5497287522603979, + "grad_norm": 8.434663540591771, + "learning_rate": 8.673694526889604e-06, + "loss": 18.3689, + "num_tokens": 142095306.0, + "step": 342 + }, + { + "epoch": 0.5513361462728551, + "grad_norm": 6.575713685393011, + "learning_rate": 8.665387073446557e-06, + "loss": 14.8495, + "num_tokens": 142512596.0, + "step": 343 + }, + { + "epoch": 0.5529435402853125, + "grad_norm": 7.739713891454046, + "learning_rate": 8.657058211197027e-06, + "loss": 18.3234, + "num_tokens": 142939289.0, + "step": 344 + }, + { + "epoch": 0.5545509342977697, + "grad_norm": 6.541818200918226, + "learning_rate": 8.648707996472514e-06, + "loss": 19.2383, + "num_tokens": 143395457.0, + "step": 345 + }, + { + "epoch": 0.556158328310227, + "grad_norm": 6.982321055915739, + "learning_rate": 8.640336485748932e-06, + "loss": 17.3277, + "num_tokens": 143817969.0, + "step": 346 + }, + { + "epoch": 0.5577657223226844, + "grad_norm": 6.614891020040069, + "learning_rate": 8.631943735646231e-06, + "loss": 17.6326, + "num_tokens": 144248659.0, + "step": 347 + }, + { + "epoch": 0.5593731163351416, + "grad_norm": 7.531914301645856, + "learning_rate": 8.623529802928008e-06, + "loss": 17.7754, + "num_tokens": 144680954.0, + "step": 348 + }, + { + "epoch": 0.560980510347599, + "grad_norm": 6.42640987272037, + "learning_rate": 8.615094744501132e-06, + "loss": 19.3656, + "num_tokens": 145104733.0, + "step": 349 + }, + { + "epoch": 0.5625879043600562, + "grad_norm": 7.342639540613883, + "learning_rate": 8.606638617415348e-06, + "loss": 17.4932, + "num_tokens": 145532678.0, + "step": 350 + }, + { + "epoch": 0.5641952983725136, + "grad_norm": 8.095679435104648, + "learning_rate": 8.5981614788629e-06, + "loss": 19.3919, + "num_tokens": 145960535.0, + "step": 351 + }, + { + "epoch": 0.5658026923849708, + "grad_norm": 6.634472412637926, + "learning_rate": 8.58966338617814e-06, + "loss": 19.1525, + "num_tokens": 146389287.0, + "step": 352 + }, + { + "epoch": 0.5674100863974282, + "grad_norm": 6.3373007138085615, + "learning_rate": 8.581144396837145e-06, + "loss": 17.4997, + "num_tokens": 146823435.0, + "step": 353 + }, + { + "epoch": 0.5690174804098854, + "grad_norm": 6.598234456219753, + "learning_rate": 8.572604568457317e-06, + "loss": 19.3919, + "num_tokens": 147236962.0, + "step": 354 + }, + { + "epoch": 0.5706248744223428, + "grad_norm": 6.904527217895117, + "learning_rate": 8.564043958797009e-06, + "loss": 17.5931, + "num_tokens": 147602708.0, + "step": 355 + }, + { + "epoch": 0.5722322684348001, + "grad_norm": 7.982491517584634, + "learning_rate": 8.555462625755123e-06, + "loss": 18.4804, + "num_tokens": 147990396.0, + "step": 356 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 7.623521727271188, + "learning_rate": 8.546860627370719e-06, + "loss": 17.4104, + "num_tokens": 148388833.0, + "step": 357 + }, + { + "epoch": 0.5738396624472574, + "eval_loss": 1.018517255783081, + "eval_num_tokens": 148388833.0, + "eval_runtime": 375.3004, + "eval_samples_per_second": 23.576, + "eval_steps_per_second": 5.894, + "step": 357 + }, + { + "epoch": 0.5754470564597147, + "grad_norm": 6.994483443676211, + "learning_rate": 8.53823802182263e-06, + "loss": 15.9082, + "num_tokens": 148793930.0, + "step": 358 + }, + { + "epoch": 0.577054450472172, + "grad_norm": 6.853290667540901, + "learning_rate": 8.52959486742906e-06, + "loss": 17.6866, + "num_tokens": 149188554.0, + "step": 359 + }, + { + "epoch": 0.5786618444846293, + "grad_norm": 6.9892789322000395, + "learning_rate": 8.520931222647196e-06, + "loss": 17.4, + "num_tokens": 149609271.0, + "step": 360 + }, + { + "epoch": 0.5802692384970866, + "grad_norm": 6.502922818180786, + "learning_rate": 8.512247146072807e-06, + "loss": 18.2203, + "num_tokens": 150059531.0, + "step": 361 + }, + { + "epoch": 0.5818766325095439, + "grad_norm": 6.151274509437495, + "learning_rate": 8.503542696439855e-06, + "loss": 18.9628, + "num_tokens": 150479022.0, + "step": 362 + }, + { + "epoch": 0.5834840265220012, + "grad_norm": 6.365082906106696, + "learning_rate": 8.494817932620087e-06, + "loss": 16.7724, + "num_tokens": 150898540.0, + "step": 363 + }, + { + "epoch": 0.5850914205344585, + "grad_norm": 7.210418685904426, + "learning_rate": 8.486072913622646e-06, + "loss": 14.6611, + "num_tokens": 151320271.0, + "step": 364 + }, + { + "epoch": 0.5866988145469159, + "grad_norm": 6.644884346721684, + "learning_rate": 8.477307698593674e-06, + "loss": 16.7516, + "num_tokens": 151726126.0, + "step": 365 + }, + { + "epoch": 0.5883062085593731, + "grad_norm": 7.139586330684278, + "learning_rate": 8.4685223468159e-06, + "loss": 20.3153, + "num_tokens": 152115374.0, + "step": 366 + }, + { + "epoch": 0.5899136025718305, + "grad_norm": 7.522053723144863, + "learning_rate": 8.459716917708248e-06, + "loss": 17.1238, + "num_tokens": 152513142.0, + "step": 367 + }, + { + "epoch": 0.5915209965842877, + "grad_norm": 7.353954131348641, + "learning_rate": 8.450891470825435e-06, + "loss": 19.6501, + "num_tokens": 152924348.0, + "step": 368 + }, + { + "epoch": 0.593128390596745, + "grad_norm": 6.8156104869933065, + "learning_rate": 8.442046065857564e-06, + "loss": 18.2348, + "num_tokens": 153315844.0, + "step": 369 + }, + { + "epoch": 0.5947357846092023, + "grad_norm": 6.7811743460299985, + "learning_rate": 8.433180762629727e-06, + "loss": 16.7336, + "num_tokens": 153698494.0, + "step": 370 + }, + { + "epoch": 0.5963431786216596, + "grad_norm": 6.959644467287595, + "learning_rate": 8.42429562110159e-06, + "loss": 17.9627, + "num_tokens": 154135423.0, + "step": 371 + }, + { + "epoch": 0.5979505726341169, + "grad_norm": 7.097809496751988, + "learning_rate": 8.415390701366996e-06, + "loss": 17.9886, + "num_tokens": 154532465.0, + "step": 372 + }, + { + "epoch": 0.5995579666465742, + "grad_norm": 7.463535112165306, + "learning_rate": 8.406466063653559e-06, + "loss": 18.719, + "num_tokens": 154937881.0, + "step": 373 + }, + { + "epoch": 0.6011653606590316, + "grad_norm": 7.437517682563063, + "learning_rate": 8.397521768322251e-06, + "loss": 17.4829, + "num_tokens": 155323002.0, + "step": 374 + }, + { + "epoch": 0.6027727546714888, + "grad_norm": 7.142340952419528, + "learning_rate": 8.388557875866996e-06, + "loss": 17.8825, + "num_tokens": 155718462.0, + "step": 375 + }, + { + "epoch": 0.6043801486839462, + "grad_norm": 6.959584793076564, + "learning_rate": 8.379574446914264e-06, + "loss": 17.1849, + "num_tokens": 156112815.0, + "step": 376 + }, + { + "epoch": 0.6059875426964034, + "grad_norm": 6.23093566443086, + "learning_rate": 8.370571542222652e-06, + "loss": 19.6748, + "num_tokens": 156587059.0, + "step": 377 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 6.290542807887182, + "learning_rate": 8.361549222682489e-06, + "loss": 17.9232, + "num_tokens": 157026982.0, + "step": 378 + }, + { + "epoch": 0.609202330721318, + "grad_norm": 6.391640594986104, + "learning_rate": 8.352507549315407e-06, + "loss": 16.9869, + "num_tokens": 157426992.0, + "step": 379 + }, + { + "epoch": 0.6108097247337754, + "grad_norm": 6.70422507417715, + "learning_rate": 8.34344658327394e-06, + "loss": 17.4459, + "num_tokens": 157811261.0, + "step": 380 + }, + { + "epoch": 0.6124171187462327, + "grad_norm": 7.298181689170068, + "learning_rate": 8.334366385841103e-06, + "loss": 15.7071, + "num_tokens": 158175074.0, + "step": 381 + }, + { + "epoch": 0.61402451275869, + "grad_norm": 6.542641733307577, + "learning_rate": 8.325267018429983e-06, + "loss": 20.6984, + "num_tokens": 158634889.0, + "step": 382 + }, + { + "epoch": 0.6156319067711473, + "grad_norm": 7.388362366666688, + "learning_rate": 8.316148542583319e-06, + "loss": 19.369, + "num_tokens": 159028014.0, + "step": 383 + }, + { + "epoch": 0.6172393007836046, + "grad_norm": 7.037760703926059, + "learning_rate": 8.307011019973095e-06, + "loss": 18.6609, + "num_tokens": 159433565.0, + "step": 384 + }, + { + "epoch": 0.6188466947960619, + "grad_norm": 6.525835718076723, + "learning_rate": 8.297854512400104e-06, + "loss": 18.8001, + "num_tokens": 159869773.0, + "step": 385 + }, + { + "epoch": 0.6204540888085192, + "grad_norm": 5.758243901603582, + "learning_rate": 8.288679081793555e-06, + "loss": 17.5763, + "num_tokens": 160311360.0, + "step": 386 + }, + { + "epoch": 0.6220614828209765, + "grad_norm": 7.540284714230129, + "learning_rate": 8.279484790210632e-06, + "loss": 20.2912, + "num_tokens": 160746202.0, + "step": 387 + }, + { + "epoch": 0.6236688768334337, + "grad_norm": 6.904262696146377, + "learning_rate": 8.270271699836085e-06, + "loss": 17.6563, + "num_tokens": 161169144.0, + "step": 388 + }, + { + "epoch": 0.6252762708458911, + "grad_norm": 7.344407892248627, + "learning_rate": 8.261039872981816e-06, + "loss": 19.7522, + "num_tokens": 161634081.0, + "step": 389 + }, + { + "epoch": 0.6268836648583485, + "grad_norm": 6.214731648044269, + "learning_rate": 8.251789372086437e-06, + "loss": 18.3555, + "num_tokens": 162067745.0, + "step": 390 + }, + { + "epoch": 0.6284910588708057, + "grad_norm": 6.267332247797748, + "learning_rate": 8.24252025971487e-06, + "loss": 16.0733, + "num_tokens": 162501629.0, + "step": 391 + }, + { + "epoch": 0.630098452883263, + "grad_norm": 6.201491529077386, + "learning_rate": 8.233232598557906e-06, + "loss": 16.952, + "num_tokens": 162943979.0, + "step": 392 + }, + { + "epoch": 0.6317058468957203, + "grad_norm": 6.224174530659254, + "learning_rate": 8.223926451431798e-06, + "loss": 17.2143, + "num_tokens": 163359066.0, + "step": 393 + }, + { + "epoch": 0.6333132409081776, + "grad_norm": 8.414978026834305, + "learning_rate": 8.21460188127782e-06, + "loss": 19.9326, + "num_tokens": 163719744.0, + "step": 394 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 6.489103592715119, + "learning_rate": 8.205258951161853e-06, + "loss": 17.1812, + "num_tokens": 164109547.0, + "step": 395 + }, + { + "epoch": 0.6365280289330922, + "grad_norm": 6.427968041982579, + "learning_rate": 8.195897724273949e-06, + "loss": 20.6841, + "num_tokens": 164616226.0, + "step": 396 + }, + { + "epoch": 0.6381354229455495, + "grad_norm": 8.050967870498752, + "learning_rate": 8.186518263927912e-06, + "loss": 18.7604, + "num_tokens": 165016852.0, + "step": 397 + }, + { + "epoch": 0.6397428169580068, + "grad_norm": 5.895546707846452, + "learning_rate": 8.177120633560866e-06, + "loss": 20.6794, + "num_tokens": 165473490.0, + "step": 398 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 5.954801120820722, + "learning_rate": 8.167704896732828e-06, + "loss": 18.8154, + "num_tokens": 165972187.0, + "step": 399 + }, + { + "epoch": 0.6429576049829214, + "grad_norm": 6.100604749806023, + "learning_rate": 8.158271117126273e-06, + "loss": 14.0513, + "num_tokens": 166332405.0, + "step": 400 + }, + { + "epoch": 0.6445649989953788, + "grad_norm": 6.022676180131037, + "learning_rate": 8.148819358545707e-06, + "loss": 19.8564, + "num_tokens": 166760445.0, + "step": 401 + }, + { + "epoch": 0.646172393007836, + "grad_norm": 7.720279742692583, + "learning_rate": 8.139349684917237e-06, + "loss": 17.4229, + "num_tokens": 167147449.0, + "step": 402 + }, + { + "epoch": 0.6477797870202934, + "grad_norm": 6.302752129071361, + "learning_rate": 8.129862160288138e-06, + "loss": 15.3328, + "num_tokens": 167595838.0, + "step": 403 + }, + { + "epoch": 0.6493871810327506, + "grad_norm": 6.510760363856386, + "learning_rate": 8.120356848826413e-06, + "loss": 17.8145, + "num_tokens": 168013738.0, + "step": 404 + }, + { + "epoch": 0.650994575045208, + "grad_norm": 6.870001355757132, + "learning_rate": 8.110833814820371e-06, + "loss": 15.4921, + "num_tokens": 168406715.0, + "step": 405 + }, + { + "epoch": 0.6526019690576652, + "grad_norm": 7.015214723334777, + "learning_rate": 8.101293122678183e-06, + "loss": 18.3763, + "num_tokens": 168805606.0, + "step": 406 + }, + { + "epoch": 0.6542093630701226, + "grad_norm": 8.313828017687712, + "learning_rate": 8.091734836927447e-06, + "loss": 18.5899, + "num_tokens": 169188752.0, + "step": 407 + }, + { + "epoch": 0.6558167570825799, + "grad_norm": 6.189536355283217, + "learning_rate": 8.082159022214759e-06, + "loss": 18.4882, + "num_tokens": 169661272.0, + "step": 408 + }, + { + "epoch": 0.6558167570825799, + "eval_loss": 1.0036649703979492, + "eval_num_tokens": 169661272.0, + "eval_runtime": 375.2485, + "eval_samples_per_second": 23.579, + "eval_steps_per_second": 5.895, + "step": 408 + }, + { + "epoch": 0.6574241510950372, + "grad_norm": 6.682278240793301, + "learning_rate": 8.072565743305264e-06, + "loss": 16.5694, + "num_tokens": 170117141.0, + "step": 409 + }, + { + "epoch": 0.6590315451074945, + "grad_norm": 7.099249460073817, + "learning_rate": 8.062955065082233e-06, + "loss": 18.8761, + "num_tokens": 170563774.0, + "step": 410 + }, + { + "epoch": 0.6606389391199518, + "grad_norm": 8.130971796121838, + "learning_rate": 8.053327052546606e-06, + "loss": 16.5009, + "num_tokens": 170934739.0, + "step": 411 + }, + { + "epoch": 0.6622463331324091, + "grad_norm": 7.375429128964028, + "learning_rate": 8.043681770816569e-06, + "loss": 16.7288, + "num_tokens": 171338738.0, + "step": 412 + }, + { + "epoch": 0.6638537271448663, + "grad_norm": 7.374811381505208, + "learning_rate": 8.034019285127106e-06, + "loss": 17.8787, + "num_tokens": 171783809.0, + "step": 413 + }, + { + "epoch": 0.6654611211573237, + "grad_norm": 6.557006090989057, + "learning_rate": 8.024339660829553e-06, + "loss": 16.9616, + "num_tokens": 172219480.0, + "step": 414 + }, + { + "epoch": 0.6670685151697809, + "grad_norm": 6.288452541391747, + "learning_rate": 8.014642963391169e-06, + "loss": 17.0935, + "num_tokens": 172673490.0, + "step": 415 + }, + { + "epoch": 0.6686759091822383, + "grad_norm": 7.085432171605262, + "learning_rate": 8.004929258394676e-06, + "loss": 17.6836, + "num_tokens": 173109293.0, + "step": 416 + }, + { + "epoch": 0.6702833031946956, + "grad_norm": 6.869949560487559, + "learning_rate": 7.995198611537839e-06, + "loss": 17.8071, + "num_tokens": 173530200.0, + "step": 417 + }, + { + "epoch": 0.6718906972071529, + "grad_norm": 7.142196718449905, + "learning_rate": 7.985451088632993e-06, + "loss": 16.903, + "num_tokens": 173947806.0, + "step": 418 + }, + { + "epoch": 0.6734980912196102, + "grad_norm": 6.00072943958544, + "learning_rate": 7.975686755606624e-06, + "loss": 17.6274, + "num_tokens": 174384636.0, + "step": 419 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 7.696619863018434, + "learning_rate": 7.965905678498905e-06, + "loss": 18.4641, + "num_tokens": 174782714.0, + "step": 420 + }, + { + "epoch": 0.6767128792445248, + "grad_norm": 7.988342717931131, + "learning_rate": 7.956107923463257e-06, + "loss": 16.2386, + "num_tokens": 175148735.0, + "step": 421 + }, + { + "epoch": 0.6783202732569821, + "grad_norm": 6.791863447468637, + "learning_rate": 7.946293556765901e-06, + "loss": 18.7633, + "num_tokens": 175598688.0, + "step": 422 + }, + { + "epoch": 0.6799276672694394, + "grad_norm": 6.852262456588884, + "learning_rate": 7.936462644785414e-06, + "loss": 16.8085, + "num_tokens": 176019624.0, + "step": 423 + }, + { + "epoch": 0.6815350612818967, + "grad_norm": 7.331602813181534, + "learning_rate": 7.926615254012268e-06, + "loss": 18.1212, + "num_tokens": 176421999.0, + "step": 424 + }, + { + "epoch": 0.683142455294354, + "grad_norm": 7.363276555871758, + "learning_rate": 7.916751451048393e-06, + "loss": 19.9554, + "num_tokens": 176831077.0, + "step": 425 + }, + { + "epoch": 0.6847498493068114, + "grad_norm": 6.220137232169898, + "learning_rate": 7.90687130260672e-06, + "loss": 14.266, + "num_tokens": 177231214.0, + "step": 426 + }, + { + "epoch": 0.6863572433192686, + "grad_norm": 6.588301824791741, + "learning_rate": 7.896974875510732e-06, + "loss": 18.9975, + "num_tokens": 177627096.0, + "step": 427 + }, + { + "epoch": 0.687964637331726, + "grad_norm": 6.5683977803963485, + "learning_rate": 7.887062236694005e-06, + "loss": 17.9853, + "num_tokens": 178047310.0, + "step": 428 + }, + { + "epoch": 0.6895720313441832, + "grad_norm": 7.350919200525401, + "learning_rate": 7.877133453199774e-06, + "loss": 18.923, + "num_tokens": 178434574.0, + "step": 429 + }, + { + "epoch": 0.6911794253566406, + "grad_norm": 6.867375478978123, + "learning_rate": 7.867188592180453e-06, + "loss": 17.3925, + "num_tokens": 178857355.0, + "step": 430 + }, + { + "epoch": 0.6927868193690978, + "grad_norm": 7.075683153812218, + "learning_rate": 7.857227720897207e-06, + "loss": 19.3395, + "num_tokens": 179272332.0, + "step": 431 + }, + { + "epoch": 0.6943942133815552, + "grad_norm": 6.302437442688561, + "learning_rate": 7.847250906719476e-06, + "loss": 18.2131, + "num_tokens": 179701817.0, + "step": 432 + }, + { + "epoch": 0.6960016073940125, + "grad_norm": 7.65679671704738, + "learning_rate": 7.837258217124533e-06, + "loss": 16.7067, + "num_tokens": 180088794.0, + "step": 433 + }, + { + "epoch": 0.6976090014064698, + "grad_norm": 7.244072956943304, + "learning_rate": 7.827249719697022e-06, + "loss": 17.7254, + "num_tokens": 180509118.0, + "step": 434 + }, + { + "epoch": 0.6992163954189271, + "grad_norm": 6.403737484232292, + "learning_rate": 7.817225482128501e-06, + "loss": 17.7113, + "num_tokens": 180956130.0, + "step": 435 + }, + { + "epoch": 0.7008237894313843, + "grad_norm": 6.301687002715623, + "learning_rate": 7.807185572216988e-06, + "loss": 16.8731, + "num_tokens": 181345716.0, + "step": 436 + }, + { + "epoch": 0.7024311834438417, + "grad_norm": 8.433745277956854, + "learning_rate": 7.797130057866493e-06, + "loss": 17.5107, + "num_tokens": 181703149.0, + "step": 437 + }, + { + "epoch": 0.7040385774562989, + "grad_norm": 6.720642056611156, + "learning_rate": 7.787059007086572e-06, + "loss": 18.4645, + "num_tokens": 182163997.0, + "step": 438 + }, + { + "epoch": 0.7056459714687563, + "grad_norm": 7.197876816813433, + "learning_rate": 7.776972487991858e-06, + "loss": 18.0343, + "num_tokens": 182557492.0, + "step": 439 + }, + { + "epoch": 0.7072533654812135, + "grad_norm": 6.84751739083323, + "learning_rate": 7.766870568801599e-06, + "loss": 18.1137, + "num_tokens": 182919306.0, + "step": 440 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 6.423059733023599, + "learning_rate": 7.756753317839206e-06, + "loss": 19.0428, + "num_tokens": 183375045.0, + "step": 441 + }, + { + "epoch": 0.7104681535061282, + "grad_norm": 7.237786646730461, + "learning_rate": 7.74662080353178e-06, + "loss": 17.4492, + "num_tokens": 183787300.0, + "step": 442 + }, + { + "epoch": 0.7120755475185855, + "grad_norm": 7.76414189081038, + "learning_rate": 7.736473094409661e-06, + "loss": 19.0814, + "num_tokens": 184166919.0, + "step": 443 + }, + { + "epoch": 0.7136829415310428, + "grad_norm": 5.798191338775854, + "learning_rate": 7.726310259105949e-06, + "loss": 17.0567, + "num_tokens": 184598822.0, + "step": 444 + }, + { + "epoch": 0.7152903355435001, + "grad_norm": 6.740279162444575, + "learning_rate": 7.716132366356052e-06, + "loss": 18.823, + "num_tokens": 185023627.0, + "step": 445 + }, + { + "epoch": 0.7168977295559574, + "grad_norm": 7.148570286317796, + "learning_rate": 7.705939484997222e-06, + "loss": 18.1619, + "num_tokens": 185406155.0, + "step": 446 + }, + { + "epoch": 0.7185051235684147, + "grad_norm": 6.140849336058534, + "learning_rate": 7.695731683968078e-06, + "loss": 15.3674, + "num_tokens": 185861961.0, + "step": 447 + }, + { + "epoch": 0.720112517580872, + "grad_norm": 7.506893312324381, + "learning_rate": 7.685509032308153e-06, + "loss": 15.7963, + "num_tokens": 186253978.0, + "step": 448 + }, + { + "epoch": 0.7217199115933293, + "grad_norm": 6.427261716897113, + "learning_rate": 7.675271599157415e-06, + "loss": 19.2672, + "num_tokens": 186692064.0, + "step": 449 + }, + { + "epoch": 0.7233273056057866, + "grad_norm": 8.112279252661878, + "learning_rate": 7.665019453755813e-06, + "loss": 18.0955, + "num_tokens": 187090941.0, + "step": 450 + }, + { + "epoch": 0.724934699618244, + "grad_norm": 7.141439356772682, + "learning_rate": 7.654752665442795e-06, + "loss": 18.458, + "num_tokens": 187519436.0, + "step": 451 + }, + { + "epoch": 0.7265420936307012, + "grad_norm": 7.23919568400913, + "learning_rate": 7.644471303656841e-06, + "loss": 19.4523, + "num_tokens": 187943635.0, + "step": 452 + }, + { + "epoch": 0.7281494876431586, + "grad_norm": 6.592197559820897, + "learning_rate": 7.634175437935011e-06, + "loss": 17.2613, + "num_tokens": 188346720.0, + "step": 453 + }, + { + "epoch": 0.7297568816556158, + "grad_norm": 6.8394032929798945, + "learning_rate": 7.623865137912449e-06, + "loss": 15.893, + "num_tokens": 188739652.0, + "step": 454 + }, + { + "epoch": 0.7313642756680732, + "grad_norm": 6.6885581917014925, + "learning_rate": 7.613540473321927e-06, + "loss": 18.2901, + "num_tokens": 189110335.0, + "step": 455 + }, + { + "epoch": 0.7329716696805304, + "grad_norm": 7.32381201173344, + "learning_rate": 7.6032015139933725e-06, + "loss": 18.7736, + "num_tokens": 189473807.0, + "step": 456 + }, + { + "epoch": 0.7345790636929878, + "grad_norm": 6.892036838673391, + "learning_rate": 7.592848329853394e-06, + "loss": 18.5622, + "num_tokens": 189911854.0, + "step": 457 + }, + { + "epoch": 0.736186457705445, + "grad_norm": 7.46613466852867, + "learning_rate": 7.582480990924805e-06, + "loss": 18.9359, + "num_tokens": 190318599.0, + "step": 458 + }, + { + "epoch": 0.7377938517179023, + "grad_norm": 6.147868514125431, + "learning_rate": 7.572099567326157e-06, + "loss": 17.1806, + "num_tokens": 190764315.0, + "step": 459 + }, + { + "epoch": 0.7377938517179023, + "eval_loss": 0.9899222254753113, + "eval_num_tokens": 190764315.0, + "eval_runtime": 375.2609, + "eval_samples_per_second": 23.578, + "eval_steps_per_second": 5.895, + "step": 459 + }, + { + "epoch": 0.7394012457303597, + "grad_norm": 7.590909169713159, + "learning_rate": 7.561704129271262e-06, + "loss": 17.9911, + "num_tokens": 191182296.0, + "step": 460 + }, + { + "epoch": 0.7410086397428169, + "grad_norm": 6.889904220650134, + "learning_rate": 7.551294747068713e-06, + "loss": 18.7929, + "num_tokens": 191599041.0, + "step": 461 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 5.596314155979784, + "learning_rate": 7.54087149112142e-06, + "loss": 15.9603, + "num_tokens": 192005140.0, + "step": 462 + }, + { + "epoch": 0.7442234277677315, + "grad_norm": 7.6543190601229405, + "learning_rate": 7.530434431926119e-06, + "loss": 18.1406, + "num_tokens": 192403549.0, + "step": 463 + }, + { + "epoch": 0.7458308217801889, + "grad_norm": 6.690810657988717, + "learning_rate": 7.519983640072911e-06, + "loss": 15.6832, + "num_tokens": 192820565.0, + "step": 464 + }, + { + "epoch": 0.7474382157926461, + "grad_norm": 6.295159454493893, + "learning_rate": 7.5095191862447694e-06, + "loss": 15.146, + "num_tokens": 193251703.0, + "step": 465 + }, + { + "epoch": 0.7490456098051035, + "grad_norm": 6.153776820696221, + "learning_rate": 7.499041141217074e-06, + "loss": 18.8432, + "num_tokens": 193684911.0, + "step": 466 + }, + { + "epoch": 0.7506530038175607, + "grad_norm": 6.565075963893563, + "learning_rate": 7.488549575857125e-06, + "loss": 18.8695, + "num_tokens": 194089750.0, + "step": 467 + }, + { + "epoch": 0.7522603978300181, + "grad_norm": 6.869213280111083, + "learning_rate": 7.478044561123666e-06, + "loss": 16.8392, + "num_tokens": 194488942.0, + "step": 468 + }, + { + "epoch": 0.7538677918424754, + "grad_norm": 6.308681253451012, + "learning_rate": 7.467526168066408e-06, + "loss": 14.4515, + "num_tokens": 194875732.0, + "step": 469 + }, + { + "epoch": 0.7554751858549327, + "grad_norm": 7.0638989428331636, + "learning_rate": 7.456994467825539e-06, + "loss": 17.2388, + "num_tokens": 195301487.0, + "step": 470 + }, + { + "epoch": 0.75708257986739, + "grad_norm": 5.832802301010244, + "learning_rate": 7.446449531631256e-06, + "loss": 16.9442, + "num_tokens": 195758241.0, + "step": 471 + }, + { + "epoch": 0.7586899738798473, + "grad_norm": 7.371514805981565, + "learning_rate": 7.435891430803267e-06, + "loss": 18.4698, + "num_tokens": 196175699.0, + "step": 472 + }, + { + "epoch": 0.7602973678923046, + "grad_norm": 6.598609414509328, + "learning_rate": 7.425320236750327e-06, + "loss": 20.8396, + "num_tokens": 196611914.0, + "step": 473 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 6.495352433700856, + "learning_rate": 7.414736020969742e-06, + "loss": 18.1988, + "num_tokens": 197025302.0, + "step": 474 + }, + { + "epoch": 0.7635121559172192, + "grad_norm": 6.5504331715547055, + "learning_rate": 7.404138855046885e-06, + "loss": 18.5392, + "num_tokens": 197433452.0, + "step": 475 + }, + { + "epoch": 0.7651195499296765, + "grad_norm": 6.670312168909138, + "learning_rate": 7.393528810654726e-06, + "loss": 16.1082, + "num_tokens": 197838734.0, + "step": 476 + }, + { + "epoch": 0.7667269439421338, + "grad_norm": 5.831662919471931, + "learning_rate": 7.3829059595533245e-06, + "loss": 16.3063, + "num_tokens": 198263719.0, + "step": 477 + }, + { + "epoch": 0.7683343379545912, + "grad_norm": 6.962307229477025, + "learning_rate": 7.372270373589372e-06, + "loss": 19.2521, + "num_tokens": 198668326.0, + "step": 478 + }, + { + "epoch": 0.7699417319670484, + "grad_norm": 7.509645815763214, + "learning_rate": 7.361622124695678e-06, + "loss": 18.4381, + "num_tokens": 199072137.0, + "step": 479 + }, + { + "epoch": 0.7715491259795058, + "grad_norm": 6.752769989949174, + "learning_rate": 7.350961284890702e-06, + "loss": 19.6074, + "num_tokens": 199465622.0, + "step": 480 + }, + { + "epoch": 0.773156519991963, + "grad_norm": 5.858892037703858, + "learning_rate": 7.3402879262780634e-06, + "loss": 17.1447, + "num_tokens": 199924843.0, + "step": 481 + }, + { + "epoch": 0.7747639140044204, + "grad_norm": 6.603279302792167, + "learning_rate": 7.329602121046045e-06, + "loss": 16.9993, + "num_tokens": 200335191.0, + "step": 482 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 6.8252858522200395, + "learning_rate": 7.318903941467119e-06, + "loss": 16.3447, + "num_tokens": 200727784.0, + "step": 483 + }, + { + "epoch": 0.777978702029335, + "grad_norm": 7.861491454221313, + "learning_rate": 7.308193459897444e-06, + "loss": 18.5839, + "num_tokens": 201088412.0, + "step": 484 + }, + { + "epoch": 0.7795860960417923, + "grad_norm": 6.378148706536678, + "learning_rate": 7.297470748776383e-06, + "loss": 17.0445, + "num_tokens": 201491222.0, + "step": 485 + }, + { + "epoch": 0.7811934900542495, + "grad_norm": 5.745516876913777, + "learning_rate": 7.286735880626017e-06, + "loss": 15.5628, + "num_tokens": 201956278.0, + "step": 486 + }, + { + "epoch": 0.7828008840667069, + "grad_norm": 6.529652062108535, + "learning_rate": 7.275988928050645e-06, + "loss": 17.5257, + "num_tokens": 202353743.0, + "step": 487 + }, + { + "epoch": 0.7844082780791641, + "grad_norm": 5.812312826544307, + "learning_rate": 7.2652299637362985e-06, + "loss": 17.5275, + "num_tokens": 202809315.0, + "step": 488 + }, + { + "epoch": 0.7860156720916215, + "grad_norm": 7.201375941445681, + "learning_rate": 7.254459060450253e-06, + "loss": 18.2771, + "num_tokens": 203197217.0, + "step": 489 + }, + { + "epoch": 0.7876230661040787, + "grad_norm": 5.839591310554668, + "learning_rate": 7.243676291040527e-06, + "loss": 16.8837, + "num_tokens": 203672135.0, + "step": 490 + }, + { + "epoch": 0.7892304601165361, + "grad_norm": 6.7427834093217385, + "learning_rate": 7.232881728435398e-06, + "loss": 17.6197, + "num_tokens": 204083983.0, + "step": 491 + }, + { + "epoch": 0.7908378541289933, + "grad_norm": 5.9055975575196475, + "learning_rate": 7.222075445642904e-06, + "loss": 18.916, + "num_tokens": 204487150.0, + "step": 492 + }, + { + "epoch": 0.7924452481414507, + "grad_norm": 6.994229184470818, + "learning_rate": 7.211257515750354e-06, + "loss": 18.4741, + "num_tokens": 204934628.0, + "step": 493 + }, + { + "epoch": 0.794052642153908, + "grad_norm": 6.628900696300984, + "learning_rate": 7.200428011923828e-06, + "loss": 15.8953, + "num_tokens": 205370971.0, + "step": 494 + }, + { + "epoch": 0.7956600361663653, + "grad_norm": 6.158580943387445, + "learning_rate": 7.189587007407687e-06, + "loss": 18.4429, + "num_tokens": 205821100.0, + "step": 495 + }, + { + "epoch": 0.7972674301788226, + "grad_norm": 8.238768100356065, + "learning_rate": 7.178734575524071e-06, + "loss": 17.9641, + "num_tokens": 206188591.0, + "step": 496 + }, + { + "epoch": 0.7988748241912799, + "grad_norm": 7.149028551972727, + "learning_rate": 7.16787078967242e-06, + "loss": 18.5137, + "num_tokens": 206628028.0, + "step": 497 + }, + { + "epoch": 0.8004822182037372, + "grad_norm": 6.512013822594115, + "learning_rate": 7.156995723328951e-06, + "loss": 18.1898, + "num_tokens": 207059837.0, + "step": 498 + }, + { + "epoch": 0.8020896122161945, + "grad_norm": 6.275418157643167, + "learning_rate": 7.146109450046187e-06, + "loss": 18.8274, + "num_tokens": 207524941.0, + "step": 499 + }, + { + "epoch": 0.8036970062286518, + "grad_norm": 7.1922443382971695, + "learning_rate": 7.13521204345244e-06, + "loss": 17.0997, + "num_tokens": 207937966.0, + "step": 500 + }, + { + "epoch": 0.805304400241109, + "grad_norm": 6.3125613684168025, + "learning_rate": 7.124303577251327e-06, + "loss": 19.1714, + "num_tokens": 208372054.0, + "step": 501 + }, + { + "epoch": 0.8069117942535664, + "grad_norm": 6.875493301746525, + "learning_rate": 7.113384125221261e-06, + "loss": 16.8024, + "num_tokens": 208785297.0, + "step": 502 + }, + { + "epoch": 0.8085191882660238, + "grad_norm": 7.102398448012959, + "learning_rate": 7.102453761214961e-06, + "loss": 17.7709, + "num_tokens": 209205694.0, + "step": 503 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 6.373412251758446, + "learning_rate": 7.0915125591589476e-06, + "loss": 18.389, + "num_tokens": 209650153.0, + "step": 504 + }, + { + "epoch": 0.8117339762909384, + "grad_norm": 6.508660250482048, + "learning_rate": 7.080560593053036e-06, + "loss": 16.1927, + "num_tokens": 210051137.0, + "step": 505 + }, + { + "epoch": 0.8133413703033956, + "grad_norm": 6.56013293803857, + "learning_rate": 7.0695979369698554e-06, + "loss": 18.754, + "num_tokens": 210450707.0, + "step": 506 + }, + { + "epoch": 0.814948764315853, + "grad_norm": 7.2328030635947025, + "learning_rate": 7.058624665054327e-06, + "loss": 17.6411, + "num_tokens": 210798907.0, + "step": 507 + }, + { + "epoch": 0.8165561583283102, + "grad_norm": 7.068904658954573, + "learning_rate": 7.047640851523172e-06, + "loss": 17.8646, + "num_tokens": 211221469.0, + "step": 508 + }, + { + "epoch": 0.8181635523407675, + "grad_norm": 6.3665855154058155, + "learning_rate": 7.036646570664412e-06, + "loss": 17.0368, + "num_tokens": 211608411.0, + "step": 509 + }, + { + "epoch": 0.8197709463532248, + "grad_norm": 6.941014338846953, + "learning_rate": 7.025641896836862e-06, + "loss": 16.9751, + "num_tokens": 212024751.0, + "step": 510 + }, + { + "epoch": 0.8197709463532248, + "eval_loss": 0.9775996804237366, + "eval_num_tokens": 212024751.0, + "eval_runtime": 375.2703, + "eval_samples_per_second": 23.578, + "eval_steps_per_second": 5.894, + "step": 510 + }, + { + "epoch": 0.8213783403656821, + "grad_norm": 6.561742888148723, + "learning_rate": 7.014626904469629e-06, + "loss": 16.4675, + "num_tokens": 212466088.0, + "step": 511 + }, + { + "epoch": 0.8229857343781395, + "grad_norm": 7.392276162619051, + "learning_rate": 7.003601668061608e-06, + "loss": 17.6864, + "num_tokens": 212906392.0, + "step": 512 + }, + { + "epoch": 0.8245931283905967, + "grad_norm": 7.638107077744779, + "learning_rate": 6.992566262180977e-06, + "loss": 18.2432, + "num_tokens": 213298416.0, + "step": 513 + }, + { + "epoch": 0.8262005224030541, + "grad_norm": 6.9933046295714725, + "learning_rate": 6.981520761464699e-06, + "loss": 17.5513, + "num_tokens": 213683660.0, + "step": 514 + }, + { + "epoch": 0.8278079164155113, + "grad_norm": 7.197194827837283, + "learning_rate": 6.970465240618006e-06, + "loss": 19.0281, + "num_tokens": 214112014.0, + "step": 515 + }, + { + "epoch": 0.8294153104279687, + "grad_norm": 6.787031852365735, + "learning_rate": 6.959399774413909e-06, + "loss": 17.6336, + "num_tokens": 214533793.0, + "step": 516 + }, + { + "epoch": 0.8310227044404259, + "grad_norm": 7.6351193201892436, + "learning_rate": 6.9483244376926735e-06, + "loss": 17.0609, + "num_tokens": 214869902.0, + "step": 517 + }, + { + "epoch": 0.8326300984528833, + "grad_norm": 7.229915323116934, + "learning_rate": 6.937239305361331e-06, + "loss": 16.4232, + "num_tokens": 215274343.0, + "step": 518 + }, + { + "epoch": 0.8342374924653405, + "grad_norm": 6.967512477176957, + "learning_rate": 6.926144452393163e-06, + "loss": 19.2509, + "num_tokens": 215714474.0, + "step": 519 + }, + { + "epoch": 0.8358448864777979, + "grad_norm": 7.1974088885426, + "learning_rate": 6.9150399538271915e-06, + "loss": 17.3042, + "num_tokens": 216106096.0, + "step": 520 + }, + { + "epoch": 0.8374522804902552, + "grad_norm": 6.689366817590714, + "learning_rate": 6.9039258847676824e-06, + "loss": 17.3625, + "num_tokens": 216505390.0, + "step": 521 + }, + { + "epoch": 0.8390596745027125, + "grad_norm": 5.415876209750619, + "learning_rate": 6.892802320383624e-06, + "loss": 18.3032, + "num_tokens": 216981264.0, + "step": 522 + }, + { + "epoch": 0.8406670685151698, + "grad_norm": 7.990248879365214, + "learning_rate": 6.8816693359082285e-06, + "loss": 19.9358, + "num_tokens": 217376103.0, + "step": 523 + }, + { + "epoch": 0.842274462527627, + "grad_norm": 6.954855732335385, + "learning_rate": 6.870527006638422e-06, + "loss": 16.3281, + "num_tokens": 217787433.0, + "step": 524 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 6.1477670220386065, + "learning_rate": 6.85937540793433e-06, + "loss": 18.5762, + "num_tokens": 218221396.0, + "step": 525 + }, + { + "epoch": 0.8454892505525416, + "grad_norm": 6.651395077172045, + "learning_rate": 6.848214615218774e-06, + "loss": 18.5822, + "num_tokens": 218688233.0, + "step": 526 + }, + { + "epoch": 0.847096644564999, + "grad_norm": 5.628389772812499, + "learning_rate": 6.837044703976754e-06, + "loss": 15.7576, + "num_tokens": 219156830.0, + "step": 527 + }, + { + "epoch": 0.8487040385774564, + "grad_norm": 7.644675866011927, + "learning_rate": 6.825865749754946e-06, + "loss": 14.8868, + "num_tokens": 219547242.0, + "step": 528 + }, + { + "epoch": 0.8503114325899136, + "grad_norm": 7.123754713161018, + "learning_rate": 6.814677828161186e-06, + "loss": 18.1605, + "num_tokens": 220004670.0, + "step": 529 + }, + { + "epoch": 0.851918826602371, + "grad_norm": 6.647925251281775, + "learning_rate": 6.803481014863962e-06, + "loss": 17.9629, + "num_tokens": 220433013.0, + "step": 530 + }, + { + "epoch": 0.8535262206148282, + "grad_norm": 6.410337186899303, + "learning_rate": 6.792275385591896e-06, + "loss": 17.0615, + "num_tokens": 220869599.0, + "step": 531 + }, + { + "epoch": 0.8551336146272855, + "grad_norm": 6.329273998628027, + "learning_rate": 6.78106101613324e-06, + "loss": 20.2688, + "num_tokens": 221327336.0, + "step": 532 + }, + { + "epoch": 0.8567410086397428, + "grad_norm": 6.399300775618686, + "learning_rate": 6.769837982335356e-06, + "loss": 19.1122, + "num_tokens": 221772904.0, + "step": 533 + }, + { + "epoch": 0.8583484026522001, + "grad_norm": 6.560453219955942, + "learning_rate": 6.758606360104214e-06, + "loss": 17.4735, + "num_tokens": 222176464.0, + "step": 534 + }, + { + "epoch": 0.8599557966646574, + "grad_norm": 6.3255305384061415, + "learning_rate": 6.747366225403858e-06, + "loss": 15.3125, + "num_tokens": 222592924.0, + "step": 535 + }, + { + "epoch": 0.8615631906771147, + "grad_norm": 7.186799570374527, + "learning_rate": 6.736117654255918e-06, + "loss": 17.6959, + "num_tokens": 223016527.0, + "step": 536 + }, + { + "epoch": 0.8631705846895721, + "grad_norm": 6.991575033953552, + "learning_rate": 6.724860722739077e-06, + "loss": 15.881, + "num_tokens": 223407200.0, + "step": 537 + }, + { + "epoch": 0.8647779787020293, + "grad_norm": 6.555882886198108, + "learning_rate": 6.713595506988563e-06, + "loss": 15.0192, + "num_tokens": 223811029.0, + "step": 538 + }, + { + "epoch": 0.8663853727144867, + "grad_norm": 6.8243529080785486, + "learning_rate": 6.702322083195634e-06, + "loss": 16.5504, + "num_tokens": 224258289.0, + "step": 539 + }, + { + "epoch": 0.8679927667269439, + "grad_norm": 6.511874502029229, + "learning_rate": 6.6910405276070636e-06, + "loss": 17.067, + "num_tokens": 224678282.0, + "step": 540 + }, + { + "epoch": 0.8696001607394013, + "grad_norm": 7.247960356729762, + "learning_rate": 6.679750916524621e-06, + "loss": 19.8149, + "num_tokens": 225108257.0, + "step": 541 + }, + { + "epoch": 0.8712075547518585, + "grad_norm": 5.493239548582633, + "learning_rate": 6.6684533263045635e-06, + "loss": 18.9799, + "num_tokens": 225550320.0, + "step": 542 + }, + { + "epoch": 0.8728149487643159, + "grad_norm": 6.631708891076825, + "learning_rate": 6.657147833357107e-06, + "loss": 20.9455, + "num_tokens": 226001327.0, + "step": 543 + }, + { + "epoch": 0.8744223427767731, + "grad_norm": 5.923877523120946, + "learning_rate": 6.645834514145925e-06, + "loss": 16.8642, + "num_tokens": 226406243.0, + "step": 544 + }, + { + "epoch": 0.8760297367892305, + "grad_norm": 6.429355979534652, + "learning_rate": 6.634513445187617e-06, + "loss": 18.8805, + "num_tokens": 226792503.0, + "step": 545 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 6.113064847228023, + "learning_rate": 6.623184703051198e-06, + "loss": 16.188, + "num_tokens": 227204455.0, + "step": 546 + }, + { + "epoch": 0.8792445248141451, + "grad_norm": 5.948054979736808, + "learning_rate": 6.611848364357585e-06, + "loss": 17.2821, + "num_tokens": 227638984.0, + "step": 547 + }, + { + "epoch": 0.8808519188266024, + "grad_norm": 6.695203380337343, + "learning_rate": 6.600504505779065e-06, + "loss": 18.8971, + "num_tokens": 228060801.0, + "step": 548 + }, + { + "epoch": 0.8824593128390597, + "grad_norm": 7.132968232294307, + "learning_rate": 6.589153204038794e-06, + "loss": 17.5713, + "num_tokens": 228470079.0, + "step": 549 + }, + { + "epoch": 0.884066706851517, + "grad_norm": 5.713030925880166, + "learning_rate": 6.577794535910262e-06, + "loss": 17.5753, + "num_tokens": 228913422.0, + "step": 550 + }, + { + "epoch": 0.8856741008639742, + "grad_norm": 6.5826756267207935, + "learning_rate": 6.566428578216786e-06, + "loss": 16.7787, + "num_tokens": 229319450.0, + "step": 551 + }, + { + "epoch": 0.8872814948764316, + "grad_norm": 6.72862310440387, + "learning_rate": 6.5550554078309815e-06, + "loss": 15.5126, + "num_tokens": 229711312.0, + "step": 552 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 6.039050702621853, + "learning_rate": 6.543675101674249e-06, + "loss": 17.4674, + "num_tokens": 230111679.0, + "step": 553 + }, + { + "epoch": 0.8904962829013462, + "grad_norm": 7.2006372029216, + "learning_rate": 6.532287736716252e-06, + "loss": 16.4901, + "num_tokens": 230503685.0, + "step": 554 + }, + { + "epoch": 0.8921036769138035, + "grad_norm": 6.43992336946641, + "learning_rate": 6.52089338997439e-06, + "loss": 18.9178, + "num_tokens": 230965742.0, + "step": 555 + }, + { + "epoch": 0.8937110709262608, + "grad_norm": 6.597278692418304, + "learning_rate": 6.5094921385132894e-06, + "loss": 18.3637, + "num_tokens": 231396081.0, + "step": 556 + }, + { + "epoch": 0.8953184649387181, + "grad_norm": 6.853713956468528, + "learning_rate": 6.4980840594442726e-06, + "loss": 18.465, + "num_tokens": 231817767.0, + "step": 557 + }, + { + "epoch": 0.8969258589511754, + "grad_norm": 7.600073583168945, + "learning_rate": 6.486669229924842e-06, + "loss": 19.6694, + "num_tokens": 232219037.0, + "step": 558 + }, + { + "epoch": 0.8985332529636327, + "grad_norm": 6.1315514203945725, + "learning_rate": 6.475247727158155e-06, + "loss": 17.8136, + "num_tokens": 232666349.0, + "step": 559 + }, + { + "epoch": 0.90014064697609, + "grad_norm": 6.360778635653487, + "learning_rate": 6.463819628392502e-06, + "loss": 18.8925, + "num_tokens": 233115729.0, + "step": 560 + }, + { + "epoch": 0.9017480409885473, + "grad_norm": 8.074083461754373, + "learning_rate": 6.452385010920783e-06, + "loss": 19.4861, + "num_tokens": 233488881.0, + "step": 561 + }, + { + "epoch": 0.9017480409885473, + "eval_loss": 0.9668655395507812, + "eval_num_tokens": 233488881.0, + "eval_runtime": 375.2765, + "eval_samples_per_second": 23.577, + "eval_steps_per_second": 5.894, + "step": 561 + }, + { + "epoch": 0.9033554350010046, + "grad_norm": 7.683144560514046, + "learning_rate": 6.4409439520799944e-06, + "loss": 16.5989, + "num_tokens": 233868597.0, + "step": 562 + }, + { + "epoch": 0.9049628290134619, + "grad_norm": 6.595367859094082, + "learning_rate": 6.429496529250689e-06, + "loss": 16.6811, + "num_tokens": 234305257.0, + "step": 563 + }, + { + "epoch": 0.9065702230259193, + "grad_norm": 6.99004295473431, + "learning_rate": 6.4180428198564695e-06, + "loss": 15.0301, + "num_tokens": 234679187.0, + "step": 564 + }, + { + "epoch": 0.9081776170383765, + "grad_norm": 6.0698773125255405, + "learning_rate": 6.40658290136345e-06, + "loss": 18.3194, + "num_tokens": 235104653.0, + "step": 565 + }, + { + "epoch": 0.9097850110508339, + "grad_norm": 6.321902001275437, + "learning_rate": 6.395116851279746e-06, + "loss": 17.3441, + "num_tokens": 235491399.0, + "step": 566 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 6.958576608891122, + "learning_rate": 6.38364474715494e-06, + "loss": 18.2241, + "num_tokens": 235896339.0, + "step": 567 + }, + { + "epoch": 0.9129997990757485, + "grad_norm": 6.423006616363349, + "learning_rate": 6.372166666579559e-06, + "loss": 18.6717, + "num_tokens": 236352672.0, + "step": 568 + }, + { + "epoch": 0.9146071930882057, + "grad_norm": 5.816971168772174, + "learning_rate": 6.3606826871845536e-06, + "loss": 18.4092, + "num_tokens": 236784572.0, + "step": 569 + }, + { + "epoch": 0.9162145871006631, + "grad_norm": 6.512056753651926, + "learning_rate": 6.349192886640773e-06, + "loss": 16.774, + "num_tokens": 237180799.0, + "step": 570 + }, + { + "epoch": 0.9178219811131203, + "grad_norm": 6.979518577962706, + "learning_rate": 6.337697342658431e-06, + "loss": 18.707, + "num_tokens": 237600456.0, + "step": 571 + }, + { + "epoch": 0.9194293751255777, + "grad_norm": 6.427220653780498, + "learning_rate": 6.32619613298659e-06, + "loss": 17.8333, + "num_tokens": 237988043.0, + "step": 572 + }, + { + "epoch": 0.921036769138035, + "grad_norm": 7.488182297989402, + "learning_rate": 6.314689335412634e-06, + "loss": 17.5055, + "num_tokens": 238360089.0, + "step": 573 + }, + { + "epoch": 0.9226441631504922, + "grad_norm": 7.096166016227048, + "learning_rate": 6.303177027761734e-06, + "loss": 16.9501, + "num_tokens": 238745687.0, + "step": 574 + }, + { + "epoch": 0.9242515571629496, + "grad_norm": 6.937814717653217, + "learning_rate": 6.291659287896334e-06, + "loss": 16.1498, + "num_tokens": 239130469.0, + "step": 575 + }, + { + "epoch": 0.9258589511754068, + "grad_norm": 7.355491386851183, + "learning_rate": 6.280136193715618e-06, + "loss": 17.3804, + "num_tokens": 239522057.0, + "step": 576 + }, + { + "epoch": 0.9274663451878642, + "grad_norm": 6.1278200443539745, + "learning_rate": 6.268607823154979e-06, + "loss": 17.9206, + "num_tokens": 239976982.0, + "step": 577 + }, + { + "epoch": 0.9290737392003214, + "grad_norm": 6.3104804399971535, + "learning_rate": 6.2570742541854974e-06, + "loss": 18.3975, + "num_tokens": 240414986.0, + "step": 578 + }, + { + "epoch": 0.9306811332127788, + "grad_norm": 6.967439138356059, + "learning_rate": 6.245535564813417e-06, + "loss": 15.5996, + "num_tokens": 240785422.0, + "step": 579 + }, + { + "epoch": 0.9322885272252361, + "grad_norm": 6.180677691879531, + "learning_rate": 6.23399183307961e-06, + "loss": 16.7028, + "num_tokens": 241197257.0, + "step": 580 + }, + { + "epoch": 0.9338959212376934, + "grad_norm": 5.589646350405961, + "learning_rate": 6.222443137059048e-06, + "loss": 15.7449, + "num_tokens": 241632132.0, + "step": 581 + }, + { + "epoch": 0.9355033152501507, + "grad_norm": 6.49005853389635, + "learning_rate": 6.2108895548602874e-06, + "loss": 18.0323, + "num_tokens": 242056898.0, + "step": 582 + }, + { + "epoch": 0.937110709262608, + "grad_norm": 6.468425634570502, + "learning_rate": 6.199331164624923e-06, + "loss": 18.2657, + "num_tokens": 242450631.0, + "step": 583 + }, + { + "epoch": 0.9387181032750653, + "grad_norm": 6.1376109037825435, + "learning_rate": 6.187768044527074e-06, + "loss": 16.7352, + "num_tokens": 242881406.0, + "step": 584 + }, + { + "epoch": 0.9403254972875226, + "grad_norm": 7.989221319371305, + "learning_rate": 6.176200272772845e-06, + "loss": 15.2611, + "num_tokens": 243293520.0, + "step": 585 + }, + { + "epoch": 0.9419328912999799, + "grad_norm": 6.684582909313, + "learning_rate": 6.1646279275998065e-06, + "loss": 15.724, + "num_tokens": 243716610.0, + "step": 586 + }, + { + "epoch": 0.9435402853124372, + "grad_norm": 5.898929475304656, + "learning_rate": 6.1530510872764586e-06, + "loss": 16.6342, + "num_tokens": 244146002.0, + "step": 587 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 5.650303104211734, + "learning_rate": 6.141469830101702e-06, + "loss": 17.506, + "num_tokens": 244588893.0, + "step": 588 + }, + { + "epoch": 0.9467550733373519, + "grad_norm": 6.264591845407961, + "learning_rate": 6.129884234404314e-06, + "loss": 16.5438, + "num_tokens": 245026735.0, + "step": 589 + }, + { + "epoch": 0.9483624673498091, + "grad_norm": 6.597647853367445, + "learning_rate": 6.118294378542413e-06, + "loss": 17.1461, + "num_tokens": 245427971.0, + "step": 590 + }, + { + "epoch": 0.9499698613622665, + "grad_norm": 6.769746810957274, + "learning_rate": 6.10670034090293e-06, + "loss": 18.7587, + "num_tokens": 245839883.0, + "step": 591 + }, + { + "epoch": 0.9515772553747237, + "grad_norm": 6.982154363080943, + "learning_rate": 6.0951021999010825e-06, + "loss": 19.3651, + "num_tokens": 246259750.0, + "step": 592 + }, + { + "epoch": 0.9531846493871811, + "grad_norm": 5.970840033567, + "learning_rate": 6.083500033979836e-06, + "loss": 16.474, + "num_tokens": 246734970.0, + "step": 593 + }, + { + "epoch": 0.9547920433996383, + "grad_norm": 6.872167218321081, + "learning_rate": 6.07189392160938e-06, + "loss": 18.0333, + "num_tokens": 247136022.0, + "step": 594 + }, + { + "epoch": 0.9563994374120957, + "grad_norm": 6.860489372705434, + "learning_rate": 6.060283941286597e-06, + "loss": 14.8999, + "num_tokens": 247575963.0, + "step": 595 + }, + { + "epoch": 0.9580068314245529, + "grad_norm": 7.591830387045807, + "learning_rate": 6.048670171534531e-06, + "loss": 18.7766, + "num_tokens": 247958906.0, + "step": 596 + }, + { + "epoch": 0.9596142254370102, + "grad_norm": 6.6878787806822375, + "learning_rate": 6.03705269090185e-06, + "loss": 16.8418, + "num_tokens": 248340234.0, + "step": 597 + }, + { + "epoch": 0.9612216194494676, + "grad_norm": 7.570818997400882, + "learning_rate": 6.0254315779623264e-06, + "loss": 15.2707, + "num_tokens": 248704826.0, + "step": 598 + }, + { + "epoch": 0.9628290134619248, + "grad_norm": 6.258699955439697, + "learning_rate": 6.013806911314294e-06, + "loss": 18.2445, + "num_tokens": 249137736.0, + "step": 599 + }, + { + "epoch": 0.9644364074743822, + "grad_norm": 7.546491029906908, + "learning_rate": 6.0021787695801255e-06, + "loss": 16.7453, + "num_tokens": 249485746.0, + "step": 600 + }, + { + "epoch": 0.9660438014868394, + "grad_norm": 6.844239490362367, + "learning_rate": 5.990547231405695e-06, + "loss": 16.7061, + "num_tokens": 249873551.0, + "step": 601 + }, + { + "epoch": 0.9676511954992968, + "grad_norm": 7.706651763295645, + "learning_rate": 5.9789123754598535e-06, + "loss": 17.1084, + "num_tokens": 250237095.0, + "step": 602 + }, + { + "epoch": 0.969258589511754, + "grad_norm": 6.833777090150442, + "learning_rate": 5.967274280433881e-06, + "loss": 18.1303, + "num_tokens": 250664705.0, + "step": 603 + }, + { + "epoch": 0.9708659835242114, + "grad_norm": 7.085020611556224, + "learning_rate": 5.955633025040974e-06, + "loss": 17.6261, + "num_tokens": 251047039.0, + "step": 604 + }, + { + "epoch": 0.9724733775366686, + "grad_norm": 7.31571136682962, + "learning_rate": 5.9439886880156996e-06, + "loss": 16.0648, + "num_tokens": 251434456.0, + "step": 605 + }, + { + "epoch": 0.974080771549126, + "grad_norm": 6.248862762457258, + "learning_rate": 5.93234134811347e-06, + "loss": 16.7192, + "num_tokens": 251893123.0, + "step": 606 + }, + { + "epoch": 0.9756881655615833, + "grad_norm": 6.341918349513711, + "learning_rate": 5.920691084110001e-06, + "loss": 17.8705, + "num_tokens": 252306375.0, + "step": 607 + }, + { + "epoch": 0.9772955595740406, + "grad_norm": 6.651418677640293, + "learning_rate": 5.909037974800794e-06, + "loss": 15.0317, + "num_tokens": 252724270.0, + "step": 608 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 6.972035736962175, + "learning_rate": 5.897382099000587e-06, + "loss": 17.5137, + "num_tokens": 253149414.0, + "step": 609 + }, + { + "epoch": 0.9805103475989552, + "grad_norm": 7.006409504677668, + "learning_rate": 5.885723535542834e-06, + "loss": 17.6626, + "num_tokens": 253551425.0, + "step": 610 + }, + { + "epoch": 0.9821177416114125, + "grad_norm": 6.871686163978822, + "learning_rate": 5.874062363279164e-06, + "loss": 16.6523, + "num_tokens": 253947134.0, + "step": 611 + }, + { + "epoch": 0.9837251356238698, + "grad_norm": 6.413107208059928, + "learning_rate": 5.862398661078849e-06, + "loss": 18.1711, + "num_tokens": 254374504.0, + "step": 612 + }, + { + "epoch": 0.9837251356238698, + "eval_loss": 0.9562180638313293, + "eval_num_tokens": 254374504.0, + "eval_runtime": 375.2953, + "eval_samples_per_second": 23.576, + "eval_steps_per_second": 5.894, + "step": 612 + }, + { + "epoch": 0.9853325296363271, + "grad_norm": 6.37838972245321, + "learning_rate": 5.850732507828276e-06, + "loss": 16.2996, + "num_tokens": 254808863.0, + "step": 613 + }, + { + "epoch": 0.9869399236487844, + "grad_norm": 6.159654354221083, + "learning_rate": 5.839063982430408e-06, + "loss": 17.4837, + "num_tokens": 255216423.0, + "step": 614 + }, + { + "epoch": 0.9885473176612417, + "grad_norm": 7.382827431029607, + "learning_rate": 5.8273931638042495e-06, + "loss": 19.0272, + "num_tokens": 255560015.0, + "step": 615 + }, + { + "epoch": 0.9901547116736991, + "grad_norm": 6.656412795951866, + "learning_rate": 5.815720130884315e-06, + "loss": 18.9118, + "num_tokens": 255964424.0, + "step": 616 + }, + { + "epoch": 0.9917621056861563, + "grad_norm": 6.201336969824191, + "learning_rate": 5.804044962620104e-06, + "loss": 17.1551, + "num_tokens": 256401882.0, + "step": 617 + }, + { + "epoch": 0.9933694996986137, + "grad_norm": 6.800302057907455, + "learning_rate": 5.792367737975541e-06, + "loss": 20.4506, + "num_tokens": 256829564.0, + "step": 618 + }, + { + "epoch": 0.9949768937110709, + "grad_norm": 7.041972422563, + "learning_rate": 5.780688535928478e-06, + "loss": 16.3257, + "num_tokens": 257199484.0, + "step": 619 + }, + { + "epoch": 0.9965842877235283, + "grad_norm": 6.400845425759442, + "learning_rate": 5.769007435470127e-06, + "loss": 17.6671, + "num_tokens": 257604305.0, + "step": 620 + }, + { + "epoch": 0.9981916817359855, + "grad_norm": 6.014576580169112, + "learning_rate": 5.757324515604547e-06, + "loss": 17.1441, + "num_tokens": 258039089.0, + "step": 621 + }, + { + "epoch": 0.9997990757484428, + "grad_norm": 7.396367752525262, + "learning_rate": 5.7456398553480974e-06, + "loss": 16.542, + "num_tokens": 258422101.0, + "step": 622 + }, + { + "epoch": 1.0, + "grad_norm": 7.396367752525262, + "learning_rate": 5.733953533728912e-06, + "loss": 2.0106, + "num_tokens": 258450164.0, + "step": 623 + }, + { + "epoch": 1.0016073940124572, + "grad_norm": 6.778169910447539, + "learning_rate": 5.722265629786361e-06, + "loss": 14.559, + "num_tokens": 340949.0, + "step": 624 + }, + { + "epoch": 1.0032147880249147, + "grad_norm": 6.938537088012852, + "learning_rate": 5.710576222570515e-06, + "loss": 16.4212, + "num_tokens": 744214.0, + "step": 625 + }, + { + "epoch": 1.004822182037372, + "grad_norm": 7.253333699599183, + "learning_rate": 5.698885391141612e-06, + "loss": 17.2368, + "num_tokens": 1153729.0, + "step": 626 + }, + { + "epoch": 1.0064295760498292, + "grad_norm": 5.871015710034311, + "learning_rate": 5.687193214569524e-06, + "loss": 16.4981, + "num_tokens": 1573854.0, + "step": 627 + }, + { + "epoch": 1.0080369700622864, + "grad_norm": 6.054818410178965, + "learning_rate": 5.675499771933218e-06, + "loss": 15.6467, + "num_tokens": 1956290.0, + "step": 628 + }, + { + "epoch": 1.009644364074744, + "grad_norm": 6.581894717303972, + "learning_rate": 5.66380514232023e-06, + "loss": 18.0779, + "num_tokens": 2435877.0, + "step": 629 + }, + { + "epoch": 1.0112517580872011, + "grad_norm": 6.013289249820927, + "learning_rate": 5.652109404826115e-06, + "loss": 18.1013, + "num_tokens": 2858766.0, + "step": 630 + }, + { + "epoch": 1.0128591520996584, + "grad_norm": 6.732578667584098, + "learning_rate": 5.640412638553927e-06, + "loss": 16.4732, + "num_tokens": 3256874.0, + "step": 631 + }, + { + "epoch": 1.0144665461121158, + "grad_norm": 6.301710955774104, + "learning_rate": 5.6287149226136785e-06, + "loss": 14.8405, + "num_tokens": 3632621.0, + "step": 632 + }, + { + "epoch": 1.016073940124573, + "grad_norm": 6.880414720270414, + "learning_rate": 5.617016336121801e-06, + "loss": 16.9139, + "num_tokens": 4040615.0, + "step": 633 + }, + { + "epoch": 1.0176813341370303, + "grad_norm": 6.180836011490945, + "learning_rate": 5.6053169582006175e-06, + "loss": 15.7685, + "num_tokens": 4446787.0, + "step": 634 + }, + { + "epoch": 1.0192887281494876, + "grad_norm": 5.944862392229163, + "learning_rate": 5.593616867977802e-06, + "loss": 16.2023, + "num_tokens": 4873583.0, + "step": 635 + }, + { + "epoch": 1.020896122161945, + "grad_norm": 6.08540369113361, + "learning_rate": 5.581916144585847e-06, + "loss": 15.3436, + "num_tokens": 5277522.0, + "step": 636 + }, + { + "epoch": 1.0225035161744023, + "grad_norm": 5.891279090488557, + "learning_rate": 5.570214867161528e-06, + "loss": 17.0891, + "num_tokens": 5707319.0, + "step": 637 + }, + { + "epoch": 1.0241109101868595, + "grad_norm": 6.7274572107778265, + "learning_rate": 5.558513114845363e-06, + "loss": 16.8331, + "num_tokens": 6113259.0, + "step": 638 + }, + { + "epoch": 1.0257183041993168, + "grad_norm": 6.125047538757013, + "learning_rate": 5.54681096678109e-06, + "loss": 17.7097, + "num_tokens": 6516170.0, + "step": 639 + }, + { + "epoch": 1.0273256982117742, + "grad_norm": 5.866582480225253, + "learning_rate": 5.5351085021151155e-06, + "loss": 15.526, + "num_tokens": 6933446.0, + "step": 640 + }, + { + "epoch": 1.0289330922242315, + "grad_norm": 6.690879246770087, + "learning_rate": 5.523405799995992e-06, + "loss": 13.5989, + "num_tokens": 7287585.0, + "step": 641 + }, + { + "epoch": 1.0305404862366887, + "grad_norm": 6.482813276552728, + "learning_rate": 5.511702939573877e-06, + "loss": 15.0947, + "num_tokens": 7664459.0, + "step": 642 + }, + { + "epoch": 1.0321478802491462, + "grad_norm": 5.676514727162951, + "learning_rate": 5.500000000000001e-06, + "loss": 16.2858, + "num_tokens": 8122985.0, + "step": 643 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 6.565658161580242, + "learning_rate": 5.488297060426123e-06, + "loss": 17.3786, + "num_tokens": 8521178.0, + "step": 644 + }, + { + "epoch": 1.0353626682740606, + "grad_norm": 6.164900191086889, + "learning_rate": 5.476594200004011e-06, + "loss": 18.1293, + "num_tokens": 8943399.0, + "step": 645 + }, + { + "epoch": 1.036970062286518, + "grad_norm": 6.2451893726089045, + "learning_rate": 5.464891497884885e-06, + "loss": 18.7489, + "num_tokens": 9415276.0, + "step": 646 + }, + { + "epoch": 1.0385774562989754, + "grad_norm": 6.500728897011229, + "learning_rate": 5.4531890332189115e-06, + "loss": 16.7469, + "num_tokens": 9847975.0, + "step": 647 + }, + { + "epoch": 1.0401848503114326, + "grad_norm": 6.68768752227647, + "learning_rate": 5.441486885154639e-06, + "loss": 18.4363, + "num_tokens": 10241519.0, + "step": 648 + }, + { + "epoch": 1.0417922443238898, + "grad_norm": 6.938084852867114, + "learning_rate": 5.429785132838475e-06, + "loss": 18.524, + "num_tokens": 10654365.0, + "step": 649 + }, + { + "epoch": 1.0433996383363473, + "grad_norm": 7.555450630693558, + "learning_rate": 5.4180838554141545e-06, + "loss": 17.2283, + "num_tokens": 11032219.0, + "step": 650 + }, + { + "epoch": 1.0450070323488045, + "grad_norm": 7.896371778379311, + "learning_rate": 5.4063831320222e-06, + "loss": 14.7754, + "num_tokens": 11420194.0, + "step": 651 + }, + { + "epoch": 1.0466144263612618, + "grad_norm": 7.367610345845942, + "learning_rate": 5.394683041799385e-06, + "loss": 17.9733, + "num_tokens": 11843676.0, + "step": 652 + }, + { + "epoch": 1.048221820373719, + "grad_norm": 6.292982989302997, + "learning_rate": 5.382983663878201e-06, + "loss": 15.59, + "num_tokens": 12222162.0, + "step": 653 + }, + { + "epoch": 1.0498292143861765, + "grad_norm": 6.721699394566863, + "learning_rate": 5.371285077386325e-06, + "loss": 18.981, + "num_tokens": 12591218.0, + "step": 654 + }, + { + "epoch": 1.0514366083986337, + "grad_norm": 6.454279083797687, + "learning_rate": 5.3595873614460735e-06, + "loss": 17.4296, + "num_tokens": 12990790.0, + "step": 655 + }, + { + "epoch": 1.053044002411091, + "grad_norm": 5.671640744246082, + "learning_rate": 5.3478905951738865e-06, + "loss": 16.1843, + "num_tokens": 13426506.0, + "step": 656 + }, + { + "epoch": 1.0546513964235482, + "grad_norm": 6.5927641739857386, + "learning_rate": 5.336194857679772e-06, + "loss": 16.2189, + "num_tokens": 13813577.0, + "step": 657 + }, + { + "epoch": 1.0562587904360057, + "grad_norm": 5.129315996883605, + "learning_rate": 5.324500228066783e-06, + "loss": 16.2944, + "num_tokens": 14276676.0, + "step": 658 + }, + { + "epoch": 1.057866184448463, + "grad_norm": 6.754243389342459, + "learning_rate": 5.312806785430478e-06, + "loss": 18.935, + "num_tokens": 14679304.0, + "step": 659 + }, + { + "epoch": 1.0594735784609202, + "grad_norm": 5.871022627487107, + "learning_rate": 5.301114608858389e-06, + "loss": 17.2466, + "num_tokens": 15105380.0, + "step": 660 + }, + { + "epoch": 1.0610809724733776, + "grad_norm": 5.440054470844332, + "learning_rate": 5.289423777429486e-06, + "loss": 16.9874, + "num_tokens": 15520191.0, + "step": 661 + }, + { + "epoch": 1.0626883664858349, + "grad_norm": 6.126728630127383, + "learning_rate": 5.277734370213641e-06, + "loss": 17.574, + "num_tokens": 15935726.0, + "step": 662 + }, + { + "epoch": 1.064295760498292, + "grad_norm": 6.612956876705847, + "learning_rate": 5.266046466271089e-06, + "loss": 18.5566, + "num_tokens": 16347999.0, + "step": 663 + }, + { + "epoch": 1.064295760498292, + "eval_loss": 0.9491293430328369, + "eval_num_tokens": 16347999.0, + "eval_runtime": 376.3653, + "eval_samples_per_second": 23.509, + "eval_steps_per_second": 5.877, + "step": 663 + }, + { + "epoch": 1.0659031545107494, + "grad_norm": 5.90165960256954, + "learning_rate": 5.254360144651906e-06, + "loss": 18.0862, + "num_tokens": 16788388.0, + "step": 664 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 6.537986021394244, + "learning_rate": 5.2426754843954545e-06, + "loss": 18.4757, + "num_tokens": 17179863.0, + "step": 665 + }, + { + "epoch": 1.069117942535664, + "grad_norm": 6.267883629174489, + "learning_rate": 5.230992564529874e-06, + "loss": 16.5988, + "num_tokens": 17572836.0, + "step": 666 + }, + { + "epoch": 1.0707253365481213, + "grad_norm": 5.822647958986145, + "learning_rate": 5.219311464071524e-06, + "loss": 15.5703, + "num_tokens": 18010698.0, + "step": 667 + }, + { + "epoch": 1.0723327305605788, + "grad_norm": 7.116474902982467, + "learning_rate": 5.20763226202446e-06, + "loss": 16.1622, + "num_tokens": 18401438.0, + "step": 668 + }, + { + "epoch": 1.073940124573036, + "grad_norm": 6.71659039844937, + "learning_rate": 5.1959550373798995e-06, + "loss": 16.4672, + "num_tokens": 18800675.0, + "step": 669 + }, + { + "epoch": 1.0755475185854932, + "grad_norm": 5.980425364808988, + "learning_rate": 5.184279869115686e-06, + "loss": 16.6197, + "num_tokens": 19251604.0, + "step": 670 + }, + { + "epoch": 1.0771549125979505, + "grad_norm": 6.670160358608296, + "learning_rate": 5.172606836195754e-06, + "loss": 13.2429, + "num_tokens": 19629163.0, + "step": 671 + }, + { + "epoch": 1.078762306610408, + "grad_norm": 5.819761570990716, + "learning_rate": 5.1609360175695955e-06, + "loss": 15.6912, + "num_tokens": 20039033.0, + "step": 672 + }, + { + "epoch": 1.0803697006228652, + "grad_norm": 7.112192100011077, + "learning_rate": 5.149267492171727e-06, + "loss": 18.5117, + "num_tokens": 20442399.0, + "step": 673 + }, + { + "epoch": 1.0819770946353224, + "grad_norm": 5.883834757335018, + "learning_rate": 5.137601338921152e-06, + "loss": 16.795, + "num_tokens": 20878531.0, + "step": 674 + }, + { + "epoch": 1.0835844886477797, + "grad_norm": 5.702814145301571, + "learning_rate": 5.125937636720838e-06, + "loss": 17.0268, + "num_tokens": 21292500.0, + "step": 675 + }, + { + "epoch": 1.0851918826602371, + "grad_norm": 5.767564468634228, + "learning_rate": 5.114276464457167e-06, + "loss": 15.7305, + "num_tokens": 21684084.0, + "step": 676 + }, + { + "epoch": 1.0867992766726944, + "grad_norm": 6.102223388935975, + "learning_rate": 5.102617900999414e-06, + "loss": 16.4462, + "num_tokens": 22132998.0, + "step": 677 + }, + { + "epoch": 1.0884066706851516, + "grad_norm": 5.469304016909309, + "learning_rate": 5.0909620251992084e-06, + "loss": 18.4383, + "num_tokens": 22603097.0, + "step": 678 + }, + { + "epoch": 1.090014064697609, + "grad_norm": 5.745601176306017, + "learning_rate": 5.07930891589e-06, + "loss": 15.7246, + "num_tokens": 23019322.0, + "step": 679 + }, + { + "epoch": 1.0916214587100663, + "grad_norm": 7.405232746979531, + "learning_rate": 5.0676586518865324e-06, + "loss": 17.2928, + "num_tokens": 23464016.0, + "step": 680 + }, + { + "epoch": 1.0932288527225236, + "grad_norm": 5.605321982431174, + "learning_rate": 5.056011311984301e-06, + "loss": 17.3525, + "num_tokens": 23940510.0, + "step": 681 + }, + { + "epoch": 1.0948362467349808, + "grad_norm": 6.401067187645022, + "learning_rate": 5.044366974959029e-06, + "loss": 16.0925, + "num_tokens": 24349143.0, + "step": 682 + }, + { + "epoch": 1.0964436407474383, + "grad_norm": 8.552847054138795, + "learning_rate": 5.03272571956612e-06, + "loss": 18.7016, + "num_tokens": 24713209.0, + "step": 683 + }, + { + "epoch": 1.0980510347598955, + "grad_norm": 5.735299406491111, + "learning_rate": 5.021087624540149e-06, + "loss": 15.559, + "num_tokens": 25143278.0, + "step": 684 + }, + { + "epoch": 1.0996584287723528, + "grad_norm": 6.476563634893618, + "learning_rate": 5.009452768594305e-06, + "loss": 18.4416, + "num_tokens": 25533514.0, + "step": 685 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 6.40973198385402, + "learning_rate": 4.997821230419876e-06, + "loss": 18.3163, + "num_tokens": 25957039.0, + "step": 686 + }, + { + "epoch": 1.1028732167972675, + "grad_norm": 6.219677676642627, + "learning_rate": 4.986193088685708e-06, + "loss": 14.8591, + "num_tokens": 26366593.0, + "step": 687 + }, + { + "epoch": 1.1044806108097247, + "grad_norm": 6.303215602007833, + "learning_rate": 4.974568422037676e-06, + "loss": 16.226, + "num_tokens": 26775646.0, + "step": 688 + }, + { + "epoch": 1.106088004822182, + "grad_norm": 6.485037857260266, + "learning_rate": 4.962947309098151e-06, + "loss": 17.2017, + "num_tokens": 27171515.0, + "step": 689 + }, + { + "epoch": 1.1076953988346394, + "grad_norm": 6.048415934086204, + "learning_rate": 4.9513298284654706e-06, + "loss": 18.6537, + "num_tokens": 27559926.0, + "step": 690 + }, + { + "epoch": 1.1093027928470967, + "grad_norm": 6.500752047812603, + "learning_rate": 4.939716058713404e-06, + "loss": 16.5501, + "num_tokens": 27986461.0, + "step": 691 + }, + { + "epoch": 1.110910186859554, + "grad_norm": 5.819865203451328, + "learning_rate": 4.928106078390622e-06, + "loss": 17.7705, + "num_tokens": 28400600.0, + "step": 692 + }, + { + "epoch": 1.1125175808720114, + "grad_norm": 6.680295318045469, + "learning_rate": 4.916499966020165e-06, + "loss": 16.9901, + "num_tokens": 28806721.0, + "step": 693 + }, + { + "epoch": 1.1141249748844686, + "grad_norm": 6.169981755108628, + "learning_rate": 4.904897800098919e-06, + "loss": 17.8031, + "num_tokens": 29248660.0, + "step": 694 + }, + { + "epoch": 1.1157323688969258, + "grad_norm": 6.33554420523799, + "learning_rate": 4.893299659097071e-06, + "loss": 18.521, + "num_tokens": 29686799.0, + "step": 695 + }, + { + "epoch": 1.117339762909383, + "grad_norm": 6.7884567480509155, + "learning_rate": 4.8817056214575885e-06, + "loss": 17.9769, + "num_tokens": 30079159.0, + "step": 696 + }, + { + "epoch": 1.1189471569218405, + "grad_norm": 6.363696346408687, + "learning_rate": 4.8701157655956874e-06, + "loss": 16.3546, + "num_tokens": 30473321.0, + "step": 697 + }, + { + "epoch": 1.1205545509342978, + "grad_norm": 6.443568713755709, + "learning_rate": 4.8585301698982986e-06, + "loss": 17.1904, + "num_tokens": 30906558.0, + "step": 698 + }, + { + "epoch": 1.122161944946755, + "grad_norm": 6.6060010535689395, + "learning_rate": 4.846948912723543e-06, + "loss": 18.2521, + "num_tokens": 31346354.0, + "step": 699 + }, + { + "epoch": 1.1237693389592125, + "grad_norm": 6.065878015433307, + "learning_rate": 4.835372072400195e-06, + "loss": 15.7444, + "num_tokens": 31727233.0, + "step": 700 + }, + { + "epoch": 1.1253767329716697, + "grad_norm": 5.850635997561641, + "learning_rate": 4.823799727227156e-06, + "loss": 17.4075, + "num_tokens": 32151169.0, + "step": 701 + }, + { + "epoch": 1.126984126984127, + "grad_norm": 6.236174245927457, + "learning_rate": 4.812231955472928e-06, + "loss": 17.7974, + "num_tokens": 32570778.0, + "step": 702 + }, + { + "epoch": 1.1285915209965842, + "grad_norm": 6.043434036845252, + "learning_rate": 4.800668835375078e-06, + "loss": 16.1506, + "num_tokens": 33000318.0, + "step": 703 + }, + { + "epoch": 1.1301989150090417, + "grad_norm": 6.751174065158783, + "learning_rate": 4.789110445139714e-06, + "loss": 18.9685, + "num_tokens": 33428812.0, + "step": 704 + }, + { + "epoch": 1.131806309021499, + "grad_norm": 6.158637076955821, + "learning_rate": 4.777556862940953e-06, + "loss": 15.8158, + "num_tokens": 33855212.0, + "step": 705 + }, + { + "epoch": 1.1334137030339562, + "grad_norm": 5.2698307375535025, + "learning_rate": 4.766008166920393e-06, + "loss": 15.3101, + "num_tokens": 34272062.0, + "step": 706 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 6.909479955147754, + "learning_rate": 4.754464435186583e-06, + "loss": 15.8761, + "num_tokens": 34674479.0, + "step": 707 + }, + { + "epoch": 1.1366284910588709, + "grad_norm": 6.49745469096331, + "learning_rate": 4.742925745814503e-06, + "loss": 15.5568, + "num_tokens": 35069992.0, + "step": 708 + }, + { + "epoch": 1.1382358850713281, + "grad_norm": 6.137482992045289, + "learning_rate": 4.7313921768450235e-06, + "loss": 15.9612, + "num_tokens": 35497170.0, + "step": 709 + }, + { + "epoch": 1.1398432790837854, + "grad_norm": 4.844856402956839, + "learning_rate": 4.719863806284383e-06, + "loss": 16.4314, + "num_tokens": 35951942.0, + "step": 710 + }, + { + "epoch": 1.1414506730962426, + "grad_norm": 6.660529943200112, + "learning_rate": 4.708340712103667e-06, + "loss": 17.349, + "num_tokens": 36361086.0, + "step": 711 + }, + { + "epoch": 1.1430580671087, + "grad_norm": 6.236853835437886, + "learning_rate": 4.6968229722382665e-06, + "loss": 15.7429, + "num_tokens": 36739239.0, + "step": 712 + }, + { + "epoch": 1.1446654611211573, + "grad_norm": 5.09041812836706, + "learning_rate": 4.685310664587368e-06, + "loss": 15.6452, + "num_tokens": 37181714.0, + "step": 713 + }, + { + "epoch": 1.1462728551336145, + "grad_norm": 6.811382674617202, + "learning_rate": 4.673803867013411e-06, + "loss": 17.6603, + "num_tokens": 37603293.0, + "step": 714 + }, + { + "epoch": 1.1462728551336145, + "eval_loss": 0.9422706961631775, + "eval_num_tokens": 37603293.0, + "eval_runtime": 374.8424, + "eval_samples_per_second": 23.605, + "eval_steps_per_second": 5.901, + "step": 714 + }, + { + "epoch": 1.147880249146072, + "grad_norm": 6.345741196409547, + "learning_rate": 4.662302657341572e-06, + "loss": 15.2447, + "num_tokens": 38017495.0, + "step": 715 + }, + { + "epoch": 1.1494876431585292, + "grad_norm": 6.144318257328143, + "learning_rate": 4.650807113359228e-06, + "loss": 17.0986, + "num_tokens": 38468632.0, + "step": 716 + }, + { + "epoch": 1.1510950371709865, + "grad_norm": 6.468777801289144, + "learning_rate": 4.639317312815447e-06, + "loss": 16.5879, + "num_tokens": 38922269.0, + "step": 717 + }, + { + "epoch": 1.1527024311834437, + "grad_norm": 6.311052618652191, + "learning_rate": 4.627833333420443e-06, + "loss": 16.4327, + "num_tokens": 39347519.0, + "step": 718 + }, + { + "epoch": 1.1543098251959012, + "grad_norm": 6.517212896984344, + "learning_rate": 4.616355252845061e-06, + "loss": 17.868, + "num_tokens": 39757816.0, + "step": 719 + }, + { + "epoch": 1.1559172192083584, + "grad_norm": 6.050043812286151, + "learning_rate": 4.604883148720255e-06, + "loss": 18.548, + "num_tokens": 40199173.0, + "step": 720 + }, + { + "epoch": 1.1575246132208157, + "grad_norm": 5.121546515822917, + "learning_rate": 4.593417098636551e-06, + "loss": 14.2826, + "num_tokens": 40618751.0, + "step": 721 + }, + { + "epoch": 1.1591320072332731, + "grad_norm": 5.633171891448615, + "learning_rate": 4.581957180143533e-06, + "loss": 14.3206, + "num_tokens": 41014943.0, + "step": 722 + }, + { + "epoch": 1.1607394012457304, + "grad_norm": 5.310919497821666, + "learning_rate": 4.570503470749312e-06, + "loss": 18.2392, + "num_tokens": 41455981.0, + "step": 723 + }, + { + "epoch": 1.1623467952581876, + "grad_norm": 5.635877243083072, + "learning_rate": 4.559056047920009e-06, + "loss": 16.5197, + "num_tokens": 41871500.0, + "step": 724 + }, + { + "epoch": 1.1639541892706449, + "grad_norm": 5.866263955135818, + "learning_rate": 4.547614989079219e-06, + "loss": 14.2667, + "num_tokens": 42294822.0, + "step": 725 + }, + { + "epoch": 1.1655615832831023, + "grad_norm": 5.203198488852726, + "learning_rate": 4.5361803716075e-06, + "loss": 15.4921, + "num_tokens": 42772432.0, + "step": 726 + }, + { + "epoch": 1.1671689772955596, + "grad_norm": 5.745074693518396, + "learning_rate": 4.5247522728418465e-06, + "loss": 16.3365, + "num_tokens": 43189461.0, + "step": 727 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 6.657725491569399, + "learning_rate": 4.513330770075159e-06, + "loss": 14.9223, + "num_tokens": 43611675.0, + "step": 728 + }, + { + "epoch": 1.1703837653204743, + "grad_norm": 5.3649624579613775, + "learning_rate": 4.501915940555729e-06, + "loss": 15.5864, + "num_tokens": 44030419.0, + "step": 729 + }, + { + "epoch": 1.1719911593329315, + "grad_norm": 5.928990654036286, + "learning_rate": 4.490507861486713e-06, + "loss": 16.2376, + "num_tokens": 44429160.0, + "step": 730 + }, + { + "epoch": 1.1735985533453888, + "grad_norm": 6.5961454037935345, + "learning_rate": 4.479106610025611e-06, + "loss": 14.3638, + "num_tokens": 44808020.0, + "step": 731 + }, + { + "epoch": 1.175205947357846, + "grad_norm": 5.855859353127952, + "learning_rate": 4.46771226328375e-06, + "loss": 17.3247, + "num_tokens": 45247964.0, + "step": 732 + }, + { + "epoch": 1.1768133413703035, + "grad_norm": 6.6337004106958615, + "learning_rate": 4.456324898325751e-06, + "loss": 17.1287, + "num_tokens": 45663922.0, + "step": 733 + }, + { + "epoch": 1.1784207353827607, + "grad_norm": 6.899295631767454, + "learning_rate": 4.444944592169019e-06, + "loss": 16.3817, + "num_tokens": 46048722.0, + "step": 734 + }, + { + "epoch": 1.180028129395218, + "grad_norm": 7.004804162911309, + "learning_rate": 4.4335714217832156e-06, + "loss": 15.2075, + "num_tokens": 46453986.0, + "step": 735 + }, + { + "epoch": 1.1816355234076754, + "grad_norm": 6.369839631373715, + "learning_rate": 4.422205464089739e-06, + "loss": 17.9019, + "num_tokens": 46886141.0, + "step": 736 + }, + { + "epoch": 1.1832429174201327, + "grad_norm": 6.092747603937975, + "learning_rate": 4.410846795961208e-06, + "loss": 16.7937, + "num_tokens": 47306541.0, + "step": 737 + }, + { + "epoch": 1.18485031143259, + "grad_norm": 5.968527233793372, + "learning_rate": 4.399495494220937e-06, + "loss": 13.5691, + "num_tokens": 47701945.0, + "step": 738 + }, + { + "epoch": 1.1864577054450471, + "grad_norm": 6.597283368160525, + "learning_rate": 4.388151635642418e-06, + "loss": 16.5046, + "num_tokens": 48090427.0, + "step": 739 + }, + { + "epoch": 1.1880650994575046, + "grad_norm": 5.662753162269524, + "learning_rate": 4.376815296948802e-06, + "loss": 15.8654, + "num_tokens": 48506160.0, + "step": 740 + }, + { + "epoch": 1.1896724934699618, + "grad_norm": 5.845947969314066, + "learning_rate": 4.365486554812385e-06, + "loss": 17.1782, + "num_tokens": 48961866.0, + "step": 741 + }, + { + "epoch": 1.191279887482419, + "grad_norm": 5.794984877820326, + "learning_rate": 4.354165485854076e-06, + "loss": 16.7455, + "num_tokens": 49396611.0, + "step": 742 + }, + { + "epoch": 1.1928872814948766, + "grad_norm": 4.658376198138069, + "learning_rate": 4.3428521666428945e-06, + "loss": 15.9232, + "num_tokens": 49886554.0, + "step": 743 + }, + { + "epoch": 1.1944946755073338, + "grad_norm": 5.695499324162317, + "learning_rate": 4.33154667369544e-06, + "loss": 16.2365, + "num_tokens": 50312464.0, + "step": 744 + }, + { + "epoch": 1.196102069519791, + "grad_norm": 5.83897594720765, + "learning_rate": 4.3202490834753795e-06, + "loss": 16.1307, + "num_tokens": 50728645.0, + "step": 745 + }, + { + "epoch": 1.1977094635322483, + "grad_norm": 6.301803202837831, + "learning_rate": 4.308959472392939e-06, + "loss": 16.2912, + "num_tokens": 51159211.0, + "step": 746 + }, + { + "epoch": 1.1993168575447057, + "grad_norm": 6.0466813849517775, + "learning_rate": 4.297677916804368e-06, + "loss": 16.0484, + "num_tokens": 51574362.0, + "step": 747 + }, + { + "epoch": 1.200924251557163, + "grad_norm": 6.929523478685934, + "learning_rate": 4.2864044930114394e-06, + "loss": 16.7487, + "num_tokens": 51967003.0, + "step": 748 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 6.175198302102206, + "learning_rate": 4.275139277260926e-06, + "loss": 16.1417, + "num_tokens": 52386191.0, + "step": 749 + }, + { + "epoch": 1.2041390395820775, + "grad_norm": 5.869782096108443, + "learning_rate": 4.263882345744083e-06, + "loss": 14.2124, + "num_tokens": 52782028.0, + "step": 750 + }, + { + "epoch": 1.205746433594535, + "grad_norm": 5.608925083020585, + "learning_rate": 4.252633774596143e-06, + "loss": 17.9075, + "num_tokens": 53260334.0, + "step": 751 + }, + { + "epoch": 1.2073538276069922, + "grad_norm": 5.483508192374919, + "learning_rate": 4.241393639895787e-06, + "loss": 17.3, + "num_tokens": 53692018.0, + "step": 752 + }, + { + "epoch": 1.2089612216194494, + "grad_norm": 6.432760591934238, + "learning_rate": 4.230162017664644e-06, + "loss": 15.8101, + "num_tokens": 54146391.0, + "step": 753 + }, + { + "epoch": 1.2105686156319067, + "grad_norm": 4.738509791206252, + "learning_rate": 4.218938983866761e-06, + "loss": 17.5337, + "num_tokens": 54611944.0, + "step": 754 + }, + { + "epoch": 1.2121760096443641, + "grad_norm": 5.866550450040604, + "learning_rate": 4.207724614408105e-06, + "loss": 18.3872, + "num_tokens": 55041956.0, + "step": 755 + }, + { + "epoch": 1.2137834036568214, + "grad_norm": 5.679391512832778, + "learning_rate": 4.1965189851360395e-06, + "loss": 17.4295, + "num_tokens": 55457237.0, + "step": 756 + }, + { + "epoch": 1.2153907976692786, + "grad_norm": 5.8759432041882595, + "learning_rate": 4.185322171838815e-06, + "loss": 16.4232, + "num_tokens": 55853888.0, + "step": 757 + }, + { + "epoch": 1.216998191681736, + "grad_norm": 5.74757143057549, + "learning_rate": 4.174134250245057e-06, + "loss": 18.8793, + "num_tokens": 56272203.0, + "step": 758 + }, + { + "epoch": 1.2186055856941933, + "grad_norm": 6.120026023756413, + "learning_rate": 4.162955296023247e-06, + "loss": 16.1994, + "num_tokens": 56667891.0, + "step": 759 + }, + { + "epoch": 1.2202129797066505, + "grad_norm": 5.83245130065759, + "learning_rate": 4.151785384781227e-06, + "loss": 15.6259, + "num_tokens": 57090485.0, + "step": 760 + }, + { + "epoch": 1.2218203737191078, + "grad_norm": 5.7628985582364605, + "learning_rate": 4.1406245920656705e-06, + "loss": 16.821, + "num_tokens": 57494051.0, + "step": 761 + }, + { + "epoch": 1.2234277677315653, + "grad_norm": 6.061419853658595, + "learning_rate": 4.1294729933615795e-06, + "loss": 14.1252, + "num_tokens": 57870370.0, + "step": 762 + }, + { + "epoch": 1.2250351617440225, + "grad_norm": 6.378263431560054, + "learning_rate": 4.118330664091773e-06, + "loss": 17.3578, + "num_tokens": 58273438.0, + "step": 763 + }, + { + "epoch": 1.2266425557564797, + "grad_norm": 5.909406879594955, + "learning_rate": 4.107197679616377e-06, + "loss": 16.1372, + "num_tokens": 58672736.0, + "step": 764 + }, + { + "epoch": 1.2282499497689372, + "grad_norm": 5.8495973883816195, + "learning_rate": 4.096074115232319e-06, + "loss": 15.9012, + "num_tokens": 59084119.0, + "step": 765 + }, + { + "epoch": 1.2282499497689372, + "eval_loss": 0.9362165927886963, + "eval_num_tokens": 59084119.0, + "eval_runtime": 374.8806, + "eval_samples_per_second": 23.602, + "eval_steps_per_second": 5.901, + "step": 765 + }, + { + "epoch": 1.2298573437813944, + "grad_norm": 6.06047121076865, + "learning_rate": 4.084960046172809e-06, + "loss": 16.3457, + "num_tokens": 59547896.0, + "step": 766 + }, + { + "epoch": 1.2314647377938517, + "grad_norm": 5.692782067534633, + "learning_rate": 4.073855547606839e-06, + "loss": 16.1397, + "num_tokens": 59973616.0, + "step": 767 + }, + { + "epoch": 1.233072131806309, + "grad_norm": 5.471614493025269, + "learning_rate": 4.06276069463867e-06, + "loss": 16.7535, + "num_tokens": 60423260.0, + "step": 768 + }, + { + "epoch": 1.2346795258187664, + "grad_norm": 5.484183021854283, + "learning_rate": 4.051675562307328e-06, + "loss": 17.9738, + "num_tokens": 60858377.0, + "step": 769 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 5.842545556934192, + "learning_rate": 4.040600225586094e-06, + "loss": 17.3957, + "num_tokens": 61271681.0, + "step": 770 + }, + { + "epoch": 1.2378943138436809, + "grad_norm": 5.318672809024005, + "learning_rate": 4.0295347593819955e-06, + "loss": 17.7325, + "num_tokens": 61731411.0, + "step": 771 + }, + { + "epoch": 1.2395017078561383, + "grad_norm": 5.572713565927631, + "learning_rate": 4.0184792385353045e-06, + "loss": 15.0312, + "num_tokens": 62124871.0, + "step": 772 + }, + { + "epoch": 1.2411091018685956, + "grad_norm": 6.634678895303221, + "learning_rate": 4.007433737819024e-06, + "loss": 16.6292, + "num_tokens": 62498701.0, + "step": 773 + }, + { + "epoch": 1.2427164958810528, + "grad_norm": 6.588648162697145, + "learning_rate": 3.996398331938395e-06, + "loss": 17.2732, + "num_tokens": 62898528.0, + "step": 774 + }, + { + "epoch": 1.24432388989351, + "grad_norm": 4.973560425893894, + "learning_rate": 3.9853730955303726e-06, + "loss": 17.3021, + "num_tokens": 63360468.0, + "step": 775 + }, + { + "epoch": 1.2459312839059675, + "grad_norm": 6.214483569733612, + "learning_rate": 3.974358103163139e-06, + "loss": 15.9264, + "num_tokens": 63738376.0, + "step": 776 + }, + { + "epoch": 1.2475386779184248, + "grad_norm": 6.138908990283441, + "learning_rate": 3.963353429335591e-06, + "loss": 16.2644, + "num_tokens": 64140584.0, + "step": 777 + }, + { + "epoch": 1.249146071930882, + "grad_norm": 5.75773355389296, + "learning_rate": 3.952359148476829e-06, + "loss": 18.3575, + "num_tokens": 64595273.0, + "step": 778 + }, + { + "epoch": 1.2507534659433395, + "grad_norm": 6.760225912995955, + "learning_rate": 3.941375334945675e-06, + "loss": 16.5994, + "num_tokens": 65012995.0, + "step": 779 + }, + { + "epoch": 1.2523608599557967, + "grad_norm": 5.787199827889434, + "learning_rate": 3.930402063030145e-06, + "loss": 16.65, + "num_tokens": 65437042.0, + "step": 780 + }, + { + "epoch": 1.253968253968254, + "grad_norm": 6.28600156841938, + "learning_rate": 3.919439406946965e-06, + "loss": 14.1051, + "num_tokens": 65798860.0, + "step": 781 + }, + { + "epoch": 1.2555756479807112, + "grad_norm": 5.368140920559627, + "learning_rate": 3.9084874408410555e-06, + "loss": 15.7228, + "num_tokens": 66261035.0, + "step": 782 + }, + { + "epoch": 1.2571830419931684, + "grad_norm": 5.923097743885843, + "learning_rate": 3.897546238785039e-06, + "loss": 16.6409, + "num_tokens": 66669823.0, + "step": 783 + }, + { + "epoch": 1.258790436005626, + "grad_norm": 6.952511527784439, + "learning_rate": 3.88661587477874e-06, + "loss": 16.4706, + "num_tokens": 67079684.0, + "step": 784 + }, + { + "epoch": 1.2603978300180831, + "grad_norm": 6.344681144379311, + "learning_rate": 3.875696422748675e-06, + "loss": 15.7259, + "num_tokens": 67493654.0, + "step": 785 + }, + { + "epoch": 1.2620052240305406, + "grad_norm": 6.004679038631093, + "learning_rate": 3.8647879565475625e-06, + "loss": 16.8373, + "num_tokens": 67935812.0, + "step": 786 + }, + { + "epoch": 1.2636126180429978, + "grad_norm": 6.246726634585745, + "learning_rate": 3.853890549953815e-06, + "loss": 18.1639, + "num_tokens": 68347303.0, + "step": 787 + }, + { + "epoch": 1.265220012055455, + "grad_norm": 6.246069797290463, + "learning_rate": 3.843004276671049e-06, + "loss": 16.9886, + "num_tokens": 68774432.0, + "step": 788 + }, + { + "epoch": 1.2668274060679123, + "grad_norm": 6.29191307403452, + "learning_rate": 3.832129210327582e-06, + "loss": 16.7451, + "num_tokens": 69200332.0, + "step": 789 + }, + { + "epoch": 1.2684348000803696, + "grad_norm": 6.55164198752415, + "learning_rate": 3.821265424475929e-06, + "loss": 15.6574, + "num_tokens": 69621490.0, + "step": 790 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 5.748612451663736, + "learning_rate": 3.810412992592317e-06, + "loss": 16.7075, + "num_tokens": 70012951.0, + "step": 791 + }, + { + "epoch": 1.2716495881052843, + "grad_norm": 6.119950114186797, + "learning_rate": 3.7995719880761737e-06, + "loss": 14.3787, + "num_tokens": 70400746.0, + "step": 792 + }, + { + "epoch": 1.2732569821177417, + "grad_norm": 6.765441755855912, + "learning_rate": 3.7887424842496466e-06, + "loss": 16.3265, + "num_tokens": 70802232.0, + "step": 793 + }, + { + "epoch": 1.274864376130199, + "grad_norm": 5.74573501398634, + "learning_rate": 3.777924554357096e-06, + "loss": 17.1395, + "num_tokens": 71269195.0, + "step": 794 + }, + { + "epoch": 1.2764717701426562, + "grad_norm": 5.3519206946405555, + "learning_rate": 3.7671182715646036e-06, + "loss": 17.1621, + "num_tokens": 71692197.0, + "step": 795 + }, + { + "epoch": 1.2780791641551135, + "grad_norm": 6.820090182712144, + "learning_rate": 3.756323708959476e-06, + "loss": 16.3326, + "num_tokens": 72075551.0, + "step": 796 + }, + { + "epoch": 1.2796865581675707, + "grad_norm": 6.952381950191912, + "learning_rate": 3.745540939549749e-06, + "loss": 18.4891, + "num_tokens": 72565305.0, + "step": 797 + }, + { + "epoch": 1.2812939521800282, + "grad_norm": 6.593401869476264, + "learning_rate": 3.7347700362637016e-06, + "loss": 18.2221, + "num_tokens": 72980568.0, + "step": 798 + }, + { + "epoch": 1.2829013461924854, + "grad_norm": 5.762122721385298, + "learning_rate": 3.7240110719493574e-06, + "loss": 15.8227, + "num_tokens": 73399383.0, + "step": 799 + }, + { + "epoch": 1.2845087402049427, + "grad_norm": 6.056136400950809, + "learning_rate": 3.713264119373985e-06, + "loss": 17.08, + "num_tokens": 73846225.0, + "step": 800 + }, + { + "epoch": 1.2861161342174001, + "grad_norm": 5.5431720313787665, + "learning_rate": 3.702529251223619e-06, + "loss": 16.1522, + "num_tokens": 74304601.0, + "step": 801 + }, + { + "epoch": 1.2877235282298574, + "grad_norm": 6.106344555603387, + "learning_rate": 3.6918065401025583e-06, + "loss": 17.1818, + "num_tokens": 74740073.0, + "step": 802 + }, + { + "epoch": 1.2893309222423146, + "grad_norm": 5.939561638290805, + "learning_rate": 3.6810960585328838e-06, + "loss": 15.8927, + "num_tokens": 75160936.0, + "step": 803 + }, + { + "epoch": 1.2909383162547718, + "grad_norm": 6.005724643908523, + "learning_rate": 3.6703978789539562e-06, + "loss": 16.6364, + "num_tokens": 75534700.0, + "step": 804 + }, + { + "epoch": 1.2925457102672293, + "grad_norm": 7.293922765317639, + "learning_rate": 3.6597120737219393e-06, + "loss": 18.5421, + "num_tokens": 75939207.0, + "step": 805 + }, + { + "epoch": 1.2941531042796866, + "grad_norm": 6.572344060335259, + "learning_rate": 3.6490387151092987e-06, + "loss": 16.6413, + "num_tokens": 76316480.0, + "step": 806 + }, + { + "epoch": 1.2957604982921438, + "grad_norm": 5.308239844591545, + "learning_rate": 3.638377875304324e-06, + "loss": 16.2034, + "num_tokens": 76745423.0, + "step": 807 + }, + { + "epoch": 1.2973678923046013, + "grad_norm": 6.062974392031781, + "learning_rate": 3.62772962641063e-06, + "loss": 18.2787, + "num_tokens": 77142173.0, + "step": 808 + }, + { + "epoch": 1.2989752863170585, + "grad_norm": 5.988407442547703, + "learning_rate": 3.617094040446677e-06, + "loss": 16.6165, + "num_tokens": 77546229.0, + "step": 809 + }, + { + "epoch": 1.3005826803295157, + "grad_norm": 5.541082088679027, + "learning_rate": 3.6064711893452766e-06, + "loss": 17.8844, + "num_tokens": 77972552.0, + "step": 810 + }, + { + "epoch": 1.302190074341973, + "grad_norm": 5.178044016274743, + "learning_rate": 3.5958611449531156e-06, + "loss": 16.0269, + "num_tokens": 78409533.0, + "step": 811 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 5.715950873682161, + "learning_rate": 3.585263979030261e-06, + "loss": 17.4473, + "num_tokens": 78829830.0, + "step": 812 + }, + { + "epoch": 1.3054048623668877, + "grad_norm": 6.945679551223017, + "learning_rate": 3.574679763249674e-06, + "loss": 17.6076, + "num_tokens": 79213285.0, + "step": 813 + }, + { + "epoch": 1.307012256379345, + "grad_norm": 7.024638994939887, + "learning_rate": 3.5641085691967335e-06, + "loss": 17.6223, + "num_tokens": 79624968.0, + "step": 814 + }, + { + "epoch": 1.3086196503918024, + "grad_norm": 5.779996836815621, + "learning_rate": 3.5535504683687474e-06, + "loss": 17.147, + "num_tokens": 80048188.0, + "step": 815 + }, + { + "epoch": 1.3102270444042596, + "grad_norm": 6.100623617581974, + "learning_rate": 3.5430055321744604e-06, + "loss": 15.5675, + "num_tokens": 80464281.0, + "step": 816 + }, + { + "epoch": 1.3102270444042596, + "eval_loss": 0.9305866956710815, + "eval_num_tokens": 80464281.0, + "eval_runtime": 374.8213, + "eval_samples_per_second": 23.606, + "eval_steps_per_second": 5.901, + "step": 816 + }, + { + "epoch": 1.3118344384167169, + "grad_norm": 6.052943166030787, + "learning_rate": 3.532473831933594e-06, + "loss": 16.0076, + "num_tokens": 80859952.0, + "step": 817 + }, + { + "epoch": 1.3134418324291741, + "grad_norm": 6.179804514955156, + "learning_rate": 3.521955438876334e-06, + "loss": 16.5775, + "num_tokens": 81280053.0, + "step": 818 + }, + { + "epoch": 1.3150492264416316, + "grad_norm": 5.957283125006404, + "learning_rate": 3.5114504241428783e-06, + "loss": 16.2266, + "num_tokens": 81678179.0, + "step": 819 + }, + { + "epoch": 1.3166566204540888, + "grad_norm": 5.720279616192968, + "learning_rate": 3.500958858782927e-06, + "loss": 16.6757, + "num_tokens": 82076335.0, + "step": 820 + }, + { + "epoch": 1.318264014466546, + "grad_norm": 5.734820898495109, + "learning_rate": 3.490480813755233e-06, + "loss": 16.4595, + "num_tokens": 82537286.0, + "step": 821 + }, + { + "epoch": 1.3198714084790035, + "grad_norm": 5.624350782328606, + "learning_rate": 3.4800163599270914e-06, + "loss": 16.8384, + "num_tokens": 82954953.0, + "step": 822 + }, + { + "epoch": 1.3214788024914608, + "grad_norm": 7.235302333946001, + "learning_rate": 3.469565568073884e-06, + "loss": 16.8539, + "num_tokens": 83337690.0, + "step": 823 + }, + { + "epoch": 1.323086196503918, + "grad_norm": 5.819218744766259, + "learning_rate": 3.459128508878583e-06, + "loss": 17.9598, + "num_tokens": 83758541.0, + "step": 824 + }, + { + "epoch": 1.3246935905163753, + "grad_norm": 5.695435612639014, + "learning_rate": 3.4487052529312883e-06, + "loss": 17.2196, + "num_tokens": 84190187.0, + "step": 825 + }, + { + "epoch": 1.3263009845288325, + "grad_norm": 6.139322675031182, + "learning_rate": 3.43829587072874e-06, + "loss": 15.9382, + "num_tokens": 84580785.0, + "step": 826 + }, + { + "epoch": 1.32790837854129, + "grad_norm": 5.870126857693754, + "learning_rate": 3.4279004326738445e-06, + "loss": 18.3935, + "num_tokens": 85019526.0, + "step": 827 + }, + { + "epoch": 1.3295157725537472, + "grad_norm": 5.9737842325808215, + "learning_rate": 3.4175190090751963e-06, + "loss": 17.7463, + "num_tokens": 85420533.0, + "step": 828 + }, + { + "epoch": 1.3311231665662047, + "grad_norm": 5.687065550556454, + "learning_rate": 3.407151670146608e-06, + "loss": 15.1624, + "num_tokens": 85824148.0, + "step": 829 + }, + { + "epoch": 1.332730560578662, + "grad_norm": 6.021273376342255, + "learning_rate": 3.3967984860066277e-06, + "loss": 17.5648, + "num_tokens": 86289486.0, + "step": 830 + }, + { + "epoch": 1.3343379545911191, + "grad_norm": 6.383405671104722, + "learning_rate": 3.386459526678073e-06, + "loss": 17.2036, + "num_tokens": 86723516.0, + "step": 831 + }, + { + "epoch": 1.3359453486035764, + "grad_norm": 6.728744609008056, + "learning_rate": 3.376134862087553e-06, + "loss": 16.365, + "num_tokens": 87123998.0, + "step": 832 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 6.29197127759176, + "learning_rate": 3.36582456206499e-06, + "loss": 16.587, + "num_tokens": 87502869.0, + "step": 833 + }, + { + "epoch": 1.339160136628491, + "grad_norm": 5.828825080717334, + "learning_rate": 3.35552869634316e-06, + "loss": 15.5502, + "num_tokens": 87900566.0, + "step": 834 + }, + { + "epoch": 1.3407675306409483, + "grad_norm": 6.261826813850841, + "learning_rate": 3.345247334557207e-06, + "loss": 17.3571, + "num_tokens": 88342215.0, + "step": 835 + }, + { + "epoch": 1.3423749246534058, + "grad_norm": 5.295982217217789, + "learning_rate": 3.334980546244189e-06, + "loss": 16.7108, + "num_tokens": 88801931.0, + "step": 836 + }, + { + "epoch": 1.343982318665863, + "grad_norm": 5.595162583499947, + "learning_rate": 3.324728400842585e-06, + "loss": 13.3667, + "num_tokens": 89191542.0, + "step": 837 + }, + { + "epoch": 1.3455897126783203, + "grad_norm": 6.608281020742843, + "learning_rate": 3.3144909676918503e-06, + "loss": 16.3238, + "num_tokens": 89580093.0, + "step": 838 + }, + { + "epoch": 1.3471971066907775, + "grad_norm": 5.3843702291854845, + "learning_rate": 3.3042683160319222e-06, + "loss": 17.5416, + "num_tokens": 90018973.0, + "step": 839 + }, + { + "epoch": 1.3488045007032348, + "grad_norm": 5.116546376535364, + "learning_rate": 3.2940605150027803e-06, + "loss": 15.8828, + "num_tokens": 90459322.0, + "step": 840 + }, + { + "epoch": 1.3504118947156922, + "grad_norm": 6.809696378175792, + "learning_rate": 3.283867633643949e-06, + "loss": 17.726, + "num_tokens": 90843777.0, + "step": 841 + }, + { + "epoch": 1.3520192887281495, + "grad_norm": 6.420642832181689, + "learning_rate": 3.2736897408940536e-06, + "loss": 16.5245, + "num_tokens": 91236118.0, + "step": 842 + }, + { + "epoch": 1.3536266827406067, + "grad_norm": 5.545642516319961, + "learning_rate": 3.2635269055903407e-06, + "loss": 18.0366, + "num_tokens": 91668578.0, + "step": 843 + }, + { + "epoch": 1.3552340767530642, + "grad_norm": 6.479080646771984, + "learning_rate": 3.2533791964682195e-06, + "loss": 16.2507, + "num_tokens": 92035477.0, + "step": 844 + }, + { + "epoch": 1.3568414707655214, + "grad_norm": 5.8162409725793704, + "learning_rate": 3.2432466821607942e-06, + "loss": 18.2892, + "num_tokens": 92456165.0, + "step": 845 + }, + { + "epoch": 1.3584488647779787, + "grad_norm": 5.478554792397395, + "learning_rate": 3.2331294311984007e-06, + "loss": 18.432, + "num_tokens": 92900159.0, + "step": 846 + }, + { + "epoch": 1.360056258790436, + "grad_norm": 6.205539653798637, + "learning_rate": 3.2230275120081444e-06, + "loss": 16.548, + "num_tokens": 93288010.0, + "step": 847 + }, + { + "epoch": 1.3616636528028934, + "grad_norm": 5.27636697317256, + "learning_rate": 3.212940992913429e-06, + "loss": 15.267, + "num_tokens": 93690852.0, + "step": 848 + }, + { + "epoch": 1.3632710468153506, + "grad_norm": 6.036162257348822, + "learning_rate": 3.2028699421335075e-06, + "loss": 15.72, + "num_tokens": 94076772.0, + "step": 849 + }, + { + "epoch": 1.3648784408278078, + "grad_norm": 6.470848652697586, + "learning_rate": 3.192814427783013e-06, + "loss": 17.5589, + "num_tokens": 94479575.0, + "step": 850 + }, + { + "epoch": 1.3664858348402653, + "grad_norm": 5.722412947279273, + "learning_rate": 3.1827745178714995e-06, + "loss": 16.9917, + "num_tokens": 94898037.0, + "step": 851 + }, + { + "epoch": 1.3680932288527226, + "grad_norm": 5.152111734412224, + "learning_rate": 3.172750280302979e-06, + "loss": 16.3267, + "num_tokens": 95323131.0, + "step": 852 + }, + { + "epoch": 1.3697006228651798, + "grad_norm": 6.069358139427357, + "learning_rate": 3.1627417828754682e-06, + "loss": 15.789, + "num_tokens": 95736365.0, + "step": 853 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 5.530392812465302, + "learning_rate": 3.152749093280525e-06, + "loss": 15.3655, + "num_tokens": 96146221.0, + "step": 854 + }, + { + "epoch": 1.3729154108900945, + "grad_norm": 5.733977059401985, + "learning_rate": 3.142772279102795e-06, + "loss": 17.5607, + "num_tokens": 96560443.0, + "step": 855 + }, + { + "epoch": 1.3745228049025517, + "grad_norm": 5.43555858561781, + "learning_rate": 3.1328114078195483e-06, + "loss": 18.3939, + "num_tokens": 96995228.0, + "step": 856 + }, + { + "epoch": 1.376130198915009, + "grad_norm": 5.896345853282362, + "learning_rate": 3.1228665468002308e-06, + "loss": 17.1693, + "num_tokens": 97384378.0, + "step": 857 + }, + { + "epoch": 1.3777375929274664, + "grad_norm": 6.052674329245838, + "learning_rate": 3.1129377633059955e-06, + "loss": 16.965, + "num_tokens": 97800124.0, + "step": 858 + }, + { + "epoch": 1.3793449869399237, + "grad_norm": 6.987383397736918, + "learning_rate": 3.103025124489272e-06, + "loss": 18.4649, + "num_tokens": 98197907.0, + "step": 859 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 6.106976512531219, + "learning_rate": 3.0931286973932815e-06, + "loss": 17.498, + "num_tokens": 98596599.0, + "step": 860 + }, + { + "epoch": 1.3825597749648382, + "grad_norm": 6.2153869636297525, + "learning_rate": 3.0832485489516084e-06, + "loss": 16.5517, + "num_tokens": 99017573.0, + "step": 861 + }, + { + "epoch": 1.3841671689772956, + "grad_norm": 6.10131614579238, + "learning_rate": 3.073384745987734e-06, + "loss": 16.1146, + "num_tokens": 99422558.0, + "step": 862 + }, + { + "epoch": 1.3857745629897529, + "grad_norm": 6.474892729052782, + "learning_rate": 3.0635373552145882e-06, + "loss": 17.0219, + "num_tokens": 99815790.0, + "step": 863 + }, + { + "epoch": 1.3873819570022101, + "grad_norm": 6.490427528247094, + "learning_rate": 3.0537064432340994e-06, + "loss": 16.5137, + "num_tokens": 100246674.0, + "step": 864 + }, + { + "epoch": 1.3889893510146676, + "grad_norm": 6.165617807498352, + "learning_rate": 3.0438920765367447e-06, + "loss": 16.528, + "num_tokens": 100629678.0, + "step": 865 + }, + { + "epoch": 1.3905967450271248, + "grad_norm": 5.84781386395711, + "learning_rate": 3.0340943215010974e-06, + "loss": 17.7934, + "num_tokens": 101060362.0, + "step": 866 + }, + { + "epoch": 1.392204139039582, + "grad_norm": 6.181727892461727, + "learning_rate": 3.024313244393377e-06, + "loss": 16.9843, + "num_tokens": 101442913.0, + "step": 867 + }, + { + "epoch": 1.392204139039582, + "eval_loss": 0.925393283367157, + "eval_num_tokens": 101442913.0, + "eval_runtime": 374.8139, + "eval_samples_per_second": 23.606, + "eval_steps_per_second": 5.902, + "step": 867 + }, + { + "epoch": 1.3938115330520393, + "grad_norm": 6.254937631764441, + "learning_rate": 3.014548911367007e-06, + "loss": 15.9592, + "num_tokens": 101846059.0, + "step": 868 + }, + { + "epoch": 1.3954189270644966, + "grad_norm": 5.720408262532864, + "learning_rate": 3.004801388462162e-06, + "loss": 16.6842, + "num_tokens": 102238177.0, + "step": 869 + }, + { + "epoch": 1.397026321076954, + "grad_norm": 5.385320052322474, + "learning_rate": 2.995070741605325e-06, + "loss": 16.4155, + "num_tokens": 102659993.0, + "step": 870 + }, + { + "epoch": 1.3986337150894113, + "grad_norm": 6.365147656985243, + "learning_rate": 2.9853570366088336e-06, + "loss": 16.9639, + "num_tokens": 103093757.0, + "step": 871 + }, + { + "epoch": 1.4002411091018687, + "grad_norm": 5.902327734111986, + "learning_rate": 2.9756603391704484e-06, + "loss": 15.38, + "num_tokens": 103486925.0, + "step": 872 + }, + { + "epoch": 1.401848503114326, + "grad_norm": 5.422114023486505, + "learning_rate": 2.965980714872896e-06, + "loss": 17.7403, + "num_tokens": 103922686.0, + "step": 873 + }, + { + "epoch": 1.4034558971267832, + "grad_norm": 6.230549319141113, + "learning_rate": 2.956318229183432e-06, + "loss": 18.1387, + "num_tokens": 104357042.0, + "step": 874 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 6.071754099875668, + "learning_rate": 2.946672947453395e-06, + "loss": 17.7882, + "num_tokens": 104756269.0, + "step": 875 + }, + { + "epoch": 1.4066706851516977, + "grad_norm": 5.524063890550253, + "learning_rate": 2.937044934917769e-06, + "loss": 16.4684, + "num_tokens": 105139483.0, + "step": 876 + }, + { + "epoch": 1.4082780791641551, + "grad_norm": 6.279285494878047, + "learning_rate": 2.9274342566947355e-06, + "loss": 15.7408, + "num_tokens": 105498163.0, + "step": 877 + }, + { + "epoch": 1.4098854731766124, + "grad_norm": 6.298219645320659, + "learning_rate": 2.9178409777852433e-06, + "loss": 17.0707, + "num_tokens": 105885652.0, + "step": 878 + }, + { + "epoch": 1.4114928671890699, + "grad_norm": 5.539567171608782, + "learning_rate": 2.9082651630725544e-06, + "loss": 16.6058, + "num_tokens": 106291959.0, + "step": 879 + }, + { + "epoch": 1.413100261201527, + "grad_norm": 5.66011774357705, + "learning_rate": 2.8987068773218185e-06, + "loss": 16.852, + "num_tokens": 106697769.0, + "step": 880 + }, + { + "epoch": 1.4147076552139843, + "grad_norm": 6.728182949544659, + "learning_rate": 2.889166185179631e-06, + "loss": 17.5775, + "num_tokens": 107148793.0, + "step": 881 + }, + { + "epoch": 1.4163150492264416, + "grad_norm": 6.091022763229615, + "learning_rate": 2.8796431511735874e-06, + "loss": 16.4317, + "num_tokens": 107551086.0, + "step": 882 + }, + { + "epoch": 1.4179224432388988, + "grad_norm": 6.055744831776969, + "learning_rate": 2.870137839711864e-06, + "loss": 15.4566, + "num_tokens": 107936426.0, + "step": 883 + }, + { + "epoch": 1.4195298372513563, + "grad_norm": 6.156908896746901, + "learning_rate": 2.8606503150827634e-06, + "loss": 16.7825, + "num_tokens": 108352205.0, + "step": 884 + }, + { + "epoch": 1.4211372312638135, + "grad_norm": 6.1780367901078925, + "learning_rate": 2.8511806414542954e-06, + "loss": 18.5737, + "num_tokens": 108802309.0, + "step": 885 + }, + { + "epoch": 1.4227446252762708, + "grad_norm": 5.629709467823637, + "learning_rate": 2.8417288828737287e-06, + "loss": 16.9448, + "num_tokens": 109276722.0, + "step": 886 + }, + { + "epoch": 1.4243520192887282, + "grad_norm": 6.242968440681154, + "learning_rate": 2.832295103267173e-06, + "loss": 17.9018, + "num_tokens": 109677615.0, + "step": 887 + }, + { + "epoch": 1.4259594133011855, + "grad_norm": 6.413008880567355, + "learning_rate": 2.822879366439134e-06, + "loss": 16.4343, + "num_tokens": 110070157.0, + "step": 888 + }, + { + "epoch": 1.4275668073136427, + "grad_norm": 6.143367399826841, + "learning_rate": 2.81348173607209e-06, + "loss": 18.2915, + "num_tokens": 110486407.0, + "step": 889 + }, + { + "epoch": 1.4291742013261, + "grad_norm": 5.61198610806199, + "learning_rate": 2.804102275726054e-06, + "loss": 15.0006, + "num_tokens": 110897317.0, + "step": 890 + }, + { + "epoch": 1.4307815953385574, + "grad_norm": 5.187529005503354, + "learning_rate": 2.794741048838149e-06, + "loss": 16.6515, + "num_tokens": 111360404.0, + "step": 891 + }, + { + "epoch": 1.4323889893510147, + "grad_norm": 5.155895233179099, + "learning_rate": 2.78539811872218e-06, + "loss": 18.2201, + "num_tokens": 111818476.0, + "step": 892 + }, + { + "epoch": 1.433996383363472, + "grad_norm": 5.012565721571602, + "learning_rate": 2.776073548568204e-06, + "loss": 15.3087, + "num_tokens": 112295752.0, + "step": 893 + }, + { + "epoch": 1.4356037773759294, + "grad_norm": 6.142553526917407, + "learning_rate": 2.7667674014420946e-06, + "loss": 14.3473, + "num_tokens": 112711109.0, + "step": 894 + }, + { + "epoch": 1.4372111713883866, + "grad_norm": 5.7582735630198405, + "learning_rate": 2.7574797402851317e-06, + "loss": 16.2935, + "num_tokens": 113100488.0, + "step": 895 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 5.2446450759505705, + "learning_rate": 2.7482106279135635e-06, + "loss": 16.4169, + "num_tokens": 113516973.0, + "step": 896 + }, + { + "epoch": 1.440425959413301, + "grad_norm": 6.6810684464961385, + "learning_rate": 2.7389601270181848e-06, + "loss": 15.2545, + "num_tokens": 113936981.0, + "step": 897 + }, + { + "epoch": 1.4420333534257586, + "grad_norm": 5.321448818579864, + "learning_rate": 2.7297283001639153e-06, + "loss": 17.263, + "num_tokens": 114370989.0, + "step": 898 + }, + { + "epoch": 1.4436407474382158, + "grad_norm": 6.375075174911111, + "learning_rate": 2.7205152097893695e-06, + "loss": 16.5016, + "num_tokens": 114730281.0, + "step": 899 + }, + { + "epoch": 1.445248141450673, + "grad_norm": 5.643575242584418, + "learning_rate": 2.7113209182064482e-06, + "loss": 14.5961, + "num_tokens": 115121579.0, + "step": 900 + }, + { + "epoch": 1.4468555354631305, + "grad_norm": 5.255711698426918, + "learning_rate": 2.702145487599896e-06, + "loss": 15.7787, + "num_tokens": 115574422.0, + "step": 901 + }, + { + "epoch": 1.4484629294755877, + "grad_norm": 5.35883988989798, + "learning_rate": 2.692988980026907e-06, + "loss": 17.3759, + "num_tokens": 116018710.0, + "step": 902 + }, + { + "epoch": 1.450070323488045, + "grad_norm": 5.392394169407981, + "learning_rate": 2.683851457416681e-06, + "loss": 16.2718, + "num_tokens": 116435503.0, + "step": 903 + }, + { + "epoch": 1.4516777175005022, + "grad_norm": 4.93069283905958, + "learning_rate": 2.6747329815700194e-06, + "loss": 16.1437, + "num_tokens": 116919556.0, + "step": 904 + }, + { + "epoch": 1.4532851115129597, + "grad_norm": 6.118869099848722, + "learning_rate": 2.665633614158898e-06, + "loss": 15.8727, + "num_tokens": 117326599.0, + "step": 905 + }, + { + "epoch": 1.454892505525417, + "grad_norm": 5.895629219996894, + "learning_rate": 2.6565534167260615e-06, + "loss": 15.9099, + "num_tokens": 117740806.0, + "step": 906 + }, + { + "epoch": 1.4564998995378742, + "grad_norm": 5.593621328828121, + "learning_rate": 2.647492450684593e-06, + "loss": 16.3779, + "num_tokens": 118155674.0, + "step": 907 + }, + { + "epoch": 1.4581072935503316, + "grad_norm": 6.251230808024939, + "learning_rate": 2.6384507773175127e-06, + "loss": 14.3212, + "num_tokens": 118490056.0, + "step": 908 + }, + { + "epoch": 1.4597146875627889, + "grad_norm": 6.716165234077979, + "learning_rate": 2.62942845777735e-06, + "loss": 15.0679, + "num_tokens": 118875393.0, + "step": 909 + }, + { + "epoch": 1.4613220815752461, + "grad_norm": 5.762252843834249, + "learning_rate": 2.6204255530857386e-06, + "loss": 17.7464, + "num_tokens": 119300937.0, + "step": 910 + }, + { + "epoch": 1.4629294755877034, + "grad_norm": 5.175563529237627, + "learning_rate": 2.611442124133005e-06, + "loss": 17.0956, + "num_tokens": 119750240.0, + "step": 911 + }, + { + "epoch": 1.4645368696001606, + "grad_norm": 5.540570872570951, + "learning_rate": 2.6024782316777496e-06, + "loss": 12.5575, + "num_tokens": 120151713.0, + "step": 912 + }, + { + "epoch": 1.466144263612618, + "grad_norm": 5.860566444367572, + "learning_rate": 2.593533936346443e-06, + "loss": 16.9809, + "num_tokens": 120597196.0, + "step": 913 + }, + { + "epoch": 1.4677516576250753, + "grad_norm": 5.295939051080179, + "learning_rate": 2.584609298633005e-06, + "loss": 16.3547, + "num_tokens": 121025540.0, + "step": 914 + }, + { + "epoch": 1.4693590516375328, + "grad_norm": 5.683956799030017, + "learning_rate": 2.5757043788984116e-06, + "loss": 17.0489, + "num_tokens": 121468747.0, + "step": 915 + }, + { + "epoch": 1.47096644564999, + "grad_norm": 5.9457693902514785, + "learning_rate": 2.566819237370274e-06, + "loss": 17.9181, + "num_tokens": 121886100.0, + "step": 916 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 6.453730096443661, + "learning_rate": 2.5579539341424365e-06, + "loss": 16.3415, + "num_tokens": 122267886.0, + "step": 917 + }, + { + "epoch": 1.4741812336749045, + "grad_norm": 4.843760692227952, + "learning_rate": 2.5491085291745665e-06, + "loss": 17.1804, + "num_tokens": 122709444.0, + "step": 918 + }, + { + "epoch": 1.4741812336749045, + "eval_loss": 0.920837938785553, + "eval_num_tokens": 122709444.0, + "eval_runtime": 374.8479, + "eval_samples_per_second": 23.604, + "eval_steps_per_second": 5.901, + "step": 918 + }, + { + "epoch": 1.4757886276873617, + "grad_norm": 6.143495336882707, + "learning_rate": 2.5402830822917545e-06, + "loss": 14.7955, + "num_tokens": 123126664.0, + "step": 919 + }, + { + "epoch": 1.4773960216998192, + "grad_norm": 6.477307750213874, + "learning_rate": 2.5314776531841005e-06, + "loss": 16.7985, + "num_tokens": 123588082.0, + "step": 920 + }, + { + "epoch": 1.4790034157122764, + "grad_norm": 5.569311261270139, + "learning_rate": 2.5226923014063276e-06, + "loss": 15.2525, + "num_tokens": 123973743.0, + "step": 921 + }, + { + "epoch": 1.480610809724734, + "grad_norm": 6.745111665050258, + "learning_rate": 2.5139270863773544e-06, + "loss": 17.3117, + "num_tokens": 124382634.0, + "step": 922 + }, + { + "epoch": 1.4822182037371912, + "grad_norm": 5.842663491001214, + "learning_rate": 2.5051820673799167e-06, + "loss": 16.8411, + "num_tokens": 124781051.0, + "step": 923 + }, + { + "epoch": 1.4838255977496484, + "grad_norm": 5.6148796216206165, + "learning_rate": 2.4964573035601462e-06, + "loss": 16.9277, + "num_tokens": 125225369.0, + "step": 924 + }, + { + "epoch": 1.4854329917621056, + "grad_norm": 6.034880170961891, + "learning_rate": 2.487752853927194e-06, + "loss": 18.4918, + "num_tokens": 125677143.0, + "step": 925 + }, + { + "epoch": 1.4870403857745629, + "grad_norm": 5.484895544322941, + "learning_rate": 2.479068777352805e-06, + "loss": 14.9023, + "num_tokens": 126109465.0, + "step": 926 + }, + { + "epoch": 1.4886477797870203, + "grad_norm": 5.381127886781476, + "learning_rate": 2.470405132570941e-06, + "loss": 15.2532, + "num_tokens": 126527714.0, + "step": 927 + }, + { + "epoch": 1.4902551737994776, + "grad_norm": 5.263455091561503, + "learning_rate": 2.4617619781773727e-06, + "loss": 20.2699, + "num_tokens": 126988433.0, + "step": 928 + }, + { + "epoch": 1.4918625678119348, + "grad_norm": 5.708276204507092, + "learning_rate": 2.4531393726292828e-06, + "loss": 17.4815, + "num_tokens": 127426604.0, + "step": 929 + }, + { + "epoch": 1.4934699618243923, + "grad_norm": 6.209506082568763, + "learning_rate": 2.4445373742448792e-06, + "loss": 16.5799, + "num_tokens": 127825610.0, + "step": 930 + }, + { + "epoch": 1.4950773558368495, + "grad_norm": 5.810523154074717, + "learning_rate": 2.4359560412029916e-06, + "loss": 16.2771, + "num_tokens": 128249974.0, + "step": 931 + }, + { + "epoch": 1.4966847498493068, + "grad_norm": 6.265555278254371, + "learning_rate": 2.4273954315426846e-06, + "loss": 18.2538, + "num_tokens": 128635633.0, + "step": 932 + }, + { + "epoch": 1.498292143861764, + "grad_norm": 6.481224019582282, + "learning_rate": 2.418855603162857e-06, + "loss": 18.9107, + "num_tokens": 129020832.0, + "step": 933 + }, + { + "epoch": 1.4998995378742215, + "grad_norm": 5.849821237804199, + "learning_rate": 2.41033661382186e-06, + "loss": 17.0134, + "num_tokens": 129422253.0, + "step": 934 + }, + { + "epoch": 1.5015069318866787, + "grad_norm": 5.349313946728359, + "learning_rate": 2.4018385211371002e-06, + "loss": 15.3351, + "num_tokens": 129798805.0, + "step": 935 + }, + { + "epoch": 1.5031143258991362, + "grad_norm": 5.475907707690328, + "learning_rate": 2.3933613825846536e-06, + "loss": 17.6747, + "num_tokens": 130223125.0, + "step": 936 + }, + { + "epoch": 1.5047217199115934, + "grad_norm": 6.286509106984666, + "learning_rate": 2.384905255498869e-06, + "loss": 14.4857, + "num_tokens": 130612319.0, + "step": 937 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 6.238612508686818, + "learning_rate": 2.3764701970719913e-06, + "loss": 14.6658, + "num_tokens": 131020542.0, + "step": 938 + }, + { + "epoch": 1.507936507936508, + "grad_norm": 5.826259107400306, + "learning_rate": 2.3680562643537692e-06, + "loss": 16.146, + "num_tokens": 131384964.0, + "step": 939 + }, + { + "epoch": 1.5095439019489651, + "grad_norm": 5.026892116660779, + "learning_rate": 2.359663514251069e-06, + "loss": 15.0801, + "num_tokens": 131815670.0, + "step": 940 + }, + { + "epoch": 1.5111512959614224, + "grad_norm": 5.698061748557477, + "learning_rate": 2.3512920035274874e-06, + "loss": 16.0032, + "num_tokens": 132220717.0, + "step": 941 + }, + { + "epoch": 1.5127586899738799, + "grad_norm": 5.84877027120282, + "learning_rate": 2.3429417888029754e-06, + "loss": 14.2301, + "num_tokens": 132632059.0, + "step": 942 + }, + { + "epoch": 1.514366083986337, + "grad_norm": 6.094443498176334, + "learning_rate": 2.3346129265534443e-06, + "loss": 15.3509, + "num_tokens": 133035706.0, + "step": 943 + }, + { + "epoch": 1.5159734779987946, + "grad_norm": 6.500014265386011, + "learning_rate": 2.3263054731103984e-06, + "loss": 17.7919, + "num_tokens": 133461473.0, + "step": 944 + }, + { + "epoch": 1.5175808720112518, + "grad_norm": 5.903053831968036, + "learning_rate": 2.3180194846605367e-06, + "loss": 15.6367, + "num_tokens": 133847183.0, + "step": 945 + }, + { + "epoch": 1.519188266023709, + "grad_norm": 5.626482532944251, + "learning_rate": 2.3097550172453872e-06, + "loss": 16.7635, + "num_tokens": 134299064.0, + "step": 946 + }, + { + "epoch": 1.5207956600361663, + "grad_norm": 6.604214428573044, + "learning_rate": 2.301512126760922e-06, + "loss": 17.7877, + "num_tokens": 134714334.0, + "step": 947 + }, + { + "epoch": 1.5224030540486235, + "grad_norm": 5.802335435027613, + "learning_rate": 2.293290868957174e-06, + "loss": 16.1919, + "num_tokens": 135140121.0, + "step": 948 + }, + { + "epoch": 1.524010448061081, + "grad_norm": 5.40655403913585, + "learning_rate": 2.285091299437875e-06, + "loss": 18.1783, + "num_tokens": 135580687.0, + "step": 949 + }, + { + "epoch": 1.5256178420735382, + "grad_norm": 5.480957957395197, + "learning_rate": 2.2769134736600617e-06, + "loss": 15.3015, + "num_tokens": 135996079.0, + "step": 950 + }, + { + "epoch": 1.5272252360859957, + "grad_norm": 5.934166851335229, + "learning_rate": 2.2687574469337147e-06, + "loss": 14.5184, + "num_tokens": 136417913.0, + "step": 951 + }, + { + "epoch": 1.528832630098453, + "grad_norm": 5.643163975774752, + "learning_rate": 2.2606232744213726e-06, + "loss": 16.0515, + "num_tokens": 136848529.0, + "step": 952 + }, + { + "epoch": 1.5304400241109102, + "grad_norm": 5.951135551679326, + "learning_rate": 2.2525110111377706e-06, + "loss": 15.6774, + "num_tokens": 137252131.0, + "step": 953 + }, + { + "epoch": 1.5320474181233674, + "grad_norm": 5.887554475517542, + "learning_rate": 2.2444207119494603e-06, + "loss": 15.034, + "num_tokens": 137656681.0, + "step": 954 + }, + { + "epoch": 1.5336548121358247, + "grad_norm": 5.989340603470791, + "learning_rate": 2.236352431574444e-06, + "loss": 17.4242, + "num_tokens": 138129191.0, + "step": 955 + }, + { + "epoch": 1.5352622061482821, + "grad_norm": 5.389126211667074, + "learning_rate": 2.2283062245817956e-06, + "loss": 16.155, + "num_tokens": 138511795.0, + "step": 956 + }, + { + "epoch": 1.5368696001607394, + "grad_norm": 5.74854463477428, + "learning_rate": 2.220282145391304e-06, + "loss": 15.8456, + "num_tokens": 138953656.0, + "step": 957 + }, + { + "epoch": 1.5384769941731968, + "grad_norm": 5.805508610998003, + "learning_rate": 2.2122802482730943e-06, + "loss": 15.8535, + "num_tokens": 139350936.0, + "step": 958 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 5.983140611829599, + "learning_rate": 2.20430058734727e-06, + "loss": 15.6654, + "num_tokens": 139772354.0, + "step": 959 + }, + { + "epoch": 1.5416917821981113, + "grad_norm": 5.498251836614298, + "learning_rate": 2.1963432165835364e-06, + "loss": 17.5414, + "num_tokens": 140216790.0, + "step": 960 + }, + { + "epoch": 1.5432991762105686, + "grad_norm": 5.005851173537078, + "learning_rate": 2.188408189800845e-06, + "loss": 17.1626, + "num_tokens": 140667739.0, + "step": 961 + }, + { + "epoch": 1.5449065702230258, + "grad_norm": 5.851950834985812, + "learning_rate": 2.1804955606670263e-06, + "loss": 16.422, + "num_tokens": 141088785.0, + "step": 962 + }, + { + "epoch": 1.5465139642354833, + "grad_norm": 5.846650954076833, + "learning_rate": 2.172605382698425e-06, + "loss": 16.6922, + "num_tokens": 141492108.0, + "step": 963 + }, + { + "epoch": 1.5481213582479405, + "grad_norm": 5.390667219929842, + "learning_rate": 2.1647377092595373e-06, + "loss": 16.0366, + "num_tokens": 141895713.0, + "step": 964 + }, + { + "epoch": 1.549728752260398, + "grad_norm": 6.685993946099798, + "learning_rate": 2.1568925935626554e-06, + "loss": 15.7102, + "num_tokens": 142264138.0, + "step": 965 + }, + { + "epoch": 1.5513361462728552, + "grad_norm": 5.679251657300267, + "learning_rate": 2.149070088667504e-06, + "loss": 15.765, + "num_tokens": 142666661.0, + "step": 966 + }, + { + "epoch": 1.5529435402853125, + "grad_norm": 5.167400795211997, + "learning_rate": 2.1412702474808783e-06, + "loss": 16.7888, + "num_tokens": 143088713.0, + "step": 967 + }, + { + "epoch": 1.5545509342977697, + "grad_norm": 5.631434685223519, + "learning_rate": 2.133493122756296e-06, + "loss": 15.761, + "num_tokens": 143500768.0, + "step": 968 + }, + { + "epoch": 1.556158328310227, + "grad_norm": 5.933464264436118, + "learning_rate": 2.125738767093626e-06, + "loss": 13.5066, + "num_tokens": 143858894.0, + "step": 969 + }, + { + "epoch": 1.556158328310227, + "eval_loss": 0.916408360004425, + "eval_num_tokens": 143858894.0, + "eval_runtime": 374.9099, + "eval_samples_per_second": 23.6, + "eval_steps_per_second": 5.9, + "step": 969 + }, + { + "epoch": 1.5577657223226844, + "grad_norm": 5.627969595251259, + "learning_rate": 2.1180072329387487e-06, + "loss": 17.5495, + "num_tokens": 144282896.0, + "step": 970 + }, + { + "epoch": 1.5593731163351416, + "grad_norm": 5.9368894873694416, + "learning_rate": 2.110298572583185e-06, + "loss": 16.2153, + "num_tokens": 144728257.0, + "step": 971 + }, + { + "epoch": 1.560980510347599, + "grad_norm": 5.578829204353252, + "learning_rate": 2.10261283816376e-06, + "loss": 15.1547, + "num_tokens": 145108173.0, + "step": 972 + }, + { + "epoch": 1.5625879043600563, + "grad_norm": 5.366913991378139, + "learning_rate": 2.0949500816622336e-06, + "loss": 15.927, + "num_tokens": 145513192.0, + "step": 973 + }, + { + "epoch": 1.5641952983725136, + "grad_norm": 5.214291102566144, + "learning_rate": 2.0873103549049624e-06, + "loss": 16.236, + "num_tokens": 145950288.0, + "step": 974 + }, + { + "epoch": 1.5658026923849708, + "grad_norm": 6.752180390457703, + "learning_rate": 2.0796937095625384e-06, + "loss": 15.7389, + "num_tokens": 146318597.0, + "step": 975 + }, + { + "epoch": 1.567410086397428, + "grad_norm": 5.123024679100364, + "learning_rate": 2.0721001971494477e-06, + "loss": 14.1264, + "num_tokens": 146746699.0, + "step": 976 + }, + { + "epoch": 1.5690174804098853, + "grad_norm": 4.9524078350055625, + "learning_rate": 2.0645298690237187e-06, + "loss": 15.5247, + "num_tokens": 147186539.0, + "step": 977 + }, + { + "epoch": 1.5706248744223428, + "grad_norm": 5.777329899915742, + "learning_rate": 2.056982776386576e-06, + "loss": 15.3978, + "num_tokens": 147591440.0, + "step": 978 + }, + { + "epoch": 1.5722322684348002, + "grad_norm": 6.4403440258480895, + "learning_rate": 2.049458970282088e-06, + "loss": 18.2436, + "num_tokens": 148026310.0, + "step": 979 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 6.541903048873551, + "learning_rate": 2.041958501596833e-06, + "loss": 18.7301, + "num_tokens": 148449265.0, + "step": 980 + }, + { + "epoch": 1.5754470564597147, + "grad_norm": 5.278653790166132, + "learning_rate": 2.0344814210595456e-06, + "loss": 15.8406, + "num_tokens": 148893677.0, + "step": 981 + }, + { + "epoch": 1.577054450472172, + "grad_norm": 5.570259645923092, + "learning_rate": 2.027027779240777e-06, + "loss": 16.6789, + "num_tokens": 149303192.0, + "step": 982 + }, + { + "epoch": 1.5786618444846292, + "grad_norm": 5.688649118282984, + "learning_rate": 2.0195976265525553e-06, + "loss": 15.3274, + "num_tokens": 149712167.0, + "step": 983 + }, + { + "epoch": 1.5802692384970864, + "grad_norm": 5.722886522426803, + "learning_rate": 2.0121910132480365e-06, + "loss": 18.37, + "num_tokens": 150165312.0, + "step": 984 + }, + { + "epoch": 1.581876632509544, + "grad_norm": 6.1529507004901305, + "learning_rate": 2.0048079894211765e-06, + "loss": 15.6064, + "num_tokens": 150535790.0, + "step": 985 + }, + { + "epoch": 1.5834840265220012, + "grad_norm": 5.753188248034981, + "learning_rate": 1.997448605006379e-06, + "loss": 16.8917, + "num_tokens": 150960935.0, + "step": 986 + }, + { + "epoch": 1.5850914205344586, + "grad_norm": 5.951082689133074, + "learning_rate": 1.9901129097781733e-06, + "loss": 17.3764, + "num_tokens": 151323036.0, + "step": 987 + }, + { + "epoch": 1.5866988145469159, + "grad_norm": 5.511795161594664, + "learning_rate": 1.9828009533508612e-06, + "loss": 15.9878, + "num_tokens": 151729295.0, + "step": 988 + }, + { + "epoch": 1.588306208559373, + "grad_norm": 5.599202966790124, + "learning_rate": 1.9755127851781946e-06, + "loss": 16.2688, + "num_tokens": 152144736.0, + "step": 989 + }, + { + "epoch": 1.5899136025718303, + "grad_norm": 5.6762264142112615, + "learning_rate": 1.9682484545530294e-06, + "loss": 14.7152, + "num_tokens": 152540663.0, + "step": 990 + }, + { + "epoch": 1.5915209965842876, + "grad_norm": 5.995753347244451, + "learning_rate": 1.9610080106070088e-06, + "loss": 17.1198, + "num_tokens": 152935889.0, + "step": 991 + }, + { + "epoch": 1.593128390596745, + "grad_norm": 5.5239431994938295, + "learning_rate": 1.9537915023102094e-06, + "loss": 15.7214, + "num_tokens": 153351676.0, + "step": 992 + }, + { + "epoch": 1.5947357846092023, + "grad_norm": 6.235603124820978, + "learning_rate": 1.9465989784708296e-06, + "loss": 16.2494, + "num_tokens": 153768543.0, + "step": 993 + }, + { + "epoch": 1.5963431786216598, + "grad_norm": 5.606495686311679, + "learning_rate": 1.939430487734844e-06, + "loss": 16.7961, + "num_tokens": 154186119.0, + "step": 994 + }, + { + "epoch": 1.597950572634117, + "grad_norm": 5.448806087006103, + "learning_rate": 1.932286078585688e-06, + "loss": 18.5895, + "num_tokens": 154653874.0, + "step": 995 + }, + { + "epoch": 1.5995579666465742, + "grad_norm": 5.791474287147548, + "learning_rate": 1.92516579934392e-06, + "loss": 14.9917, + "num_tokens": 155060273.0, + "step": 996 + }, + { + "epoch": 1.6011653606590315, + "grad_norm": 5.436993912703194, + "learning_rate": 1.9180696981668984e-06, + "loss": 15.7802, + "num_tokens": 155503587.0, + "step": 997 + }, + { + "epoch": 1.6027727546714887, + "grad_norm": 5.611752032821062, + "learning_rate": 1.9109978230484573e-06, + "loss": 16.2183, + "num_tokens": 155938958.0, + "step": 998 + }, + { + "epoch": 1.6043801486839462, + "grad_norm": 6.49078404190616, + "learning_rate": 1.9039502218185752e-06, + "loss": 15.7407, + "num_tokens": 156333121.0, + "step": 999 + }, + { + "epoch": 1.6059875426964034, + "grad_norm": 5.558798249164573, + "learning_rate": 1.8969269421430614e-06, + "loss": 16.135, + "num_tokens": 156772328.0, + "step": 1000 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 5.343824010362727, + "learning_rate": 1.8899280315232271e-06, + "loss": 15.2855, + "num_tokens": 157193419.0, + "step": 1001 + }, + { + "epoch": 1.6092023307213181, + "grad_norm": 4.814733078480887, + "learning_rate": 1.8829535372955657e-06, + "loss": 16.1078, + "num_tokens": 157650231.0, + "step": 1002 + }, + { + "epoch": 1.6108097247337754, + "grad_norm": 5.8806498140245225, + "learning_rate": 1.8760035066314297e-06, + "loss": 15.8679, + "num_tokens": 158107661.0, + "step": 1003 + }, + { + "epoch": 1.6124171187462326, + "grad_norm": 5.683864774508597, + "learning_rate": 1.8690779865367199e-06, + "loss": 18.5054, + "num_tokens": 158521512.0, + "step": 1004 + }, + { + "epoch": 1.6140245127586899, + "grad_norm": 6.4321810220676685, + "learning_rate": 1.8621770238515575e-06, + "loss": 16.358, + "num_tokens": 158916357.0, + "step": 1005 + }, + { + "epoch": 1.6156319067711473, + "grad_norm": 6.004080740051691, + "learning_rate": 1.8553006652499745e-06, + "loss": 15.1444, + "num_tokens": 159319362.0, + "step": 1006 + }, + { + "epoch": 1.6172393007836046, + "grad_norm": 5.5266742737649235, + "learning_rate": 1.8484489572395936e-06, + "loss": 15.8641, + "num_tokens": 159750104.0, + "step": 1007 + }, + { + "epoch": 1.618846694796062, + "grad_norm": 5.37455793632147, + "learning_rate": 1.8416219461613177e-06, + "loss": 17.0416, + "num_tokens": 160214270.0, + "step": 1008 + }, + { + "epoch": 1.6204540888085193, + "grad_norm": 5.69219273718984, + "learning_rate": 1.83481967818901e-06, + "loss": 16.4479, + "num_tokens": 160623884.0, + "step": 1009 + }, + { + "epoch": 1.6220614828209765, + "grad_norm": 5.387575488501969, + "learning_rate": 1.828042199329193e-06, + "loss": 14.7942, + "num_tokens": 161011427.0, + "step": 1010 + }, + { + "epoch": 1.6236688768334337, + "grad_norm": 5.293135580224568, + "learning_rate": 1.8212895554207233e-06, + "loss": 14.8427, + "num_tokens": 161451622.0, + "step": 1011 + }, + { + "epoch": 1.625276270845891, + "grad_norm": 5.686208721200542, + "learning_rate": 1.8145617921344922e-06, + "loss": 17.3644, + "num_tokens": 161880956.0, + "step": 1012 + }, + { + "epoch": 1.6268836648583485, + "grad_norm": 5.689519378661598, + "learning_rate": 1.807858954973114e-06, + "loss": 16.2263, + "num_tokens": 162294153.0, + "step": 1013 + }, + { + "epoch": 1.6284910588708057, + "grad_norm": 5.213367585169762, + "learning_rate": 1.8011810892706134e-06, + "loss": 17.362, + "num_tokens": 162728704.0, + "step": 1014 + }, + { + "epoch": 1.6300984528832632, + "grad_norm": 5.900073173368331, + "learning_rate": 1.794528240192126e-06, + "loss": 16.4088, + "num_tokens": 163126511.0, + "step": 1015 + }, + { + "epoch": 1.6317058468957204, + "grad_norm": 5.7521137214247915, + "learning_rate": 1.787900452733591e-06, + "loss": 16.18, + "num_tokens": 163519294.0, + "step": 1016 + }, + { + "epoch": 1.6333132409081776, + "grad_norm": 5.6111329852100065, + "learning_rate": 1.781297771721442e-06, + "loss": 16.6257, + "num_tokens": 163919233.0, + "step": 1017 + }, + { + "epoch": 1.6349206349206349, + "grad_norm": 5.809102120579805, + "learning_rate": 1.7747202418123099e-06, + "loss": 18.2281, + "num_tokens": 164327941.0, + "step": 1018 + }, + { + "epoch": 1.6365280289330921, + "grad_norm": 5.815065811253397, + "learning_rate": 1.7681679074927165e-06, + "loss": 16.9266, + "num_tokens": 164739051.0, + "step": 1019 + }, + { + "epoch": 1.6381354229455494, + "grad_norm": 5.937559985162675, + "learning_rate": 1.7616408130787789e-06, + "loss": 15.2837, + "num_tokens": 165111879.0, + "step": 1020 + }, + { + "epoch": 1.6381354229455494, + "eval_loss": 0.9130262136459351, + "eval_num_tokens": 165111879.0, + "eval_runtime": 374.8492, + "eval_samples_per_second": 23.604, + "eval_steps_per_second": 5.901, + "step": 1020 + }, + { + "epoch": 1.6397428169580068, + "grad_norm": 4.645675674950863, + "learning_rate": 1.7551390027159035e-06, + "loss": 13.7898, + "num_tokens": 165535828.0, + "step": 1021 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 5.039201487046393, + "learning_rate": 1.7486625203784902e-06, + "loss": 17.1725, + "num_tokens": 165973679.0, + "step": 1022 + }, + { + "epoch": 1.6429576049829215, + "grad_norm": 5.3033870463312125, + "learning_rate": 1.742211409869636e-06, + "loss": 15.9893, + "num_tokens": 166387479.0, + "step": 1023 + }, + { + "epoch": 1.6445649989953788, + "grad_norm": 6.032789680797502, + "learning_rate": 1.7357857148208391e-06, + "loss": 17.4633, + "num_tokens": 166802348.0, + "step": 1024 + }, + { + "epoch": 1.646172393007836, + "grad_norm": 5.735076681326581, + "learning_rate": 1.7293854786917017e-06, + "loss": 16.3112, + "num_tokens": 167195962.0, + "step": 1025 + }, + { + "epoch": 1.6477797870202933, + "grad_norm": 5.611914274070673, + "learning_rate": 1.7230107447696343e-06, + "loss": 16.2518, + "num_tokens": 167630680.0, + "step": 1026 + }, + { + "epoch": 1.6493871810327505, + "grad_norm": 5.801992414673835, + "learning_rate": 1.7166615561695687e-06, + "loss": 16.7808, + "num_tokens": 168030941.0, + "step": 1027 + }, + { + "epoch": 1.650994575045208, + "grad_norm": 5.847858066670803, + "learning_rate": 1.710337955833663e-06, + "loss": 17.2203, + "num_tokens": 168487546.0, + "step": 1028 + }, + { + "epoch": 1.6526019690576652, + "grad_norm": 5.640814602927264, + "learning_rate": 1.7040399865310114e-06, + "loss": 14.8832, + "num_tokens": 168881073.0, + "step": 1029 + }, + { + "epoch": 1.6542093630701227, + "grad_norm": 5.610377866515623, + "learning_rate": 1.6977676908573521e-06, + "loss": 16.2125, + "num_tokens": 169264724.0, + "step": 1030 + }, + { + "epoch": 1.65581675708258, + "grad_norm": 5.578955681639516, + "learning_rate": 1.6915211112347857e-06, + "loss": 17.9914, + "num_tokens": 169661744.0, + "step": 1031 + }, + { + "epoch": 1.6574241510950372, + "grad_norm": 6.282085322537562, + "learning_rate": 1.6853002899114838e-06, + "loss": 15.7254, + "num_tokens": 170083264.0, + "step": 1032 + }, + { + "epoch": 1.6590315451074944, + "grad_norm": 5.165961659419689, + "learning_rate": 1.6791052689614006e-06, + "loss": 15.2604, + "num_tokens": 170493380.0, + "step": 1033 + }, + { + "epoch": 1.6606389391199516, + "grad_norm": 6.53437318642524, + "learning_rate": 1.6729360902839989e-06, + "loss": 17.3123, + "num_tokens": 170852934.0, + "step": 1034 + }, + { + "epoch": 1.662246333132409, + "grad_norm": 6.6126690926926095, + "learning_rate": 1.6667927956039524e-06, + "loss": 16.1174, + "num_tokens": 171220301.0, + "step": 1035 + }, + { + "epoch": 1.6638537271448663, + "grad_norm": 6.216076532431903, + "learning_rate": 1.6606754264708754e-06, + "loss": 19.0162, + "num_tokens": 171648072.0, + "step": 1036 + }, + { + "epoch": 1.6654611211573238, + "grad_norm": 6.390089552608348, + "learning_rate": 1.6545840242590338e-06, + "loss": 16.0032, + "num_tokens": 172050191.0, + "step": 1037 + }, + { + "epoch": 1.667068515169781, + "grad_norm": 5.796423743776019, + "learning_rate": 1.6485186301670717e-06, + "loss": 16.0399, + "num_tokens": 172485796.0, + "step": 1038 + }, + { + "epoch": 1.6686759091822383, + "grad_norm": 5.465609634374826, + "learning_rate": 1.642479285217728e-06, + "loss": 16.1825, + "num_tokens": 172900564.0, + "step": 1039 + }, + { + "epoch": 1.6702833031946955, + "grad_norm": 5.103626675573136, + "learning_rate": 1.6364660302575613e-06, + "loss": 16.3371, + "num_tokens": 173344415.0, + "step": 1040 + }, + { + "epoch": 1.6718906972071528, + "grad_norm": 5.848759341049538, + "learning_rate": 1.6304789059566718e-06, + "loss": 16.1037, + "num_tokens": 173783873.0, + "step": 1041 + }, + { + "epoch": 1.6734980912196102, + "grad_norm": 5.996447047260234, + "learning_rate": 1.6245179528084277e-06, + "loss": 17.4757, + "num_tokens": 174217419.0, + "step": 1042 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 5.462062929032345, + "learning_rate": 1.6185832111291925e-06, + "loss": 16.1295, + "num_tokens": 174619064.0, + "step": 1043 + }, + { + "epoch": 1.676712879244525, + "grad_norm": 5.842729197541687, + "learning_rate": 1.61267472105805e-06, + "loss": 15.2796, + "num_tokens": 175008414.0, + "step": 1044 + }, + { + "epoch": 1.6783202732569822, + "grad_norm": 5.398752925675963, + "learning_rate": 1.6067925225565322e-06, + "loss": 15.2177, + "num_tokens": 175412868.0, + "step": 1045 + }, + { + "epoch": 1.6799276672694394, + "grad_norm": 5.0366261914346495, + "learning_rate": 1.6009366554083517e-06, + "loss": 16.7296, + "num_tokens": 175849401.0, + "step": 1046 + }, + { + "epoch": 1.6815350612818967, + "grad_norm": 5.553119291564378, + "learning_rate": 1.5951071592191319e-06, + "loss": 14.7445, + "num_tokens": 176276633.0, + "step": 1047 + }, + { + "epoch": 1.683142455294354, + "grad_norm": 5.514675116954182, + "learning_rate": 1.5893040734161382e-06, + "loss": 16.4458, + "num_tokens": 176704439.0, + "step": 1048 + }, + { + "epoch": 1.6847498493068114, + "grad_norm": 5.27664457159878, + "learning_rate": 1.583527437248012e-06, + "loss": 16.4284, + "num_tokens": 177151925.0, + "step": 1049 + }, + { + "epoch": 1.6863572433192686, + "grad_norm": 5.9626127156111925, + "learning_rate": 1.5777772897845033e-06, + "loss": 17.9256, + "num_tokens": 177573378.0, + "step": 1050 + }, + { + "epoch": 1.687964637331726, + "grad_norm": 5.996956750440033, + "learning_rate": 1.572053669916211e-06, + "loss": 15.9315, + "num_tokens": 177955624.0, + "step": 1051 + }, + { + "epoch": 1.6895720313441833, + "grad_norm": 5.494918070332659, + "learning_rate": 1.5663566163543134e-06, + "loss": 14.8528, + "num_tokens": 178352120.0, + "step": 1052 + }, + { + "epoch": 1.6911794253566406, + "grad_norm": 5.157707998193962, + "learning_rate": 1.5606861676303148e-06, + "loss": 17.11, + "num_tokens": 178855199.0, + "step": 1053 + }, + { + "epoch": 1.6927868193690978, + "grad_norm": 6.033177344461404, + "learning_rate": 1.5550423620957752e-06, + "loss": 16.4448, + "num_tokens": 179252145.0, + "step": 1054 + }, + { + "epoch": 1.694394213381555, + "grad_norm": 5.825310149659474, + "learning_rate": 1.54942523792206e-06, + "loss": 15.48, + "num_tokens": 179670207.0, + "step": 1055 + }, + { + "epoch": 1.6960016073940125, + "grad_norm": 5.574831716502742, + "learning_rate": 1.543834833100073e-06, + "loss": 16.613, + "num_tokens": 180045517.0, + "step": 1056 + }, + { + "epoch": 1.6976090014064698, + "grad_norm": 5.184124203387865, + "learning_rate": 1.5382711854400107e-06, + "loss": 17.4688, + "num_tokens": 180484701.0, + "step": 1057 + }, + { + "epoch": 1.6992163954189272, + "grad_norm": 5.33828679476089, + "learning_rate": 1.5327343325710925e-06, + "loss": 16.6434, + "num_tokens": 180940709.0, + "step": 1058 + }, + { + "epoch": 1.7008237894313845, + "grad_norm": 5.669096611240068, + "learning_rate": 1.52722431194132e-06, + "loss": 14.5986, + "num_tokens": 181352190.0, + "step": 1059 + }, + { + "epoch": 1.7024311834438417, + "grad_norm": 5.732641713894058, + "learning_rate": 1.5217411608172123e-06, + "loss": 17.0333, + "num_tokens": 181829442.0, + "step": 1060 + }, + { + "epoch": 1.704038577456299, + "grad_norm": 5.27537512677547, + "learning_rate": 1.516284916283563e-06, + "loss": 17.6941, + "num_tokens": 182330861.0, + "step": 1061 + }, + { + "epoch": 1.7056459714687562, + "grad_norm": 5.815672394602911, + "learning_rate": 1.510855615243183e-06, + "loss": 16.0308, + "num_tokens": 182749980.0, + "step": 1062 + }, + { + "epoch": 1.7072533654812134, + "grad_norm": 5.053269485244236, + "learning_rate": 1.505453294416655e-06, + "loss": 16.4365, + "num_tokens": 183212938.0, + "step": 1063 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 5.74007560702757, + "learning_rate": 1.5000779903420834e-06, + "loss": 15.7752, + "num_tokens": 183633748.0, + "step": 1064 + }, + { + "epoch": 1.7104681535061284, + "grad_norm": 6.140255716031313, + "learning_rate": 1.4947297393748439e-06, + "loss": 14.9197, + "num_tokens": 184032551.0, + "step": 1065 + }, + { + "epoch": 1.7120755475185856, + "grad_norm": 5.193636063941469, + "learning_rate": 1.4894085776873456e-06, + "loss": 14.7676, + "num_tokens": 184452903.0, + "step": 1066 + }, + { + "epoch": 1.7136829415310428, + "grad_norm": 6.313731834006983, + "learning_rate": 1.4841145412687797e-06, + "loss": 20.1064, + "num_tokens": 184891245.0, + "step": 1067 + }, + { + "epoch": 1.7152903355435, + "grad_norm": 5.840233970616322, + "learning_rate": 1.478847665924879e-06, + "loss": 16.281, + "num_tokens": 185307166.0, + "step": 1068 + }, + { + "epoch": 1.7168977295559573, + "grad_norm": 4.960234465465773, + "learning_rate": 1.4736079872776716e-06, + "loss": 16.5649, + "num_tokens": 185765708.0, + "step": 1069 + }, + { + "epoch": 1.7185051235684146, + "grad_norm": 6.225138240929624, + "learning_rate": 1.4683955407652484e-06, + "loss": 15.939, + "num_tokens": 186158124.0, + "step": 1070 + }, + { + "epoch": 1.720112517580872, + "grad_norm": 5.120583527953399, + "learning_rate": 1.4632103616415119e-06, + "loss": 17.6247, + "num_tokens": 186613876.0, + "step": 1071 + }, + { + "epoch": 1.720112517580872, + "eval_loss": 0.9101909399032593, + "eval_num_tokens": 186613876.0, + "eval_runtime": 375.5035, + "eval_samples_per_second": 23.563, + "eval_steps_per_second": 5.891, + "step": 1071 + }, + { + "epoch": 1.7217199115933293, + "grad_norm": 5.470591673502105, + "learning_rate": 1.458052484975952e-06, + "loss": 17.3327, + "num_tokens": 187030288.0, + "step": 1072 + }, + { + "epoch": 1.7233273056057867, + "grad_norm": 6.212074707496289, + "learning_rate": 1.4529219456533934e-06, + "loss": 17.6734, + "num_tokens": 187423862.0, + "step": 1073 + }, + { + "epoch": 1.724934699618244, + "grad_norm": 5.115610782340778, + "learning_rate": 1.4478187783737726e-06, + "loss": 16.9593, + "num_tokens": 187845783.0, + "step": 1074 + }, + { + "epoch": 1.7265420936307012, + "grad_norm": 5.3630561014891684, + "learning_rate": 1.4427430176518927e-06, + "loss": 16.9207, + "num_tokens": 188280375.0, + "step": 1075 + }, + { + "epoch": 1.7281494876431585, + "grad_norm": 5.416798960861184, + "learning_rate": 1.4376946978172017e-06, + "loss": 17.758, + "num_tokens": 188732926.0, + "step": 1076 + }, + { + "epoch": 1.7297568816556157, + "grad_norm": 6.152857461192323, + "learning_rate": 1.4326738530135472e-06, + "loss": 18.2643, + "num_tokens": 189144121.0, + "step": 1077 + }, + { + "epoch": 1.7313642756680732, + "grad_norm": 5.392176966254685, + "learning_rate": 1.427680517198956e-06, + "loss": 15.2129, + "num_tokens": 189584173.0, + "step": 1078 + }, + { + "epoch": 1.7329716696805304, + "grad_norm": 5.73418237200729, + "learning_rate": 1.4227147241453987e-06, + "loss": 17.8573, + "num_tokens": 190022157.0, + "step": 1079 + }, + { + "epoch": 1.7345790636929879, + "grad_norm": 5.517215935433924, + "learning_rate": 1.4177765074385623e-06, + "loss": 18.5507, + "num_tokens": 190455988.0, + "step": 1080 + }, + { + "epoch": 1.736186457705445, + "grad_norm": 5.959684446702604, + "learning_rate": 1.4128659004776258e-06, + "loss": 17.5235, + "num_tokens": 190896241.0, + "step": 1081 + }, + { + "epoch": 1.7377938517179023, + "grad_norm": 5.187394321937164, + "learning_rate": 1.4079829364750297e-06, + "loss": 16.6391, + "num_tokens": 191326485.0, + "step": 1082 + }, + { + "epoch": 1.7394012457303596, + "grad_norm": 5.533160296836899, + "learning_rate": 1.4031276484562582e-06, + "loss": 15.9506, + "num_tokens": 191760127.0, + "step": 1083 + }, + { + "epoch": 1.7410086397428168, + "grad_norm": 5.751427162523246, + "learning_rate": 1.3983000692596052e-06, + "loss": 15.0694, + "num_tokens": 192209367.0, + "step": 1084 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 6.4527875701949515, + "learning_rate": 1.393500231535963e-06, + "loss": 16.6661, + "num_tokens": 192567114.0, + "step": 1085 + }, + { + "epoch": 1.7442234277677315, + "grad_norm": 6.142573423677409, + "learning_rate": 1.3887281677485964e-06, + "loss": 15.3452, + "num_tokens": 192943252.0, + "step": 1086 + }, + { + "epoch": 1.745830821780189, + "grad_norm": 6.0666983353042605, + "learning_rate": 1.3839839101729233e-06, + "loss": 16.1222, + "num_tokens": 193329370.0, + "step": 1087 + }, + { + "epoch": 1.7474382157926462, + "grad_norm": 5.539876973858942, + "learning_rate": 1.3792674908962954e-06, + "loss": 16.3265, + "num_tokens": 193777002.0, + "step": 1088 + }, + { + "epoch": 1.7490456098051035, + "grad_norm": 5.684506498751541, + "learning_rate": 1.3745789418177846e-06, + "loss": 18.8923, + "num_tokens": 194251343.0, + "step": 1089 + }, + { + "epoch": 1.7506530038175607, + "grad_norm": 5.768206182396594, + "learning_rate": 1.369918294647965e-06, + "loss": 18.3707, + "num_tokens": 194650108.0, + "step": 1090 + }, + { + "epoch": 1.752260397830018, + "grad_norm": 5.819715111259433, + "learning_rate": 1.3652855809086985e-06, + "loss": 15.8017, + "num_tokens": 195043302.0, + "step": 1091 + }, + { + "epoch": 1.7538677918424754, + "grad_norm": 6.416421607580615, + "learning_rate": 1.3606808319329202e-06, + "loss": 13.11, + "num_tokens": 195426395.0, + "step": 1092 + }, + { + "epoch": 1.7554751858549327, + "grad_norm": 5.356932863560532, + "learning_rate": 1.3561040788644304e-06, + "loss": 13.6481, + "num_tokens": 195837695.0, + "step": 1093 + }, + { + "epoch": 1.7570825798673901, + "grad_norm": 5.839521295998854, + "learning_rate": 1.351555352657679e-06, + "loss": 17.2469, + "num_tokens": 196255450.0, + "step": 1094 + }, + { + "epoch": 1.7586899738798474, + "grad_norm": 5.587899117925861, + "learning_rate": 1.3470346840775632e-06, + "loss": 15.6063, + "num_tokens": 196642158.0, + "step": 1095 + }, + { + "epoch": 1.7602973678923046, + "grad_norm": 6.154059500065801, + "learning_rate": 1.3425421036992098e-06, + "loss": 14.1828, + "num_tokens": 196996266.0, + "step": 1096 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 6.049450187032808, + "learning_rate": 1.3380776419077773e-06, + "loss": 18.1201, + "num_tokens": 197393763.0, + "step": 1097 + }, + { + "epoch": 1.763512155917219, + "grad_norm": 5.559103141296837, + "learning_rate": 1.3336413288982456e-06, + "loss": 16.0454, + "num_tokens": 197806668.0, + "step": 1098 + }, + { + "epoch": 1.7651195499296763, + "grad_norm": 5.6986634343798235, + "learning_rate": 1.3292331946752113e-06, + "loss": 16.8251, + "num_tokens": 198205381.0, + "step": 1099 + }, + { + "epoch": 1.7667269439421338, + "grad_norm": 4.772590346256412, + "learning_rate": 1.3248532690526897e-06, + "loss": 16.7637, + "num_tokens": 198677290.0, + "step": 1100 + }, + { + "epoch": 1.7683343379545913, + "grad_norm": 6.163774882856735, + "learning_rate": 1.3205015816539056e-06, + "loss": 17.7161, + "num_tokens": 199041320.0, + "step": 1101 + }, + { + "epoch": 1.7699417319670485, + "grad_norm": 6.000112091390441, + "learning_rate": 1.3161781619111022e-06, + "loss": 13.4033, + "num_tokens": 199447296.0, + "step": 1102 + }, + { + "epoch": 1.7715491259795058, + "grad_norm": 6.455532865985289, + "learning_rate": 1.311883039065332e-06, + "loss": 17.8292, + "num_tokens": 199844172.0, + "step": 1103 + }, + { + "epoch": 1.773156519991963, + "grad_norm": 5.529245167887561, + "learning_rate": 1.3076162421662665e-06, + "loss": 17.5833, + "num_tokens": 200257365.0, + "step": 1104 + }, + { + "epoch": 1.7747639140044202, + "grad_norm": 5.6966834310957015, + "learning_rate": 1.3033778000719978e-06, + "loss": 15.1792, + "num_tokens": 200675342.0, + "step": 1105 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 5.759319387192927, + "learning_rate": 1.299167741448842e-06, + "loss": 18.0534, + "num_tokens": 201119767.0, + "step": 1106 + }, + { + "epoch": 1.777978702029335, + "grad_norm": 6.444582190114083, + "learning_rate": 1.294986094771146e-06, + "loss": 15.8181, + "num_tokens": 201492910.0, + "step": 1107 + }, + { + "epoch": 1.7795860960417924, + "grad_norm": 5.988935700089299, + "learning_rate": 1.2908328883210946e-06, + "loss": 14.8046, + "num_tokens": 201884653.0, + "step": 1108 + }, + { + "epoch": 1.7811934900542497, + "grad_norm": 4.878394442507177, + "learning_rate": 1.286708150188521e-06, + "loss": 15.4053, + "num_tokens": 202371415.0, + "step": 1109 + }, + { + "epoch": 1.782800884066707, + "grad_norm": 5.376681580620633, + "learning_rate": 1.2826119082707162e-06, + "loss": 18.2119, + "num_tokens": 202827338.0, + "step": 1110 + }, + { + "epoch": 1.7844082780791641, + "grad_norm": 6.029400160907876, + "learning_rate": 1.2785441902722365e-06, + "loss": 16.8847, + "num_tokens": 203234229.0, + "step": 1111 + }, + { + "epoch": 1.7860156720916214, + "grad_norm": 5.404928582961019, + "learning_rate": 1.274505023704722e-06, + "loss": 17.6841, + "num_tokens": 203647245.0, + "step": 1112 + }, + { + "epoch": 1.7876230661040786, + "grad_norm": 5.587890634103252, + "learning_rate": 1.2704944358867071e-06, + "loss": 17.2833, + "num_tokens": 204091625.0, + "step": 1113 + }, + { + "epoch": 1.789230460116536, + "grad_norm": 5.417846462064859, + "learning_rate": 1.266512453943437e-06, + "loss": 16.1214, + "num_tokens": 204492724.0, + "step": 1114 + }, + { + "epoch": 1.7908378541289933, + "grad_norm": 5.734243280961947, + "learning_rate": 1.2625591048066814e-06, + "loss": 13.9964, + "num_tokens": 204826007.0, + "step": 1115 + }, + { + "epoch": 1.7924452481414508, + "grad_norm": 5.137452012954203, + "learning_rate": 1.2586344152145578e-06, + "loss": 17.3246, + "num_tokens": 205272595.0, + "step": 1116 + }, + { + "epoch": 1.794052642153908, + "grad_norm": 5.532638406864491, + "learning_rate": 1.2547384117113456e-06, + "loss": 17.0411, + "num_tokens": 205666409.0, + "step": 1117 + }, + { + "epoch": 1.7956600361663653, + "grad_norm": 5.749543977408011, + "learning_rate": 1.2508711206473079e-06, + "loss": 16.5469, + "num_tokens": 206086028.0, + "step": 1118 + }, + { + "epoch": 1.7972674301788225, + "grad_norm": 6.190464809376764, + "learning_rate": 1.247032568178516e-06, + "loss": 17.6302, + "num_tokens": 206462474.0, + "step": 1119 + }, + { + "epoch": 1.7988748241912798, + "grad_norm": 5.766415913605399, + "learning_rate": 1.2432227802666683e-06, + "loss": 18.4856, + "num_tokens": 206911814.0, + "step": 1120 + }, + { + "epoch": 1.8004822182037372, + "grad_norm": 6.088188148895654, + "learning_rate": 1.239441782678918e-06, + "loss": 16.8456, + "num_tokens": 207288020.0, + "step": 1121 + }, + { + "epoch": 1.8020896122161945, + "grad_norm": 6.1907194697578305, + "learning_rate": 1.2356896009876953e-06, + "loss": 15.9138, + "num_tokens": 207701614.0, + "step": 1122 + }, + { + "epoch": 1.8020896122161945, + "eval_loss": 0.9071792960166931, + "eval_num_tokens": 207701614.0, + "eval_runtime": 375.4053, + "eval_samples_per_second": 23.569, + "eval_steps_per_second": 5.892, + "step": 1122 + }, + { + "epoch": 1.803697006228652, + "grad_norm": 6.41228128819154, + "learning_rate": 1.231966260570542e-06, + "loss": 15.2304, + "num_tokens": 208093649.0, + "step": 1123 + }, + { + "epoch": 1.8053044002411092, + "grad_norm": 5.71344577514554, + "learning_rate": 1.2282717866099278e-06, + "loss": 18.0213, + "num_tokens": 208534581.0, + "step": 1124 + }, + { + "epoch": 1.8069117942535664, + "grad_norm": 5.738763832575242, + "learning_rate": 1.224606204093092e-06, + "loss": 16.908, + "num_tokens": 208918969.0, + "step": 1125 + }, + { + "epoch": 1.8085191882660236, + "grad_norm": 6.78514902313829, + "learning_rate": 1.2209695378118662e-06, + "loss": 15.7252, + "num_tokens": 209309384.0, + "step": 1126 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 6.314540954597329, + "learning_rate": 1.2173618123625114e-06, + "loss": 15.4903, + "num_tokens": 209693371.0, + "step": 1127 + }, + { + "epoch": 1.8117339762909384, + "grad_norm": 4.931901080924943, + "learning_rate": 1.2137830521455491e-06, + "loss": 17.4824, + "num_tokens": 210133103.0, + "step": 1128 + }, + { + "epoch": 1.8133413703033956, + "grad_norm": 6.4083198010837625, + "learning_rate": 1.2102332813655976e-06, + "loss": 19.3428, + "num_tokens": 210552556.0, + "step": 1129 + }, + { + "epoch": 1.814948764315853, + "grad_norm": 4.910677728160251, + "learning_rate": 1.2067125240312065e-06, + "loss": 17.4666, + "num_tokens": 211037572.0, + "step": 1130 + }, + { + "epoch": 1.8165561583283103, + "grad_norm": 5.5288392413845635, + "learning_rate": 1.203220803954698e-06, + "loss": 17.0508, + "num_tokens": 211475509.0, + "step": 1131 + }, + { + "epoch": 1.8181635523407675, + "grad_norm": 5.278879731207785, + "learning_rate": 1.1997581447520006e-06, + "loss": 15.5352, + "num_tokens": 211926975.0, + "step": 1132 + }, + { + "epoch": 1.8197709463532248, + "grad_norm": 4.979333461650769, + "learning_rate": 1.1963245698424947e-06, + "loss": 15.761, + "num_tokens": 212382390.0, + "step": 1133 + }, + { + "epoch": 1.821378340365682, + "grad_norm": 6.04431314225651, + "learning_rate": 1.192920102448851e-06, + "loss": 15.6307, + "num_tokens": 212794766.0, + "step": 1134 + }, + { + "epoch": 1.8229857343781395, + "grad_norm": 5.834208980534594, + "learning_rate": 1.1895447655968729e-06, + "loss": 14.8834, + "num_tokens": 213188571.0, + "step": 1135 + }, + { + "epoch": 1.8245931283905967, + "grad_norm": 6.017434580137432, + "learning_rate": 1.1861985821153444e-06, + "loss": 15.9958, + "num_tokens": 213584015.0, + "step": 1136 + }, + { + "epoch": 1.8262005224030542, + "grad_norm": 5.883485766771821, + "learning_rate": 1.1828815746358693e-06, + "loss": 15.3746, + "num_tokens": 213951357.0, + "step": 1137 + }, + { + "epoch": 1.8278079164155114, + "grad_norm": 6.247913670304622, + "learning_rate": 1.1795937655927283e-06, + "loss": 14.4933, + "num_tokens": 214313020.0, + "step": 1138 + }, + { + "epoch": 1.8294153104279687, + "grad_norm": 5.80402806100467, + "learning_rate": 1.1763351772227153e-06, + "loss": 14.8009, + "num_tokens": 214686689.0, + "step": 1139 + }, + { + "epoch": 1.831022704440426, + "grad_norm": 5.613817540491096, + "learning_rate": 1.1731058315649965e-06, + "loss": 16.9549, + "num_tokens": 215109674.0, + "step": 1140 + }, + { + "epoch": 1.8326300984528832, + "grad_norm": 5.755907448010412, + "learning_rate": 1.1699057504609546e-06, + "loss": 17.1598, + "num_tokens": 215531950.0, + "step": 1141 + }, + { + "epoch": 1.8342374924653404, + "grad_norm": 5.022998221322219, + "learning_rate": 1.166734955554049e-06, + "loss": 17.6038, + "num_tokens": 215992916.0, + "step": 1142 + }, + { + "epoch": 1.8358448864777979, + "grad_norm": 5.96840565689285, + "learning_rate": 1.1635934682896586e-06, + "loss": 14.8299, + "num_tokens": 216422632.0, + "step": 1143 + }, + { + "epoch": 1.8374522804902553, + "grad_norm": 5.722466389988625, + "learning_rate": 1.1604813099149478e-06, + "loss": 16.9265, + "num_tokens": 216835936.0, + "step": 1144 + }, + { + "epoch": 1.8390596745027126, + "grad_norm": 5.219482060849325, + "learning_rate": 1.1573985014787145e-06, + "loss": 18.1496, + "num_tokens": 217297785.0, + "step": 1145 + }, + { + "epoch": 1.8406670685151698, + "grad_norm": 5.453487294279657, + "learning_rate": 1.1543450638312525e-06, + "loss": 14.8859, + "num_tokens": 217709299.0, + "step": 1146 + }, + { + "epoch": 1.842274462527627, + "grad_norm": 5.542894083357031, + "learning_rate": 1.1513210176242082e-06, + "loss": 15.7738, + "num_tokens": 218118571.0, + "step": 1147 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 5.1251828921877145, + "learning_rate": 1.1483263833104422e-06, + "loss": 14.7667, + "num_tokens": 218516515.0, + "step": 1148 + }, + { + "epoch": 1.8454892505525415, + "grad_norm": 5.676022496543299, + "learning_rate": 1.1453611811438914e-06, + "loss": 14.584, + "num_tokens": 218895980.0, + "step": 1149 + }, + { + "epoch": 1.847096644564999, + "grad_norm": 5.83022163950287, + "learning_rate": 1.1424254311794286e-06, + "loss": 17.5445, + "num_tokens": 219334016.0, + "step": 1150 + }, + { + "epoch": 1.8487040385774565, + "grad_norm": 5.596548994023143, + "learning_rate": 1.1395191532727306e-06, + "loss": 17.6321, + "num_tokens": 219768969.0, + "step": 1151 + }, + { + "epoch": 1.8503114325899137, + "grad_norm": 6.3867779572894765, + "learning_rate": 1.136642367080143e-06, + "loss": 16.488, + "num_tokens": 220160513.0, + "step": 1152 + }, + { + "epoch": 1.851918826602371, + "grad_norm": 4.835047035925783, + "learning_rate": 1.1337950920585476e-06, + "loss": 16.882, + "num_tokens": 220576609.0, + "step": 1153 + }, + { + "epoch": 1.8535262206148282, + "grad_norm": 5.632790161008604, + "learning_rate": 1.130977347465227e-06, + "loss": 17.6376, + "num_tokens": 220978098.0, + "step": 1154 + }, + { + "epoch": 1.8551336146272854, + "grad_norm": 5.258347162774134, + "learning_rate": 1.1281891523577413e-06, + "loss": 16.8908, + "num_tokens": 221400443.0, + "step": 1155 + }, + { + "epoch": 1.8567410086397427, + "grad_norm": 5.661493684922162, + "learning_rate": 1.1254305255937925e-06, + "loss": 14.7866, + "num_tokens": 221749707.0, + "step": 1156 + }, + { + "epoch": 1.8583484026522001, + "grad_norm": 5.33005060996647, + "learning_rate": 1.1227014858311022e-06, + "loss": 14.7077, + "num_tokens": 222141188.0, + "step": 1157 + }, + { + "epoch": 1.8599557966646574, + "grad_norm": 5.59374821910163, + "learning_rate": 1.12000205152728e-06, + "loss": 14.2631, + "num_tokens": 222544880.0, + "step": 1158 + }, + { + "epoch": 1.8615631906771148, + "grad_norm": 5.294058527044083, + "learning_rate": 1.1173322409397042e-06, + "loss": 18.3455, + "num_tokens": 222974799.0, + "step": 1159 + }, + { + "epoch": 1.863170584689572, + "grad_norm": 6.106734114249205, + "learning_rate": 1.1146920721253945e-06, + "loss": 17.6714, + "num_tokens": 223353516.0, + "step": 1160 + }, + { + "epoch": 1.8647779787020293, + "grad_norm": 5.484559878919397, + "learning_rate": 1.1120815629408926e-06, + "loss": 14.0907, + "num_tokens": 223746143.0, + "step": 1161 + }, + { + "epoch": 1.8663853727144866, + "grad_norm": 5.911380606194956, + "learning_rate": 1.1095007310421383e-06, + "loss": 17.3796, + "num_tokens": 224161097.0, + "step": 1162 + }, + { + "epoch": 1.8679927667269438, + "grad_norm": 5.9189117227133545, + "learning_rate": 1.1069495938843527e-06, + "loss": 16.0905, + "num_tokens": 224555219.0, + "step": 1163 + }, + { + "epoch": 1.8696001607394013, + "grad_norm": 5.823189780868601, + "learning_rate": 1.104428168721919e-06, + "loss": 15.175, + "num_tokens": 224960007.0, + "step": 1164 + }, + { + "epoch": 1.8712075547518585, + "grad_norm": 4.834911477613552, + "learning_rate": 1.1019364726082662e-06, + "loss": 16.0813, + "num_tokens": 225411414.0, + "step": 1165 + }, + { + "epoch": 1.872814948764316, + "grad_norm": 6.0352484371454205, + "learning_rate": 1.0994745223957536e-06, + "loss": 16.0345, + "num_tokens": 225780271.0, + "step": 1166 + }, + { + "epoch": 1.8744223427767732, + "grad_norm": 6.137002771853259, + "learning_rate": 1.0970423347355563e-06, + "loss": 15.4474, + "num_tokens": 226151777.0, + "step": 1167 + }, + { + "epoch": 1.8760297367892305, + "grad_norm": 6.506651415207588, + "learning_rate": 1.094639926077554e-06, + "loss": 18.6836, + "num_tokens": 226555648.0, + "step": 1168 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 5.151501431275093, + "learning_rate": 1.0922673126702175e-06, + "loss": 15.3529, + "num_tokens": 227029894.0, + "step": 1169 + }, + { + "epoch": 1.879244524814145, + "grad_norm": 7.6016270985018055, + "learning_rate": 1.0899245105605013e-06, + "loss": 15.5262, + "num_tokens": 227406998.0, + "step": 1170 + }, + { + "epoch": 1.8808519188266024, + "grad_norm": 6.285819104356284, + "learning_rate": 1.087611535593734e-06, + "loss": 18.9258, + "num_tokens": 227803854.0, + "step": 1171 + }, + { + "epoch": 1.8824593128390597, + "grad_norm": 5.371685899090172, + "learning_rate": 1.0853284034135111e-06, + "loss": 13.9968, + "num_tokens": 228231092.0, + "step": 1172 + }, + { + "epoch": 1.8840667068515171, + "grad_norm": 5.0966884512961075, + "learning_rate": 1.0830751294615872e-06, + "loss": 16.4603, + "num_tokens": 228677483.0, + "step": 1173 + }, + { + "epoch": 1.8840667068515171, + "eval_loss": 0.9048787951469421, + "eval_num_tokens": 228677483.0, + "eval_runtime": 375.6288, + "eval_samples_per_second": 23.555, + "eval_steps_per_second": 5.889, + "step": 1173 + }, + { + "epoch": 1.8856741008639744, + "grad_norm": 5.643596005393563, + "learning_rate": 1.0808517289777762e-06, + "loss": 16.274, + "num_tokens": 229066715.0, + "step": 1174 + }, + { + "epoch": 1.8872814948764316, + "grad_norm": 6.07445776389269, + "learning_rate": 1.0786582169998434e-06, + "loss": 16.5755, + "num_tokens": 229498024.0, + "step": 1175 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 6.04837535064954, + "learning_rate": 1.0764946083634075e-06, + "loss": 17.5761, + "num_tokens": 229901938.0, + "step": 1176 + }, + { + "epoch": 1.890496282901346, + "grad_norm": 6.077546722892833, + "learning_rate": 1.0743609177018378e-06, + "loss": 16.5305, + "num_tokens": 230276429.0, + "step": 1177 + }, + { + "epoch": 1.8921036769138035, + "grad_norm": 5.408679539744412, + "learning_rate": 1.0722571594461558e-06, + "loss": 16.2387, + "num_tokens": 230711863.0, + "step": 1178 + }, + { + "epoch": 1.8937110709262608, + "grad_norm": 6.077728891550048, + "learning_rate": 1.0701833478249385e-06, + "loss": 16.1663, + "num_tokens": 231127189.0, + "step": 1179 + }, + { + "epoch": 1.8953184649387183, + "grad_norm": 5.577637048155881, + "learning_rate": 1.0681394968642227e-06, + "loss": 16.1364, + "num_tokens": 231561014.0, + "step": 1180 + }, + { + "epoch": 1.8969258589511755, + "grad_norm": 5.700308034091714, + "learning_rate": 1.066125620387406e-06, + "loss": 15.6049, + "num_tokens": 231932725.0, + "step": 1181 + }, + { + "epoch": 1.8985332529636327, + "grad_norm": 5.472550430764944, + "learning_rate": 1.064141732015159e-06, + "loss": 17.2123, + "num_tokens": 232368837.0, + "step": 1182 + }, + { + "epoch": 1.90014064697609, + "grad_norm": 5.0892880056570196, + "learning_rate": 1.06218784516533e-06, + "loss": 16.0334, + "num_tokens": 232798962.0, + "step": 1183 + }, + { + "epoch": 1.9017480409885472, + "grad_norm": 5.486890764110697, + "learning_rate": 1.060263973052853e-06, + "loss": 16.7993, + "num_tokens": 233208993.0, + "step": 1184 + }, + { + "epoch": 1.9033554350010045, + "grad_norm": 6.285805689061978, + "learning_rate": 1.0583701286896633e-06, + "loss": 16.093, + "num_tokens": 233641760.0, + "step": 1185 + }, + { + "epoch": 1.904962829013462, + "grad_norm": 5.202769153241926, + "learning_rate": 1.0565063248846027e-06, + "loss": 18.0528, + "num_tokens": 234062199.0, + "step": 1186 + }, + { + "epoch": 1.9065702230259194, + "grad_norm": 5.475374587265379, + "learning_rate": 1.0546725742433387e-06, + "loss": 16.4997, + "num_tokens": 234523716.0, + "step": 1187 + }, + { + "epoch": 1.9081776170383766, + "grad_norm": 5.1646648278303475, + "learning_rate": 1.0528688891682756e-06, + "loss": 16.0919, + "num_tokens": 234985389.0, + "step": 1188 + }, + { + "epoch": 1.9097850110508339, + "grad_norm": 4.295878410105713, + "learning_rate": 1.0510952818584742e-06, + "loss": 14.4896, + "num_tokens": 235441766.0, + "step": 1189 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 5.95330982026002, + "learning_rate": 1.0493517643095647e-06, + "loss": 18.2837, + "num_tokens": 235857566.0, + "step": 1190 + }, + { + "epoch": 1.9129997990757484, + "grad_norm": 5.686947632919641, + "learning_rate": 1.0476383483136687e-06, + "loss": 16.7004, + "num_tokens": 236215520.0, + "step": 1191 + }, + { + "epoch": 1.9146071930882056, + "grad_norm": 6.976429803022317, + "learning_rate": 1.0459550454593198e-06, + "loss": 16.7754, + "num_tokens": 236636487.0, + "step": 1192 + }, + { + "epoch": 1.916214587100663, + "grad_norm": 5.5488434599392855, + "learning_rate": 1.044301867131382e-06, + "loss": 17.4664, + "num_tokens": 237091607.0, + "step": 1193 + }, + { + "epoch": 1.9178219811131203, + "grad_norm": 5.446034394281094, + "learning_rate": 1.0426788245109763e-06, + "loss": 15.8011, + "num_tokens": 237506804.0, + "step": 1194 + }, + { + "epoch": 1.9194293751255778, + "grad_norm": 5.278908448861804, + "learning_rate": 1.0410859285754042e-06, + "loss": 16.2185, + "num_tokens": 237891695.0, + "step": 1195 + }, + { + "epoch": 1.921036769138035, + "grad_norm": 6.517892420750336, + "learning_rate": 1.039523190098071e-06, + "loss": 17.0864, + "num_tokens": 238294980.0, + "step": 1196 + }, + { + "epoch": 1.9226441631504922, + "grad_norm": 6.0459610570161795, + "learning_rate": 1.0379906196484158e-06, + "loss": 16.8141, + "num_tokens": 238710354.0, + "step": 1197 + }, + { + "epoch": 1.9242515571629495, + "grad_norm": 7.372111896327063, + "learning_rate": 1.0364882275918386e-06, + "loss": 16.5398, + "num_tokens": 239142978.0, + "step": 1198 + }, + { + "epoch": 1.9258589511754067, + "grad_norm": 5.765418978338161, + "learning_rate": 1.0350160240896323e-06, + "loss": 16.7391, + "num_tokens": 239538696.0, + "step": 1199 + }, + { + "epoch": 1.9274663451878642, + "grad_norm": 5.831131831302644, + "learning_rate": 1.0335740190989093e-06, + "loss": 16.5487, + "num_tokens": 239910560.0, + "step": 1200 + }, + { + "epoch": 1.9290737392003214, + "grad_norm": 5.964686608482026, + "learning_rate": 1.03216222237254e-06, + "loss": 14.5456, + "num_tokens": 240277767.0, + "step": 1201 + }, + { + "epoch": 1.930681133212779, + "grad_norm": 5.889783164828484, + "learning_rate": 1.030780643459082e-06, + "loss": 16.0379, + "num_tokens": 240687953.0, + "step": 1202 + }, + { + "epoch": 1.9322885272252361, + "grad_norm": 5.691051612122731, + "learning_rate": 1.0294292917027177e-06, + "loss": 17.7673, + "num_tokens": 241108706.0, + "step": 1203 + }, + { + "epoch": 1.9338959212376934, + "grad_norm": 5.668375573105925, + "learning_rate": 1.0281081762431918e-06, + "loss": 14.5588, + "num_tokens": 241516337.0, + "step": 1204 + }, + { + "epoch": 1.9355033152501506, + "grad_norm": 6.611356074514582, + "learning_rate": 1.0268173060157488e-06, + "loss": 15.1079, + "num_tokens": 241891368.0, + "step": 1205 + }, + { + "epoch": 1.9371107092626079, + "grad_norm": 5.525094365572791, + "learning_rate": 1.0255566897510701e-06, + "loss": 16.694, + "num_tokens": 242349828.0, + "step": 1206 + }, + { + "epoch": 1.9387181032750653, + "grad_norm": 5.810836469216857, + "learning_rate": 1.0243263359752202e-06, + "loss": 16.4286, + "num_tokens": 242731976.0, + "step": 1207 + }, + { + "epoch": 1.9403254972875226, + "grad_norm": 5.456554564102643, + "learning_rate": 1.0231262530095838e-06, + "loss": 16.7109, + "num_tokens": 243151516.0, + "step": 1208 + }, + { + "epoch": 1.94193289129998, + "grad_norm": 5.489822091522901, + "learning_rate": 1.0219564489708123e-06, + "loss": 15.8669, + "num_tokens": 243562131.0, + "step": 1209 + }, + { + "epoch": 1.9435402853124373, + "grad_norm": 6.72812996742137, + "learning_rate": 1.020816931770769e-06, + "loss": 17.6394, + "num_tokens": 243936136.0, + "step": 1210 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 6.379965071803016, + "learning_rate": 1.0197077091164745e-06, + "loss": 16.8727, + "num_tokens": 244329865.0, + "step": 1211 + }, + { + "epoch": 1.9467550733373518, + "grad_norm": 5.322129403098656, + "learning_rate": 1.0186287885100546e-06, + "loss": 14.7792, + "num_tokens": 244751197.0, + "step": 1212 + }, + { + "epoch": 1.948362467349809, + "grad_norm": 5.451819578934776, + "learning_rate": 1.0175801772486895e-06, + "loss": 16.4576, + "num_tokens": 245164637.0, + "step": 1213 + }, + { + "epoch": 1.9499698613622665, + "grad_norm": 6.1131451086384585, + "learning_rate": 1.016561882424568e-06, + "loss": 16.7264, + "num_tokens": 245541210.0, + "step": 1214 + }, + { + "epoch": 1.9515772553747237, + "grad_norm": 5.9175755132210295, + "learning_rate": 1.0155739109248322e-06, + "loss": 16.6469, + "num_tokens": 245933494.0, + "step": 1215 + }, + { + "epoch": 1.9531846493871812, + "grad_norm": 6.197533509290699, + "learning_rate": 1.014616269431538e-06, + "loss": 15.9556, + "num_tokens": 246297822.0, + "step": 1216 + }, + { + "epoch": 1.9547920433996384, + "grad_norm": 6.128624692292462, + "learning_rate": 1.0136889644216055e-06, + "loss": 16.0485, + "num_tokens": 246688927.0, + "step": 1217 + }, + { + "epoch": 1.9563994374120957, + "grad_norm": 5.087112491150184, + "learning_rate": 1.0127920021667779e-06, + "loss": 16.1969, + "num_tokens": 247140390.0, + "step": 1218 + }, + { + "epoch": 1.958006831424553, + "grad_norm": 5.282122107170681, + "learning_rate": 1.0119253887335782e-06, + "loss": 16.4552, + "num_tokens": 247552395.0, + "step": 1219 + }, + { + "epoch": 1.9596142254370101, + "grad_norm": 5.660044854225965, + "learning_rate": 1.0110891299832661e-06, + "loss": 15.5057, + "num_tokens": 247996908.0, + "step": 1220 + }, + { + "epoch": 1.9612216194494676, + "grad_norm": 4.717051518675388, + "learning_rate": 1.0102832315718022e-06, + "loss": 15.8764, + "num_tokens": 248410260.0, + "step": 1221 + }, + { + "epoch": 1.9628290134619248, + "grad_norm": 5.8428892686684915, + "learning_rate": 1.009507698949806e-06, + "loss": 16.5791, + "num_tokens": 248814618.0, + "step": 1222 + }, + { + "epoch": 1.9644364074743823, + "grad_norm": 5.404082977353084, + "learning_rate": 1.0087625373625226e-06, + "loss": 15.5427, + "num_tokens": 249239516.0, + "step": 1223 + }, + { + "epoch": 1.9660438014868395, + "grad_norm": 5.414654288171972, + "learning_rate": 1.0080477518497835e-06, + "loss": 15.4976, + "num_tokens": 249637745.0, + "step": 1224 + }, + { + "epoch": 1.9660438014868395, + "eval_loss": 0.9025331139564514, + "eval_num_tokens": 249637745.0, + "eval_runtime": 375.5085, + "eval_samples_per_second": 23.563, + "eval_steps_per_second": 5.891, + "step": 1224 + }, + { + "epoch": 1.9676511954992968, + "grad_norm": 6.044207428072108, + "learning_rate": 1.0073633472459758e-06, + "loss": 15.3605, + "num_tokens": 250040321.0, + "step": 1225 + }, + { + "epoch": 1.969258589511754, + "grad_norm": 6.124310866758191, + "learning_rate": 1.006709328180007e-06, + "loss": 14.9237, + "num_tokens": 250420779.0, + "step": 1226 + }, + { + "epoch": 1.9708659835242113, + "grad_norm": 6.2897981965575065, + "learning_rate": 1.0060856990752757e-06, + "loss": 17.9574, + "num_tokens": 250836977.0, + "step": 1227 + }, + { + "epoch": 1.9724733775366685, + "grad_norm": 5.795574540730705, + "learning_rate": 1.0054924641496406e-06, + "loss": 16.1906, + "num_tokens": 251278982.0, + "step": 1228 + }, + { + "epoch": 1.974080771549126, + "grad_norm": 5.873969373020597, + "learning_rate": 1.0049296274153914e-06, + "loss": 15.3191, + "num_tokens": 251711943.0, + "step": 1229 + }, + { + "epoch": 1.9756881655615834, + "grad_norm": 4.919180589205254, + "learning_rate": 1.0043971926792237e-06, + "loss": 15.6381, + "num_tokens": 252114810.0, + "step": 1230 + }, + { + "epoch": 1.9772955595740407, + "grad_norm": 5.809261622177621, + "learning_rate": 1.0038951635422105e-06, + "loss": 18.079, + "num_tokens": 252585708.0, + "step": 1231 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 6.189425163380494, + "learning_rate": 1.0034235433997802e-06, + "loss": 15.9744, + "num_tokens": 253000276.0, + "step": 1232 + }, + { + "epoch": 1.9805103475989552, + "grad_norm": 5.845987438642186, + "learning_rate": 1.0029823354416931e-06, + "loss": 17.2062, + "num_tokens": 253417041.0, + "step": 1233 + }, + { + "epoch": 1.9821177416114124, + "grad_norm": 5.935966199473088, + "learning_rate": 1.0025715426520185e-06, + "loss": 14.4739, + "num_tokens": 253851231.0, + "step": 1234 + }, + { + "epoch": 1.9837251356238697, + "grad_norm": 6.673148313742776, + "learning_rate": 1.0021911678091159e-06, + "loss": 13.9743, + "num_tokens": 254171628.0, + "step": 1235 + }, + { + "epoch": 1.9853325296363271, + "grad_norm": 6.2434344830482695, + "learning_rate": 1.0018412134856163e-06, + "loss": 15.4476, + "num_tokens": 254549015.0, + "step": 1236 + }, + { + "epoch": 1.9869399236487844, + "grad_norm": 5.298696330868717, + "learning_rate": 1.0015216820484032e-06, + "loss": 16.3716, + "num_tokens": 254956082.0, + "step": 1237 + }, + { + "epoch": 1.9885473176612418, + "grad_norm": 5.840484430253416, + "learning_rate": 1.0012325756585986e-06, + "loss": 17.47, + "num_tokens": 255380323.0, + "step": 1238 + }, + { + "epoch": 1.990154711673699, + "grad_norm": 5.189028673505454, + "learning_rate": 1.000973896271547e-06, + "loss": 14.7357, + "num_tokens": 255850029.0, + "step": 1239 + }, + { + "epoch": 1.9917621056861563, + "grad_norm": 5.725892152355787, + "learning_rate": 1.000745645636803e-06, + "loss": 14.8511, + "num_tokens": 256275307.0, + "step": 1240 + }, + { + "epoch": 1.9933694996986135, + "grad_norm": 5.871308919894856, + "learning_rate": 1.0005478252981187e-06, + "loss": 18.6437, + "num_tokens": 256733081.0, + "step": 1241 + }, + { + "epoch": 1.9949768937110708, + "grad_norm": 6.298951502572325, + "learning_rate": 1.0003804365934346e-06, + "loss": 17.3237, + "num_tokens": 257126176.0, + "step": 1242 + }, + { + "epoch": 1.9965842877235283, + "grad_norm": 5.135087379745041, + "learning_rate": 1.000243480654868e-06, + "loss": 17.2417, + "num_tokens": 257547044.0, + "step": 1243 + }, + { + "epoch": 1.9981916817359855, + "grad_norm": 5.665400816342981, + "learning_rate": 1.0001369584087082e-06, + "loss": 16.7055, + "num_tokens": 257953218.0, + "step": 1244 + }, + { + "epoch": 1.999799075748443, + "grad_norm": 5.502546448338944, + "learning_rate": 1.0000608705754089e-06, + "loss": 16.2393, + "num_tokens": 258413945.0, + "step": 1245 + }, + { + "epoch": 2.0, + "grad_norm": 5.502546448338944, + "learning_rate": 1.0000152176695831e-06, + "loss": 1.7342, + "num_tokens": 258442804.0, + "step": 1246 + }, { "epoch": 2.0, - "step": 154, - "total_flos": 180679423688704.0, - "train_loss": 45.599958295945996, - "train_runtime": 3679.7026, - "train_samples_per_second": 5.342, - "train_steps_per_second": 0.042 + "step": 1246, + "total_flos": 1446350766178304.0, + "train_loss": 8.238658274636032, + "train_runtime": 17767.8846, + "train_samples_per_second": 8.963, + "train_steps_per_second": 0.07 } ], "logging_steps": 1, - "max_steps": 154, + "max_steps": 1246, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, @@ -1295,7 +10220,7 @@ "attributes": {} } }, - "total_flos": 180679423688704.0, + "total_flos": 1446350766178304.0, "train_batch_size": 2, "trial_name": null, "trial_params": null