| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 4926, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0060901339829476245, |
| "grad_norm": 23.5319766998291, |
| "learning_rate": 1.2162162162162164e-05, |
| "loss": 4.5905, |
| "mean_token_accuracy": 0.3401473943144083, |
| "num_tokens": 132681.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.012180267965895249, |
| "grad_norm": 6.916630744934082, |
| "learning_rate": 2.5675675675675675e-05, |
| "loss": 3.957, |
| "mean_token_accuracy": 0.3799716055393219, |
| "num_tokens": 264238.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.018270401948842874, |
| "grad_norm": 2.717982292175293, |
| "learning_rate": 3.918918918918919e-05, |
| "loss": 3.047, |
| "mean_token_accuracy": 0.46811963245272636, |
| "num_tokens": 401308.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.024360535931790498, |
| "grad_norm": 2.407865524291992, |
| "learning_rate": 5.27027027027027e-05, |
| "loss": 2.5322, |
| "mean_token_accuracy": 0.5190828196704388, |
| "num_tokens": 532444.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.030450669914738125, |
| "grad_norm": 1.018301248550415, |
| "learning_rate": 6.621621621621621e-05, |
| "loss": 2.1325, |
| "mean_token_accuracy": 0.5782605841755867, |
| "num_tokens": 660406.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03654080389768575, |
| "grad_norm": 0.7315741181373596, |
| "learning_rate": 7.972972972972974e-05, |
| "loss": 1.9044, |
| "mean_token_accuracy": 0.6264914631843567, |
| "num_tokens": 795304.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04263093788063337, |
| "grad_norm": 0.6652920246124268, |
| "learning_rate": 9.324324324324324e-05, |
| "loss": 1.633, |
| "mean_token_accuracy": 0.6683938711881637, |
| "num_tokens": 934543.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.048721071863580996, |
| "grad_norm": 0.6119660139083862, |
| "learning_rate": 0.00010675675675675677, |
| "loss": 1.543, |
| "mean_token_accuracy": 0.6834091022610664, |
| "num_tokens": 1070669.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.05481120584652863, |
| "grad_norm": 0.591424286365509, |
| "learning_rate": 0.00012027027027027027, |
| "loss": 1.4154, |
| "mean_token_accuracy": 0.6991497233510018, |
| "num_tokens": 1211114.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.06090133982947625, |
| "grad_norm": 0.5663530230522156, |
| "learning_rate": 0.0001337837837837838, |
| "loss": 1.3176, |
| "mean_token_accuracy": 0.7089206710457802, |
| "num_tokens": 1349584.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.06699147381242387, |
| "grad_norm": 0.5881878137588501, |
| "learning_rate": 0.0001472972972972973, |
| "loss": 1.2293, |
| "mean_token_accuracy": 0.7254403859376908, |
| "num_tokens": 1487515.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0730816077953715, |
| "grad_norm": 0.7664394974708557, |
| "learning_rate": 0.00016081081081081083, |
| "loss": 1.1814, |
| "mean_token_accuracy": 0.7306812778115273, |
| "num_tokens": 1618603.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.07917174177831912, |
| "grad_norm": 0.6155670881271362, |
| "learning_rate": 0.00017432432432432432, |
| "loss": 1.1967, |
| "mean_token_accuracy": 0.7284250959753991, |
| "num_tokens": 1750466.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.08526187576126674, |
| "grad_norm": 0.5296258330345154, |
| "learning_rate": 0.00018783783783783784, |
| "loss": 1.0955, |
| "mean_token_accuracy": 0.7472824215888977, |
| "num_tokens": 1887913.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.09135200974421437, |
| "grad_norm": 0.5564976334571838, |
| "learning_rate": 0.00019999998054550544, |
| "loss": 1.118, |
| "mean_token_accuracy": 0.7397311359643937, |
| "num_tokens": 2018579.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.09744214372716199, |
| "grad_norm": 0.5301142930984497, |
| "learning_rate": 0.00019999764601633156, |
| "loss": 1.045, |
| "mean_token_accuracy": 0.7519380420446395, |
| "num_tokens": 2158851.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.10353227771010962, |
| "grad_norm": 0.5949111580848694, |
| "learning_rate": 0.00019999142070388495, |
| "loss": 1.0497, |
| "mean_token_accuracy": 0.7520910769701004, |
| "num_tokens": 2296715.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.10962241169305725, |
| "grad_norm": 0.6169262528419495, |
| "learning_rate": 0.0001999813048772986, |
| "loss": 1.0821, |
| "mean_token_accuracy": 0.7406247839331627, |
| "num_tokens": 2424756.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.11571254567600488, |
| "grad_norm": 0.58912593126297, |
| "learning_rate": 0.00019996729897390057, |
| "loss": 1.0286, |
| "mean_token_accuracy": 0.7527454376220704, |
| "num_tokens": 2559362.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.1218026796589525, |
| "grad_norm": 0.5084304213523865, |
| "learning_rate": 0.00019994940359919483, |
| "loss": 0.992, |
| "mean_token_accuracy": 0.7640391126275062, |
| "num_tokens": 2700231.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.1278928136419001, |
| "grad_norm": 0.5790796279907227, |
| "learning_rate": 0.00019992761952683516, |
| "loss": 1.0146, |
| "mean_token_accuracy": 0.7554366230964661, |
| "num_tokens": 2831324.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.13398294762484775, |
| "grad_norm": 0.5852051377296448, |
| "learning_rate": 0.00019990194769859188, |
| "loss": 0.978, |
| "mean_token_accuracy": 0.7612502485513687, |
| "num_tokens": 2967346.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.14007308160779536, |
| "grad_norm": 0.5102785229682922, |
| "learning_rate": 0.00019987238922431088, |
| "loss": 0.9616, |
| "mean_token_accuracy": 0.7677591517567635, |
| "num_tokens": 3110936.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.146163215590743, |
| "grad_norm": 0.5472669005393982, |
| "learning_rate": 0.00019983894538186576, |
| "loss": 0.9535, |
| "mean_token_accuracy": 0.76737689524889, |
| "num_tokens": 3247496.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.15225334957369063, |
| "grad_norm": 0.5611053109169006, |
| "learning_rate": 0.0001998016176171026, |
| "loss": 0.9577, |
| "mean_token_accuracy": 0.7626092001795769, |
| "num_tokens": 3384178.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.15834348355663824, |
| "grad_norm": 0.54055255651474, |
| "learning_rate": 0.0001997604075437774, |
| "loss": 0.9907, |
| "mean_token_accuracy": 0.7575223430991173, |
| "num_tokens": 3517617.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.16443361753958588, |
| "grad_norm": 0.558316707611084, |
| "learning_rate": 0.0001997153169434864, |
| "loss": 0.944, |
| "mean_token_accuracy": 0.7664194419980049, |
| "num_tokens": 3662878.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.1705237515225335, |
| "grad_norm": 0.49766939878463745, |
| "learning_rate": 0.0001996663477655889, |
| "loss": 0.9106, |
| "mean_token_accuracy": 0.7760038167238236, |
| "num_tokens": 3807411.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.17661388550548113, |
| "grad_norm": 0.4953667223453522, |
| "learning_rate": 0.0001996135021271232, |
| "loss": 0.9687, |
| "mean_token_accuracy": 0.7605679705739021, |
| "num_tokens": 3936840.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.18270401948842874, |
| "grad_norm": 0.5447947978973389, |
| "learning_rate": 0.00019955678231271484, |
| "loss": 0.9625, |
| "mean_token_accuracy": 0.7603292793035508, |
| "num_tokens": 4067826.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.18879415347137637, |
| "grad_norm": 0.4665842056274414, |
| "learning_rate": 0.00019949619077447807, |
| "loss": 0.9372, |
| "mean_token_accuracy": 0.7676101759076118, |
| "num_tokens": 4205887.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.19488428745432398, |
| "grad_norm": 0.515690267086029, |
| "learning_rate": 0.00019943173013190965, |
| "loss": 0.923, |
| "mean_token_accuracy": 0.7708473294973374, |
| "num_tokens": 4342894.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.20097442143727162, |
| "grad_norm": 0.5831382274627686, |
| "learning_rate": 0.00019936340317177565, |
| "loss": 0.9203, |
| "mean_token_accuracy": 0.7708552837371826, |
| "num_tokens": 4477651.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.20706455542021923, |
| "grad_norm": 0.6162773966789246, |
| "learning_rate": 0.0001992912128479911, |
| "loss": 0.916, |
| "mean_token_accuracy": 0.7702088996767997, |
| "num_tokens": 4610746.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.21315468940316687, |
| "grad_norm": 0.5172462463378906, |
| "learning_rate": 0.00019921516228149207, |
| "loss": 0.8942, |
| "mean_token_accuracy": 0.7741821393370628, |
| "num_tokens": 4751175.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2192448233861145, |
| "grad_norm": 0.5890468955039978, |
| "learning_rate": 0.0001991352547601009, |
| "loss": 0.9229, |
| "mean_token_accuracy": 0.7691043332219124, |
| "num_tokens": 4882328.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.22533495736906212, |
| "grad_norm": 0.5522404909133911, |
| "learning_rate": 0.00019905149373838408, |
| "loss": 0.9294, |
| "mean_token_accuracy": 0.7646071568131447, |
| "num_tokens": 5012181.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.23142509135200975, |
| "grad_norm": 0.5349445939064026, |
| "learning_rate": 0.0001989638828375028, |
| "loss": 0.8797, |
| "mean_token_accuracy": 0.7771721839904785, |
| "num_tokens": 5151133.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.23751522533495736, |
| "grad_norm": 0.531052827835083, |
| "learning_rate": 0.00019887242584505635, |
| "loss": 0.9221, |
| "mean_token_accuracy": 0.7678465083241462, |
| "num_tokens": 5279790.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.243605359317905, |
| "grad_norm": 0.5126324892044067, |
| "learning_rate": 0.00019877712671491864, |
| "loss": 0.8862, |
| "mean_token_accuracy": 0.7739894777536392, |
| "num_tokens": 5412390.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2496954933008526, |
| "grad_norm": 0.5111438632011414, |
| "learning_rate": 0.00019867798956706693, |
| "loss": 0.9005, |
| "mean_token_accuracy": 0.7721902653574944, |
| "num_tokens": 5545801.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2557856272838002, |
| "grad_norm": 0.5488138794898987, |
| "learning_rate": 0.00019857501868740402, |
| "loss": 0.8988, |
| "mean_token_accuracy": 0.7690282896161079, |
| "num_tokens": 5673758.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.2618757612667479, |
| "grad_norm": 0.5497994422912598, |
| "learning_rate": 0.0001984682185275727, |
| "loss": 0.8802, |
| "mean_token_accuracy": 0.7780183687806129, |
| "num_tokens": 5813158.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.2679658952496955, |
| "grad_norm": 0.5478431582450867, |
| "learning_rate": 0.0001983575937047635, |
| "loss": 0.865, |
| "mean_token_accuracy": 0.7785944610834121, |
| "num_tokens": 5947367.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.2740560292326431, |
| "grad_norm": 0.5188766717910767, |
| "learning_rate": 0.00019824314900151487, |
| "loss": 0.8798, |
| "mean_token_accuracy": 0.7752803862094879, |
| "num_tokens": 6081060.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2801461632155907, |
| "grad_norm": 0.530222475528717, |
| "learning_rate": 0.00019812488936550666, |
| "loss": 0.8628, |
| "mean_token_accuracy": 0.7801630645990372, |
| "num_tokens": 6217834.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.2862362971985384, |
| "grad_norm": 0.5987964868545532, |
| "learning_rate": 0.00019800281990934614, |
| "loss": 0.8775, |
| "mean_token_accuracy": 0.7760324433445931, |
| "num_tokens": 6350451.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.292326431181486, |
| "grad_norm": 0.5468559265136719, |
| "learning_rate": 0.0001978769459103468, |
| "loss": 0.8721, |
| "mean_token_accuracy": 0.7794204503297806, |
| "num_tokens": 6484738.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.2984165651644336, |
| "grad_norm": 0.5541098117828369, |
| "learning_rate": 0.0001977472728103005, |
| "loss": 0.8785, |
| "mean_token_accuracy": 0.7767582029104233, |
| "num_tokens": 6619313.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.30450669914738127, |
| "grad_norm": 0.5134281516075134, |
| "learning_rate": 0.0001976138062152419, |
| "loss": 0.8717, |
| "mean_token_accuracy": 0.7752724394202233, |
| "num_tokens": 6753195.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3105968331303289, |
| "grad_norm": 0.49164435267448425, |
| "learning_rate": 0.00019747655189520633, |
| "loss": 0.8757, |
| "mean_token_accuracy": 0.7768464118242264, |
| "num_tokens": 6890448.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.3166869671132765, |
| "grad_norm": 0.5899345278739929, |
| "learning_rate": 0.00019733551578398023, |
| "loss": 0.8322, |
| "mean_token_accuracy": 0.7859320402145386, |
| "num_tokens": 7027488.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.3227771010962241, |
| "grad_norm": 0.6552841663360596, |
| "learning_rate": 0.0001971907039788447, |
| "loss": 0.861, |
| "mean_token_accuracy": 0.7770532324910164, |
| "num_tokens": 7161184.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.32886723507917176, |
| "grad_norm": 0.5038822889328003, |
| "learning_rate": 0.0001970421227403117, |
| "loss": 0.8825, |
| "mean_token_accuracy": 0.775890800356865, |
| "num_tokens": 7294399.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.33495736906211937, |
| "grad_norm": 0.5094267129898071, |
| "learning_rate": 0.00019688977849185378, |
| "loss": 0.8598, |
| "mean_token_accuracy": 0.7817838475108146, |
| "num_tokens": 7427183.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.341047503045067, |
| "grad_norm": 0.5282809138298035, |
| "learning_rate": 0.00019673367781962594, |
| "loss": 0.8463, |
| "mean_token_accuracy": 0.7812959104776382, |
| "num_tokens": 7561734.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3471376370280146, |
| "grad_norm": 0.45355409383773804, |
| "learning_rate": 0.00019657382747218123, |
| "loss": 0.8207, |
| "mean_token_accuracy": 0.7888262197375298, |
| "num_tokens": 7706228.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.35322777101096225, |
| "grad_norm": 0.5162333846092224, |
| "learning_rate": 0.00019641023436017883, |
| "loss": 0.8235, |
| "mean_token_accuracy": 0.7868947923183441, |
| "num_tokens": 7846684.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.35931790499390986, |
| "grad_norm": 0.5194632411003113, |
| "learning_rate": 0.00019624290555608526, |
| "loss": 0.8129, |
| "mean_token_accuracy": 0.7884069249033928, |
| "num_tokens": 7986811.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3654080389768575, |
| "grad_norm": 0.5494846701622009, |
| "learning_rate": 0.00019607184829386882, |
| "loss": 0.8084, |
| "mean_token_accuracy": 0.7874000474810601, |
| "num_tokens": 8124538.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.37149817295980514, |
| "grad_norm": 0.5368776917457581, |
| "learning_rate": 0.0001958970699686866, |
| "loss": 0.8225, |
| "mean_token_accuracy": 0.783010233938694, |
| "num_tokens": 8260529.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.37758830694275275, |
| "grad_norm": 0.6229024529457092, |
| "learning_rate": 0.00019571857813656496, |
| "loss": 0.8786, |
| "mean_token_accuracy": 0.7753148928284646, |
| "num_tokens": 8389042.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.38367844092570036, |
| "grad_norm": 0.5601000785827637, |
| "learning_rate": 0.00019553638051407279, |
| "loss": 0.8909, |
| "mean_token_accuracy": 0.7745720192790031, |
| "num_tokens": 8513603.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.38976857490864797, |
| "grad_norm": 0.438970685005188, |
| "learning_rate": 0.0001953504849779879, |
| "loss": 0.8085, |
| "mean_token_accuracy": 0.7871840804815292, |
| "num_tokens": 8652970.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.39585870889159563, |
| "grad_norm": 0.5505132079124451, |
| "learning_rate": 0.00019516089956495648, |
| "loss": 0.8102, |
| "mean_token_accuracy": 0.7869585514068603, |
| "num_tokens": 8792103.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.40194884287454324, |
| "grad_norm": 0.5447221398353577, |
| "learning_rate": 0.00019496763247114581, |
| "loss": 0.8336, |
| "mean_token_accuracy": 0.7816034242510795, |
| "num_tokens": 8926853.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.40803897685749085, |
| "grad_norm": 0.4652746915817261, |
| "learning_rate": 0.00019477069205188965, |
| "loss": 0.8383, |
| "mean_token_accuracy": 0.7826304718852043, |
| "num_tokens": 9059592.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.41412911084043846, |
| "grad_norm": 0.42363590002059937, |
| "learning_rate": 0.00019457008682132726, |
| "loss": 0.847, |
| "mean_token_accuracy": 0.7810002073645592, |
| "num_tokens": 9193062.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.42021924482338613, |
| "grad_norm": 0.5209478735923767, |
| "learning_rate": 0.00019436582545203518, |
| "loss": 0.8766, |
| "mean_token_accuracy": 0.7733785718679428, |
| "num_tokens": 9315805.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.42630937880633374, |
| "grad_norm": 0.5176642537117004, |
| "learning_rate": 0.00019415791677465237, |
| "loss": 0.8155, |
| "mean_token_accuracy": 0.7869213685393334, |
| "num_tokens": 9448863.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.43239951278928135, |
| "grad_norm": 0.4531058371067047, |
| "learning_rate": 0.00019394636977749843, |
| "loss": 0.8096, |
| "mean_token_accuracy": 0.7903949975967407, |
| "num_tokens": 9589382.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.438489646772229, |
| "grad_norm": 0.5651549100875854, |
| "learning_rate": 0.000193731193606185, |
| "loss": 0.8263, |
| "mean_token_accuracy": 0.7823062822222709, |
| "num_tokens": 9723562.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4445797807551766, |
| "grad_norm": 0.5377989411354065, |
| "learning_rate": 0.00019351239756322031, |
| "loss": 0.7993, |
| "mean_token_accuracy": 0.7908329650759697, |
| "num_tokens": 9859255.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.45066991473812423, |
| "grad_norm": 0.5420868396759033, |
| "learning_rate": 0.00019328999110760722, |
| "loss": 0.8461, |
| "mean_token_accuracy": 0.7780480548739434, |
| "num_tokens": 9981578.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.45676004872107184, |
| "grad_norm": 0.4889216125011444, |
| "learning_rate": 0.000193063983854434, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7959530428051949, |
| "num_tokens": 10122922.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.4628501827040195, |
| "grad_norm": 0.5044087767601013, |
| "learning_rate": 0.00019283438557445893, |
| "loss": 0.824, |
| "mean_token_accuracy": 0.7845935523509979, |
| "num_tokens": 10252854.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.4689403166869671, |
| "grad_norm": 0.5286466479301453, |
| "learning_rate": 0.00019260120619368773, |
| "loss": 0.815, |
| "mean_token_accuracy": 0.7850656941533088, |
| "num_tokens": 10385075.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.47503045066991473, |
| "grad_norm": 0.5441628694534302, |
| "learning_rate": 0.00019236445579294437, |
| "loss": 0.8048, |
| "mean_token_accuracy": 0.7876680314540863, |
| "num_tokens": 10520011.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.48112058465286234, |
| "grad_norm": 0.49002447724342346, |
| "learning_rate": 0.0001921241446074355, |
| "loss": 0.8059, |
| "mean_token_accuracy": 0.7898563235998154, |
| "num_tokens": 10652488.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.48721071863581, |
| "grad_norm": 0.4479144811630249, |
| "learning_rate": 0.0001918802830263077, |
| "loss": 0.7913, |
| "mean_token_accuracy": 0.7928732186555862, |
| "num_tokens": 10785974.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.4933008526187576, |
| "grad_norm": 0.5007497668266296, |
| "learning_rate": 0.00019163288159219853, |
| "loss": 0.8083, |
| "mean_token_accuracy": 0.7893043681979179, |
| "num_tokens": 10920950.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.4993909866017052, |
| "grad_norm": 0.5289483070373535, |
| "learning_rate": 0.00019138195100078064, |
| "loss": 0.8033, |
| "mean_token_accuracy": 0.7864485770463944, |
| "num_tokens": 11056380.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5054811205846529, |
| "grad_norm": 0.5604159832000732, |
| "learning_rate": 0.0001911275021002994, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7946401730179786, |
| "num_tokens": 11196074.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5115712545676004, |
| "grad_norm": 0.43645399808883667, |
| "learning_rate": 0.00019086954589110397, |
| "loss": 0.7724, |
| "mean_token_accuracy": 0.7990294560790062, |
| "num_tokens": 11337990.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5176613885505481, |
| "grad_norm": 0.43992146849632263, |
| "learning_rate": 0.0001906080935251716, |
| "loss": 0.7612, |
| "mean_token_accuracy": 0.7999786615371705, |
| "num_tokens": 11481565.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5237515225334958, |
| "grad_norm": 0.5595120191574097, |
| "learning_rate": 0.0001903431563056256, |
| "loss": 0.8266, |
| "mean_token_accuracy": 0.7859750911593437, |
| "num_tokens": 11611714.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5298416565164433, |
| "grad_norm": 0.5001987218856812, |
| "learning_rate": 0.0001900747456862467, |
| "loss": 0.8506, |
| "mean_token_accuracy": 0.779585388302803, |
| "num_tokens": 11736573.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.535931790499391, |
| "grad_norm": 0.430147647857666, |
| "learning_rate": 0.00018980287327097784, |
| "loss": 0.7707, |
| "mean_token_accuracy": 0.795211361348629, |
| "num_tokens": 11876859.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5420219244823387, |
| "grad_norm": 0.5346289873123169, |
| "learning_rate": 0.00018952755081342245, |
| "loss": 0.8057, |
| "mean_token_accuracy": 0.7871127843856811, |
| "num_tokens": 12007654.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.5481120584652862, |
| "grad_norm": 0.46072253584861755, |
| "learning_rate": 0.00018924879021633653, |
| "loss": 0.7924, |
| "mean_token_accuracy": 0.7913773030042648, |
| "num_tokens": 12140520.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5542021924482339, |
| "grad_norm": 0.4803653955459595, |
| "learning_rate": 0.00018896660353111375, |
| "loss": 0.8398, |
| "mean_token_accuracy": 0.7807079553604126, |
| "num_tokens": 12267219.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.5602923264311814, |
| "grad_norm": 0.5219636559486389, |
| "learning_rate": 0.0001886810029572647, |
| "loss": 0.7612, |
| "mean_token_accuracy": 0.7993015512824059, |
| "num_tokens": 12404646.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.5663824604141291, |
| "grad_norm": 0.501483142375946, |
| "learning_rate": 0.00018839200084188936, |
| "loss": 0.7953, |
| "mean_token_accuracy": 0.787814213335514, |
| "num_tokens": 12538219.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.5724725943970768, |
| "grad_norm": 0.47334522008895874, |
| "learning_rate": 0.00018809960967914346, |
| "loss": 0.789, |
| "mean_token_accuracy": 0.7928574904799461, |
| "num_tokens": 12673805.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.5785627283800243, |
| "grad_norm": 0.5057492852210999, |
| "learning_rate": 0.00018780384210969806, |
| "loss": 0.7746, |
| "mean_token_accuracy": 0.7947553545236588, |
| "num_tokens": 12811727.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.584652862362972, |
| "grad_norm": 0.5179910659790039, |
| "learning_rate": 0.00018750471092019325, |
| "loss": 0.7962, |
| "mean_token_accuracy": 0.7905686929821968, |
| "num_tokens": 12947641.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.5907429963459196, |
| "grad_norm": 0.45797088742256165, |
| "learning_rate": 0.00018720222904268543, |
| "loss": 0.7678, |
| "mean_token_accuracy": 0.7969774708151818, |
| "num_tokens": 13083869.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.5968331303288672, |
| "grad_norm": 0.48360612988471985, |
| "learning_rate": 0.00018689640955408803, |
| "loss": 0.7996, |
| "mean_token_accuracy": 0.7885591968894005, |
| "num_tokens": 13211807.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6029232643118149, |
| "grad_norm": 0.4378497004508972, |
| "learning_rate": 0.00018658726567560635, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7969291344285011, |
| "num_tokens": 13351856.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6090133982947625, |
| "grad_norm": 0.4857536852359772, |
| "learning_rate": 0.00018627481077216577, |
| "loss": 0.7786, |
| "mean_token_accuracy": 0.7914443418383599, |
| "num_tokens": 13486443.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6151035322777101, |
| "grad_norm": 0.5233064293861389, |
| "learning_rate": 0.0001859590583518343, |
| "loss": 0.8241, |
| "mean_token_accuracy": 0.7811850637197495, |
| "num_tokens": 13612035.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6211936662606578, |
| "grad_norm": 0.5328738689422607, |
| "learning_rate": 0.00018564002206523816, |
| "loss": 0.7502, |
| "mean_token_accuracy": 0.7993430674076081, |
| "num_tokens": 13756509.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.6272838002436053, |
| "grad_norm": 0.47962310910224915, |
| "learning_rate": 0.000185317715704972, |
| "loss": 0.7984, |
| "mean_token_accuracy": 0.7864531084895134, |
| "num_tokens": 13883033.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.633373934226553, |
| "grad_norm": 0.5685893893241882, |
| "learning_rate": 0.0001849921532050024, |
| "loss": 0.7869, |
| "mean_token_accuracy": 0.7909937381744385, |
| "num_tokens": 14015234.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.6394640682095006, |
| "grad_norm": 0.49146631360054016, |
| "learning_rate": 0.00018466334864006566, |
| "loss": 0.7952, |
| "mean_token_accuracy": 0.7878949210047722, |
| "num_tokens": 14149319.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6455542021924482, |
| "grad_norm": 0.5556225776672363, |
| "learning_rate": 0.0001843313162250591, |
| "loss": 0.7524, |
| "mean_token_accuracy": 0.7994373366236687, |
| "num_tokens": 14286868.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.6516443361753959, |
| "grad_norm": 0.511379063129425, |
| "learning_rate": 0.00018399607031442666, |
| "loss": 0.7929, |
| "mean_token_accuracy": 0.7921562284231186, |
| "num_tokens": 14418354.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.6577344701583435, |
| "grad_norm": 0.5019840598106384, |
| "learning_rate": 0.00018365762540153836, |
| "loss": 0.758, |
| "mean_token_accuracy": 0.7989353060722351, |
| "num_tokens": 14553174.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.6638246041412911, |
| "grad_norm": 0.6032467484474182, |
| "learning_rate": 0.00018331599611806366, |
| "loss": 0.7888, |
| "mean_token_accuracy": 0.7903819754719734, |
| "num_tokens": 14681393.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.6699147381242387, |
| "grad_norm": 0.5369830131530762, |
| "learning_rate": 0.00018297119723333877, |
| "loss": 0.765, |
| "mean_token_accuracy": 0.7950262635946274, |
| "num_tokens": 14814565.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6760048721071864, |
| "grad_norm": 0.5289803743362427, |
| "learning_rate": 0.00018262324365372846, |
| "loss": 0.7496, |
| "mean_token_accuracy": 0.8032818242907525, |
| "num_tokens": 14954351.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.682095006090134, |
| "grad_norm": 0.5440439581871033, |
| "learning_rate": 0.0001822721504219814, |
| "loss": 0.7432, |
| "mean_token_accuracy": 0.799126236140728, |
| "num_tokens": 15094879.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.6881851400730816, |
| "grad_norm": 0.46225935220718384, |
| "learning_rate": 0.00018191793271657978, |
| "loss": 0.7513, |
| "mean_token_accuracy": 0.8022688791155815, |
| "num_tokens": 15234906.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.6942752740560292, |
| "grad_norm": 0.5592020750045776, |
| "learning_rate": 0.0001815606058510833, |
| "loss": 0.7583, |
| "mean_token_accuracy": 0.7984497547149658, |
| "num_tokens": 15373526.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.7003654080389768, |
| "grad_norm": 0.525090217590332, |
| "learning_rate": 0.00018120018527346702, |
| "loss": 0.7254, |
| "mean_token_accuracy": 0.8070619881153107, |
| "num_tokens": 15516264.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7064555420219245, |
| "grad_norm": 0.5380759239196777, |
| "learning_rate": 0.00018083668656545355, |
| "loss": 0.8041, |
| "mean_token_accuracy": 0.7866759791970253, |
| "num_tokens": 15640444.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7125456760048721, |
| "grad_norm": 0.47815701365470886, |
| "learning_rate": 0.00018047012544183938, |
| "loss": 0.7604, |
| "mean_token_accuracy": 0.796156468987465, |
| "num_tokens": 15778070.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.7186358099878197, |
| "grad_norm": 0.5380450487136841, |
| "learning_rate": 0.00018010051774981553, |
| "loss": 0.8135, |
| "mean_token_accuracy": 0.7842124432325364, |
| "num_tokens": 15899739.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.7247259439707674, |
| "grad_norm": 0.5047502517700195, |
| "learning_rate": 0.00017972787946828246, |
| "loss": 0.7642, |
| "mean_token_accuracy": 0.7989341139793396, |
| "num_tokens": 16035805.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.730816077953715, |
| "grad_norm": 0.5440967679023743, |
| "learning_rate": 0.00017935222670715918, |
| "loss": 0.735, |
| "mean_token_accuracy": 0.8048294603824615, |
| "num_tokens": 16172541.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7369062119366626, |
| "grad_norm": 0.4766077399253845, |
| "learning_rate": 0.000178973575706687, |
| "loss": 0.805, |
| "mean_token_accuracy": 0.7871790423989296, |
| "num_tokens": 16296988.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.7429963459196103, |
| "grad_norm": 0.4153214991092682, |
| "learning_rate": 0.00017859194283672704, |
| "loss": 0.7635, |
| "mean_token_accuracy": 0.7964595645666123, |
| "num_tokens": 16432022.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.7490864799025578, |
| "grad_norm": 0.4698518216609955, |
| "learning_rate": 0.00017820734459605302, |
| "loss": 0.7397, |
| "mean_token_accuracy": 0.8046972885727882, |
| "num_tokens": 16572880.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.7551766138855055, |
| "grad_norm": 0.46101540327072144, |
| "learning_rate": 0.00017781979761163756, |
| "loss": 0.7174, |
| "mean_token_accuracy": 0.8066875368356705, |
| "num_tokens": 16714419.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.761266747868453, |
| "grad_norm": 0.5313341021537781, |
| "learning_rate": 0.00017742931863793358, |
| "loss": 0.7797, |
| "mean_token_accuracy": 0.7911526098847389, |
| "num_tokens": 16838285.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.7673568818514007, |
| "grad_norm": 0.4627362787723541, |
| "learning_rate": 0.00017703592455614998, |
| "loss": 0.7626, |
| "mean_token_accuracy": 0.7970306649804115, |
| "num_tokens": 16976065.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.7734470158343484, |
| "grad_norm": 0.5429073572158813, |
| "learning_rate": 0.00017663963237352177, |
| "loss": 0.7398, |
| "mean_token_accuracy": 0.8005403786897659, |
| "num_tokens": 17112901.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.7795371498172959, |
| "grad_norm": 0.6781270503997803, |
| "learning_rate": 0.00017624045922257471, |
| "loss": 0.7607, |
| "mean_token_accuracy": 0.7946217939257622, |
| "num_tokens": 17245480.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.7856272838002436, |
| "grad_norm": 0.5227305293083191, |
| "learning_rate": 0.00017583842236038483, |
| "loss": 0.7217, |
| "mean_token_accuracy": 0.8064659267663956, |
| "num_tokens": 17387171.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.7917174177831913, |
| "grad_norm": 0.49253156781196594, |
| "learning_rate": 0.0001754335391678323, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7960015773773194, |
| "num_tokens": 17521164.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7978075517661388, |
| "grad_norm": 0.5103631615638733, |
| "learning_rate": 0.00017502582714884997, |
| "loss": 0.7435, |
| "mean_token_accuracy": 0.7995276898145676, |
| "num_tokens": 17657818.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.8038976857490865, |
| "grad_norm": 0.5531247854232788, |
| "learning_rate": 0.00017461530392966665, |
| "loss": 0.7986, |
| "mean_token_accuracy": 0.7892467245459557, |
| "num_tokens": 17784361.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.8099878197320342, |
| "grad_norm": 0.4574586749076843, |
| "learning_rate": 0.00017420198725804517, |
| "loss": 0.6889, |
| "mean_token_accuracy": 0.8135112956166267, |
| "num_tokens": 17929664.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.8160779537149817, |
| "grad_norm": 0.4734383225440979, |
| "learning_rate": 0.00017378589500251498, |
| "loss": 0.7308, |
| "mean_token_accuracy": 0.8029947131872177, |
| "num_tokens": 18071182.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.8221680876979294, |
| "grad_norm": 0.5192279815673828, |
| "learning_rate": 0.00017336704515159986, |
| "loss": 0.7444, |
| "mean_token_accuracy": 0.8012512847781181, |
| "num_tokens": 18211136.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.8282582216808769, |
| "grad_norm": 0.5378620624542236, |
| "learning_rate": 0.00017294545581303996, |
| "loss": 0.7459, |
| "mean_token_accuracy": 0.7981989249587059, |
| "num_tokens": 18340645.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.8343483556638246, |
| "grad_norm": 0.4879571497440338, |
| "learning_rate": 0.00017252114521300918, |
| "loss": 0.7877, |
| "mean_token_accuracy": 0.7891893342137337, |
| "num_tokens": 18465733.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.8404384896467723, |
| "grad_norm": 0.5297388434410095, |
| "learning_rate": 0.00017209413169532717, |
| "loss": 0.7586, |
| "mean_token_accuracy": 0.797142505645752, |
| "num_tokens": 18598979.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.8465286236297198, |
| "grad_norm": 0.5308396220207214, |
| "learning_rate": 0.00017166443372066618, |
| "loss": 0.7387, |
| "mean_token_accuracy": 0.80123979896307, |
| "num_tokens": 18735919.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.8526187576126675, |
| "grad_norm": 0.49988579750061035, |
| "learning_rate": 0.0001712320698657532, |
| "loss": 0.7425, |
| "mean_token_accuracy": 0.7996803268790245, |
| "num_tokens": 18870877.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8587088915956151, |
| "grad_norm": 0.5971361994743347, |
| "learning_rate": 0.0001707970588225665, |
| "loss": 0.7691, |
| "mean_token_accuracy": 0.7922965154051781, |
| "num_tokens": 19000943.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.8647990255785627, |
| "grad_norm": 0.5141698718070984, |
| "learning_rate": 0.00017035941939752802, |
| "loss": 0.7203, |
| "mean_token_accuracy": 0.8036229625344277, |
| "num_tokens": 19135039.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.8708891595615104, |
| "grad_norm": 0.4647749066352844, |
| "learning_rate": 0.0001699191705106898, |
| "loss": 0.7136, |
| "mean_token_accuracy": 0.8064323276281357, |
| "num_tokens": 19274069.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.876979293544458, |
| "grad_norm": 0.5511934161186218, |
| "learning_rate": 0.00016947633119491633, |
| "loss": 0.7455, |
| "mean_token_accuracy": 0.7985599264502525, |
| "num_tokens": 19409679.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.8830694275274056, |
| "grad_norm": 0.4936945140361786, |
| "learning_rate": 0.00016903092059506182, |
| "loss": 0.7087, |
| "mean_token_accuracy": 0.806523185968399, |
| "num_tokens": 19547419.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.8891595615103532, |
| "grad_norm": 0.5227787494659424, |
| "learning_rate": 0.00016858295796714213, |
| "loss": 0.7739, |
| "mean_token_accuracy": 0.7941467314958572, |
| "num_tokens": 19674455.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.8952496954933008, |
| "grad_norm": 0.5046219825744629, |
| "learning_rate": 0.00016813246267750282, |
| "loss": 0.7361, |
| "mean_token_accuracy": 0.8008369222283364, |
| "num_tokens": 19809861.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.9013398294762485, |
| "grad_norm": 0.4827081263065338, |
| "learning_rate": 0.00016767945420198142, |
| "loss": 0.7464, |
| "mean_token_accuracy": 0.7986427888274192, |
| "num_tokens": 19940696.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.9074299634591961, |
| "grad_norm": 0.4970889687538147, |
| "learning_rate": 0.00016722395212506567, |
| "loss": 0.7528, |
| "mean_token_accuracy": 0.7965970665216446, |
| "num_tokens": 20070686.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.9135200974421437, |
| "grad_norm": 0.44478070735931396, |
| "learning_rate": 0.00016676597613904693, |
| "loss": 0.7185, |
| "mean_token_accuracy": 0.8081388726830483, |
| "num_tokens": 20210260.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.9196102314250914, |
| "grad_norm": 0.506136417388916, |
| "learning_rate": 0.00016630554604316866, |
| "loss": 0.7395, |
| "mean_token_accuracy": 0.8003876298666001, |
| "num_tokens": 20346235.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.925700365408039, |
| "grad_norm": 0.500946044921875, |
| "learning_rate": 0.00016584268174277053, |
| "loss": 0.6889, |
| "mean_token_accuracy": 0.8124501362442971, |
| "num_tokens": 20481248.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.9317904993909866, |
| "grad_norm": 0.48528990149497986, |
| "learning_rate": 0.00016537740324842795, |
| "loss": 0.7227, |
| "mean_token_accuracy": 0.8041250064969063, |
| "num_tokens": 20613531.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.9378806333739342, |
| "grad_norm": 0.5070951581001282, |
| "learning_rate": 0.00016490973067508674, |
| "loss": 0.7091, |
| "mean_token_accuracy": 0.8082544595003128, |
| "num_tokens": 20750784.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.9439707673568819, |
| "grad_norm": 0.5583120584487915, |
| "learning_rate": 0.0001644396842411939, |
| "loss": 0.7405, |
| "mean_token_accuracy": 0.7992320343852043, |
| "num_tokens": 20883646.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.9500609013398295, |
| "grad_norm": 0.5099635124206543, |
| "learning_rate": 0.00016396728426782312, |
| "loss": 0.7103, |
| "mean_token_accuracy": 0.8091216519474983, |
| "num_tokens": 21025143.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.9561510353227771, |
| "grad_norm": 0.5777808427810669, |
| "learning_rate": 0.00016349255117779652, |
| "loss": 0.7245, |
| "mean_token_accuracy": 0.8023119494318962, |
| "num_tokens": 21160014.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.9622411693057247, |
| "grad_norm": 0.5206162333488464, |
| "learning_rate": 0.0001630155054948016, |
| "loss": 0.7185, |
| "mean_token_accuracy": 0.8069521963596344, |
| "num_tokens": 21299094.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.9683313032886723, |
| "grad_norm": 0.5763202905654907, |
| "learning_rate": 0.00016253616784250415, |
| "loss": 0.7677, |
| "mean_token_accuracy": 0.7927820891141891, |
| "num_tokens": 21429252.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.97442143727162, |
| "grad_norm": 0.5068426728248596, |
| "learning_rate": 0.00016205455894365627, |
| "loss": 0.7673, |
| "mean_token_accuracy": 0.794715291261673, |
| "num_tokens": 21556200.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.9805115712545676, |
| "grad_norm": 0.46094459295272827, |
| "learning_rate": 0.0001615706996192009, |
| "loss": 0.771, |
| "mean_token_accuracy": 0.7921045809984207, |
| "num_tokens": 21681524.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.9866017052375152, |
| "grad_norm": 0.5063546299934387, |
| "learning_rate": 0.00016108461078737148, |
| "loss": 0.7383, |
| "mean_token_accuracy": 0.800596435368061, |
| "num_tokens": 21814109.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.9926918392204629, |
| "grad_norm": 0.5418652296066284, |
| "learning_rate": 0.0001605963134627876, |
| "loss": 0.7431, |
| "mean_token_accuracy": 0.7994748756289483, |
| "num_tokens": 21947346.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.9987819732034104, |
| "grad_norm": 0.6195595264434814, |
| "learning_rate": 0.0001601058287555465, |
| "loss": 0.7294, |
| "mean_token_accuracy": 0.8030684441328049, |
| "num_tokens": 22081340.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.004872107186358, |
| "grad_norm": 0.5930359363555908, |
| "learning_rate": 0.00015961317787031054, |
| "loss": 0.7387, |
| "mean_token_accuracy": 0.8013696864247322, |
| "num_tokens": 22206441.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.0109622411693058, |
| "grad_norm": 0.4926474094390869, |
| "learning_rate": 0.00015911838210539038, |
| "loss": 0.6743, |
| "mean_token_accuracy": 0.8141208037734031, |
| "num_tokens": 22344898.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.0170523751522533, |
| "grad_norm": 0.5331000685691833, |
| "learning_rate": 0.0001586214628518242, |
| "loss": 0.7033, |
| "mean_token_accuracy": 0.807385990023613, |
| "num_tokens": 22483135.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.0231425091352009, |
| "grad_norm": 0.5267267227172852, |
| "learning_rate": 0.0001581224415924531, |
| "loss": 0.6717, |
| "mean_token_accuracy": 0.8178876608610153, |
| "num_tokens": 22617934.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.0292326431181487, |
| "grad_norm": 0.5864041447639465, |
| "learning_rate": 0.00015762133990099205, |
| "loss": 0.7421, |
| "mean_token_accuracy": 0.7981289237737655, |
| "num_tokens": 22745190.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.0353227771010962, |
| "grad_norm": 0.45681944489479065, |
| "learning_rate": 0.00015711817944109738, |
| "loss": 0.6646, |
| "mean_token_accuracy": 0.8146520599722862, |
| "num_tokens": 22887536.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.0414129110840438, |
| "grad_norm": 0.5522484183311462, |
| "learning_rate": 0.00015661298196543042, |
| "loss": 0.6889, |
| "mean_token_accuracy": 0.8100781336426734, |
| "num_tokens": 23017586.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.0475030450669915, |
| "grad_norm": 0.5221629738807678, |
| "learning_rate": 0.00015610576931471658, |
| "loss": 0.6939, |
| "mean_token_accuracy": 0.8114214852452278, |
| "num_tokens": 23151737.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.053593179049939, |
| "grad_norm": 0.5104020833969116, |
| "learning_rate": 0.00015559656341680164, |
| "loss": 0.716, |
| "mean_token_accuracy": 0.8063826873898506, |
| "num_tokens": 23280778.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.0596833130328867, |
| "grad_norm": 0.5163984298706055, |
| "learning_rate": 0.00015508538628570352, |
| "loss": 0.7188, |
| "mean_token_accuracy": 0.802527217566967, |
| "num_tokens": 23410327.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.0657734470158344, |
| "grad_norm": 0.5188373327255249, |
| "learning_rate": 0.00015457226002066058, |
| "loss": 0.6791, |
| "mean_token_accuracy": 0.8127639785408973, |
| "num_tokens": 23548616.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.071863580998782, |
| "grad_norm": 0.5983869433403015, |
| "learning_rate": 0.00015405720680517618, |
| "loss": 0.6869, |
| "mean_token_accuracy": 0.8110290810465812, |
| "num_tokens": 23682446.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.0779537149817295, |
| "grad_norm": 0.5919123291969299, |
| "learning_rate": 0.00015354024890605985, |
| "loss": 0.7419, |
| "mean_token_accuracy": 0.7984233900904656, |
| "num_tokens": 23806352.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.0840438489646773, |
| "grad_norm": 0.4900698661804199, |
| "learning_rate": 0.0001530214086724644, |
| "loss": 0.6781, |
| "mean_token_accuracy": 0.8152358055114746, |
| "num_tokens": 23942964.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.0901339829476249, |
| "grad_norm": 0.5409672856330872, |
| "learning_rate": 0.00015250070853491986, |
| "loss": 0.7157, |
| "mean_token_accuracy": 0.803682966530323, |
| "num_tokens": 24070937.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.0962241169305724, |
| "grad_norm": 0.5581572651863098, |
| "learning_rate": 0.0001519781710043638, |
| "loss": 0.7261, |
| "mean_token_accuracy": 0.8027503877878189, |
| "num_tokens": 24200686.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.1023142509135202, |
| "grad_norm": 0.503963053226471, |
| "learning_rate": 0.0001514538186711679, |
| "loss": 0.7125, |
| "mean_token_accuracy": 0.8042754918336869, |
| "num_tokens": 24329983.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.1084043848964678, |
| "grad_norm": 0.6159723997116089, |
| "learning_rate": 0.00015092767420416168, |
| "loss": 0.6873, |
| "mean_token_accuracy": 0.8115814313292503, |
| "num_tokens": 24465292.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.1144945188794153, |
| "grad_norm": 0.518172562122345, |
| "learning_rate": 0.00015039976034965214, |
| "loss": 0.6805, |
| "mean_token_accuracy": 0.8113815248012543, |
| "num_tokens": 24599980.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.1205846528623629, |
| "grad_norm": 0.5381601452827454, |
| "learning_rate": 0.0001498700999304407, |
| "loss": 0.6542, |
| "mean_token_accuracy": 0.8188014090061188, |
| "num_tokens": 24746703.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.1266747868453106, |
| "grad_norm": 0.5001223683357239, |
| "learning_rate": 0.00014933871584483615, |
| "loss": 0.7255, |
| "mean_token_accuracy": 0.8022593036293983, |
| "num_tokens": 24877604.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.1327649208282582, |
| "grad_norm": 0.5812251567840576, |
| "learning_rate": 0.00014880563106566512, |
| "loss": 0.6638, |
| "mean_token_accuracy": 0.8161928996443748, |
| "num_tokens": 25023049.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.1388550548112057, |
| "grad_norm": 0.5384249091148376, |
| "learning_rate": 0.0001482708686392786, |
| "loss": 0.6623, |
| "mean_token_accuracy": 0.8167443484067917, |
| "num_tokens": 25162124.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.1449451887941535, |
| "grad_norm": 0.5310192108154297, |
| "learning_rate": 0.00014773445168455576, |
| "loss": 0.7074, |
| "mean_token_accuracy": 0.8042578861117363, |
| "num_tokens": 25293569.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.151035322777101, |
| "grad_norm": 0.6224446296691895, |
| "learning_rate": 0.00014719640339190443, |
| "loss": 0.7094, |
| "mean_token_accuracy": 0.803679920732975, |
| "num_tokens": 25422953.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.1571254567600486, |
| "grad_norm": 0.5978189706802368, |
| "learning_rate": 0.00014665674702225853, |
| "loss": 0.6926, |
| "mean_token_accuracy": 0.8080565810203553, |
| "num_tokens": 25559091.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.1632155907429964, |
| "grad_norm": 0.6134657263755798, |
| "learning_rate": 0.00014611550590607245, |
| "loss": 0.6716, |
| "mean_token_accuracy": 0.8152063637971878, |
| "num_tokens": 25698134.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.169305724725944, |
| "grad_norm": 0.5075950026512146, |
| "learning_rate": 0.00014557270344231246, |
| "loss": 0.6613, |
| "mean_token_accuracy": 0.8169043198227882, |
| "num_tokens": 25835159.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.1753958587088915, |
| "grad_norm": 0.5035059452056885, |
| "learning_rate": 0.00014502836309744508, |
| "loss": 0.6903, |
| "mean_token_accuracy": 0.8096718549728393, |
| "num_tokens": 25970600.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.1814859926918393, |
| "grad_norm": 0.583890438079834, |
| "learning_rate": 0.00014448250840442254, |
| "loss": 0.6662, |
| "mean_token_accuracy": 0.8157578155398368, |
| "num_tokens": 26106658.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.1875761266747868, |
| "grad_norm": 0.5089572668075562, |
| "learning_rate": 0.00014393516296166552, |
| "loss": 0.7085, |
| "mean_token_accuracy": 0.8082539036870002, |
| "num_tokens": 26238847.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.1936662606577344, |
| "grad_norm": 0.4495029151439667, |
| "learning_rate": 0.00014338635043204288, |
| "loss": 0.7085, |
| "mean_token_accuracy": 0.8075269401073456, |
| "num_tokens": 26366417.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.1997563946406822, |
| "grad_norm": 0.6390108466148376, |
| "learning_rate": 0.00014283609454184855, |
| "loss": 0.6935, |
| "mean_token_accuracy": 0.8099950149655342, |
| "num_tokens": 26498101.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.2058465286236297, |
| "grad_norm": 0.5687986016273499, |
| "learning_rate": 0.00014228441907977607, |
| "loss": 0.7027, |
| "mean_token_accuracy": 0.8083449766039849, |
| "num_tokens": 26628513.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.2119366626065773, |
| "grad_norm": 0.487954318523407, |
| "learning_rate": 0.00014173134789588994, |
| "loss": 0.6731, |
| "mean_token_accuracy": 0.8129799589514732, |
| "num_tokens": 26761671.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.218026796589525, |
| "grad_norm": 0.5641826391220093, |
| "learning_rate": 0.00014117690490059447, |
| "loss": 0.6949, |
| "mean_token_accuracy": 0.8118783175945282, |
| "num_tokens": 26894870.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.2241169305724726, |
| "grad_norm": 0.5209829211235046, |
| "learning_rate": 0.00014062111406360034, |
| "loss": 0.6742, |
| "mean_token_accuracy": 0.816123254597187, |
| "num_tokens": 27027902.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.2302070645554202, |
| "grad_norm": 0.5218231678009033, |
| "learning_rate": 0.00014006399941288812, |
| "loss": 0.703, |
| "mean_token_accuracy": 0.805295330286026, |
| "num_tokens": 27157882.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.236297198538368, |
| "grad_norm": 0.48154470324516296, |
| "learning_rate": 0.00013950558503366957, |
| "loss": 0.6844, |
| "mean_token_accuracy": 0.811684039235115, |
| "num_tokens": 27290994.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.2423873325213155, |
| "grad_norm": 0.5417695045471191, |
| "learning_rate": 0.00013894589506734643, |
| "loss": 0.7253, |
| "mean_token_accuracy": 0.8018206775188446, |
| "num_tokens": 27420715.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.248477466504263, |
| "grad_norm": 0.5282937288284302, |
| "learning_rate": 0.00013838495371046671, |
| "loss": 0.682, |
| "mean_token_accuracy": 0.8128980100154877, |
| "num_tokens": 27552040.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.2545676004872108, |
| "grad_norm": 0.5213696360588074, |
| "learning_rate": 0.0001378227852136785, |
| "loss": 0.6728, |
| "mean_token_accuracy": 0.8128269612789154, |
| "num_tokens": 27686922.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.2606577344701584, |
| "grad_norm": 0.4823834300041199, |
| "learning_rate": 0.00013725941388068174, |
| "loss": 0.6626, |
| "mean_token_accuracy": 0.8177949145436287, |
| "num_tokens": 27825036.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.266747868453106, |
| "grad_norm": 0.6199477314949036, |
| "learning_rate": 0.0001366948640671775, |
| "loss": 0.686, |
| "mean_token_accuracy": 0.8107037082314491, |
| "num_tokens": 27961614.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.2728380024360537, |
| "grad_norm": 0.4916837513446808, |
| "learning_rate": 0.00013612916017981488, |
| "loss": 0.6738, |
| "mean_token_accuracy": 0.8149923622608185, |
| "num_tokens": 28099524.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.2789281364190013, |
| "grad_norm": 0.6001724600791931, |
| "learning_rate": 0.00013556232667513607, |
| "loss": 0.6637, |
| "mean_token_accuracy": 0.8173324480652809, |
| "num_tokens": 28237055.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.2850182704019488, |
| "grad_norm": 0.5887413620948792, |
| "learning_rate": 0.00013499438805851882, |
| "loss": 0.6744, |
| "mean_token_accuracy": 0.8149967223405838, |
| "num_tokens": 28370538.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.2911084043848966, |
| "grad_norm": 0.6208155751228333, |
| "learning_rate": 0.00013442536888311733, |
| "loss": 0.6973, |
| "mean_token_accuracy": 0.8103871151804924, |
| "num_tokens": 28499232.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.2971985383678442, |
| "grad_norm": 0.5026904344558716, |
| "learning_rate": 0.0001338552937488003, |
| "loss": 0.6739, |
| "mean_token_accuracy": 0.8153023451566697, |
| "num_tokens": 28633993.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.3032886723507917, |
| "grad_norm": 0.5218458771705627, |
| "learning_rate": 0.00013328418730108795, |
| "loss": 0.6619, |
| "mean_token_accuracy": 0.8166303977370262, |
| "num_tokens": 28774139.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.3093788063337393, |
| "grad_norm": 0.519872784614563, |
| "learning_rate": 0.00013271207423008622, |
| "loss": 0.6804, |
| "mean_token_accuracy": 0.8150519266724586, |
| "num_tokens": 28910109.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.315468940316687, |
| "grad_norm": 0.5219667553901672, |
| "learning_rate": 0.00013213897926941942, |
| "loss": 0.6682, |
| "mean_token_accuracy": 0.8166522830724716, |
| "num_tokens": 29045967.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.3215590742996346, |
| "grad_norm": 0.5744656920433044, |
| "learning_rate": 0.000131564927195161, |
| "loss": 0.6772, |
| "mean_token_accuracy": 0.8149690836668014, |
| "num_tokens": 29180769.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.3276492082825821, |
| "grad_norm": 0.5673508048057556, |
| "learning_rate": 0.00013098994282476236, |
| "loss": 0.6841, |
| "mean_token_accuracy": 0.812624742090702, |
| "num_tokens": 29313512.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.3337393422655297, |
| "grad_norm": 0.5187074542045593, |
| "learning_rate": 0.00013041405101598, |
| "loss": 0.6281, |
| "mean_token_accuracy": 0.8221091449260711, |
| "num_tokens": 29454589.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.3398294762484775, |
| "grad_norm": 0.5621201992034912, |
| "learning_rate": 0.00012983727666580086, |
| "loss": 0.6755, |
| "mean_token_accuracy": 0.8157430678606034, |
| "num_tokens": 29589968.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.345919610231425, |
| "grad_norm": 0.579699695110321, |
| "learning_rate": 0.00012925964470936598, |
| "loss": 0.6859, |
| "mean_token_accuracy": 0.8122102931141854, |
| "num_tokens": 29720188.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.3520097442143726, |
| "grad_norm": 0.6406823992729187, |
| "learning_rate": 0.00012868118011889236, |
| "loss": 0.684, |
| "mean_token_accuracy": 0.8107294023036957, |
| "num_tokens": 29848418.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.3580998781973204, |
| "grad_norm": 0.4707708954811096, |
| "learning_rate": 0.00012810190790259367, |
| "loss": 0.6607, |
| "mean_token_accuracy": 0.8182852879166603, |
| "num_tokens": 29988202.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.364190012180268, |
| "grad_norm": 0.6458183526992798, |
| "learning_rate": 0.00012752185310359874, |
| "loss": 0.6935, |
| "mean_token_accuracy": 0.8089477211236954, |
| "num_tokens": 30119777.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.3702801461632155, |
| "grad_norm": 0.4278848469257355, |
| "learning_rate": 0.00012694104079886918, |
| "loss": 0.6565, |
| "mean_token_accuracy": 0.8185079246759415, |
| "num_tokens": 30256776.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.3763702801461632, |
| "grad_norm": 0.5647698044776917, |
| "learning_rate": 0.00012635949609811505, |
| "loss": 0.6636, |
| "mean_token_accuracy": 0.8155051723122597, |
| "num_tokens": 30395629.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.3824604141291108, |
| "grad_norm": 0.43498411774635315, |
| "learning_rate": 0.00012577724414270937, |
| "loss": 0.689, |
| "mean_token_accuracy": 0.8125654354691505, |
| "num_tokens": 30532805.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.3885505481120584, |
| "grad_norm": 0.5296844244003296, |
| "learning_rate": 0.00012519431010460136, |
| "loss": 0.6854, |
| "mean_token_accuracy": 0.8122918352484703, |
| "num_tokens": 30664642.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.3946406820950061, |
| "grad_norm": 0.44080430269241333, |
| "learning_rate": 0.000124610719185228, |
| "loss": 0.6405, |
| "mean_token_accuracy": 0.8192834481596947, |
| "num_tokens": 30805370.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.4007308160779537, |
| "grad_norm": 0.5946847796440125, |
| "learning_rate": 0.00012402649661442453, |
| "loss": 0.7025, |
| "mean_token_accuracy": 0.8085126876831055, |
| "num_tokens": 30936385.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.4068209500609012, |
| "grad_norm": 0.6572047472000122, |
| "learning_rate": 0.0001234416676493339, |
| "loss": 0.709, |
| "mean_token_accuracy": 0.8046677514910698, |
| "num_tokens": 31067615.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.412911084043849, |
| "grad_norm": 0.4797047972679138, |
| "learning_rate": 0.0001228562575733147, |
| "loss": 0.6675, |
| "mean_token_accuracy": 0.8157136350870132, |
| "num_tokens": 31200044.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.4190012180267966, |
| "grad_norm": 0.5451430082321167, |
| "learning_rate": 0.0001222702916948481, |
| "loss": 0.6746, |
| "mean_token_accuracy": 0.8092615008354187, |
| "num_tokens": 31334451.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.4250913520097441, |
| "grad_norm": 0.5049906969070435, |
| "learning_rate": 0.00012168379534644371, |
| "loss": 0.6515, |
| "mean_token_accuracy": 0.8203717589378356, |
| "num_tokens": 31472218.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.431181485992692, |
| "grad_norm": 0.6531693935394287, |
| "learning_rate": 0.00012109679388354462, |
| "loss": 0.6778, |
| "mean_token_accuracy": 0.8134923160076142, |
| "num_tokens": 31605853.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.4372716199756395, |
| "grad_norm": 0.5340039730072021, |
| "learning_rate": 0.00012050931268343089, |
| "loss": 0.6628, |
| "mean_token_accuracy": 0.8176047816872597, |
| "num_tokens": 31741034.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.443361753958587, |
| "grad_norm": 0.4518280625343323, |
| "learning_rate": 0.00011992137714412266, |
| "loss": 0.6407, |
| "mean_token_accuracy": 0.8207336485385894, |
| "num_tokens": 31878661.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.4494518879415348, |
| "grad_norm": 0.5232827067375183, |
| "learning_rate": 0.00011933301268328212, |
| "loss": 0.6742, |
| "mean_token_accuracy": 0.8158077761530876, |
| "num_tokens": 32016524.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.4555420219244823, |
| "grad_norm": 0.5181542634963989, |
| "learning_rate": 0.00011874424473711457, |
| "loss": 0.699, |
| "mean_token_accuracy": 0.8078866004943848, |
| "num_tokens": 32146820.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.46163215590743, |
| "grad_norm": 0.5801041126251221, |
| "learning_rate": 0.00011815509875926883, |
| "loss": 0.6572, |
| "mean_token_accuracy": 0.8183338135480881, |
| "num_tokens": 32285928.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.4677222898903777, |
| "grad_norm": 0.5347133874893188, |
| "learning_rate": 0.00011756560021973679, |
| "loss": 0.6738, |
| "mean_token_accuracy": 0.8143690213561058, |
| "num_tokens": 32416470.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.4738124238733252, |
| "grad_norm": 0.4945615231990814, |
| "learning_rate": 0.0001169757746037524, |
| "loss": 0.6505, |
| "mean_token_accuracy": 0.8196728631854058, |
| "num_tokens": 32553798.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.4799025578562728, |
| "grad_norm": 0.5072743892669678, |
| "learning_rate": 0.00011638564741068965, |
| "loss": 0.625, |
| "mean_token_accuracy": 0.826240348815918, |
| "num_tokens": 32692511.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.4859926918392206, |
| "grad_norm": 0.5887538194656372, |
| "learning_rate": 0.00011579524415296043, |
| "loss": 0.6904, |
| "mean_token_accuracy": 0.8112018033862114, |
| "num_tokens": 32818836.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.4920828258221681, |
| "grad_norm": 0.5464449524879456, |
| "learning_rate": 0.00011520459035491142, |
| "loss": 0.6553, |
| "mean_token_accuracy": 0.8198345899581909, |
| "num_tokens": 32957967.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.4981729598051157, |
| "grad_norm": 0.5787419676780701, |
| "learning_rate": 0.00011461371155172071, |
| "loss": 0.663, |
| "mean_token_accuracy": 0.8155046373605728, |
| "num_tokens": 33094241.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.5042630937880634, |
| "grad_norm": 0.5159268975257874, |
| "learning_rate": 0.00011402263328829384, |
| "loss": 0.6792, |
| "mean_token_accuracy": 0.8127613604068756, |
| "num_tokens": 33225474.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.510353227771011, |
| "grad_norm": 0.5665333867073059, |
| "learning_rate": 0.00011343138111815939, |
| "loss": 0.6265, |
| "mean_token_accuracy": 0.8276977241039276, |
| "num_tokens": 33368246.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.5164433617539586, |
| "grad_norm": 0.6272276639938354, |
| "learning_rate": 0.00011283998060236421, |
| "loss": 0.6734, |
| "mean_token_accuracy": 0.816029068827629, |
| "num_tokens": 33503967.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.5225334957369063, |
| "grad_norm": 0.5275886654853821, |
| "learning_rate": 0.0001122484573083686, |
| "loss": 0.6457, |
| "mean_token_accuracy": 0.8222623988986015, |
| "num_tokens": 33641826.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.5286236297198539, |
| "grad_norm": 0.5526687502861023, |
| "learning_rate": 0.00011165683680894072, |
| "loss": 0.6795, |
| "mean_token_accuracy": 0.8127825185656548, |
| "num_tokens": 33774185.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.5347137637028014, |
| "grad_norm": 0.6226133704185486, |
| "learning_rate": 0.00011106514468105111, |
| "loss": 0.6684, |
| "mean_token_accuracy": 0.815614765882492, |
| "num_tokens": 33907116.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.5408038976857492, |
| "grad_norm": 0.612832248210907, |
| "learning_rate": 0.000110473406504767, |
| "loss": 0.6287, |
| "mean_token_accuracy": 0.8220825806260109, |
| "num_tokens": 34048267.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.5468940316686965, |
| "grad_norm": 0.6066681742668152, |
| "learning_rate": 0.00010988164786214639, |
| "loss": 0.6851, |
| "mean_token_accuracy": 0.8115911707282066, |
| "num_tokens": 34177555.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.5529841656516443, |
| "grad_norm": 0.6376360058784485, |
| "learning_rate": 0.00010928989433613204, |
| "loss": 0.6921, |
| "mean_token_accuracy": 0.8096534594893455, |
| "num_tokens": 34308932.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.559074299634592, |
| "grad_norm": 0.6083400249481201, |
| "learning_rate": 0.00010869817150944546, |
| "loss": 0.6575, |
| "mean_token_accuracy": 0.8187816679477692, |
| "num_tokens": 34443994.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.5651644336175394, |
| "grad_norm": 0.6098156571388245, |
| "learning_rate": 0.00010810650496348116, |
| "loss": 0.6092, |
| "mean_token_accuracy": 0.8285523638129234, |
| "num_tokens": 34588403.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.5712545676004872, |
| "grad_norm": 0.47795701026916504, |
| "learning_rate": 0.00010751492027720027, |
| "loss": 0.6423, |
| "mean_token_accuracy": 0.8211737647652626, |
| "num_tokens": 34730426.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.577344701583435, |
| "grad_norm": 0.560787558555603, |
| "learning_rate": 0.00010692344302602515, |
| "loss": 0.6707, |
| "mean_token_accuracy": 0.8134441033005715, |
| "num_tokens": 34861708.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.5834348355663823, |
| "grad_norm": 0.5722246766090393, |
| "learning_rate": 0.00010633209878073343, |
| "loss": 0.6533, |
| "mean_token_accuracy": 0.8185199156403542, |
| "num_tokens": 34997377.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.58952496954933, |
| "grad_norm": 0.4941788613796234, |
| "learning_rate": 0.00010574091310635263, |
| "loss": 0.6487, |
| "mean_token_accuracy": 0.8205527886748314, |
| "num_tokens": 35133685.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.5956151035322779, |
| "grad_norm": 0.575986921787262, |
| "learning_rate": 0.00010514991156105493, |
| "loss": 0.6615, |
| "mean_token_accuracy": 0.8179458349943161, |
| "num_tokens": 35270993.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.6017052375152252, |
| "grad_norm": 0.5677866339683533, |
| "learning_rate": 0.00010455911969505228, |
| "loss": 0.6572, |
| "mean_token_accuracy": 0.8155815675854683, |
| "num_tokens": 35402062.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.607795371498173, |
| "grad_norm": 0.6232825517654419, |
| "learning_rate": 0.00010396856304949162, |
| "loss": 0.6477, |
| "mean_token_accuracy": 0.8209305629134178, |
| "num_tokens": 35537394.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.6138855054811205, |
| "grad_norm": 0.6252410411834717, |
| "learning_rate": 0.00010337826715535102, |
| "loss": 0.6819, |
| "mean_token_accuracy": 0.8137489795684815, |
| "num_tokens": 35669332.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.619975639464068, |
| "grad_norm": 0.5850580334663391, |
| "learning_rate": 0.0001027882575323356, |
| "loss": 0.6831, |
| "mean_token_accuracy": 0.8099577218294144, |
| "num_tokens": 35799095.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.6260657734470159, |
| "grad_norm": 0.5118699073791504, |
| "learning_rate": 0.00010219855968777442, |
| "loss": 0.681, |
| "mean_token_accuracy": 0.8123177006840706, |
| "num_tokens": 35928313.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.6321559074299634, |
| "grad_norm": 0.5392698645591736, |
| "learning_rate": 0.00010160919911551774, |
| "loss": 0.6536, |
| "mean_token_accuracy": 0.8185337752103805, |
| "num_tokens": 36062033.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.638246041412911, |
| "grad_norm": 0.5542203783988953, |
| "learning_rate": 0.00010102020129483481, |
| "loss": 0.6859, |
| "mean_token_accuracy": 0.8107540607452393, |
| "num_tokens": 36190194.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.6443361753958587, |
| "grad_norm": 0.5962918996810913, |
| "learning_rate": 0.0001004315916893124, |
| "loss": 0.64, |
| "mean_token_accuracy": 0.8226593688130379, |
| "num_tokens": 36322437.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.6504263093788063, |
| "grad_norm": 0.6391364932060242, |
| "learning_rate": 9.984339574575394e-05, |
| "loss": 0.6457, |
| "mean_token_accuracy": 0.8250340327620507, |
| "num_tokens": 36463231.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.6565164433617539, |
| "grad_norm": 0.5798075795173645, |
| "learning_rate": 9.92556388930794e-05, |
| "loss": 0.6901, |
| "mean_token_accuracy": 0.8104871213436127, |
| "num_tokens": 36588963.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.6626065773447016, |
| "grad_norm": 0.5375143885612488, |
| "learning_rate": 9.866834654122597e-05, |
| "loss": 0.6723, |
| "mean_token_accuracy": 0.8132491707801819, |
| "num_tokens": 36724295.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.6686967113276492, |
| "grad_norm": 0.5556331276893616, |
| "learning_rate": 9.808154408004942e-05, |
| "loss": 0.6316, |
| "mean_token_accuracy": 0.8221101492643357, |
| "num_tokens": 36855978.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.6747868453105967, |
| "grad_norm": 0.5330142974853516, |
| "learning_rate": 9.749525687822674e-05, |
| "loss": 0.6269, |
| "mean_token_accuracy": 0.8239532545208931, |
| "num_tokens": 36994164.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.6808769792935445, |
| "grad_norm": 0.568084716796875, |
| "learning_rate": 9.6909510282159e-05, |
| "loss": 0.6568, |
| "mean_token_accuracy": 0.8158794924616813, |
| "num_tokens": 37130680.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.686967113276492, |
| "grad_norm": 0.5072943568229675, |
| "learning_rate": 9.632432961487585e-05, |
| "loss": 0.6838, |
| "mean_token_accuracy": 0.8121756613254547, |
| "num_tokens": 37261462.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.6930572472594396, |
| "grad_norm": 0.5469337701797485, |
| "learning_rate": 9.573974017494069e-05, |
| "loss": 0.6447, |
| "mean_token_accuracy": 0.8220986798405647, |
| "num_tokens": 37395606.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.6991473812423874, |
| "grad_norm": 0.57918381690979, |
| "learning_rate": 9.515576723535689e-05, |
| "loss": 0.6217, |
| "mean_token_accuracy": 0.822702020406723, |
| "num_tokens": 37533585.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.705237515225335, |
| "grad_norm": 0.6425563097000122, |
| "learning_rate": 9.45724360424753e-05, |
| "loss": 0.6435, |
| "mean_token_accuracy": 0.8198476612567902, |
| "num_tokens": 37672877.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.7113276492082825, |
| "grad_norm": 0.5059729218482971, |
| "learning_rate": 9.398977181490274e-05, |
| "loss": 0.6579, |
| "mean_token_accuracy": 0.8166012555360794, |
| "num_tokens": 37809109.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.7174177831912303, |
| "grad_norm": 0.5450888276100159, |
| "learning_rate": 9.340779974241167e-05, |
| "loss": 0.6175, |
| "mean_token_accuracy": 0.8274259582161904, |
| "num_tokens": 37950597.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.7235079171741778, |
| "grad_norm": 0.6464765667915344, |
| "learning_rate": 9.282654498485139e-05, |
| "loss": 0.6636, |
| "mean_token_accuracy": 0.8163545817136765, |
| "num_tokens": 38086904.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.7295980511571254, |
| "grad_norm": 0.6118177175521851, |
| "learning_rate": 9.22460326710601e-05, |
| "loss": 0.6696, |
| "mean_token_accuracy": 0.8133967757225037, |
| "num_tokens": 38219759.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.7356881851400732, |
| "grad_norm": 0.5518969893455505, |
| "learning_rate": 9.16662878977786e-05, |
| "loss": 0.6659, |
| "mean_token_accuracy": 0.8180875137448311, |
| "num_tokens": 38349770.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.7417783191230207, |
| "grad_norm": 0.6465517282485962, |
| "learning_rate": 9.108733572856549e-05, |
| "loss": 0.6581, |
| "mean_token_accuracy": 0.8170399129390716, |
| "num_tokens": 38482303.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.7478684531059683, |
| "grad_norm": 0.5193557143211365, |
| "learning_rate": 9.050920119271335e-05, |
| "loss": 0.6543, |
| "mean_token_accuracy": 0.8178304255008697, |
| "num_tokens": 38615426.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.753958587088916, |
| "grad_norm": 0.611529529094696, |
| "learning_rate": 8.993190928416682e-05, |
| "loss": 0.6248, |
| "mean_token_accuracy": 0.8259203046560287, |
| "num_tokens": 38755859.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.7600487210718636, |
| "grad_norm": 0.5405944585800171, |
| "learning_rate": 8.935548496044198e-05, |
| "loss": 0.6232, |
| "mean_token_accuracy": 0.8281204700469971, |
| "num_tokens": 38893007.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.7661388550548112, |
| "grad_norm": 0.6433010697364807, |
| "learning_rate": 8.877995314154748e-05, |
| "loss": 0.6751, |
| "mean_token_accuracy": 0.8155393078923225, |
| "num_tokens": 39020285.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.772228989037759, |
| "grad_norm": 0.47974956035614014, |
| "learning_rate": 8.820533870890717e-05, |
| "loss": 0.6527, |
| "mean_token_accuracy": 0.8197720810770989, |
| "num_tokens": 39151426.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.7783191230207065, |
| "grad_norm": 0.5529680848121643, |
| "learning_rate": 8.763166650428436e-05, |
| "loss": 0.6262, |
| "mean_token_accuracy": 0.8256829127669334, |
| "num_tokens": 39294242.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 1.784409257003654, |
| "grad_norm": 0.6060122847557068, |
| "learning_rate": 8.705896132870797e-05, |
| "loss": 0.6563, |
| "mean_token_accuracy": 0.8192467406392098, |
| "num_tokens": 39425879.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 1.7904993909866018, |
| "grad_norm": 0.6099355220794678, |
| "learning_rate": 8.648724794140017e-05, |
| "loss": 0.6664, |
| "mean_token_accuracy": 0.8186777010560036, |
| "num_tokens": 39559787.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.7965895249695494, |
| "grad_norm": 0.5908733010292053, |
| "learning_rate": 8.591655105870615e-05, |
| "loss": 0.6712, |
| "mean_token_accuracy": 0.8136340633034707, |
| "num_tokens": 39689823.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.802679658952497, |
| "grad_norm": 0.5845519304275513, |
| "learning_rate": 8.534689535302553e-05, |
| "loss": 0.6608, |
| "mean_token_accuracy": 0.8170475289225578, |
| "num_tokens": 39820725.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 1.8087697929354447, |
| "grad_norm": 0.6311175227165222, |
| "learning_rate": 8.47783054517457e-05, |
| "loss": 0.6491, |
| "mean_token_accuracy": 0.8193596869707107, |
| "num_tokens": 39947874.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.814859926918392, |
| "grad_norm": 0.5293188691139221, |
| "learning_rate": 8.421080593617706e-05, |
| "loss": 0.6105, |
| "mean_token_accuracy": 0.83141258507967, |
| "num_tokens": 40091297.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 1.8209500609013398, |
| "grad_norm": 0.5252617597579956, |
| "learning_rate": 8.364442134049049e-05, |
| "loss": 0.6356, |
| "mean_token_accuracy": 0.8237936720252037, |
| "num_tokens": 40229207.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 1.8270401948842876, |
| "grad_norm": 0.6039798855781555, |
| "learning_rate": 8.30791761506565e-05, |
| "loss": 0.6456, |
| "mean_token_accuracy": 0.8206364914774895, |
| "num_tokens": 40364863.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.833130328867235, |
| "grad_norm": 0.5508609414100647, |
| "learning_rate": 8.251509480338684e-05, |
| "loss": 0.6229, |
| "mean_token_accuracy": 0.8255992740392685, |
| "num_tokens": 40504123.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 1.8392204628501827, |
| "grad_norm": 0.5637634992599487, |
| "learning_rate": 8.195220168507789e-05, |
| "loss": 0.6026, |
| "mean_token_accuracy": 0.8290412962436676, |
| "num_tokens": 40646821.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 1.8453105968331305, |
| "grad_norm": 0.5463610291481018, |
| "learning_rate": 8.139052113075645e-05, |
| "loss": 0.6278, |
| "mean_token_accuracy": 0.8244929850101471, |
| "num_tokens": 40778989.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.8514007308160778, |
| "grad_norm": 0.5360645055770874, |
| "learning_rate": 8.083007742302776e-05, |
| "loss": 0.6336, |
| "mean_token_accuracy": 0.8228462666273118, |
| "num_tokens": 40917560.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 1.8574908647990256, |
| "grad_norm": 0.5185632705688477, |
| "learning_rate": 8.02708947910255e-05, |
| "loss": 0.5991, |
| "mean_token_accuracy": 0.830042028427124, |
| "num_tokens": 41059707.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.8635809987819734, |
| "grad_norm": 0.6445353627204895, |
| "learning_rate": 7.971299740936456e-05, |
| "loss": 0.6555, |
| "mean_token_accuracy": 0.8169184163212776, |
| "num_tokens": 41192515.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.8696711327649207, |
| "grad_norm": 0.5360421538352966, |
| "learning_rate": 7.915640939709576e-05, |
| "loss": 0.6234, |
| "mean_token_accuracy": 0.8257398083806038, |
| "num_tokens": 41330047.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 1.8757612667478685, |
| "grad_norm": 0.58651202917099, |
| "learning_rate": 7.860115481666333e-05, |
| "loss": 0.6564, |
| "mean_token_accuracy": 0.8205534905195236, |
| "num_tokens": 41460379.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 1.881851400730816, |
| "grad_norm": 0.6842640042304993, |
| "learning_rate": 7.804725767286427e-05, |
| "loss": 0.6935, |
| "mean_token_accuracy": 0.8097458809614182, |
| "num_tokens": 41581210.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.8879415347137636, |
| "grad_norm": 0.5175514817237854, |
| "learning_rate": 7.749474191181096e-05, |
| "loss": 0.6393, |
| "mean_token_accuracy": 0.8219558611512184, |
| "num_tokens": 41714792.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.8940316686967114, |
| "grad_norm": 0.5963588356971741, |
| "learning_rate": 7.694363141989575e-05, |
| "loss": 0.658, |
| "mean_token_accuracy": 0.8182344615459443, |
| "num_tokens": 41846600.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 1.900121802679659, |
| "grad_norm": 0.6149535775184631, |
| "learning_rate": 7.639395002275827e-05, |
| "loss": 0.6499, |
| "mean_token_accuracy": 0.8208124756813049, |
| "num_tokens": 41977627.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.9062119366626065, |
| "grad_norm": 0.5739808678627014, |
| "learning_rate": 7.584572148425544e-05, |
| "loss": 0.6703, |
| "mean_token_accuracy": 0.8125967502593994, |
| "num_tokens": 42104510.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 1.9123020706455542, |
| "grad_norm": 0.5982648730278015, |
| "learning_rate": 7.529896950543416e-05, |
| "loss": 0.6513, |
| "mean_token_accuracy": 0.8201186507940292, |
| "num_tokens": 42236168.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 1.9183922046285018, |
| "grad_norm": 0.5896486043930054, |
| "learning_rate": 7.475371772350658e-05, |
| "loss": 0.6133, |
| "mean_token_accuracy": 0.8260134413838387, |
| "num_tokens": 42375086.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.9244823386114494, |
| "grad_norm": 0.6223361492156982, |
| "learning_rate": 7.420998971082833e-05, |
| "loss": 0.6638, |
| "mean_token_accuracy": 0.8162963137030601, |
| "num_tokens": 42506457.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 1.9305724725943971, |
| "grad_norm": 0.709854245185852, |
| "learning_rate": 7.366780897387924e-05, |
| "loss": 0.6324, |
| "mean_token_accuracy": 0.8247174829244613, |
| "num_tokens": 42640886.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 1.9366626065773447, |
| "grad_norm": 0.6794169545173645, |
| "learning_rate": 7.312719895224736e-05, |
| "loss": 0.6164, |
| "mean_token_accuracy": 0.82676922082901, |
| "num_tokens": 42781318.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.9427527405602922, |
| "grad_norm": 0.49305981397628784, |
| "learning_rate": 7.258818301761532e-05, |
| "loss": 0.6216, |
| "mean_token_accuracy": 0.8258268669247627, |
| "num_tokens": 42919381.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 1.94884287454324, |
| "grad_norm": 0.5072576999664307, |
| "learning_rate": 7.205078447275031e-05, |
| "loss": 0.6407, |
| "mean_token_accuracy": 0.819316141307354, |
| "num_tokens": 43056494.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.9549330085261876, |
| "grad_norm": 0.6188381314277649, |
| "learning_rate": 7.151502655049623e-05, |
| "loss": 0.6022, |
| "mean_token_accuracy": 0.8328602254390717, |
| "num_tokens": 43197795.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 1.9610231425091351, |
| "grad_norm": 0.5626131296157837, |
| "learning_rate": 7.098093241276962e-05, |
| "loss": 0.6245, |
| "mean_token_accuracy": 0.8258091285824776, |
| "num_tokens": 43340325.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 1.967113276492083, |
| "grad_norm": 0.5364338755607605, |
| "learning_rate": 7.044852514955816e-05, |
| "loss": 0.6454, |
| "mean_token_accuracy": 0.8199462234973908, |
| "num_tokens": 43472226.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 1.9732034104750305, |
| "grad_norm": 0.5460382699966431, |
| "learning_rate": 6.991782777792244e-05, |
| "loss": 0.6214, |
| "mean_token_accuracy": 0.8251617640256882, |
| "num_tokens": 43609559.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.979293544457978, |
| "grad_norm": 0.6013203263282776, |
| "learning_rate": 6.938886324100097e-05, |
| "loss": 0.6422, |
| "mean_token_accuracy": 0.8197862133383751, |
| "num_tokens": 43743060.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.9853836784409258, |
| "grad_norm": 0.6688512563705444, |
| "learning_rate": 6.88616544070182e-05, |
| "loss": 0.6447, |
| "mean_token_accuracy": 0.820338460803032, |
| "num_tokens": 43876874.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 1.9914738124238733, |
| "grad_norm": 0.6146946549415588, |
| "learning_rate": 6.8336224068296e-05, |
| "loss": 0.6015, |
| "mean_token_accuracy": 0.8318811848759651, |
| "num_tokens": 44021963.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 1.997563946406821, |
| "grad_norm": 0.5972597599029541, |
| "learning_rate": 6.781259494026821e-05, |
| "loss": 0.6094, |
| "mean_token_accuracy": 0.8282003849744797, |
| "num_tokens": 44159207.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 2.0036540803897687, |
| "grad_norm": 0.5882354974746704, |
| "learning_rate": 6.729078966049863e-05, |
| "loss": 0.6413, |
| "mean_token_accuracy": 0.8182575166225433, |
| "num_tokens": 44286530.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 2.009744214372716, |
| "grad_norm": 0.6095362901687622, |
| "learning_rate": 6.67708307877023e-05, |
| "loss": 0.6004, |
| "mean_token_accuracy": 0.8311555400490761, |
| "num_tokens": 44421317.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.0158343483556638, |
| "grad_norm": 0.4933398962020874, |
| "learning_rate": 6.625274080077034e-05, |
| "loss": 0.5953, |
| "mean_token_accuracy": 0.8339588925242424, |
| "num_tokens": 44559187.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 2.0219244823386116, |
| "grad_norm": 0.5924236178398132, |
| "learning_rate": 6.573654209779808e-05, |
| "loss": 0.5653, |
| "mean_token_accuracy": 0.8377310782670975, |
| "num_tokens": 44698135.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 2.028014616321559, |
| "grad_norm": 0.7029783129692078, |
| "learning_rate": 6.522225699511671e-05, |
| "loss": 0.5738, |
| "mean_token_accuracy": 0.8355662778019906, |
| "num_tokens": 44832354.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 2.0341047503045067, |
| "grad_norm": 0.591604471206665, |
| "learning_rate": 6.470990772632868e-05, |
| "loss": 0.5682, |
| "mean_token_accuracy": 0.8361614629626274, |
| "num_tokens": 44978302.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 2.0401948842874544, |
| "grad_norm": 0.6186702847480774, |
| "learning_rate": 6.419951644134623e-05, |
| "loss": 0.6036, |
| "mean_token_accuracy": 0.8307035118341446, |
| "num_tokens": 45110425.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 2.0462850182704018, |
| "grad_norm": 0.5165773630142212, |
| "learning_rate": 6.369110520543397e-05, |
| "loss": 0.5994, |
| "mean_token_accuracy": 0.8298280730843544, |
| "num_tokens": 45249677.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 2.0523751522533495, |
| "grad_norm": 0.6079832315444946, |
| "learning_rate": 6.318469599825489e-05, |
| "loss": 0.5921, |
| "mean_token_accuracy": 0.8314823001623154, |
| "num_tokens": 45384176.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 2.0584652862362973, |
| "grad_norm": 0.6483248472213745, |
| "learning_rate": 6.268031071292028e-05, |
| "loss": 0.6059, |
| "mean_token_accuracy": 0.8267465397715569, |
| "num_tokens": 45517909.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 2.0645554202192447, |
| "grad_norm": 0.5445775389671326, |
| "learning_rate": 6.217797115504296e-05, |
| "loss": 0.5923, |
| "mean_token_accuracy": 0.8316209375858307, |
| "num_tokens": 45654194.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 2.0706455542021924, |
| "grad_norm": 0.6171122789382935, |
| "learning_rate": 6.16776990417949e-05, |
| "loss": 0.6385, |
| "mean_token_accuracy": 0.820763637125492, |
| "num_tokens": 45784344.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.07673568818514, |
| "grad_norm": 0.6944971680641174, |
| "learning_rate": 6.117951600096805e-05, |
| "loss": 0.6352, |
| "mean_token_accuracy": 0.8226177647709847, |
| "num_tokens": 45911670.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 2.0828258221680875, |
| "grad_norm": 0.6227422952651978, |
| "learning_rate": 6.06834435700396e-05, |
| "loss": 0.5697, |
| "mean_token_accuracy": 0.8398994401097297, |
| "num_tokens": 46055567.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 2.0889159561510353, |
| "grad_norm": 0.6456082463264465, |
| "learning_rate": 6.018950319524062e-05, |
| "loss": 0.606, |
| "mean_token_accuracy": 0.8293915688991547, |
| "num_tokens": 46190030.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 2.095006090133983, |
| "grad_norm": 0.6646130681037903, |
| "learning_rate": 5.969771623062905e-05, |
| "loss": 0.6088, |
| "mean_token_accuracy": 0.8277154579758644, |
| "num_tokens": 46324733.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 2.1010962241169304, |
| "grad_norm": 0.6068746447563171, |
| "learning_rate": 5.920810393716647e-05, |
| "loss": 0.6055, |
| "mean_token_accuracy": 0.8310004472732544, |
| "num_tokens": 46457965.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.107186358099878, |
| "grad_norm": 0.6344166398048401, |
| "learning_rate": 5.872068748179904e-05, |
| "loss": 0.6235, |
| "mean_token_accuracy": 0.826252605021, |
| "num_tokens": 46591001.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 2.113276492082826, |
| "grad_norm": 0.670253574848175, |
| "learning_rate": 5.823548793654222e-05, |
| "loss": 0.5922, |
| "mean_token_accuracy": 0.8330878585577011, |
| "num_tokens": 46733212.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 2.1193666260657733, |
| "grad_norm": 0.6969623565673828, |
| "learning_rate": 5.775252627756988e-05, |
| "loss": 0.6112, |
| "mean_token_accuracy": 0.8287390932440758, |
| "num_tokens": 46869195.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 2.125456760048721, |
| "grad_norm": 0.5771912932395935, |
| "learning_rate": 5.727182338430759e-05, |
| "loss": 0.6025, |
| "mean_token_accuracy": 0.8298723086714744, |
| "num_tokens": 47004726.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 2.131546894031669, |
| "grad_norm": 0.5449331998825073, |
| "learning_rate": 5.679340003852971e-05, |
| "loss": 0.6334, |
| "mean_token_accuracy": 0.8226374134421348, |
| "num_tokens": 47128817.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.137637028014616, |
| "grad_norm": 0.6869291067123413, |
| "learning_rate": 5.6317276923461074e-05, |
| "loss": 0.6234, |
| "mean_token_accuracy": 0.8266696631908417, |
| "num_tokens": 47262232.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 2.143727161997564, |
| "grad_norm": 0.6404641270637512, |
| "learning_rate": 5.584347462288294e-05, |
| "loss": 0.5965, |
| "mean_token_accuracy": 0.8314864963293076, |
| "num_tokens": 47395541.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 2.1498172959805117, |
| "grad_norm": 0.6093842387199402, |
| "learning_rate": 5.537201362024287e-05, |
| "loss": 0.5873, |
| "mean_token_accuracy": 0.8329750701785088, |
| "num_tokens": 47538479.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 2.155907429963459, |
| "grad_norm": 0.6266767978668213, |
| "learning_rate": 5.490291429776933e-05, |
| "loss": 0.6107, |
| "mean_token_accuracy": 0.8280060842633248, |
| "num_tokens": 47668527.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 2.161997563946407, |
| "grad_norm": 0.6084907054901123, |
| "learning_rate": 5.443619693559048e-05, |
| "loss": 0.6046, |
| "mean_token_accuracy": 0.8307000920176506, |
| "num_tokens": 47802055.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.1680876979293546, |
| "grad_norm": 0.6321319341659546, |
| "learning_rate": 5.397188171085747e-05, |
| "loss": 0.6376, |
| "mean_token_accuracy": 0.822290787100792, |
| "num_tokens": 47929896.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 2.174177831912302, |
| "grad_norm": 0.6345944404602051, |
| "learning_rate": 5.350998869687209e-05, |
| "loss": 0.642, |
| "mean_token_accuracy": 0.8202966138720512, |
| "num_tokens": 48057772.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 2.1802679658952497, |
| "grad_norm": 0.8021138310432434, |
| "learning_rate": 5.3050537862219005e-05, |
| "loss": 0.6004, |
| "mean_token_accuracy": 0.829450149834156, |
| "num_tokens": 48189172.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 2.1863580998781975, |
| "grad_norm": 0.734176754951477, |
| "learning_rate": 5.259354906990246e-05, |
| "loss": 0.6312, |
| "mean_token_accuracy": 0.823958395421505, |
| "num_tokens": 48317103.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 2.192448233861145, |
| "grad_norm": 0.5638821125030518, |
| "learning_rate": 5.213904207648749e-05, |
| "loss": 0.5778, |
| "mean_token_accuracy": 0.8363123446702957, |
| "num_tokens": 48454312.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.1985383678440926, |
| "grad_norm": 0.5940743088722229, |
| "learning_rate": 5.168703653124587e-05, |
| "loss": 0.6191, |
| "mean_token_accuracy": 0.8252517506480217, |
| "num_tokens": 48585829.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 2.2046285018270404, |
| "grad_norm": 0.6641266345977783, |
| "learning_rate": 5.1237551975306666e-05, |
| "loss": 0.6039, |
| "mean_token_accuracy": 0.8323954001069069, |
| "num_tokens": 48721617.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 2.2107186358099877, |
| "grad_norm": 0.5851954221725464, |
| "learning_rate": 5.0790607840811335e-05, |
| "loss": 0.642, |
| "mean_token_accuracy": 0.8193060621619225, |
| "num_tokens": 48848346.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 2.2168087697929355, |
| "grad_norm": 0.6154613494873047, |
| "learning_rate": 5.0346223450073795e-05, |
| "loss": 0.5972, |
| "mean_token_accuracy": 0.829984450340271, |
| "num_tokens": 48989423.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 2.2228989037758833, |
| "grad_norm": 0.6556016802787781, |
| "learning_rate": 4.990441801474487e-05, |
| "loss": 0.5768, |
| "mean_token_accuracy": 0.8358439221978188, |
| "num_tokens": 49127954.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.2289890377588306, |
| "grad_norm": 0.6562819480895996, |
| "learning_rate": 4.94652106349819e-05, |
| "loss": 0.582, |
| "mean_token_accuracy": 0.8355499967932701, |
| "num_tokens": 49268448.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 2.2350791717417784, |
| "grad_norm": 0.7151765823364258, |
| "learning_rate": 4.9028620298622924e-05, |
| "loss": 0.6543, |
| "mean_token_accuracy": 0.8192793279886246, |
| "num_tokens": 49393488.0, |
| "step": 3670 |
| }, |
| { |
| "epoch": 2.2411693057247257, |
| "grad_norm": 0.6189950108528137, |
| "learning_rate": 4.8594665880365796e-05, |
| "loss": 0.6065, |
| "mean_token_accuracy": 0.8288143903017045, |
| "num_tokens": 49527018.0, |
| "step": 3680 |
| }, |
| { |
| "epoch": 2.2472594397076735, |
| "grad_norm": 0.6800888180732727, |
| "learning_rate": 4.816336614095221e-05, |
| "loss": 0.5777, |
| "mean_token_accuracy": 0.8371251985430718, |
| "num_tokens": 49662686.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 2.2533495736906213, |
| "grad_norm": 0.6750311255455017, |
| "learning_rate": 4.7734739726356694e-05, |
| "loss": 0.5646, |
| "mean_token_accuracy": 0.8395438298583031, |
| "num_tokens": 49803437.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.259439707673569, |
| "grad_norm": 0.781650185585022, |
| "learning_rate": 4.730880516698042e-05, |
| "loss": 0.6074, |
| "mean_token_accuracy": 0.8285451725125312, |
| "num_tokens": 49938432.0, |
| "step": 3710 |
| }, |
| { |
| "epoch": 2.2655298416565164, |
| "grad_norm": 0.6055442690849304, |
| "learning_rate": 4.6885580876850095e-05, |
| "loss": 0.6106, |
| "mean_token_accuracy": 0.828575699031353, |
| "num_tokens": 50071578.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 2.271619975639464, |
| "grad_norm": 0.5515550374984741, |
| "learning_rate": 4.6465085152821924e-05, |
| "loss": 0.6172, |
| "mean_token_accuracy": 0.8277894973754882, |
| "num_tokens": 50205327.0, |
| "step": 3730 |
| }, |
| { |
| "epoch": 2.2777101096224115, |
| "grad_norm": 0.6260959506034851, |
| "learning_rate": 4.604733617379061e-05, |
| "loss": 0.6276, |
| "mean_token_accuracy": 0.8239622369408608, |
| "num_tokens": 50336366.0, |
| "step": 3740 |
| }, |
| { |
| "epoch": 2.2838002436053593, |
| "grad_norm": 0.6698872447013855, |
| "learning_rate": 4.5632351999903366e-05, |
| "loss": 0.6015, |
| "mean_token_accuracy": 0.8326066240668297, |
| "num_tokens": 50471282.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.289890377588307, |
| "grad_norm": 0.5506294965744019, |
| "learning_rate": 4.52201505717793e-05, |
| "loss": 0.6041, |
| "mean_token_accuracy": 0.8309131726622582, |
| "num_tokens": 50602518.0, |
| "step": 3760 |
| }, |
| { |
| "epoch": 2.2959805115712544, |
| "grad_norm": 0.5408323407173157, |
| "learning_rate": 4.4810749709733625e-05, |
| "loss": 0.5986, |
| "mean_token_accuracy": 0.8340771466493606, |
| "num_tokens": 50738973.0, |
| "step": 3770 |
| }, |
| { |
| "epoch": 2.302070645554202, |
| "grad_norm": 0.5263858437538147, |
| "learning_rate": 4.440416711300731e-05, |
| "loss": 0.6177, |
| "mean_token_accuracy": 0.8264430776238442, |
| "num_tokens": 50871381.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 2.30816077953715, |
| "grad_norm": 0.6784396767616272, |
| "learning_rate": 4.400042035900194e-05, |
| "loss": 0.5947, |
| "mean_token_accuracy": 0.8301294282078743, |
| "num_tokens": 51003495.0, |
| "step": 3790 |
| }, |
| { |
| "epoch": 2.3142509135200973, |
| "grad_norm": 0.585687518119812, |
| "learning_rate": 4.359952690251984e-05, |
| "loss": 0.5748, |
| "mean_token_accuracy": 0.8376362308859825, |
| "num_tokens": 51143094.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.320341047503045, |
| "grad_norm": 0.5948687791824341, |
| "learning_rate": 4.320150407500935e-05, |
| "loss": 0.6192, |
| "mean_token_accuracy": 0.8263852804899215, |
| "num_tokens": 51270732.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 2.326431181485993, |
| "grad_norm": 0.7059959173202515, |
| "learning_rate": 4.28063690838156e-05, |
| "loss": 0.5937, |
| "mean_token_accuracy": 0.8340961948037148, |
| "num_tokens": 51408741.0, |
| "step": 3820 |
| }, |
| { |
| "epoch": 2.33252131546894, |
| "grad_norm": 0.5434718728065491, |
| "learning_rate": 4.241413901143673e-05, |
| "loss": 0.5754, |
| "mean_token_accuracy": 0.838581845164299, |
| "num_tokens": 51549085.0, |
| "step": 3830 |
| }, |
| { |
| "epoch": 2.338611449451888, |
| "grad_norm": 0.6144487857818604, |
| "learning_rate": 4.202483081478516e-05, |
| "loss": 0.5929, |
| "mean_token_accuracy": 0.8351974800229073, |
| "num_tokens": 51685054.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 2.3447015834348357, |
| "grad_norm": 0.6229000091552734, |
| "learning_rate": 4.163846132445465e-05, |
| "loss": 0.6022, |
| "mean_token_accuracy": 0.8321109473705292, |
| "num_tokens": 51817361.0, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.350791717417783, |
| "grad_norm": 0.6818157434463501, |
| "learning_rate": 4.125504724399264e-05, |
| "loss": 0.6114, |
| "mean_token_accuracy": 0.8261456057429314, |
| "num_tokens": 51944695.0, |
| "step": 3860 |
| }, |
| { |
| "epoch": 2.356881851400731, |
| "grad_norm": 0.663104772567749, |
| "learning_rate": 4.087460514917811e-05, |
| "loss": 0.5733, |
| "mean_token_accuracy": 0.8354582965373993, |
| "num_tokens": 52084047.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 2.3629719853836786, |
| "grad_norm": 0.6448875069618225, |
| "learning_rate": 4.0497151487305077e-05, |
| "loss": 0.5788, |
| "mean_token_accuracy": 0.8370290577411652, |
| "num_tokens": 52220298.0, |
| "step": 3880 |
| }, |
| { |
| "epoch": 2.369062119366626, |
| "grad_norm": 0.5701313018798828, |
| "learning_rate": 4.012270257647129e-05, |
| "loss": 0.5919, |
| "mean_token_accuracy": 0.8328420773148537, |
| "num_tokens": 52354394.0, |
| "step": 3890 |
| }, |
| { |
| "epoch": 2.3751522533495737, |
| "grad_norm": 0.5643812417984009, |
| "learning_rate": 3.9751274604873135e-05, |
| "loss": 0.5828, |
| "mean_token_accuracy": 0.8361553356051445, |
| "num_tokens": 52491783.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.3812423873325215, |
| "grad_norm": 0.5355440974235535, |
| "learning_rate": 3.938288363010543e-05, |
| "loss": 0.59, |
| "mean_token_accuracy": 0.8323336541652679, |
| "num_tokens": 52623569.0, |
| "step": 3910 |
| }, |
| { |
| "epoch": 2.387332521315469, |
| "grad_norm": 0.5486684441566467, |
| "learning_rate": 3.9017545578467416e-05, |
| "loss": 0.5766, |
| "mean_token_accuracy": 0.8349390685558319, |
| "num_tokens": 52758323.0, |
| "step": 3920 |
| }, |
| { |
| "epoch": 2.3934226552984166, |
| "grad_norm": 0.7338609099388123, |
| "learning_rate": 3.865527624427424e-05, |
| "loss": 0.6389, |
| "mean_token_accuracy": 0.8210031896829605, |
| "num_tokens": 52885390.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 2.3995127892813644, |
| "grad_norm": 0.6802729368209839, |
| "learning_rate": 3.829609128917399e-05, |
| "loss": 0.5642, |
| "mean_token_accuracy": 0.8403128817677498, |
| "num_tokens": 53021341.0, |
| "step": 3940 |
| }, |
| { |
| "epoch": 2.4056029232643117, |
| "grad_norm": 0.6056346297264099, |
| "learning_rate": 3.794000624147081e-05, |
| "loss": 0.5718, |
| "mean_token_accuracy": 0.8383283510804176, |
| "num_tokens": 53161898.0, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.4116930572472595, |
| "grad_norm": 0.6759582161903381, |
| "learning_rate": 3.758703649545342e-05, |
| "loss": 0.6012, |
| "mean_token_accuracy": 0.8309965297579766, |
| "num_tokens": 53297422.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 2.4177831912302072, |
| "grad_norm": 0.6293630599975586, |
| "learning_rate": 3.723719731072964e-05, |
| "loss": 0.6016, |
| "mean_token_accuracy": 0.8290236741304398, |
| "num_tokens": 53426293.0, |
| "step": 3970 |
| }, |
| { |
| "epoch": 2.4238733252131546, |
| "grad_norm": 0.5844866037368774, |
| "learning_rate": 3.689050381156668e-05, |
| "loss": 0.5933, |
| "mean_token_accuracy": 0.8323912978172302, |
| "num_tokens": 53558802.0, |
| "step": 3980 |
| }, |
| { |
| "epoch": 2.4299634591961023, |
| "grad_norm": 0.6043350100517273, |
| "learning_rate": 3.654697098623731e-05, |
| "loss": 0.5912, |
| "mean_token_accuracy": 0.8325126841664314, |
| "num_tokens": 53692702.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 2.43605359317905, |
| "grad_norm": 0.6182368397712708, |
| "learning_rate": 3.6206613686371874e-05, |
| "loss": 0.6284, |
| "mean_token_accuracy": 0.8247880086302757, |
| "num_tokens": 53822212.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.4421437271619975, |
| "grad_norm": 0.6524989604949951, |
| "learning_rate": 3.586944662631628e-05, |
| "loss": 0.5655, |
| "mean_token_accuracy": 0.8384448051452636, |
| "num_tokens": 53966156.0, |
| "step": 4010 |
| }, |
| { |
| "epoch": 2.4482338611449452, |
| "grad_norm": 0.6937422156333923, |
| "learning_rate": 3.5535484382495686e-05, |
| "loss": 0.5625, |
| "mean_token_accuracy": 0.840263594686985, |
| "num_tokens": 54105125.0, |
| "step": 4020 |
| }, |
| { |
| "epoch": 2.4543239951278926, |
| "grad_norm": 0.7187004685401917, |
| "learning_rate": 3.520474139278455e-05, |
| "loss": 0.638, |
| "mean_token_accuracy": 0.8233362153172493, |
| "num_tokens": 54230056.0, |
| "step": 4030 |
| }, |
| { |
| "epoch": 2.4604141291108403, |
| "grad_norm": 0.7127388715744019, |
| "learning_rate": 3.487723195588231e-05, |
| "loss": 0.6007, |
| "mean_token_accuracy": 0.8296995043754578, |
| "num_tokens": 54364012.0, |
| "step": 4040 |
| }, |
| { |
| "epoch": 2.466504263093788, |
| "grad_norm": 0.596787691116333, |
| "learning_rate": 3.455297023069529e-05, |
| "loss": 0.5619, |
| "mean_token_accuracy": 0.84009919911623, |
| "num_tokens": 54505256.0, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.472594397076736, |
| "grad_norm": 0.5992204546928406, |
| "learning_rate": 3.423197023572453e-05, |
| "loss": 0.5662, |
| "mean_token_accuracy": 0.8365229025483132, |
| "num_tokens": 54641531.0, |
| "step": 4060 |
| }, |
| { |
| "epoch": 2.4786845310596832, |
| "grad_norm": 0.5798633098602295, |
| "learning_rate": 3.391424584845983e-05, |
| "loss": 0.5957, |
| "mean_token_accuracy": 0.8325728610157966, |
| "num_tokens": 54777760.0, |
| "step": 4070 |
| }, |
| { |
| "epoch": 2.484774665042631, |
| "grad_norm": 0.5911865830421448, |
| "learning_rate": 3.359981080477968e-05, |
| "loss": 0.5621, |
| "mean_token_accuracy": 0.8406099453568459, |
| "num_tokens": 54920046.0, |
| "step": 4080 |
| }, |
| { |
| "epoch": 2.4908647990255783, |
| "grad_norm": 0.7955853343009949, |
| "learning_rate": 3.32886786983575e-05, |
| "loss": 0.6041, |
| "mean_token_accuracy": 0.830414567887783, |
| "num_tokens": 55049133.0, |
| "step": 4090 |
| }, |
| { |
| "epoch": 2.496954933008526, |
| "grad_norm": 0.7664952874183655, |
| "learning_rate": 3.29808629800739e-05, |
| "loss": 0.5649, |
| "mean_token_accuracy": 0.8381562843918801, |
| "num_tokens": 55189390.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.503045066991474, |
| "grad_norm": 0.7716973423957825, |
| "learning_rate": 3.267637695743531e-05, |
| "loss": 0.609, |
| "mean_token_accuracy": 0.8287163496017456, |
| "num_tokens": 55320616.0, |
| "step": 4110 |
| }, |
| { |
| "epoch": 2.5091352009744217, |
| "grad_norm": 0.8114868402481079, |
| "learning_rate": 3.237523379399847e-05, |
| "loss": 0.5971, |
| "mean_token_accuracy": 0.8342167064547539, |
| "num_tokens": 55453396.0, |
| "step": 4120 |
| }, |
| { |
| "epoch": 2.515225334957369, |
| "grad_norm": 0.5958247780799866, |
| "learning_rate": 3.207744650880153e-05, |
| "loss": 0.5599, |
| "mean_token_accuracy": 0.841068896651268, |
| "num_tokens": 55594828.0, |
| "step": 4130 |
| }, |
| { |
| "epoch": 2.5213154689403168, |
| "grad_norm": 0.5653116106987, |
| "learning_rate": 3.178302797580104e-05, |
| "loss": 0.6087, |
| "mean_token_accuracy": 0.8287098646163941, |
| "num_tokens": 55727082.0, |
| "step": 4140 |
| }, |
| { |
| "epoch": 2.527405602923264, |
| "grad_norm": 0.5179179310798645, |
| "learning_rate": 3.149199092331553e-05, |
| "loss": 0.5676, |
| "mean_token_accuracy": 0.839223000407219, |
| "num_tokens": 55871490.0, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.533495736906212, |
| "grad_norm": 0.7765582799911499, |
| "learning_rate": 3.1204347933475144e-05, |
| "loss": 0.6045, |
| "mean_token_accuracy": 0.8292981028556824, |
| "num_tokens": 56005734.0, |
| "step": 4160 |
| }, |
| { |
| "epoch": 2.5395858708891597, |
| "grad_norm": 0.7029058337211609, |
| "learning_rate": 3.0920111441677726e-05, |
| "loss": 0.6274, |
| "mean_token_accuracy": 0.8268977910280227, |
| "num_tokens": 56137399.0, |
| "step": 4170 |
| }, |
| { |
| "epoch": 2.5456760048721074, |
| "grad_norm": 0.5849781632423401, |
| "learning_rate": 3.063929373605119e-05, |
| "loss": 0.5984, |
| "mean_token_accuracy": 0.8308302730321884, |
| "num_tokens": 56269293.0, |
| "step": 4180 |
| }, |
| { |
| "epoch": 2.5517661388550548, |
| "grad_norm": 0.703788161277771, |
| "learning_rate": 3.0361906956922358e-05, |
| "loss": 0.5954, |
| "mean_token_accuracy": 0.831818374991417, |
| "num_tokens": 56400293.0, |
| "step": 4190 |
| }, |
| { |
| "epoch": 2.5578562728380025, |
| "grad_norm": 0.605482816696167, |
| "learning_rate": 3.0087963096291965e-05, |
| "loss": 0.5726, |
| "mean_token_accuracy": 0.8381782233715057, |
| "num_tokens": 56537449.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.56394640682095, |
| "grad_norm": 0.6472823619842529, |
| "learning_rate": 2.9817473997316338e-05, |
| "loss": 0.5827, |
| "mean_token_accuracy": 0.8334032818675041, |
| "num_tokens": 56671079.0, |
| "step": 4210 |
| }, |
| { |
| "epoch": 2.5700365408038977, |
| "grad_norm": 0.7700249552726746, |
| "learning_rate": 2.9550451353795366e-05, |
| "loss": 0.6021, |
| "mean_token_accuracy": 0.8302450269460678, |
| "num_tokens": 56807767.0, |
| "step": 4220 |
| }, |
| { |
| "epoch": 2.5761266747868454, |
| "grad_norm": 0.6605527400970459, |
| "learning_rate": 2.9286906709666923e-05, |
| "loss": 0.6154, |
| "mean_token_accuracy": 0.8274287924170494, |
| "num_tokens": 56935160.0, |
| "step": 4230 |
| }, |
| { |
| "epoch": 2.582216808769793, |
| "grad_norm": 0.5413617491722107, |
| "learning_rate": 2.902685145850781e-05, |
| "loss": 0.579, |
| "mean_token_accuracy": 0.8385426059365273, |
| "num_tokens": 57073941.0, |
| "step": 4240 |
| }, |
| { |
| "epoch": 2.5883069427527405, |
| "grad_norm": 0.7194695472717285, |
| "learning_rate": 2.8770296843041234e-05, |
| "loss": 0.6248, |
| "mean_token_accuracy": 0.8245450094342232, |
| "num_tokens": 57200681.0, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.5943970767356883, |
| "grad_norm": 0.5466388463973999, |
| "learning_rate": 2.851725395465068e-05, |
| "loss": 0.5959, |
| "mean_token_accuracy": 0.8355760648846626, |
| "num_tokens": 57336873.0, |
| "step": 4260 |
| }, |
| { |
| "epoch": 2.6004872107186356, |
| "grad_norm": 0.7041544318199158, |
| "learning_rate": 2.826773373290048e-05, |
| "loss": 0.6056, |
| "mean_token_accuracy": 0.8318590998649598, |
| "num_tokens": 57468023.0, |
| "step": 4270 |
| }, |
| { |
| "epoch": 2.6065773447015834, |
| "grad_norm": 0.6439122557640076, |
| "learning_rate": 2.8021746965062823e-05, |
| "loss": 0.5836, |
| "mean_token_accuracy": 0.8366597965359688, |
| "num_tokens": 57604960.0, |
| "step": 4280 |
| }, |
| { |
| "epoch": 2.612667478684531, |
| "grad_norm": 0.701906681060791, |
| "learning_rate": 2.7779304285651454e-05, |
| "loss": 0.5703, |
| "mean_token_accuracy": 0.8376422345638275, |
| "num_tokens": 57743772.0, |
| "step": 4290 |
| }, |
| { |
| "epoch": 2.6187576126674785, |
| "grad_norm": 0.5615540146827698, |
| "learning_rate": 2.754041617596182e-05, |
| "loss": 0.5661, |
| "mean_token_accuracy": 0.8376396596431732, |
| "num_tokens": 57888421.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.6248477466504263, |
| "grad_norm": 0.7024915814399719, |
| "learning_rate": 2.73050929636181e-05, |
| "loss": 0.5924, |
| "mean_token_accuracy": 0.8345223397016526, |
| "num_tokens": 58026002.0, |
| "step": 4310 |
| }, |
| { |
| "epoch": 2.630937880633374, |
| "grad_norm": 0.6473823189735413, |
| "learning_rate": 2.7073344822126588e-05, |
| "loss": 0.5799, |
| "mean_token_accuracy": 0.8390604540705681, |
| "num_tokens": 58167373.0, |
| "step": 4320 |
| }, |
| { |
| "epoch": 2.6370280146163214, |
| "grad_norm": 0.592150866985321, |
| "learning_rate": 2.6845181770435913e-05, |
| "loss": 0.5745, |
| "mean_token_accuracy": 0.8360892593860626, |
| "num_tokens": 58305631.0, |
| "step": 4330 |
| }, |
| { |
| "epoch": 2.643118148599269, |
| "grad_norm": 0.6310701966285706, |
| "learning_rate": 2.662061367250389e-05, |
| "loss": 0.6368, |
| "mean_token_accuracy": 0.8224513292312622, |
| "num_tokens": 58430217.0, |
| "step": 4340 |
| }, |
| { |
| "epoch": 2.649208282582217, |
| "grad_norm": 0.7892738580703735, |
| "learning_rate": 2.6399650236871114e-05, |
| "loss": 0.5984, |
| "mean_token_accuracy": 0.8304956495761872, |
| "num_tokens": 58559084.0, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.6552984165651643, |
| "grad_norm": 0.6933486461639404, |
| "learning_rate": 2.6182301016241194e-05, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.8244527041912079, |
| "num_tokens": 58684693.0, |
| "step": 4360 |
| }, |
| { |
| "epoch": 2.661388550548112, |
| "grad_norm": 0.7749532461166382, |
| "learning_rate": 2.5968575407067848e-05, |
| "loss": 0.6076, |
| "mean_token_accuracy": 0.8320668131113053, |
| "num_tokens": 58817986.0, |
| "step": 4370 |
| }, |
| { |
| "epoch": 2.6674786845310594, |
| "grad_norm": 0.5271658301353455, |
| "learning_rate": 2.5758482649148542e-05, |
| "loss": 0.5893, |
| "mean_token_accuracy": 0.8352309837937355, |
| "num_tokens": 58956432.0, |
| "step": 4380 |
| }, |
| { |
| "epoch": 2.673568818514007, |
| "grad_norm": 0.5268961191177368, |
| "learning_rate": 2.555203182522517e-05, |
| "loss": 0.5908, |
| "mean_token_accuracy": 0.8314955353736877, |
| "num_tokens": 59091108.0, |
| "step": 4390 |
| }, |
| { |
| "epoch": 2.679658952496955, |
| "grad_norm": 0.7242282629013062, |
| "learning_rate": 2.5349231860591298e-05, |
| "loss": 0.6036, |
| "mean_token_accuracy": 0.830773600935936, |
| "num_tokens": 59226935.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.6857490864799027, |
| "grad_norm": 0.6606787443161011, |
| "learning_rate": 2.515009152270638e-05, |
| "loss": 0.5947, |
| "mean_token_accuracy": 0.8281330943107605, |
| "num_tokens": 59358718.0, |
| "step": 4410 |
| }, |
| { |
| "epoch": 2.69183922046285, |
| "grad_norm": 0.5850993990898132, |
| "learning_rate": 2.4954619420816622e-05, |
| "loss": 0.5842, |
| "mean_token_accuracy": 0.8339527383446693, |
| "num_tokens": 59493400.0, |
| "step": 4420 |
| }, |
| { |
| "epoch": 2.697929354445798, |
| "grad_norm": 0.6853693127632141, |
| "learning_rate": 2.47628240055829e-05, |
| "loss": 0.5869, |
| "mean_token_accuracy": 0.832540349662304, |
| "num_tokens": 59628340.0, |
| "step": 4430 |
| }, |
| { |
| "epoch": 2.704019488428745, |
| "grad_norm": 0.5214850902557373, |
| "learning_rate": 2.457471356871536e-05, |
| "loss": 0.587, |
| "mean_token_accuracy": 0.8354447767138481, |
| "num_tokens": 59768106.0, |
| "step": 4440 |
| }, |
| { |
| "epoch": 2.710109622411693, |
| "grad_norm": 0.7274345755577087, |
| "learning_rate": 2.4390296242614934e-05, |
| "loss": 0.584, |
| "mean_token_accuracy": 0.83839912712574, |
| "num_tokens": 59902355.0, |
| "step": 4450 |
| }, |
| { |
| "epoch": 2.7161997563946407, |
| "grad_norm": 0.6061622500419617, |
| "learning_rate": 2.4209580000021777e-05, |
| "loss": 0.6142, |
| "mean_token_accuracy": 0.8286945581436157, |
| "num_tokens": 60033260.0, |
| "step": 4460 |
| }, |
| { |
| "epoch": 2.7222898903775885, |
| "grad_norm": 0.65688157081604, |
| "learning_rate": 2.403257265367063e-05, |
| "loss": 0.6134, |
| "mean_token_accuracy": 0.8292039141058922, |
| "num_tokens": 60165503.0, |
| "step": 4470 |
| }, |
| { |
| "epoch": 2.728380024360536, |
| "grad_norm": 0.671927273273468, |
| "learning_rate": 2.3859281855952982e-05, |
| "loss": 0.5613, |
| "mean_token_accuracy": 0.836935906112194, |
| "num_tokens": 60306999.0, |
| "step": 4480 |
| }, |
| { |
| "epoch": 2.7344701583434836, |
| "grad_norm": 0.7475078701972961, |
| "learning_rate": 2.3689715098586323e-05, |
| "loss": 0.5809, |
| "mean_token_accuracy": 0.8373750746250153, |
| "num_tokens": 60442655.0, |
| "step": 4490 |
| }, |
| { |
| "epoch": 2.740560292326431, |
| "grad_norm": 0.6505313515663147, |
| "learning_rate": 2.3523879712290205e-05, |
| "loss": 0.5699, |
| "mean_token_accuracy": 0.838838130235672, |
| "num_tokens": 60584753.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.7466504263093787, |
| "grad_norm": 0.6547593474388123, |
| "learning_rate": 2.336178286646933e-05, |
| "loss": 0.6043, |
| "mean_token_accuracy": 0.8289906069636345, |
| "num_tokens": 60716008.0, |
| "step": 4510 |
| }, |
| { |
| "epoch": 2.7527405602923265, |
| "grad_norm": 0.6438285708427429, |
| "learning_rate": 2.3203431568903587e-05, |
| "loss": 0.5943, |
| "mean_token_accuracy": 0.8335410162806511, |
| "num_tokens": 60852035.0, |
| "step": 4520 |
| }, |
| { |
| "epoch": 2.7588306942752743, |
| "grad_norm": 0.5707643628120422, |
| "learning_rate": 2.304883266544519e-05, |
| "loss": 0.5651, |
| "mean_token_accuracy": 0.8370564222335816, |
| "num_tokens": 60991573.0, |
| "step": 4530 |
| }, |
| { |
| "epoch": 2.7649208282582216, |
| "grad_norm": 0.7300040125846863, |
| "learning_rate": 2.2897992839722563e-05, |
| "loss": 0.6344, |
| "mean_token_accuracy": 0.8223764657974243, |
| "num_tokens": 61116667.0, |
| "step": 4540 |
| }, |
| { |
| "epoch": 2.7710109622411694, |
| "grad_norm": 0.781278133392334, |
| "learning_rate": 2.27509186128515e-05, |
| "loss": 0.5983, |
| "mean_token_accuracy": 0.8337039306759835, |
| "num_tokens": 61250372.0, |
| "step": 4550 |
| }, |
| { |
| "epoch": 2.7771010962241167, |
| "grad_norm": 0.6679220795631409, |
| "learning_rate": 2.260761634315322e-05, |
| "loss": 0.5621, |
| "mean_token_accuracy": 0.8398270666599273, |
| "num_tokens": 61396984.0, |
| "step": 4560 |
| }, |
| { |
| "epoch": 2.7831912302070645, |
| "grad_norm": 0.6907692551612854, |
| "learning_rate": 2.2468092225879466e-05, |
| "loss": 0.604, |
| "mean_token_accuracy": 0.8311845645308494, |
| "num_tokens": 61532028.0, |
| "step": 4570 |
| }, |
| { |
| "epoch": 2.7892813641900123, |
| "grad_norm": 0.5401070713996887, |
| "learning_rate": 2.2332352292944697e-05, |
| "loss": 0.6033, |
| "mean_token_accuracy": 0.8319179087877273, |
| "num_tokens": 61666037.0, |
| "step": 4580 |
| }, |
| { |
| "epoch": 2.79537149817296, |
| "grad_norm": 0.7261266112327576, |
| "learning_rate": 2.2200402412665298e-05, |
| "loss": 0.5679, |
| "mean_token_accuracy": 0.8387535363435745, |
| "num_tokens": 61806461.0, |
| "step": 4590 |
| }, |
| { |
| "epoch": 2.8014616321559074, |
| "grad_norm": 0.6658375263214111, |
| "learning_rate": 2.2072248289505863e-05, |
| "loss": 0.6096, |
| "mean_token_accuracy": 0.8302027896046639, |
| "num_tokens": 61936714.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.807551766138855, |
| "grad_norm": 0.6202580332756042, |
| "learning_rate": 2.194789546383265e-05, |
| "loss": 0.5684, |
| "mean_token_accuracy": 0.8411198407411575, |
| "num_tokens": 62074886.0, |
| "step": 4610 |
| }, |
| { |
| "epoch": 2.8136419001218025, |
| "grad_norm": 0.6867278218269348, |
| "learning_rate": 2.1827349311673956e-05, |
| "loss": 0.6041, |
| "mean_token_accuracy": 0.8294085919857025, |
| "num_tokens": 62205784.0, |
| "step": 4620 |
| }, |
| { |
| "epoch": 2.8197320341047503, |
| "grad_norm": 0.6569831967353821, |
| "learning_rate": 2.1710615044487803e-05, |
| "loss": 0.6202, |
| "mean_token_accuracy": 0.8258814692497254, |
| "num_tokens": 62334349.0, |
| "step": 4630 |
| }, |
| { |
| "epoch": 2.825822168087698, |
| "grad_norm": 0.6896407604217529, |
| "learning_rate": 2.1597697708936558e-05, |
| "loss": 0.5865, |
| "mean_token_accuracy": 0.8326056882739067, |
| "num_tokens": 62466787.0, |
| "step": 4640 |
| }, |
| { |
| "epoch": 2.831912302070646, |
| "grad_norm": 0.6883603930473328, |
| "learning_rate": 2.1488602186668787e-05, |
| "loss": 0.5853, |
| "mean_token_accuracy": 0.8355178698897362, |
| "num_tokens": 62602765.0, |
| "step": 4650 |
| }, |
| { |
| "epoch": 2.838002436053593, |
| "grad_norm": 0.6500746607780457, |
| "learning_rate": 2.1383333194108245e-05, |
| "loss": 0.5693, |
| "mean_token_accuracy": 0.8407857313752174, |
| "num_tokens": 62741245.0, |
| "step": 4660 |
| }, |
| { |
| "epoch": 2.844092570036541, |
| "grad_norm": 0.6719433665275574, |
| "learning_rate": 2.1281895282249874e-05, |
| "loss": 0.5867, |
| "mean_token_accuracy": 0.8356816500425339, |
| "num_tokens": 62876137.0, |
| "step": 4670 |
| }, |
| { |
| "epoch": 2.8501827040194883, |
| "grad_norm": 0.5427895188331604, |
| "learning_rate": 2.1184292836463194e-05, |
| "loss": 0.5869, |
| "mean_token_accuracy": 0.8338323414325715, |
| "num_tokens": 63010203.0, |
| "step": 4680 |
| }, |
| { |
| "epoch": 2.856272838002436, |
| "grad_norm": 0.6588513255119324, |
| "learning_rate": 2.10905300763026e-05, |
| "loss": 0.5824, |
| "mean_token_accuracy": 0.837106254696846, |
| "num_tokens": 63152549.0, |
| "step": 4690 |
| }, |
| { |
| "epoch": 2.862362971985384, |
| "grad_norm": 0.6410390138626099, |
| "learning_rate": 2.1000611055324987e-05, |
| "loss": 0.57, |
| "mean_token_accuracy": 0.8419015809893609, |
| "num_tokens": 63287537.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 2.868453105968331, |
| "grad_norm": 0.6098606586456299, |
| "learning_rate": 2.09145396609145e-05, |
| "loss": 0.5791, |
| "mean_token_accuracy": 0.8359826967120171, |
| "num_tokens": 63424589.0, |
| "step": 4710 |
| }, |
| { |
| "epoch": 2.874543239951279, |
| "grad_norm": 0.7105115652084351, |
| "learning_rate": 2.083231961411448e-05, |
| "loss": 0.5915, |
| "mean_token_accuracy": 0.8325518026947976, |
| "num_tokens": 63554175.0, |
| "step": 4720 |
| }, |
| { |
| "epoch": 2.8806333739342267, |
| "grad_norm": 0.648669421672821, |
| "learning_rate": 2.0753954469466614e-05, |
| "loss": 0.6047, |
| "mean_token_accuracy": 0.8295484691858291, |
| "num_tokens": 63682998.0, |
| "step": 4730 |
| }, |
| { |
| "epoch": 2.886723507917174, |
| "grad_norm": 0.657294750213623, |
| "learning_rate": 2.0679447614857204e-05, |
| "loss": 0.5921, |
| "mean_token_accuracy": 0.8348564639687538, |
| "num_tokens": 63820437.0, |
| "step": 4740 |
| }, |
| { |
| "epoch": 2.892813641900122, |
| "grad_norm": 0.5496834516525269, |
| "learning_rate": 2.0608802271370776e-05, |
| "loss": 0.5599, |
| "mean_token_accuracy": 0.8398202702403068, |
| "num_tokens": 63964469.0, |
| "step": 4750 |
| }, |
| { |
| "epoch": 2.8989037758830696, |
| "grad_norm": 0.7021865248680115, |
| "learning_rate": 2.0542021493150766e-05, |
| "loss": 0.6405, |
| "mean_token_accuracy": 0.8221626535058022, |
| "num_tokens": 64085097.0, |
| "step": 4760 |
| }, |
| { |
| "epoch": 2.904993909866017, |
| "grad_norm": 0.5804896950721741, |
| "learning_rate": 2.0479108167267523e-05, |
| "loss": 0.6079, |
| "mean_token_accuracy": 0.8265683457255364, |
| "num_tokens": 64215053.0, |
| "step": 4770 |
| }, |
| { |
| "epoch": 2.9110840438489647, |
| "grad_norm": 0.7602665424346924, |
| "learning_rate": 2.0420065013593475e-05, |
| "loss": 0.5827, |
| "mean_token_accuracy": 0.8372338116168976, |
| "num_tokens": 64348756.0, |
| "step": 4780 |
| }, |
| { |
| "epoch": 2.9171741778319125, |
| "grad_norm": 0.6796186566352844, |
| "learning_rate": 2.0364894584685548e-05, |
| "loss": 0.5918, |
| "mean_token_accuracy": 0.8328269824385643, |
| "num_tokens": 64484863.0, |
| "step": 4790 |
| }, |
| { |
| "epoch": 2.92326431181486, |
| "grad_norm": 0.5706749558448792, |
| "learning_rate": 2.0313599265674807e-05, |
| "loss": 0.572, |
| "mean_token_accuracy": 0.8380152434110641, |
| "num_tokens": 64621317.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 2.9293544457978076, |
| "grad_norm": 0.7671577334403992, |
| "learning_rate": 2.0266181274163366e-05, |
| "loss": 0.5842, |
| "mean_token_accuracy": 0.832770548760891, |
| "num_tokens": 64753894.0, |
| "step": 4810 |
| }, |
| { |
| "epoch": 2.9354445797807553, |
| "grad_norm": 0.6405246257781982, |
| "learning_rate": 2.022264266012849e-05, |
| "loss": 0.5511, |
| "mean_token_accuracy": 0.8441521510481834, |
| "num_tokens": 64897014.0, |
| "step": 4820 |
| }, |
| { |
| "epoch": 2.9415347137637027, |
| "grad_norm": 0.6317846179008484, |
| "learning_rate": 2.0182985305833967e-05, |
| "loss": 0.5965, |
| "mean_token_accuracy": 0.833104345202446, |
| "num_tokens": 65029007.0, |
| "step": 4830 |
| }, |
| { |
| "epoch": 2.9476248477466505, |
| "grad_norm": 0.5668535232543945, |
| "learning_rate": 2.0147210925748773e-05, |
| "loss": 0.577, |
| "mean_token_accuracy": 0.8383101314306259, |
| "num_tokens": 65166793.0, |
| "step": 4840 |
| }, |
| { |
| "epoch": 2.953714981729598, |
| "grad_norm": 0.5817210674285889, |
| "learning_rate": 2.0115321066472894e-05, |
| "loss": 0.5967, |
| "mean_token_accuracy": 0.8329069703817368, |
| "num_tokens": 65302274.0, |
| "step": 4850 |
| }, |
| { |
| "epoch": 2.9598051157125456, |
| "grad_norm": 0.764815628528595, |
| "learning_rate": 2.0087317106670535e-05, |
| "loss": 0.6026, |
| "mean_token_accuracy": 0.830155149102211, |
| "num_tokens": 65434310.0, |
| "step": 4860 |
| }, |
| { |
| "epoch": 2.9658952496954933, |
| "grad_norm": 0.6195541620254517, |
| "learning_rate": 2.0063200257010438e-05, |
| "loss": 0.5662, |
| "mean_token_accuracy": 0.8380124986171722, |
| "num_tokens": 65571543.0, |
| "step": 4870 |
| }, |
| { |
| "epoch": 2.971985383678441, |
| "grad_norm": 0.680578351020813, |
| "learning_rate": 2.0042971560113606e-05, |
| "loss": 0.5897, |
| "mean_token_accuracy": 0.8346181452274323, |
| "num_tokens": 65705371.0, |
| "step": 4880 |
| }, |
| { |
| "epoch": 2.9780755176613884, |
| "grad_norm": 0.5930168628692627, |
| "learning_rate": 2.002663189050819e-05, |
| "loss": 0.5779, |
| "mean_token_accuracy": 0.8375312358140945, |
| "num_tokens": 65842664.0, |
| "step": 4890 |
| }, |
| { |
| "epoch": 2.9841656516443362, |
| "grad_norm": 0.5821495652198792, |
| "learning_rate": 2.001418195459171e-05, |
| "loss": 0.6032, |
| "mean_token_accuracy": 0.8325582191348075, |
| "num_tokens": 65977282.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 2.9902557856272836, |
| "grad_norm": 0.7727954983711243, |
| "learning_rate": 2.0005622290600484e-05, |
| "loss": 0.6033, |
| "mean_token_accuracy": 0.829984401166439, |
| "num_tokens": 66105864.0, |
| "step": 4910 |
| }, |
| { |
| "epoch": 2.9963459196102313, |
| "grad_norm": 0.7271150350570679, |
| "learning_rate": 2.000095326858638e-05, |
| "loss": 0.5762, |
| "mean_token_accuracy": 0.8358065068721772, |
| "num_tokens": 66241202.0, |
| "step": 4920 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 4926, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.098300386001027e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
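The fields above (`log_history`, `logging_steps`, `max_steps`, `stateful_callbacks.TrainerControl`, `total_flos`, `trial_name`) match the trainer-state file written by the Hugging Face `Trainer`. As a reading aid only, and not part of the state file itself, here is a minimal sketch of how the logged metrics could be loaded and summarized, assuming the content is saved as `trainer_state.json`:

```python
# Hypothetical usage sketch: load the trainer state shown above and
# summarize the logged training metrics. Only keys that appear in the
# file (log_history, logging_steps, max_steps, loss, step, ...) are used.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
final = history[-1]

print(f"logged entries: {len(history)} "
      f"(every {state['logging_steps']} steps, max_steps={state['max_steps']})")
print(f"final step {final['step']}: loss={final['loss']}, "
      f"mean_token_accuracy={final['mean_token_accuracy']:.4f}, "
      f"learning_rate={final['learning_rate']:.2e}")

# Mean loss over the last 10 logged entries, as a rough end-of-run summary.
tail = [entry["loss"] for entry in history[-10:]]
print(f"mean loss over last {len(tail)} logs: {sum(tail) / len(tail):.4f}")
```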