| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 1642, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0060901339829476245, |
| "grad_norm": 23.5319766998291, |
| "learning_rate": 1.2162162162162164e-05, |
| "loss": 4.5905, |
| "mean_token_accuracy": 0.3401473943144083, |
| "num_tokens": 132681.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.012180267965895249, |
| "grad_norm": 6.916630744934082, |
| "learning_rate": 2.5675675675675675e-05, |
| "loss": 3.957, |
| "mean_token_accuracy": 0.3799716055393219, |
| "num_tokens": 264238.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.018270401948842874, |
| "grad_norm": 2.717982292175293, |
| "learning_rate": 3.918918918918919e-05, |
| "loss": 3.047, |
| "mean_token_accuracy": 0.46811963245272636, |
| "num_tokens": 401308.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.024360535931790498, |
| "grad_norm": 2.407865524291992, |
| "learning_rate": 5.27027027027027e-05, |
| "loss": 2.5322, |
| "mean_token_accuracy": 0.5190828196704388, |
| "num_tokens": 532444.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.030450669914738125, |
| "grad_norm": 1.018301248550415, |
| "learning_rate": 6.621621621621621e-05, |
| "loss": 2.1325, |
| "mean_token_accuracy": 0.5782605841755867, |
| "num_tokens": 660406.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03654080389768575, |
| "grad_norm": 0.7315741181373596, |
| "learning_rate": 7.972972972972974e-05, |
| "loss": 1.9044, |
| "mean_token_accuracy": 0.6264914631843567, |
| "num_tokens": 795304.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04263093788063337, |
| "grad_norm": 0.6652920246124268, |
| "learning_rate": 9.324324324324324e-05, |
| "loss": 1.633, |
| "mean_token_accuracy": 0.6683938711881637, |
| "num_tokens": 934543.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.048721071863580996, |
| "grad_norm": 0.6119660139083862, |
| "learning_rate": 0.00010675675675675677, |
| "loss": 1.543, |
| "mean_token_accuracy": 0.6834091022610664, |
| "num_tokens": 1070669.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.05481120584652863, |
| "grad_norm": 0.591424286365509, |
| "learning_rate": 0.00012027027027027027, |
| "loss": 1.4154, |
| "mean_token_accuracy": 0.6991497233510018, |
| "num_tokens": 1211114.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.06090133982947625, |
| "grad_norm": 0.5663530230522156, |
| "learning_rate": 0.0001337837837837838, |
| "loss": 1.3176, |
| "mean_token_accuracy": 0.7089206710457802, |
| "num_tokens": 1349584.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.06699147381242387, |
| "grad_norm": 0.5881878137588501, |
| "learning_rate": 0.0001472972972972973, |
| "loss": 1.2293, |
| "mean_token_accuracy": 0.7254403859376908, |
| "num_tokens": 1487515.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0730816077953715, |
| "grad_norm": 0.7664394974708557, |
| "learning_rate": 0.00016081081081081083, |
| "loss": 1.1814, |
| "mean_token_accuracy": 0.7306812778115273, |
| "num_tokens": 1618603.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.07917174177831912, |
| "grad_norm": 0.6155670881271362, |
| "learning_rate": 0.00017432432432432432, |
| "loss": 1.1967, |
| "mean_token_accuracy": 0.7284250959753991, |
| "num_tokens": 1750466.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.08526187576126674, |
| "grad_norm": 0.5296258330345154, |
| "learning_rate": 0.00018783783783783784, |
| "loss": 1.0955, |
| "mean_token_accuracy": 0.7472824215888977, |
| "num_tokens": 1887913.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.09135200974421437, |
| "grad_norm": 0.5564976334571838, |
| "learning_rate": 0.00019999998054550544, |
| "loss": 1.118, |
| "mean_token_accuracy": 0.7397311359643937, |
| "num_tokens": 2018579.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.09744214372716199, |
| "grad_norm": 0.5301142930984497, |
| "learning_rate": 0.00019999764601633156, |
| "loss": 1.045, |
| "mean_token_accuracy": 0.7519380420446395, |
| "num_tokens": 2158851.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.10353227771010962, |
| "grad_norm": 0.5949111580848694, |
| "learning_rate": 0.00019999142070388495, |
| "loss": 1.0497, |
| "mean_token_accuracy": 0.7520910769701004, |
| "num_tokens": 2296715.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.10962241169305725, |
| "grad_norm": 0.6169262528419495, |
| "learning_rate": 0.0001999813048772986, |
| "loss": 1.0821, |
| "mean_token_accuracy": 0.7406247839331627, |
| "num_tokens": 2424756.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.11571254567600488, |
| "grad_norm": 0.58912593126297, |
| "learning_rate": 0.00019996729897390057, |
| "loss": 1.0286, |
| "mean_token_accuracy": 0.7527454376220704, |
| "num_tokens": 2559362.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.1218026796589525, |
| "grad_norm": 0.5084304213523865, |
| "learning_rate": 0.00019994940359919483, |
| "loss": 0.992, |
| "mean_token_accuracy": 0.7640391126275062, |
| "num_tokens": 2700231.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.1278928136419001, |
| "grad_norm": 0.5790796279907227, |
| "learning_rate": 0.00019992761952683516, |
| "loss": 1.0146, |
| "mean_token_accuracy": 0.7554366230964661, |
| "num_tokens": 2831324.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.13398294762484775, |
| "grad_norm": 0.5852051377296448, |
| "learning_rate": 0.00019990194769859188, |
| "loss": 0.978, |
| "mean_token_accuracy": 0.7612502485513687, |
| "num_tokens": 2967346.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.14007308160779536, |
| "grad_norm": 0.5102785229682922, |
| "learning_rate": 0.00019987238922431088, |
| "loss": 0.9616, |
| "mean_token_accuracy": 0.7677591517567635, |
| "num_tokens": 3110936.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.146163215590743, |
| "grad_norm": 0.5472669005393982, |
| "learning_rate": 0.00019983894538186576, |
| "loss": 0.9535, |
| "mean_token_accuracy": 0.76737689524889, |
| "num_tokens": 3247496.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.15225334957369063, |
| "grad_norm": 0.5611053109169006, |
| "learning_rate": 0.0001998016176171026, |
| "loss": 0.9577, |
| "mean_token_accuracy": 0.7626092001795769, |
| "num_tokens": 3384178.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.15834348355663824, |
| "grad_norm": 0.54055255651474, |
| "learning_rate": 0.0001997604075437774, |
| "loss": 0.9907, |
| "mean_token_accuracy": 0.7575223430991173, |
| "num_tokens": 3517617.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.16443361753958588, |
| "grad_norm": 0.558316707611084, |
| "learning_rate": 0.0001997153169434864, |
| "loss": 0.944, |
| "mean_token_accuracy": 0.7664194419980049, |
| "num_tokens": 3662878.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.1705237515225335, |
| "grad_norm": 0.49766939878463745, |
| "learning_rate": 0.0001996663477655889, |
| "loss": 0.9106, |
| "mean_token_accuracy": 0.7760038167238236, |
| "num_tokens": 3807411.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.17661388550548113, |
| "grad_norm": 0.4953667223453522, |
| "learning_rate": 0.0001996135021271232, |
| "loss": 0.9687, |
| "mean_token_accuracy": 0.7605679705739021, |
| "num_tokens": 3936840.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.18270401948842874, |
| "grad_norm": 0.5447947978973389, |
| "learning_rate": 0.00019955678231271484, |
| "loss": 0.9625, |
| "mean_token_accuracy": 0.7603292793035508, |
| "num_tokens": 4067826.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.18879415347137637, |
| "grad_norm": 0.4665842056274414, |
| "learning_rate": 0.00019949619077447807, |
| "loss": 0.9372, |
| "mean_token_accuracy": 0.7676101759076118, |
| "num_tokens": 4205887.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.19488428745432398, |
| "grad_norm": 0.515690267086029, |
| "learning_rate": 0.00019943173013190965, |
| "loss": 0.923, |
| "mean_token_accuracy": 0.7708473294973374, |
| "num_tokens": 4342894.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.20097442143727162, |
| "grad_norm": 0.5831382274627686, |
| "learning_rate": 0.00019936340317177565, |
| "loss": 0.9203, |
| "mean_token_accuracy": 0.7708552837371826, |
| "num_tokens": 4477651.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.20706455542021923, |
| "grad_norm": 0.6162773966789246, |
| "learning_rate": 0.0001992912128479911, |
| "loss": 0.916, |
| "mean_token_accuracy": 0.7702088996767997, |
| "num_tokens": 4610746.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.21315468940316687, |
| "grad_norm": 0.5172462463378906, |
| "learning_rate": 0.00019921516228149207, |
| "loss": 0.8942, |
| "mean_token_accuracy": 0.7741821393370628, |
| "num_tokens": 4751175.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2192448233861145, |
| "grad_norm": 0.5890468955039978, |
| "learning_rate": 0.0001991352547601009, |
| "loss": 0.9229, |
| "mean_token_accuracy": 0.7691043332219124, |
| "num_tokens": 4882328.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.22533495736906212, |
| "grad_norm": 0.5522404909133911, |
| "learning_rate": 0.00019905149373838408, |
| "loss": 0.9294, |
| "mean_token_accuracy": 0.7646071568131447, |
| "num_tokens": 5012181.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.23142509135200975, |
| "grad_norm": 0.5349445939064026, |
| "learning_rate": 0.0001989638828375028, |
| "loss": 0.8797, |
| "mean_token_accuracy": 0.7771721839904785, |
| "num_tokens": 5151133.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.23751522533495736, |
| "grad_norm": 0.531052827835083, |
| "learning_rate": 0.00019887242584505635, |
| "loss": 0.9221, |
| "mean_token_accuracy": 0.7678465083241462, |
| "num_tokens": 5279790.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.243605359317905, |
| "grad_norm": 0.5126324892044067, |
| "learning_rate": 0.00019877712671491864, |
| "loss": 0.8862, |
| "mean_token_accuracy": 0.7739894777536392, |
| "num_tokens": 5412390.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2496954933008526, |
| "grad_norm": 0.5111438632011414, |
| "learning_rate": 0.00019867798956706693, |
| "loss": 0.9005, |
| "mean_token_accuracy": 0.7721902653574944, |
| "num_tokens": 5545801.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2557856272838002, |
| "grad_norm": 0.5488138794898987, |
| "learning_rate": 0.00019857501868740402, |
| "loss": 0.8988, |
| "mean_token_accuracy": 0.7690282896161079, |
| "num_tokens": 5673758.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.2618757612667479, |
| "grad_norm": 0.5497994422912598, |
| "learning_rate": 0.0001984682185275727, |
| "loss": 0.8802, |
| "mean_token_accuracy": 0.7780183687806129, |
| "num_tokens": 5813158.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.2679658952496955, |
| "grad_norm": 0.5478431582450867, |
| "learning_rate": 0.0001983575937047635, |
| "loss": 0.865, |
| "mean_token_accuracy": 0.7785944610834121, |
| "num_tokens": 5947367.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.2740560292326431, |
| "grad_norm": 0.5188766717910767, |
| "learning_rate": 0.00019824314900151487, |
| "loss": 0.8798, |
| "mean_token_accuracy": 0.7752803862094879, |
| "num_tokens": 6081060.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2801461632155907, |
| "grad_norm": 0.530222475528717, |
| "learning_rate": 0.00019812488936550666, |
| "loss": 0.8628, |
| "mean_token_accuracy": 0.7801630645990372, |
| "num_tokens": 6217834.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.2862362971985384, |
| "grad_norm": 0.5987964868545532, |
| "learning_rate": 0.00019800281990934614, |
| "loss": 0.8775, |
| "mean_token_accuracy": 0.7760324433445931, |
| "num_tokens": 6350451.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.292326431181486, |
| "grad_norm": 0.5468559265136719, |
| "learning_rate": 0.0001978769459103468, |
| "loss": 0.8721, |
| "mean_token_accuracy": 0.7794204503297806, |
| "num_tokens": 6484738.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.2984165651644336, |
| "grad_norm": 0.5541098117828369, |
| "learning_rate": 0.0001977472728103005, |
| "loss": 0.8785, |
| "mean_token_accuracy": 0.7767582029104233, |
| "num_tokens": 6619313.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.30450669914738127, |
| "grad_norm": 0.5134281516075134, |
| "learning_rate": 0.0001976138062152419, |
| "loss": 0.8717, |
| "mean_token_accuracy": 0.7752724394202233, |
| "num_tokens": 6753195.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3105968331303289, |
| "grad_norm": 0.49164435267448425, |
| "learning_rate": 0.00019747655189520633, |
| "loss": 0.8757, |
| "mean_token_accuracy": 0.7768464118242264, |
| "num_tokens": 6890448.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.3166869671132765, |
| "grad_norm": 0.5899345278739929, |
| "learning_rate": 0.00019733551578398023, |
| "loss": 0.8322, |
| "mean_token_accuracy": 0.7859320402145386, |
| "num_tokens": 7027488.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.3227771010962241, |
| "grad_norm": 0.6552841663360596, |
| "learning_rate": 0.0001971907039788447, |
| "loss": 0.861, |
| "mean_token_accuracy": 0.7770532324910164, |
| "num_tokens": 7161184.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.32886723507917176, |
| "grad_norm": 0.5038822889328003, |
| "learning_rate": 0.0001970421227403117, |
| "loss": 0.8825, |
| "mean_token_accuracy": 0.775890800356865, |
| "num_tokens": 7294399.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.33495736906211937, |
| "grad_norm": 0.5094267129898071, |
| "learning_rate": 0.00019688977849185378, |
| "loss": 0.8598, |
| "mean_token_accuracy": 0.7817838475108146, |
| "num_tokens": 7427183.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.341047503045067, |
| "grad_norm": 0.5282809138298035, |
| "learning_rate": 0.00019673367781962594, |
| "loss": 0.8463, |
| "mean_token_accuracy": 0.7812959104776382, |
| "num_tokens": 7561734.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3471376370280146, |
| "grad_norm": 0.45355409383773804, |
| "learning_rate": 0.00019657382747218123, |
| "loss": 0.8207, |
| "mean_token_accuracy": 0.7888262197375298, |
| "num_tokens": 7706228.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.35322777101096225, |
| "grad_norm": 0.5162333846092224, |
| "learning_rate": 0.00019641023436017883, |
| "loss": 0.8235, |
| "mean_token_accuracy": 0.7868947923183441, |
| "num_tokens": 7846684.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.35931790499390986, |
| "grad_norm": 0.5194632411003113, |
| "learning_rate": 0.00019624290555608526, |
| "loss": 0.8129, |
| "mean_token_accuracy": 0.7884069249033928, |
| "num_tokens": 7986811.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3654080389768575, |
| "grad_norm": 0.5494846701622009, |
| "learning_rate": 0.00019607184829386882, |
| "loss": 0.8084, |
| "mean_token_accuracy": 0.7874000474810601, |
| "num_tokens": 8124538.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.37149817295980514, |
| "grad_norm": 0.5368776917457581, |
| "learning_rate": 0.0001958970699686866, |
| "loss": 0.8225, |
| "mean_token_accuracy": 0.783010233938694, |
| "num_tokens": 8260529.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.37758830694275275, |
| "grad_norm": 0.6229024529457092, |
| "learning_rate": 0.00019571857813656496, |
| "loss": 0.8786, |
| "mean_token_accuracy": 0.7753148928284646, |
| "num_tokens": 8389042.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.38367844092570036, |
| "grad_norm": 0.5601000785827637, |
| "learning_rate": 0.00019553638051407279, |
| "loss": 0.8909, |
| "mean_token_accuracy": 0.7745720192790031, |
| "num_tokens": 8513603.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.38976857490864797, |
| "grad_norm": 0.438970685005188, |
| "learning_rate": 0.0001953504849779879, |
| "loss": 0.8085, |
| "mean_token_accuracy": 0.7871840804815292, |
| "num_tokens": 8652970.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.39585870889159563, |
| "grad_norm": 0.5505132079124451, |
| "learning_rate": 0.00019516089956495648, |
| "loss": 0.8102, |
| "mean_token_accuracy": 0.7869585514068603, |
| "num_tokens": 8792103.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.40194884287454324, |
| "grad_norm": 0.5447221398353577, |
| "learning_rate": 0.00019496763247114581, |
| "loss": 0.8336, |
| "mean_token_accuracy": 0.7816034242510795, |
| "num_tokens": 8926853.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.40803897685749085, |
| "grad_norm": 0.4652746915817261, |
| "learning_rate": 0.00019477069205188965, |
| "loss": 0.8383, |
| "mean_token_accuracy": 0.7826304718852043, |
| "num_tokens": 9059592.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.41412911084043846, |
| "grad_norm": 0.42363590002059937, |
| "learning_rate": 0.00019457008682132726, |
| "loss": 0.847, |
| "mean_token_accuracy": 0.7810002073645592, |
| "num_tokens": 9193062.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.42021924482338613, |
| "grad_norm": 0.5209478735923767, |
| "learning_rate": 0.00019436582545203518, |
| "loss": 0.8766, |
| "mean_token_accuracy": 0.7733785718679428, |
| "num_tokens": 9315805.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.42630937880633374, |
| "grad_norm": 0.5176642537117004, |
| "learning_rate": 0.00019415791677465237, |
| "loss": 0.8155, |
| "mean_token_accuracy": 0.7869213685393334, |
| "num_tokens": 9448863.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.43239951278928135, |
| "grad_norm": 0.4531058371067047, |
| "learning_rate": 0.00019394636977749843, |
| "loss": 0.8096, |
| "mean_token_accuracy": 0.7903949975967407, |
| "num_tokens": 9589382.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.438489646772229, |
| "grad_norm": 0.5651549100875854, |
| "learning_rate": 0.000193731193606185, |
| "loss": 0.8263, |
| "mean_token_accuracy": 0.7823062822222709, |
| "num_tokens": 9723562.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4445797807551766, |
| "grad_norm": 0.5377989411354065, |
| "learning_rate": 0.00019351239756322031, |
| "loss": 0.7993, |
| "mean_token_accuracy": 0.7908329650759697, |
| "num_tokens": 9859255.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.45066991473812423, |
| "grad_norm": 0.5420868396759033, |
| "learning_rate": 0.00019328999110760722, |
| "loss": 0.8461, |
| "mean_token_accuracy": 0.7780480548739434, |
| "num_tokens": 9981578.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.45676004872107184, |
| "grad_norm": 0.4889216125011444, |
| "learning_rate": 0.000193063983854434, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7959530428051949, |
| "num_tokens": 10122922.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.4628501827040195, |
| "grad_norm": 0.5044087767601013, |
| "learning_rate": 0.00019283438557445893, |
| "loss": 0.824, |
| "mean_token_accuracy": 0.7845935523509979, |
| "num_tokens": 10252854.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.4689403166869671, |
| "grad_norm": 0.5286466479301453, |
| "learning_rate": 0.00019260120619368773, |
| "loss": 0.815, |
| "mean_token_accuracy": 0.7850656941533088, |
| "num_tokens": 10385075.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.47503045066991473, |
| "grad_norm": 0.5441628694534302, |
| "learning_rate": 0.00019236445579294437, |
| "loss": 0.8048, |
| "mean_token_accuracy": 0.7876680314540863, |
| "num_tokens": 10520011.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.48112058465286234, |
| "grad_norm": 0.49002447724342346, |
| "learning_rate": 0.0001921241446074355, |
| "loss": 0.8059, |
| "mean_token_accuracy": 0.7898563235998154, |
| "num_tokens": 10652488.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.48721071863581, |
| "grad_norm": 0.4479144811630249, |
| "learning_rate": 0.0001918802830263077, |
| "loss": 0.7913, |
| "mean_token_accuracy": 0.7928732186555862, |
| "num_tokens": 10785974.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.4933008526187576, |
| "grad_norm": 0.5007497668266296, |
| "learning_rate": 0.00019163288159219853, |
| "loss": 0.8083, |
| "mean_token_accuracy": 0.7893043681979179, |
| "num_tokens": 10920950.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.4993909866017052, |
| "grad_norm": 0.5289483070373535, |
| "learning_rate": 0.00019138195100078064, |
| "loss": 0.8033, |
| "mean_token_accuracy": 0.7864485770463944, |
| "num_tokens": 11056380.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5054811205846529, |
| "grad_norm": 0.5604159832000732, |
| "learning_rate": 0.0001911275021002994, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7946401730179786, |
| "num_tokens": 11196074.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5115712545676004, |
| "grad_norm": 0.43645399808883667, |
| "learning_rate": 0.00019086954589110397, |
| "loss": 0.7724, |
| "mean_token_accuracy": 0.7990294560790062, |
| "num_tokens": 11337990.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5176613885505481, |
| "grad_norm": 0.43992146849632263, |
| "learning_rate": 0.0001906080935251716, |
| "loss": 0.7612, |
| "mean_token_accuracy": 0.7999786615371705, |
| "num_tokens": 11481565.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5237515225334958, |
| "grad_norm": 0.5595120191574097, |
| "learning_rate": 0.0001903431563056256, |
| "loss": 0.8266, |
| "mean_token_accuracy": 0.7859750911593437, |
| "num_tokens": 11611714.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5298416565164433, |
| "grad_norm": 0.5001987218856812, |
| "learning_rate": 0.0001900747456862467, |
| "loss": 0.8506, |
| "mean_token_accuracy": 0.779585388302803, |
| "num_tokens": 11736573.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.535931790499391, |
| "grad_norm": 0.430147647857666, |
| "learning_rate": 0.00018980287327097784, |
| "loss": 0.7707, |
| "mean_token_accuracy": 0.795211361348629, |
| "num_tokens": 11876859.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5420219244823387, |
| "grad_norm": 0.5346289873123169, |
| "learning_rate": 0.00018952755081342245, |
| "loss": 0.8057, |
| "mean_token_accuracy": 0.7871127843856811, |
| "num_tokens": 12007654.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.5481120584652862, |
| "grad_norm": 0.46072253584861755, |
| "learning_rate": 0.00018924879021633653, |
| "loss": 0.7924, |
| "mean_token_accuracy": 0.7913773030042648, |
| "num_tokens": 12140520.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5542021924482339, |
| "grad_norm": 0.4803653955459595, |
| "learning_rate": 0.00018896660353111375, |
| "loss": 0.8398, |
| "mean_token_accuracy": 0.7807079553604126, |
| "num_tokens": 12267219.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.5602923264311814, |
| "grad_norm": 0.5219636559486389, |
| "learning_rate": 0.0001886810029572647, |
| "loss": 0.7612, |
| "mean_token_accuracy": 0.7993015512824059, |
| "num_tokens": 12404646.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.5663824604141291, |
| "grad_norm": 0.501483142375946, |
| "learning_rate": 0.00018839200084188936, |
| "loss": 0.7953, |
| "mean_token_accuracy": 0.787814213335514, |
| "num_tokens": 12538219.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.5724725943970768, |
| "grad_norm": 0.47334522008895874, |
| "learning_rate": 0.00018809960967914346, |
| "loss": 0.789, |
| "mean_token_accuracy": 0.7928574904799461, |
| "num_tokens": 12673805.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.5785627283800243, |
| "grad_norm": 0.5057492852210999, |
| "learning_rate": 0.00018780384210969806, |
| "loss": 0.7746, |
| "mean_token_accuracy": 0.7947553545236588, |
| "num_tokens": 12811727.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.584652862362972, |
| "grad_norm": 0.5179910659790039, |
| "learning_rate": 0.00018750471092019325, |
| "loss": 0.7962, |
| "mean_token_accuracy": 0.7905686929821968, |
| "num_tokens": 12947641.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.5907429963459196, |
| "grad_norm": 0.45797088742256165, |
| "learning_rate": 0.00018720222904268543, |
| "loss": 0.7678, |
| "mean_token_accuracy": 0.7969774708151818, |
| "num_tokens": 13083869.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.5968331303288672, |
| "grad_norm": 0.48360612988471985, |
| "learning_rate": 0.00018689640955408803, |
| "loss": 0.7996, |
| "mean_token_accuracy": 0.7885591968894005, |
| "num_tokens": 13211807.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6029232643118149, |
| "grad_norm": 0.4378497004508972, |
| "learning_rate": 0.00018658726567560635, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7969291344285011, |
| "num_tokens": 13351856.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6090133982947625, |
| "grad_norm": 0.4857536852359772, |
| "learning_rate": 0.00018627481077216577, |
| "loss": 0.7786, |
| "mean_token_accuracy": 0.7914443418383599, |
| "num_tokens": 13486443.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6151035322777101, |
| "grad_norm": 0.5233064293861389, |
| "learning_rate": 0.0001859590583518343, |
| "loss": 0.8241, |
| "mean_token_accuracy": 0.7811850637197495, |
| "num_tokens": 13612035.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6211936662606578, |
| "grad_norm": 0.5328738689422607, |
| "learning_rate": 0.00018564002206523816, |
| "loss": 0.7502, |
| "mean_token_accuracy": 0.7993430674076081, |
| "num_tokens": 13756509.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.6272838002436053, |
| "grad_norm": 0.47962310910224915, |
| "learning_rate": 0.000185317715704972, |
| "loss": 0.7984, |
| "mean_token_accuracy": 0.7864531084895134, |
| "num_tokens": 13883033.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.633373934226553, |
| "grad_norm": 0.5685893893241882, |
| "learning_rate": 0.0001849921532050024, |
| "loss": 0.7869, |
| "mean_token_accuracy": 0.7909937381744385, |
| "num_tokens": 14015234.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.6394640682095006, |
| "grad_norm": 0.49146631360054016, |
| "learning_rate": 0.00018466334864006566, |
| "loss": 0.7952, |
| "mean_token_accuracy": 0.7878949210047722, |
| "num_tokens": 14149319.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6455542021924482, |
| "grad_norm": 0.5556225776672363, |
| "learning_rate": 0.0001843313162250591, |
| "loss": 0.7524, |
| "mean_token_accuracy": 0.7994373366236687, |
| "num_tokens": 14286868.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.6516443361753959, |
| "grad_norm": 0.511379063129425, |
| "learning_rate": 0.00018399607031442666, |
| "loss": 0.7929, |
| "mean_token_accuracy": 0.7921562284231186, |
| "num_tokens": 14418354.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.6577344701583435, |
| "grad_norm": 0.5019840598106384, |
| "learning_rate": 0.00018365762540153836, |
| "loss": 0.758, |
| "mean_token_accuracy": 0.7989353060722351, |
| "num_tokens": 14553174.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.6638246041412911, |
| "grad_norm": 0.6032467484474182, |
| "learning_rate": 0.00018331599611806366, |
| "loss": 0.7888, |
| "mean_token_accuracy": 0.7903819754719734, |
| "num_tokens": 14681393.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.6699147381242387, |
| "grad_norm": 0.5369830131530762, |
| "learning_rate": 0.00018297119723333877, |
| "loss": 0.765, |
| "mean_token_accuracy": 0.7950262635946274, |
| "num_tokens": 14814565.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6760048721071864, |
| "grad_norm": 0.5289803743362427, |
| "learning_rate": 0.00018262324365372846, |
| "loss": 0.7496, |
| "mean_token_accuracy": 0.8032818242907525, |
| "num_tokens": 14954351.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.682095006090134, |
| "grad_norm": 0.5440439581871033, |
| "learning_rate": 0.0001822721504219814, |
| "loss": 0.7432, |
| "mean_token_accuracy": 0.799126236140728, |
| "num_tokens": 15094879.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.6881851400730816, |
| "grad_norm": 0.46225935220718384, |
| "learning_rate": 0.00018191793271657978, |
| "loss": 0.7513, |
| "mean_token_accuracy": 0.8022688791155815, |
| "num_tokens": 15234906.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.6942752740560292, |
| "grad_norm": 0.5592020750045776, |
| "learning_rate": 0.0001815606058510833, |
| "loss": 0.7583, |
| "mean_token_accuracy": 0.7984497547149658, |
| "num_tokens": 15373526.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.7003654080389768, |
| "grad_norm": 0.525090217590332, |
| "learning_rate": 0.00018120018527346702, |
| "loss": 0.7254, |
| "mean_token_accuracy": 0.8070619881153107, |
| "num_tokens": 15516264.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7064555420219245, |
| "grad_norm": 0.5380759239196777, |
| "learning_rate": 0.00018083668656545355, |
| "loss": 0.8041, |
| "mean_token_accuracy": 0.7866759791970253, |
| "num_tokens": 15640444.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7125456760048721, |
| "grad_norm": 0.47815701365470886, |
| "learning_rate": 0.00018047012544183938, |
| "loss": 0.7604, |
| "mean_token_accuracy": 0.796156468987465, |
| "num_tokens": 15778070.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.7186358099878197, |
| "grad_norm": 0.5380450487136841, |
| "learning_rate": 0.00018010051774981553, |
| "loss": 0.8135, |
| "mean_token_accuracy": 0.7842124432325364, |
| "num_tokens": 15899739.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.7247259439707674, |
| "grad_norm": 0.5047502517700195, |
| "learning_rate": 0.00017972787946828246, |
| "loss": 0.7642, |
| "mean_token_accuracy": 0.7989341139793396, |
| "num_tokens": 16035805.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.730816077953715, |
| "grad_norm": 0.5440967679023743, |
| "learning_rate": 0.00017935222670715918, |
| "loss": 0.735, |
| "mean_token_accuracy": 0.8048294603824615, |
| "num_tokens": 16172541.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7369062119366626, |
| "grad_norm": 0.4766077399253845, |
| "learning_rate": 0.000178973575706687, |
| "loss": 0.805, |
| "mean_token_accuracy": 0.7871790423989296, |
| "num_tokens": 16296988.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.7429963459196103, |
| "grad_norm": 0.4153214991092682, |
| "learning_rate": 0.00017859194283672704, |
| "loss": 0.7635, |
| "mean_token_accuracy": 0.7964595645666123, |
| "num_tokens": 16432022.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.7490864799025578, |
| "grad_norm": 0.4698518216609955, |
| "learning_rate": 0.00017820734459605302, |
| "loss": 0.7397, |
| "mean_token_accuracy": 0.8046972885727882, |
| "num_tokens": 16572880.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.7551766138855055, |
| "grad_norm": 0.46101540327072144, |
| "learning_rate": 0.00017781979761163756, |
| "loss": 0.7174, |
| "mean_token_accuracy": 0.8066875368356705, |
| "num_tokens": 16714419.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.761266747868453, |
| "grad_norm": 0.5313341021537781, |
| "learning_rate": 0.00017742931863793358, |
| "loss": 0.7797, |
| "mean_token_accuracy": 0.7911526098847389, |
| "num_tokens": 16838285.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.7673568818514007, |
| "grad_norm": 0.4627362787723541, |
| "learning_rate": 0.00017703592455614998, |
| "loss": 0.7626, |
| "mean_token_accuracy": 0.7970306649804115, |
| "num_tokens": 16976065.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.7734470158343484, |
| "grad_norm": 0.5429073572158813, |
| "learning_rate": 0.00017663963237352177, |
| "loss": 0.7398, |
| "mean_token_accuracy": 0.8005403786897659, |
| "num_tokens": 17112901.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.7795371498172959, |
| "grad_norm": 0.6781270503997803, |
| "learning_rate": 0.00017624045922257471, |
| "loss": 0.7607, |
| "mean_token_accuracy": 0.7946217939257622, |
| "num_tokens": 17245480.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.7856272838002436, |
| "grad_norm": 0.5227305293083191, |
| "learning_rate": 0.00017583842236038483, |
| "loss": 0.7217, |
| "mean_token_accuracy": 0.8064659267663956, |
| "num_tokens": 17387171.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.7917174177831913, |
| "grad_norm": 0.49253156781196594, |
| "learning_rate": 0.0001754335391678323, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7960015773773194, |
| "num_tokens": 17521164.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7978075517661388, |
| "grad_norm": 0.5103631615638733, |
| "learning_rate": 0.00017502582714884997, |
| "loss": 0.7435, |
| "mean_token_accuracy": 0.7995276898145676, |
| "num_tokens": 17657818.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.8038976857490865, |
| "grad_norm": 0.5531247854232788, |
| "learning_rate": 0.00017461530392966665, |
| "loss": 0.7986, |
| "mean_token_accuracy": 0.7892467245459557, |
| "num_tokens": 17784361.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.8099878197320342, |
| "grad_norm": 0.4574586749076843, |
| "learning_rate": 0.00017420198725804517, |
| "loss": 0.6889, |
| "mean_token_accuracy": 0.8135112956166267, |
| "num_tokens": 17929664.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.8160779537149817, |
| "grad_norm": 0.4734383225440979, |
| "learning_rate": 0.00017378589500251498, |
| "loss": 0.7308, |
| "mean_token_accuracy": 0.8029947131872177, |
| "num_tokens": 18071182.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.8221680876979294, |
| "grad_norm": 0.5192279815673828, |
| "learning_rate": 0.00017336704515159986, |
| "loss": 0.7444, |
| "mean_token_accuracy": 0.8012512847781181, |
| "num_tokens": 18211136.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.8282582216808769, |
| "grad_norm": 0.5378620624542236, |
| "learning_rate": 0.00017294545581303996, |
| "loss": 0.7459, |
| "mean_token_accuracy": 0.7981989249587059, |
| "num_tokens": 18340645.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.8343483556638246, |
| "grad_norm": 0.4879571497440338, |
| "learning_rate": 0.00017252114521300918, |
| "loss": 0.7877, |
| "mean_token_accuracy": 0.7891893342137337, |
| "num_tokens": 18465733.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.8404384896467723, |
| "grad_norm": 0.5297388434410095, |
| "learning_rate": 0.00017209413169532717, |
| "loss": 0.7586, |
| "mean_token_accuracy": 0.797142505645752, |
| "num_tokens": 18598979.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.8465286236297198, |
| "grad_norm": 0.5308396220207214, |
| "learning_rate": 0.00017166443372066618, |
| "loss": 0.7387, |
| "mean_token_accuracy": 0.80123979896307, |
| "num_tokens": 18735919.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.8526187576126675, |
| "grad_norm": 0.49988579750061035, |
| "learning_rate": 0.0001712320698657532, |
| "loss": 0.7425, |
| "mean_token_accuracy": 0.7996803268790245, |
| "num_tokens": 18870877.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8587088915956151, |
| "grad_norm": 0.5971361994743347, |
| "learning_rate": 0.0001707970588225665, |
| "loss": 0.7691, |
| "mean_token_accuracy": 0.7922965154051781, |
| "num_tokens": 19000943.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.8647990255785627, |
| "grad_norm": 0.5141698718070984, |
| "learning_rate": 0.00017035941939752802, |
| "loss": 0.7203, |
| "mean_token_accuracy": 0.8036229625344277, |
| "num_tokens": 19135039.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.8708891595615104, |
| "grad_norm": 0.4647749066352844, |
| "learning_rate": 0.0001699191705106898, |
| "loss": 0.7136, |
| "mean_token_accuracy": 0.8064323276281357, |
| "num_tokens": 19274069.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.876979293544458, |
| "grad_norm": 0.5511934161186218, |
| "learning_rate": 0.00016947633119491633, |
| "loss": 0.7455, |
| "mean_token_accuracy": 0.7985599264502525, |
| "num_tokens": 19409679.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.8830694275274056, |
| "grad_norm": 0.4936945140361786, |
| "learning_rate": 0.00016903092059506182, |
| "loss": 0.7087, |
| "mean_token_accuracy": 0.806523185968399, |
| "num_tokens": 19547419.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.8891595615103532, |
| "grad_norm": 0.5227787494659424, |
| "learning_rate": 0.00016858295796714213, |
| "loss": 0.7739, |
| "mean_token_accuracy": 0.7941467314958572, |
| "num_tokens": 19674455.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.8952496954933008, |
| "grad_norm": 0.5046219825744629, |
| "learning_rate": 0.00016813246267750282, |
| "loss": 0.7361, |
| "mean_token_accuracy": 0.8008369222283364, |
| "num_tokens": 19809861.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.9013398294762485, |
| "grad_norm": 0.4827081263065338, |
| "learning_rate": 0.00016767945420198142, |
| "loss": 0.7464, |
| "mean_token_accuracy": 0.7986427888274192, |
| "num_tokens": 19940696.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.9074299634591961, |
| "grad_norm": 0.4970889687538147, |
| "learning_rate": 0.00016722395212506567, |
| "loss": 0.7528, |
| "mean_token_accuracy": 0.7965970665216446, |
| "num_tokens": 20070686.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.9135200974421437, |
| "grad_norm": 0.44478070735931396, |
| "learning_rate": 0.00016676597613904693, |
| "loss": 0.7185, |
| "mean_token_accuracy": 0.8081388726830483, |
| "num_tokens": 20210260.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.9196102314250914, |
| "grad_norm": 0.506136417388916, |
| "learning_rate": 0.00016630554604316866, |
| "loss": 0.7395, |
| "mean_token_accuracy": 0.8003876298666001, |
| "num_tokens": 20346235.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.925700365408039, |
| "grad_norm": 0.500946044921875, |
| "learning_rate": 0.00016584268174277053, |
| "loss": 0.6889, |
| "mean_token_accuracy": 0.8124501362442971, |
| "num_tokens": 20481248.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.9317904993909866, |
| "grad_norm": 0.48528990149497986, |
| "learning_rate": 0.00016537740324842795, |
| "loss": 0.7227, |
| "mean_token_accuracy": 0.8041250064969063, |
| "num_tokens": 20613531.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.9378806333739342, |
| "grad_norm": 0.5070951581001282, |
| "learning_rate": 0.00016490973067508674, |
| "loss": 0.7091, |
| "mean_token_accuracy": 0.8082544595003128, |
| "num_tokens": 20750784.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.9439707673568819, |
| "grad_norm": 0.5583120584487915, |
| "learning_rate": 0.0001644396842411939, |
| "loss": 0.7405, |
| "mean_token_accuracy": 0.7992320343852043, |
| "num_tokens": 20883646.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.9500609013398295, |
| "grad_norm": 0.5099635124206543, |
| "learning_rate": 0.00016396728426782312, |
| "loss": 0.7103, |
| "mean_token_accuracy": 0.8091216519474983, |
| "num_tokens": 21025143.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.9561510353227771, |
| "grad_norm": 0.5777808427810669, |
| "learning_rate": 0.00016349255117779652, |
| "loss": 0.7245, |
| "mean_token_accuracy": 0.8023119494318962, |
| "num_tokens": 21160014.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.9622411693057247, |
| "grad_norm": 0.5206162333488464, |
| "learning_rate": 0.0001630155054948016, |
| "loss": 0.7185, |
| "mean_token_accuracy": 0.8069521963596344, |
| "num_tokens": 21299094.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.9683313032886723, |
| "grad_norm": 0.5763202905654907, |
| "learning_rate": 0.00016253616784250415, |
| "loss": 0.7677, |
| "mean_token_accuracy": 0.7927820891141891, |
| "num_tokens": 21429252.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.97442143727162, |
| "grad_norm": 0.5068426728248596, |
| "learning_rate": 0.00016205455894365627, |
| "loss": 0.7673, |
| "mean_token_accuracy": 0.794715291261673, |
| "num_tokens": 21556200.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.9805115712545676, |
| "grad_norm": 0.46094459295272827, |
| "learning_rate": 0.0001615706996192009, |
| "loss": 0.771, |
| "mean_token_accuracy": 0.7921045809984207, |
| "num_tokens": 21681524.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.9866017052375152, |
| "grad_norm": 0.5063546299934387, |
| "learning_rate": 0.00016108461078737148, |
| "loss": 0.7383, |
| "mean_token_accuracy": 0.800596435368061, |
| "num_tokens": 21814109.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.9926918392204629, |
| "grad_norm": 0.5418652296066284, |
| "learning_rate": 0.0001605963134627876, |
| "loss": 0.7431, |
| "mean_token_accuracy": 0.7994748756289483, |
| "num_tokens": 21947346.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.9987819732034104, |
| "grad_norm": 0.6195595264434814, |
| "learning_rate": 0.0001601058287555465, |
| "loss": 0.7294, |
| "mean_token_accuracy": 0.8030684441328049, |
| "num_tokens": 22081340.0, |
| "step": 1640 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 4926, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.6993952090530775e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|