diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5385 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20179496622456752, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00033632494370761253, + "grad_norm": 5.551640033721924, + "learning_rate": 4.9999999999999996e-05, + "loss": 40.646, + "num_input_tokens_seen": 13107200, + "step": 50 + }, + { + "epoch": 0.0006726498874152251, + "grad_norm": 3.0933010578155518, + "learning_rate": 9.999999999999999e-05, + "loss": 35.6719, + "num_input_tokens_seen": 26214400, + "step": 100 + }, + { + "epoch": 0.0010089748311228376, + "grad_norm": 1.8445225954055786, + "learning_rate": 0.00015, + "loss": 31.7929, + "num_input_tokens_seen": 39321600, + "step": 150 + }, + { + "epoch": 0.0013452997748304501, + "grad_norm": 3.410053253173828, + "learning_rate": 0.00019999999999999998, + "loss": 29.7717, + "num_input_tokens_seen": 52428800, + "step": 200 + }, + { + "epoch": 0.0016816247185380627, + "grad_norm": 3.2875728607177734, + "learning_rate": 0.00025, + "loss": 28.0852, + "num_input_tokens_seen": 65536000, + "step": 250 + }, + { + "epoch": 0.002017949662245675, + "grad_norm": 3.1337997913360596, + "learning_rate": 0.0003, + "loss": 26.681, + "num_input_tokens_seen": 78643200, + "step": 300 + }, + { + "epoch": 0.002354274605953288, + "grad_norm": 3.2058675289154053, + "learning_rate": 0.00035, + "loss": 25.5272, + "num_input_tokens_seen": 91750400, + "step": 350 + }, + { + "epoch": 0.0026905995496609002, + "grad_norm": 2.94515323638916, + "learning_rate": 0.00039999999999999996, + "loss": 24.5054, + "num_input_tokens_seen": 104857600, + "step": 400 + }, + { + "epoch": 0.003026924493368513, + "grad_norm": 1.952689528465271, + "learning_rate": 0.00045, + "loss": 23.6095, + "num_input_tokens_seen": 117964800, + "step": 450 + }, + { + "epoch": 0.0033632494370761253, + "grad_norm": 2.3968894481658936, + "learning_rate": 0.0005, + "loss": 22.7795, + "num_input_tokens_seen": 131072000, + "step": 500 + }, + { + "epoch": 0.0033632494370761253, + "eval_loss": 5.559806823730469, + "eval_runtime": 144.2401, + "eval_samples_per_second": 34.664, + "eval_steps_per_second": 8.666, + "num_input_tokens_seen": 131072000, + "step": 500 + }, + { + "epoch": 0.003699574380783738, + "grad_norm": 2.3432235717773438, + "learning_rate": 0.0005499999999999999, + "loss": 22.0823, + "num_input_tokens_seen": 144179200, + "step": 550 + }, + { + "epoch": 0.00403589932449135, + "grad_norm": 2.5011863708496094, + "learning_rate": 0.0006, + "loss": 21.3173, + "num_input_tokens_seen": 157286400, + "step": 600 + }, + { + "epoch": 0.004372224268198963, + "grad_norm": 2.038031816482544, + "learning_rate": 0.0005999957181118445, + "loss": 20.6294, + "num_input_tokens_seen": 170393600, + "step": 650 + }, + { + "epoch": 0.004708549211906576, + "grad_norm": 1.8921018838882446, + "learning_rate": 0.0005999828725696082, + "loss": 19.8806, + "num_input_tokens_seen": 183500800, + "step": 700 + }, + { + "epoch": 0.005044874155614189, + "grad_norm": 2.1825079917907715, + "learning_rate": 0.0005999614637399793, + "loss": 19.2902, + "num_input_tokens_seen": 196608000, + "step": 750 + }, + { + "epoch": 0.0053811990993218005, + "grad_norm": 1.6200745105743408, + "learning_rate": 0.0005999314922340923, + "loss": 18.7553, + "num_input_tokens_seen": 209715200, + "step": 800 + }, + { + "epoch": 0.005717524043029413, + "grad_norm": 1.4547795057296753, + "learning_rate": 0.0005998929589075115, + "loss": 18.3636, + "num_input_tokens_seen": 222822400, + "step": 850 + }, + { + "epoch": 0.006053848986737026, + "grad_norm": 1.7445319890975952, + "learning_rate": 0.0005998458648602063, + "loss": 18.0002, + "num_input_tokens_seen": 235929600, + "step": 900 + }, + { + "epoch": 0.006390173930444639, + "grad_norm": 1.5439883470535278, + "learning_rate": 0.0005997902114365196, + "loss": 17.6987, + "num_input_tokens_seen": 249036800, + "step": 950 + }, + { + "epoch": 0.006726498874152251, + "grad_norm": 1.5094984769821167, + "learning_rate": 0.0005997260002251293, + "loss": 17.4367, + "num_input_tokens_seen": 262144000, + "step": 1000 + }, + { + "epoch": 0.006726498874152251, + "eval_loss": 4.2485198974609375, + "eval_runtime": 143.1933, + "eval_samples_per_second": 34.918, + "eval_steps_per_second": 8.729, + "num_input_tokens_seen": 262144000, + "step": 1000 + }, + { + "epoch": 0.007062823817859863, + "grad_norm": 1.380916714668274, + "learning_rate": 0.0005996532330590042, + "loss": 17.1672, + "num_input_tokens_seen": 275251200, + "step": 1050 + }, + { + "epoch": 0.007399148761567476, + "grad_norm": 1.2637253999710083, + "learning_rate": 0.0005995719120153497, + "loss": 16.9309, + "num_input_tokens_seen": 288358400, + "step": 1100 + }, + { + "epoch": 0.007735473705275089, + "grad_norm": 1.6453403234481812, + "learning_rate": 0.0005994820394155497, + "loss": 16.7436, + "num_input_tokens_seen": 301465600, + "step": 1150 + }, + { + "epoch": 0.0080717986489827, + "grad_norm": 1.4340417385101318, + "learning_rate": 0.0005993836178251009, + "loss": 16.507, + "num_input_tokens_seen": 314572800, + "step": 1200 + }, + { + "epoch": 0.008408123592690313, + "grad_norm": 1.1437392234802246, + "learning_rate": 0.0005992766500535377, + "loss": 16.4345, + "num_input_tokens_seen": 327680000, + "step": 1250 + }, + { + "epoch": 0.008744448536397926, + "grad_norm": 1.3861935138702393, + "learning_rate": 0.0005991611391543539, + "loss": 16.2523, + "num_input_tokens_seen": 340787200, + "step": 1300 + }, + { + "epoch": 0.009080773480105539, + "grad_norm": 1.2690448760986328, + "learning_rate": 0.0005990370884249146, + "loss": 16.1004, + "num_input_tokens_seen": 353894400, + "step": 1350 + }, + { + "epoch": 0.009417098423813152, + "grad_norm": 1.2439777851104736, + "learning_rate": 0.000598904501406362, + "loss": 16.0162, + "num_input_tokens_seen": 367001600, + "step": 1400 + }, + { + "epoch": 0.009753423367520764, + "grad_norm": 1.1519834995269775, + "learning_rate": 0.0005987633818835147, + "loss": 15.8826, + "num_input_tokens_seen": 380108800, + "step": 1450 + }, + { + "epoch": 0.010089748311228377, + "grad_norm": 1.101974606513977, + "learning_rate": 0.0005986137338847594, + "loss": 15.7688, + "num_input_tokens_seen": 393216000, + "step": 1500 + }, + { + "epoch": 0.010089748311228377, + "eval_loss": 3.8665707111358643, + "eval_runtime": 143.1117, + "eval_samples_per_second": 34.938, + "eval_steps_per_second": 8.734, + "num_input_tokens_seen": 393216000, + "step": 1500 + }, + { + "epoch": 0.010426073254935988, + "grad_norm": 1.3389586210250854, + "learning_rate": 0.0005984555616819361, + "loss": 15.6984, + "num_input_tokens_seen": 406323200, + "step": 1550 + }, + { + "epoch": 0.010762398198643601, + "grad_norm": 1.175673246383667, + "learning_rate": 0.0005982888697902161, + "loss": 15.6319, + "num_input_tokens_seen": 419430400, + "step": 1600 + }, + { + "epoch": 0.011098723142351214, + "grad_norm": 1.122758388519287, + "learning_rate": 0.0005981136629679728, + "loss": 15.4898, + "num_input_tokens_seen": 432537600, + "step": 1650 + }, + { + "epoch": 0.011435048086058826, + "grad_norm": 1.0009791851043701, + "learning_rate": 0.0005979299462166464, + "loss": 15.4399, + "num_input_tokens_seen": 445644800, + "step": 1700 + }, + { + "epoch": 0.01177137302976644, + "grad_norm": 1.0287221670150757, + "learning_rate": 0.0005977377247806006, + "loss": 15.3713, + "num_input_tokens_seen": 458752000, + "step": 1750 + }, + { + "epoch": 0.012107697973474052, + "grad_norm": 1.061454176902771, + "learning_rate": 0.0005975370041469738, + "loss": 15.266, + "num_input_tokens_seen": 471859200, + "step": 1800 + }, + { + "epoch": 0.012444022917181665, + "grad_norm": 1.2214291095733643, + "learning_rate": 0.0005973277900455209, + "loss": 15.2011, + "num_input_tokens_seen": 484966400, + "step": 1850 + }, + { + "epoch": 0.012780347860889277, + "grad_norm": 1.0759477615356445, + "learning_rate": 0.0005971100884484513, + "loss": 15.153, + "num_input_tokens_seen": 498073600, + "step": 1900 + }, + { + "epoch": 0.013116672804596888, + "grad_norm": 0.8617029190063477, + "learning_rate": 0.0005968839055702578, + "loss": 15.1029, + "num_input_tokens_seen": 511180800, + "step": 1950 + }, + { + "epoch": 0.013452997748304501, + "grad_norm": 1.063086748123169, + "learning_rate": 0.0005966492478675384, + "loss": 14.9894, + "num_input_tokens_seen": 524288000, + "step": 2000 + }, + { + "epoch": 0.013452997748304501, + "eval_loss": 3.6751515865325928, + "eval_runtime": 143.2457, + "eval_samples_per_second": 34.905, + "eval_steps_per_second": 8.726, + "num_input_tokens_seen": 524288000, + "step": 2000 + }, + { + "epoch": 0.013789322692012114, + "grad_norm": 0.8963438868522644, + "learning_rate": 0.000596406122038814, + "loss": 14.9472, + "num_input_tokens_seen": 537395200, + "step": 2050 + }, + { + "epoch": 0.014125647635719727, + "grad_norm": 0.8694930672645569, + "learning_rate": 0.0005961545350243351, + "loss": 14.8887, + "num_input_tokens_seen": 550502400, + "step": 2100 + }, + { + "epoch": 0.01446197257942734, + "grad_norm": 0.9276862144470215, + "learning_rate": 0.0005958944940058844, + "loss": 14.8208, + "num_input_tokens_seen": 563609600, + "step": 2150 + }, + { + "epoch": 0.014798297523134952, + "grad_norm": 0.8817610144615173, + "learning_rate": 0.0005956260064065727, + "loss": 14.7679, + "num_input_tokens_seen": 576716800, + "step": 2200 + }, + { + "epoch": 0.015134622466842565, + "grad_norm": 0.888661801815033, + "learning_rate": 0.0005953490798906257, + "loss": 14.7253, + "num_input_tokens_seen": 589824000, + "step": 2250 + }, + { + "epoch": 0.015470947410550178, + "grad_norm": 0.8768919706344604, + "learning_rate": 0.0005950637223631658, + "loss": 14.6678, + "num_input_tokens_seen": 602931200, + "step": 2300 + }, + { + "epoch": 0.01580727235425779, + "grad_norm": 0.8688133955001831, + "learning_rate": 0.0005947699419699865, + "loss": 14.6422, + "num_input_tokens_seen": 616038400, + "step": 2350 + }, + { + "epoch": 0.0161435972979654, + "grad_norm": 0.8557626008987427, + "learning_rate": 0.0005944677470973196, + "loss": 14.6511, + "num_input_tokens_seen": 629145600, + "step": 2400 + }, + { + "epoch": 0.016479922241673016, + "grad_norm": 0.956565260887146, + "learning_rate": 0.0005941571463715962, + "loss": 14.5594, + "num_input_tokens_seen": 642252800, + "step": 2450 + }, + { + "epoch": 0.016816247185380627, + "grad_norm": 0.8760116100311279, + "learning_rate": 0.0005938381486591999, + "loss": 14.5031, + "num_input_tokens_seen": 655360000, + "step": 2500 + }, + { + "epoch": 0.016816247185380627, + "eval_loss": 3.5482449531555176, + "eval_runtime": 143.9882, + "eval_samples_per_second": 34.725, + "eval_steps_per_second": 8.681, + "num_input_tokens_seen": 655360000, + "step": 2500 + }, + { + "epoch": 0.01715257212908824, + "grad_norm": 0.8938534259796143, + "learning_rate": 0.0005935107630662145, + "loss": 14.4733, + "num_input_tokens_seen": 668467200, + "step": 2550 + }, + { + "epoch": 0.017488897072795852, + "grad_norm": 0.8379454016685486, + "learning_rate": 0.0005931749989381631, + "loss": 14.386, + "num_input_tokens_seen": 681574400, + "step": 2600 + }, + { + "epoch": 0.017825222016503463, + "grad_norm": 0.7709890007972717, + "learning_rate": 0.000592830865859742, + "loss": 14.3883, + "num_input_tokens_seen": 694681600, + "step": 2650 + }, + { + "epoch": 0.018161546960211078, + "grad_norm": 0.8483361601829529, + "learning_rate": 0.000592478373654547, + "loss": 14.4122, + "num_input_tokens_seen": 707788800, + "step": 2700 + }, + { + "epoch": 0.01849787190391869, + "grad_norm": 0.8239767551422119, + "learning_rate": 0.0005921175323847927, + "loss": 14.3169, + "num_input_tokens_seen": 720896000, + "step": 2750 + }, + { + "epoch": 0.018834196847626303, + "grad_norm": 0.7901423573493958, + "learning_rate": 0.0005917483523510252, + "loss": 14.263, + "num_input_tokens_seen": 734003200, + "step": 2800 + }, + { + "epoch": 0.019170521791333914, + "grad_norm": 0.7838689088821411, + "learning_rate": 0.0005913708440918291, + "loss": 14.2589, + "num_input_tokens_seen": 747110400, + "step": 2850 + }, + { + "epoch": 0.01950684673504153, + "grad_norm": 0.90792316198349, + "learning_rate": 0.000590985018383525, + "loss": 14.2538, + "num_input_tokens_seen": 760217600, + "step": 2900 + }, + { + "epoch": 0.01984317167874914, + "grad_norm": 0.7609734535217285, + "learning_rate": 0.0005905908862398632, + "loss": 14.208, + "num_input_tokens_seen": 773324800, + "step": 2950 + }, + { + "epoch": 0.020179496622456754, + "grad_norm": 0.7796016335487366, + "learning_rate": 0.0005901884589117088, + "loss": 14.2405, + "num_input_tokens_seen": 786432000, + "step": 3000 + }, + { + "epoch": 0.020179496622456754, + "eval_loss": 3.469104766845703, + "eval_runtime": 142.6639, + "eval_samples_per_second": 35.047, + "eval_steps_per_second": 8.762, + "num_input_tokens_seen": 786432000, + "step": 3000 + }, + { + "epoch": 0.020515821566164365, + "grad_norm": 0.7596002221107483, + "learning_rate": 0.0005897777478867204, + "loss": 14.1367, + "num_input_tokens_seen": 799539200, + "step": 3050 + }, + { + "epoch": 0.020852146509871976, + "grad_norm": 0.7429248094558716, + "learning_rate": 0.0005893587648890227, + "loss": 14.1394, + "num_input_tokens_seen": 812646400, + "step": 3100 + }, + { + "epoch": 0.02118847145357959, + "grad_norm": 0.8267149329185486, + "learning_rate": 0.0005889315218788711, + "loss": 14.1218, + "num_input_tokens_seen": 825753600, + "step": 3150 + }, + { + "epoch": 0.021524796397287202, + "grad_norm": 0.7618885636329651, + "learning_rate": 0.0005884960310523109, + "loss": 14.0575, + "num_input_tokens_seen": 838860800, + "step": 3200 + }, + { + "epoch": 0.021861121340994816, + "grad_norm": 0.7333565950393677, + "learning_rate": 0.0005880523048408287, + "loss": 14.0723, + "num_input_tokens_seen": 851968000, + "step": 3250 + }, + { + "epoch": 0.022197446284702427, + "grad_norm": 0.7767319083213806, + "learning_rate": 0.0005876003559109981, + "loss": 14.0067, + "num_input_tokens_seen": 865075200, + "step": 3300 + }, + { + "epoch": 0.022533771228410042, + "grad_norm": 0.7478107213973999, + "learning_rate": 0.0005871401971641175, + "loss": 14.0154, + "num_input_tokens_seen": 878182400, + "step": 3350 + }, + { + "epoch": 0.022870096172117653, + "grad_norm": 0.7439610958099365, + "learning_rate": 0.0005866718417358421, + "loss": 13.9922, + "num_input_tokens_seen": 891289600, + "step": 3400 + }, + { + "epoch": 0.023206421115825267, + "grad_norm": 0.7624334096908569, + "learning_rate": 0.0005861953029958091, + "loss": 13.9456, + "num_input_tokens_seen": 904396800, + "step": 3450 + }, + { + "epoch": 0.02354274605953288, + "grad_norm": 0.7594953775405884, + "learning_rate": 0.0005857105945472556, + "loss": 13.9742, + "num_input_tokens_seen": 917504000, + "step": 3500 + }, + { + "epoch": 0.02354274605953288, + "eval_loss": 3.410249710083008, + "eval_runtime": 143.2447, + "eval_samples_per_second": 34.905, + "eval_steps_per_second": 8.726, + "num_input_tokens_seen": 917504000, + "step": 3500 + }, + { + "epoch": 0.02387907100324049, + "grad_norm": 0.7231994867324829, + "learning_rate": 0.0005852177302266308, + "loss": 13.959, + "num_input_tokens_seen": 930611200, + "step": 3550 + }, + { + "epoch": 0.024215395946948104, + "grad_norm": 0.7433524131774902, + "learning_rate": 0.0005847167241032006, + "loss": 13.8909, + "num_input_tokens_seen": 943718400, + "step": 3600 + }, + { + "epoch": 0.024551720890655715, + "grad_norm": 0.6849333643913269, + "learning_rate": 0.0005842075904786462, + "loss": 13.8984, + "num_input_tokens_seen": 956825600, + "step": 3650 + }, + { + "epoch": 0.02488804583436333, + "grad_norm": 0.7122375965118408, + "learning_rate": 0.000583690343886656, + "loss": 13.8611, + "num_input_tokens_seen": 969932800, + "step": 3700 + }, + { + "epoch": 0.02522437077807094, + "grad_norm": 0.7722771763801575, + "learning_rate": 0.0005831649990925102, + "loss": 13.862, + "num_input_tokens_seen": 983040000, + "step": 3750 + }, + { + "epoch": 0.025560695721778555, + "grad_norm": 0.7184539437294006, + "learning_rate": 0.0005826315710926599, + "loss": 13.8641, + "num_input_tokens_seen": 996147200, + "step": 3800 + }, + { + "epoch": 0.025897020665486166, + "grad_norm": 0.7419592142105103, + "learning_rate": 0.0005820900751142987, + "loss": 13.808, + "num_input_tokens_seen": 1009254400, + "step": 3850 + }, + { + "epoch": 0.026233345609193777, + "grad_norm": 0.7380815148353577, + "learning_rate": 0.0005815405266149281, + "loss": 13.7751, + "num_input_tokens_seen": 1022361600, + "step": 3900 + }, + { + "epoch": 0.02656967055290139, + "grad_norm": 0.7219839692115784, + "learning_rate": 0.000580982941281916, + "loss": 13.8042, + "num_input_tokens_seen": 1035468800, + "step": 3950 + }, + { + "epoch": 0.026905995496609002, + "grad_norm": 0.7262866497039795, + "learning_rate": 0.0005804173350320493, + "loss": 13.7434, + "num_input_tokens_seen": 1048576000, + "step": 4000 + }, + { + "epoch": 0.026905995496609002, + "eval_loss": 3.364596128463745, + "eval_runtime": 143.5464, + "eval_samples_per_second": 34.832, + "eval_steps_per_second": 8.708, + "num_input_tokens_seen": 1048576000, + "step": 4000 + }, + { + "epoch": 0.027242320440316617, + "grad_norm": 0.7669729590415955, + "learning_rate": 0.0005798437240110794, + "loss": 13.759, + "num_input_tokens_seen": 1061683200, + "step": 4050 + }, + { + "epoch": 0.027578645384024228, + "grad_norm": 0.75081866979599, + "learning_rate": 0.0005792621245932613, + "loss": 13.8008, + "num_input_tokens_seen": 1074790400, + "step": 4100 + }, + { + "epoch": 0.027914970327731842, + "grad_norm": 0.6844099164009094, + "learning_rate": 0.0005786725533808858, + "loss": 13.7462, + "num_input_tokens_seen": 1087897600, + "step": 4150 + }, + { + "epoch": 0.028251295271439453, + "grad_norm": 0.6971242427825928, + "learning_rate": 0.0005780750272038064, + "loss": 13.7535, + "num_input_tokens_seen": 1101004800, + "step": 4200 + }, + { + "epoch": 0.028587620215147068, + "grad_norm": 0.7153123021125793, + "learning_rate": 0.0005774695631189582, + "loss": 13.7085, + "num_input_tokens_seen": 1114112000, + "step": 4250 + }, + { + "epoch": 0.02892394515885468, + "grad_norm": 0.6920833587646484, + "learning_rate": 0.0005768561784098711, + "loss": 13.6495, + "num_input_tokens_seen": 1127219200, + "step": 4300 + }, + { + "epoch": 0.02926027010256229, + "grad_norm": 0.7521312236785889, + "learning_rate": 0.0005762348905861764, + "loss": 13.6559, + "num_input_tokens_seen": 1140326400, + "step": 4350 + }, + { + "epoch": 0.029596595046269904, + "grad_norm": 0.676365077495575, + "learning_rate": 0.0005756057173831074, + "loss": 13.6069, + "num_input_tokens_seen": 1153433600, + "step": 4400 + }, + { + "epoch": 0.029932919989977515, + "grad_norm": 0.6946042776107788, + "learning_rate": 0.0005749686767609928, + "loss": 13.6218, + "num_input_tokens_seen": 1166540800, + "step": 4450 + }, + { + "epoch": 0.03026924493368513, + "grad_norm": 0.7127935886383057, + "learning_rate": 0.0005743237869047437, + "loss": 13.6039, + "num_input_tokens_seen": 1179648000, + "step": 4500 + }, + { + "epoch": 0.03026924493368513, + "eval_loss": 3.3240578174591064, + "eval_runtime": 143.6605, + "eval_samples_per_second": 34.804, + "eval_steps_per_second": 8.701, + "num_input_tokens_seen": 1179648000, + "step": 4500 + }, + { + "epoch": 0.03060556987739274, + "grad_norm": 0.7366272807121277, + "learning_rate": 0.0005736710662233351, + "loss": 13.604, + "num_input_tokens_seen": 1192755200, + "step": 4550 + }, + { + "epoch": 0.030941894821100355, + "grad_norm": 0.6478694081306458, + "learning_rate": 0.0005730105333492799, + "loss": 13.5717, + "num_input_tokens_seen": 1205862400, + "step": 4600 + }, + { + "epoch": 0.03127821976480797, + "grad_norm": 0.7228036522865295, + "learning_rate": 0.0005723422071380976, + "loss": 13.5385, + "num_input_tokens_seen": 1218969600, + "step": 4650 + }, + { + "epoch": 0.03161454470851558, + "grad_norm": 0.6998932957649231, + "learning_rate": 0.0005716661066677753, + "loss": 13.5237, + "num_input_tokens_seen": 1232076800, + "step": 4700 + }, + { + "epoch": 0.03195086965222319, + "grad_norm": 0.6933197379112244, + "learning_rate": 0.0005709822512382236, + "loss": 13.5417, + "num_input_tokens_seen": 1245184000, + "step": 4750 + }, + { + "epoch": 0.0322871945959308, + "grad_norm": 0.7209503054618835, + "learning_rate": 0.0005702906603707256, + "loss": 13.5653, + "num_input_tokens_seen": 1258291200, + "step": 4800 + }, + { + "epoch": 0.032623519539638414, + "grad_norm": 0.6796743273735046, + "learning_rate": 0.0005695913538073798, + "loss": 13.557, + "num_input_tokens_seen": 1271398400, + "step": 4850 + }, + { + "epoch": 0.03295984448334603, + "grad_norm": 0.6676873564720154, + "learning_rate": 0.0005688843515105359, + "loss": 13.4965, + "num_input_tokens_seen": 1284505600, + "step": 4900 + }, + { + "epoch": 0.03329616942705364, + "grad_norm": 0.6413120627403259, + "learning_rate": 0.0005681696736622258, + "loss": 13.5013, + "num_input_tokens_seen": 1297612800, + "step": 4950 + }, + { + "epoch": 0.033632494370761254, + "grad_norm": 0.6716573238372803, + "learning_rate": 0.0005674473406635868, + "loss": 13.4891, + "num_input_tokens_seen": 1310720000, + "step": 5000 + }, + { + "epoch": 0.033632494370761254, + "eval_loss": 3.2916977405548096, + "eval_runtime": 143.1949, + "eval_samples_per_second": 34.917, + "eval_steps_per_second": 8.729, + "num_input_tokens_seen": 1310720000, + "step": 5000 + }, + { + "epoch": 0.033968819314468865, + "grad_norm": 0.6855191588401794, + "learning_rate": 0.0005667173731342798, + "loss": 13.4753, + "num_input_tokens_seen": 1323827200, + "step": 5050 + }, + { + "epoch": 0.03430514425817648, + "grad_norm": 0.6939865350723267, + "learning_rate": 0.0005659797919119, + "loss": 13.4583, + "num_input_tokens_seen": 1336934400, + "step": 5100 + }, + { + "epoch": 0.034641469201884094, + "grad_norm": 0.6979573369026184, + "learning_rate": 0.0005652346180513829, + "loss": 13.4339, + "num_input_tokens_seen": 1350041600, + "step": 5150 + }, + { + "epoch": 0.034977794145591705, + "grad_norm": 0.6683679819107056, + "learning_rate": 0.0005644818728244026, + "loss": 13.4496, + "num_input_tokens_seen": 1363148800, + "step": 5200 + }, + { + "epoch": 0.035314119089299316, + "grad_norm": 0.6824884414672852, + "learning_rate": 0.0005637215777187651, + "loss": 13.4705, + "num_input_tokens_seen": 1376256000, + "step": 5250 + }, + { + "epoch": 0.03565044403300693, + "grad_norm": 0.6840626001358032, + "learning_rate": 0.0005629537544377942, + "loss": 13.4349, + "num_input_tokens_seen": 1389363200, + "step": 5300 + }, + { + "epoch": 0.035986768976714545, + "grad_norm": 0.6613268852233887, + "learning_rate": 0.0005621784248997128, + "loss": 13.46, + "num_input_tokens_seen": 1402470400, + "step": 5350 + }, + { + "epoch": 0.036323093920422156, + "grad_norm": 0.6920462846755981, + "learning_rate": 0.0005613956112370167, + "loss": 13.4035, + "num_input_tokens_seen": 1415577600, + "step": 5400 + }, + { + "epoch": 0.03665941886412977, + "grad_norm": 0.6671062111854553, + "learning_rate": 0.0005606053357958429, + "loss": 13.3312, + "num_input_tokens_seen": 1428684800, + "step": 5450 + }, + { + "epoch": 0.03699574380783738, + "grad_norm": 0.6575270295143127, + "learning_rate": 0.0005598076211353316, + "loss": 13.3718, + "num_input_tokens_seen": 1441792000, + "step": 5500 + }, + { + "epoch": 0.03699574380783738, + "eval_loss": 3.2621724605560303, + "eval_runtime": 143.0547, + "eval_samples_per_second": 34.952, + "eval_steps_per_second": 8.738, + "num_input_tokens_seen": 1441792000, + "step": 5500 + }, + { + "epoch": 0.037332068751544996, + "grad_norm": 0.6697873473167419, + "learning_rate": 0.0005590024900269825, + "loss": 13.3337, + "num_input_tokens_seen": 1454899200, + "step": 5550 + }, + { + "epoch": 0.03766839369525261, + "grad_norm": 0.6501484513282776, + "learning_rate": 0.0005581899654540048, + "loss": 13.3573, + "num_input_tokens_seen": 1468006400, + "step": 5600 + }, + { + "epoch": 0.03800471863896022, + "grad_norm": 0.6328523755073547, + "learning_rate": 0.0005573700706106607, + "loss": 13.3513, + "num_input_tokens_seen": 1481113600, + "step": 5650 + }, + { + "epoch": 0.03834104358266783, + "grad_norm": 0.6470258831977844, + "learning_rate": 0.0005565428289016039, + "loss": 13.2964, + "num_input_tokens_seen": 1494220800, + "step": 5700 + }, + { + "epoch": 0.03867736852637544, + "grad_norm": 0.6687533855438232, + "learning_rate": 0.0005557082639412105, + "loss": 13.3508, + "num_input_tokens_seen": 1507328000, + "step": 5750 + }, + { + "epoch": 0.03901369347008306, + "grad_norm": 0.6524744629859924, + "learning_rate": 0.0005548663995529062, + "loss": 13.3254, + "num_input_tokens_seen": 1520435200, + "step": 5800 + }, + { + "epoch": 0.03935001841379067, + "grad_norm": 0.6340435147285461, + "learning_rate": 0.0005540172597684852, + "loss": 13.3107, + "num_input_tokens_seen": 1533542400, + "step": 5850 + }, + { + "epoch": 0.03968634335749828, + "grad_norm": 0.6675236225128174, + "learning_rate": 0.000553160868827425, + "loss": 13.264, + "num_input_tokens_seen": 1546649600, + "step": 5900 + }, + { + "epoch": 0.04002266830120589, + "grad_norm": 0.6894492506980896, + "learning_rate": 0.0005522972511761935, + "loss": 13.2815, + "num_input_tokens_seen": 1559756800, + "step": 5950 + }, + { + "epoch": 0.04035899324491351, + "grad_norm": 0.6689581274986267, + "learning_rate": 0.000551426431467552, + "loss": 13.3443, + "num_input_tokens_seen": 1572864000, + "step": 6000 + }, + { + "epoch": 0.04035899324491351, + "eval_loss": 3.237107276916504, + "eval_runtime": 142.4946, + "eval_samples_per_second": 35.089, + "eval_steps_per_second": 8.772, + "num_input_tokens_seen": 1572864000, + "step": 6000 + }, + { + "epoch": 0.04069531818862112, + "grad_norm": 0.6856837868690491, + "learning_rate": 0.0005505484345598515, + "loss": 13.2681, + "num_input_tokens_seen": 1585971200, + "step": 6050 + }, + { + "epoch": 0.04103164313232873, + "grad_norm": 0.6631260514259338, + "learning_rate": 0.0005496632855163221, + "loss": 13.2594, + "num_input_tokens_seen": 1599078400, + "step": 6100 + }, + { + "epoch": 0.04136796807603634, + "grad_norm": 0.6479213833808899, + "learning_rate": 0.0005487710096043584, + "loss": 13.2822, + "num_input_tokens_seen": 1612185600, + "step": 6150 + }, + { + "epoch": 0.04170429301974395, + "grad_norm": 0.6974468231201172, + "learning_rate": 0.0005478716322947985, + "loss": 13.2206, + "num_input_tokens_seen": 1625292800, + "step": 6200 + }, + { + "epoch": 0.04204061796345157, + "grad_norm": 0.633106529712677, + "learning_rate": 0.0005469651792611956, + "loss": 13.2054, + "num_input_tokens_seen": 1638400000, + "step": 6250 + }, + { + "epoch": 0.04237694290715918, + "grad_norm": 0.6755931377410889, + "learning_rate": 0.0005460516763790867, + "loss": 13.206, + "num_input_tokens_seen": 1651507200, + "step": 6300 + }, + { + "epoch": 0.04271326785086679, + "grad_norm": 0.6707047820091248, + "learning_rate": 0.0005451311497252529, + "loss": 13.2538, + "num_input_tokens_seen": 1664614400, + "step": 6350 + }, + { + "epoch": 0.043049592794574404, + "grad_norm": 0.6525476574897766, + "learning_rate": 0.0005442036255769754, + "loss": 13.1984, + "num_input_tokens_seen": 1677721600, + "step": 6400 + }, + { + "epoch": 0.04338591773828202, + "grad_norm": 0.6575285196304321, + "learning_rate": 0.0005432691304112853, + "loss": 13.1798, + "num_input_tokens_seen": 1690828800, + "step": 6450 + }, + { + "epoch": 0.04372224268198963, + "grad_norm": 0.6678348183631897, + "learning_rate": 0.0005423276909042077, + "loss": 13.1945, + "num_input_tokens_seen": 1703936000, + "step": 6500 + }, + { + "epoch": 0.04372224268198963, + "eval_loss": 3.2158761024475098, + "eval_runtime": 143.477, + "eval_samples_per_second": 34.849, + "eval_steps_per_second": 8.712, + "num_input_tokens_seen": 1703936000, + "step": 6500 + }, + { + "epoch": 0.044058567625697244, + "grad_norm": 0.6580634713172913, + "learning_rate": 0.0005413793339300004, + "loss": 13.1733, + "num_input_tokens_seen": 1717043200, + "step": 6550 + }, + { + "epoch": 0.044394892569404855, + "grad_norm": 0.7573990821838379, + "learning_rate": 0.000540424086560387, + "loss": 13.1998, + "num_input_tokens_seen": 1730150400, + "step": 6600 + }, + { + "epoch": 0.044731217513112466, + "grad_norm": 0.6535853147506714, + "learning_rate": 0.000539461976063783, + "loss": 13.1668, + "num_input_tokens_seen": 1743257600, + "step": 6650 + }, + { + "epoch": 0.045067542456820084, + "grad_norm": 0.6875225305557251, + "learning_rate": 0.0005384930299045193, + "loss": 13.1695, + "num_input_tokens_seen": 1756364800, + "step": 6700 + }, + { + "epoch": 0.045403867400527695, + "grad_norm": 0.6734049320220947, + "learning_rate": 0.0005375172757420559, + "loss": 13.1982, + "num_input_tokens_seen": 1769472000, + "step": 6750 + }, + { + "epoch": 0.045740192344235306, + "grad_norm": 0.6594141721725464, + "learning_rate": 0.0005365347414301942, + "loss": 13.132, + "num_input_tokens_seen": 1782579200, + "step": 6800 + }, + { + "epoch": 0.04607651728794292, + "grad_norm": 0.606103777885437, + "learning_rate": 0.0005355454550162814, + "loss": 13.15, + "num_input_tokens_seen": 1795686400, + "step": 6850 + }, + { + "epoch": 0.046412842231650535, + "grad_norm": 0.6484935879707336, + "learning_rate": 0.0005345494447404089, + "loss": 13.1301, + "num_input_tokens_seen": 1808793600, + "step": 6900 + }, + { + "epoch": 0.046749167175358146, + "grad_norm": 0.7227681279182434, + "learning_rate": 0.0005335467390346076, + "loss": 13.1443, + "num_input_tokens_seen": 1821900800, + "step": 6950 + }, + { + "epoch": 0.04708549211906576, + "grad_norm": 0.6700535416603088, + "learning_rate": 0.0005325373665220355, + "loss": 13.0997, + "num_input_tokens_seen": 1835008000, + "step": 7000 + }, + { + "epoch": 0.04708549211906576, + "eval_loss": 3.194380044937134, + "eval_runtime": 142.7031, + "eval_samples_per_second": 35.038, + "eval_steps_per_second": 8.759, + "num_input_tokens_seen": 1835008000, + "step": 7000 + }, + { + "epoch": 0.04742181706277337, + "grad_norm": 0.6384073495864868, + "learning_rate": 0.0005315213560161604, + "loss": 13.0959, + "num_input_tokens_seen": 1848115200, + "step": 7050 + }, + { + "epoch": 0.04775814200648098, + "grad_norm": 0.6730595231056213, + "learning_rate": 0.0005304987365199383, + "loss": 13.081, + "num_input_tokens_seen": 1861222400, + "step": 7100 + }, + { + "epoch": 0.0480944669501886, + "grad_norm": 0.6358413696289062, + "learning_rate": 0.0005294695372249843, + "loss": 13.0862, + "num_input_tokens_seen": 1874329600, + "step": 7150 + }, + { + "epoch": 0.04843079189389621, + "grad_norm": 0.6400682926177979, + "learning_rate": 0.0005284337875107402, + "loss": 13.0959, + "num_input_tokens_seen": 1887436800, + "step": 7200 + }, + { + "epoch": 0.04876711683760382, + "grad_norm": 0.6422862410545349, + "learning_rate": 0.0005273915169436359, + "loss": 13.0957, + "num_input_tokens_seen": 1900544000, + "step": 7250 + }, + { + "epoch": 0.04910344178131143, + "grad_norm": 0.6517816185951233, + "learning_rate": 0.0005263427552762443, + "loss": 13.0312, + "num_input_tokens_seen": 1913651200, + "step": 7300 + }, + { + "epoch": 0.04943976672501905, + "grad_norm": 0.6352054476737976, + "learning_rate": 0.0005252875324464333, + "loss": 13.0642, + "num_input_tokens_seen": 1926758400, + "step": 7350 + }, + { + "epoch": 0.04977609166872666, + "grad_norm": 0.6357077956199646, + "learning_rate": 0.0005242258785765105, + "loss": 13.0704, + "num_input_tokens_seen": 1939865600, + "step": 7400 + }, + { + "epoch": 0.05011241661243427, + "grad_norm": 0.6192994713783264, + "learning_rate": 0.0005231578239723635, + "loss": 13.0549, + "num_input_tokens_seen": 1952972800, + "step": 7450 + }, + { + "epoch": 0.05044874155614188, + "grad_norm": 0.6180127859115601, + "learning_rate": 0.0005220833991225946, + "loss": 13.1213, + "num_input_tokens_seen": 1966080000, + "step": 7500 + }, + { + "epoch": 0.05044874155614188, + "eval_loss": 3.1755564212799072, + "eval_runtime": 142.155, + "eval_samples_per_second": 35.173, + "eval_steps_per_second": 8.793, + "num_input_tokens_seen": 1966080000, + "step": 7500 + }, + { + "epoch": 0.05078506649984949, + "grad_norm": 0.663218080997467, + "learning_rate": 0.0005210026346976507, + "loss": 13.0441, + "num_input_tokens_seen": 1979187200, + "step": 7550 + }, + { + "epoch": 0.05112139144355711, + "grad_norm": 0.6263464093208313, + "learning_rate": 0.0005199155615489478, + "loss": 13.0148, + "num_input_tokens_seen": 1992294400, + "step": 7600 + }, + { + "epoch": 0.05145771638726472, + "grad_norm": 0.6272994875907898, + "learning_rate": 0.0005188222107079903, + "loss": 13.0467, + "num_input_tokens_seen": 2005401600, + "step": 7650 + }, + { + "epoch": 0.05179404133097233, + "grad_norm": 0.6265645623207092, + "learning_rate": 0.0005177226133854845, + "loss": 13.0346, + "num_input_tokens_seen": 2018508800, + "step": 7700 + }, + { + "epoch": 0.05213036627467994, + "grad_norm": 0.6151268482208252, + "learning_rate": 0.0005166168009704493, + "loss": 13.0065, + "num_input_tokens_seen": 2031616000, + "step": 7750 + }, + { + "epoch": 0.052466691218387554, + "grad_norm": 0.6234976649284363, + "learning_rate": 0.0005155048050293182, + "loss": 13.0419, + "num_input_tokens_seen": 2044723200, + "step": 7800 + }, + { + "epoch": 0.05280301616209517, + "grad_norm": 0.6200417280197144, + "learning_rate": 0.0005143866573050397, + "loss": 12.9675, + "num_input_tokens_seen": 2057830400, + "step": 7850 + }, + { + "epoch": 0.05313934110580278, + "grad_norm": 0.6281518340110779, + "learning_rate": 0.0005132623897161705, + "loss": 12.9652, + "num_input_tokens_seen": 2070937600, + "step": 7900 + }, + { + "epoch": 0.053475666049510394, + "grad_norm": 0.6501129269599915, + "learning_rate": 0.0005121320343559641, + "loss": 13.0074, + "num_input_tokens_seen": 2084044800, + "step": 7950 + }, + { + "epoch": 0.053811990993218005, + "grad_norm": 0.6317852139472961, + "learning_rate": 0.0005109956234914558, + "loss": 12.977, + "num_input_tokens_seen": 2097152000, + "step": 8000 + }, + { + "epoch": 0.053811990993218005, + "eval_loss": 3.1572272777557373, + "eval_runtime": 142.9574, + "eval_samples_per_second": 34.975, + "eval_steps_per_second": 8.744, + "num_input_tokens_seen": 2097152000, + "step": 8000 + }, + { + "epoch": 0.05414831593692562, + "grad_norm": 0.6210319995880127, + "learning_rate": 0.0005098531895625401, + "loss": 12.9927, + "num_input_tokens_seen": 2110259200, + "step": 8050 + }, + { + "epoch": 0.054484640880633234, + "grad_norm": 0.6362951397895813, + "learning_rate": 0.0005087047651810459, + "loss": 12.9658, + "num_input_tokens_seen": 2123366400, + "step": 8100 + }, + { + "epoch": 0.054820965824340845, + "grad_norm": 0.6348525285720825, + "learning_rate": 0.0005075503831298047, + "loss": 12.9523, + "num_input_tokens_seen": 2136473600, + "step": 8150 + }, + { + "epoch": 0.055157290768048456, + "grad_norm": 0.6554312705993652, + "learning_rate": 0.0005063900763617156, + "loss": 12.9581, + "num_input_tokens_seen": 2149580800, + "step": 8200 + }, + { + "epoch": 0.05549361571175607, + "grad_norm": 0.6416252851486206, + "learning_rate": 0.0005052238779988038, + "loss": 12.9369, + "num_input_tokens_seen": 2162688000, + "step": 8250 + }, + { + "epoch": 0.055829940655463685, + "grad_norm": 0.653998076915741, + "learning_rate": 0.0005040518213312757, + "loss": 12.9279, + "num_input_tokens_seen": 2175795200, + "step": 8300 + }, + { + "epoch": 0.056166265599171296, + "grad_norm": 0.6076102256774902, + "learning_rate": 0.0005028739398165686, + "loss": 12.9306, + "num_input_tokens_seen": 2188902400, + "step": 8350 + }, + { + "epoch": 0.05650259054287891, + "grad_norm": 0.6263251304626465, + "learning_rate": 0.0005016902670783949, + "loss": 12.9367, + "num_input_tokens_seen": 2202009600, + "step": 8400 + }, + { + "epoch": 0.05683891548658652, + "grad_norm": 0.6503254175186157, + "learning_rate": 0.0005005008369057835, + "loss": 12.8458, + "num_input_tokens_seen": 2215116800, + "step": 8450 + }, + { + "epoch": 0.057175240430294136, + "grad_norm": 0.6300747394561768, + "learning_rate": 0.0004993056832521138, + "loss": 12.8892, + "num_input_tokens_seen": 2228224000, + "step": 8500 + }, + { + "epoch": 0.057175240430294136, + "eval_loss": 3.1433920860290527, + "eval_runtime": 145.793, + "eval_samples_per_second": 34.295, + "eval_steps_per_second": 8.574, + "num_input_tokens_seen": 2228224000, + "step": 8500 + }, + { + "epoch": 0.05751156537400175, + "grad_norm": 0.6060501337051392, + "learning_rate": 0.0004981048402341477, + "loss": 12.9441, + "num_input_tokens_seen": 2241331200, + "step": 8550 + }, + { + "epoch": 0.05784789031770936, + "grad_norm": 0.650665819644928, + "learning_rate": 0.0004968983421310554, + "loss": 12.8715, + "num_input_tokens_seen": 2254438400, + "step": 8600 + }, + { + "epoch": 0.05818421526141697, + "grad_norm": 0.6535419821739197, + "learning_rate": 0.0004956862233834363, + "loss": 12.8842, + "num_input_tokens_seen": 2267545600, + "step": 8650 + }, + { + "epoch": 0.05852054020512458, + "grad_norm": 0.6426728367805481, + "learning_rate": 0.0004944685185923365, + "loss": 12.9156, + "num_input_tokens_seen": 2280652800, + "step": 8700 + }, + { + "epoch": 0.0588568651488322, + "grad_norm": 0.6501869559288025, + "learning_rate": 0.0004932452625182607, + "loss": 12.8779, + "num_input_tokens_seen": 2293760000, + "step": 8750 + }, + { + "epoch": 0.05919319009253981, + "grad_norm": 0.6345402002334595, + "learning_rate": 0.0004920164900801805, + "loss": 12.8629, + "num_input_tokens_seen": 2306867200, + "step": 8800 + }, + { + "epoch": 0.05952951503624742, + "grad_norm": 0.6712388396263123, + "learning_rate": 0.0004907822363545365, + "loss": 12.876, + "num_input_tokens_seen": 2319974400, + "step": 8850 + }, + { + "epoch": 0.05986583997995503, + "grad_norm": 0.6922229528427124, + "learning_rate": 0.0004895425365742384, + "loss": 12.8556, + "num_input_tokens_seen": 2333081600, + "step": 8900 + }, + { + "epoch": 0.06020216492366265, + "grad_norm": 0.6531935334205627, + "learning_rate": 0.0004882974261276581, + "loss": 12.8296, + "num_input_tokens_seen": 2346188800, + "step": 8950 + }, + { + "epoch": 0.06053848986737026, + "grad_norm": 0.6729333400726318, + "learning_rate": 0.00048704694055762005, + "loss": 12.8258, + "num_input_tokens_seen": 2359296000, + "step": 9000 + }, + { + "epoch": 0.06053848986737026, + "eval_loss": 3.1286280155181885, + "eval_runtime": 143.0292, + "eval_samples_per_second": 34.958, + "eval_steps_per_second": 8.739, + "num_input_tokens_seen": 2359296000, + "step": 9000 + }, + { + "epoch": 0.06087481481107787, + "grad_norm": 0.6235183477401733, + "learning_rate": 0.0004857911155603867, + "loss": 12.8588, + "num_input_tokens_seen": 2372403200, + "step": 9050 + }, + { + "epoch": 0.06121113975478548, + "grad_norm": 0.642000138759613, + "learning_rate": 0.0004845299869846392, + "loss": 12.8232, + "num_input_tokens_seen": 2385510400, + "step": 9100 + }, + { + "epoch": 0.06154746469849309, + "grad_norm": 0.6252527236938477, + "learning_rate": 0.0004832635908304543, + "loss": 12.8595, + "num_input_tokens_seen": 2398617600, + "step": 9150 + }, + { + "epoch": 0.06188378964220071, + "grad_norm": 0.6228143572807312, + "learning_rate": 0.0004819919632482766, + "loss": 12.8152, + "num_input_tokens_seen": 2411724800, + "step": 9200 + }, + { + "epoch": 0.06222011458590832, + "grad_norm": 0.661567211151123, + "learning_rate": 0.00048071514053788666, + "loss": 12.8356, + "num_input_tokens_seen": 2424832000, + "step": 9250 + }, + { + "epoch": 0.06255643952961594, + "grad_norm": 0.6318378448486328, + "learning_rate": 0.00047943315914736475, + "loss": 12.831, + "num_input_tokens_seen": 2437939200, + "step": 9300 + }, + { + "epoch": 0.06289276447332355, + "grad_norm": 0.6098783612251282, + "learning_rate": 0.0004781460556720504, + "loss": 12.8363, + "num_input_tokens_seen": 2451046400, + "step": 9350 + }, + { + "epoch": 0.06322908941703116, + "grad_norm": 0.643997073173523, + "learning_rate": 0.00047685386685349796, + "loss": 12.8267, + "num_input_tokens_seen": 2464153600, + "step": 9400 + }, + { + "epoch": 0.06356541436073877, + "grad_norm": 0.6287397146224976, + "learning_rate": 0.000475556629578427, + "loss": 12.8131, + "num_input_tokens_seen": 2477260800, + "step": 9450 + }, + { + "epoch": 0.06390173930444638, + "grad_norm": 0.6625655889511108, + "learning_rate": 0.0004742543808776708, + "loss": 12.8312, + "num_input_tokens_seen": 2490368000, + "step": 9500 + }, + { + "epoch": 0.06390173930444638, + "eval_loss": 3.1130659580230713, + "eval_runtime": 143.1847, + "eval_samples_per_second": 34.92, + "eval_steps_per_second": 8.73, + "num_input_tokens_seen": 2490368000, + "step": 9500 + }, + { + "epoch": 0.064238064248154, + "grad_norm": 0.6380253434181213, + "learning_rate": 0.0004729471579251177, + "loss": 12.8645, + "num_input_tokens_seen": 2503475200, + "step": 9550 + }, + { + "epoch": 0.0645743891918616, + "grad_norm": 0.6287338137626648, + "learning_rate": 0.00047163499803665085, + "loss": 12.7931, + "num_input_tokens_seen": 2516582400, + "step": 9600 + }, + { + "epoch": 0.06491071413556922, + "grad_norm": 0.6729796528816223, + "learning_rate": 0.00047031793866908294, + "loss": 12.7903, + "num_input_tokens_seen": 2529689600, + "step": 9650 + }, + { + "epoch": 0.06524703907927683, + "grad_norm": 0.6398154497146606, + "learning_rate": 0.0004689960174190865, + "loss": 12.7746, + "num_input_tokens_seen": 2542796800, + "step": 9700 + }, + { + "epoch": 0.06558336402298445, + "grad_norm": 0.6751012206077576, + "learning_rate": 0.00046766927202212145, + "loss": 12.7655, + "num_input_tokens_seen": 2555904000, + "step": 9750 + }, + { + "epoch": 0.06591968896669206, + "grad_norm": 0.6046076416969299, + "learning_rate": 0.0004663377403513568, + "loss": 12.8018, + "num_input_tokens_seen": 2569011200, + "step": 9800 + }, + { + "epoch": 0.06625601391039967, + "grad_norm": 0.6526479721069336, + "learning_rate": 0.0004650014604165907, + "loss": 12.7394, + "num_input_tokens_seen": 2582118400, + "step": 9850 + }, + { + "epoch": 0.06659233885410729, + "grad_norm": 0.6187541484832764, + "learning_rate": 0.00046366047036316456, + "loss": 12.7346, + "num_input_tokens_seen": 2595225600, + "step": 9900 + }, + { + "epoch": 0.0669286637978149, + "grad_norm": 0.6106886267662048, + "learning_rate": 0.0004623148084708745, + "loss": 12.7597, + "num_input_tokens_seen": 2608332800, + "step": 9950 + }, + { + "epoch": 0.06726498874152251, + "grad_norm": 0.6358317136764526, + "learning_rate": 0.0004609645131528788, + "loss": 12.7303, + "num_input_tokens_seen": 2621440000, + "step": 10000 + }, + { + "epoch": 0.06726498874152251, + "eval_loss": 3.1026949882507324, + "eval_runtime": 142.2834, + "eval_samples_per_second": 35.141, + "eval_steps_per_second": 8.785, + "num_input_tokens_seen": 2621440000, + "step": 10000 + }, + { + "epoch": 0.06760131368523012, + "grad_norm": 0.6313095688819885, + "learning_rate": 0.0004596096229546009, + "loss": 12.7336, + "num_input_tokens_seen": 2634547200, + "step": 10050 + }, + { + "epoch": 0.06793763862893773, + "grad_norm": 0.6490457057952881, + "learning_rate": 0.00045825017655262934, + "loss": 12.7727, + "num_input_tokens_seen": 2647654400, + "step": 10100 + }, + { + "epoch": 0.06827396357264534, + "grad_norm": 0.6609966158866882, + "learning_rate": 0.000456886212753614, + "loss": 12.759, + "num_input_tokens_seen": 2660761600, + "step": 10150 + }, + { + "epoch": 0.06861028851635297, + "grad_norm": 0.6392827033996582, + "learning_rate": 0.00045551777049315757, + "loss": 12.7189, + "num_input_tokens_seen": 2673868800, + "step": 10200 + }, + { + "epoch": 0.06894661346006058, + "grad_norm": 0.6272814273834229, + "learning_rate": 0.0004541448888347047, + "loss": 12.6948, + "num_input_tokens_seen": 2686976000, + "step": 10250 + }, + { + "epoch": 0.06928293840376819, + "grad_norm": 0.6286495327949524, + "learning_rate": 0.00045276760696842693, + "loss": 12.7224, + "num_input_tokens_seen": 2700083200, + "step": 10300 + }, + { + "epoch": 0.0696192633474758, + "grad_norm": 0.6213704943656921, + "learning_rate": 0.00045138596421010374, + "loss": 12.778, + "num_input_tokens_seen": 2713190400, + "step": 10350 + }, + { + "epoch": 0.06995558829118341, + "grad_norm": 0.6061195731163025, + "learning_rate": 0.00045, + "loss": 12.7403, + "num_input_tokens_seen": 2726297600, + "step": 10400 + }, + { + "epoch": 0.07029191323489102, + "grad_norm": 0.6419244408607483, + "learning_rate": 0.0004486097539017407, + "loss": 12.7137, + "num_input_tokens_seen": 2739404800, + "step": 10450 + }, + { + "epoch": 0.07062823817859863, + "grad_norm": 0.6618810892105103, + "learning_rate": 0.00044721526560118134, + "loss": 12.6896, + "num_input_tokens_seen": 2752512000, + "step": 10500 + }, + { + "epoch": 0.07062823817859863, + "eval_loss": 3.0883917808532715, + "eval_runtime": 142.4547, + "eval_samples_per_second": 35.099, + "eval_steps_per_second": 8.775, + "num_input_tokens_seen": 2752512000, + "step": 10500 + }, + { + "epoch": 0.07096456312230624, + "grad_norm": 0.6856646537780762, + "learning_rate": 0.00044581657490527473, + "loss": 12.6825, + "num_input_tokens_seen": 2765619200, + "step": 10550 + }, + { + "epoch": 0.07130088806601385, + "grad_norm": 0.6331352591514587, + "learning_rate": 0.00044441372174093487, + "loss": 12.675, + "num_input_tokens_seen": 2778726400, + "step": 10600 + }, + { + "epoch": 0.07163721300972148, + "grad_norm": 0.6496602296829224, + "learning_rate": 0.0004430067461538976, + "loss": 12.6842, + "num_input_tokens_seen": 2791833600, + "step": 10650 + }, + { + "epoch": 0.07197353795342909, + "grad_norm": 0.6698866486549377, + "learning_rate": 0.00044159568830757687, + "loss": 12.6498, + "num_input_tokens_seen": 2804940800, + "step": 10700 + }, + { + "epoch": 0.0723098628971367, + "grad_norm": 0.6556456089019775, + "learning_rate": 0.00044018058848191855, + "loss": 12.7073, + "num_input_tokens_seen": 2818048000, + "step": 10750 + }, + { + "epoch": 0.07264618784084431, + "grad_norm": 0.6554015278816223, + "learning_rate": 0.0004387614870722506, + "loss": 12.6515, + "num_input_tokens_seen": 2831155200, + "step": 10800 + }, + { + "epoch": 0.07298251278455192, + "grad_norm": 0.6356109380722046, + "learning_rate": 0.0004373384245881296, + "loss": 12.6759, + "num_input_tokens_seen": 2844262400, + "step": 10850 + }, + { + "epoch": 0.07331883772825953, + "grad_norm": 0.6429396271705627, + "learning_rate": 0.0004359114416521851, + "loss": 12.6469, + "num_input_tokens_seen": 2857369600, + "step": 10900 + }, + { + "epoch": 0.07365516267196714, + "grad_norm": 0.6229676604270935, + "learning_rate": 0.0004344805789989591, + "loss": 12.6783, + "num_input_tokens_seen": 2870476800, + "step": 10950 + }, + { + "epoch": 0.07399148761567476, + "grad_norm": 0.6383066177368164, + "learning_rate": 0.000433045877473744, + "loss": 12.6273, + "num_input_tokens_seen": 2883584000, + "step": 11000 + }, + { + "epoch": 0.07399148761567476, + "eval_loss": 3.076796054840088, + "eval_runtime": 143.2443, + "eval_samples_per_second": 34.905, + "eval_steps_per_second": 8.726, + "num_input_tokens_seen": 2883584000, + "step": 11000 + }, + { + "epoch": 0.07432781255938237, + "grad_norm": 0.612218976020813, + "learning_rate": 0.0004316073780314163, + "loss": 12.6729, + "num_input_tokens_seen": 2896691200, + "step": 11050 + }, + { + "epoch": 0.07466413750308999, + "grad_norm": 0.6343071460723877, + "learning_rate": 0.00043016512173526736, + "loss": 12.6507, + "num_input_tokens_seen": 2909798400, + "step": 11100 + }, + { + "epoch": 0.0750004624467976, + "grad_norm": 0.6494725942611694, + "learning_rate": 0.0004287191497558317, + "loss": 12.6271, + "num_input_tokens_seen": 2922905600, + "step": 11150 + }, + { + "epoch": 0.07533678739050521, + "grad_norm": 0.6436727046966553, + "learning_rate": 0.0004272695033697111, + "loss": 12.6529, + "num_input_tokens_seen": 2936012800, + "step": 11200 + }, + { + "epoch": 0.07567311233421282, + "grad_norm": 0.6481876373291016, + "learning_rate": 0.00042581622395839705, + "loss": 12.6528, + "num_input_tokens_seen": 2949120000, + "step": 11250 + }, + { + "epoch": 0.07600943727792044, + "grad_norm": 0.6492651104927063, + "learning_rate": 0.0004243593530070886, + "loss": 12.6312, + "num_input_tokens_seen": 2962227200, + "step": 11300 + }, + { + "epoch": 0.07634576222162805, + "grad_norm": 0.6570179462432861, + "learning_rate": 0.00042289893210350907, + "loss": 12.6428, + "num_input_tokens_seen": 2975334400, + "step": 11350 + }, + { + "epoch": 0.07668208716533566, + "grad_norm": 0.6505069732666016, + "learning_rate": 0.0004214350029367181, + "loss": 12.6549, + "num_input_tokens_seen": 2988441600, + "step": 11400 + }, + { + "epoch": 0.07701841210904327, + "grad_norm": 0.6301828026771545, + "learning_rate": 0.0004199676072959222, + "loss": 12.5838, + "num_input_tokens_seen": 3001548800, + "step": 11450 + }, + { + "epoch": 0.07735473705275088, + "grad_norm": 0.625487208366394, + "learning_rate": 0.0004184967870692816, + "loss": 12.6166, + "num_input_tokens_seen": 3014656000, + "step": 11500 + }, + { + "epoch": 0.07735473705275088, + "eval_loss": 3.0652644634246826, + "eval_runtime": 142.6807, + "eval_samples_per_second": 35.043, + "eval_steps_per_second": 8.761, + "num_input_tokens_seen": 3014656000, + "step": 11500 + }, + { + "epoch": 0.0776910619964585, + "grad_norm": 0.6678441762924194, + "learning_rate": 0.000417022584242714, + "loss": 12.6271, + "num_input_tokens_seen": 3027763200, + "step": 11550 + }, + { + "epoch": 0.07802738694016612, + "grad_norm": 0.6448168754577637, + "learning_rate": 0.00041554504089869716, + "loss": 12.6012, + "num_input_tokens_seen": 3040870400, + "step": 11600 + }, + { + "epoch": 0.07836371188387373, + "grad_norm": 0.6791290640830994, + "learning_rate": 0.0004140641992150667, + "loss": 12.5798, + "num_input_tokens_seen": 3053977600, + "step": 11650 + }, + { + "epoch": 0.07870003682758134, + "grad_norm": 0.8807069659233093, + "learning_rate": 0.00041258010146381224, + "loss": 12.6015, + "num_input_tokens_seen": 3067084800, + "step": 11700 + }, + { + "epoch": 0.07903636177128895, + "grad_norm": 0.6284939646720886, + "learning_rate": 0.00041109279000987105, + "loss": 12.6183, + "num_input_tokens_seen": 3080192000, + "step": 11750 + }, + { + "epoch": 0.07937268671499656, + "grad_norm": 0.6453195810317993, + "learning_rate": 0.0004096023073099185, + "loss": 12.6, + "num_input_tokens_seen": 3093299200, + "step": 11800 + }, + { + "epoch": 0.07970901165870417, + "grad_norm": 0.6511227488517761, + "learning_rate": 0.00040810869591115603, + "loss": 12.5952, + "num_input_tokens_seen": 3106406400, + "step": 11850 + }, + { + "epoch": 0.08004533660241178, + "grad_norm": 0.6701833009719849, + "learning_rate": 0.0004066119984500966, + "loss": 12.5674, + "num_input_tokens_seen": 3119513600, + "step": 11900 + }, + { + "epoch": 0.08038166154611939, + "grad_norm": 0.6320140957832336, + "learning_rate": 0.0004051122576513479, + "loss": 12.5772, + "num_input_tokens_seen": 3132620800, + "step": 11950 + }, + { + "epoch": 0.08071798648982702, + "grad_norm": 0.6579756736755371, + "learning_rate": 0.00040360951632639226, + "loss": 12.57, + "num_input_tokens_seen": 3145728000, + "step": 12000 + }, + { + "epoch": 0.08071798648982702, + "eval_loss": 3.0548510551452637, + "eval_runtime": 142.805, + "eval_samples_per_second": 35.013, + "eval_steps_per_second": 8.753, + "num_input_tokens_seen": 3145728000, + "step": 12000 + }, + { + "epoch": 0.08105431143353463, + "grad_norm": 0.6717228293418884, + "learning_rate": 0.0004021038173723649, + "loss": 12.5689, + "num_input_tokens_seen": 3158835200, + "step": 12050 + }, + { + "epoch": 0.08139063637724224, + "grad_norm": 0.6350929141044617, + "learning_rate": 0.0004005952037708293, + "loss": 12.5709, + "num_input_tokens_seen": 3171942400, + "step": 12100 + }, + { + "epoch": 0.08172696132094985, + "grad_norm": 0.6500872373580933, + "learning_rate": 0.00039908371858655013, + "loss": 12.576, + "num_input_tokens_seen": 3185049600, + "step": 12150 + }, + { + "epoch": 0.08206328626465746, + "grad_norm": 0.6404949426651001, + "learning_rate": 0.00039756940496626415, + "loss": 12.5173, + "num_input_tokens_seen": 3198156800, + "step": 12200 + }, + { + "epoch": 0.08239961120836507, + "grad_norm": 0.6140453219413757, + "learning_rate": 0.0003960523061374484, + "loss": 12.5427, + "num_input_tokens_seen": 3211264000, + "step": 12250 + }, + { + "epoch": 0.08273593615207268, + "grad_norm": 0.6440966725349426, + "learning_rate": 0.00039453246540708625, + "loss": 12.5706, + "num_input_tokens_seen": 3224371200, + "step": 12300 + }, + { + "epoch": 0.0830722610957803, + "grad_norm": 0.6301671862602234, + "learning_rate": 0.00039300992616043105, + "loss": 12.5483, + "num_input_tokens_seen": 3237478400, + "step": 12350 + }, + { + "epoch": 0.0834085860394879, + "grad_norm": 0.628695547580719, + "learning_rate": 0.00039148473185976815, + "loss": 12.5334, + "num_input_tokens_seen": 3250585600, + "step": 12400 + }, + { + "epoch": 0.08374491098319553, + "grad_norm": 0.6627179980278015, + "learning_rate": 0.0003899569260431734, + "loss": 12.565, + "num_input_tokens_seen": 3263692800, + "step": 12450 + }, + { + "epoch": 0.08408123592690314, + "grad_norm": 0.6234163045883179, + "learning_rate": 0.00038842655232327125, + "loss": 12.5742, + "num_input_tokens_seen": 3276800000, + "step": 12500 + }, + { + "epoch": 0.08408123592690314, + "eval_loss": 3.0441489219665527, + "eval_runtime": 141.8038, + "eval_samples_per_second": 35.26, + "eval_steps_per_second": 8.815, + "num_input_tokens_seen": 3276800000, + "step": 12500 + }, + { + "epoch": 0.08441756087061075, + "grad_norm": 0.6204286217689514, + "learning_rate": 0.0003868936543859888, + "loss": 12.5493, + "num_input_tokens_seen": 3289907200, + "step": 12550 + }, + { + "epoch": 0.08475388581431836, + "grad_norm": 0.6237512230873108, + "learning_rate": 0.00038535827598930967, + "loss": 12.5179, + "num_input_tokens_seen": 3303014400, + "step": 12600 + }, + { + "epoch": 0.08509021075802597, + "grad_norm": 0.6418094635009766, + "learning_rate": 0.00038382046096202435, + "loss": 12.5096, + "num_input_tokens_seen": 3316121600, + "step": 12650 + }, + { + "epoch": 0.08542653570173359, + "grad_norm": 0.6306421160697937, + "learning_rate": 0.0003822802532024791, + "loss": 12.5202, + "num_input_tokens_seen": 3329228800, + "step": 12700 + }, + { + "epoch": 0.0857628606454412, + "grad_norm": 0.6436113715171814, + "learning_rate": 0.000380737696677323, + "loss": 12.4871, + "num_input_tokens_seen": 3342336000, + "step": 12750 + }, + { + "epoch": 0.08609918558914881, + "grad_norm": 1.0079458951950073, + "learning_rate": 0.00037919283542025287, + "loss": 12.4992, + "num_input_tokens_seen": 3355443200, + "step": 12800 + }, + { + "epoch": 0.08643551053285642, + "grad_norm": 0.6185023188591003, + "learning_rate": 0.0003776457135307562, + "loss": 12.4876, + "num_input_tokens_seen": 3368550400, + "step": 12850 + }, + { + "epoch": 0.08677183547656404, + "grad_norm": 0.6664910912513733, + "learning_rate": 0.0003760963751728521, + "loss": 12.4876, + "num_input_tokens_seen": 3381657600, + "step": 12900 + }, + { + "epoch": 0.08710816042027165, + "grad_norm": 0.6112196445465088, + "learning_rate": 0.00037454486457383124, + "loss": 12.4972, + "num_input_tokens_seen": 3394764800, + "step": 12950 + }, + { + "epoch": 0.08744448536397927, + "grad_norm": 0.6308513879776001, + "learning_rate": 0.00037299122602299257, + "loss": 12.4583, + "num_input_tokens_seen": 3407872000, + "step": 13000 + }, + { + "epoch": 0.08744448536397927, + "eval_loss": 3.034710645675659, + "eval_runtime": 182.8724, + "eval_samples_per_second": 27.341, + "eval_steps_per_second": 6.835, + "num_input_tokens_seen": 3407872000, + "step": 13000 + }, + { + "epoch": 0.08778081030768688, + "grad_norm": 0.6481872200965881, + "learning_rate": 0.00037143550387037943, + "loss": 12.4646, + "num_input_tokens_seen": 3420979200, + "step": 13050 + }, + { + "epoch": 0.08811713525139449, + "grad_norm": 0.6672606468200684, + "learning_rate": 0.0003698777425255136, + "loss": 12.4237, + "num_input_tokens_seen": 3434086400, + "step": 13100 + }, + { + "epoch": 0.0884534601951021, + "grad_norm": 0.6188272833824158, + "learning_rate": 0.00036831798645612735, + "loss": 12.4983, + "num_input_tokens_seen": 3447193600, + "step": 13150 + }, + { + "epoch": 0.08878978513880971, + "grad_norm": 0.6584819555282593, + "learning_rate": 0.0003667562801868943, + "loss": 12.4316, + "num_input_tokens_seen": 3460300800, + "step": 13200 + }, + { + "epoch": 0.08912611008251732, + "grad_norm": 0.6392587423324585, + "learning_rate": 0.0003651926682981584, + "loss": 12.4541, + "num_input_tokens_seen": 3473408000, + "step": 13250 + }, + { + "epoch": 0.08946243502622493, + "grad_norm": 0.6473196148872375, + "learning_rate": 0.00036362719542466104, + "loss": 12.4921, + "num_input_tokens_seen": 3486515200, + "step": 13300 + }, + { + "epoch": 0.08979875996993256, + "grad_norm": 0.6527711153030396, + "learning_rate": 0.00036205990625426724, + "loss": 12.4578, + "num_input_tokens_seen": 3499622400, + "step": 13350 + }, + { + "epoch": 0.09013508491364017, + "grad_norm": 0.6588818430900574, + "learning_rate": 0.00036049084552669, + "loss": 12.4449, + "num_input_tokens_seen": 3512729600, + "step": 13400 + }, + { + "epoch": 0.09047140985734778, + "grad_norm": 0.6333611011505127, + "learning_rate": 0.00035892005803221286, + "loss": 12.4364, + "num_input_tokens_seen": 3525836800, + "step": 13450 + }, + { + "epoch": 0.09080773480105539, + "grad_norm": 0.6385447978973389, + "learning_rate": 0.0003573475886104117, + "loss": 12.4483, + "num_input_tokens_seen": 3538944000, + "step": 13500 + }, + { + "epoch": 0.09080773480105539, + "eval_loss": 3.0267038345336914, + "eval_runtime": 143.0402, + "eval_samples_per_second": 34.955, + "eval_steps_per_second": 8.739, + "num_input_tokens_seen": 3538944000, + "step": 13500 + }, + { + "epoch": 0.091144059744763, + "grad_norm": 0.652103066444397, + "learning_rate": 0.0003557734821488744, + "loss": 12.3973, + "num_input_tokens_seen": 3552051200, + "step": 13550 + }, + { + "epoch": 0.09148038468847061, + "grad_norm": 0.629550576210022, + "learning_rate": 0.00035419778358191967, + "loss": 12.4529, + "num_input_tokens_seen": 3565158400, + "step": 13600 + }, + { + "epoch": 0.09181670963217822, + "grad_norm": 0.646165132522583, + "learning_rate": 0.00035262053788931446, + "loss": 12.4602, + "num_input_tokens_seen": 3578265600, + "step": 13650 + }, + { + "epoch": 0.09215303457588583, + "grad_norm": 0.6328135132789612, + "learning_rate": 0.0003510417900949898, + "loss": 12.4859, + "num_input_tokens_seen": 3591372800, + "step": 13700 + }, + { + "epoch": 0.09248935951959344, + "grad_norm": 0.6435760259628296, + "learning_rate": 0.0003494615852657555, + "loss": 12.4747, + "num_input_tokens_seen": 3604480000, + "step": 13750 + }, + { + "epoch": 0.09282568446330107, + "grad_norm": 0.6149182915687561, + "learning_rate": 0.0003478799685100137, + "loss": 12.4353, + "num_input_tokens_seen": 3617587200, + "step": 13800 + }, + { + "epoch": 0.09316200940700868, + "grad_norm": 0.6365089416503906, + "learning_rate": 0.00034629698497647176, + "loss": 12.4255, + "num_input_tokens_seen": 3630694400, + "step": 13850 + }, + { + "epoch": 0.09349833435071629, + "grad_norm": 0.6469732522964478, + "learning_rate": 0.0003447126798528523, + "loss": 12.4259, + "num_input_tokens_seen": 3643801600, + "step": 13900 + }, + { + "epoch": 0.0938346592944239, + "grad_norm": 0.6317386031150818, + "learning_rate": 0.00034312709836460453, + "loss": 12.4626, + "num_input_tokens_seen": 3656908800, + "step": 13950 + }, + { + "epoch": 0.09417098423813151, + "grad_norm": 0.6267306208610535, + "learning_rate": 0.00034154028577361217, + "loss": 12.3991, + "num_input_tokens_seen": 3670016000, + "step": 14000 + }, + { + "epoch": 0.09417098423813151, + "eval_loss": 3.016310691833496, + "eval_runtime": 142.725, + "eval_samples_per_second": 35.032, + "eval_steps_per_second": 8.758, + "num_input_tokens_seen": 3670016000, + "step": 14000 + }, + { + "epoch": 0.09450730918183912, + "grad_norm": 0.6656507849693298, + "learning_rate": 0.0003399522873769023, + "loss": 12.4213, + "num_input_tokens_seen": 3683123200, + "step": 14050 + }, + { + "epoch": 0.09484363412554674, + "grad_norm": 0.6371810436248779, + "learning_rate": 0.0003383631485053518, + "loss": 12.4092, + "num_input_tokens_seen": 3696230400, + "step": 14100 + }, + { + "epoch": 0.09517995906925435, + "grad_norm": 0.6278609037399292, + "learning_rate": 0.0003367729145223933, + "loss": 12.3764, + "num_input_tokens_seen": 3709337600, + "step": 14150 + }, + { + "epoch": 0.09551628401296196, + "grad_norm": 0.6190541982650757, + "learning_rate": 0.00033518163082272055, + "loss": 12.4095, + "num_input_tokens_seen": 3722444800, + "step": 14200 + }, + { + "epoch": 0.09585260895666958, + "grad_norm": 0.6580514907836914, + "learning_rate": 0.00033358934283099235, + "loss": 12.3431, + "num_input_tokens_seen": 3735552000, + "step": 14250 + }, + { + "epoch": 0.0961889339003772, + "grad_norm": 0.6620698571205139, + "learning_rate": 0.000331996096000536, + "loss": 12.3971, + "num_input_tokens_seen": 3748659200, + "step": 14300 + }, + { + "epoch": 0.0965252588440848, + "grad_norm": 0.61739182472229, + "learning_rate": 0.00033040193581204973, + "loss": 12.3897, + "num_input_tokens_seen": 3761766400, + "step": 14350 + }, + { + "epoch": 0.09686158378779242, + "grad_norm": 0.6852706670761108, + "learning_rate": 0.0003288069077723045, + "loss": 12.4072, + "num_input_tokens_seen": 3774873600, + "step": 14400 + }, + { + "epoch": 0.09719790873150003, + "grad_norm": 0.6366174817085266, + "learning_rate": 0.00032721105741284466, + "loss": 12.3834, + "num_input_tokens_seen": 3787980800, + "step": 14450 + }, + { + "epoch": 0.09753423367520764, + "grad_norm": 0.685984194278717, + "learning_rate": 0.0003256144302886885, + "loss": 12.4215, + "num_input_tokens_seen": 3801088000, + "step": 14500 + }, + { + "epoch": 0.09753423367520764, + "eval_loss": 3.0072007179260254, + "eval_runtime": 142.0382, + "eval_samples_per_second": 35.202, + "eval_steps_per_second": 8.8, + "num_input_tokens_seen": 3801088000, + "step": 14500 + }, + { + "epoch": 0.09787055861891525, + "grad_norm": 0.633934736251831, + "learning_rate": 0.000324017071977028, + "loss": 12.3848, + "num_input_tokens_seen": 3814195200, + "step": 14550 + }, + { + "epoch": 0.09820688356262286, + "grad_norm": 0.6223523020744324, + "learning_rate": 0.0003224190280759273, + "loss": 12.389, + "num_input_tokens_seen": 3827302400, + "step": 14600 + }, + { + "epoch": 0.09854320850633047, + "grad_norm": 0.6419284343719482, + "learning_rate": 0.00032082034420302137, + "loss": 12.3622, + "num_input_tokens_seen": 3840409600, + "step": 14650 + }, + { + "epoch": 0.0988795334500381, + "grad_norm": 0.6162405610084534, + "learning_rate": 0.0003192210659942139, + "loss": 12.4409, + "num_input_tokens_seen": 3853516800, + "step": 14700 + }, + { + "epoch": 0.0992158583937457, + "grad_norm": 0.6561248898506165, + "learning_rate": 0.0003176212391023743, + "loss": 12.4152, + "num_input_tokens_seen": 3866624000, + "step": 14750 + }, + { + "epoch": 0.09955218333745332, + "grad_norm": 0.6575373411178589, + "learning_rate": 0.0003160209091960347, + "loss": 12.3603, + "num_input_tokens_seen": 3879731200, + "step": 14800 + }, + { + "epoch": 0.09988850828116093, + "grad_norm": 0.6060482859611511, + "learning_rate": 0.0003144201219580862, + "loss": 12.3752, + "num_input_tokens_seen": 3892838400, + "step": 14850 + }, + { + "epoch": 0.10022483322486854, + "grad_norm": 0.6433590650558472, + "learning_rate": 0.000312818923084475, + "loss": 12.3568, + "num_input_tokens_seen": 3905945600, + "step": 14900 + }, + { + "epoch": 0.10056115816857615, + "grad_norm": 0.626518189907074, + "learning_rate": 0.00031121735828289773, + "loss": 12.3327, + "num_input_tokens_seen": 3919052800, + "step": 14950 + }, + { + "epoch": 0.10089748311228376, + "grad_norm": 0.6467755436897278, + "learning_rate": 0.0003096154732714966, + "loss": 12.367, + "num_input_tokens_seen": 3932160000, + "step": 15000 + }, + { + "epoch": 0.10089748311228376, + "eval_loss": 2.9978182315826416, + "eval_runtime": 143.1987, + "eval_samples_per_second": 34.917, + "eval_steps_per_second": 8.729, + "num_input_tokens_seen": 3932160000, + "step": 15000 + }, + { + "epoch": 0.10123380805599137, + "grad_norm": 0.6437516808509827, + "learning_rate": 0.00030801331377755466, + "loss": 12.3776, + "num_input_tokens_seen": 3945267200, + "step": 15050 + }, + { + "epoch": 0.10157013299969898, + "grad_norm": 0.6743655204772949, + "learning_rate": 0.0003064109255361904, + "loss": 12.326, + "num_input_tokens_seen": 3958374400, + "step": 15100 + }, + { + "epoch": 0.1019064579434066, + "grad_norm": 0.6296969056129456, + "learning_rate": 0.00030480835428905214, + "loss": 12.3444, + "num_input_tokens_seen": 3971481600, + "step": 15150 + }, + { + "epoch": 0.10224278288711422, + "grad_norm": 0.648457407951355, + "learning_rate": 0.000303205645783012, + "loss": 12.3422, + "num_input_tokens_seen": 3984588800, + "step": 15200 + }, + { + "epoch": 0.10257910783082183, + "grad_norm": 0.6306461691856384, + "learning_rate": 0.0003016028457688604, + "loss": 12.3452, + "num_input_tokens_seen": 3997696000, + "step": 15250 + }, + { + "epoch": 0.10291543277452944, + "grad_norm": 0.6481978893280029, + "learning_rate": 0.0003, + "loss": 12.3079, + "num_input_tokens_seen": 4010803200, + "step": 15300 + }, + { + "epoch": 0.10325175771823705, + "grad_norm": 0.7946459650993347, + "learning_rate": 0.0002983971542311397, + "loss": 12.3674, + "num_input_tokens_seen": 4023910400, + "step": 15350 + }, + { + "epoch": 0.10358808266194466, + "grad_norm": 0.6375327706336975, + "learning_rate": 0.000296794354216988, + "loss": 12.3125, + "num_input_tokens_seen": 4037017600, + "step": 15400 + }, + { + "epoch": 0.10392440760565227, + "grad_norm": 0.6338579058647156, + "learning_rate": 0.0002951916457109479, + "loss": 12.3305, + "num_input_tokens_seen": 4050124800, + "step": 15450 + }, + { + "epoch": 0.10426073254935989, + "grad_norm": 0.642365038394928, + "learning_rate": 0.00029358907446380955, + "loss": 12.3038, + "num_input_tokens_seen": 4063232000, + "step": 15500 + }, + { + "epoch": 0.10426073254935989, + "eval_loss": 2.9912989139556885, + "eval_runtime": 142.6952, + "eval_samples_per_second": 35.04, + "eval_steps_per_second": 8.76, + "num_input_tokens_seen": 4063232000, + "step": 15500 + }, + { + "epoch": 0.1045970574930675, + "grad_norm": 0.6200032830238342, + "learning_rate": 0.00029198668622244534, + "loss": 12.3153, + "num_input_tokens_seen": 4076339200, + "step": 15550 + }, + { + "epoch": 0.10493338243677511, + "grad_norm": 0.6352826356887817, + "learning_rate": 0.0002903845267285034, + "loss": 12.3094, + "num_input_tokens_seen": 4089446400, + "step": 15600 + }, + { + "epoch": 0.10526970738048273, + "grad_norm": 0.6530657410621643, + "learning_rate": 0.0002887826417171023, + "loss": 12.3094, + "num_input_tokens_seen": 4102553600, + "step": 15650 + }, + { + "epoch": 0.10560603232419034, + "grad_norm": 0.6631893515586853, + "learning_rate": 0.00028718107691552496, + "loss": 12.2943, + "num_input_tokens_seen": 4115660800, + "step": 15700 + }, + { + "epoch": 0.10594235726789795, + "grad_norm": 0.6634914875030518, + "learning_rate": 0.0002855798780419138, + "loss": 12.2738, + "num_input_tokens_seen": 4128768000, + "step": 15750 + }, + { + "epoch": 0.10627868221160557, + "grad_norm": 0.6240889430046082, + "learning_rate": 0.00028397909080396527, + "loss": 12.3316, + "num_input_tokens_seen": 4141875200, + "step": 15800 + }, + { + "epoch": 0.10661500715531318, + "grad_norm": 0.6263941526412964, + "learning_rate": 0.00028237876089762574, + "loss": 12.2874, + "num_input_tokens_seen": 4154982400, + "step": 15850 + }, + { + "epoch": 0.10695133209902079, + "grad_norm": 0.629359245300293, + "learning_rate": 0.00028077893400578615, + "loss": 12.3043, + "num_input_tokens_seen": 4168089600, + "step": 15900 + }, + { + "epoch": 0.1072876570427284, + "grad_norm": 0.6163947582244873, + "learning_rate": 0.0002791796557969787, + "loss": 12.3009, + "num_input_tokens_seen": 4181196800, + "step": 15950 + }, + { + "epoch": 0.10762398198643601, + "grad_norm": 0.6394808888435364, + "learning_rate": 0.0002775809719240727, + "loss": 12.2584, + "num_input_tokens_seen": 4194304000, + "step": 16000 + }, + { + "epoch": 0.10762398198643601, + "eval_loss": 2.984212875366211, + "eval_runtime": 142.4659, + "eval_samples_per_second": 35.096, + "eval_steps_per_second": 8.774, + "num_input_tokens_seen": 4194304000, + "step": 16000 + }, + { + "epoch": 0.10796030693014362, + "grad_norm": 0.6504441499710083, + "learning_rate": 0.00027598292802297203, + "loss": 12.301, + "num_input_tokens_seen": 4207411200, + "step": 16050 + }, + { + "epoch": 0.10829663187385125, + "grad_norm": 0.6610515117645264, + "learning_rate": 0.00027438556971131137, + "loss": 12.2809, + "num_input_tokens_seen": 4220518400, + "step": 16100 + }, + { + "epoch": 0.10863295681755886, + "grad_norm": 0.6400002837181091, + "learning_rate": 0.00027278894258715535, + "loss": 12.2821, + "num_input_tokens_seen": 4233625600, + "step": 16150 + }, + { + "epoch": 0.10896928176126647, + "grad_norm": 0.6517115831375122, + "learning_rate": 0.00027119309222769546, + "loss": 12.2722, + "num_input_tokens_seen": 4246732800, + "step": 16200 + }, + { + "epoch": 0.10930560670497408, + "grad_norm": 0.6387389898300171, + "learning_rate": 0.0002695980641879502, + "loss": 12.2715, + "num_input_tokens_seen": 4259840000, + "step": 16250 + }, + { + "epoch": 0.10964193164868169, + "grad_norm": 0.6440519094467163, + "learning_rate": 0.0002680039039994639, + "loss": 12.25, + "num_input_tokens_seen": 4272947200, + "step": 16300 + }, + { + "epoch": 0.1099782565923893, + "grad_norm": 0.6389286518096924, + "learning_rate": 0.0002664106571690076, + "loss": 12.2565, + "num_input_tokens_seen": 4286054400, + "step": 16350 + }, + { + "epoch": 0.11031458153609691, + "grad_norm": 0.6398110389709473, + "learning_rate": 0.00026481836917727946, + "loss": 12.2356, + "num_input_tokens_seen": 4299161600, + "step": 16400 + }, + { + "epoch": 0.11065090647980452, + "grad_norm": 0.6471937298774719, + "learning_rate": 0.00026322708547760676, + "loss": 12.269, + "num_input_tokens_seen": 4312268800, + "step": 16450 + }, + { + "epoch": 0.11098723142351213, + "grad_norm": 0.6105075478553772, + "learning_rate": 0.00026163685149464816, + "loss": 12.2762, + "num_input_tokens_seen": 4325376000, + "step": 16500 + }, + { + "epoch": 0.11098723142351213, + "eval_loss": 2.9754507541656494, + "eval_runtime": 142.3479, + "eval_samples_per_second": 35.125, + "eval_steps_per_second": 8.781, + "num_input_tokens_seen": 4325376000, + "step": 16500 + }, + { + "epoch": 0.11132355636721976, + "grad_norm": 0.6111390590667725, + "learning_rate": 0.00026004771262309764, + "loss": 12.2253, + "num_input_tokens_seen": 4338483200, + "step": 16550 + }, + { + "epoch": 0.11165988131092737, + "grad_norm": 0.6871252059936523, + "learning_rate": 0.0002584597142263877, + "loss": 12.2595, + "num_input_tokens_seen": 4351590400, + "step": 16600 + }, + { + "epoch": 0.11199620625463498, + "grad_norm": 0.6608724594116211, + "learning_rate": 0.00025687290163539547, + "loss": 12.2838, + "num_input_tokens_seen": 4364697600, + "step": 16650 + }, + { + "epoch": 0.11233253119834259, + "grad_norm": 0.634148895740509, + "learning_rate": 0.0002552873201471476, + "loss": 12.2522, + "num_input_tokens_seen": 4377804800, + "step": 16700 + }, + { + "epoch": 0.1126688561420502, + "grad_norm": 0.6481145620346069, + "learning_rate": 0.00025370301502352825, + "loss": 12.2185, + "num_input_tokens_seen": 4390912000, + "step": 16750 + }, + { + "epoch": 0.11300518108575781, + "grad_norm": 0.6411675810813904, + "learning_rate": 0.0002521200314899863, + "loss": 12.2566, + "num_input_tokens_seen": 4404019200, + "step": 16800 + }, + { + "epoch": 0.11334150602946542, + "grad_norm": 0.6985258460044861, + "learning_rate": 0.00025053841473424447, + "loss": 12.3036, + "num_input_tokens_seen": 4417126400, + "step": 16850 + }, + { + "epoch": 0.11367783097317304, + "grad_norm": 0.6223846673965454, + "learning_rate": 0.0002489582099050102, + "loss": 12.1942, + "num_input_tokens_seen": 4430233600, + "step": 16900 + }, + { + "epoch": 0.11401415591688065, + "grad_norm": 0.637690007686615, + "learning_rate": 0.00024737946211068554, + "loss": 12.2711, + "num_input_tokens_seen": 4443340800, + "step": 16950 + }, + { + "epoch": 0.11435048086058827, + "grad_norm": 0.6184976696968079, + "learning_rate": 0.00024580221641808033, + "loss": 12.2252, + "num_input_tokens_seen": 4456448000, + "step": 17000 + }, + { + "epoch": 0.11435048086058827, + "eval_loss": 2.9684932231903076, + "eval_runtime": 142.7182, + "eval_samples_per_second": 35.034, + "eval_steps_per_second": 8.759, + "num_input_tokens_seen": 4456448000, + "step": 17000 + }, + { + "epoch": 0.11468680580429588, + "grad_norm": 0.618519127368927, + "learning_rate": 0.0002442265178511256, + "loss": 12.2066, + "num_input_tokens_seen": 4469555200, + "step": 17050 + }, + { + "epoch": 0.1150231307480035, + "grad_norm": 0.6514145731925964, + "learning_rate": 0.00024265241138958835, + "loss": 12.2228, + "num_input_tokens_seen": 4482662400, + "step": 17100 + }, + { + "epoch": 0.1153594556917111, + "grad_norm": 0.6823457479476929, + "learning_rate": 0.00024107994196778714, + "loss": 12.2507, + "num_input_tokens_seen": 4495769600, + "step": 17150 + }, + { + "epoch": 0.11569578063541872, + "grad_norm": 0.6243106722831726, + "learning_rate": 0.0002395091544733101, + "loss": 12.1857, + "num_input_tokens_seen": 4508876800, + "step": 17200 + }, + { + "epoch": 0.11603210557912633, + "grad_norm": 0.6370251774787903, + "learning_rate": 0.00023794009374573274, + "loss": 12.2309, + "num_input_tokens_seen": 4521984000, + "step": 17250 + }, + { + "epoch": 0.11636843052283394, + "grad_norm": 0.6504274010658264, + "learning_rate": 0.00023637280457533902, + "loss": 12.2132, + "num_input_tokens_seen": 4535091200, + "step": 17300 + }, + { + "epoch": 0.11670475546654155, + "grad_norm": 0.6156638860702515, + "learning_rate": 0.00023480733170184158, + "loss": 12.199, + "num_input_tokens_seen": 4548198400, + "step": 17350 + }, + { + "epoch": 0.11704108041024916, + "grad_norm": 0.6292795538902283, + "learning_rate": 0.0002332437198131057, + "loss": 12.2122, + "num_input_tokens_seen": 4561305600, + "step": 17400 + }, + { + "epoch": 0.11737740535395678, + "grad_norm": 0.6368102431297302, + "learning_rate": 0.00023168201354387266, + "loss": 12.2453, + "num_input_tokens_seen": 4574412800, + "step": 17450 + }, + { + "epoch": 0.1177137302976644, + "grad_norm": 0.6373352408409119, + "learning_rate": 0.00023012225747448645, + "loss": 12.2031, + "num_input_tokens_seen": 4587520000, + "step": 17500 + }, + { + "epoch": 0.1177137302976644, + "eval_loss": 2.961634635925293, + "eval_runtime": 142.67, + "eval_samples_per_second": 35.046, + "eval_steps_per_second": 8.761, + "num_input_tokens_seen": 4587520000, + "step": 17500 + }, + { + "epoch": 0.118050055241372, + "grad_norm": 0.6781473755836487, + "learning_rate": 0.0002285644961296205, + "loss": 12.1939, + "num_input_tokens_seen": 4600627200, + "step": 17550 + }, + { + "epoch": 0.11838638018507962, + "grad_norm": 0.6434431076049805, + "learning_rate": 0.0002270087739770074, + "loss": 12.1876, + "num_input_tokens_seen": 4613734400, + "step": 17600 + }, + { + "epoch": 0.11872270512878723, + "grad_norm": 0.6625823974609375, + "learning_rate": 0.00022545513542616865, + "loss": 12.1683, + "num_input_tokens_seen": 4626841600, + "step": 17650 + }, + { + "epoch": 0.11905903007249484, + "grad_norm": 0.6367326974868774, + "learning_rate": 0.0002239036248271478, + "loss": 12.1769, + "num_input_tokens_seen": 4639948800, + "step": 17700 + }, + { + "epoch": 0.11939535501620245, + "grad_norm": 0.648065447807312, + "learning_rate": 0.00022235428646924372, + "loss": 12.2213, + "num_input_tokens_seen": 4653056000, + "step": 17750 + }, + { + "epoch": 0.11973167995991006, + "grad_norm": 0.648695170879364, + "learning_rate": 0.00022080716457974705, + "loss": 12.1699, + "num_input_tokens_seen": 4666163200, + "step": 17800 + }, + { + "epoch": 0.12006800490361767, + "grad_norm": 0.6311103105545044, + "learning_rate": 0.00021926230332267694, + "loss": 12.1912, + "num_input_tokens_seen": 4679270400, + "step": 17850 + }, + { + "epoch": 0.1204043298473253, + "grad_norm": 0.6318332552909851, + "learning_rate": 0.00021771974679752094, + "loss": 12.1242, + "num_input_tokens_seen": 4692377600, + "step": 17900 + }, + { + "epoch": 0.12074065479103291, + "grad_norm": 0.6513566374778748, + "learning_rate": 0.0002161795390379756, + "loss": 12.2068, + "num_input_tokens_seen": 4705484800, + "step": 17950 + }, + { + "epoch": 0.12107697973474052, + "grad_norm": 0.6865115761756897, + "learning_rate": 0.00021464172401069027, + "loss": 12.1477, + "num_input_tokens_seen": 4718592000, + "step": 18000 + }, + { + "epoch": 0.12107697973474052, + "eval_loss": 2.954843044281006, + "eval_runtime": 142.8465, + "eval_samples_per_second": 35.003, + "eval_steps_per_second": 8.751, + "num_input_tokens_seen": 4718592000, + "step": 18000 + }, + { + "epoch": 0.12141330467844813, + "grad_norm": 0.622513473033905, + "learning_rate": 0.00021310634561401109, + "loss": 12.1664, + "num_input_tokens_seen": 4731699200, + "step": 18050 + }, + { + "epoch": 0.12174962962215574, + "grad_norm": 0.6387473344802856, + "learning_rate": 0.0002115734476767287, + "loss": 12.1838, + "num_input_tokens_seen": 4744806400, + "step": 18100 + }, + { + "epoch": 0.12208595456586335, + "grad_norm": 0.6974210739135742, + "learning_rate": 0.00021004307395682648, + "loss": 12.201, + "num_input_tokens_seen": 4757913600, + "step": 18150 + }, + { + "epoch": 0.12242227950957096, + "grad_norm": 0.6665675640106201, + "learning_rate": 0.00020851526814023185, + "loss": 12.1154, + "num_input_tokens_seen": 4771020800, + "step": 18200 + }, + { + "epoch": 0.12275860445327857, + "grad_norm": 0.6340165734291077, + "learning_rate": 0.00020699007383956895, + "loss": 12.19, + "num_input_tokens_seen": 4784128000, + "step": 18250 + }, + { + "epoch": 0.12309492939698619, + "grad_norm": 0.6360442638397217, + "learning_rate": 0.00020546753459291378, + "loss": 12.1872, + "num_input_tokens_seen": 4797235200, + "step": 18300 + }, + { + "epoch": 0.12343125434069381, + "grad_norm": 0.6116852760314941, + "learning_rate": 0.00020394769386255162, + "loss": 12.1645, + "num_input_tokens_seen": 4810342400, + "step": 18350 + }, + { + "epoch": 0.12376757928440142, + "grad_norm": 0.6432573795318604, + "learning_rate": 0.00020243059503373588, + "loss": 12.1537, + "num_input_tokens_seen": 4823449600, + "step": 18400 + }, + { + "epoch": 0.12410390422810903, + "grad_norm": 0.6480187773704529, + "learning_rate": 0.00020091628141344996, + "loss": 12.155, + "num_input_tokens_seen": 4836556800, + "step": 18450 + }, + { + "epoch": 0.12444022917181664, + "grad_norm": 0.643993616104126, + "learning_rate": 0.00019940479622917068, + "loss": 12.1604, + "num_input_tokens_seen": 4849664000, + "step": 18500 + }, + { + "epoch": 0.12444022917181664, + "eval_loss": 2.9489145278930664, + "eval_runtime": 142.7319, + "eval_samples_per_second": 35.031, + "eval_steps_per_second": 8.758, + "num_input_tokens_seen": 4849664000, + "step": 18500 + }, + { + "epoch": 0.12477655411552425, + "grad_norm": 0.6500803232192993, + "learning_rate": 0.00019789618262763508, + "loss": 12.1604, + "num_input_tokens_seen": 4862771200, + "step": 18550 + }, + { + "epoch": 0.12511287905923188, + "grad_norm": 0.6314743161201477, + "learning_rate": 0.00019639048367360774, + "loss": 12.1107, + "num_input_tokens_seen": 4875878400, + "step": 18600 + }, + { + "epoch": 0.1254492040029395, + "grad_norm": 0.6902073621749878, + "learning_rate": 0.00019488774234865217, + "loss": 12.1634, + "num_input_tokens_seen": 4888985600, + "step": 18650 + }, + { + "epoch": 0.1257855289466471, + "grad_norm": 0.6349673867225647, + "learning_rate": 0.00019338800154990337, + "loss": 12.1828, + "num_input_tokens_seen": 4902092800, + "step": 18700 + }, + { + "epoch": 0.1261218538903547, + "grad_norm": 0.639392614364624, + "learning_rate": 0.000191891304088844, + "loss": 12.1314, + "num_input_tokens_seen": 4915200000, + "step": 18750 + }, + { + "epoch": 0.12645817883406232, + "grad_norm": 0.6466573476791382, + "learning_rate": 0.00019039769269008148, + "loss": 12.1521, + "num_input_tokens_seen": 4928307200, + "step": 18800 + }, + { + "epoch": 0.12679450377776993, + "grad_norm": 0.6457189917564392, + "learning_rate": 0.00018890720999012895, + "loss": 12.1631, + "num_input_tokens_seen": 4941414400, + "step": 18850 + }, + { + "epoch": 0.12713082872147755, + "grad_norm": 0.648733377456665, + "learning_rate": 0.00018741989853618779, + "loss": 12.1553, + "num_input_tokens_seen": 4954521600, + "step": 18900 + }, + { + "epoch": 0.12746715366518516, + "grad_norm": 0.6314489841461182, + "learning_rate": 0.00018593580078493335, + "loss": 12.1703, + "num_input_tokens_seen": 4967628800, + "step": 18950 + }, + { + "epoch": 0.12780347860889277, + "grad_norm": 0.6238834857940674, + "learning_rate": 0.0001844549591013027, + "loss": 12.0931, + "num_input_tokens_seen": 4980736000, + "step": 19000 + }, + { + "epoch": 0.12780347860889277, + "eval_loss": 2.943131923675537, + "eval_runtime": 142.8812, + "eval_samples_per_second": 34.994, + "eval_steps_per_second": 8.749, + "num_input_tokens_seen": 4980736000, + "step": 19000 + }, + { + "epoch": 0.12813980355260038, + "grad_norm": 0.6120157241821289, + "learning_rate": 0.00018297741575728593, + "loss": 12.1415, + "num_input_tokens_seen": 4993843200, + "step": 19050 + }, + { + "epoch": 0.128476128496308, + "grad_norm": 0.6346642374992371, + "learning_rate": 0.00018150321293071843, + "loss": 12.1464, + "num_input_tokens_seen": 5006950400, + "step": 19100 + }, + { + "epoch": 0.1288124534400156, + "grad_norm": 0.6268289685249329, + "learning_rate": 0.00018003239270407775, + "loss": 12.1105, + "num_input_tokens_seen": 5020057600, + "step": 19150 + }, + { + "epoch": 0.1291487783837232, + "grad_norm": 0.6437589526176453, + "learning_rate": 0.00017856499706328183, + "loss": 12.1208, + "num_input_tokens_seen": 5033164800, + "step": 19200 + }, + { + "epoch": 0.12948510332743082, + "grad_norm": 0.6311147809028625, + "learning_rate": 0.00017710106789649096, + "loss": 12.1137, + "num_input_tokens_seen": 5046272000, + "step": 19250 + }, + { + "epoch": 0.12982142827113843, + "grad_norm": 0.646539568901062, + "learning_rate": 0.00017564064699291133, + "loss": 12.1824, + "num_input_tokens_seen": 5059379200, + "step": 19300 + }, + { + "epoch": 0.13015775321484604, + "grad_norm": 0.6385849714279175, + "learning_rate": 0.00017418377604160295, + "loss": 12.1106, + "num_input_tokens_seen": 5072486400, + "step": 19350 + }, + { + "epoch": 0.13049407815855366, + "grad_norm": 0.6449156403541565, + "learning_rate": 0.0001727304966302887, + "loss": 12.0996, + "num_input_tokens_seen": 5085593600, + "step": 19400 + }, + { + "epoch": 0.1308304031022613, + "grad_norm": 0.6219010949134827, + "learning_rate": 0.0001712808502441682, + "loss": 12.1306, + "num_input_tokens_seen": 5098700800, + "step": 19450 + }, + { + "epoch": 0.1311667280459689, + "grad_norm": 0.6273418664932251, + "learning_rate": 0.00016983487826473256, + "loss": 12.0719, + "num_input_tokens_seen": 5111808000, + "step": 19500 + }, + { + "epoch": 0.1311667280459689, + "eval_loss": 2.937514066696167, + "eval_runtime": 142.6532, + "eval_samples_per_second": 35.05, + "eval_steps_per_second": 8.763, + "num_input_tokens_seen": 5111808000, + "step": 19500 + }, + { + "epoch": 0.13150305298967652, + "grad_norm": 0.6366037130355835, + "learning_rate": 0.00016839262196858374, + "loss": 12.1143, + "num_input_tokens_seen": 5124915200, + "step": 19550 + }, + { + "epoch": 0.13183937793338413, + "grad_norm": 0.6395111083984375, + "learning_rate": 0.00016695412252625596, + "loss": 12.0524, + "num_input_tokens_seen": 5138022400, + "step": 19600 + }, + { + "epoch": 0.13217570287709174, + "grad_norm": 0.639202892780304, + "learning_rate": 0.0001655194210010409, + "loss": 12.1006, + "num_input_tokens_seen": 5151129600, + "step": 19650 + }, + { + "epoch": 0.13251202782079935, + "grad_norm": 0.6547548174858093, + "learning_rate": 0.00016408855834781487, + "loss": 12.0684, + "num_input_tokens_seen": 5164236800, + "step": 19700 + }, + { + "epoch": 0.13284835276450696, + "grad_norm": 0.6669015884399414, + "learning_rate": 0.00016266157541187034, + "loss": 12.1204, + "num_input_tokens_seen": 5177344000, + "step": 19750 + }, + { + "epoch": 0.13318467770821457, + "grad_norm": 0.637744665145874, + "learning_rate": 0.00016123851292774947, + "loss": 12.1164, + "num_input_tokens_seen": 5190451200, + "step": 19800 + }, + { + "epoch": 0.13352100265192218, + "grad_norm": 0.6337763667106628, + "learning_rate": 0.00015981941151808137, + "loss": 12.1213, + "num_input_tokens_seen": 5203558400, + "step": 19850 + }, + { + "epoch": 0.1338573275956298, + "grad_norm": 0.651337742805481, + "learning_rate": 0.0001584043116924231, + "loss": 12.1115, + "num_input_tokens_seen": 5216665600, + "step": 19900 + }, + { + "epoch": 0.1341936525393374, + "grad_norm": 0.6313726902008057, + "learning_rate": 0.00015699325384610244, + "loss": 12.1078, + "num_input_tokens_seen": 5229772800, + "step": 19950 + }, + { + "epoch": 0.13452997748304502, + "grad_norm": 0.6925057768821716, + "learning_rate": 0.00015558627825906524, + "loss": 12.0672, + "num_input_tokens_seen": 5242880000, + "step": 20000 + }, + { + "epoch": 0.13452997748304502, + "eval_loss": 2.931644916534424, + "eval_runtime": 143.0863, + "eval_samples_per_second": 34.944, + "eval_steps_per_second": 8.736, + "num_input_tokens_seen": 5242880000, + "step": 20000 + }, + { + "epoch": 0.13486630242675263, + "grad_norm": 0.6415194272994995, + "learning_rate": 0.00015418342509472535, + "loss": 12.1005, + "num_input_tokens_seen": 5255987200, + "step": 20050 + }, + { + "epoch": 0.13520262737046024, + "grad_norm": 0.6401641368865967, + "learning_rate": 0.00015278473439881874, + "loss": 12.0935, + "num_input_tokens_seen": 5269094400, + "step": 20100 + }, + { + "epoch": 0.13553895231416785, + "grad_norm": 0.6700222492218018, + "learning_rate": 0.0001513902460982592, + "loss": 12.0946, + "num_input_tokens_seen": 5282201600, + "step": 20150 + }, + { + "epoch": 0.13587527725787546, + "grad_norm": 0.6184066534042358, + "learning_rate": 0.00015000000000000004, + "loss": 12.058, + "num_input_tokens_seen": 5295308800, + "step": 20200 + }, + { + "epoch": 0.13621160220158307, + "grad_norm": 0.6642903685569763, + "learning_rate": 0.00014861403578989629, + "loss": 12.0421, + "num_input_tokens_seen": 5308416000, + "step": 20250 + }, + { + "epoch": 0.13654792714529068, + "grad_norm": 0.651897668838501, + "learning_rate": 0.00014723239303157307, + "loss": 12.0393, + "num_input_tokens_seen": 5321523200, + "step": 20300 + }, + { + "epoch": 0.1368842520889983, + "grad_norm": 0.616648256778717, + "learning_rate": 0.00014585511116529528, + "loss": 12.0737, + "num_input_tokens_seen": 5334630400, + "step": 20350 + }, + { + "epoch": 0.13722057703270593, + "grad_norm": 0.6298686861991882, + "learning_rate": 0.00014448222950684246, + "loss": 12.0721, + "num_input_tokens_seen": 5347737600, + "step": 20400 + }, + { + "epoch": 0.13755690197641354, + "grad_norm": 0.6637253165245056, + "learning_rate": 0.00014311378724638605, + "loss": 12.0921, + "num_input_tokens_seen": 5360844800, + "step": 20450 + }, + { + "epoch": 0.13789322692012115, + "grad_norm": 0.6153833866119385, + "learning_rate": 0.0001417498234473706, + "loss": 12.0664, + "num_input_tokens_seen": 5373952000, + "step": 20500 + }, + { + "epoch": 0.13789322692012115, + "eval_loss": 2.9268288612365723, + "eval_runtime": 143.0059, + "eval_samples_per_second": 34.964, + "eval_steps_per_second": 8.741, + "num_input_tokens_seen": 5373952000, + "step": 20500 + }, + { + "epoch": 0.13822955186382876, + "grad_norm": 0.6536301374435425, + "learning_rate": 0.00014039037704539906, + "loss": 12.0644, + "num_input_tokens_seen": 5387059200, + "step": 20550 + }, + { + "epoch": 0.13856587680753638, + "grad_norm": 0.678833544254303, + "learning_rate": 0.00013903548684712116, + "loss": 12.0616, + "num_input_tokens_seen": 5400166400, + "step": 20600 + }, + { + "epoch": 0.138902201751244, + "grad_norm": 0.6597055792808533, + "learning_rate": 0.00013768519152912537, + "loss": 12.0914, + "num_input_tokens_seen": 5413273600, + "step": 20650 + }, + { + "epoch": 0.1392385266949516, + "grad_norm": 0.6703686714172363, + "learning_rate": 0.00013633952963683542, + "loss": 12.0582, + "num_input_tokens_seen": 5426380800, + "step": 20700 + }, + { + "epoch": 0.1395748516386592, + "grad_norm": 0.6616584062576294, + "learning_rate": 0.00013499853958340923, + "loss": 12.105, + "num_input_tokens_seen": 5439488000, + "step": 20750 + }, + { + "epoch": 0.13991117658236682, + "grad_norm": 0.6584370136260986, + "learning_rate": 0.00013366225964864313, + "loss": 12.0616, + "num_input_tokens_seen": 5452595200, + "step": 20800 + }, + { + "epoch": 0.14024750152607443, + "grad_norm": 0.6238560676574707, + "learning_rate": 0.00013233072797787847, + "loss": 12.074, + "num_input_tokens_seen": 5465702400, + "step": 20850 + }, + { + "epoch": 0.14058382646978204, + "grad_norm": 0.6119787096977234, + "learning_rate": 0.00013100398258091337, + "loss": 12.0441, + "num_input_tokens_seen": 5478809600, + "step": 20900 + }, + { + "epoch": 0.14092015141348965, + "grad_norm": 0.6162968873977661, + "learning_rate": 0.00012968206133091707, + "loss": 12.0726, + "num_input_tokens_seen": 5491916800, + "step": 20950 + }, + { + "epoch": 0.14125647635719726, + "grad_norm": 0.6324203014373779, + "learning_rate": 0.00012836500196334916, + "loss": 12.029, + "num_input_tokens_seen": 5505024000, + "step": 21000 + }, + { + "epoch": 0.14125647635719726, + "eval_loss": 2.9219326972961426, + "eval_runtime": 142.444, + "eval_samples_per_second": 35.102, + "eval_steps_per_second": 8.775, + "num_input_tokens_seen": 5505024000, + "step": 21000 + }, + { + "epoch": 0.14159280130090487, + "grad_norm": 0.6728281378746033, + "learning_rate": 0.0001270528420748823, + "loss": 12.0576, + "num_input_tokens_seen": 5518131200, + "step": 21050 + }, + { + "epoch": 0.14192912624461249, + "grad_norm": 0.6371399164199829, + "learning_rate": 0.0001257456191223292, + "loss": 12.0809, + "num_input_tokens_seen": 5531238400, + "step": 21100 + }, + { + "epoch": 0.1422654511883201, + "grad_norm": 0.6416388750076294, + "learning_rate": 0.00012444337042157285, + "loss": 12.0472, + "num_input_tokens_seen": 5544345600, + "step": 21150 + }, + { + "epoch": 0.1426017761320277, + "grad_norm": 0.672295093536377, + "learning_rate": 0.00012314613314650207, + "loss": 12.0615, + "num_input_tokens_seen": 5557452800, + "step": 21200 + }, + { + "epoch": 0.14293810107573532, + "grad_norm": 0.6460967063903809, + "learning_rate": 0.00012185394432794955, + "loss": 12.0439, + "num_input_tokens_seen": 5570560000, + "step": 21250 + }, + { + "epoch": 0.14327442601944296, + "grad_norm": 0.6483781337738037, + "learning_rate": 0.0001205668408526352, + "loss": 12.0767, + "num_input_tokens_seen": 5583667200, + "step": 21300 + }, + { + "epoch": 0.14361075096315057, + "grad_norm": 0.6515306830406189, + "learning_rate": 0.00011928485946211334, + "loss": 12.0398, + "num_input_tokens_seen": 5596774400, + "step": 21350 + }, + { + "epoch": 0.14394707590685818, + "grad_norm": 0.6355323791503906, + "learning_rate": 0.00011800803675172337, + "loss": 12.0792, + "num_input_tokens_seen": 5609881600, + "step": 21400 + }, + { + "epoch": 0.1442834008505658, + "grad_norm": 0.6724342107772827, + "learning_rate": 0.00011673640916954571, + "loss": 12.0238, + "num_input_tokens_seen": 5622988800, + "step": 21450 + }, + { + "epoch": 0.1446197257942734, + "grad_norm": 0.6570594310760498, + "learning_rate": 0.00011547001301536085, + "loss": 12.0514, + "num_input_tokens_seen": 5636096000, + "step": 21500 + }, + { + "epoch": 0.1446197257942734, + "eval_loss": 2.91679310798645, + "eval_runtime": 142.6116, + "eval_samples_per_second": 35.06, + "eval_steps_per_second": 8.765, + "num_input_tokens_seen": 5636096000, + "step": 21500 + }, + { + "epoch": 0.144956050737981, + "grad_norm": 0.6420052647590637, + "learning_rate": 0.00011420888443961337, + "loss": 12.02, + "num_input_tokens_seen": 5649203200, + "step": 21550 + }, + { + "epoch": 0.14529237568168862, + "grad_norm": 0.6295548677444458, + "learning_rate": 0.00011295305944237995, + "loss": 12.0275, + "num_input_tokens_seen": 5662310400, + "step": 21600 + }, + { + "epoch": 0.14562870062539623, + "grad_norm": 0.6434178352355957, + "learning_rate": 0.00011170257387234198, + "loss": 12.0421, + "num_input_tokens_seen": 5675417600, + "step": 21650 + }, + { + "epoch": 0.14596502556910385, + "grad_norm": 0.6139717102050781, + "learning_rate": 0.0001104574634257616, + "loss": 12.0342, + "num_input_tokens_seen": 5688524800, + "step": 21700 + }, + { + "epoch": 0.14630135051281146, + "grad_norm": 0.6519197225570679, + "learning_rate": 0.00010921776364546347, + "loss": 12.0328, + "num_input_tokens_seen": 5701632000, + "step": 21750 + }, + { + "epoch": 0.14663767545651907, + "grad_norm": 0.6653149724006653, + "learning_rate": 0.00010798350991981948, + "loss": 12.0151, + "num_input_tokens_seen": 5714739200, + "step": 21800 + }, + { + "epoch": 0.14697400040022668, + "grad_norm": 0.6633841395378113, + "learning_rate": 0.0001067547374817392, + "loss": 11.9882, + "num_input_tokens_seen": 5727846400, + "step": 21850 + }, + { + "epoch": 0.1473103253439343, + "grad_norm": 0.6165183186531067, + "learning_rate": 0.00010553148140766353, + "loss": 12.0242, + "num_input_tokens_seen": 5740953600, + "step": 21900 + }, + { + "epoch": 0.1476466502876419, + "grad_norm": 0.6443773508071899, + "learning_rate": 0.00010431377661656374, + "loss": 12.0166, + "num_input_tokens_seen": 5754060800, + "step": 21950 + }, + { + "epoch": 0.1479829752313495, + "grad_norm": 0.6805723905563354, + "learning_rate": 0.00010310165786894456, + "loss": 12.0284, + "num_input_tokens_seen": 5767168000, + "step": 22000 + }, + { + "epoch": 0.1479829752313495, + "eval_loss": 2.9135851860046387, + "eval_runtime": 143.3087, + "eval_samples_per_second": 34.89, + "eval_steps_per_second": 8.722, + "num_input_tokens_seen": 5767168000, + "step": 22000 + }, + { + "epoch": 0.14831930017505712, + "grad_norm": 0.6574228405952454, + "learning_rate": 0.00010189515976585224, + "loss": 11.9953, + "num_input_tokens_seen": 5780275200, + "step": 22050 + }, + { + "epoch": 0.14865562511876473, + "grad_norm": 0.630247175693512, + "learning_rate": 0.00010069431674788618, + "loss": 12.0309, + "num_input_tokens_seen": 5793382400, + "step": 22100 + }, + { + "epoch": 0.14899195006247234, + "grad_norm": 0.6254024505615234, + "learning_rate": 9.949916309421655e-05, + "loss": 11.9972, + "num_input_tokens_seen": 5806489600, + "step": 22150 + }, + { + "epoch": 0.14932827500617998, + "grad_norm": 0.6164761781692505, + "learning_rate": 9.830973292160493e-05, + "loss": 12.0382, + "num_input_tokens_seen": 5819596800, + "step": 22200 + }, + { + "epoch": 0.1496645999498876, + "grad_norm": 0.6174560189247131, + "learning_rate": 9.712606018343136e-05, + "loss": 11.981, + "num_input_tokens_seen": 5832704000, + "step": 22250 + }, + { + "epoch": 0.1500009248935952, + "grad_norm": 0.6346741914749146, + "learning_rate": 9.594817866872411e-05, + "loss": 12.0161, + "num_input_tokens_seen": 5845811200, + "step": 22300 + }, + { + "epoch": 0.15033724983730282, + "grad_norm": 0.6521451473236084, + "learning_rate": 9.477612200119616e-05, + "loss": 12.0022, + "num_input_tokens_seen": 5858918400, + "step": 22350 + }, + { + "epoch": 0.15067357478101043, + "grad_norm": 0.6211933493614197, + "learning_rate": 9.360992363828442e-05, + "loss": 12.0695, + "num_input_tokens_seen": 5872025600, + "step": 22400 + }, + { + "epoch": 0.15100989972471804, + "grad_norm": 0.6488197445869446, + "learning_rate": 9.244961687019529e-05, + "loss": 12.0477, + "num_input_tokens_seen": 5885132800, + "step": 22450 + }, + { + "epoch": 0.15134622466842565, + "grad_norm": 0.6073492169380188, + "learning_rate": 9.129523481895408e-05, + "loss": 11.9863, + "num_input_tokens_seen": 5898240000, + "step": 22500 + }, + { + "epoch": 0.15134622466842565, + "eval_loss": 2.9087352752685547, + "eval_runtime": 141.7599, + "eval_samples_per_second": 35.271, + "eval_steps_per_second": 8.818, + "num_input_tokens_seen": 5898240000, + "step": 22500 + }, + { + "epoch": 0.15168254961213326, + "grad_norm": 0.6158032417297363, + "learning_rate": 9.014681043745983e-05, + "loss": 12.0428, + "num_input_tokens_seen": 5911347200, + "step": 22550 + }, + { + "epoch": 0.15201887455584087, + "grad_norm": 0.6146510243415833, + "learning_rate": 8.900437650854409e-05, + "loss": 12.0035, + "num_input_tokens_seen": 5924454400, + "step": 22600 + }, + { + "epoch": 0.15235519949954848, + "grad_norm": 0.7664083242416382, + "learning_rate": 8.786796564403575e-05, + "loss": 12.0481, + "num_input_tokens_seen": 5937561600, + "step": 22650 + }, + { + "epoch": 0.1526915244432561, + "grad_norm": 0.6329470872879028, + "learning_rate": 8.673761028382955e-05, + "loss": 11.9683, + "num_input_tokens_seen": 5950668800, + "step": 22700 + }, + { + "epoch": 0.1530278493869637, + "grad_norm": 0.617677628993988, + "learning_rate": 8.561334269496019e-05, + "loss": 11.9993, + "num_input_tokens_seen": 5963776000, + "step": 22750 + }, + { + "epoch": 0.15336417433067132, + "grad_norm": 0.6368398070335388, + "learning_rate": 8.449519497068174e-05, + "loss": 11.9881, + "num_input_tokens_seen": 5976883200, + "step": 22800 + }, + { + "epoch": 0.15370049927437893, + "grad_norm": 0.6332319974899292, + "learning_rate": 8.338319902955062e-05, + "loss": 12.0005, + "num_input_tokens_seen": 5989990400, + "step": 22850 + }, + { + "epoch": 0.15403682421808654, + "grad_norm": 0.6333373188972473, + "learning_rate": 8.227738661451541e-05, + "loss": 12.0081, + "num_input_tokens_seen": 6003097600, + "step": 22900 + }, + { + "epoch": 0.15437314916179415, + "grad_norm": 0.6376117467880249, + "learning_rate": 8.117778929200977e-05, + "loss": 11.9789, + "num_input_tokens_seen": 6016204800, + "step": 22950 + }, + { + "epoch": 0.15470947410550176, + "grad_norm": 0.6416700482368469, + "learning_rate": 8.008443845105216e-05, + "loss": 11.9845, + "num_input_tokens_seen": 6029312000, + "step": 23000 + }, + { + "epoch": 0.15470947410550176, + "eval_loss": 2.9053738117218018, + "eval_runtime": 142.1312, + "eval_samples_per_second": 35.179, + "eval_steps_per_second": 8.795, + "num_input_tokens_seen": 6029312000, + "step": 23000 + }, + { + "epoch": 0.15504579904920937, + "grad_norm": 0.6468757390975952, + "learning_rate": 7.899736530234923e-05, + "loss": 11.991, + "num_input_tokens_seen": 6042419200, + "step": 23050 + }, + { + "epoch": 0.155382123992917, + "grad_norm": 0.6658541560173035, + "learning_rate": 7.791660087740537e-05, + "loss": 11.9583, + "num_input_tokens_seen": 6055526400, + "step": 23100 + }, + { + "epoch": 0.15571844893662462, + "grad_norm": 0.6665578484535217, + "learning_rate": 7.68421760276364e-05, + "loss": 12.0004, + "num_input_tokens_seen": 6068633600, + "step": 23150 + }, + { + "epoch": 0.15605477388033223, + "grad_norm": 0.6088104844093323, + "learning_rate": 7.577412142348944e-05, + "loss": 11.9758, + "num_input_tokens_seen": 6081740800, + "step": 23200 + }, + { + "epoch": 0.15639109882403984, + "grad_norm": 0.6299030184745789, + "learning_rate": 7.47124675535666e-05, + "loss": 12.036, + "num_input_tokens_seen": 6094848000, + "step": 23250 + }, + { + "epoch": 0.15672742376774745, + "grad_norm": 0.642490565776825, + "learning_rate": 7.365724472375568e-05, + "loss": 11.9951, + "num_input_tokens_seen": 6107955200, + "step": 23300 + }, + { + "epoch": 0.15706374871145506, + "grad_norm": 0.6375728845596313, + "learning_rate": 7.260848305636405e-05, + "loss": 11.9859, + "num_input_tokens_seen": 6121062400, + "step": 23350 + }, + { + "epoch": 0.15740007365516268, + "grad_norm": 0.6122708320617676, + "learning_rate": 7.156621248925967e-05, + "loss": 11.9532, + "num_input_tokens_seen": 6134169600, + "step": 23400 + }, + { + "epoch": 0.1577363985988703, + "grad_norm": 0.6512198448181152, + "learning_rate": 7.05304627750157e-05, + "loss": 11.9962, + "num_input_tokens_seen": 6147276800, + "step": 23450 + }, + { + "epoch": 0.1580727235425779, + "grad_norm": 0.6488016247749329, + "learning_rate": 6.950126348006171e-05, + "loss": 11.9285, + "num_input_tokens_seen": 6160384000, + "step": 23500 + }, + { + "epoch": 0.1580727235425779, + "eval_loss": 2.9018726348876953, + "eval_runtime": 143.4685, + "eval_samples_per_second": 34.851, + "eval_steps_per_second": 8.713, + "num_input_tokens_seen": 6160384000, + "step": 23500 + }, + { + "epoch": 0.1584090484862855, + "grad_norm": 0.6251162886619568, + "learning_rate": 6.847864398383946e-05, + "loss": 11.9805, + "num_input_tokens_seen": 6173491200, + "step": 23550 + }, + { + "epoch": 0.15874537342999312, + "grad_norm": 0.6338608264923096, + "learning_rate": 6.746263347796449e-05, + "loss": 11.9775, + "num_input_tokens_seen": 6186598400, + "step": 23600 + }, + { + "epoch": 0.15908169837370073, + "grad_norm": 0.6400789618492126, + "learning_rate": 6.645326096539229e-05, + "loss": 11.9472, + "num_input_tokens_seen": 6199705600, + "step": 23650 + }, + { + "epoch": 0.15941802331740834, + "grad_norm": 0.6252830624580383, + "learning_rate": 6.545055525959105e-05, + "loss": 11.9752, + "num_input_tokens_seen": 6212812800, + "step": 23700 + }, + { + "epoch": 0.15975434826111595, + "grad_norm": 0.6790284514427185, + "learning_rate": 6.445454498371857e-05, + "loss": 11.957, + "num_input_tokens_seen": 6225920000, + "step": 23750 + }, + { + "epoch": 0.16009067320482356, + "grad_norm": 0.621303379535675, + "learning_rate": 6.346525856980567e-05, + "loss": 11.9433, + "num_input_tokens_seen": 6239027200, + "step": 23800 + }, + { + "epoch": 0.16042699814853117, + "grad_norm": 0.6379457712173462, + "learning_rate": 6.248272425794411e-05, + "loss": 11.9516, + "num_input_tokens_seen": 6252134400, + "step": 23850 + }, + { + "epoch": 0.16076332309223879, + "grad_norm": 0.6223682761192322, + "learning_rate": 6.150697009548073e-05, + "loss": 11.9856, + "num_input_tokens_seen": 6265241600, + "step": 23900 + }, + { + "epoch": 0.1610996480359464, + "grad_norm": 0.6040588021278381, + "learning_rate": 6.0538023936216814e-05, + "loss": 11.9921, + "num_input_tokens_seen": 6278348800, + "step": 23950 + }, + { + "epoch": 0.16143597297965404, + "grad_norm": 0.6390047669410706, + "learning_rate": 5.957591343961304e-05, + "loss": 11.9322, + "num_input_tokens_seen": 6291456000, + "step": 24000 + }, + { + "epoch": 0.16143597297965404, + "eval_loss": 2.8987817764282227, + "eval_runtime": 142.5945, + "eval_samples_per_second": 35.064, + "eval_steps_per_second": 8.766, + "num_input_tokens_seen": 6291456000, + "step": 24000 + }, + { + "epoch": 0.16177229792336165, + "grad_norm": 0.6663207411766052, + "learning_rate": 5.862066606999949e-05, + "loss": 11.9835, + "num_input_tokens_seen": 6304563200, + "step": 24050 + }, + { + "epoch": 0.16210862286706926, + "grad_norm": 0.6109934449195862, + "learning_rate": 5.7672309095792316e-05, + "loss": 11.933, + "num_input_tokens_seen": 6317670400, + "step": 24100 + }, + { + "epoch": 0.16244494781077687, + "grad_norm": 0.6243853569030762, + "learning_rate": 5.6730869588714744e-05, + "loss": 12.0097, + "num_input_tokens_seen": 6330777600, + "step": 24150 + }, + { + "epoch": 0.16278127275448448, + "grad_norm": 0.6165538430213928, + "learning_rate": 5.579637442302454e-05, + "loss": 11.9705, + "num_input_tokens_seen": 6343884800, + "step": 24200 + }, + { + "epoch": 0.1631175976981921, + "grad_norm": 0.5966577529907227, + "learning_rate": 5.4868850274747045e-05, + "loss": 11.9362, + "num_input_tokens_seen": 6356992000, + "step": 24250 + }, + { + "epoch": 0.1634539226418997, + "grad_norm": 0.6405600309371948, + "learning_rate": 5.39483236209132e-05, + "loss": 11.987, + "num_input_tokens_seen": 6370099200, + "step": 24300 + }, + { + "epoch": 0.1637902475856073, + "grad_norm": 0.6266763210296631, + "learning_rate": 5.303482073880436e-05, + "loss": 11.9779, + "num_input_tokens_seen": 6383206400, + "step": 24350 + }, + { + "epoch": 0.16412657252931492, + "grad_norm": 0.6331851482391357, + "learning_rate": 5.2128367705201594e-05, + "loss": 11.921, + "num_input_tokens_seen": 6396313600, + "step": 24400 + }, + { + "epoch": 0.16446289747302253, + "grad_norm": 0.6594439744949341, + "learning_rate": 5.122899039564157e-05, + "loss": 11.9332, + "num_input_tokens_seen": 6409420800, + "step": 24450 + }, + { + "epoch": 0.16479922241673015, + "grad_norm": 0.6269896626472473, + "learning_rate": 5.033671448367788e-05, + "loss": 11.9627, + "num_input_tokens_seen": 6422528000, + "step": 24500 + }, + { + "epoch": 0.16479922241673015, + "eval_loss": 2.896472454071045, + "eval_runtime": 143.2784, + "eval_samples_per_second": 34.897, + "eval_steps_per_second": 8.724, + "num_input_tokens_seen": 6422528000, + "step": 24500 + }, + { + "epoch": 0.16513554736043776, + "grad_norm": 0.6272408962249756, + "learning_rate": 4.945156544014846e-05, + "loss": 11.9879, + "num_input_tokens_seen": 6435635200, + "step": 24550 + }, + { + "epoch": 0.16547187230414537, + "grad_norm": 0.6145939826965332, + "learning_rate": 4.8573568532447815e-05, + "loss": 11.964, + "num_input_tokens_seen": 6448742400, + "step": 24600 + }, + { + "epoch": 0.16580819724785298, + "grad_norm": 0.6379438638687134, + "learning_rate": 4.770274882380648e-05, + "loss": 11.9384, + "num_input_tokens_seen": 6461849600, + "step": 24650 + }, + { + "epoch": 0.1661445221915606, + "grad_norm": 0.6524396538734436, + "learning_rate": 4.6839131172574996e-05, + "loss": 11.9477, + "num_input_tokens_seen": 6474956800, + "step": 24700 + }, + { + "epoch": 0.1664808471352682, + "grad_norm": 0.6290236711502075, + "learning_rate": 4.598274023151476e-05, + "loss": 11.9441, + "num_input_tokens_seen": 6488064000, + "step": 24750 + }, + { + "epoch": 0.1668171720789758, + "grad_norm": 0.6329859495162964, + "learning_rate": 4.513360044709382e-05, + "loss": 11.9686, + "num_input_tokens_seen": 6501171200, + "step": 24800 + }, + { + "epoch": 0.16715349702268342, + "grad_norm": 0.6199634671211243, + "learning_rate": 4.429173605878951e-05, + "loss": 11.907, + "num_input_tokens_seen": 6514278400, + "step": 24850 + }, + { + "epoch": 0.16748982196639106, + "grad_norm": 0.6326203346252441, + "learning_rate": 4.3457171098396174e-05, + "loss": 11.9358, + "num_input_tokens_seen": 6527385600, + "step": 24900 + }, + { + "epoch": 0.16782614691009867, + "grad_norm": 0.647875189781189, + "learning_rate": 4.2629929389339246e-05, + "loss": 11.9304, + "num_input_tokens_seen": 6540492800, + "step": 24950 + }, + { + "epoch": 0.16816247185380628, + "grad_norm": 0.6240447759628296, + "learning_rate": 4.181003454599512e-05, + "loss": 11.9144, + "num_input_tokens_seen": 6553600000, + "step": 25000 + }, + { + "epoch": 0.16816247185380628, + "eval_loss": 2.8938522338867188, + "eval_runtime": 143.6338, + "eval_samples_per_second": 34.811, + "eval_steps_per_second": 8.703, + "num_input_tokens_seen": 6553600000, + "step": 25000 + }, + { + "epoch": 0.1684987967975139, + "grad_norm": 0.6343597173690796, + "learning_rate": 4.099750997301747e-05, + "loss": 11.9949, + "num_input_tokens_seen": 6566707200, + "step": 25050 + }, + { + "epoch": 0.1688351217412215, + "grad_norm": 0.626124918460846, + "learning_rate": 4.019237886466838e-05, + "loss": 11.9272, + "num_input_tokens_seen": 6579814400, + "step": 25100 + }, + { + "epoch": 0.16917144668492912, + "grad_norm": 0.6266665458679199, + "learning_rate": 3.939466420415709e-05, + "loss": 11.935, + "num_input_tokens_seen": 6592921600, + "step": 25150 + }, + { + "epoch": 0.16950777162863673, + "grad_norm": 0.6637131571769714, + "learning_rate": 3.8604388762983175e-05, + "loss": 11.9444, + "num_input_tokens_seen": 6606028800, + "step": 25200 + }, + { + "epoch": 0.16984409657234434, + "grad_norm": 0.6241376399993896, + "learning_rate": 3.782157510028706e-05, + "loss": 11.9235, + "num_input_tokens_seen": 6619136000, + "step": 25250 + }, + { + "epoch": 0.17018042151605195, + "grad_norm": 0.617912232875824, + "learning_rate": 3.704624556220566e-05, + "loss": 11.9165, + "num_input_tokens_seen": 6632243200, + "step": 25300 + }, + { + "epoch": 0.17051674645975956, + "grad_norm": 0.6167590022087097, + "learning_rate": 3.627842228123483e-05, + "loss": 11.9636, + "num_input_tokens_seen": 6645350400, + "step": 25350 + }, + { + "epoch": 0.17085307140346717, + "grad_norm": 0.6313674449920654, + "learning_rate": 3.551812717559729e-05, + "loss": 11.9304, + "num_input_tokens_seen": 6658457600, + "step": 25400 + }, + { + "epoch": 0.17118939634717478, + "grad_norm": 0.6143530607223511, + "learning_rate": 3.47653819486171e-05, + "loss": 11.9495, + "num_input_tokens_seen": 6671564800, + "step": 25450 + }, + { + "epoch": 0.1715257212908824, + "grad_norm": 0.6127185821533203, + "learning_rate": 3.402020808809996e-05, + "loss": 11.926, + "num_input_tokens_seen": 6684672000, + "step": 25500 + }, + { + "epoch": 0.1715257212908824, + "eval_loss": 2.8920793533325195, + "eval_runtime": 142.7714, + "eval_samples_per_second": 35.021, + "eval_steps_per_second": 8.755, + "num_input_tokens_seen": 6684672000, + "step": 25500 + }, + { + "epoch": 0.17186204623459, + "grad_norm": 0.621300995349884, + "learning_rate": 3.328262686572024e-05, + "loss": 11.9852, + "num_input_tokens_seen": 6697779200, + "step": 25550 + }, + { + "epoch": 0.17219837117829762, + "grad_norm": 0.6242550015449524, + "learning_rate": 3.2552659336413154e-05, + "loss": 11.9132, + "num_input_tokens_seen": 6710886400, + "step": 25600 + }, + { + "epoch": 0.17253469612200523, + "grad_norm": 0.6490415930747986, + "learning_rate": 3.1830326337774124e-05, + "loss": 11.9529, + "num_input_tokens_seen": 6723993600, + "step": 25650 + }, + { + "epoch": 0.17287102106571284, + "grad_norm": 0.5997505187988281, + "learning_rate": 3.111564848946403e-05, + "loss": 11.948, + "num_input_tokens_seen": 6737100800, + "step": 25700 + }, + { + "epoch": 0.17320734600942045, + "grad_norm": 0.6490405797958374, + "learning_rate": 3.040864619262011e-05, + "loss": 11.9353, + "num_input_tokens_seen": 6750208000, + "step": 25750 + }, + { + "epoch": 0.1735436709531281, + "grad_norm": 0.6102951169013977, + "learning_rate": 2.9709339629274285e-05, + "loss": 11.97, + "num_input_tokens_seen": 6763315200, + "step": 25800 + }, + { + "epoch": 0.1738799958968357, + "grad_norm": 0.6121110916137695, + "learning_rate": 2.9017748761776394e-05, + "loss": 11.9342, + "num_input_tokens_seen": 6776422400, + "step": 25850 + }, + { + "epoch": 0.1742163208405433, + "grad_norm": 0.6192799806594849, + "learning_rate": 2.8333893332224754e-05, + "loss": 11.928, + "num_input_tokens_seen": 6789529600, + "step": 25900 + }, + { + "epoch": 0.17455264578425092, + "grad_norm": 0.6458452939987183, + "learning_rate": 2.7657792861902393e-05, + "loss": 11.9213, + "num_input_tokens_seen": 6802636800, + "step": 25950 + }, + { + "epoch": 0.17488897072795853, + "grad_norm": 0.6549943089485168, + "learning_rate": 2.6989466650720048e-05, + "loss": 11.9298, + "num_input_tokens_seen": 6815744000, + "step": 26000 + }, + { + "epoch": 0.17488897072795853, + "eval_loss": 2.890101671218872, + "eval_runtime": 142.6241, + "eval_samples_per_second": 35.057, + "eval_steps_per_second": 8.764, + "num_input_tokens_seen": 6815744000, + "step": 26000 + }, + { + "epoch": 0.17522529567166614, + "grad_norm": 0.6231434941291809, + "learning_rate": 2.6328933776664907e-05, + "loss": 11.8924, + "num_input_tokens_seen": 6828851200, + "step": 26050 + }, + { + "epoch": 0.17556162061537375, + "grad_norm": 0.6445599794387817, + "learning_rate": 2.567621309525628e-05, + "loss": 11.9639, + "num_input_tokens_seen": 6841958400, + "step": 26100 + }, + { + "epoch": 0.17589794555908136, + "grad_norm": 0.6182544827461243, + "learning_rate": 2.503132323900714e-05, + "loss": 11.8955, + "num_input_tokens_seen": 6855065600, + "step": 26150 + }, + { + "epoch": 0.17623427050278898, + "grad_norm": 0.6308871507644653, + "learning_rate": 2.439428261689249e-05, + "loss": 11.898, + "num_input_tokens_seen": 6868172800, + "step": 26200 + }, + { + "epoch": 0.1765705954464966, + "grad_norm": 0.6257124543190002, + "learning_rate": 2.376510941382351e-05, + "loss": 11.9309, + "num_input_tokens_seen": 6881280000, + "step": 26250 + }, + { + "epoch": 0.1769069203902042, + "grad_norm": 0.6235978603363037, + "learning_rate": 2.3143821590128896e-05, + "loss": 11.9587, + "num_input_tokens_seen": 6894387200, + "step": 26300 + }, + { + "epoch": 0.1772432453339118, + "grad_norm": 0.6002153158187866, + "learning_rate": 2.2530436881041725e-05, + "loss": 11.9336, + "num_input_tokens_seen": 6907494400, + "step": 26350 + }, + { + "epoch": 0.17757957027761942, + "grad_norm": 0.6364301443099976, + "learning_rate": 2.1924972796193506e-05, + "loss": 11.9054, + "num_input_tokens_seen": 6920601600, + "step": 26400 + }, + { + "epoch": 0.17791589522132703, + "grad_norm": 0.6437053680419922, + "learning_rate": 2.132744661911412e-05, + "loss": 11.9355, + "num_input_tokens_seen": 6933708800, + "step": 26450 + }, + { + "epoch": 0.17825222016503464, + "grad_norm": 0.6307169795036316, + "learning_rate": 2.073787540673876e-05, + "loss": 11.9117, + "num_input_tokens_seen": 6946816000, + "step": 26500 + }, + { + "epoch": 0.17825222016503464, + "eval_loss": 2.888777732849121, + "eval_runtime": 142.3367, + "eval_samples_per_second": 35.128, + "eval_steps_per_second": 8.782, + "num_input_tokens_seen": 6946816000, + "step": 26500 + }, + { + "epoch": 0.17858854510874225, + "grad_norm": 0.6065594553947449, + "learning_rate": 2.0156275988920568e-05, + "loss": 11.9054, + "num_input_tokens_seen": 6959923200, + "step": 26550 + }, + { + "epoch": 0.17892487005244986, + "grad_norm": 0.6257479190826416, + "learning_rate": 1.958266496795069e-05, + "loss": 11.8735, + "num_input_tokens_seen": 6973030400, + "step": 26600 + }, + { + "epoch": 0.17926119499615747, + "grad_norm": 0.6354475617408752, + "learning_rate": 1.9017058718084012e-05, + "loss": 11.9371, + "num_input_tokens_seen": 6986137600, + "step": 26650 + }, + { + "epoch": 0.1795975199398651, + "grad_norm": 0.6183739900588989, + "learning_rate": 1.8459473385071865e-05, + "loss": 11.9123, + "num_input_tokens_seen": 6999244800, + "step": 26700 + }, + { + "epoch": 0.17993384488357272, + "grad_norm": 0.6221346259117126, + "learning_rate": 1.7909924885701145e-05, + "loss": 11.9004, + "num_input_tokens_seen": 7012352000, + "step": 26750 + }, + { + "epoch": 0.18027016982728034, + "grad_norm": 0.607341468334198, + "learning_rate": 1.7368428907339983e-05, + "loss": 11.9286, + "num_input_tokens_seen": 7025459200, + "step": 26800 + }, + { + "epoch": 0.18060649477098795, + "grad_norm": 0.6302104592323303, + "learning_rate": 1.6835000907489728e-05, + "loss": 11.9551, + "num_input_tokens_seen": 7038566400, + "step": 26850 + }, + { + "epoch": 0.18094281971469556, + "grad_norm": 0.6029033064842224, + "learning_rate": 1.6309656113344017e-05, + "loss": 11.8979, + "num_input_tokens_seen": 7051673600, + "step": 26900 + }, + { + "epoch": 0.18127914465840317, + "grad_norm": 0.6170194149017334, + "learning_rate": 1.5792409521353732e-05, + "loss": 11.9503, + "num_input_tokens_seen": 7064780800, + "step": 26950 + }, + { + "epoch": 0.18161546960211078, + "grad_norm": 0.6190406084060669, + "learning_rate": 1.5283275896799407e-05, + "loss": 11.945, + "num_input_tokens_seen": 7077888000, + "step": 27000 + }, + { + "epoch": 0.18161546960211078, + "eval_loss": 2.88728404045105, + "eval_runtime": 143.9093, + "eval_samples_per_second": 34.744, + "eval_steps_per_second": 8.686, + "num_input_tokens_seen": 7077888000, + "step": 27000 + }, + { + "epoch": 0.1819517945458184, + "grad_norm": 0.6403325796127319, + "learning_rate": 1.478226977336916e-05, + "loss": 11.8936, + "num_input_tokens_seen": 7090995200, + "step": 27050 + }, + { + "epoch": 0.182288119489526, + "grad_norm": 0.6248791813850403, + "learning_rate": 1.428940545274433e-05, + "loss": 11.9114, + "num_input_tokens_seen": 7104102400, + "step": 27100 + }, + { + "epoch": 0.1826244444332336, + "grad_norm": 0.6114192605018616, + "learning_rate": 1.3804697004190869e-05, + "loss": 11.9281, + "num_input_tokens_seen": 7117209600, + "step": 27150 + }, + { + "epoch": 0.18296076937694122, + "grad_norm": 0.6320353746414185, + "learning_rate": 1.3328158264157762e-05, + "loss": 11.9141, + "num_input_tokens_seen": 7130316800, + "step": 27200 + }, + { + "epoch": 0.18329709432064883, + "grad_norm": 0.6097228527069092, + "learning_rate": 1.2859802835882416e-05, + "loss": 11.8966, + "num_input_tokens_seen": 7143424000, + "step": 27250 + }, + { + "epoch": 0.18363341926435645, + "grad_norm": 0.6205602288246155, + "learning_rate": 1.2399644089001825e-05, + "loss": 11.9154, + "num_input_tokens_seen": 7156531200, + "step": 27300 + }, + { + "epoch": 0.18396974420806406, + "grad_norm": 0.6151401996612549, + "learning_rate": 1.1947695159171256e-05, + "loss": 11.8856, + "num_input_tokens_seen": 7169638400, + "step": 27350 + }, + { + "epoch": 0.18430606915177167, + "grad_norm": 0.6325812935829163, + "learning_rate": 1.1503968947689135e-05, + "loss": 11.9602, + "num_input_tokens_seen": 7182745600, + "step": 27400 + }, + { + "epoch": 0.18464239409547928, + "grad_norm": 0.6651480197906494, + "learning_rate": 1.106847812112892e-05, + "loss": 11.8962, + "num_input_tokens_seen": 7195852800, + "step": 27450 + }, + { + "epoch": 0.1849787190391869, + "grad_norm": 0.6203281283378601, + "learning_rate": 1.0641235110977286e-05, + "loss": 11.9267, + "num_input_tokens_seen": 7208960000, + "step": 27500 + }, + { + "epoch": 0.1849787190391869, + "eval_loss": 2.8867011070251465, + "eval_runtime": 143.8431, + "eval_samples_per_second": 34.76, + "eval_steps_per_second": 8.69, + "num_input_tokens_seen": 7208960000, + "step": 27500 + }, + { + "epoch": 0.1853150439828945, + "grad_norm": 0.628180205821991, + "learning_rate": 1.022225211327954e-05, + "loss": 11.9684, + "num_input_tokens_seen": 7222067200, + "step": 27550 + }, + { + "epoch": 0.18565136892660214, + "grad_norm": 0.6240800023078918, + "learning_rate": 9.811541088291163e-06, + "loss": 11.9017, + "num_input_tokens_seen": 7235174400, + "step": 27600 + }, + { + "epoch": 0.18598769387030975, + "grad_norm": 0.6192197799682617, + "learning_rate": 9.409113760136766e-06, + "loss": 11.9137, + "num_input_tokens_seen": 7248281600, + "step": 27650 + }, + { + "epoch": 0.18632401881401736, + "grad_norm": 0.6189801096916199, + "learning_rate": 9.014981616474937e-06, + "loss": 11.9493, + "num_input_tokens_seen": 7261388800, + "step": 27700 + }, + { + "epoch": 0.18666034375772497, + "grad_norm": 0.6035293340682983, + "learning_rate": 8.629155908170881e-06, + "loss": 11.9083, + "num_input_tokens_seen": 7274496000, + "step": 27750 + }, + { + "epoch": 0.18699666870143258, + "grad_norm": 0.6316511034965515, + "learning_rate": 8.25164764897468e-06, + "loss": 11.9187, + "num_input_tokens_seen": 7287603200, + "step": 27800 + }, + { + "epoch": 0.1873329936451402, + "grad_norm": 0.6229190826416016, + "learning_rate": 7.882467615207334e-06, + "loss": 11.8842, + "num_input_tokens_seen": 7300710400, + "step": 27850 + }, + { + "epoch": 0.1876693185888478, + "grad_norm": 0.6222130656242371, + "learning_rate": 7.521626345452914e-06, + "loss": 11.9228, + "num_input_tokens_seen": 7313817600, + "step": 27900 + }, + { + "epoch": 0.18800564353255542, + "grad_norm": 0.6076390743255615, + "learning_rate": 7.169134140257871e-06, + "loss": 11.9038, + "num_input_tokens_seen": 7326924800, + "step": 27950 + }, + { + "epoch": 0.18834196847626303, + "grad_norm": 0.6207023859024048, + "learning_rate": 6.825001061836799e-06, + "loss": 11.9013, + "num_input_tokens_seen": 7340032000, + "step": 28000 + }, + { + "epoch": 0.18834196847626303, + "eval_loss": 2.885740280151367, + "eval_runtime": 143.4494, + "eval_samples_per_second": 34.855, + "eval_steps_per_second": 8.714, + "num_input_tokens_seen": 7340032000, + "step": 28000 + }, + { + "epoch": 0.18867829341997064, + "grad_norm": 0.6160932779312134, + "learning_rate": 6.4892369337854025e-06, + "loss": 11.9279, + "num_input_tokens_seen": 7353139200, + "step": 28050 + }, + { + "epoch": 0.18901461836367825, + "grad_norm": 0.6192066669464111, + "learning_rate": 6.161851340799984e-06, + "loss": 11.8922, + "num_input_tokens_seen": 7366246400, + "step": 28100 + }, + { + "epoch": 0.18935094330738586, + "grad_norm": 0.6136648654937744, + "learning_rate": 5.842853628403799e-06, + "loss": 11.906, + "num_input_tokens_seen": 7379353600, + "step": 28150 + }, + { + "epoch": 0.18968726825109347, + "grad_norm": 0.621473491191864, + "learning_rate": 5.532252902680367e-06, + "loss": 11.8603, + "num_input_tokens_seen": 7392460800, + "step": 28200 + }, + { + "epoch": 0.19002359319480108, + "grad_norm": 0.6140876412391663, + "learning_rate": 5.2300580300135175e-06, + "loss": 11.8953, + "num_input_tokens_seen": 7405568000, + "step": 28250 + }, + { + "epoch": 0.1903599181385087, + "grad_norm": 0.6015214323997498, + "learning_rate": 4.9362776368341846e-06, + "loss": 11.8874, + "num_input_tokens_seen": 7418675200, + "step": 28300 + }, + { + "epoch": 0.1906962430822163, + "grad_norm": 0.6086856126785278, + "learning_rate": 4.650920109374279e-06, + "loss": 11.9015, + "num_input_tokens_seen": 7431782400, + "step": 28350 + }, + { + "epoch": 0.19103256802592392, + "grad_norm": 0.6232919692993164, + "learning_rate": 4.373993593427238e-06, + "loss": 11.9252, + "num_input_tokens_seen": 7444889600, + "step": 28400 + }, + { + "epoch": 0.19136889296963153, + "grad_norm": 0.6096498966217041, + "learning_rate": 4.105505994115521e-06, + "loss": 11.9018, + "num_input_tokens_seen": 7457996800, + "step": 28450 + }, + { + "epoch": 0.19170521791333917, + "grad_norm": 0.6317954659461975, + "learning_rate": 3.845464975664947e-06, + "loss": 11.9102, + "num_input_tokens_seen": 7471104000, + "step": 28500 + }, + { + "epoch": 0.19170521791333917, + "eval_loss": 2.8853116035461426, + "eval_runtime": 143.468, + "eval_samples_per_second": 34.851, + "eval_steps_per_second": 8.713, + "num_input_tokens_seen": 7471104000, + "step": 28500 + }, + { + "epoch": 0.19204154285704678, + "grad_norm": 0.6087967753410339, + "learning_rate": 3.5938779611859093e-06, + "loss": 11.9431, + "num_input_tokens_seen": 7484211200, + "step": 28550 + }, + { + "epoch": 0.1923778678007544, + "grad_norm": 0.614473283290863, + "learning_rate": 3.350752132461443e-06, + "loss": 11.9548, + "num_input_tokens_seen": 7497318400, + "step": 28600 + }, + { + "epoch": 0.192714192744462, + "grad_norm": 0.6257823705673218, + "learning_rate": 3.116094429742222e-06, + "loss": 11.9179, + "num_input_tokens_seen": 7510425600, + "step": 28650 + }, + { + "epoch": 0.1930505176881696, + "grad_norm": 0.6351081728935242, + "learning_rate": 2.889911551548585e-06, + "loss": 11.9183, + "num_input_tokens_seen": 7523532800, + "step": 28700 + }, + { + "epoch": 0.19338684263187722, + "grad_norm": 0.6371856331825256, + "learning_rate": 2.672209954479021e-06, + "loss": 11.9169, + "num_input_tokens_seen": 7536640000, + "step": 28750 + }, + { + "epoch": 0.19372316757558483, + "grad_norm": 0.622117280960083, + "learning_rate": 2.462995853026184e-06, + "loss": 11.9404, + "num_input_tokens_seen": 7549747200, + "step": 28800 + }, + { + "epoch": 0.19405949251929244, + "grad_norm": 0.6010422110557556, + "learning_rate": 2.2622752193992675e-06, + "loss": 11.9441, + "num_input_tokens_seen": 7562854400, + "step": 28850 + }, + { + "epoch": 0.19439581746300005, + "grad_norm": 0.6092264652252197, + "learning_rate": 2.0700537833536422e-06, + "loss": 11.893, + "num_input_tokens_seen": 7575961600, + "step": 28900 + }, + { + "epoch": 0.19473214240670766, + "grad_norm": 0.6216610670089722, + "learning_rate": 1.8863370320272187e-06, + "loss": 11.9201, + "num_input_tokens_seen": 7589068800, + "step": 28950 + }, + { + "epoch": 0.19506846735041528, + "grad_norm": 0.615051805973053, + "learning_rate": 1.7111302097839396e-06, + "loss": 11.9402, + "num_input_tokens_seen": 7602176000, + "step": 29000 + }, + { + "epoch": 0.19506846735041528, + "eval_loss": 2.885068655014038, + "eval_runtime": 142.9832, + "eval_samples_per_second": 34.969, + "eval_steps_per_second": 8.742, + "num_input_tokens_seen": 7602176000, + "step": 29000 + }, + { + "epoch": 0.1954047922941229, + "grad_norm": 0.6069262027740479, + "learning_rate": 1.5444383180638342e-06, + "loss": 11.9314, + "num_input_tokens_seen": 7615283200, + "step": 29050 + }, + { + "epoch": 0.1957411172378305, + "grad_norm": 0.628108561038971, + "learning_rate": 1.3862661152405309e-06, + "loss": 11.9151, + "num_input_tokens_seen": 7628390400, + "step": 29100 + }, + { + "epoch": 0.1960774421815381, + "grad_norm": 0.6232333779335022, + "learning_rate": 1.236618116485233e-06, + "loss": 11.8887, + "num_input_tokens_seen": 7641497600, + "step": 29150 + }, + { + "epoch": 0.19641376712524572, + "grad_norm": 0.6372972726821899, + "learning_rate": 1.0954985936379223e-06, + "loss": 11.8873, + "num_input_tokens_seen": 7654604800, + "step": 29200 + }, + { + "epoch": 0.19675009206895333, + "grad_norm": 0.5991822481155396, + "learning_rate": 9.6291157508529e-07, + "loss": 11.9405, + "num_input_tokens_seen": 7667712000, + "step": 29250 + }, + { + "epoch": 0.19708641701266094, + "grad_norm": 0.6108511686325073, + "learning_rate": 8.388608456459612e-07, + "loss": 11.9085, + "num_input_tokens_seen": 7680819200, + "step": 29300 + }, + { + "epoch": 0.19742274195636855, + "grad_norm": 0.6104913949966431, + "learning_rate": 7.23349946462215e-07, + "loss": 11.8859, + "num_input_tokens_seen": 7693926400, + "step": 29350 + }, + { + "epoch": 0.1977590669000762, + "grad_norm": 0.6084222197532654, + "learning_rate": 6.163821748990994e-07, + "loss": 11.9059, + "num_input_tokens_seen": 7707033600, + "step": 29400 + }, + { + "epoch": 0.1980953918437838, + "grad_norm": 0.633105993270874, + "learning_rate": 5.179605844501388e-07, + "loss": 11.9174, + "num_input_tokens_seen": 7720140800, + "step": 29450 + }, + { + "epoch": 0.1984317167874914, + "grad_norm": 0.6088514924049377, + "learning_rate": 4.280879846503049e-07, + "loss": 11.9125, + "num_input_tokens_seen": 7733248000, + "step": 29500 + }, + { + "epoch": 0.1984317167874914, + "eval_loss": 2.8849411010742188, + "eval_runtime": 143.8146, + "eval_samples_per_second": 34.767, + "eval_steps_per_second": 8.692, + "num_input_tokens_seen": 7733248000, + "step": 29500 + }, + { + "epoch": 0.19876804173119902, + "grad_norm": 0.6054402589797974, + "learning_rate": 3.467669409957463e-07, + "loss": 11.9468, + "num_input_tokens_seen": 7746355200, + "step": 29550 + }, + { + "epoch": 0.19910436667490664, + "grad_norm": 0.6133595705032349, + "learning_rate": 2.7399977487051473e-07, + "loss": 11.9368, + "num_input_tokens_seen": 7759462400, + "step": 29600 + }, + { + "epoch": 0.19944069161861425, + "grad_norm": 0.6098650693893433, + "learning_rate": 2.097885634804175e-07, + "loss": 11.8971, + "num_input_tokens_seen": 7772569600, + "step": 29650 + }, + { + "epoch": 0.19977701656232186, + "grad_norm": 0.6231054663658142, + "learning_rate": 1.541351397936319e-07, + "loss": 11.9546, + "num_input_tokens_seen": 7785676800, + "step": 29700 + }, + { + "epoch": 0.20011334150602947, + "grad_norm": 0.6323234438896179, + "learning_rate": 1.0704109248838022e-07, + "loss": 11.8848, + "num_input_tokens_seen": 7798784000, + "step": 29750 + }, + { + "epoch": 0.20044966644973708, + "grad_norm": 0.6294256448745728, + "learning_rate": 6.850776590763274e-08, + "loss": 11.9027, + "num_input_tokens_seen": 7811891200, + "step": 29800 + }, + { + "epoch": 0.2007859913934447, + "grad_norm": 0.6184135675430298, + "learning_rate": 3.853626002063848e-08, + "loss": 11.9454, + "num_input_tokens_seen": 7824998400, + "step": 29850 + }, + { + "epoch": 0.2011223163371523, + "grad_norm": 0.6376939415931702, + "learning_rate": 1.7127430391683516e-08, + "loss": 11.8928, + "num_input_tokens_seen": 7838105600, + "step": 29900 + }, + { + "epoch": 0.2014586412808599, + "grad_norm": 0.6745944619178772, + "learning_rate": 4.281888155543978e-09, + "loss": 11.9315, + "num_input_tokens_seen": 7851212800, + "step": 29950 + }, + { + "epoch": 0.20179496622456752, + "grad_norm": 0.6381050944328308, + "learning_rate": 0.0, + "loss": 11.9242, + "num_input_tokens_seen": 7864320000, + "step": 30000 + }, + { + "epoch": 0.20179496622456752, + "eval_loss": 2.8848958015441895, + "eval_runtime": 142.697, + "eval_samples_per_second": 35.039, + "eval_steps_per_second": 8.76, + "num_input_tokens_seen": 7864320000, + "step": 30000 + }, + { + "epoch": 0.20179496622456752, + "num_input_tokens_seen": 7864320000, + "step": 30000, + "total_flos": 5.0112805994496e+18, + "train_loss": 13.025330790201822, + "train_runtime": 93774.0829, + "train_samples_per_second": 81.899, + "train_steps_per_second": 0.32, + "train_tokens_per_second": 83864.536 + } + ], + "logging_steps": 50, + "max_steps": 30000, + "num_input_tokens_seen": 7864320000, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.0112805994496e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}