| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.295965950462699, | |
| "eval_steps": 500, | |
| "global_step": 44000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00033632494370761253, | |
| "grad_norm": 5.551640033721924, | |
| "learning_rate": 4.9999999999999996e-05, | |
| "loss": 40.646, | |
| "num_input_tokens_seen": 13107200, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0006726498874152251, | |
| "grad_norm": 3.0933010578155518, | |
| "learning_rate": 9.999999999999999e-05, | |
| "loss": 35.6719, | |
| "num_input_tokens_seen": 26214400, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0010089748311228376, | |
| "grad_norm": 1.8445225954055786, | |
| "learning_rate": 0.00015, | |
| "loss": 31.7929, | |
| "num_input_tokens_seen": 39321600, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0013452997748304501, | |
| "grad_norm": 3.410053253173828, | |
| "learning_rate": 0.00019999999999999998, | |
| "loss": 29.7717, | |
| "num_input_tokens_seen": 52428800, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0016816247185380627, | |
| "grad_norm": 3.2875728607177734, | |
| "learning_rate": 0.00025, | |
| "loss": 28.0852, | |
| "num_input_tokens_seen": 65536000, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.002017949662245675, | |
| "grad_norm": 3.1337997913360596, | |
| "learning_rate": 0.0003, | |
| "loss": 26.681, | |
| "num_input_tokens_seen": 78643200, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.002354274605953288, | |
| "grad_norm": 3.2058675289154053, | |
| "learning_rate": 0.00035, | |
| "loss": 25.5272, | |
| "num_input_tokens_seen": 91750400, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0026905995496609002, | |
| "grad_norm": 2.94515323638916, | |
| "learning_rate": 0.00039999999999999996, | |
| "loss": 24.5054, | |
| "num_input_tokens_seen": 104857600, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.003026924493368513, | |
| "grad_norm": 1.952689528465271, | |
| "learning_rate": 0.00045, | |
| "loss": 23.6095, | |
| "num_input_tokens_seen": 117964800, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0033632494370761253, | |
| "grad_norm": 2.3968894481658936, | |
| "learning_rate": 0.0005, | |
| "loss": 22.7795, | |
| "num_input_tokens_seen": 131072000, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0033632494370761253, | |
| "eval_loss": 5.559806823730469, | |
| "eval_runtime": 144.2401, | |
| "eval_samples_per_second": 34.664, | |
| "eval_steps_per_second": 8.666, | |
| "num_input_tokens_seen": 131072000, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.003699574380783738, | |
| "grad_norm": 2.3432235717773438, | |
| "learning_rate": 0.0005499999999999999, | |
| "loss": 22.0823, | |
| "num_input_tokens_seen": 144179200, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.00403589932449135, | |
| "grad_norm": 2.5011863708496094, | |
| "learning_rate": 0.0006, | |
| "loss": 21.3173, | |
| "num_input_tokens_seen": 157286400, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.004372224268198963, | |
| "grad_norm": 2.038031816482544, | |
| "learning_rate": 0.0005999957181118445, | |
| "loss": 20.6294, | |
| "num_input_tokens_seen": 170393600, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.004708549211906576, | |
| "grad_norm": 1.8921018838882446, | |
| "learning_rate": 0.0005999828725696082, | |
| "loss": 19.8806, | |
| "num_input_tokens_seen": 183500800, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.005044874155614189, | |
| "grad_norm": 2.1825079917907715, | |
| "learning_rate": 0.0005999614637399793, | |
| "loss": 19.2902, | |
| "num_input_tokens_seen": 196608000, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0053811990993218005, | |
| "grad_norm": 1.6200745105743408, | |
| "learning_rate": 0.0005999314922340923, | |
| "loss": 18.7553, | |
| "num_input_tokens_seen": 209715200, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.005717524043029413, | |
| "grad_norm": 1.4547795057296753, | |
| "learning_rate": 0.0005998929589075115, | |
| "loss": 18.3636, | |
| "num_input_tokens_seen": 222822400, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.006053848986737026, | |
| "grad_norm": 1.7445319890975952, | |
| "learning_rate": 0.0005998458648602063, | |
| "loss": 18.0002, | |
| "num_input_tokens_seen": 235929600, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.006390173930444639, | |
| "grad_norm": 1.5439883470535278, | |
| "learning_rate": 0.0005997902114365196, | |
| "loss": 17.6987, | |
| "num_input_tokens_seen": 249036800, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.006726498874152251, | |
| "grad_norm": 1.5094984769821167, | |
| "learning_rate": 0.0005997260002251293, | |
| "loss": 17.4367, | |
| "num_input_tokens_seen": 262144000, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.006726498874152251, | |
| "eval_loss": 4.2485198974609375, | |
| "eval_runtime": 143.1933, | |
| "eval_samples_per_second": 34.918, | |
| "eval_steps_per_second": 8.729, | |
| "num_input_tokens_seen": 262144000, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.007062823817859863, | |
| "grad_norm": 1.380916714668274, | |
| "learning_rate": 0.0005996532330590042, | |
| "loss": 17.1672, | |
| "num_input_tokens_seen": 275251200, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.007399148761567476, | |
| "grad_norm": 1.2637253999710083, | |
| "learning_rate": 0.0005995719120153497, | |
| "loss": 16.9309, | |
| "num_input_tokens_seen": 288358400, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.007735473705275089, | |
| "grad_norm": 1.6453403234481812, | |
| "learning_rate": 0.0005994820394155497, | |
| "loss": 16.7436, | |
| "num_input_tokens_seen": 301465600, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.0080717986489827, | |
| "grad_norm": 1.4340417385101318, | |
| "learning_rate": 0.0005993836178251009, | |
| "loss": 16.507, | |
| "num_input_tokens_seen": 314572800, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.008408123592690313, | |
| "grad_norm": 1.1437392234802246, | |
| "learning_rate": 0.0005992766500535377, | |
| "loss": 16.4345, | |
| "num_input_tokens_seen": 327680000, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.008744448536397926, | |
| "grad_norm": 1.3861935138702393, | |
| "learning_rate": 0.0005991611391543539, | |
| "loss": 16.2523, | |
| "num_input_tokens_seen": 340787200, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.009080773480105539, | |
| "grad_norm": 1.2690448760986328, | |
| "learning_rate": 0.0005990370884249146, | |
| "loss": 16.1004, | |
| "num_input_tokens_seen": 353894400, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.009417098423813152, | |
| "grad_norm": 1.2439777851104736, | |
| "learning_rate": 0.000598904501406362, | |
| "loss": 16.0162, | |
| "num_input_tokens_seen": 367001600, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.009753423367520764, | |
| "grad_norm": 1.1519834995269775, | |
| "learning_rate": 0.0005987633818835147, | |
| "loss": 15.8826, | |
| "num_input_tokens_seen": 380108800, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.010089748311228377, | |
| "grad_norm": 1.101974606513977, | |
| "learning_rate": 0.0005986137338847594, | |
| "loss": 15.7688, | |
| "num_input_tokens_seen": 393216000, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.010089748311228377, | |
| "eval_loss": 3.8665707111358643, | |
| "eval_runtime": 143.1117, | |
| "eval_samples_per_second": 34.938, | |
| "eval_steps_per_second": 8.734, | |
| "num_input_tokens_seen": 393216000, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.010426073254935988, | |
| "grad_norm": 1.3389586210250854, | |
| "learning_rate": 0.0005984555616819361, | |
| "loss": 15.6984, | |
| "num_input_tokens_seen": 406323200, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.010762398198643601, | |
| "grad_norm": 1.175673246383667, | |
| "learning_rate": 0.0005982888697902161, | |
| "loss": 15.6319, | |
| "num_input_tokens_seen": 419430400, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.011098723142351214, | |
| "grad_norm": 1.122758388519287, | |
| "learning_rate": 0.0005981136629679728, | |
| "loss": 15.4898, | |
| "num_input_tokens_seen": 432537600, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.011435048086058826, | |
| "grad_norm": 1.0009791851043701, | |
| "learning_rate": 0.0005979299462166464, | |
| "loss": 15.4399, | |
| "num_input_tokens_seen": 445644800, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.01177137302976644, | |
| "grad_norm": 1.0287221670150757, | |
| "learning_rate": 0.0005977377247806006, | |
| "loss": 15.3713, | |
| "num_input_tokens_seen": 458752000, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.012107697973474052, | |
| "grad_norm": 1.061454176902771, | |
| "learning_rate": 0.0005975370041469738, | |
| "loss": 15.266, | |
| "num_input_tokens_seen": 471859200, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.012444022917181665, | |
| "grad_norm": 1.2214291095733643, | |
| "learning_rate": 0.0005973277900455209, | |
| "loss": 15.2011, | |
| "num_input_tokens_seen": 484966400, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.012780347860889277, | |
| "grad_norm": 1.0759477615356445, | |
| "learning_rate": 0.0005971100884484513, | |
| "loss": 15.153, | |
| "num_input_tokens_seen": 498073600, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.013116672804596888, | |
| "grad_norm": 0.8617029190063477, | |
| "learning_rate": 0.0005968839055702578, | |
| "loss": 15.1029, | |
| "num_input_tokens_seen": 511180800, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.013452997748304501, | |
| "grad_norm": 1.063086748123169, | |
| "learning_rate": 0.0005966492478675384, | |
| "loss": 14.9894, | |
| "num_input_tokens_seen": 524288000, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.013452997748304501, | |
| "eval_loss": 3.6751515865325928, | |
| "eval_runtime": 143.2457, | |
| "eval_samples_per_second": 34.905, | |
| "eval_steps_per_second": 8.726, | |
| "num_input_tokens_seen": 524288000, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.013789322692012114, | |
| "grad_norm": 0.8963438868522644, | |
| "learning_rate": 0.000596406122038814, | |
| "loss": 14.9472, | |
| "num_input_tokens_seen": 537395200, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.014125647635719727, | |
| "grad_norm": 0.8694930672645569, | |
| "learning_rate": 0.0005961545350243351, | |
| "loss": 14.8887, | |
| "num_input_tokens_seen": 550502400, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.01446197257942734, | |
| "grad_norm": 0.9276862144470215, | |
| "learning_rate": 0.0005958944940058844, | |
| "loss": 14.8208, | |
| "num_input_tokens_seen": 563609600, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.014798297523134952, | |
| "grad_norm": 0.8817610144615173, | |
| "learning_rate": 0.0005956260064065727, | |
| "loss": 14.7679, | |
| "num_input_tokens_seen": 576716800, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.015134622466842565, | |
| "grad_norm": 0.888661801815033, | |
| "learning_rate": 0.0005953490798906257, | |
| "loss": 14.7253, | |
| "num_input_tokens_seen": 589824000, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.015470947410550178, | |
| "grad_norm": 0.8768919706344604, | |
| "learning_rate": 0.0005950637223631658, | |
| "loss": 14.6678, | |
| "num_input_tokens_seen": 602931200, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.01580727235425779, | |
| "grad_norm": 0.8688133955001831, | |
| "learning_rate": 0.0005947699419699865, | |
| "loss": 14.6422, | |
| "num_input_tokens_seen": 616038400, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.0161435972979654, | |
| "grad_norm": 0.8557626008987427, | |
| "learning_rate": 0.0005944677470973196, | |
| "loss": 14.6511, | |
| "num_input_tokens_seen": 629145600, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.016479922241673016, | |
| "grad_norm": 0.956565260887146, | |
| "learning_rate": 0.0005941571463715962, | |
| "loss": 14.5594, | |
| "num_input_tokens_seen": 642252800, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.016816247185380627, | |
| "grad_norm": 0.8760116100311279, | |
| "learning_rate": 0.0005938381486591999, | |
| "loss": 14.5031, | |
| "num_input_tokens_seen": 655360000, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.016816247185380627, | |
| "eval_loss": 3.5482449531555176, | |
| "eval_runtime": 143.9882, | |
| "eval_samples_per_second": 34.725, | |
| "eval_steps_per_second": 8.681, | |
| "num_input_tokens_seen": 655360000, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.01715257212908824, | |
| "grad_norm": 0.8938534259796143, | |
| "learning_rate": 0.0005935107630662145, | |
| "loss": 14.4733, | |
| "num_input_tokens_seen": 668467200, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.017488897072795852, | |
| "grad_norm": 0.8379454016685486, | |
| "learning_rate": 0.0005931749989381631, | |
| "loss": 14.386, | |
| "num_input_tokens_seen": 681574400, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.017825222016503463, | |
| "grad_norm": 0.7709890007972717, | |
| "learning_rate": 0.000592830865859742, | |
| "loss": 14.3883, | |
| "num_input_tokens_seen": 694681600, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.018161546960211078, | |
| "grad_norm": 0.8483361601829529, | |
| "learning_rate": 0.000592478373654547, | |
| "loss": 14.4122, | |
| "num_input_tokens_seen": 707788800, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.01849787190391869, | |
| "grad_norm": 0.8239767551422119, | |
| "learning_rate": 0.0005921175323847927, | |
| "loss": 14.3169, | |
| "num_input_tokens_seen": 720896000, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.018834196847626303, | |
| "grad_norm": 0.7901423573493958, | |
| "learning_rate": 0.0005917483523510252, | |
| "loss": 14.263, | |
| "num_input_tokens_seen": 734003200, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.019170521791333914, | |
| "grad_norm": 0.7838689088821411, | |
| "learning_rate": 0.0005913708440918291, | |
| "loss": 14.2589, | |
| "num_input_tokens_seen": 747110400, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.01950684673504153, | |
| "grad_norm": 0.90792316198349, | |
| "learning_rate": 0.000590985018383525, | |
| "loss": 14.2538, | |
| "num_input_tokens_seen": 760217600, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.01984317167874914, | |
| "grad_norm": 0.7609734535217285, | |
| "learning_rate": 0.0005905908862398632, | |
| "loss": 14.208, | |
| "num_input_tokens_seen": 773324800, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.020179496622456754, | |
| "grad_norm": 0.7796016335487366, | |
| "learning_rate": 0.0005901884589117088, | |
| "loss": 14.2405, | |
| "num_input_tokens_seen": 786432000, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.020179496622456754, | |
| "eval_loss": 3.469104766845703, | |
| "eval_runtime": 142.6639, | |
| "eval_samples_per_second": 35.047, | |
| "eval_steps_per_second": 8.762, | |
| "num_input_tokens_seen": 786432000, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.020515821566164365, | |
| "grad_norm": 0.7596002221107483, | |
| "learning_rate": 0.0005897777478867204, | |
| "loss": 14.1367, | |
| "num_input_tokens_seen": 799539200, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.020852146509871976, | |
| "grad_norm": 0.7429248094558716, | |
| "learning_rate": 0.0005893587648890227, | |
| "loss": 14.1394, | |
| "num_input_tokens_seen": 812646400, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.02118847145357959, | |
| "grad_norm": 0.8267149329185486, | |
| "learning_rate": 0.0005889315218788711, | |
| "loss": 14.1218, | |
| "num_input_tokens_seen": 825753600, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.021524796397287202, | |
| "grad_norm": 0.7618885636329651, | |
| "learning_rate": 0.0005884960310523109, | |
| "loss": 14.0575, | |
| "num_input_tokens_seen": 838860800, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.021861121340994816, | |
| "grad_norm": 0.7333565950393677, | |
| "learning_rate": 0.0005880523048408287, | |
| "loss": 14.0723, | |
| "num_input_tokens_seen": 851968000, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.022197446284702427, | |
| "grad_norm": 0.7767319083213806, | |
| "learning_rate": 0.0005876003559109981, | |
| "loss": 14.0067, | |
| "num_input_tokens_seen": 865075200, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.022533771228410042, | |
| "grad_norm": 0.7478107213973999, | |
| "learning_rate": 0.0005871401971641175, | |
| "loss": 14.0154, | |
| "num_input_tokens_seen": 878182400, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.022870096172117653, | |
| "grad_norm": 0.7439610958099365, | |
| "learning_rate": 0.0005866718417358421, | |
| "loss": 13.9922, | |
| "num_input_tokens_seen": 891289600, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.023206421115825267, | |
| "grad_norm": 0.7624334096908569, | |
| "learning_rate": 0.0005861953029958091, | |
| "loss": 13.9456, | |
| "num_input_tokens_seen": 904396800, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.02354274605953288, | |
| "grad_norm": 0.7594953775405884, | |
| "learning_rate": 0.0005857105945472556, | |
| "loss": 13.9742, | |
| "num_input_tokens_seen": 917504000, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.02354274605953288, | |
| "eval_loss": 3.410249710083008, | |
| "eval_runtime": 143.2447, | |
| "eval_samples_per_second": 34.905, | |
| "eval_steps_per_second": 8.726, | |
| "num_input_tokens_seen": 917504000, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.02387907100324049, | |
| "grad_norm": 0.7231994867324829, | |
| "learning_rate": 0.0005852177302266308, | |
| "loss": 13.959, | |
| "num_input_tokens_seen": 930611200, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.024215395946948104, | |
| "grad_norm": 0.7433524131774902, | |
| "learning_rate": 0.0005847167241032006, | |
| "loss": 13.8909, | |
| "num_input_tokens_seen": 943718400, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.024551720890655715, | |
| "grad_norm": 0.6849333643913269, | |
| "learning_rate": 0.0005842075904786462, | |
| "loss": 13.8984, | |
| "num_input_tokens_seen": 956825600, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.02488804583436333, | |
| "grad_norm": 0.7122375965118408, | |
| "learning_rate": 0.000583690343886656, | |
| "loss": 13.8611, | |
| "num_input_tokens_seen": 969932800, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.02522437077807094, | |
| "grad_norm": 0.7722771763801575, | |
| "learning_rate": 0.0005831649990925102, | |
| "loss": 13.862, | |
| "num_input_tokens_seen": 983040000, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.025560695721778555, | |
| "grad_norm": 0.7184539437294006, | |
| "learning_rate": 0.0005826315710926599, | |
| "loss": 13.8641, | |
| "num_input_tokens_seen": 996147200, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.025897020665486166, | |
| "grad_norm": 0.7419592142105103, | |
| "learning_rate": 0.0005820900751142987, | |
| "loss": 13.808, | |
| "num_input_tokens_seen": 1009254400, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.026233345609193777, | |
| "grad_norm": 0.7380815148353577, | |
| "learning_rate": 0.0005815405266149281, | |
| "loss": 13.7751, | |
| "num_input_tokens_seen": 1022361600, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.02656967055290139, | |
| "grad_norm": 0.7219839692115784, | |
| "learning_rate": 0.000580982941281916, | |
| "loss": 13.8042, | |
| "num_input_tokens_seen": 1035468800, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.026905995496609002, | |
| "grad_norm": 0.7262866497039795, | |
| "learning_rate": 0.0005804173350320493, | |
| "loss": 13.7434, | |
| "num_input_tokens_seen": 1048576000, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.026905995496609002, | |
| "eval_loss": 3.364596128463745, | |
| "eval_runtime": 143.5464, | |
| "eval_samples_per_second": 34.832, | |
| "eval_steps_per_second": 8.708, | |
| "num_input_tokens_seen": 1048576000, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.027242320440316617, | |
| "grad_norm": 0.7669729590415955, | |
| "learning_rate": 0.0005798437240110794, | |
| "loss": 13.759, | |
| "num_input_tokens_seen": 1061683200, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.027578645384024228, | |
| "grad_norm": 0.75081866979599, | |
| "learning_rate": 0.0005792621245932613, | |
| "loss": 13.8008, | |
| "num_input_tokens_seen": 1074790400, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.027914970327731842, | |
| "grad_norm": 0.6844099164009094, | |
| "learning_rate": 0.0005786725533808858, | |
| "loss": 13.7462, | |
| "num_input_tokens_seen": 1087897600, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.028251295271439453, | |
| "grad_norm": 0.6971242427825928, | |
| "learning_rate": 0.0005780750272038064, | |
| "loss": 13.7535, | |
| "num_input_tokens_seen": 1101004800, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.028587620215147068, | |
| "grad_norm": 0.7153123021125793, | |
| "learning_rate": 0.0005774695631189582, | |
| "loss": 13.7085, | |
| "num_input_tokens_seen": 1114112000, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.02892394515885468, | |
| "grad_norm": 0.6920833587646484, | |
| "learning_rate": 0.0005768561784098711, | |
| "loss": 13.6495, | |
| "num_input_tokens_seen": 1127219200, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.02926027010256229, | |
| "grad_norm": 0.7521312236785889, | |
| "learning_rate": 0.0005762348905861764, | |
| "loss": 13.6559, | |
| "num_input_tokens_seen": 1140326400, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.029596595046269904, | |
| "grad_norm": 0.676365077495575, | |
| "learning_rate": 0.0005756057173831074, | |
| "loss": 13.6069, | |
| "num_input_tokens_seen": 1153433600, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.029932919989977515, | |
| "grad_norm": 0.6946042776107788, | |
| "learning_rate": 0.0005749686767609928, | |
| "loss": 13.6218, | |
| "num_input_tokens_seen": 1166540800, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.03026924493368513, | |
| "grad_norm": 0.7127935886383057, | |
| "learning_rate": 0.0005743237869047437, | |
| "loss": 13.6039, | |
| "num_input_tokens_seen": 1179648000, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.03026924493368513, | |
| "eval_loss": 3.3240578174591064, | |
| "eval_runtime": 143.6605, | |
| "eval_samples_per_second": 34.804, | |
| "eval_steps_per_second": 8.701, | |
| "num_input_tokens_seen": 1179648000, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.03060556987739274, | |
| "grad_norm": 0.7366272807121277, | |
| "learning_rate": 0.0005736710662233351, | |
| "loss": 13.604, | |
| "num_input_tokens_seen": 1192755200, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.030941894821100355, | |
| "grad_norm": 0.6478694081306458, | |
| "learning_rate": 0.0005730105333492799, | |
| "loss": 13.5717, | |
| "num_input_tokens_seen": 1205862400, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.03127821976480797, | |
| "grad_norm": 0.7228036522865295, | |
| "learning_rate": 0.0005723422071380976, | |
| "loss": 13.5385, | |
| "num_input_tokens_seen": 1218969600, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.03161454470851558, | |
| "grad_norm": 0.6998932957649231, | |
| "learning_rate": 0.0005716661066677753, | |
| "loss": 13.5237, | |
| "num_input_tokens_seen": 1232076800, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.03195086965222319, | |
| "grad_norm": 0.6933197379112244, | |
| "learning_rate": 0.0005709822512382236, | |
| "loss": 13.5417, | |
| "num_input_tokens_seen": 1245184000, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.0322871945959308, | |
| "grad_norm": 0.7209503054618835, | |
| "learning_rate": 0.0005702906603707256, | |
| "loss": 13.5653, | |
| "num_input_tokens_seen": 1258291200, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.032623519539638414, | |
| "grad_norm": 0.6796743273735046, | |
| "learning_rate": 0.0005695913538073798, | |
| "loss": 13.557, | |
| "num_input_tokens_seen": 1271398400, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.03295984448334603, | |
| "grad_norm": 0.6676873564720154, | |
| "learning_rate": 0.0005688843515105359, | |
| "loss": 13.4965, | |
| "num_input_tokens_seen": 1284505600, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.03329616942705364, | |
| "grad_norm": 0.6413120627403259, | |
| "learning_rate": 0.0005681696736622258, | |
| "loss": 13.5013, | |
| "num_input_tokens_seen": 1297612800, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.033632494370761254, | |
| "grad_norm": 0.6716573238372803, | |
| "learning_rate": 0.0005674473406635868, | |
| "loss": 13.4891, | |
| "num_input_tokens_seen": 1310720000, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.033632494370761254, | |
| "eval_loss": 3.2916977405548096, | |
| "eval_runtime": 143.1949, | |
| "eval_samples_per_second": 34.917, | |
| "eval_steps_per_second": 8.729, | |
| "num_input_tokens_seen": 1310720000, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.033968819314468865, | |
| "grad_norm": 0.6855191588401794, | |
| "learning_rate": 0.0005667173731342798, | |
| "loss": 13.4753, | |
| "num_input_tokens_seen": 1323827200, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.03430514425817648, | |
| "grad_norm": 0.6939865350723267, | |
| "learning_rate": 0.0005659797919119, | |
| "loss": 13.4583, | |
| "num_input_tokens_seen": 1336934400, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.034641469201884094, | |
| "grad_norm": 0.6979573369026184, | |
| "learning_rate": 0.0005652346180513829, | |
| "loss": 13.4339, | |
| "num_input_tokens_seen": 1350041600, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.034977794145591705, | |
| "grad_norm": 0.6683679819107056, | |
| "learning_rate": 0.0005644818728244026, | |
| "loss": 13.4496, | |
| "num_input_tokens_seen": 1363148800, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.035314119089299316, | |
| "grad_norm": 0.6824884414672852, | |
| "learning_rate": 0.0005637215777187651, | |
| "loss": 13.4705, | |
| "num_input_tokens_seen": 1376256000, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.03565044403300693, | |
| "grad_norm": 0.6840626001358032, | |
| "learning_rate": 0.0005629537544377942, | |
| "loss": 13.4349, | |
| "num_input_tokens_seen": 1389363200, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.035986768976714545, | |
| "grad_norm": 0.6613268852233887, | |
| "learning_rate": 0.0005621784248997128, | |
| "loss": 13.46, | |
| "num_input_tokens_seen": 1402470400, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.036323093920422156, | |
| "grad_norm": 0.6920462846755981, | |
| "learning_rate": 0.0005613956112370167, | |
| "loss": 13.4035, | |
| "num_input_tokens_seen": 1415577600, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.03665941886412977, | |
| "grad_norm": 0.6671062111854553, | |
| "learning_rate": 0.0005606053357958429, | |
| "loss": 13.3312, | |
| "num_input_tokens_seen": 1428684800, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.03699574380783738, | |
| "grad_norm": 0.6575270295143127, | |
| "learning_rate": 0.0005598076211353316, | |
| "loss": 13.3718, | |
| "num_input_tokens_seen": 1441792000, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.03699574380783738, | |
| "eval_loss": 3.2621724605560303, | |
| "eval_runtime": 143.0547, | |
| "eval_samples_per_second": 34.952, | |
| "eval_steps_per_second": 8.738, | |
| "num_input_tokens_seen": 1441792000, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.037332068751544996, | |
| "grad_norm": 0.6697873473167419, | |
| "learning_rate": 0.0005590024900269825, | |
| "loss": 13.3337, | |
| "num_input_tokens_seen": 1454899200, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.03766839369525261, | |
| "grad_norm": 0.6501484513282776, | |
| "learning_rate": 0.0005581899654540048, | |
| "loss": 13.3573, | |
| "num_input_tokens_seen": 1468006400, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.03800471863896022, | |
| "grad_norm": 0.6328523755073547, | |
| "learning_rate": 0.0005573700706106607, | |
| "loss": 13.3513, | |
| "num_input_tokens_seen": 1481113600, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.03834104358266783, | |
| "grad_norm": 0.6470258831977844, | |
| "learning_rate": 0.0005565428289016039, | |
| "loss": 13.2964, | |
| "num_input_tokens_seen": 1494220800, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.03867736852637544, | |
| "grad_norm": 0.6687533855438232, | |
| "learning_rate": 0.0005557082639412105, | |
| "loss": 13.3508, | |
| "num_input_tokens_seen": 1507328000, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.03901369347008306, | |
| "grad_norm": 0.6524744629859924, | |
| "learning_rate": 0.0005548663995529062, | |
| "loss": 13.3254, | |
| "num_input_tokens_seen": 1520435200, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.03935001841379067, | |
| "grad_norm": 0.6340435147285461, | |
| "learning_rate": 0.0005540172597684852, | |
| "loss": 13.3107, | |
| "num_input_tokens_seen": 1533542400, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.03968634335749828, | |
| "grad_norm": 0.6675236225128174, | |
| "learning_rate": 0.000553160868827425, | |
| "loss": 13.264, | |
| "num_input_tokens_seen": 1546649600, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.04002266830120589, | |
| "grad_norm": 0.6894492506980896, | |
| "learning_rate": 0.0005522972511761935, | |
| "loss": 13.2815, | |
| "num_input_tokens_seen": 1559756800, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.04035899324491351, | |
| "grad_norm": 0.6689581274986267, | |
| "learning_rate": 0.000551426431467552, | |
| "loss": 13.3443, | |
| "num_input_tokens_seen": 1572864000, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.04035899324491351, | |
| "eval_loss": 3.237107276916504, | |
| "eval_runtime": 142.4946, | |
| "eval_samples_per_second": 35.089, | |
| "eval_steps_per_second": 8.772, | |
| "num_input_tokens_seen": 1572864000, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.04069531818862112, | |
| "grad_norm": 0.6856837868690491, | |
| "learning_rate": 0.0005505484345598515, | |
| "loss": 13.2681, | |
| "num_input_tokens_seen": 1585971200, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.04103164313232873, | |
| "grad_norm": 0.6631260514259338, | |
| "learning_rate": 0.0005496632855163221, | |
| "loss": 13.2594, | |
| "num_input_tokens_seen": 1599078400, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.04136796807603634, | |
| "grad_norm": 0.6479213833808899, | |
| "learning_rate": 0.0005487710096043584, | |
| "loss": 13.2822, | |
| "num_input_tokens_seen": 1612185600, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.04170429301974395, | |
| "grad_norm": 0.6974468231201172, | |
| "learning_rate": 0.0005478716322947985, | |
| "loss": 13.2206, | |
| "num_input_tokens_seen": 1625292800, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.04204061796345157, | |
| "grad_norm": 0.633106529712677, | |
| "learning_rate": 0.0005469651792611956, | |
| "loss": 13.2054, | |
| "num_input_tokens_seen": 1638400000, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.04237694290715918, | |
| "grad_norm": 0.6755931377410889, | |
| "learning_rate": 0.0005460516763790867, | |
| "loss": 13.206, | |
| "num_input_tokens_seen": 1651507200, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.04271326785086679, | |
| "grad_norm": 0.6707047820091248, | |
| "learning_rate": 0.0005451311497252529, | |
| "loss": 13.2538, | |
| "num_input_tokens_seen": 1664614400, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.043049592794574404, | |
| "grad_norm": 0.6525476574897766, | |
| "learning_rate": 0.0005442036255769754, | |
| "loss": 13.1984, | |
| "num_input_tokens_seen": 1677721600, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.04338591773828202, | |
| "grad_norm": 0.6575285196304321, | |
| "learning_rate": 0.0005432691304112853, | |
| "loss": 13.1798, | |
| "num_input_tokens_seen": 1690828800, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.04372224268198963, | |
| "grad_norm": 0.6678348183631897, | |
| "learning_rate": 0.0005423276909042077, | |
| "loss": 13.1945, | |
| "num_input_tokens_seen": 1703936000, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.04372224268198963, | |
| "eval_loss": 3.2158761024475098, | |
| "eval_runtime": 143.477, | |
| "eval_samples_per_second": 34.849, | |
| "eval_steps_per_second": 8.712, | |
| "num_input_tokens_seen": 1703936000, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.044058567625697244, | |
| "grad_norm": 0.6580634713172913, | |
| "learning_rate": 0.0005413793339300004, | |
| "loss": 13.1733, | |
| "num_input_tokens_seen": 1717043200, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.044394892569404855, | |
| "grad_norm": 0.7573990821838379, | |
| "learning_rate": 0.000540424086560387, | |
| "loss": 13.1998, | |
| "num_input_tokens_seen": 1730150400, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.044731217513112466, | |
| "grad_norm": 0.6535853147506714, | |
| "learning_rate": 0.000539461976063783, | |
| "loss": 13.1668, | |
| "num_input_tokens_seen": 1743257600, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.045067542456820084, | |
| "grad_norm": 0.6875225305557251, | |
| "learning_rate": 0.0005384930299045193, | |
| "loss": 13.1695, | |
| "num_input_tokens_seen": 1756364800, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.045403867400527695, | |
| "grad_norm": 0.6734049320220947, | |
| "learning_rate": 0.0005375172757420559, | |
| "loss": 13.1982, | |
| "num_input_tokens_seen": 1769472000, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.045740192344235306, | |
| "grad_norm": 0.6594141721725464, | |
| "learning_rate": 0.0005365347414301942, | |
| "loss": 13.132, | |
| "num_input_tokens_seen": 1782579200, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.04607651728794292, | |
| "grad_norm": 0.606103777885437, | |
| "learning_rate": 0.0005355454550162814, | |
| "loss": 13.15, | |
| "num_input_tokens_seen": 1795686400, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.046412842231650535, | |
| "grad_norm": 0.6484935879707336, | |
| "learning_rate": 0.0005345494447404089, | |
| "loss": 13.1301, | |
| "num_input_tokens_seen": 1808793600, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.046749167175358146, | |
| "grad_norm": 0.7227681279182434, | |
| "learning_rate": 0.0005335467390346076, | |
| "loss": 13.1443, | |
| "num_input_tokens_seen": 1821900800, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.04708549211906576, | |
| "grad_norm": 0.6700535416603088, | |
| "learning_rate": 0.0005325373665220355, | |
| "loss": 13.0997, | |
| "num_input_tokens_seen": 1835008000, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.04708549211906576, | |
| "eval_loss": 3.194380044937134, | |
| "eval_runtime": 142.7031, | |
| "eval_samples_per_second": 35.038, | |
| "eval_steps_per_second": 8.759, | |
| "num_input_tokens_seen": 1835008000, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.04742181706277337, | |
| "grad_norm": 0.6384073495864868, | |
| "learning_rate": 0.0005315213560161604, | |
| "loss": 13.0959, | |
| "num_input_tokens_seen": 1848115200, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.04775814200648098, | |
| "grad_norm": 0.6730595231056213, | |
| "learning_rate": 0.0005304987365199383, | |
| "loss": 13.081, | |
| "num_input_tokens_seen": 1861222400, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.0480944669501886, | |
| "grad_norm": 0.6358413696289062, | |
| "learning_rate": 0.0005294695372249843, | |
| "loss": 13.0862, | |
| "num_input_tokens_seen": 1874329600, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.04843079189389621, | |
| "grad_norm": 0.6400682926177979, | |
| "learning_rate": 0.0005284337875107402, | |
| "loss": 13.0959, | |
| "num_input_tokens_seen": 1887436800, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.04876711683760382, | |
| "grad_norm": 0.6422862410545349, | |
| "learning_rate": 0.0005273915169436359, | |
| "loss": 13.0957, | |
| "num_input_tokens_seen": 1900544000, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.04910344178131143, | |
| "grad_norm": 0.6517816185951233, | |
| "learning_rate": 0.0005263427552762443, | |
| "loss": 13.0312, | |
| "num_input_tokens_seen": 1913651200, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.04943976672501905, | |
| "grad_norm": 0.6352054476737976, | |
| "learning_rate": 0.0005252875324464333, | |
| "loss": 13.0642, | |
| "num_input_tokens_seen": 1926758400, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.04977609166872666, | |
| "grad_norm": 0.6357077956199646, | |
| "learning_rate": 0.0005242258785765105, | |
| "loss": 13.0704, | |
| "num_input_tokens_seen": 1939865600, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.05011241661243427, | |
| "grad_norm": 0.6192994713783264, | |
| "learning_rate": 0.0005231578239723635, | |
| "loss": 13.0549, | |
| "num_input_tokens_seen": 1952972800, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.05044874155614188, | |
| "grad_norm": 0.6180127859115601, | |
| "learning_rate": 0.0005220833991225946, | |
| "loss": 13.1213, | |
| "num_input_tokens_seen": 1966080000, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.05044874155614188, | |
| "eval_loss": 3.1755564212799072, | |
| "eval_runtime": 142.155, | |
| "eval_samples_per_second": 35.173, | |
| "eval_steps_per_second": 8.793, | |
| "num_input_tokens_seen": 1966080000, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.05078506649984949, | |
| "grad_norm": 0.663218080997467, | |
| "learning_rate": 0.0005210026346976507, | |
| "loss": 13.0441, | |
| "num_input_tokens_seen": 1979187200, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.05112139144355711, | |
| "grad_norm": 0.6263464093208313, | |
| "learning_rate": 0.0005199155615489478, | |
| "loss": 13.0148, | |
| "num_input_tokens_seen": 1992294400, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.05145771638726472, | |
| "grad_norm": 0.6272994875907898, | |
| "learning_rate": 0.0005188222107079903, | |
| "loss": 13.0467, | |
| "num_input_tokens_seen": 2005401600, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.05179404133097233, | |
| "grad_norm": 0.6265645623207092, | |
| "learning_rate": 0.0005177226133854845, | |
| "loss": 13.0346, | |
| "num_input_tokens_seen": 2018508800, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.05213036627467994, | |
| "grad_norm": 0.6151268482208252, | |
| "learning_rate": 0.0005166168009704493, | |
| "loss": 13.0065, | |
| "num_input_tokens_seen": 2031616000, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.052466691218387554, | |
| "grad_norm": 0.6234976649284363, | |
| "learning_rate": 0.0005155048050293182, | |
| "loss": 13.0419, | |
| "num_input_tokens_seen": 2044723200, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.05280301616209517, | |
| "grad_norm": 0.6200417280197144, | |
| "learning_rate": 0.0005143866573050397, | |
| "loss": 12.9675, | |
| "num_input_tokens_seen": 2057830400, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.05313934110580278, | |
| "grad_norm": 0.6281518340110779, | |
| "learning_rate": 0.0005132623897161705, | |
| "loss": 12.9652, | |
| "num_input_tokens_seen": 2070937600, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.053475666049510394, | |
| "grad_norm": 0.6501129269599915, | |
| "learning_rate": 0.0005121320343559641, | |
| "loss": 13.0074, | |
| "num_input_tokens_seen": 2084044800, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.053811990993218005, | |
| "grad_norm": 0.6317852139472961, | |
| "learning_rate": 0.0005109956234914558, | |
| "loss": 12.977, | |
| "num_input_tokens_seen": 2097152000, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.053811990993218005, | |
| "eval_loss": 3.1572272777557373, | |
| "eval_runtime": 142.9574, | |
| "eval_samples_per_second": 34.975, | |
| "eval_steps_per_second": 8.744, | |
| "num_input_tokens_seen": 2097152000, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.05414831593692562, | |
| "grad_norm": 0.6210319995880127, | |
| "learning_rate": 0.0005098531895625401, | |
| "loss": 12.9927, | |
| "num_input_tokens_seen": 2110259200, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.054484640880633234, | |
| "grad_norm": 0.6362951397895813, | |
| "learning_rate": 0.0005087047651810459, | |
| "loss": 12.9658, | |
| "num_input_tokens_seen": 2123366400, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.054820965824340845, | |
| "grad_norm": 0.6348525285720825, | |
| "learning_rate": 0.0005075503831298047, | |
| "loss": 12.9523, | |
| "num_input_tokens_seen": 2136473600, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.055157290768048456, | |
| "grad_norm": 0.6554312705993652, | |
| "learning_rate": 0.0005063900763617156, | |
| "loss": 12.9581, | |
| "num_input_tokens_seen": 2149580800, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.05549361571175607, | |
| "grad_norm": 0.6416252851486206, | |
| "learning_rate": 0.0005052238779988038, | |
| "loss": 12.9369, | |
| "num_input_tokens_seen": 2162688000, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.055829940655463685, | |
| "grad_norm": 0.653998076915741, | |
| "learning_rate": 0.0005040518213312757, | |
| "loss": 12.9279, | |
| "num_input_tokens_seen": 2175795200, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.056166265599171296, | |
| "grad_norm": 0.6076102256774902, | |
| "learning_rate": 0.0005028739398165686, | |
| "loss": 12.9306, | |
| "num_input_tokens_seen": 2188902400, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.05650259054287891, | |
| "grad_norm": 0.6263251304626465, | |
| "learning_rate": 0.0005016902670783949, | |
| "loss": 12.9367, | |
| "num_input_tokens_seen": 2202009600, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.05683891548658652, | |
| "grad_norm": 0.6503254175186157, | |
| "learning_rate": 0.0005005008369057835, | |
| "loss": 12.8458, | |
| "num_input_tokens_seen": 2215116800, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.057175240430294136, | |
| "grad_norm": 0.6300747394561768, | |
| "learning_rate": 0.0004993056832521138, | |
| "loss": 12.8892, | |
| "num_input_tokens_seen": 2228224000, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.057175240430294136, | |
| "eval_loss": 3.1433920860290527, | |
| "eval_runtime": 145.793, | |
| "eval_samples_per_second": 34.295, | |
| "eval_steps_per_second": 8.574, | |
| "num_input_tokens_seen": 2228224000, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.05751156537400175, | |
| "grad_norm": 0.6060501337051392, | |
| "learning_rate": 0.0004981048402341477, | |
| "loss": 12.9441, | |
| "num_input_tokens_seen": 2241331200, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.05784789031770936, | |
| "grad_norm": 0.650665819644928, | |
| "learning_rate": 0.0004968983421310554, | |
| "loss": 12.8715, | |
| "num_input_tokens_seen": 2254438400, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.05818421526141697, | |
| "grad_norm": 0.6535419821739197, | |
| "learning_rate": 0.0004956862233834363, | |
| "loss": 12.8842, | |
| "num_input_tokens_seen": 2267545600, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.05852054020512458, | |
| "grad_norm": 0.6426728367805481, | |
| "learning_rate": 0.0004944685185923365, | |
| "loss": 12.9156, | |
| "num_input_tokens_seen": 2280652800, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.0588568651488322, | |
| "grad_norm": 0.6501869559288025, | |
| "learning_rate": 0.0004932452625182607, | |
| "loss": 12.8779, | |
| "num_input_tokens_seen": 2293760000, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.05919319009253981, | |
| "grad_norm": 0.6345402002334595, | |
| "learning_rate": 0.0004920164900801805, | |
| "loss": 12.8629, | |
| "num_input_tokens_seen": 2306867200, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.05952951503624742, | |
| "grad_norm": 0.6712388396263123, | |
| "learning_rate": 0.0004907822363545365, | |
| "loss": 12.876, | |
| "num_input_tokens_seen": 2319974400, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.05986583997995503, | |
| "grad_norm": 0.6922229528427124, | |
| "learning_rate": 0.0004895425365742384, | |
| "loss": 12.8556, | |
| "num_input_tokens_seen": 2333081600, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.06020216492366265, | |
| "grad_norm": 0.6531935334205627, | |
| "learning_rate": 0.0004882974261276581, | |
| "loss": 12.8296, | |
| "num_input_tokens_seen": 2346188800, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.06053848986737026, | |
| "grad_norm": 0.6729333400726318, | |
| "learning_rate": 0.00048704694055762005, | |
| "loss": 12.8258, | |
| "num_input_tokens_seen": 2359296000, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.06053848986737026, | |
| "eval_loss": 3.1286280155181885, | |
| "eval_runtime": 143.0292, | |
| "eval_samples_per_second": 34.958, | |
| "eval_steps_per_second": 8.739, | |
| "num_input_tokens_seen": 2359296000, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.06087481481107787, | |
| "grad_norm": 0.6235183477401733, | |
| "learning_rate": 0.0004857911155603867, | |
| "loss": 12.8588, | |
| "num_input_tokens_seen": 2372403200, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.06121113975478548, | |
| "grad_norm": 0.642000138759613, | |
| "learning_rate": 0.0004845299869846392, | |
| "loss": 12.8232, | |
| "num_input_tokens_seen": 2385510400, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.06154746469849309, | |
| "grad_norm": 0.6252527236938477, | |
| "learning_rate": 0.0004832635908304543, | |
| "loss": 12.8595, | |
| "num_input_tokens_seen": 2398617600, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.06188378964220071, | |
| "grad_norm": 0.6228143572807312, | |
| "learning_rate": 0.0004819919632482766, | |
| "loss": 12.8152, | |
| "num_input_tokens_seen": 2411724800, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.06222011458590832, | |
| "grad_norm": 0.661567211151123, | |
| "learning_rate": 0.00048071514053788666, | |
| "loss": 12.8356, | |
| "num_input_tokens_seen": 2424832000, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.06255643952961594, | |
| "grad_norm": 0.6318378448486328, | |
| "learning_rate": 0.00047943315914736475, | |
| "loss": 12.831, | |
| "num_input_tokens_seen": 2437939200, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.06289276447332355, | |
| "grad_norm": 0.6098783612251282, | |
| "learning_rate": 0.0004781460556720504, | |
| "loss": 12.8363, | |
| "num_input_tokens_seen": 2451046400, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.06322908941703116, | |
| "grad_norm": 0.643997073173523, | |
| "learning_rate": 0.00047685386685349796, | |
| "loss": 12.8267, | |
| "num_input_tokens_seen": 2464153600, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.06356541436073877, | |
| "grad_norm": 0.6287397146224976, | |
| "learning_rate": 0.000475556629578427, | |
| "loss": 12.8131, | |
| "num_input_tokens_seen": 2477260800, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.06390173930444638, | |
| "grad_norm": 0.6625655889511108, | |
| "learning_rate": 0.0004742543808776708, | |
| "loss": 12.8312, | |
| "num_input_tokens_seen": 2490368000, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.06390173930444638, | |
| "eval_loss": 3.1130659580230713, | |
| "eval_runtime": 143.1847, | |
| "eval_samples_per_second": 34.92, | |
| "eval_steps_per_second": 8.73, | |
| "num_input_tokens_seen": 2490368000, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.064238064248154, | |
| "grad_norm": 0.6380253434181213, | |
| "learning_rate": 0.0004729471579251177, | |
| "loss": 12.8645, | |
| "num_input_tokens_seen": 2503475200, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.0645743891918616, | |
| "grad_norm": 0.6287338137626648, | |
| "learning_rate": 0.00047163499803665085, | |
| "loss": 12.7931, | |
| "num_input_tokens_seen": 2516582400, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.06491071413556922, | |
| "grad_norm": 0.6729796528816223, | |
| "learning_rate": 0.00047031793866908294, | |
| "loss": 12.7903, | |
| "num_input_tokens_seen": 2529689600, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.06524703907927683, | |
| "grad_norm": 0.6398154497146606, | |
| "learning_rate": 0.0004689960174190865, | |
| "loss": 12.7746, | |
| "num_input_tokens_seen": 2542796800, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.06558336402298445, | |
| "grad_norm": 0.6751012206077576, | |
| "learning_rate": 0.00046766927202212145, | |
| "loss": 12.7655, | |
| "num_input_tokens_seen": 2555904000, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.06591968896669206, | |
| "grad_norm": 0.6046076416969299, | |
| "learning_rate": 0.0004663377403513568, | |
| "loss": 12.8018, | |
| "num_input_tokens_seen": 2569011200, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.06625601391039967, | |
| "grad_norm": 0.6526479721069336, | |
| "learning_rate": 0.0004650014604165907, | |
| "loss": 12.7394, | |
| "num_input_tokens_seen": 2582118400, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.06659233885410729, | |
| "grad_norm": 0.6187541484832764, | |
| "learning_rate": 0.00046366047036316456, | |
| "loss": 12.7346, | |
| "num_input_tokens_seen": 2595225600, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.0669286637978149, | |
| "grad_norm": 0.6106886267662048, | |
| "learning_rate": 0.0004623148084708745, | |
| "loss": 12.7597, | |
| "num_input_tokens_seen": 2608332800, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.06726498874152251, | |
| "grad_norm": 0.6358317136764526, | |
| "learning_rate": 0.0004609645131528788, | |
| "loss": 12.7303, | |
| "num_input_tokens_seen": 2621440000, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.06726498874152251, | |
| "eval_loss": 3.1026949882507324, | |
| "eval_runtime": 142.2834, | |
| "eval_samples_per_second": 35.141, | |
| "eval_steps_per_second": 8.785, | |
| "num_input_tokens_seen": 2621440000, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.06760131368523012, | |
| "grad_norm": 0.6313095688819885, | |
| "learning_rate": 0.0004596096229546009, | |
| "loss": 12.7336, | |
| "num_input_tokens_seen": 2634547200, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.06793763862893773, | |
| "grad_norm": 0.6490457057952881, | |
| "learning_rate": 0.00045825017655262934, | |
| "loss": 12.7727, | |
| "num_input_tokens_seen": 2647654400, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.06827396357264534, | |
| "grad_norm": 0.6609966158866882, | |
| "learning_rate": 0.000456886212753614, | |
| "loss": 12.759, | |
| "num_input_tokens_seen": 2660761600, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.06861028851635297, | |
| "grad_norm": 0.6392827033996582, | |
| "learning_rate": 0.00045551777049315757, | |
| "loss": 12.7189, | |
| "num_input_tokens_seen": 2673868800, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.06894661346006058, | |
| "grad_norm": 0.6272814273834229, | |
| "learning_rate": 0.0004541448888347047, | |
| "loss": 12.6948, | |
| "num_input_tokens_seen": 2686976000, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.06928293840376819, | |
| "grad_norm": 0.6286495327949524, | |
| "learning_rate": 0.00045276760696842693, | |
| "loss": 12.7224, | |
| "num_input_tokens_seen": 2700083200, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.0696192633474758, | |
| "grad_norm": 0.6213704943656921, | |
| "learning_rate": 0.00045138596421010374, | |
| "loss": 12.778, | |
| "num_input_tokens_seen": 2713190400, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.06995558829118341, | |
| "grad_norm": 0.6061195731163025, | |
| "learning_rate": 0.00045, | |
| "loss": 12.7403, | |
| "num_input_tokens_seen": 2726297600, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.07029191323489102, | |
| "grad_norm": 0.6419244408607483, | |
| "learning_rate": 0.0004486097539017407, | |
| "loss": 12.7137, | |
| "num_input_tokens_seen": 2739404800, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.07062823817859863, | |
| "grad_norm": 0.6618810892105103, | |
| "learning_rate": 0.00044721526560118134, | |
| "loss": 12.6896, | |
| "num_input_tokens_seen": 2752512000, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.07062823817859863, | |
| "eval_loss": 3.0883917808532715, | |
| "eval_runtime": 142.4547, | |
| "eval_samples_per_second": 35.099, | |
| "eval_steps_per_second": 8.775, | |
| "num_input_tokens_seen": 2752512000, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.07096456312230624, | |
| "grad_norm": 0.6856646537780762, | |
| "learning_rate": 0.00044581657490527473, | |
| "loss": 12.6825, | |
| "num_input_tokens_seen": 2765619200, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.07130088806601385, | |
| "grad_norm": 0.6331352591514587, | |
| "learning_rate": 0.00044441372174093487, | |
| "loss": 12.675, | |
| "num_input_tokens_seen": 2778726400, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.07163721300972148, | |
| "grad_norm": 0.6496602296829224, | |
| "learning_rate": 0.0004430067461538976, | |
| "loss": 12.6842, | |
| "num_input_tokens_seen": 2791833600, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.07197353795342909, | |
| "grad_norm": 0.6698866486549377, | |
| "learning_rate": 0.00044159568830757687, | |
| "loss": 12.6498, | |
| "num_input_tokens_seen": 2804940800, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.0723098628971367, | |
| "grad_norm": 0.6556456089019775, | |
| "learning_rate": 0.00044018058848191855, | |
| "loss": 12.7073, | |
| "num_input_tokens_seen": 2818048000, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.07264618784084431, | |
| "grad_norm": 0.6554015278816223, | |
| "learning_rate": 0.0004387614870722506, | |
| "loss": 12.6515, | |
| "num_input_tokens_seen": 2831155200, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.07298251278455192, | |
| "grad_norm": 0.6356109380722046, | |
| "learning_rate": 0.0004373384245881296, | |
| "loss": 12.6759, | |
| "num_input_tokens_seen": 2844262400, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.07331883772825953, | |
| "grad_norm": 0.6429396271705627, | |
| "learning_rate": 0.0004359114416521851, | |
| "loss": 12.6469, | |
| "num_input_tokens_seen": 2857369600, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.07365516267196714, | |
| "grad_norm": 0.6229676604270935, | |
| "learning_rate": 0.0004344805789989591, | |
| "loss": 12.6783, | |
| "num_input_tokens_seen": 2870476800, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.07399148761567476, | |
| "grad_norm": 0.6383066177368164, | |
| "learning_rate": 0.000433045877473744, | |
| "loss": 12.6273, | |
| "num_input_tokens_seen": 2883584000, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.07399148761567476, | |
| "eval_loss": 3.076796054840088, | |
| "eval_runtime": 143.2443, | |
| "eval_samples_per_second": 34.905, | |
| "eval_steps_per_second": 8.726, | |
| "num_input_tokens_seen": 2883584000, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.07432781255938237, | |
| "grad_norm": 0.612218976020813, | |
| "learning_rate": 0.0004316073780314163, | |
| "loss": 12.6729, | |
| "num_input_tokens_seen": 2896691200, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.07466413750308999, | |
| "grad_norm": 0.6343071460723877, | |
| "learning_rate": 0.00043016512173526736, | |
| "loss": 12.6507, | |
| "num_input_tokens_seen": 2909798400, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.0750004624467976, | |
| "grad_norm": 0.6494725942611694, | |
| "learning_rate": 0.0004287191497558317, | |
| "loss": 12.6271, | |
| "num_input_tokens_seen": 2922905600, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.07533678739050521, | |
| "grad_norm": 0.6436727046966553, | |
| "learning_rate": 0.0004272695033697111, | |
| "loss": 12.6529, | |
| "num_input_tokens_seen": 2936012800, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.07567311233421282, | |
| "grad_norm": 0.6481876373291016, | |
| "learning_rate": 0.00042581622395839705, | |
| "loss": 12.6528, | |
| "num_input_tokens_seen": 2949120000, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.07600943727792044, | |
| "grad_norm": 0.6492651104927063, | |
| "learning_rate": 0.0004243593530070886, | |
| "loss": 12.6312, | |
| "num_input_tokens_seen": 2962227200, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.07634576222162805, | |
| "grad_norm": 0.6570179462432861, | |
| "learning_rate": 0.00042289893210350907, | |
| "loss": 12.6428, | |
| "num_input_tokens_seen": 2975334400, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.07668208716533566, | |
| "grad_norm": 0.6505069732666016, | |
| "learning_rate": 0.0004214350029367181, | |
| "loss": 12.6549, | |
| "num_input_tokens_seen": 2988441600, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.07701841210904327, | |
| "grad_norm": 0.6301828026771545, | |
| "learning_rate": 0.0004199676072959222, | |
| "loss": 12.5838, | |
| "num_input_tokens_seen": 3001548800, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.07735473705275088, | |
| "grad_norm": 0.625487208366394, | |
| "learning_rate": 0.0004184967870692816, | |
| "loss": 12.6166, | |
| "num_input_tokens_seen": 3014656000, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.07735473705275088, | |
| "eval_loss": 3.0652644634246826, | |
| "eval_runtime": 142.6807, | |
| "eval_samples_per_second": 35.043, | |
| "eval_steps_per_second": 8.761, | |
| "num_input_tokens_seen": 3014656000, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.0776910619964585, | |
| "grad_norm": 0.6678441762924194, | |
| "learning_rate": 0.000417022584242714, | |
| "loss": 12.6271, | |
| "num_input_tokens_seen": 3027763200, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.07802738694016612, | |
| "grad_norm": 0.6448168754577637, | |
| "learning_rate": 0.00041554504089869716, | |
| "loss": 12.6012, | |
| "num_input_tokens_seen": 3040870400, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.07836371188387373, | |
| "grad_norm": 0.6791290640830994, | |
| "learning_rate": 0.0004140641992150667, | |
| "loss": 12.5798, | |
| "num_input_tokens_seen": 3053977600, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.07870003682758134, | |
| "grad_norm": 0.8807069659233093, | |
| "learning_rate": 0.00041258010146381224, | |
| "loss": 12.6015, | |
| "num_input_tokens_seen": 3067084800, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.07903636177128895, | |
| "grad_norm": 0.6284939646720886, | |
| "learning_rate": 0.00041109279000987105, | |
| "loss": 12.6183, | |
| "num_input_tokens_seen": 3080192000, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.07937268671499656, | |
| "grad_norm": 0.6453195810317993, | |
| "learning_rate": 0.0004096023073099185, | |
| "loss": 12.6, | |
| "num_input_tokens_seen": 3093299200, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.07970901165870417, | |
| "grad_norm": 0.6511227488517761, | |
| "learning_rate": 0.00040810869591115603, | |
| "loss": 12.5952, | |
| "num_input_tokens_seen": 3106406400, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.08004533660241178, | |
| "grad_norm": 0.6701833009719849, | |
| "learning_rate": 0.0004066119984500966, | |
| "loss": 12.5674, | |
| "num_input_tokens_seen": 3119513600, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.08038166154611939, | |
| "grad_norm": 0.6320140957832336, | |
| "learning_rate": 0.0004051122576513479, | |
| "loss": 12.5772, | |
| "num_input_tokens_seen": 3132620800, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.08071798648982702, | |
| "grad_norm": 0.6579756736755371, | |
| "learning_rate": 0.00040360951632639226, | |
| "loss": 12.57, | |
| "num_input_tokens_seen": 3145728000, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.08071798648982702, | |
| "eval_loss": 3.0548510551452637, | |
| "eval_runtime": 142.805, | |
| "eval_samples_per_second": 35.013, | |
| "eval_steps_per_second": 8.753, | |
| "num_input_tokens_seen": 3145728000, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.08105431143353463, | |
| "grad_norm": 0.6717228293418884, | |
| "learning_rate": 0.0004021038173723649, | |
| "loss": 12.5689, | |
| "num_input_tokens_seen": 3158835200, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.08139063637724224, | |
| "grad_norm": 0.6350929141044617, | |
| "learning_rate": 0.0004005952037708293, | |
| "loss": 12.5709, | |
| "num_input_tokens_seen": 3171942400, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.08172696132094985, | |
| "grad_norm": 0.6500872373580933, | |
| "learning_rate": 0.00039908371858655013, | |
| "loss": 12.576, | |
| "num_input_tokens_seen": 3185049600, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.08206328626465746, | |
| "grad_norm": 0.6404949426651001, | |
| "learning_rate": 0.00039756940496626415, | |
| "loss": 12.5173, | |
| "num_input_tokens_seen": 3198156800, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.08239961120836507, | |
| "grad_norm": 0.6140453219413757, | |
| "learning_rate": 0.0003960523061374484, | |
| "loss": 12.5427, | |
| "num_input_tokens_seen": 3211264000, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.08273593615207268, | |
| "grad_norm": 0.6440966725349426, | |
| "learning_rate": 0.00039453246540708625, | |
| "loss": 12.5706, | |
| "num_input_tokens_seen": 3224371200, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.0830722610957803, | |
| "grad_norm": 0.6301671862602234, | |
| "learning_rate": 0.00039300992616043105, | |
| "loss": 12.5483, | |
| "num_input_tokens_seen": 3237478400, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.0834085860394879, | |
| "grad_norm": 0.628695547580719, | |
| "learning_rate": 0.00039148473185976815, | |
| "loss": 12.5334, | |
| "num_input_tokens_seen": 3250585600, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.08374491098319553, | |
| "grad_norm": 0.6627179980278015, | |
| "learning_rate": 0.0003899569260431734, | |
| "loss": 12.565, | |
| "num_input_tokens_seen": 3263692800, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.08408123592690314, | |
| "grad_norm": 0.6234163045883179, | |
| "learning_rate": 0.00038842655232327125, | |
| "loss": 12.5742, | |
| "num_input_tokens_seen": 3276800000, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.08408123592690314, | |
| "eval_loss": 3.0441489219665527, | |
| "eval_runtime": 141.8038, | |
| "eval_samples_per_second": 35.26, | |
| "eval_steps_per_second": 8.815, | |
| "num_input_tokens_seen": 3276800000, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.08441756087061075, | |
| "grad_norm": 0.6204286217689514, | |
| "learning_rate": 0.0003868936543859888, | |
| "loss": 12.5493, | |
| "num_input_tokens_seen": 3289907200, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.08475388581431836, | |
| "grad_norm": 0.6237512230873108, | |
| "learning_rate": 0.00038535827598930967, | |
| "loss": 12.5179, | |
| "num_input_tokens_seen": 3303014400, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.08509021075802597, | |
| "grad_norm": 0.6418094635009766, | |
| "learning_rate": 0.00038382046096202435, | |
| "loss": 12.5096, | |
| "num_input_tokens_seen": 3316121600, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.08542653570173359, | |
| "grad_norm": 0.6306421160697937, | |
| "learning_rate": 0.0003822802532024791, | |
| "loss": 12.5202, | |
| "num_input_tokens_seen": 3329228800, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.0857628606454412, | |
| "grad_norm": 0.6436113715171814, | |
| "learning_rate": 0.000380737696677323, | |
| "loss": 12.4871, | |
| "num_input_tokens_seen": 3342336000, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.08609918558914881, | |
| "grad_norm": 1.0079458951950073, | |
| "learning_rate": 0.00037919283542025287, | |
| "loss": 12.4992, | |
| "num_input_tokens_seen": 3355443200, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.08643551053285642, | |
| "grad_norm": 0.6185023188591003, | |
| "learning_rate": 0.0003776457135307562, | |
| "loss": 12.4876, | |
| "num_input_tokens_seen": 3368550400, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.08677183547656404, | |
| "grad_norm": 0.6664910912513733, | |
| "learning_rate": 0.0003760963751728521, | |
| "loss": 12.4876, | |
| "num_input_tokens_seen": 3381657600, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.08710816042027165, | |
| "grad_norm": 0.6112196445465088, | |
| "learning_rate": 0.00037454486457383124, | |
| "loss": 12.4972, | |
| "num_input_tokens_seen": 3394764800, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.08744448536397927, | |
| "grad_norm": 0.6308513879776001, | |
| "learning_rate": 0.00037299122602299257, | |
| "loss": 12.4583, | |
| "num_input_tokens_seen": 3407872000, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.08744448536397927, | |
| "eval_loss": 3.034710645675659, | |
| "eval_runtime": 182.8724, | |
| "eval_samples_per_second": 27.341, | |
| "eval_steps_per_second": 6.835, | |
| "num_input_tokens_seen": 3407872000, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.08778081030768688, | |
| "grad_norm": 0.6481872200965881, | |
| "learning_rate": 0.00037143550387037943, | |
| "loss": 12.4646, | |
| "num_input_tokens_seen": 3420979200, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.08811713525139449, | |
| "grad_norm": 0.6672606468200684, | |
| "learning_rate": 0.0003698777425255136, | |
| "loss": 12.4237, | |
| "num_input_tokens_seen": 3434086400, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.0884534601951021, | |
| "grad_norm": 0.6188272833824158, | |
| "learning_rate": 0.00036831798645612735, | |
| "loss": 12.4983, | |
| "num_input_tokens_seen": 3447193600, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.08878978513880971, | |
| "grad_norm": 0.6584819555282593, | |
| "learning_rate": 0.0003667562801868943, | |
| "loss": 12.4316, | |
| "num_input_tokens_seen": 3460300800, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.08912611008251732, | |
| "grad_norm": 0.6392587423324585, | |
| "learning_rate": 0.0003651926682981584, | |
| "loss": 12.4541, | |
| "num_input_tokens_seen": 3473408000, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.08946243502622493, | |
| "grad_norm": 0.6473196148872375, | |
| "learning_rate": 0.00036362719542466104, | |
| "loss": 12.4921, | |
| "num_input_tokens_seen": 3486515200, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.08979875996993256, | |
| "grad_norm": 0.6527711153030396, | |
| "learning_rate": 0.00036205990625426724, | |
| "loss": 12.4578, | |
| "num_input_tokens_seen": 3499622400, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.09013508491364017, | |
| "grad_norm": 0.6588818430900574, | |
| "learning_rate": 0.00036049084552669, | |
| "loss": 12.4449, | |
| "num_input_tokens_seen": 3512729600, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.09047140985734778, | |
| "grad_norm": 0.6333611011505127, | |
| "learning_rate": 0.00035892005803221286, | |
| "loss": 12.4364, | |
| "num_input_tokens_seen": 3525836800, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.09080773480105539, | |
| "grad_norm": 0.6385447978973389, | |
| "learning_rate": 0.0003573475886104117, | |
| "loss": 12.4483, | |
| "num_input_tokens_seen": 3538944000, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.09080773480105539, | |
| "eval_loss": 3.0267038345336914, | |
| "eval_runtime": 143.0402, | |
| "eval_samples_per_second": 34.955, | |
| "eval_steps_per_second": 8.739, | |
| "num_input_tokens_seen": 3538944000, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.091144059744763, | |
| "grad_norm": 0.652103066444397, | |
| "learning_rate": 0.0003557734821488744, | |
| "loss": 12.3973, | |
| "num_input_tokens_seen": 3552051200, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.09148038468847061, | |
| "grad_norm": 0.629550576210022, | |
| "learning_rate": 0.00035419778358191967, | |
| "loss": 12.4529, | |
| "num_input_tokens_seen": 3565158400, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.09181670963217822, | |
| "grad_norm": 0.646165132522583, | |
| "learning_rate": 0.00035262053788931446, | |
| "loss": 12.4602, | |
| "num_input_tokens_seen": 3578265600, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.09215303457588583, | |
| "grad_norm": 0.6328135132789612, | |
| "learning_rate": 0.0003510417900949898, | |
| "loss": 12.4859, | |
| "num_input_tokens_seen": 3591372800, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.09248935951959344, | |
| "grad_norm": 0.6435760259628296, | |
| "learning_rate": 0.0003494615852657555, | |
| "loss": 12.4747, | |
| "num_input_tokens_seen": 3604480000, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.09282568446330107, | |
| "grad_norm": 0.6149182915687561, | |
| "learning_rate": 0.0003478799685100137, | |
| "loss": 12.4353, | |
| "num_input_tokens_seen": 3617587200, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.09316200940700868, | |
| "grad_norm": 0.6365089416503906, | |
| "learning_rate": 0.00034629698497647176, | |
| "loss": 12.4255, | |
| "num_input_tokens_seen": 3630694400, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.09349833435071629, | |
| "grad_norm": 0.6469732522964478, | |
| "learning_rate": 0.0003447126798528523, | |
| "loss": 12.4259, | |
| "num_input_tokens_seen": 3643801600, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.0938346592944239, | |
| "grad_norm": 0.6317386031150818, | |
| "learning_rate": 0.00034312709836460453, | |
| "loss": 12.4626, | |
| "num_input_tokens_seen": 3656908800, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 0.09417098423813151, | |
| "grad_norm": 0.6267306208610535, | |
| "learning_rate": 0.00034154028577361217, | |
| "loss": 12.3991, | |
| "num_input_tokens_seen": 3670016000, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.09417098423813151, | |
| "eval_loss": 3.016310691833496, | |
| "eval_runtime": 142.725, | |
| "eval_samples_per_second": 35.032, | |
| "eval_steps_per_second": 8.758, | |
| "num_input_tokens_seen": 3670016000, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.09450730918183912, | |
| "grad_norm": 0.6656507849693298, | |
| "learning_rate": 0.0003399522873769023, | |
| "loss": 12.4213, | |
| "num_input_tokens_seen": 3683123200, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 0.09484363412554674, | |
| "grad_norm": 0.6371810436248779, | |
| "learning_rate": 0.0003383631485053518, | |
| "loss": 12.4092, | |
| "num_input_tokens_seen": 3696230400, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.09517995906925435, | |
| "grad_norm": 0.6278609037399292, | |
| "learning_rate": 0.0003367729145223933, | |
| "loss": 12.3764, | |
| "num_input_tokens_seen": 3709337600, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 0.09551628401296196, | |
| "grad_norm": 0.6190541982650757, | |
| "learning_rate": 0.00033518163082272055, | |
| "loss": 12.4095, | |
| "num_input_tokens_seen": 3722444800, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.09585260895666958, | |
| "grad_norm": 0.6580514907836914, | |
| "learning_rate": 0.00033358934283099235, | |
| "loss": 12.3431, | |
| "num_input_tokens_seen": 3735552000, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 0.0961889339003772, | |
| "grad_norm": 0.6620698571205139, | |
| "learning_rate": 0.000331996096000536, | |
| "loss": 12.3971, | |
| "num_input_tokens_seen": 3748659200, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.0965252588440848, | |
| "grad_norm": 0.61739182472229, | |
| "learning_rate": 0.00033040193581204973, | |
| "loss": 12.3897, | |
| "num_input_tokens_seen": 3761766400, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 0.09686158378779242, | |
| "grad_norm": 0.6852706670761108, | |
| "learning_rate": 0.0003288069077723045, | |
| "loss": 12.4072, | |
| "num_input_tokens_seen": 3774873600, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.09719790873150003, | |
| "grad_norm": 0.6366174817085266, | |
| "learning_rate": 0.00032721105741284466, | |
| "loss": 12.3834, | |
| "num_input_tokens_seen": 3787980800, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 0.09753423367520764, | |
| "grad_norm": 0.685984194278717, | |
| "learning_rate": 0.0003256144302886885, | |
| "loss": 12.4215, | |
| "num_input_tokens_seen": 3801088000, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.09753423367520764, | |
| "eval_loss": 3.0072007179260254, | |
| "eval_runtime": 142.0382, | |
| "eval_samples_per_second": 35.202, | |
| "eval_steps_per_second": 8.8, | |
| "num_input_tokens_seen": 3801088000, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.09787055861891525, | |
| "grad_norm": 0.633934736251831, | |
| "learning_rate": 0.000324017071977028, | |
| "loss": 12.3848, | |
| "num_input_tokens_seen": 3814195200, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 0.09820688356262286, | |
| "grad_norm": 0.6223523020744324, | |
| "learning_rate": 0.0003224190280759273, | |
| "loss": 12.389, | |
| "num_input_tokens_seen": 3827302400, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.09854320850633047, | |
| "grad_norm": 0.6419284343719482, | |
| "learning_rate": 0.00032082034420302137, | |
| "loss": 12.3622, | |
| "num_input_tokens_seen": 3840409600, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 0.0988795334500381, | |
| "grad_norm": 0.6162405610084534, | |
| "learning_rate": 0.0003192210659942139, | |
| "loss": 12.4409, | |
| "num_input_tokens_seen": 3853516800, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.0992158583937457, | |
| "grad_norm": 0.6561248898506165, | |
| "learning_rate": 0.0003176212391023743, | |
| "loss": 12.4152, | |
| "num_input_tokens_seen": 3866624000, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 0.09955218333745332, | |
| "grad_norm": 0.6575373411178589, | |
| "learning_rate": 0.0003160209091960347, | |
| "loss": 12.3603, | |
| "num_input_tokens_seen": 3879731200, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.09988850828116093, | |
| "grad_norm": 0.6060482859611511, | |
| "learning_rate": 0.0003144201219580862, | |
| "loss": 12.3752, | |
| "num_input_tokens_seen": 3892838400, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 0.10022483322486854, | |
| "grad_norm": 0.6433590650558472, | |
| "learning_rate": 0.000312818923084475, | |
| "loss": 12.3568, | |
| "num_input_tokens_seen": 3905945600, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.10056115816857615, | |
| "grad_norm": 0.626518189907074, | |
| "learning_rate": 0.00031121735828289773, | |
| "loss": 12.3327, | |
| "num_input_tokens_seen": 3919052800, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 0.10089748311228376, | |
| "grad_norm": 0.6467755436897278, | |
| "learning_rate": 0.0003096154732714966, | |
| "loss": 12.367, | |
| "num_input_tokens_seen": 3932160000, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.10089748311228376, | |
| "eval_loss": 2.9978182315826416, | |
| "eval_runtime": 143.1987, | |
| "eval_samples_per_second": 34.917, | |
| "eval_steps_per_second": 8.729, | |
| "num_input_tokens_seen": 3932160000, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.10123380805599137, | |
| "grad_norm": 0.6437516808509827, | |
| "learning_rate": 0.00030801331377755466, | |
| "loss": 12.3776, | |
| "num_input_tokens_seen": 3945267200, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 0.10157013299969898, | |
| "grad_norm": 0.6743655204772949, | |
| "learning_rate": 0.0003064109255361904, | |
| "loss": 12.326, | |
| "num_input_tokens_seen": 3958374400, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.1019064579434066, | |
| "grad_norm": 0.6296969056129456, | |
| "learning_rate": 0.00030480835428905214, | |
| "loss": 12.3444, | |
| "num_input_tokens_seen": 3971481600, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 0.10224278288711422, | |
| "grad_norm": 0.648457407951355, | |
| "learning_rate": 0.000303205645783012, | |
| "loss": 12.3422, | |
| "num_input_tokens_seen": 3984588800, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.10257910783082183, | |
| "grad_norm": 0.6306461691856384, | |
| "learning_rate": 0.0003016028457688604, | |
| "loss": 12.3452, | |
| "num_input_tokens_seen": 3997696000, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 0.10291543277452944, | |
| "grad_norm": 0.6481978893280029, | |
| "learning_rate": 0.0003, | |
| "loss": 12.3079, | |
| "num_input_tokens_seen": 4010803200, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.10325175771823705, | |
| "grad_norm": 0.7946459650993347, | |
| "learning_rate": 0.0002983971542311397, | |
| "loss": 12.3674, | |
| "num_input_tokens_seen": 4023910400, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 0.10358808266194466, | |
| "grad_norm": 0.6375327706336975, | |
| "learning_rate": 0.000296794354216988, | |
| "loss": 12.3125, | |
| "num_input_tokens_seen": 4037017600, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.10392440760565227, | |
| "grad_norm": 0.6338579058647156, | |
| "learning_rate": 0.0002951916457109479, | |
| "loss": 12.3305, | |
| "num_input_tokens_seen": 4050124800, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 0.10426073254935989, | |
| "grad_norm": 0.642365038394928, | |
| "learning_rate": 0.00029358907446380955, | |
| "loss": 12.3038, | |
| "num_input_tokens_seen": 4063232000, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.10426073254935989, | |
| "eval_loss": 2.9912989139556885, | |
| "eval_runtime": 142.6952, | |
| "eval_samples_per_second": 35.04, | |
| "eval_steps_per_second": 8.76, | |
| "num_input_tokens_seen": 4063232000, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.1045970574930675, | |
| "grad_norm": 0.6200032830238342, | |
| "learning_rate": 0.00029198668622244534, | |
| "loss": 12.3153, | |
| "num_input_tokens_seen": 4076339200, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 0.10493338243677511, | |
| "grad_norm": 0.6352826356887817, | |
| "learning_rate": 0.0002903845267285034, | |
| "loss": 12.3094, | |
| "num_input_tokens_seen": 4089446400, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.10526970738048273, | |
| "grad_norm": 0.6530657410621643, | |
| "learning_rate": 0.0002887826417171023, | |
| "loss": 12.3094, | |
| "num_input_tokens_seen": 4102553600, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 0.10560603232419034, | |
| "grad_norm": 0.6631893515586853, | |
| "learning_rate": 0.00028718107691552496, | |
| "loss": 12.2943, | |
| "num_input_tokens_seen": 4115660800, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.10594235726789795, | |
| "grad_norm": 0.6634914875030518, | |
| "learning_rate": 0.0002855798780419138, | |
| "loss": 12.2738, | |
| "num_input_tokens_seen": 4128768000, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 0.10627868221160557, | |
| "grad_norm": 0.6240889430046082, | |
| "learning_rate": 0.00028397909080396527, | |
| "loss": 12.3316, | |
| "num_input_tokens_seen": 4141875200, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.10661500715531318, | |
| "grad_norm": 0.6263941526412964, | |
| "learning_rate": 0.00028237876089762574, | |
| "loss": 12.2874, | |
| "num_input_tokens_seen": 4154982400, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 0.10695133209902079, | |
| "grad_norm": 0.629359245300293, | |
| "learning_rate": 0.00028077893400578615, | |
| "loss": 12.3043, | |
| "num_input_tokens_seen": 4168089600, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.1072876570427284, | |
| "grad_norm": 0.6163947582244873, | |
| "learning_rate": 0.0002791796557969787, | |
| "loss": 12.3009, | |
| "num_input_tokens_seen": 4181196800, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 0.10762398198643601, | |
| "grad_norm": 0.6394808888435364, | |
| "learning_rate": 0.0002775809719240727, | |
| "loss": 12.2584, | |
| "num_input_tokens_seen": 4194304000, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.10762398198643601, | |
| "eval_loss": 2.984212875366211, | |
| "eval_runtime": 142.4659, | |
| "eval_samples_per_second": 35.096, | |
| "eval_steps_per_second": 8.774, | |
| "num_input_tokens_seen": 4194304000, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.10796030693014362, | |
| "grad_norm": 0.6504441499710083, | |
| "learning_rate": 0.00027598292802297203, | |
| "loss": 12.301, | |
| "num_input_tokens_seen": 4207411200, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 0.10829663187385125, | |
| "grad_norm": 0.6610515117645264, | |
| "learning_rate": 0.00027438556971131137, | |
| "loss": 12.2809, | |
| "num_input_tokens_seen": 4220518400, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.10863295681755886, | |
| "grad_norm": 0.6400002837181091, | |
| "learning_rate": 0.00027278894258715535, | |
| "loss": 12.2821, | |
| "num_input_tokens_seen": 4233625600, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 0.10896928176126647, | |
| "grad_norm": 0.6517115831375122, | |
| "learning_rate": 0.00027119309222769546, | |
| "loss": 12.2722, | |
| "num_input_tokens_seen": 4246732800, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.10930560670497408, | |
| "grad_norm": 0.6387389898300171, | |
| "learning_rate": 0.0002695980641879502, | |
| "loss": 12.2715, | |
| "num_input_tokens_seen": 4259840000, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 0.10964193164868169, | |
| "grad_norm": 0.6440519094467163, | |
| "learning_rate": 0.0002680039039994639, | |
| "loss": 12.25, | |
| "num_input_tokens_seen": 4272947200, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.1099782565923893, | |
| "grad_norm": 0.6389286518096924, | |
| "learning_rate": 0.0002664106571690076, | |
| "loss": 12.2565, | |
| "num_input_tokens_seen": 4286054400, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 0.11031458153609691, | |
| "grad_norm": 0.6398110389709473, | |
| "learning_rate": 0.00026481836917727946, | |
| "loss": 12.2356, | |
| "num_input_tokens_seen": 4299161600, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.11065090647980452, | |
| "grad_norm": 0.6471937298774719, | |
| "learning_rate": 0.00026322708547760676, | |
| "loss": 12.269, | |
| "num_input_tokens_seen": 4312268800, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 0.11098723142351213, | |
| "grad_norm": 0.6105075478553772, | |
| "learning_rate": 0.00026163685149464816, | |
| "loss": 12.2762, | |
| "num_input_tokens_seen": 4325376000, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.11098723142351213, | |
| "eval_loss": 2.9754507541656494, | |
| "eval_runtime": 142.3479, | |
| "eval_samples_per_second": 35.125, | |
| "eval_steps_per_second": 8.781, | |
| "num_input_tokens_seen": 4325376000, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.11132355636721976, | |
| "grad_norm": 0.6111390590667725, | |
| "learning_rate": 0.00026004771262309764, | |
| "loss": 12.2253, | |
| "num_input_tokens_seen": 4338483200, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 0.11165988131092737, | |
| "grad_norm": 0.6871252059936523, | |
| "learning_rate": 0.0002584597142263877, | |
| "loss": 12.2595, | |
| "num_input_tokens_seen": 4351590400, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.11199620625463498, | |
| "grad_norm": 0.6608724594116211, | |
| "learning_rate": 0.00025687290163539547, | |
| "loss": 12.2838, | |
| "num_input_tokens_seen": 4364697600, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 0.11233253119834259, | |
| "grad_norm": 0.634148895740509, | |
| "learning_rate": 0.0002552873201471476, | |
| "loss": 12.2522, | |
| "num_input_tokens_seen": 4377804800, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.1126688561420502, | |
| "grad_norm": 0.6481145620346069, | |
| "learning_rate": 0.00025370301502352825, | |
| "loss": 12.2185, | |
| "num_input_tokens_seen": 4390912000, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 0.11300518108575781, | |
| "grad_norm": 0.6411675810813904, | |
| "learning_rate": 0.0002521200314899863, | |
| "loss": 12.2566, | |
| "num_input_tokens_seen": 4404019200, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.11334150602946542, | |
| "grad_norm": 0.6985258460044861, | |
| "learning_rate": 0.00025053841473424447, | |
| "loss": 12.3036, | |
| "num_input_tokens_seen": 4417126400, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 0.11367783097317304, | |
| "grad_norm": 0.6223846673965454, | |
| "learning_rate": 0.0002489582099050102, | |
| "loss": 12.1942, | |
| "num_input_tokens_seen": 4430233600, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.11401415591688065, | |
| "grad_norm": 0.637690007686615, | |
| "learning_rate": 0.00024737946211068554, | |
| "loss": 12.2711, | |
| "num_input_tokens_seen": 4443340800, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 0.11435048086058827, | |
| "grad_norm": 0.6184976696968079, | |
| "learning_rate": 0.00024580221641808033, | |
| "loss": 12.2252, | |
| "num_input_tokens_seen": 4456448000, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.11435048086058827, | |
| "eval_loss": 2.9684932231903076, | |
| "eval_runtime": 142.7182, | |
| "eval_samples_per_second": 35.034, | |
| "eval_steps_per_second": 8.759, | |
| "num_input_tokens_seen": 4456448000, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.11468680580429588, | |
| "grad_norm": 0.618519127368927, | |
| "learning_rate": 0.0002442265178511256, | |
| "loss": 12.2066, | |
| "num_input_tokens_seen": 4469555200, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 0.1150231307480035, | |
| "grad_norm": 0.6514145731925964, | |
| "learning_rate": 0.00024265241138958835, | |
| "loss": 12.2228, | |
| "num_input_tokens_seen": 4482662400, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.1153594556917111, | |
| "grad_norm": 0.6823457479476929, | |
| "learning_rate": 0.00024107994196778714, | |
| "loss": 12.2507, | |
| "num_input_tokens_seen": 4495769600, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 0.11569578063541872, | |
| "grad_norm": 0.6243106722831726, | |
| "learning_rate": 0.0002395091544733101, | |
| "loss": 12.1857, | |
| "num_input_tokens_seen": 4508876800, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.11603210557912633, | |
| "grad_norm": 0.6370251774787903, | |
| "learning_rate": 0.00023794009374573274, | |
| "loss": 12.2309, | |
| "num_input_tokens_seen": 4521984000, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 0.11636843052283394, | |
| "grad_norm": 0.6504274010658264, | |
| "learning_rate": 0.00023637280457533902, | |
| "loss": 12.2132, | |
| "num_input_tokens_seen": 4535091200, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.11670475546654155, | |
| "grad_norm": 0.6156638860702515, | |
| "learning_rate": 0.00023480733170184158, | |
| "loss": 12.199, | |
| "num_input_tokens_seen": 4548198400, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 0.11704108041024916, | |
| "grad_norm": 0.6292795538902283, | |
| "learning_rate": 0.0002332437198131057, | |
| "loss": 12.2122, | |
| "num_input_tokens_seen": 4561305600, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.11737740535395678, | |
| "grad_norm": 0.6368102431297302, | |
| "learning_rate": 0.00023168201354387266, | |
| "loss": 12.2453, | |
| "num_input_tokens_seen": 4574412800, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 0.1177137302976644, | |
| "grad_norm": 0.6373352408409119, | |
| "learning_rate": 0.00023012225747448645, | |
| "loss": 12.2031, | |
| "num_input_tokens_seen": 4587520000, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.1177137302976644, | |
| "eval_loss": 2.961634635925293, | |
| "eval_runtime": 142.67, | |
| "eval_samples_per_second": 35.046, | |
| "eval_steps_per_second": 8.761, | |
| "num_input_tokens_seen": 4587520000, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.118050055241372, | |
| "grad_norm": 0.6781473755836487, | |
| "learning_rate": 0.0002285644961296205, | |
| "loss": 12.1939, | |
| "num_input_tokens_seen": 4600627200, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 0.11838638018507962, | |
| "grad_norm": 0.6434431076049805, | |
| "learning_rate": 0.0002270087739770074, | |
| "loss": 12.1876, | |
| "num_input_tokens_seen": 4613734400, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.11872270512878723, | |
| "grad_norm": 0.6625823974609375, | |
| "learning_rate": 0.00022545513542616865, | |
| "loss": 12.1683, | |
| "num_input_tokens_seen": 4626841600, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 0.11905903007249484, | |
| "grad_norm": 0.6367326974868774, | |
| "learning_rate": 0.0002239036248271478, | |
| "loss": 12.1769, | |
| "num_input_tokens_seen": 4639948800, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.11939535501620245, | |
| "grad_norm": 0.648065447807312, | |
| "learning_rate": 0.00022235428646924372, | |
| "loss": 12.2213, | |
| "num_input_tokens_seen": 4653056000, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 0.11973167995991006, | |
| "grad_norm": 0.648695170879364, | |
| "learning_rate": 0.00022080716457974705, | |
| "loss": 12.1699, | |
| "num_input_tokens_seen": 4666163200, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.12006800490361767, | |
| "grad_norm": 0.6311103105545044, | |
| "learning_rate": 0.00021926230332267694, | |
| "loss": 12.1912, | |
| "num_input_tokens_seen": 4679270400, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 0.1204043298473253, | |
| "grad_norm": 0.6318332552909851, | |
| "learning_rate": 0.00021771974679752094, | |
| "loss": 12.1242, | |
| "num_input_tokens_seen": 4692377600, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.12074065479103291, | |
| "grad_norm": 0.6513566374778748, | |
| "learning_rate": 0.0002161795390379756, | |
| "loss": 12.2068, | |
| "num_input_tokens_seen": 4705484800, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 0.12107697973474052, | |
| "grad_norm": 0.6865115761756897, | |
| "learning_rate": 0.00021464172401069027, | |
| "loss": 12.1477, | |
| "num_input_tokens_seen": 4718592000, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.12107697973474052, | |
| "eval_loss": 2.954843044281006, | |
| "eval_runtime": 142.8465, | |
| "eval_samples_per_second": 35.003, | |
| "eval_steps_per_second": 8.751, | |
| "num_input_tokens_seen": 4718592000, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.12141330467844813, | |
| "grad_norm": 0.622513473033905, | |
| "learning_rate": 0.00021310634561401109, | |
| "loss": 12.1664, | |
| "num_input_tokens_seen": 4731699200, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 0.12174962962215574, | |
| "grad_norm": 0.6387473344802856, | |
| "learning_rate": 0.0002115734476767287, | |
| "loss": 12.1838, | |
| "num_input_tokens_seen": 4744806400, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.12208595456586335, | |
| "grad_norm": 0.6974210739135742, | |
| "learning_rate": 0.00021004307395682648, | |
| "loss": 12.201, | |
| "num_input_tokens_seen": 4757913600, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 0.12242227950957096, | |
| "grad_norm": 0.6665675640106201, | |
| "learning_rate": 0.00020851526814023185, | |
| "loss": 12.1154, | |
| "num_input_tokens_seen": 4771020800, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.12275860445327857, | |
| "grad_norm": 0.6340165734291077, | |
| "learning_rate": 0.00020699007383956895, | |
| "loss": 12.19, | |
| "num_input_tokens_seen": 4784128000, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 0.12309492939698619, | |
| "grad_norm": 0.6360442638397217, | |
| "learning_rate": 0.00020546753459291378, | |
| "loss": 12.1872, | |
| "num_input_tokens_seen": 4797235200, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.12343125434069381, | |
| "grad_norm": 0.6116852760314941, | |
| "learning_rate": 0.00020394769386255162, | |
| "loss": 12.1645, | |
| "num_input_tokens_seen": 4810342400, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 0.12376757928440142, | |
| "grad_norm": 0.6432573795318604, | |
| "learning_rate": 0.00020243059503373588, | |
| "loss": 12.1537, | |
| "num_input_tokens_seen": 4823449600, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.12410390422810903, | |
| "grad_norm": 0.6480187773704529, | |
| "learning_rate": 0.00020091628141344996, | |
| "loss": 12.155, | |
| "num_input_tokens_seen": 4836556800, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 0.12444022917181664, | |
| "grad_norm": 0.643993616104126, | |
| "learning_rate": 0.00019940479622917068, | |
| "loss": 12.1604, | |
| "num_input_tokens_seen": 4849664000, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.12444022917181664, | |
| "eval_loss": 2.9489145278930664, | |
| "eval_runtime": 142.7319, | |
| "eval_samples_per_second": 35.031, | |
| "eval_steps_per_second": 8.758, | |
| "num_input_tokens_seen": 4849664000, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.12477655411552425, | |
| "grad_norm": 0.6500803232192993, | |
| "learning_rate": 0.00019789618262763508, | |
| "loss": 12.1604, | |
| "num_input_tokens_seen": 4862771200, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 0.12511287905923188, | |
| "grad_norm": 0.6314743161201477, | |
| "learning_rate": 0.00019639048367360774, | |
| "loss": 12.1107, | |
| "num_input_tokens_seen": 4875878400, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.1254492040029395, | |
| "grad_norm": 0.6902073621749878, | |
| "learning_rate": 0.00019488774234865217, | |
| "loss": 12.1634, | |
| "num_input_tokens_seen": 4888985600, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 0.1257855289466471, | |
| "grad_norm": 0.6349673867225647, | |
| "learning_rate": 0.00019338800154990337, | |
| "loss": 12.1828, | |
| "num_input_tokens_seen": 4902092800, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.1261218538903547, | |
| "grad_norm": 0.639392614364624, | |
| "learning_rate": 0.000191891304088844, | |
| "loss": 12.1314, | |
| "num_input_tokens_seen": 4915200000, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 0.12645817883406232, | |
| "grad_norm": 0.6466573476791382, | |
| "learning_rate": 0.00019039769269008148, | |
| "loss": 12.1521, | |
| "num_input_tokens_seen": 4928307200, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.12679450377776993, | |
| "grad_norm": 0.6457189917564392, | |
| "learning_rate": 0.00018890720999012895, | |
| "loss": 12.1631, | |
| "num_input_tokens_seen": 4941414400, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 0.12713082872147755, | |
| "grad_norm": 0.648733377456665, | |
| "learning_rate": 0.00018741989853618779, | |
| "loss": 12.1553, | |
| "num_input_tokens_seen": 4954521600, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.12746715366518516, | |
| "grad_norm": 0.6314489841461182, | |
| "learning_rate": 0.00018593580078493335, | |
| "loss": 12.1703, | |
| "num_input_tokens_seen": 4967628800, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 0.12780347860889277, | |
| "grad_norm": 0.6238834857940674, | |
| "learning_rate": 0.0001844549591013027, | |
| "loss": 12.0931, | |
| "num_input_tokens_seen": 4980736000, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.12780347860889277, | |
| "eval_loss": 2.943131923675537, | |
| "eval_runtime": 142.8812, | |
| "eval_samples_per_second": 34.994, | |
| "eval_steps_per_second": 8.749, | |
| "num_input_tokens_seen": 4980736000, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.12813980355260038, | |
| "grad_norm": 0.6120157241821289, | |
| "learning_rate": 0.00018297741575728593, | |
| "loss": 12.1415, | |
| "num_input_tokens_seen": 4993843200, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 0.128476128496308, | |
| "grad_norm": 0.6346642374992371, | |
| "learning_rate": 0.00018150321293071843, | |
| "loss": 12.1464, | |
| "num_input_tokens_seen": 5006950400, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.1288124534400156, | |
| "grad_norm": 0.6268289685249329, | |
| "learning_rate": 0.00018003239270407775, | |
| "loss": 12.1105, | |
| "num_input_tokens_seen": 5020057600, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 0.1291487783837232, | |
| "grad_norm": 0.6437589526176453, | |
| "learning_rate": 0.00017856499706328183, | |
| "loss": 12.1208, | |
| "num_input_tokens_seen": 5033164800, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.12948510332743082, | |
| "grad_norm": 0.6311147809028625, | |
| "learning_rate": 0.00017710106789649096, | |
| "loss": 12.1137, | |
| "num_input_tokens_seen": 5046272000, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 0.12982142827113843, | |
| "grad_norm": 0.646539568901062, | |
| "learning_rate": 0.00017564064699291133, | |
| "loss": 12.1824, | |
| "num_input_tokens_seen": 5059379200, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.13015775321484604, | |
| "grad_norm": 0.6385849714279175, | |
| "learning_rate": 0.00017418377604160295, | |
| "loss": 12.1106, | |
| "num_input_tokens_seen": 5072486400, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 0.13049407815855366, | |
| "grad_norm": 0.6449156403541565, | |
| "learning_rate": 0.0001727304966302887, | |
| "loss": 12.0996, | |
| "num_input_tokens_seen": 5085593600, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.1308304031022613, | |
| "grad_norm": 0.6219010949134827, | |
| "learning_rate": 0.0001712808502441682, | |
| "loss": 12.1306, | |
| "num_input_tokens_seen": 5098700800, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 0.1311667280459689, | |
| "grad_norm": 0.6273418664932251, | |
| "learning_rate": 0.00016983487826473256, | |
| "loss": 12.0719, | |
| "num_input_tokens_seen": 5111808000, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.1311667280459689, | |
| "eval_loss": 2.937514066696167, | |
| "eval_runtime": 142.6532, | |
| "eval_samples_per_second": 35.05, | |
| "eval_steps_per_second": 8.763, | |
| "num_input_tokens_seen": 5111808000, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.13150305298967652, | |
| "grad_norm": 0.6366037130355835, | |
| "learning_rate": 0.00016839262196858374, | |
| "loss": 12.1143, | |
| "num_input_tokens_seen": 5124915200, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 0.13183937793338413, | |
| "grad_norm": 0.6395111083984375, | |
| "learning_rate": 0.00016695412252625596, | |
| "loss": 12.0524, | |
| "num_input_tokens_seen": 5138022400, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.13217570287709174, | |
| "grad_norm": 0.639202892780304, | |
| "learning_rate": 0.0001655194210010409, | |
| "loss": 12.1006, | |
| "num_input_tokens_seen": 5151129600, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 0.13251202782079935, | |
| "grad_norm": 0.6547548174858093, | |
| "learning_rate": 0.00016408855834781487, | |
| "loss": 12.0684, | |
| "num_input_tokens_seen": 5164236800, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.13284835276450696, | |
| "grad_norm": 0.6669015884399414, | |
| "learning_rate": 0.00016266157541187034, | |
| "loss": 12.1204, | |
| "num_input_tokens_seen": 5177344000, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 0.13318467770821457, | |
| "grad_norm": 0.637744665145874, | |
| "learning_rate": 0.00016123851292774947, | |
| "loss": 12.1164, | |
| "num_input_tokens_seen": 5190451200, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.13352100265192218, | |
| "grad_norm": 0.6337763667106628, | |
| "learning_rate": 0.00015981941151808137, | |
| "loss": 12.1213, | |
| "num_input_tokens_seen": 5203558400, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 0.1338573275956298, | |
| "grad_norm": 0.651337742805481, | |
| "learning_rate": 0.0001584043116924231, | |
| "loss": 12.1115, | |
| "num_input_tokens_seen": 5216665600, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.1341936525393374, | |
| "grad_norm": 0.6313726902008057, | |
| "learning_rate": 0.00015699325384610244, | |
| "loss": 12.1078, | |
| "num_input_tokens_seen": 5229772800, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 0.13452997748304502, | |
| "grad_norm": 0.6925057768821716, | |
| "learning_rate": 0.00015558627825906524, | |
| "loss": 12.0672, | |
| "num_input_tokens_seen": 5242880000, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.13452997748304502, | |
| "eval_loss": 2.931644916534424, | |
| "eval_runtime": 143.0863, | |
| "eval_samples_per_second": 34.944, | |
| "eval_steps_per_second": 8.736, | |
| "num_input_tokens_seen": 5242880000, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.13486630242675263, | |
| "grad_norm": 0.6415194272994995, | |
| "learning_rate": 0.00015418342509472535, | |
| "loss": 12.1005, | |
| "num_input_tokens_seen": 5255987200, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 0.13520262737046024, | |
| "grad_norm": 0.6401641368865967, | |
| "learning_rate": 0.00015278473439881874, | |
| "loss": 12.0935, | |
| "num_input_tokens_seen": 5269094400, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.13553895231416785, | |
| "grad_norm": 0.6700222492218018, | |
| "learning_rate": 0.0001513902460982592, | |
| "loss": 12.0946, | |
| "num_input_tokens_seen": 5282201600, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 0.13587527725787546, | |
| "grad_norm": 0.6184066534042358, | |
| "learning_rate": 0.00015000000000000004, | |
| "loss": 12.058, | |
| "num_input_tokens_seen": 5295308800, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.13621160220158307, | |
| "grad_norm": 0.6642903685569763, | |
| "learning_rate": 0.00014861403578989629, | |
| "loss": 12.0421, | |
| "num_input_tokens_seen": 5308416000, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 0.13654792714529068, | |
| "grad_norm": 0.651897668838501, | |
| "learning_rate": 0.00014723239303157307, | |
| "loss": 12.0393, | |
| "num_input_tokens_seen": 5321523200, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.1368842520889983, | |
| "grad_norm": 0.616648256778717, | |
| "learning_rate": 0.00014585511116529528, | |
| "loss": 12.0737, | |
| "num_input_tokens_seen": 5334630400, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 0.13722057703270593, | |
| "grad_norm": 0.6298686861991882, | |
| "learning_rate": 0.00014448222950684246, | |
| "loss": 12.0721, | |
| "num_input_tokens_seen": 5347737600, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.13755690197641354, | |
| "grad_norm": 0.6637253165245056, | |
| "learning_rate": 0.00014311378724638605, | |
| "loss": 12.0921, | |
| "num_input_tokens_seen": 5360844800, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 0.13789322692012115, | |
| "grad_norm": 0.6153833866119385, | |
| "learning_rate": 0.0001417498234473706, | |
| "loss": 12.0664, | |
| "num_input_tokens_seen": 5373952000, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.13789322692012115, | |
| "eval_loss": 2.9268288612365723, | |
| "eval_runtime": 143.0059, | |
| "eval_samples_per_second": 34.964, | |
| "eval_steps_per_second": 8.741, | |
| "num_input_tokens_seen": 5373952000, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.13822955186382876, | |
| "grad_norm": 0.6536301374435425, | |
| "learning_rate": 0.00014039037704539906, | |
| "loss": 12.0644, | |
| "num_input_tokens_seen": 5387059200, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 0.13856587680753638, | |
| "grad_norm": 0.678833544254303, | |
| "learning_rate": 0.00013903548684712116, | |
| "loss": 12.0616, | |
| "num_input_tokens_seen": 5400166400, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.138902201751244, | |
| "grad_norm": 0.6597055792808533, | |
| "learning_rate": 0.00013768519152912537, | |
| "loss": 12.0914, | |
| "num_input_tokens_seen": 5413273600, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 0.1392385266949516, | |
| "grad_norm": 0.6703686714172363, | |
| "learning_rate": 0.00013633952963683542, | |
| "loss": 12.0582, | |
| "num_input_tokens_seen": 5426380800, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.1395748516386592, | |
| "grad_norm": 0.6616584062576294, | |
| "learning_rate": 0.00013499853958340923, | |
| "loss": 12.105, | |
| "num_input_tokens_seen": 5439488000, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 0.13991117658236682, | |
| "grad_norm": 0.6584370136260986, | |
| "learning_rate": 0.00013366225964864313, | |
| "loss": 12.0616, | |
| "num_input_tokens_seen": 5452595200, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.14024750152607443, | |
| "grad_norm": 0.6238560676574707, | |
| "learning_rate": 0.00013233072797787847, | |
| "loss": 12.074, | |
| "num_input_tokens_seen": 5465702400, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 0.14058382646978204, | |
| "grad_norm": 0.6119787096977234, | |
| "learning_rate": 0.00013100398258091337, | |
| "loss": 12.0441, | |
| "num_input_tokens_seen": 5478809600, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.14092015141348965, | |
| "grad_norm": 0.6162968873977661, | |
| "learning_rate": 0.00012968206133091707, | |
| "loss": 12.0726, | |
| "num_input_tokens_seen": 5491916800, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 0.14125647635719726, | |
| "grad_norm": 0.6324203014373779, | |
| "learning_rate": 0.00012836500196334916, | |
| "loss": 12.029, | |
| "num_input_tokens_seen": 5505024000, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.14125647635719726, | |
| "eval_loss": 2.9219326972961426, | |
| "eval_runtime": 142.444, | |
| "eval_samples_per_second": 35.102, | |
| "eval_steps_per_second": 8.775, | |
| "num_input_tokens_seen": 5505024000, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.14159280130090487, | |
| "grad_norm": 0.6728281378746033, | |
| "learning_rate": 0.0001270528420748823, | |
| "loss": 12.0576, | |
| "num_input_tokens_seen": 5518131200, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 0.14192912624461249, | |
| "grad_norm": 0.6371399164199829, | |
| "learning_rate": 0.0001257456191223292, | |
| "loss": 12.0809, | |
| "num_input_tokens_seen": 5531238400, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.1422654511883201, | |
| "grad_norm": 0.6416388750076294, | |
| "learning_rate": 0.00012444337042157285, | |
| "loss": 12.0472, | |
| "num_input_tokens_seen": 5544345600, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 0.1426017761320277, | |
| "grad_norm": 0.672295093536377, | |
| "learning_rate": 0.00012314613314650207, | |
| "loss": 12.0615, | |
| "num_input_tokens_seen": 5557452800, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.14293810107573532, | |
| "grad_norm": 0.6460967063903809, | |
| "learning_rate": 0.00012185394432794955, | |
| "loss": 12.0439, | |
| "num_input_tokens_seen": 5570560000, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 0.14327442601944296, | |
| "grad_norm": 0.6483781337738037, | |
| "learning_rate": 0.0001205668408526352, | |
| "loss": 12.0767, | |
| "num_input_tokens_seen": 5583667200, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.14361075096315057, | |
| "grad_norm": 0.6515306830406189, | |
| "learning_rate": 0.00011928485946211334, | |
| "loss": 12.0398, | |
| "num_input_tokens_seen": 5596774400, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 0.14394707590685818, | |
| "grad_norm": 0.6355323791503906, | |
| "learning_rate": 0.00011800803675172337, | |
| "loss": 12.0792, | |
| "num_input_tokens_seen": 5609881600, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.1442834008505658, | |
| "grad_norm": 0.6724342107772827, | |
| "learning_rate": 0.00011673640916954571, | |
| "loss": 12.0238, | |
| "num_input_tokens_seen": 5622988800, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 0.1446197257942734, | |
| "grad_norm": 0.6570594310760498, | |
| "learning_rate": 0.00011547001301536085, | |
| "loss": 12.0514, | |
| "num_input_tokens_seen": 5636096000, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.1446197257942734, | |
| "eval_loss": 2.91679310798645, | |
| "eval_runtime": 142.6116, | |
| "eval_samples_per_second": 35.06, | |
| "eval_steps_per_second": 8.765, | |
| "num_input_tokens_seen": 5636096000, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.144956050737981, | |
| "grad_norm": 0.6420052647590637, | |
| "learning_rate": 0.00011420888443961337, | |
| "loss": 12.02, | |
| "num_input_tokens_seen": 5649203200, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 0.14529237568168862, | |
| "grad_norm": 0.6295548677444458, | |
| "learning_rate": 0.00011295305944237995, | |
| "loss": 12.0275, | |
| "num_input_tokens_seen": 5662310400, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.14562870062539623, | |
| "grad_norm": 0.6434178352355957, | |
| "learning_rate": 0.00011170257387234198, | |
| "loss": 12.0421, | |
| "num_input_tokens_seen": 5675417600, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 0.14596502556910385, | |
| "grad_norm": 0.6139717102050781, | |
| "learning_rate": 0.0001104574634257616, | |
| "loss": 12.0342, | |
| "num_input_tokens_seen": 5688524800, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.14630135051281146, | |
| "grad_norm": 0.6519197225570679, | |
| "learning_rate": 0.00010921776364546347, | |
| "loss": 12.0328, | |
| "num_input_tokens_seen": 5701632000, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 0.14663767545651907, | |
| "grad_norm": 0.6653149724006653, | |
| "learning_rate": 0.00010798350991981948, | |
| "loss": 12.0151, | |
| "num_input_tokens_seen": 5714739200, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.14697400040022668, | |
| "grad_norm": 0.6633841395378113, | |
| "learning_rate": 0.0001067547374817392, | |
| "loss": 11.9882, | |
| "num_input_tokens_seen": 5727846400, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 0.1473103253439343, | |
| "grad_norm": 0.6165183186531067, | |
| "learning_rate": 0.00010553148140766353, | |
| "loss": 12.0242, | |
| "num_input_tokens_seen": 5740953600, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.1476466502876419, | |
| "grad_norm": 0.6443773508071899, | |
| "learning_rate": 0.00010431377661656374, | |
| "loss": 12.0166, | |
| "num_input_tokens_seen": 5754060800, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 0.1479829752313495, | |
| "grad_norm": 0.6805723905563354, | |
| "learning_rate": 0.00010310165786894456, | |
| "loss": 12.0284, | |
| "num_input_tokens_seen": 5767168000, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.1479829752313495, | |
| "eval_loss": 2.9135851860046387, | |
| "eval_runtime": 143.3087, | |
| "eval_samples_per_second": 34.89, | |
| "eval_steps_per_second": 8.722, | |
| "num_input_tokens_seen": 5767168000, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.14831930017505712, | |
| "grad_norm": 0.6574228405952454, | |
| "learning_rate": 0.00010189515976585224, | |
| "loss": 11.9953, | |
| "num_input_tokens_seen": 5780275200, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 0.14865562511876473, | |
| "grad_norm": 0.630247175693512, | |
| "learning_rate": 0.00010069431674788618, | |
| "loss": 12.0309, | |
| "num_input_tokens_seen": 5793382400, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.14899195006247234, | |
| "grad_norm": 0.6254024505615234, | |
| "learning_rate": 9.949916309421655e-05, | |
| "loss": 11.9972, | |
| "num_input_tokens_seen": 5806489600, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 0.14932827500617998, | |
| "grad_norm": 0.6164761781692505, | |
| "learning_rate": 9.830973292160493e-05, | |
| "loss": 12.0382, | |
| "num_input_tokens_seen": 5819596800, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.1496645999498876, | |
| "grad_norm": 0.6174560189247131, | |
| "learning_rate": 9.712606018343136e-05, | |
| "loss": 11.981, | |
| "num_input_tokens_seen": 5832704000, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 0.1500009248935952, | |
| "grad_norm": 0.6346741914749146, | |
| "learning_rate": 9.594817866872411e-05, | |
| "loss": 12.0161, | |
| "num_input_tokens_seen": 5845811200, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.15033724983730282, | |
| "grad_norm": 0.6521451473236084, | |
| "learning_rate": 9.477612200119616e-05, | |
| "loss": 12.0022, | |
| "num_input_tokens_seen": 5858918400, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 0.15067357478101043, | |
| "grad_norm": 0.6211933493614197, | |
| "learning_rate": 9.360992363828442e-05, | |
| "loss": 12.0695, | |
| "num_input_tokens_seen": 5872025600, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.15100989972471804, | |
| "grad_norm": 0.6488197445869446, | |
| "learning_rate": 9.244961687019529e-05, | |
| "loss": 12.0477, | |
| "num_input_tokens_seen": 5885132800, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 0.15134622466842565, | |
| "grad_norm": 0.6073492169380188, | |
| "learning_rate": 9.129523481895408e-05, | |
| "loss": 11.9863, | |
| "num_input_tokens_seen": 5898240000, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.15134622466842565, | |
| "eval_loss": 2.9087352752685547, | |
| "eval_runtime": 141.7599, | |
| "eval_samples_per_second": 35.271, | |
| "eval_steps_per_second": 8.818, | |
| "num_input_tokens_seen": 5898240000, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.15168254961213326, | |
| "grad_norm": 0.6158032417297363, | |
| "learning_rate": 9.014681043745983e-05, | |
| "loss": 12.0428, | |
| "num_input_tokens_seen": 5911347200, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 0.15201887455584087, | |
| "grad_norm": 0.6146510243415833, | |
| "learning_rate": 8.900437650854409e-05, | |
| "loss": 12.0035, | |
| "num_input_tokens_seen": 5924454400, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.15235519949954848, | |
| "grad_norm": 0.7664083242416382, | |
| "learning_rate": 8.786796564403575e-05, | |
| "loss": 12.0481, | |
| "num_input_tokens_seen": 5937561600, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 0.1526915244432561, | |
| "grad_norm": 0.6329470872879028, | |
| "learning_rate": 8.673761028382955e-05, | |
| "loss": 11.9683, | |
| "num_input_tokens_seen": 5950668800, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.1530278493869637, | |
| "grad_norm": 0.617677628993988, | |
| "learning_rate": 8.561334269496019e-05, | |
| "loss": 11.9993, | |
| "num_input_tokens_seen": 5963776000, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 0.15336417433067132, | |
| "grad_norm": 0.6368398070335388, | |
| "learning_rate": 8.449519497068174e-05, | |
| "loss": 11.9881, | |
| "num_input_tokens_seen": 5976883200, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.15370049927437893, | |
| "grad_norm": 0.6332319974899292, | |
| "learning_rate": 8.338319902955062e-05, | |
| "loss": 12.0005, | |
| "num_input_tokens_seen": 5989990400, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 0.15403682421808654, | |
| "grad_norm": 0.6333373188972473, | |
| "learning_rate": 8.227738661451541e-05, | |
| "loss": 12.0081, | |
| "num_input_tokens_seen": 6003097600, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.15437314916179415, | |
| "grad_norm": 0.6376117467880249, | |
| "learning_rate": 8.117778929200977e-05, | |
| "loss": 11.9789, | |
| "num_input_tokens_seen": 6016204800, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 0.15470947410550176, | |
| "grad_norm": 0.6416700482368469, | |
| "learning_rate": 8.008443845105216e-05, | |
| "loss": 11.9845, | |
| "num_input_tokens_seen": 6029312000, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.15470947410550176, | |
| "eval_loss": 2.9053738117218018, | |
| "eval_runtime": 142.1312, | |
| "eval_samples_per_second": 35.179, | |
| "eval_steps_per_second": 8.795, | |
| "num_input_tokens_seen": 6029312000, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.15504579904920937, | |
| "grad_norm": 0.6468757390975952, | |
| "learning_rate": 7.899736530234923e-05, | |
| "loss": 11.991, | |
| "num_input_tokens_seen": 6042419200, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 0.155382123992917, | |
| "grad_norm": 0.6658541560173035, | |
| "learning_rate": 7.791660087740537e-05, | |
| "loss": 11.9583, | |
| "num_input_tokens_seen": 6055526400, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.15571844893662462, | |
| "grad_norm": 0.6665578484535217, | |
| "learning_rate": 7.68421760276364e-05, | |
| "loss": 12.0004, | |
| "num_input_tokens_seen": 6068633600, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 0.15605477388033223, | |
| "grad_norm": 0.6088104844093323, | |
| "learning_rate": 7.577412142348944e-05, | |
| "loss": 11.9758, | |
| "num_input_tokens_seen": 6081740800, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.15639109882403984, | |
| "grad_norm": 0.6299030184745789, | |
| "learning_rate": 7.47124675535666e-05, | |
| "loss": 12.036, | |
| "num_input_tokens_seen": 6094848000, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 0.15672742376774745, | |
| "grad_norm": 0.642490565776825, | |
| "learning_rate": 7.365724472375568e-05, | |
| "loss": 11.9951, | |
| "num_input_tokens_seen": 6107955200, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.15706374871145506, | |
| "grad_norm": 0.6375728845596313, | |
| "learning_rate": 7.260848305636405e-05, | |
| "loss": 11.9859, | |
| "num_input_tokens_seen": 6121062400, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 0.15740007365516268, | |
| "grad_norm": 0.6122708320617676, | |
| "learning_rate": 7.156621248925967e-05, | |
| "loss": 11.9532, | |
| "num_input_tokens_seen": 6134169600, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.1577363985988703, | |
| "grad_norm": 0.6512198448181152, | |
| "learning_rate": 7.05304627750157e-05, | |
| "loss": 11.9962, | |
| "num_input_tokens_seen": 6147276800, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 0.1580727235425779, | |
| "grad_norm": 0.6488016247749329, | |
| "learning_rate": 6.950126348006171e-05, | |
| "loss": 11.9285, | |
| "num_input_tokens_seen": 6160384000, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.1580727235425779, | |
| "eval_loss": 2.9018726348876953, | |
| "eval_runtime": 143.4685, | |
| "eval_samples_per_second": 34.851, | |
| "eval_steps_per_second": 8.713, | |
| "num_input_tokens_seen": 6160384000, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.1584090484862855, | |
| "grad_norm": 0.6251162886619568, | |
| "learning_rate": 6.847864398383946e-05, | |
| "loss": 11.9805, | |
| "num_input_tokens_seen": 6173491200, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 0.15874537342999312, | |
| "grad_norm": 0.6338608264923096, | |
| "learning_rate": 6.746263347796449e-05, | |
| "loss": 11.9775, | |
| "num_input_tokens_seen": 6186598400, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.15908169837370073, | |
| "grad_norm": 0.6400789618492126, | |
| "learning_rate": 6.645326096539229e-05, | |
| "loss": 11.9472, | |
| "num_input_tokens_seen": 6199705600, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 0.15941802331740834, | |
| "grad_norm": 0.6252830624580383, | |
| "learning_rate": 6.545055525959105e-05, | |
| "loss": 11.9752, | |
| "num_input_tokens_seen": 6212812800, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.15975434826111595, | |
| "grad_norm": 0.6790284514427185, | |
| "learning_rate": 6.445454498371857e-05, | |
| "loss": 11.957, | |
| "num_input_tokens_seen": 6225920000, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 0.16009067320482356, | |
| "grad_norm": 0.621303379535675, | |
| "learning_rate": 6.346525856980567e-05, | |
| "loss": 11.9433, | |
| "num_input_tokens_seen": 6239027200, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.16042699814853117, | |
| "grad_norm": 0.6379457712173462, | |
| "learning_rate": 6.248272425794411e-05, | |
| "loss": 11.9516, | |
| "num_input_tokens_seen": 6252134400, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 0.16076332309223879, | |
| "grad_norm": 0.6223682761192322, | |
| "learning_rate": 6.150697009548073e-05, | |
| "loss": 11.9856, | |
| "num_input_tokens_seen": 6265241600, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.1610996480359464, | |
| "grad_norm": 0.6040588021278381, | |
| "learning_rate": 6.0538023936216814e-05, | |
| "loss": 11.9921, | |
| "num_input_tokens_seen": 6278348800, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 0.16143597297965404, | |
| "grad_norm": 0.6390047669410706, | |
| "learning_rate": 5.957591343961304e-05, | |
| "loss": 11.9322, | |
| "num_input_tokens_seen": 6291456000, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.16143597297965404, | |
| "eval_loss": 2.8987817764282227, | |
| "eval_runtime": 142.5945, | |
| "eval_samples_per_second": 35.064, | |
| "eval_steps_per_second": 8.766, | |
| "num_input_tokens_seen": 6291456000, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.16177229792336165, | |
| "grad_norm": 0.6663207411766052, | |
| "learning_rate": 5.862066606999949e-05, | |
| "loss": 11.9835, | |
| "num_input_tokens_seen": 6304563200, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 0.16210862286706926, | |
| "grad_norm": 0.6109934449195862, | |
| "learning_rate": 5.7672309095792316e-05, | |
| "loss": 11.933, | |
| "num_input_tokens_seen": 6317670400, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.16244494781077687, | |
| "grad_norm": 0.6243853569030762, | |
| "learning_rate": 5.6730869588714744e-05, | |
| "loss": 12.0097, | |
| "num_input_tokens_seen": 6330777600, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 0.16278127275448448, | |
| "grad_norm": 0.6165538430213928, | |
| "learning_rate": 5.579637442302454e-05, | |
| "loss": 11.9705, | |
| "num_input_tokens_seen": 6343884800, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.1631175976981921, | |
| "grad_norm": 0.5966577529907227, | |
| "learning_rate": 5.4868850274747045e-05, | |
| "loss": 11.9362, | |
| "num_input_tokens_seen": 6356992000, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 0.1634539226418997, | |
| "grad_norm": 0.6405600309371948, | |
| "learning_rate": 5.39483236209132e-05, | |
| "loss": 11.987, | |
| "num_input_tokens_seen": 6370099200, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.1637902475856073, | |
| "grad_norm": 0.6266763210296631, | |
| "learning_rate": 5.303482073880436e-05, | |
| "loss": 11.9779, | |
| "num_input_tokens_seen": 6383206400, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 0.16412657252931492, | |
| "grad_norm": 0.6331851482391357, | |
| "learning_rate": 5.2128367705201594e-05, | |
| "loss": 11.921, | |
| "num_input_tokens_seen": 6396313600, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.16446289747302253, | |
| "grad_norm": 0.6594439744949341, | |
| "learning_rate": 5.122899039564157e-05, | |
| "loss": 11.9332, | |
| "num_input_tokens_seen": 6409420800, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 0.16479922241673015, | |
| "grad_norm": 0.6269896626472473, | |
| "learning_rate": 5.033671448367788e-05, | |
| "loss": 11.9627, | |
| "num_input_tokens_seen": 6422528000, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.16479922241673015, | |
| "eval_loss": 2.896472454071045, | |
| "eval_runtime": 143.2784, | |
| "eval_samples_per_second": 34.897, | |
| "eval_steps_per_second": 8.724, | |
| "num_input_tokens_seen": 6422528000, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.16513554736043776, | |
| "grad_norm": 0.6272408962249756, | |
| "learning_rate": 4.945156544014846e-05, | |
| "loss": 11.9879, | |
| "num_input_tokens_seen": 6435635200, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 0.16547187230414537, | |
| "grad_norm": 0.6145939826965332, | |
| "learning_rate": 4.8573568532447815e-05, | |
| "loss": 11.964, | |
| "num_input_tokens_seen": 6448742400, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.16580819724785298, | |
| "grad_norm": 0.6379438638687134, | |
| "learning_rate": 4.770274882380648e-05, | |
| "loss": 11.9384, | |
| "num_input_tokens_seen": 6461849600, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 0.1661445221915606, | |
| "grad_norm": 0.6524396538734436, | |
| "learning_rate": 4.6839131172574996e-05, | |
| "loss": 11.9477, | |
| "num_input_tokens_seen": 6474956800, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.1664808471352682, | |
| "grad_norm": 0.6290236711502075, | |
| "learning_rate": 4.598274023151476e-05, | |
| "loss": 11.9441, | |
| "num_input_tokens_seen": 6488064000, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 0.1668171720789758, | |
| "grad_norm": 0.6329859495162964, | |
| "learning_rate": 4.513360044709382e-05, | |
| "loss": 11.9686, | |
| "num_input_tokens_seen": 6501171200, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.16715349702268342, | |
| "grad_norm": 0.6199634671211243, | |
| "learning_rate": 4.429173605878951e-05, | |
| "loss": 11.907, | |
| "num_input_tokens_seen": 6514278400, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 0.16748982196639106, | |
| "grad_norm": 0.6326203346252441, | |
| "learning_rate": 4.3457171098396174e-05, | |
| "loss": 11.9358, | |
| "num_input_tokens_seen": 6527385600, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.16782614691009867, | |
| "grad_norm": 0.647875189781189, | |
| "learning_rate": 4.2629929389339246e-05, | |
| "loss": 11.9304, | |
| "num_input_tokens_seen": 6540492800, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 0.16816247185380628, | |
| "grad_norm": 0.6240447759628296, | |
| "learning_rate": 4.181003454599512e-05, | |
| "loss": 11.9144, | |
| "num_input_tokens_seen": 6553600000, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.16816247185380628, | |
| "eval_loss": 2.8938522338867188, | |
| "eval_runtime": 143.6338, | |
| "eval_samples_per_second": 34.811, | |
| "eval_steps_per_second": 8.703, | |
| "num_input_tokens_seen": 6553600000, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.1684987967975139, | |
| "grad_norm": 0.6343597173690796, | |
| "learning_rate": 4.099750997301747e-05, | |
| "loss": 11.9949, | |
| "num_input_tokens_seen": 6566707200, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 0.1688351217412215, | |
| "grad_norm": 0.626124918460846, | |
| "learning_rate": 4.019237886466838e-05, | |
| "loss": 11.9272, | |
| "num_input_tokens_seen": 6579814400, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.16917144668492912, | |
| "grad_norm": 0.6266665458679199, | |
| "learning_rate": 3.939466420415709e-05, | |
| "loss": 11.935, | |
| "num_input_tokens_seen": 6592921600, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 0.16950777162863673, | |
| "grad_norm": 0.6637131571769714, | |
| "learning_rate": 3.8604388762983175e-05, | |
| "loss": 11.9444, | |
| "num_input_tokens_seen": 6606028800, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.16984409657234434, | |
| "grad_norm": 0.6241376399993896, | |
| "learning_rate": 3.782157510028706e-05, | |
| "loss": 11.9235, | |
| "num_input_tokens_seen": 6619136000, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 0.17018042151605195, | |
| "grad_norm": 0.617912232875824, | |
| "learning_rate": 3.704624556220566e-05, | |
| "loss": 11.9165, | |
| "num_input_tokens_seen": 6632243200, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.17051674645975956, | |
| "grad_norm": 0.6167590022087097, | |
| "learning_rate": 3.627842228123483e-05, | |
| "loss": 11.9636, | |
| "num_input_tokens_seen": 6645350400, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 0.17085307140346717, | |
| "grad_norm": 0.6313674449920654, | |
| "learning_rate": 3.551812717559729e-05, | |
| "loss": 11.9304, | |
| "num_input_tokens_seen": 6658457600, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.17118939634717478, | |
| "grad_norm": 0.6143530607223511, | |
| "learning_rate": 3.47653819486171e-05, | |
| "loss": 11.9495, | |
| "num_input_tokens_seen": 6671564800, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 0.1715257212908824, | |
| "grad_norm": 0.6127185821533203, | |
| "learning_rate": 3.402020808809996e-05, | |
| "loss": 11.926, | |
| "num_input_tokens_seen": 6684672000, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.1715257212908824, | |
| "eval_loss": 2.8920793533325195, | |
| "eval_runtime": 142.7714, | |
| "eval_samples_per_second": 35.021, | |
| "eval_steps_per_second": 8.755, | |
| "num_input_tokens_seen": 6684672000, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.17186204623459, | |
| "grad_norm": 0.621300995349884, | |
| "learning_rate": 3.328262686572024e-05, | |
| "loss": 11.9852, | |
| "num_input_tokens_seen": 6697779200, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 0.17219837117829762, | |
| "grad_norm": 0.6242550015449524, | |
| "learning_rate": 3.2552659336413154e-05, | |
| "loss": 11.9132, | |
| "num_input_tokens_seen": 6710886400, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.17253469612200523, | |
| "grad_norm": 0.6490415930747986, | |
| "learning_rate": 3.1830326337774124e-05, | |
| "loss": 11.9529, | |
| "num_input_tokens_seen": 6723993600, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 0.17287102106571284, | |
| "grad_norm": 0.5997505187988281, | |
| "learning_rate": 3.111564848946403e-05, | |
| "loss": 11.948, | |
| "num_input_tokens_seen": 6737100800, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.17320734600942045, | |
| "grad_norm": 0.6490405797958374, | |
| "learning_rate": 3.040864619262011e-05, | |
| "loss": 11.9353, | |
| "num_input_tokens_seen": 6750208000, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 0.1735436709531281, | |
| "grad_norm": 0.6102951169013977, | |
| "learning_rate": 2.9709339629274285e-05, | |
| "loss": 11.97, | |
| "num_input_tokens_seen": 6763315200, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.1738799958968357, | |
| "grad_norm": 0.6121110916137695, | |
| "learning_rate": 2.9017748761776394e-05, | |
| "loss": 11.9342, | |
| "num_input_tokens_seen": 6776422400, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 0.1742163208405433, | |
| "grad_norm": 0.6192799806594849, | |
| "learning_rate": 2.8333893332224754e-05, | |
| "loss": 11.928, | |
| "num_input_tokens_seen": 6789529600, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.17455264578425092, | |
| "grad_norm": 0.6458452939987183, | |
| "learning_rate": 2.7657792861902393e-05, | |
| "loss": 11.9213, | |
| "num_input_tokens_seen": 6802636800, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 0.17488897072795853, | |
| "grad_norm": 0.6549943089485168, | |
| "learning_rate": 2.6989466650720048e-05, | |
| "loss": 11.9298, | |
| "num_input_tokens_seen": 6815744000, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.17488897072795853, | |
| "eval_loss": 2.890101671218872, | |
| "eval_runtime": 142.6241, | |
| "eval_samples_per_second": 35.057, | |
| "eval_steps_per_second": 8.764, | |
| "num_input_tokens_seen": 6815744000, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.17522529567166614, | |
| "grad_norm": 0.6231434941291809, | |
| "learning_rate": 2.6328933776664907e-05, | |
| "loss": 11.8924, | |
| "num_input_tokens_seen": 6828851200, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 0.17556162061537375, | |
| "grad_norm": 0.6445599794387817, | |
| "learning_rate": 2.567621309525628e-05, | |
| "loss": 11.9639, | |
| "num_input_tokens_seen": 6841958400, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.17589794555908136, | |
| "grad_norm": 0.6182544827461243, | |
| "learning_rate": 2.503132323900714e-05, | |
| "loss": 11.8955, | |
| "num_input_tokens_seen": 6855065600, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 0.17623427050278898, | |
| "grad_norm": 0.6308871507644653, | |
| "learning_rate": 2.439428261689249e-05, | |
| "loss": 11.898, | |
| "num_input_tokens_seen": 6868172800, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.1765705954464966, | |
| "grad_norm": 0.6257124543190002, | |
| "learning_rate": 2.376510941382351e-05, | |
| "loss": 11.9309, | |
| "num_input_tokens_seen": 6881280000, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 0.1769069203902042, | |
| "grad_norm": 0.6235978603363037, | |
| "learning_rate": 2.3143821590128896e-05, | |
| "loss": 11.9587, | |
| "num_input_tokens_seen": 6894387200, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.1772432453339118, | |
| "grad_norm": 0.6002153158187866, | |
| "learning_rate": 2.2530436881041725e-05, | |
| "loss": 11.9336, | |
| "num_input_tokens_seen": 6907494400, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 0.17757957027761942, | |
| "grad_norm": 0.6364301443099976, | |
| "learning_rate": 2.1924972796193506e-05, | |
| "loss": 11.9054, | |
| "num_input_tokens_seen": 6920601600, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.17791589522132703, | |
| "grad_norm": 0.6437053680419922, | |
| "learning_rate": 2.132744661911412e-05, | |
| "loss": 11.9355, | |
| "num_input_tokens_seen": 6933708800, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 0.17825222016503464, | |
| "grad_norm": 0.6307169795036316, | |
| "learning_rate": 2.073787540673876e-05, | |
| "loss": 11.9117, | |
| "num_input_tokens_seen": 6946816000, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.17825222016503464, | |
| "eval_loss": 2.888777732849121, | |
| "eval_runtime": 142.3367, | |
| "eval_samples_per_second": 35.128, | |
| "eval_steps_per_second": 8.782, | |
| "num_input_tokens_seen": 6946816000, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.17858854510874225, | |
| "grad_norm": 0.6065594553947449, | |
| "learning_rate": 2.0156275988920568e-05, | |
| "loss": 11.9054, | |
| "num_input_tokens_seen": 6959923200, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 0.17892487005244986, | |
| "grad_norm": 0.6257479190826416, | |
| "learning_rate": 1.958266496795069e-05, | |
| "loss": 11.8735, | |
| "num_input_tokens_seen": 6973030400, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.17926119499615747, | |
| "grad_norm": 0.6354475617408752, | |
| "learning_rate": 1.9017058718084012e-05, | |
| "loss": 11.9371, | |
| "num_input_tokens_seen": 6986137600, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 0.1795975199398651, | |
| "grad_norm": 0.6183739900588989, | |
| "learning_rate": 1.8459473385071865e-05, | |
| "loss": 11.9123, | |
| "num_input_tokens_seen": 6999244800, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.17993384488357272, | |
| "grad_norm": 0.6221346259117126, | |
| "learning_rate": 1.7909924885701145e-05, | |
| "loss": 11.9004, | |
| "num_input_tokens_seen": 7012352000, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 0.18027016982728034, | |
| "grad_norm": 0.607341468334198, | |
| "learning_rate": 1.7368428907339983e-05, | |
| "loss": 11.9286, | |
| "num_input_tokens_seen": 7025459200, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.18060649477098795, | |
| "grad_norm": 0.6302104592323303, | |
| "learning_rate": 1.6835000907489728e-05, | |
| "loss": 11.9551, | |
| "num_input_tokens_seen": 7038566400, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 0.18094281971469556, | |
| "grad_norm": 0.6029033064842224, | |
| "learning_rate": 1.6309656113344017e-05, | |
| "loss": 11.8979, | |
| "num_input_tokens_seen": 7051673600, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.18127914465840317, | |
| "grad_norm": 0.6170194149017334, | |
| "learning_rate": 1.5792409521353732e-05, | |
| "loss": 11.9503, | |
| "num_input_tokens_seen": 7064780800, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 0.18161546960211078, | |
| "grad_norm": 0.6190406084060669, | |
| "learning_rate": 1.5283275896799407e-05, | |
| "loss": 11.945, | |
| "num_input_tokens_seen": 7077888000, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.18161546960211078, | |
| "eval_loss": 2.88728404045105, | |
| "eval_runtime": 143.9093, | |
| "eval_samples_per_second": 34.744, | |
| "eval_steps_per_second": 8.686, | |
| "num_input_tokens_seen": 7077888000, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.1819517945458184, | |
| "grad_norm": 0.6403325796127319, | |
| "learning_rate": 1.478226977336916e-05, | |
| "loss": 11.8936, | |
| "num_input_tokens_seen": 7090995200, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 0.182288119489526, | |
| "grad_norm": 0.6248791813850403, | |
| "learning_rate": 1.428940545274433e-05, | |
| "loss": 11.9114, | |
| "num_input_tokens_seen": 7104102400, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.1826244444332336, | |
| "grad_norm": 0.6114192605018616, | |
| "learning_rate": 1.3804697004190869e-05, | |
| "loss": 11.9281, | |
| "num_input_tokens_seen": 7117209600, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 0.18296076937694122, | |
| "grad_norm": 0.6320353746414185, | |
| "learning_rate": 1.3328158264157762e-05, | |
| "loss": 11.9141, | |
| "num_input_tokens_seen": 7130316800, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.18329709432064883, | |
| "grad_norm": 0.6097228527069092, | |
| "learning_rate": 1.2859802835882416e-05, | |
| "loss": 11.8966, | |
| "num_input_tokens_seen": 7143424000, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 0.18363341926435645, | |
| "grad_norm": 0.6205602288246155, | |
| "learning_rate": 1.2399644089001825e-05, | |
| "loss": 11.9154, | |
| "num_input_tokens_seen": 7156531200, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.18396974420806406, | |
| "grad_norm": 0.6151401996612549, | |
| "learning_rate": 1.1947695159171256e-05, | |
| "loss": 11.8856, | |
| "num_input_tokens_seen": 7169638400, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 0.18430606915177167, | |
| "grad_norm": 0.6325812935829163, | |
| "learning_rate": 1.1503968947689135e-05, | |
| "loss": 11.9602, | |
| "num_input_tokens_seen": 7182745600, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.18464239409547928, | |
| "grad_norm": 0.6651480197906494, | |
| "learning_rate": 1.106847812112892e-05, | |
| "loss": 11.8962, | |
| "num_input_tokens_seen": 7195852800, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 0.1849787190391869, | |
| "grad_norm": 0.6203281283378601, | |
| "learning_rate": 1.0641235110977286e-05, | |
| "loss": 11.9267, | |
| "num_input_tokens_seen": 7208960000, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.1849787190391869, | |
| "eval_loss": 2.8867011070251465, | |
| "eval_runtime": 143.8431, | |
| "eval_samples_per_second": 34.76, | |
| "eval_steps_per_second": 8.69, | |
| "num_input_tokens_seen": 7208960000, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.1853150439828945, | |
| "grad_norm": 0.628180205821991, | |
| "learning_rate": 1.022225211327954e-05, | |
| "loss": 11.9684, | |
| "num_input_tokens_seen": 7222067200, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 0.18565136892660214, | |
| "grad_norm": 0.6240800023078918, | |
| "learning_rate": 9.811541088291163e-06, | |
| "loss": 11.9017, | |
| "num_input_tokens_seen": 7235174400, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.18598769387030975, | |
| "grad_norm": 0.6192197799682617, | |
| "learning_rate": 9.409113760136766e-06, | |
| "loss": 11.9137, | |
| "num_input_tokens_seen": 7248281600, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 0.18632401881401736, | |
| "grad_norm": 0.6189801096916199, | |
| "learning_rate": 9.014981616474937e-06, | |
| "loss": 11.9493, | |
| "num_input_tokens_seen": 7261388800, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.18666034375772497, | |
| "grad_norm": 0.6035293340682983, | |
| "learning_rate": 8.629155908170881e-06, | |
| "loss": 11.9083, | |
| "num_input_tokens_seen": 7274496000, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 0.18699666870143258, | |
| "grad_norm": 0.6316511034965515, | |
| "learning_rate": 8.25164764897468e-06, | |
| "loss": 11.9187, | |
| "num_input_tokens_seen": 7287603200, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.1873329936451402, | |
| "grad_norm": 0.6229190826416016, | |
| "learning_rate": 7.882467615207334e-06, | |
| "loss": 11.8842, | |
| "num_input_tokens_seen": 7300710400, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 0.1876693185888478, | |
| "grad_norm": 0.6222130656242371, | |
| "learning_rate": 7.521626345452914e-06, | |
| "loss": 11.9228, | |
| "num_input_tokens_seen": 7313817600, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.18800564353255542, | |
| "grad_norm": 0.6076390743255615, | |
| "learning_rate": 7.169134140257871e-06, | |
| "loss": 11.9038, | |
| "num_input_tokens_seen": 7326924800, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 0.18834196847626303, | |
| "grad_norm": 0.6207023859024048, | |
| "learning_rate": 6.825001061836799e-06, | |
| "loss": 11.9013, | |
| "num_input_tokens_seen": 7340032000, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.18834196847626303, | |
| "eval_loss": 2.885740280151367, | |
| "eval_runtime": 143.4494, | |
| "eval_samples_per_second": 34.855, | |
| "eval_steps_per_second": 8.714, | |
| "num_input_tokens_seen": 7340032000, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.18867829341997064, | |
| "grad_norm": 0.6160932779312134, | |
| "learning_rate": 6.4892369337854025e-06, | |
| "loss": 11.9279, | |
| "num_input_tokens_seen": 7353139200, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 0.18901461836367825, | |
| "grad_norm": 0.6192066669464111, | |
| "learning_rate": 6.161851340799984e-06, | |
| "loss": 11.8922, | |
| "num_input_tokens_seen": 7366246400, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 0.18935094330738586, | |
| "grad_norm": 0.6136648654937744, | |
| "learning_rate": 5.842853628403799e-06, | |
| "loss": 11.906, | |
| "num_input_tokens_seen": 7379353600, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 0.18968726825109347, | |
| "grad_norm": 0.621473491191864, | |
| "learning_rate": 5.532252902680367e-06, | |
| "loss": 11.8603, | |
| "num_input_tokens_seen": 7392460800, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.19002359319480108, | |
| "grad_norm": 0.6140876412391663, | |
| "learning_rate": 5.2300580300135175e-06, | |
| "loss": 11.8953, | |
| "num_input_tokens_seen": 7405568000, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 0.1903599181385087, | |
| "grad_norm": 0.6015214323997498, | |
| "learning_rate": 4.9362776368341846e-06, | |
| "loss": 11.8874, | |
| "num_input_tokens_seen": 7418675200, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 0.1906962430822163, | |
| "grad_norm": 0.6086856126785278, | |
| "learning_rate": 4.650920109374279e-06, | |
| "loss": 11.9015, | |
| "num_input_tokens_seen": 7431782400, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 0.19103256802592392, | |
| "grad_norm": 0.6232919692993164, | |
| "learning_rate": 4.373993593427238e-06, | |
| "loss": 11.9252, | |
| "num_input_tokens_seen": 7444889600, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.19136889296963153, | |
| "grad_norm": 0.6096498966217041, | |
| "learning_rate": 4.105505994115521e-06, | |
| "loss": 11.9018, | |
| "num_input_tokens_seen": 7457996800, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 0.19170521791333917, | |
| "grad_norm": 0.6317954659461975, | |
| "learning_rate": 3.845464975664947e-06, | |
| "loss": 11.9102, | |
| "num_input_tokens_seen": 7471104000, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.19170521791333917, | |
| "eval_loss": 2.8853116035461426, | |
| "eval_runtime": 143.468, | |
| "eval_samples_per_second": 34.851, | |
| "eval_steps_per_second": 8.713, | |
| "num_input_tokens_seen": 7471104000, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.19204154285704678, | |
| "grad_norm": 0.6087967753410339, | |
| "learning_rate": 3.5938779611859093e-06, | |
| "loss": 11.9431, | |
| "num_input_tokens_seen": 7484211200, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 0.1923778678007544, | |
| "grad_norm": 0.614473283290863, | |
| "learning_rate": 3.350752132461443e-06, | |
| "loss": 11.9548, | |
| "num_input_tokens_seen": 7497318400, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 0.192714192744462, | |
| "grad_norm": 0.6257823705673218, | |
| "learning_rate": 3.116094429742222e-06, | |
| "loss": 11.9179, | |
| "num_input_tokens_seen": 7510425600, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 0.1930505176881696, | |
| "grad_norm": 0.6351081728935242, | |
| "learning_rate": 2.889911551548585e-06, | |
| "loss": 11.9183, | |
| "num_input_tokens_seen": 7523532800, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 0.19338684263187722, | |
| "grad_norm": 0.6371856331825256, | |
| "learning_rate": 2.672209954479021e-06, | |
| "loss": 11.9169, | |
| "num_input_tokens_seen": 7536640000, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 0.19372316757558483, | |
| "grad_norm": 0.622117280960083, | |
| "learning_rate": 2.462995853026184e-06, | |
| "loss": 11.9404, | |
| "num_input_tokens_seen": 7549747200, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.19405949251929244, | |
| "grad_norm": 0.6010422110557556, | |
| "learning_rate": 2.2622752193992675e-06, | |
| "loss": 11.9441, | |
| "num_input_tokens_seen": 7562854400, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 0.19439581746300005, | |
| "grad_norm": 0.6092264652252197, | |
| "learning_rate": 2.0700537833536422e-06, | |
| "loss": 11.893, | |
| "num_input_tokens_seen": 7575961600, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 0.19473214240670766, | |
| "grad_norm": 0.6216610670089722, | |
| "learning_rate": 1.8863370320272187e-06, | |
| "loss": 11.9201, | |
| "num_input_tokens_seen": 7589068800, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 0.19506846735041528, | |
| "grad_norm": 0.615051805973053, | |
| "learning_rate": 1.7111302097839396e-06, | |
| "loss": 11.9402, | |
| "num_input_tokens_seen": 7602176000, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.19506846735041528, | |
| "eval_loss": 2.885068655014038, | |
| "eval_runtime": 142.9832, | |
| "eval_samples_per_second": 34.969, | |
| "eval_steps_per_second": 8.742, | |
| "num_input_tokens_seen": 7602176000, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.1954047922941229, | |
| "grad_norm": 0.6069262027740479, | |
| "learning_rate": 1.5444383180638342e-06, | |
| "loss": 11.9314, | |
| "num_input_tokens_seen": 7615283200, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 0.1957411172378305, | |
| "grad_norm": 0.628108561038971, | |
| "learning_rate": 1.3862661152405309e-06, | |
| "loss": 11.9151, | |
| "num_input_tokens_seen": 7628390400, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.1960774421815381, | |
| "grad_norm": 0.6232333779335022, | |
| "learning_rate": 1.236618116485233e-06, | |
| "loss": 11.8887, | |
| "num_input_tokens_seen": 7641497600, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 0.19641376712524572, | |
| "grad_norm": 0.6372972726821899, | |
| "learning_rate": 1.0954985936379223e-06, | |
| "loss": 11.8873, | |
| "num_input_tokens_seen": 7654604800, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.19675009206895333, | |
| "grad_norm": 0.5991822481155396, | |
| "learning_rate": 9.6291157508529e-07, | |
| "loss": 11.9405, | |
| "num_input_tokens_seen": 7667712000, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 0.19708641701266094, | |
| "grad_norm": 0.6108511686325073, | |
| "learning_rate": 8.388608456459612e-07, | |
| "loss": 11.9085, | |
| "num_input_tokens_seen": 7680819200, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 0.19742274195636855, | |
| "grad_norm": 0.6104913949966431, | |
| "learning_rate": 7.23349946462215e-07, | |
| "loss": 11.8859, | |
| "num_input_tokens_seen": 7693926400, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 0.1977590669000762, | |
| "grad_norm": 0.6084222197532654, | |
| "learning_rate": 6.163821748990994e-07, | |
| "loss": 11.9059, | |
| "num_input_tokens_seen": 7707033600, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.1980953918437838, | |
| "grad_norm": 0.633105993270874, | |
| "learning_rate": 5.179605844501388e-07, | |
| "loss": 11.9174, | |
| "num_input_tokens_seen": 7720140800, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 0.1984317167874914, | |
| "grad_norm": 0.6088514924049377, | |
| "learning_rate": 4.280879846503049e-07, | |
| "loss": 11.9125, | |
| "num_input_tokens_seen": 7733248000, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.1984317167874914, | |
| "eval_loss": 2.8849411010742188, | |
| "eval_runtime": 143.8146, | |
| "eval_samples_per_second": 34.767, | |
| "eval_steps_per_second": 8.692, | |
| "num_input_tokens_seen": 7733248000, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.19876804173119902, | |
| "grad_norm": 0.6054402589797974, | |
| "learning_rate": 3.467669409957463e-07, | |
| "loss": 11.9468, | |
| "num_input_tokens_seen": 7746355200, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 0.19910436667490664, | |
| "grad_norm": 0.6133595705032349, | |
| "learning_rate": 2.7399977487051473e-07, | |
| "loss": 11.9368, | |
| "num_input_tokens_seen": 7759462400, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.19944069161861425, | |
| "grad_norm": 0.6098650693893433, | |
| "learning_rate": 2.097885634804175e-07, | |
| "loss": 11.8971, | |
| "num_input_tokens_seen": 7772569600, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 0.19977701656232186, | |
| "grad_norm": 0.6231054663658142, | |
| "learning_rate": 1.541351397936319e-07, | |
| "loss": 11.9546, | |
| "num_input_tokens_seen": 7785676800, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.20011334150602947, | |
| "grad_norm": 0.6323234438896179, | |
| "learning_rate": 1.0704109248838022e-07, | |
| "loss": 11.8848, | |
| "num_input_tokens_seen": 7798784000, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 0.20044966644973708, | |
| "grad_norm": 0.6294256448745728, | |
| "learning_rate": 6.850776590763274e-08, | |
| "loss": 11.9027, | |
| "num_input_tokens_seen": 7811891200, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 0.2007859913934447, | |
| "grad_norm": 0.6184135675430298, | |
| "learning_rate": 3.853626002063848e-08, | |
| "loss": 11.9454, | |
| "num_input_tokens_seen": 7824998400, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 0.2011223163371523, | |
| "grad_norm": 0.6376939415931702, | |
| "learning_rate": 1.7127430391683516e-08, | |
| "loss": 11.8928, | |
| "num_input_tokens_seen": 7838105600, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 0.2014586412808599, | |
| "grad_norm": 0.6745944619178772, | |
| "learning_rate": 4.281888155543978e-09, | |
| "loss": 11.9315, | |
| "num_input_tokens_seen": 7851212800, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 0.20179496622456752, | |
| "grad_norm": 0.6381050944328308, | |
| "learning_rate": 0.0, | |
| "loss": 11.9242, | |
| "num_input_tokens_seen": 7864320000, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.20179496622456752, | |
| "eval_loss": 2.8848958015441895, | |
| "eval_runtime": 142.697, | |
| "eval_samples_per_second": 35.039, | |
| "eval_steps_per_second": 8.76, | |
| "num_input_tokens_seen": 7864320000, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.20213129116827513, | |
| "grad_norm": 0.7275823950767517, | |
| "learning_rate": 0.0002881031482247361, | |
| "loss": 12.0089, | |
| "num_input_tokens_seen": 7877427200, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 0.20246761611198275, | |
| "grad_norm": 0.7593051195144653, | |
| "learning_rate": 0.0002904816199505797, | |
| "loss": 12.0389, | |
| "num_input_tokens_seen": 7890534400, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 0.20280394105569036, | |
| "grad_norm": 0.7933290004730225, | |
| "learning_rate": 0.00029286069073616763, | |
| "loss": 12.0537, | |
| "num_input_tokens_seen": 7903641600, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 0.20314026599939797, | |
| "grad_norm": 0.736951470375061, | |
| "learning_rate": 0.0002952402108495577, | |
| "loss": 12.0687, | |
| "num_input_tokens_seen": 7916748800, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.20347659094310558, | |
| "grad_norm": 0.7448037266731262, | |
| "learning_rate": 0.0002976200305305268, | |
| "loss": 12.0549, | |
| "num_input_tokens_seen": 7929856000, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 0.2038129158868132, | |
| "grad_norm": 0.7063918113708496, | |
| "learning_rate": 0.0002999999999999999, | |
| "loss": 12.0769, | |
| "num_input_tokens_seen": 7942963200, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.20414924083052083, | |
| "grad_norm": 0.7379609942436218, | |
| "learning_rate": 0.000302379969469473, | |
| "loss": 12.1145, | |
| "num_input_tokens_seen": 7956070400, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 0.20448556577422844, | |
| "grad_norm": 0.7159172892570496, | |
| "learning_rate": 0.0003047597891504424, | |
| "loss": 12.1304, | |
| "num_input_tokens_seen": 7969177600, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.20482189071793605, | |
| "grad_norm": 0.759340226650238, | |
| "learning_rate": 0.00030713930926383194, | |
| "loss": 12.1011, | |
| "num_input_tokens_seen": 7982284800, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 0.20515821566164366, | |
| "grad_norm": 0.782768189907074, | |
| "learning_rate": 0.00030951838004942016, | |
| "loss": 12.1276, | |
| "num_input_tokens_seen": 7995392000, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.20515821566164366, | |
| "eval_loss": 2.9330999851226807, | |
| "eval_runtime": 143.3174, | |
| "eval_samples_per_second": 34.888, | |
| "eval_steps_per_second": 8.722, | |
| "num_input_tokens_seen": 7995392000, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.20549454060535127, | |
| "grad_norm": 0.7521361112594604, | |
| "learning_rate": 0.00031189685177526375, | |
| "loss": 12.1475, | |
| "num_input_tokens_seen": 8008499200, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 0.20583086554905888, | |
| "grad_norm": 0.752306342124939, | |
| "learning_rate": 0.00031427457474712264, | |
| "loss": 12.0914, | |
| "num_input_tokens_seen": 8021606400, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.2061671904927665, | |
| "grad_norm": 0.6963069438934326, | |
| "learning_rate": 0.0003166513993178817, | |
| "loss": 12.1272, | |
| "num_input_tokens_seen": 8034713600, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 0.2065035154364741, | |
| "grad_norm": 0.7007436752319336, | |
| "learning_rate": 0.0003190271758969692, | |
| "loss": 12.1085, | |
| "num_input_tokens_seen": 8047820800, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.20683984038018172, | |
| "grad_norm": 0.7034767270088196, | |
| "learning_rate": 0.00032140175495976947, | |
| "loss": 12.1114, | |
| "num_input_tokens_seen": 8060928000, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 0.20717616532388933, | |
| "grad_norm": 0.7317435145378113, | |
| "learning_rate": 0.0003237749870570365, | |
| "loss": 12.0728, | |
| "num_input_tokens_seen": 8074035200, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.20751249026759694, | |
| "grad_norm": 0.665651261806488, | |
| "learning_rate": 0.0003261467228242976, | |
| "loss": 12.1099, | |
| "num_input_tokens_seen": 8087142400, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 0.20784881521130455, | |
| "grad_norm": 0.7023760080337524, | |
| "learning_rate": 0.0003285168129912546, | |
| "loss": 12.1188, | |
| "num_input_tokens_seen": 8100249600, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.20818514015501216, | |
| "grad_norm": 0.7026780247688293, | |
| "learning_rate": 0.00033088510839118004, | |
| "loss": 12.0884, | |
| "num_input_tokens_seen": 8113356800, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 0.20852146509871977, | |
| "grad_norm": 0.7397706508636475, | |
| "learning_rate": 0.00033325145997030323, | |
| "loss": 12.0894, | |
| "num_input_tokens_seen": 8126464000, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.20852146509871977, | |
| "eval_loss": 2.9383528232574463, | |
| "eval_runtime": 144.6078, | |
| "eval_samples_per_second": 34.576, | |
| "eval_steps_per_second": 8.644, | |
| "num_input_tokens_seen": 8126464000, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.20885779004242738, | |
| "grad_norm": 0.719252347946167, | |
| "learning_rate": 0.0003356157187971916, | |
| "loss": 12.0981, | |
| "num_input_tokens_seen": 8139571200, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 0.209194114986135, | |
| "grad_norm": 0.701697826385498, | |
| "learning_rate": 0.0003379777360721248, | |
| "loss": 12.1294, | |
| "num_input_tokens_seen": 8152678400, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.2095304399298426, | |
| "grad_norm": 0.7229558229446411, | |
| "learning_rate": 0.0003403373631364593, | |
| "loss": 12.1401, | |
| "num_input_tokens_seen": 8165785600, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 0.20986676487355022, | |
| "grad_norm": 0.7196134328842163, | |
| "learning_rate": 0.0003426944514819854, | |
| "loss": 12.1419, | |
| "num_input_tokens_seen": 8178892800, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.21020308981725785, | |
| "grad_norm": 0.7312472462654114, | |
| "learning_rate": 0.00034504885276027194, | |
| "loss": 12.1428, | |
| "num_input_tokens_seen": 8192000000, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 0.21053941476096547, | |
| "grad_norm": 0.7427237033843994, | |
| "learning_rate": 0.00034740041879200497, | |
| "loss": 12.0976, | |
| "num_input_tokens_seen": 8205107200, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.21087573970467308, | |
| "grad_norm": 0.7099300622940063, | |
| "learning_rate": 0.0003497490015763119, | |
| "loss": 12.1202, | |
| "num_input_tokens_seen": 8218214400, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 0.2112120646483807, | |
| "grad_norm": 0.6904875636100769, | |
| "learning_rate": 0.000352094453300079, | |
| "loss": 12.1501, | |
| "num_input_tokens_seen": 8231321600, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.2115483895920883, | |
| "grad_norm": 0.6842371225357056, | |
| "learning_rate": 0.00035443662634725056, | |
| "loss": 12.1598, | |
| "num_input_tokens_seen": 8244428800, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 0.2118847145357959, | |
| "grad_norm": 0.7030087113380432, | |
| "learning_rate": 0.000356775373308123, | |
| "loss": 12.1683, | |
| "num_input_tokens_seen": 8257536000, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.2118847145357959, | |
| "eval_loss": 2.9419422149658203, | |
| "eval_runtime": 144.4539, | |
| "eval_samples_per_second": 34.613, | |
| "eval_steps_per_second": 8.653, | |
| "num_input_tokens_seen": 8257536000, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.21222103947950352, | |
| "grad_norm": 0.7204530835151672, | |
| "learning_rate": 0.00035911054698862003, | |
| "loss": 12.1513, | |
| "num_input_tokens_seen": 8270643200, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 0.21255736442321113, | |
| "grad_norm": 0.736909806728363, | |
| "learning_rate": 0.000361442000419557, | |
| "loss": 12.127, | |
| "num_input_tokens_seen": 8283750400, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.21289368936691874, | |
| "grad_norm": 0.7460354566574097, | |
| "learning_rate": 0.00036376958686589275, | |
| "loss": 12.1163, | |
| "num_input_tokens_seen": 8296857600, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 0.21323001431062635, | |
| "grad_norm": 0.7942859530448914, | |
| "learning_rate": 0.0003660931598359621, | |
| "loss": 12.1437, | |
| "num_input_tokens_seen": 8309964800, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.21356633925433396, | |
| "grad_norm": 0.7317420840263367, | |
| "learning_rate": 0.00036841257309069635, | |
| "loss": 12.1541, | |
| "num_input_tokens_seen": 8323072000, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 0.21390266419804158, | |
| "grad_norm": 0.7040686011314392, | |
| "learning_rate": 0.0003707276806528282, | |
| "loss": 12.1931, | |
| "num_input_tokens_seen": 8336179200, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.2142389891417492, | |
| "grad_norm": 0.6845248937606812, | |
| "learning_rate": 0.0003730383368160774, | |
| "loss": 12.1727, | |
| "num_input_tokens_seen": 8349286400, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 0.2145753140854568, | |
| "grad_norm": 0.7383499145507812, | |
| "learning_rate": 0.00037534439615432365, | |
| "loss": 12.1563, | |
| "num_input_tokens_seen": 8362393600, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.2149116390291644, | |
| "grad_norm": 0.7198951840400696, | |
| "learning_rate": 0.00037764571353075604, | |
| "loss": 12.1176, | |
| "num_input_tokens_seen": 8375500800, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 0.21524796397287202, | |
| "grad_norm": 0.6850191354751587, | |
| "learning_rate": 0.0003799421441070105, | |
| "loss": 12.236, | |
| "num_input_tokens_seen": 8388608000, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.21524796397287202, | |
| "eval_loss": 2.944018602371216, | |
| "eval_runtime": 144.7537, | |
| "eval_samples_per_second": 34.541, | |
| "eval_steps_per_second": 8.635, | |
| "num_input_tokens_seen": 8388608000, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.21558428891657963, | |
| "grad_norm": 0.7053660750389099, | |
| "learning_rate": 0.000382233543352283, | |
| "loss": 12.129, | |
| "num_input_tokens_seen": 8401715200, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 0.21592061386028724, | |
| "grad_norm": 0.7114477157592773, | |
| "learning_rate": 0.00038451976705242873, | |
| "loss": 12.1358, | |
| "num_input_tokens_seen": 8414822400, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.21625693880399488, | |
| "grad_norm": 0.7044904232025146, | |
| "learning_rate": 0.00038680067131903565, | |
| "loss": 12.16, | |
| "num_input_tokens_seen": 8427929600, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 0.2165932637477025, | |
| "grad_norm": 0.7055366635322571, | |
| "learning_rate": 0.0003890761125984825, | |
| "loss": 12.1501, | |
| "num_input_tokens_seen": 8441036800, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.2169295886914101, | |
| "grad_norm": 0.7038373947143555, | |
| "learning_rate": 0.0003913459476809723, | |
| "loss": 12.176, | |
| "num_input_tokens_seen": 8454144000, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 0.2172659136351177, | |
| "grad_norm": 0.7184889912605286, | |
| "learning_rate": 0.0003936100337095459, | |
| "loss": 12.1378, | |
| "num_input_tokens_seen": 8467251200, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.21760223857882532, | |
| "grad_norm": 0.6986017823219299, | |
| "learning_rate": 0.00039586822818907474, | |
| "loss": 12.1544, | |
| "num_input_tokens_seen": 8480358400, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 0.21793856352253294, | |
| "grad_norm": 0.7293840050697327, | |
| "learning_rate": 0.00039812038899522646, | |
| "loss": 12.1641, | |
| "num_input_tokens_seen": 8493465600, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.21827488846624055, | |
| "grad_norm": 0.7284411191940308, | |
| "learning_rate": 0.0004003663743834105, | |
| "loss": 12.1764, | |
| "num_input_tokens_seen": 8506572800, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 0.21861121340994816, | |
| "grad_norm": 0.6832025051116943, | |
| "learning_rate": 0.00040260604299770063, | |
| "loss": 12.1634, | |
| "num_input_tokens_seen": 8519680000, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.21861121340994816, | |
| "eval_loss": 2.946486473083496, | |
| "eval_runtime": 144.4228, | |
| "eval_samples_per_second": 34.621, | |
| "eval_steps_per_second": 8.655, | |
| "num_input_tokens_seen": 8519680000, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.21894753835365577, | |
| "grad_norm": 0.6891733407974243, | |
| "learning_rate": 0.00040483925387972924, | |
| "loss": 12.1478, | |
| "num_input_tokens_seen": 8532787200, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 0.21928386329736338, | |
| "grad_norm": 0.677598237991333, | |
| "learning_rate": 0.0004070658664775615, | |
| "loss": 12.1567, | |
| "num_input_tokens_seen": 8545894400, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.219620188241071, | |
| "grad_norm": 0.6960694789886475, | |
| "learning_rate": 0.00040928574065453814, | |
| "loss": 12.1627, | |
| "num_input_tokens_seen": 8559001600, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 0.2199565131847786, | |
| "grad_norm": 0.7247474193572998, | |
| "learning_rate": 0.00041149873669809825, | |
| "loss": 12.1736, | |
| "num_input_tokens_seen": 8572108800, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.2202928381284862, | |
| "grad_norm": 0.7329341769218445, | |
| "learning_rate": 0.00041370471532856985, | |
| "loss": 12.1226, | |
| "num_input_tokens_seen": 8585216000, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 0.22062916307219382, | |
| "grad_norm": 0.7093823552131653, | |
| "learning_rate": 0.00041590353770793847, | |
| "loss": 12.1499, | |
| "num_input_tokens_seen": 8598323200, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.22096548801590143, | |
| "grad_norm": 0.7415474653244019, | |
| "learning_rate": 0.0004180950654485822, | |
| "loss": 12.1712, | |
| "num_input_tokens_seen": 8611430400, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 0.22130181295960905, | |
| "grad_norm": 0.7047693133354187, | |
| "learning_rate": 0.0004202791606219841, | |
| "loss": 12.1846, | |
| "num_input_tokens_seen": 8624537600, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.22163813790331666, | |
| "grad_norm": 0.6748114228248596, | |
| "learning_rate": 0.00042245568576741076, | |
| "loss": 12.1544, | |
| "num_input_tokens_seen": 8637644800, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 0.22197446284702427, | |
| "grad_norm": 0.7129522562026978, | |
| "learning_rate": 0.00042462450390056577, | |
| "loss": 12.1996, | |
| "num_input_tokens_seen": 8650752000, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.22197446284702427, | |
| "eval_loss": 2.948745012283325, | |
| "eval_runtime": 144.6942, | |
| "eval_samples_per_second": 34.556, | |
| "eval_steps_per_second": 8.639, | |
| "num_input_tokens_seen": 8650752000, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.2223107877907319, | |
| "grad_norm": 1.463781714439392, | |
| "learning_rate": 0.0004267854785222096, | |
| "loss": 12.2239, | |
| "num_input_tokens_seen": 8663859200, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 0.22264711273443952, | |
| "grad_norm": 0.6819838285446167, | |
| "learning_rate": 0.00042893847362675144, | |
| "loss": 12.1495, | |
| "num_input_tokens_seen": 8676966400, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.22298343767814713, | |
| "grad_norm": 0.7195115089416504, | |
| "learning_rate": 0.0004310833537108078, | |
| "loss": 12.1546, | |
| "num_input_tokens_seen": 8690073600, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 0.22331976262185474, | |
| "grad_norm": 0.6917895078659058, | |
| "learning_rate": 0.0004332199837817323, | |
| "loss": 12.1972, | |
| "num_input_tokens_seen": 8703180800, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.22365608756556235, | |
| "grad_norm": 0.6915993094444275, | |
| "learning_rate": 0.00043534822936610993, | |
| "loss": 12.1944, | |
| "num_input_tokens_seen": 8716288000, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 0.22399241250926996, | |
| "grad_norm": 0.7407116293907166, | |
| "learning_rate": 0.000437467956518223, | |
| "loss": 12.1735, | |
| "num_input_tokens_seen": 8729395200, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.22432873745297757, | |
| "grad_norm": 0.6850821375846863, | |
| "learning_rate": 0.000439579031828478, | |
| "loss": 12.1879, | |
| "num_input_tokens_seen": 8742502400, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 0.22466506239668518, | |
| "grad_norm": 0.7180017828941345, | |
| "learning_rate": 0.0004416813224318048, | |
| "loss": 12.1381, | |
| "num_input_tokens_seen": 8755609600, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.2250013873403928, | |
| "grad_norm": 0.7392515540122986, | |
| "learning_rate": 0.00044377469601601676, | |
| "loss": 12.1726, | |
| "num_input_tokens_seen": 8768716800, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 0.2253377122841004, | |
| "grad_norm": 0.6968019604682922, | |
| "learning_rate": 0.0004458590208301405, | |
| "loss": 12.1747, | |
| "num_input_tokens_seen": 8781824000, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.2253377122841004, | |
| "eval_loss": 2.95086407661438, | |
| "eval_runtime": 144.6095, | |
| "eval_samples_per_second": 34.576, | |
| "eval_steps_per_second": 8.644, | |
| "num_input_tokens_seen": 8781824000, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.22567403722780802, | |
| "grad_norm": 0.6789761185646057, | |
| "learning_rate": 0.00044793416569270493, | |
| "loss": 12.1764, | |
| "num_input_tokens_seen": 8794931200, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 0.22601036217151563, | |
| "grad_norm": 0.6792165040969849, | |
| "learning_rate": 0.00045, | |
| "loss": 12.2017, | |
| "num_input_tokens_seen": 8808038400, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.22634668711522324, | |
| "grad_norm": 0.678844153881073, | |
| "learning_rate": 0.0004520563937342948, | |
| "loss": 12.1961, | |
| "num_input_tokens_seen": 8821145600, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 0.22668301205893085, | |
| "grad_norm": 0.8010696172714233, | |
| "learning_rate": 0.0004541032174720218, | |
| "loss": 12.2016, | |
| "num_input_tokens_seen": 8834252800, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.22701933700263846, | |
| "grad_norm": 0.7628909945487976, | |
| "learning_rate": 0.0004561403423919214, | |
| "loss": 12.1993, | |
| "num_input_tokens_seen": 8847360000, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 0.22735566194634607, | |
| "grad_norm": 0.687944769859314, | |
| "learning_rate": 0.00045816764028315066, | |
| "loss": 12.1857, | |
| "num_input_tokens_seen": 8860467200, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.22769198689005368, | |
| "grad_norm": 0.7951564788818359, | |
| "learning_rate": 0.00046018498355335067, | |
| "loss": 12.2186, | |
| "num_input_tokens_seen": 8873574400, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 0.2280283118337613, | |
| "grad_norm": 0.6637552380561829, | |
| "learning_rate": 0.0004621922452366791, | |
| "loss": 12.1794, | |
| "num_input_tokens_seen": 8886681600, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.22836463677746893, | |
| "grad_norm": 0.7089061141014099, | |
| "learning_rate": 0.0004641892990017995, | |
| "loss": 12.2064, | |
| "num_input_tokens_seen": 8899788800, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 0.22870096172117654, | |
| "grad_norm": 0.7480903267860413, | |
| "learning_rate": 0.000466176019159833, | |
| "loss": 12.2075, | |
| "num_input_tokens_seen": 8912896000, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.22870096172117654, | |
| "eval_loss": 2.9531610012054443, | |
| "eval_runtime": 142.2963, | |
| "eval_samples_per_second": 35.138, | |
| "eval_steps_per_second": 8.784, | |
| "num_input_tokens_seen": 8912896000, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.22903728666488415, | |
| "grad_norm": 0.695820152759552, | |
| "learning_rate": 0.0004681522806722681, | |
| "loss": 12.1961, | |
| "num_input_tokens_seen": 8926003200, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 0.22937361160859177, | |
| "grad_norm": 0.6941335797309875, | |
| "learning_rate": 0.0004701179591588313, | |
| "loss": 12.1694, | |
| "num_input_tokens_seen": 8939110400, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 0.22970993655229938, | |
| "grad_norm": 0.7149194478988647, | |
| "learning_rate": 0.0004720729309053136, | |
| "loss": 12.1635, | |
| "num_input_tokens_seen": 8952217600, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 0.230046261496007, | |
| "grad_norm": 0.7684397101402283, | |
| "learning_rate": 0.0004740170728713593, | |
| "loss": 12.2116, | |
| "num_input_tokens_seen": 8965324800, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.2303825864397146, | |
| "grad_norm": 0.6849952340126038, | |
| "learning_rate": 0.0004759502626982069, | |
| "loss": 12.1929, | |
| "num_input_tokens_seen": 8978432000, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 0.2307189113834222, | |
| "grad_norm": 0.7058719396591187, | |
| "learning_rate": 0.00047787237871639213, | |
| "loss": 12.2105, | |
| "num_input_tokens_seen": 8991539200, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 0.23105523632712982, | |
| "grad_norm": 0.7113450765609741, | |
| "learning_rate": 0.00047978329995340403, | |
| "loss": 12.2029, | |
| "num_input_tokens_seen": 9004646400, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 0.23139156127083743, | |
| "grad_norm": 0.701141357421875, | |
| "learning_rate": 0.0004816829061412999, | |
| "loss": 12.2123, | |
| "num_input_tokens_seen": 9017753600, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 0.23172788621454504, | |
| "grad_norm": 0.7303545475006104, | |
| "learning_rate": 0.00048357107772427376, | |
| "loss": 12.162, | |
| "num_input_tokens_seen": 9030860800, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 0.23206421115825265, | |
| "grad_norm": 0.6944325566291809, | |
| "learning_rate": 0.00048544769586618153, | |
| "loss": 12.2216, | |
| "num_input_tokens_seen": 9043968000, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.23206421115825265, | |
| "eval_loss": 2.9535205364227295, | |
| "eval_runtime": 144.5707, | |
| "eval_samples_per_second": 34.585, | |
| "eval_steps_per_second": 8.646, | |
| "num_input_tokens_seen": 9043968000, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.23240053610196026, | |
| "grad_norm": 0.7305278778076172, | |
| "learning_rate": 0.0004873126424580189, | |
| "loss": 12.1618, | |
| "num_input_tokens_seen": 9057075200, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 0.23273686104566788, | |
| "grad_norm": 0.6831786632537842, | |
| "learning_rate": 0.0004891658001253566, | |
| "loss": 12.1682, | |
| "num_input_tokens_seen": 9070182400, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 0.2330731859893755, | |
| "grad_norm": 0.6941043138504028, | |
| "learning_rate": 0.0004910070522357263, | |
| "loss": 12.217, | |
| "num_input_tokens_seen": 9083289600, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 0.2334095109330831, | |
| "grad_norm": 0.6915681958198547, | |
| "learning_rate": 0.0004928362829059618, | |
| "loss": 12.1997, | |
| "num_input_tokens_seen": 9096396800, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 0.2337458358767907, | |
| "grad_norm": 0.6973621249198914, | |
| "learning_rate": 0.0004946533770094914, | |
| "loss": 12.1828, | |
| "num_input_tokens_seen": 9109504000, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 0.23408216082049832, | |
| "grad_norm": 0.7607231140136719, | |
| "learning_rate": 0.0004964582201835855, | |
| "loss": 12.1826, | |
| "num_input_tokens_seen": 9122611200, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.23441848576420596, | |
| "grad_norm": 0.7609473466873169, | |
| "learning_rate": 0.0004982506988365519, | |
| "loss": 12.2168, | |
| "num_input_tokens_seen": 9135718400, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 0.23475481070791357, | |
| "grad_norm": 0.6931918263435364, | |
| "learning_rate": 0.0005000307001548874, | |
| "loss": 12.2015, | |
| "num_input_tokens_seen": 9148825600, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 0.23509113565162118, | |
| "grad_norm": 0.7207595705986023, | |
| "learning_rate": 0.0005017981121103756, | |
| "loss": 12.2119, | |
| "num_input_tokens_seen": 9161932800, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 0.2354274605953288, | |
| "grad_norm": 0.706977128982544, | |
| "learning_rate": 0.0005035528234671396, | |
| "loss": 12.1703, | |
| "num_input_tokens_seen": 9175040000, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.2354274605953288, | |
| "eval_loss": 2.9533894062042236, | |
| "eval_runtime": 144.9781, | |
| "eval_samples_per_second": 34.488, | |
| "eval_steps_per_second": 8.622, | |
| "num_input_tokens_seen": 9175040000, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.2357637855390364, | |
| "grad_norm": 0.8773657083511353, | |
| "learning_rate": 0.0005052947237886414, | |
| "loss": 12.2164, | |
| "num_input_tokens_seen": 9188147200, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 0.236100110482744, | |
| "grad_norm": 0.7248002886772156, | |
| "learning_rate": 0.0005070237034446334, | |
| "loss": 12.1933, | |
| "num_input_tokens_seen": 9201254400, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.23643643542645162, | |
| "grad_norm": 0.7326715588569641, | |
| "learning_rate": 0.0005087396536180581, | |
| "loss": 12.2179, | |
| "num_input_tokens_seen": 9214361600, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 0.23677276037015924, | |
| "grad_norm": 0.7187255620956421, | |
| "learning_rate": 0.0005104424663118964, | |
| "loss": 12.2079, | |
| "num_input_tokens_seen": 9227468800, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 0.23710908531386685, | |
| "grad_norm": 0.7238693833351135, | |
| "learning_rate": 0.0005121320343559641, | |
| "loss": 12.2368, | |
| "num_input_tokens_seen": 9240576000, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 0.23744541025757446, | |
| "grad_norm": 0.7042940855026245, | |
| "learning_rate": 0.0005138082514136587, | |
| "loss": 12.1812, | |
| "num_input_tokens_seen": 9253683200, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 0.23778173520128207, | |
| "grad_norm": 0.7219937443733215, | |
| "learning_rate": 0.00051547101198865, | |
| "loss": 12.1853, | |
| "num_input_tokens_seen": 9266790400, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 0.23811806014498968, | |
| "grad_norm": 0.7058824300765991, | |
| "learning_rate": 0.000517120211431521, | |
| "loss": 12.183, | |
| "num_input_tokens_seen": 9279897600, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.2384543850886973, | |
| "grad_norm": 0.7246367335319519, | |
| "learning_rate": 0.0005187557459463531, | |
| "loss": 12.1807, | |
| "num_input_tokens_seen": 9293004800, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 0.2387907100324049, | |
| "grad_norm": 0.666766881942749, | |
| "learning_rate": 0.00052037751259726, | |
| "loss": 12.228, | |
| "num_input_tokens_seen": 9306112000, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.2387907100324049, | |
| "eval_loss": 2.955592155456543, | |
| "eval_runtime": 143.4345, | |
| "eval_samples_per_second": 34.859, | |
| "eval_steps_per_second": 8.715, | |
| "num_input_tokens_seen": 9306112000, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.2391270349761125, | |
| "grad_norm": 0.7502472400665283, | |
| "learning_rate": 0.000521985409314865, | |
| "loss": 12.1843, | |
| "num_input_tokens_seen": 9319219200, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 0.23946335991982012, | |
| "grad_norm": 0.7807977795600891, | |
| "learning_rate": 0.0005235793349027264, | |
| "loss": 12.1848, | |
| "num_input_tokens_seen": 9332326400, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 0.23979968486352773, | |
| "grad_norm": 0.7111331820487976, | |
| "learning_rate": 0.0005251591890437045, | |
| "loss": 12.223, | |
| "num_input_tokens_seen": 9345433600, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 0.24013600980723535, | |
| "grad_norm": 0.6893537044525146, | |
| "learning_rate": 0.0005267248723062775, | |
| "loss": 12.2176, | |
| "num_input_tokens_seen": 9358540800, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.24047233475094298, | |
| "grad_norm": 0.7014564275741577, | |
| "learning_rate": 0.0005282762861507973, | |
| "loss": 12.2056, | |
| "num_input_tokens_seen": 9371648000, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 0.2408086596946506, | |
| "grad_norm": 0.7688605189323425, | |
| "learning_rate": 0.0005298133329356933, | |
| "loss": 12.218, | |
| "num_input_tokens_seen": 9384755200, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 0.2411449846383582, | |
| "grad_norm": 0.7300233840942383, | |
| "learning_rate": 0.0005313359159236161, | |
| "loss": 12.1808, | |
| "num_input_tokens_seen": 9397862400, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 0.24148130958206582, | |
| "grad_norm": 0.7303577065467834, | |
| "learning_rate": 0.000532843939287527, | |
| "loss": 12.2078, | |
| "num_input_tokens_seen": 9410969600, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 0.24181763452577343, | |
| "grad_norm": 0.7312725186347961, | |
| "learning_rate": 0.000534337308116728, | |
| "loss": 12.26, | |
| "num_input_tokens_seen": 9424076800, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 0.24215395946948104, | |
| "grad_norm": 0.7405809164047241, | |
| "learning_rate": 0.0005358159284228361, | |
| "loss": 12.2168, | |
| "num_input_tokens_seen": 9437184000, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.24215395946948104, | |
| "eval_loss": 2.954807758331299, | |
| "eval_runtime": 145.0981, | |
| "eval_samples_per_second": 34.459, | |
| "eval_steps_per_second": 8.615, | |
| "num_input_tokens_seen": 9437184000, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.24249028441318865, | |
| "grad_norm": 0.6873496770858765, | |
| "learning_rate": 0.0005372797071456985, | |
| "loss": 12.1809, | |
| "num_input_tokens_seen": 9450291200, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 0.24282660935689626, | |
| "grad_norm": 0.7725156545639038, | |
| "learning_rate": 0.0005387285521592496, | |
| "loss": 12.2273, | |
| "num_input_tokens_seen": 9463398400, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 0.24316293430060387, | |
| "grad_norm": 0.7376220226287842, | |
| "learning_rate": 0.000540162372277308, | |
| "loss": 12.2616, | |
| "num_input_tokens_seen": 9476505600, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 0.24349925924431148, | |
| "grad_norm": 0.7150977849960327, | |
| "learning_rate": 0.0005415810772593176, | |
| "loss": 12.2227, | |
| "num_input_tokens_seen": 9489612800, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 0.2438355841880191, | |
| "grad_norm": 0.733695387840271, | |
| "learning_rate": 0.0005429845778160248, | |
| "loss": 12.1751, | |
| "num_input_tokens_seen": 9502720000, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 0.2441719091317267, | |
| "grad_norm": 0.840552568435669, | |
| "learning_rate": 0.0005443727856151006, | |
| "loss": 12.1819, | |
| "num_input_tokens_seen": 9515827200, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.24450823407543432, | |
| "grad_norm": 0.7238914966583252, | |
| "learning_rate": 0.0005457456132866974, | |
| "loss": 12.2273, | |
| "num_input_tokens_seen": 9528934400, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 0.24484455901914193, | |
| "grad_norm": 0.7124038338661194, | |
| "learning_rate": 0.0005471029744289499, | |
| "loss": 12.2139, | |
| "num_input_tokens_seen": 9542041600, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 0.24518088396284954, | |
| "grad_norm": 0.7153120636940002, | |
| "learning_rate": 0.0005484447836134111, | |
| "loss": 12.2012, | |
| "num_input_tokens_seen": 9555148800, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 0.24551720890655715, | |
| "grad_norm": 0.7338851094245911, | |
| "learning_rate": 0.0005497709563904314, | |
| "loss": 12.173, | |
| "num_input_tokens_seen": 9568256000, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.24551720890655715, | |
| "eval_loss": 2.956364154815674, | |
| "eval_runtime": 143.663, | |
| "eval_samples_per_second": 34.804, | |
| "eval_steps_per_second": 8.701, | |
| "num_input_tokens_seen": 9568256000, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.24585353385026476, | |
| "grad_norm": 0.6650400757789612, | |
| "learning_rate": 0.0005510814092944707, | |
| "loss": 12.2382, | |
| "num_input_tokens_seen": 9581363200, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 0.24618985879397237, | |
| "grad_norm": 0.6839830279350281, | |
| "learning_rate": 0.0005523760598493544, | |
| "loss": 12.2283, | |
| "num_input_tokens_seen": 9594470400, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.24652618373768, | |
| "grad_norm": 0.7136399745941162, | |
| "learning_rate": 0.0005536548265734613, | |
| "loss": 12.2312, | |
| "num_input_tokens_seen": 9607577600, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 0.24686250868138762, | |
| "grad_norm": 0.6932345628738403, | |
| "learning_rate": 0.0005549176289848542, | |
| "loss": 12.1727, | |
| "num_input_tokens_seen": 9620684800, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 0.24719883362509523, | |
| "grad_norm": 0.7730117440223694, | |
| "learning_rate": 0.0005561643876063442, | |
| "loss": 12.2163, | |
| "num_input_tokens_seen": 9633792000, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 0.24753515856880284, | |
| "grad_norm": 0.6646705269813538, | |
| "learning_rate": 0.000557395023970493, | |
| "loss": 12.2019, | |
| "num_input_tokens_seen": 9646899200, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 0.24787148351251045, | |
| "grad_norm": 0.7021203637123108, | |
| "learning_rate": 0.0005586094606245507, | |
| "loss": 12.2108, | |
| "num_input_tokens_seen": 9660006400, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 0.24820780845621807, | |
| "grad_norm": 0.7044020891189575, | |
| "learning_rate": 0.0005598076211353316, | |
| "loss": 12.1731, | |
| "num_input_tokens_seen": 9673113600, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.24854413339992568, | |
| "grad_norm": 0.7115085124969482, | |
| "learning_rate": 0.0005609894300940235, | |
| "loss": 12.1964, | |
| "num_input_tokens_seen": 9686220800, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 0.2488804583436333, | |
| "grad_norm": 0.7299662232398987, | |
| "learning_rate": 0.0005621548131209354, | |
| "loss": 12.213, | |
| "num_input_tokens_seen": 9699328000, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.2488804583436333, | |
| "eval_loss": 2.9557838439941406, | |
| "eval_runtime": 144.8795, | |
| "eval_samples_per_second": 34.511, | |
| "eval_steps_per_second": 8.628, | |
| "num_input_tokens_seen": 9699328000, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.2492167832873409, | |
| "grad_norm": 0.7441222667694092, | |
| "learning_rate": 0.0005633036968701766, | |
| "loss": 12.1967, | |
| "num_input_tokens_seen": 9712435200, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 0.2495531082310485, | |
| "grad_norm": 0.690399169921875, | |
| "learning_rate": 0.0005644360090342746, | |
| "loss": 12.1598, | |
| "num_input_tokens_seen": 9725542400, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 0.24988943317475612, | |
| "grad_norm": 0.7531688213348389, | |
| "learning_rate": 0.0005655516783487247, | |
| "loss": 12.1726, | |
| "num_input_tokens_seen": 9738649600, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 0.25022575811846376, | |
| "grad_norm": 0.7439296841621399, | |
| "learning_rate": 0.000566650634596477, | |
| "loss": 12.1706, | |
| "num_input_tokens_seen": 9751756800, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.25056208306217137, | |
| "grad_norm": 0.7687821984291077, | |
| "learning_rate": 0.0005677328086123528, | |
| "loss": 12.2027, | |
| "num_input_tokens_seen": 9764864000, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 0.250898408005879, | |
| "grad_norm": 0.742973804473877, | |
| "learning_rate": 0.0005687981322874008, | |
| "loss": 12.2343, | |
| "num_input_tokens_seen": 9777971200, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 0.2512347329495866, | |
| "grad_norm": 0.7731523513793945, | |
| "learning_rate": 0.0005698465385731809, | |
| "loss": 12.1947, | |
| "num_input_tokens_seen": 9791078400, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 0.2515710578932942, | |
| "grad_norm": 0.7087680101394653, | |
| "learning_rate": 0.0005708779614859862, | |
| "loss": 12.1857, | |
| "num_input_tokens_seen": 9804185600, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 0.2519073828370018, | |
| "grad_norm": 0.7114725112915039, | |
| "learning_rate": 0.0005718923361109948, | |
| "loss": 12.1963, | |
| "num_input_tokens_seen": 9817292800, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 0.2522437077807094, | |
| "grad_norm": 0.737745463848114, | |
| "learning_rate": 0.0005728895986063555, | |
| "loss": 12.2333, | |
| "num_input_tokens_seen": 9830400000, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.2522437077807094, | |
| "eval_loss": 2.953993797302246, | |
| "eval_runtime": 143.4597, | |
| "eval_samples_per_second": 34.853, | |
| "eval_steps_per_second": 8.713, | |
| "num_input_tokens_seen": 9830400000, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.25258003272441704, | |
| "grad_norm": 0.6653860211372375, | |
| "learning_rate": 0.0005738696862072053, | |
| "loss": 12.1995, | |
| "num_input_tokens_seen": 9843507200, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 0.25291635766812465, | |
| "grad_norm": 0.7174036502838135, | |
| "learning_rate": 0.000574832537229621, | |
| "loss": 12.2128, | |
| "num_input_tokens_seen": 9856614400, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 0.25325268261183226, | |
| "grad_norm": 0.6888349056243896, | |
| "learning_rate": 0.0005757780910744996, | |
| "loss": 12.2092, | |
| "num_input_tokens_seen": 9869721600, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 0.25358900755553987, | |
| "grad_norm": 0.7647917866706848, | |
| "learning_rate": 0.0005767062882313744, | |
| "loss": 12.1714, | |
| "num_input_tokens_seen": 9882828800, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 0.2539253324992475, | |
| "grad_norm": 0.6871117949485779, | |
| "learning_rate": 0.0005776170702821582, | |
| "loss": 12.1855, | |
| "num_input_tokens_seen": 9895936000, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 0.2542616574429551, | |
| "grad_norm": 0.7536265850067139, | |
| "learning_rate": 0.0005785103799048218, | |
| "loss": 12.1877, | |
| "num_input_tokens_seen": 9909043200, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.2545979823866627, | |
| "grad_norm": 0.7374672293663025, | |
| "learning_rate": 0.000579386160877, | |
| "loss": 12.1878, | |
| "num_input_tokens_seen": 9922150400, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 0.2549343073303703, | |
| "grad_norm": 0.7381187677383423, | |
| "learning_rate": 0.0005802443580795319, | |
| "loss": 12.1768, | |
| "num_input_tokens_seen": 9935257600, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 0.2552706322740779, | |
| "grad_norm": 0.6883304715156555, | |
| "learning_rate": 0.0005810849174999284, | |
| "loss": 12.1864, | |
| "num_input_tokens_seen": 9948364800, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 0.25560695721778554, | |
| "grad_norm": 0.7935585975646973, | |
| "learning_rate": 0.0005819077862357724, | |
| "loss": 12.2082, | |
| "num_input_tokens_seen": 9961472000, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.25560695721778554, | |
| "eval_loss": 2.952374219894409, | |
| "eval_runtime": 144.3041, | |
| "eval_samples_per_second": 34.649, | |
| "eval_steps_per_second": 8.662, | |
| "num_input_tokens_seen": 9961472000, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.25594328216149315, | |
| "grad_norm": 0.7124053239822388, | |
| "learning_rate": 0.0005827129124980481, | |
| "loss": 12.2038, | |
| "num_input_tokens_seen": 9974579200, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 0.25627960710520076, | |
| "grad_norm": 0.6911721229553223, | |
| "learning_rate": 0.0005835002456144005, | |
| "loss": 12.1568, | |
| "num_input_tokens_seen": 9987686400, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.25661593204890837, | |
| "grad_norm": 0.6921409964561462, | |
| "learning_rate": 0.0005842697360323244, | |
| "loss": 12.1679, | |
| "num_input_tokens_seen": 10000793600, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 0.256952256992616, | |
| "grad_norm": 0.6664074659347534, | |
| "learning_rate": 0.0005850213353222835, | |
| "loss": 12.1476, | |
| "num_input_tokens_seen": 10013900800, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 0.2572885819363236, | |
| "grad_norm": 0.751232385635376, | |
| "learning_rate": 0.0005857549961807581, | |
| "loss": 12.1765, | |
| "num_input_tokens_seen": 10027008000, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 0.2576249068800312, | |
| "grad_norm": 0.7219013571739197, | |
| "learning_rate": 0.0005864706724332221, | |
| "loss": 12.1953, | |
| "num_input_tokens_seen": 10040115200, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 0.2579612318237388, | |
| "grad_norm": 0.7485892176628113, | |
| "learning_rate": 0.0005871683190370495, | |
| "loss": 12.1816, | |
| "num_input_tokens_seen": 10053222400, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 0.2582975567674464, | |
| "grad_norm": 0.7257047295570374, | |
| "learning_rate": 0.0005878478920843492, | |
| "loss": 12.1784, | |
| "num_input_tokens_seen": 10066329600, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.25863388171115403, | |
| "grad_norm": 0.7609989643096924, | |
| "learning_rate": 0.0005885093488047278, | |
| "loss": 12.1662, | |
| "num_input_tokens_seen": 10079436800, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 0.25897020665486165, | |
| "grad_norm": 0.7117071151733398, | |
| "learning_rate": 0.0005891526475679825, | |
| "loss": 12.1737, | |
| "num_input_tokens_seen": 10092544000, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.25897020665486165, | |
| "eval_loss": 2.951767921447754, | |
| "eval_runtime": 144.4716, | |
| "eval_samples_per_second": 34.609, | |
| "eval_steps_per_second": 8.652, | |
| "num_input_tokens_seen": 10092544000, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.25930653159856926, | |
| "grad_norm": 0.710891604423523, | |
| "learning_rate": 0.0005897777478867204, | |
| "loss": 12.1998, | |
| "num_input_tokens_seen": 10105651200, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 0.25964285654227687, | |
| "grad_norm": 0.6925230622291565, | |
| "learning_rate": 0.0005903846104189068, | |
| "loss": 12.1422, | |
| "num_input_tokens_seen": 10118758400, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 0.2599791814859845, | |
| "grad_norm": 0.7349727153778076, | |
| "learning_rate": 0.0005909731969703415, | |
| "loss": 12.1443, | |
| "num_input_tokens_seen": 10131865600, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 0.2603155064296921, | |
| "grad_norm": 0.7101354598999023, | |
| "learning_rate": 0.0005915434704970625, | |
| "loss": 12.1675, | |
| "num_input_tokens_seen": 10144972800, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.2606518313733997, | |
| "grad_norm": 0.7395833134651184, | |
| "learning_rate": 0.0005920953951076772, | |
| "loss": 12.1621, | |
| "num_input_tokens_seen": 10158080000, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 0.2609881563171073, | |
| "grad_norm": 0.6630483865737915, | |
| "learning_rate": 0.000592628936065622, | |
| "loss": 12.1786, | |
| "num_input_tokens_seen": 10171187200, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 0.2613244812608149, | |
| "grad_norm": 0.7242410778999329, | |
| "learning_rate": 0.0005931440597913479, | |
| "loss": 12.1812, | |
| "num_input_tokens_seen": 10184294400, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 0.2616608062045226, | |
| "grad_norm": 0.6953455805778503, | |
| "learning_rate": 0.0005936407338644336, | |
| "loss": 12.1973, | |
| "num_input_tokens_seen": 10197401600, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 0.2619971311482302, | |
| "grad_norm": 0.7216541171073914, | |
| "learning_rate": 0.0005941189270256271, | |
| "loss": 12.1768, | |
| "num_input_tokens_seen": 10210508800, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 0.2623334560919378, | |
| "grad_norm": 0.7473825216293335, | |
| "learning_rate": 0.000594578609178812, | |
| "loss": 12.1727, | |
| "num_input_tokens_seen": 10223616000, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.2623334560919378, | |
| "eval_loss": 2.948453187942505, | |
| "eval_runtime": 144.8455, | |
| "eval_samples_per_second": 34.52, | |
| "eval_steps_per_second": 8.63, | |
| "num_input_tokens_seen": 10223616000, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.2626697810356454, | |
| "grad_norm": 0.6958849430084229, | |
| "learning_rate": 0.0005950197513929021, | |
| "loss": 12.1838, | |
| "num_input_tokens_seen": 10236723200, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 0.26300610597935303, | |
| "grad_norm": 0.719382643699646, | |
| "learning_rate": 0.0005954423259036624, | |
| "loss": 12.1698, | |
| "num_input_tokens_seen": 10249830400, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 0.26334243092306064, | |
| "grad_norm": 0.7826195955276489, | |
| "learning_rate": 0.0005958463061154559, | |
| "loss": 12.1541, | |
| "num_input_tokens_seen": 10262937600, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 0.26367875586676826, | |
| "grad_norm": 0.7032487988471985, | |
| "learning_rate": 0.0005962316666029183, | |
| "loss": 12.2026, | |
| "num_input_tokens_seen": 10276044800, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 0.26401508081047587, | |
| "grad_norm": 0.6985056400299072, | |
| "learning_rate": 0.0005965983831125571, | |
| "loss": 12.0952, | |
| "num_input_tokens_seen": 10289152000, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 0.2643514057541835, | |
| "grad_norm": 0.7609641551971436, | |
| "learning_rate": 0.0005969464325642798, | |
| "loss": 12.1751, | |
| "num_input_tokens_seen": 10302259200, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 0.2646877306978911, | |
| "grad_norm": 0.7543072700500488, | |
| "learning_rate": 0.000597275793052844, | |
| "loss": 12.1587, | |
| "num_input_tokens_seen": 10315366400, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 0.2650240556415987, | |
| "grad_norm": 0.706814706325531, | |
| "learning_rate": 0.0005975864438492385, | |
| "loss": 12.1217, | |
| "num_input_tokens_seen": 10328473600, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 0.2653603805853063, | |
| "grad_norm": 0.6812881231307983, | |
| "learning_rate": 0.0005978783654019865, | |
| "loss": 12.1758, | |
| "num_input_tokens_seen": 10341580800, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 0.2656967055290139, | |
| "grad_norm": 0.7161454558372498, | |
| "learning_rate": 0.0005981515393383762, | |
| "loss": 12.122, | |
| "num_input_tokens_seen": 10354688000, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.2656967055290139, | |
| "eval_loss": 2.9451985359191895, | |
| "eval_runtime": 142.7964, | |
| "eval_samples_per_second": 35.015, | |
| "eval_steps_per_second": 8.754, | |
| "num_input_tokens_seen": 10354688000, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.26603303047272153, | |
| "grad_norm": 0.7173239588737488, | |
| "learning_rate": 0.0005984059484656179, | |
| "loss": 12.1612, | |
| "num_input_tokens_seen": 10367795200, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 0.26636935541642914, | |
| "grad_norm": 0.6520034670829773, | |
| "learning_rate": 0.0005986415767719254, | |
| "loss": 12.1175, | |
| "num_input_tokens_seen": 10380902400, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 0.26670568036013675, | |
| "grad_norm": 0.7121595144271851, | |
| "learning_rate": 0.0005988584094275236, | |
| "loss": 12.1337, | |
| "num_input_tokens_seen": 10394009600, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 0.26704200530384437, | |
| "grad_norm": 0.6799529790878296, | |
| "learning_rate": 0.0005990564327855826, | |
| "loss": 12.1631, | |
| "num_input_tokens_seen": 10407116800, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 0.267378330247552, | |
| "grad_norm": 0.7196834683418274, | |
| "learning_rate": 0.000599235634383076, | |
| "loss": 12.2017, | |
| "num_input_tokens_seen": 10420224000, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 0.2677146551912596, | |
| "grad_norm": 0.7003572583198547, | |
| "learning_rate": 0.0005993960029415653, | |
| "loss": 12.1466, | |
| "num_input_tokens_seen": 10433331200, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 0.2680509801349672, | |
| "grad_norm": 0.7599800229072571, | |
| "learning_rate": 0.0005995375283679099, | |
| "loss": 12.149, | |
| "num_input_tokens_seen": 10446438400, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 0.2683873050786748, | |
| "grad_norm": 0.6968846321105957, | |
| "learning_rate": 0.0005996602017549024, | |
| "loss": 12.1406, | |
| "num_input_tokens_seen": 10459545600, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 0.2687236300223824, | |
| "grad_norm": 0.7128781080245972, | |
| "learning_rate": 0.0005997640153818289, | |
| "loss": 12.186, | |
| "num_input_tokens_seen": 10472652800, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 0.26905995496609003, | |
| "grad_norm": 0.6576798558235168, | |
| "learning_rate": 0.0005998489627149555, | |
| "loss": 12.1667, | |
| "num_input_tokens_seen": 10485760000, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.26905995496609003, | |
| "eval_loss": 2.9424450397491455, | |
| "eval_runtime": 144.1047, | |
| "eval_samples_per_second": 34.697, | |
| "eval_steps_per_second": 8.674, | |
| "num_input_tokens_seen": 10485760000, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.26939627990979764, | |
| "grad_norm": 0.7382534742355347, | |
| "learning_rate": 0.0005999150384079386, | |
| "loss": 12.1596, | |
| "num_input_tokens_seen": 10498867200, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 0.26973260485350525, | |
| "grad_norm": 0.6991714835166931, | |
| "learning_rate": 0.0005999622383021625, | |
| "loss": 12.1364, | |
| "num_input_tokens_seen": 10511974400, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 0.27006892979721286, | |
| "grad_norm": 0.6768296957015991, | |
| "learning_rate": 0.0005999905594269998, | |
| "loss": 12.119, | |
| "num_input_tokens_seen": 10525081600, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 0.2704052547409205, | |
| "grad_norm": 0.6475286483764648, | |
| "learning_rate": 0.0006, | |
| "loss": 12.1936, | |
| "num_input_tokens_seen": 10538188800, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 0.2707415796846281, | |
| "grad_norm": 0.7031168341636658, | |
| "learning_rate": 0.0005999905594269998, | |
| "loss": 12.1317, | |
| "num_input_tokens_seen": 10551296000, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 0.2710779046283357, | |
| "grad_norm": 0.6848679780960083, | |
| "learning_rate": 0.0005999622383021625, | |
| "loss": 12.1177, | |
| "num_input_tokens_seen": 10564403200, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 0.2714142295720433, | |
| "grad_norm": 0.7390510439872742, | |
| "learning_rate": 0.0005999150384079387, | |
| "loss": 12.1262, | |
| "num_input_tokens_seen": 10577510400, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 0.2717505545157509, | |
| "grad_norm": 0.8185621500015259, | |
| "learning_rate": 0.0005998489627149555, | |
| "loss": 12.1441, | |
| "num_input_tokens_seen": 10590617600, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 0.27208687945945853, | |
| "grad_norm": 0.745233952999115, | |
| "learning_rate": 0.000599764015381829, | |
| "loss": 12.1814, | |
| "num_input_tokens_seen": 10603724800, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 0.27242320440316614, | |
| "grad_norm": 0.6643325686454773, | |
| "learning_rate": 0.0005996602017549024, | |
| "loss": 12.1603, | |
| "num_input_tokens_seen": 10616832000, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.27242320440316614, | |
| "eval_loss": 2.940908193588257, | |
| "eval_runtime": 144.742, | |
| "eval_samples_per_second": 34.544, | |
| "eval_steps_per_second": 8.636, | |
| "num_input_tokens_seen": 10616832000, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.27275952934687375, | |
| "grad_norm": 0.7548837065696716, | |
| "learning_rate": 0.0005995375283679099, | |
| "loss": 12.1047, | |
| "num_input_tokens_seen": 10629939200, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 0.27309585429058136, | |
| "grad_norm": 0.728705108165741, | |
| "learning_rate": 0.0005993960029415653, | |
| "loss": 12.1574, | |
| "num_input_tokens_seen": 10643046400, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 0.273432179234289, | |
| "grad_norm": 0.7318731546401978, | |
| "learning_rate": 0.0005992356343830761, | |
| "loss": 12.1235, | |
| "num_input_tokens_seen": 10656153600, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 0.2737685041779966, | |
| "grad_norm": 0.681523859500885, | |
| "learning_rate": 0.0005990564327855827, | |
| "loss": 12.1465, | |
| "num_input_tokens_seen": 10669260800, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 0.27410482912170425, | |
| "grad_norm": 0.8111559152603149, | |
| "learning_rate": 0.0005988584094275236, | |
| "loss": 12.1384, | |
| "num_input_tokens_seen": 10682368000, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 0.27444115406541186, | |
| "grad_norm": 0.7197436690330505, | |
| "learning_rate": 0.0005986415767719254, | |
| "loss": 12.1005, | |
| "num_input_tokens_seen": 10695475200, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 0.2747774790091195, | |
| "grad_norm": 0.7043651938438416, | |
| "learning_rate": 0.0005984059484656179, | |
| "loss": 12.1248, | |
| "num_input_tokens_seen": 10708582400, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 0.2751138039528271, | |
| "grad_norm": 0.669236958026886, | |
| "learning_rate": 0.0005981515393383762, | |
| "loss": 12.1399, | |
| "num_input_tokens_seen": 10721689600, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 0.2754501288965347, | |
| "grad_norm": 0.7053328156471252, | |
| "learning_rate": 0.0005978783654019865, | |
| "loss": 12.1099, | |
| "num_input_tokens_seen": 10734796800, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 0.2757864538402423, | |
| "grad_norm": 0.7475985288619995, | |
| "learning_rate": 0.0005975864438492385, | |
| "loss": 12.1097, | |
| "num_input_tokens_seen": 10747904000, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.2757864538402423, | |
| "eval_loss": 2.937664747238159, | |
| "eval_runtime": 144.1013, | |
| "eval_samples_per_second": 34.698, | |
| "eval_steps_per_second": 8.674, | |
| "num_input_tokens_seen": 10747904000, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.2761227787839499, | |
| "grad_norm": 0.744483232498169, | |
| "learning_rate": 0.000597275793052844, | |
| "loss": 12.1037, | |
| "num_input_tokens_seen": 10761011200, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 0.27645910372765753, | |
| "grad_norm": 0.6690404415130615, | |
| "learning_rate": 0.0005969464325642798, | |
| "loss": 12.0967, | |
| "num_input_tokens_seen": 10774118400, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 0.27679542867136514, | |
| "grad_norm": 0.6754409670829773, | |
| "learning_rate": 0.0005965983831125574, | |
| "loss": 12.1495, | |
| "num_input_tokens_seen": 10787225600, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 0.27713175361507275, | |
| "grad_norm": 0.7471763491630554, | |
| "learning_rate": 0.0005962316666029183, | |
| "loss": 12.1021, | |
| "num_input_tokens_seen": 10800332800, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 0.27746807855878036, | |
| "grad_norm": 0.6677432656288147, | |
| "learning_rate": 0.0005958463061154559, | |
| "loss": 12.0978, | |
| "num_input_tokens_seen": 10813440000, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 0.277804403502488, | |
| "grad_norm": 0.6972395777702332, | |
| "learning_rate": 0.0005954423259036624, | |
| "loss": 12.0866, | |
| "num_input_tokens_seen": 10826547200, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 0.2781407284461956, | |
| "grad_norm": 0.6901528835296631, | |
| "learning_rate": 0.0005950197513929021, | |
| "loss": 12.1149, | |
| "num_input_tokens_seen": 10839654400, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 0.2784770533899032, | |
| "grad_norm": 0.7258496880531311, | |
| "learning_rate": 0.0005945786091788119, | |
| "loss": 12.0969, | |
| "num_input_tokens_seen": 10852761600, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 0.2788133783336108, | |
| "grad_norm": 0.6775540113449097, | |
| "learning_rate": 0.0005941189270256271, | |
| "loss": 12.1099, | |
| "num_input_tokens_seen": 10865868800, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 0.2791497032773184, | |
| "grad_norm": 0.7190734148025513, | |
| "learning_rate": 0.0005936407338644335, | |
| "loss": 12.1215, | |
| "num_input_tokens_seen": 10878976000, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.2791497032773184, | |
| "eval_loss": 2.9347798824310303, | |
| "eval_runtime": 144.156, | |
| "eval_samples_per_second": 34.685, | |
| "eval_steps_per_second": 8.671, | |
| "num_input_tokens_seen": 10878976000, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.27948602822102603, | |
| "grad_norm": 0.7368255257606506, | |
| "learning_rate": 0.0005931440597913479, | |
| "loss": 12.0951, | |
| "num_input_tokens_seen": 10892083200, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 0.27982235316473364, | |
| "grad_norm": 0.6617956161499023, | |
| "learning_rate": 0.0005926289360656221, | |
| "loss": 12.1469, | |
| "num_input_tokens_seen": 10905190400, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 0.28015867810844125, | |
| "grad_norm": 0.729915201663971, | |
| "learning_rate": 0.0005920953951076773, | |
| "loss": 12.0647, | |
| "num_input_tokens_seen": 10918297600, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 0.28049500305214886, | |
| "grad_norm": 0.6950346827507019, | |
| "learning_rate": 0.0005915434704970625, | |
| "loss": 12.0908, | |
| "num_input_tokens_seen": 10931404800, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 0.2808313279958565, | |
| "grad_norm": 0.7097339630126953, | |
| "learning_rate": 0.0005909731969703416, | |
| "loss": 12.1064, | |
| "num_input_tokens_seen": 10944512000, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 0.2811676529395641, | |
| "grad_norm": 0.6792251467704773, | |
| "learning_rate": 0.0005903846104189068, | |
| "loss": 12.1007, | |
| "num_input_tokens_seen": 10957619200, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 0.2815039778832717, | |
| "grad_norm": 0.7234287858009338, | |
| "learning_rate": 0.0005897777478867204, | |
| "loss": 12.0688, | |
| "num_input_tokens_seen": 10970726400, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 0.2818403028269793, | |
| "grad_norm": 0.687317430973053, | |
| "learning_rate": 0.0005891526475679825, | |
| "loss": 12.0511, | |
| "num_input_tokens_seen": 10983833600, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 0.2821766277706869, | |
| "grad_norm": 0.6987143754959106, | |
| "learning_rate": 0.0005885093488047278, | |
| "loss": 12.0861, | |
| "num_input_tokens_seen": 10996940800, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 0.2825129527143945, | |
| "grad_norm": 0.7171245217323303, | |
| "learning_rate": 0.0005878478920843492, | |
| "loss": 12.0889, | |
| "num_input_tokens_seen": 11010048000, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.2825129527143945, | |
| "eval_loss": 2.927813768386841, | |
| "eval_runtime": 143.2851, | |
| "eval_samples_per_second": 34.895, | |
| "eval_steps_per_second": 8.724, | |
| "num_input_tokens_seen": 11010048000, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.28284927765810214, | |
| "grad_norm": 0.6817448735237122, | |
| "learning_rate": 0.0005871683190370497, | |
| "loss": 12.0507, | |
| "num_input_tokens_seen": 11023155200, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 0.28318560260180975, | |
| "grad_norm": 1.443415641784668, | |
| "learning_rate": 0.0005864706724332221, | |
| "loss": 12.0804, | |
| "num_input_tokens_seen": 11036262400, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 0.28352192754551736, | |
| "grad_norm": 0.7497735619544983, | |
| "learning_rate": 0.0005857549961807582, | |
| "loss": 12.1135, | |
| "num_input_tokens_seen": 11049369600, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 0.28385825248922497, | |
| "grad_norm": 0.7141171097755432, | |
| "learning_rate": 0.0005850213353222835, | |
| "loss": 12.0707, | |
| "num_input_tokens_seen": 11062476800, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 0.2841945774329326, | |
| "grad_norm": 0.6800997257232666, | |
| "learning_rate": 0.0005842697360323246, | |
| "loss": 12.0946, | |
| "num_input_tokens_seen": 11075584000, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 0.2845309023766402, | |
| "grad_norm": 0.6729973554611206, | |
| "learning_rate": 0.0005835002456144005, | |
| "loss": 12.0882, | |
| "num_input_tokens_seen": 11088691200, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 0.2848672273203478, | |
| "grad_norm": 0.715886116027832, | |
| "learning_rate": 0.0005827129124980481, | |
| "loss": 12.0713, | |
| "num_input_tokens_seen": 11101798400, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 0.2852035522640554, | |
| "grad_norm": 0.7392980456352234, | |
| "learning_rate": 0.0005819077862357724, | |
| "loss": 12.0934, | |
| "num_input_tokens_seen": 11114905600, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 0.285539877207763, | |
| "grad_norm": 0.7118540406227112, | |
| "learning_rate": 0.0005810849174999285, | |
| "loss": 12.0531, | |
| "num_input_tokens_seen": 11128012800, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 0.28587620215147064, | |
| "grad_norm": 0.6643871665000916, | |
| "learning_rate": 0.000580244358079532, | |
| "loss": 12.0812, | |
| "num_input_tokens_seen": 11141120000, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.28587620215147064, | |
| "eval_loss": 2.9250741004943848, | |
| "eval_runtime": 143.6479, | |
| "eval_samples_per_second": 34.807, | |
| "eval_steps_per_second": 8.702, | |
| "num_input_tokens_seen": 11141120000, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.2862125270951783, | |
| "grad_norm": 0.7261589169502258, | |
| "learning_rate": 0.0005793861608770001, | |
| "loss": 12.0856, | |
| "num_input_tokens_seen": 11154227200, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 0.2865488520388859, | |
| "grad_norm": 0.7352684140205383, | |
| "learning_rate": 0.0005785103799048218, | |
| "loss": 12.094, | |
| "num_input_tokens_seen": 11167334400, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 0.2868851769825935, | |
| "grad_norm": 0.650610089302063, | |
| "learning_rate": 0.0005776170702821582, | |
| "loss": 12.0796, | |
| "num_input_tokens_seen": 11180441600, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 0.28722150192630114, | |
| "grad_norm": 0.6917529106140137, | |
| "learning_rate": 0.0005767062882313743, | |
| "loss": 12.0511, | |
| "num_input_tokens_seen": 11193548800, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 0.28755782687000875, | |
| "grad_norm": 0.8611562252044678, | |
| "learning_rate": 0.0005757780910744997, | |
| "loss": 12.0772, | |
| "num_input_tokens_seen": 11206656000, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 0.28789415181371636, | |
| "grad_norm": 0.7321364283561707, | |
| "learning_rate": 0.0005748325372296208, | |
| "loss": 12.0432, | |
| "num_input_tokens_seen": 11219763200, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 0.28823047675742397, | |
| "grad_norm": 0.6974388957023621, | |
| "learning_rate": 0.0005738696862072053, | |
| "loss": 12.0408, | |
| "num_input_tokens_seen": 11232870400, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 0.2885668017011316, | |
| "grad_norm": 0.6981905102729797, | |
| "learning_rate": 0.0005728895986063554, | |
| "loss": 12.0419, | |
| "num_input_tokens_seen": 11245977600, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 0.2889031266448392, | |
| "grad_norm": 0.7019402384757996, | |
| "learning_rate": 0.000571892336110995, | |
| "loss": 12.0206, | |
| "num_input_tokens_seen": 11259084800, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 0.2892394515885468, | |
| "grad_norm": 0.7176699042320251, | |
| "learning_rate": 0.0005708779614859863, | |
| "loss": 12.0641, | |
| "num_input_tokens_seen": 11272192000, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.2892394515885468, | |
| "eval_loss": 2.9219655990600586, | |
| "eval_runtime": 144.3813, | |
| "eval_samples_per_second": 34.631, | |
| "eval_steps_per_second": 8.658, | |
| "num_input_tokens_seen": 11272192000, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.2895757765322544, | |
| "grad_norm": 0.6620699763298035, | |
| "learning_rate": 0.000569846538573181, | |
| "loss": 12.0268, | |
| "num_input_tokens_seen": 11285299200, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 0.289912101475962, | |
| "grad_norm": 0.732656717300415, | |
| "learning_rate": 0.0005687981322874007, | |
| "loss": 12.0479, | |
| "num_input_tokens_seen": 11298406400, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 0.29024842641966964, | |
| "grad_norm": 0.6901125907897949, | |
| "learning_rate": 0.0005677328086123529, | |
| "loss": 12.0703, | |
| "num_input_tokens_seen": 11311513600, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 0.29058475136337725, | |
| "grad_norm": 0.6816314458847046, | |
| "learning_rate": 0.0005666506345964772, | |
| "loss": 12.0966, | |
| "num_input_tokens_seen": 11324620800, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 0.29092107630708486, | |
| "grad_norm": 0.6607284545898438, | |
| "learning_rate": 0.000565551678348725, | |
| "loss": 12.0123, | |
| "num_input_tokens_seen": 11337728000, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 0.29125740125079247, | |
| "grad_norm": 0.6880225539207458, | |
| "learning_rate": 0.0005644360090342745, | |
| "loss": 12.0421, | |
| "num_input_tokens_seen": 11350835200, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 0.2915937261945001, | |
| "grad_norm": 0.6576426029205322, | |
| "learning_rate": 0.0005633036968701766, | |
| "loss": 12.0355, | |
| "num_input_tokens_seen": 11363942400, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 0.2919300511382077, | |
| "grad_norm": 0.6806447505950928, | |
| "learning_rate": 0.0005621548131209353, | |
| "loss": 12.0741, | |
| "num_input_tokens_seen": 11377049600, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 0.2922663760819153, | |
| "grad_norm": 0.662423849105835, | |
| "learning_rate": 0.0005609894300940238, | |
| "loss": 12.0367, | |
| "num_input_tokens_seen": 11390156800, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 0.2926027010256229, | |
| "grad_norm": 0.7388314604759216, | |
| "learning_rate": 0.0005598076211353315, | |
| "loss": 12.0113, | |
| "num_input_tokens_seen": 11403264000, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.2926027010256229, | |
| "eval_loss": 2.9169106483459473, | |
| "eval_runtime": 144.3511, | |
| "eval_samples_per_second": 34.638, | |
| "eval_steps_per_second": 8.659, | |
| "num_input_tokens_seen": 11403264000, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.2929390259693305, | |
| "grad_norm": 0.752152681350708, | |
| "learning_rate": 0.0005586094606245508, | |
| "loss": 12.0399, | |
| "num_input_tokens_seen": 11416371200, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 0.29327535091303814, | |
| "grad_norm": 0.6547142863273621, | |
| "learning_rate": 0.000557395023970493, | |
| "loss": 12.0508, | |
| "num_input_tokens_seen": 11429478400, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 0.29361167585674575, | |
| "grad_norm": 0.7126719951629639, | |
| "learning_rate": 0.0005561643876063445, | |
| "loss": 12.0612, | |
| "num_input_tokens_seen": 11442585600, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 0.29394800080045336, | |
| "grad_norm": 0.7018846869468689, | |
| "learning_rate": 0.0005549176289848543, | |
| "loss": 12.0491, | |
| "num_input_tokens_seen": 11455692800, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 0.29428432574416097, | |
| "grad_norm": 0.6944059133529663, | |
| "learning_rate": 0.0005536548265734613, | |
| "loss": 12.0484, | |
| "num_input_tokens_seen": 11468800000, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 0.2946206506878686, | |
| "grad_norm": 0.6890963912010193, | |
| "learning_rate": 0.0005523760598493542, | |
| "loss": 12.0126, | |
| "num_input_tokens_seen": 11481907200, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 0.2949569756315762, | |
| "grad_norm": 0.6812265515327454, | |
| "learning_rate": 0.0005510814092944709, | |
| "loss": 12.0461, | |
| "num_input_tokens_seen": 11495014400, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 0.2952933005752838, | |
| "grad_norm": 0.7193289995193481, | |
| "learning_rate": 0.0005497709563904316, | |
| "loss": 11.9808, | |
| "num_input_tokens_seen": 11508121600, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 0.2956296255189914, | |
| "grad_norm": 0.7270051836967468, | |
| "learning_rate": 0.0005484447836134113, | |
| "loss": 12.0367, | |
| "num_input_tokens_seen": 11521228800, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 0.295965950462699, | |
| "grad_norm": 0.6606201529502869, | |
| "learning_rate": 0.0005471029744289497, | |
| "loss": 12.0095, | |
| "num_input_tokens_seen": 11534336000, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.295965950462699, | |
| "eval_loss": 2.9146511554718018, | |
| "eval_runtime": 143.7716, | |
| "eval_samples_per_second": 34.777, | |
| "eval_steps_per_second": 8.694, | |
| "num_input_tokens_seen": 11534336000, | |
| "step": 44000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 60000, | |
| "num_input_tokens_seen": 11534336000, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.34987821252608e+18, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |