{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.31005163552237736,
  "eval_steps": 500,
  "global_step": 65000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00023850125809413643,
      "grad_norm": 1.5476292371749878,
      "learning_rate": 0.0001,
      "loss": 9.8285,
      "num_input_tokens_seen": 13107200,
      "step": 50
    },
    {
      "epoch": 0.00047700251618827287,
      "grad_norm": 0.47720426321029663,
      "learning_rate": 0.0002,
      "loss": 7.964,
      "num_input_tokens_seen": 26214400,
      "step": 100
    },
    {
      "epoch": 0.0007155037742824094,
      "grad_norm": 0.8443030714988708,
      "learning_rate": 0.0003,
      "loss": 7.0452,
      "num_input_tokens_seen": 39321600,
      "step": 150
    },
    {
      "epoch": 0.0009540050323765457,
      "grad_norm": 0.5895723104476929,
      "learning_rate": 0.0004,
      "loss": 6.355,
      "num_input_tokens_seen": 52428800,
      "step": 200
    },
    {
      "epoch": 0.0011925062904706823,
      "grad_norm": 0.8343789577484131,
      "learning_rate": 0.0005,
      "loss": 5.8716,
      "num_input_tokens_seen": 65536000,
      "step": 250
    },
    {
      "epoch": 0.0014310075485648188,
      "grad_norm": 0.5747953057289124,
      "learning_rate": 0.0006,
      "loss": 5.5086,
      "num_input_tokens_seen": 78643200,
      "step": 300
    },
    {
      "epoch": 0.0016695088066589552,
      "grad_norm": 0.8383421301841736,
      "learning_rate": 0.0007,
      "loss": 5.2217,
      "num_input_tokens_seen": 91750400,
      "step": 350
    },
    {
      "epoch": 0.0019080100647530915,
      "grad_norm": 0.5696113109588623,
      "learning_rate": 0.0008,
      "loss": 4.9683,
      "num_input_tokens_seen": 104857600,
      "step": 400
    },
    {
      "epoch": 0.002146511322847228,
      "grad_norm": 0.5431691408157349,
      "learning_rate": 0.0009000000000000001,
      "loss": 4.7629,
      "num_input_tokens_seen": 117964800,
      "step": 450
    },
    {
      "epoch": 0.0023850125809413646,
      "grad_norm": 0.4571855664253235,
      "learning_rate": 0.001,
      "loss": 4.5284,
      "num_input_tokens_seen": 131072000,
      "step": 500
    },
    {
      "epoch": 0.0023850125809413646,
      "eval_loss": 4.309167385101318,
      "eval_runtime": 53.3891,
      "eval_samples_per_second": 93.652,
      "eval_steps_per_second": 23.413,
      "num_input_tokens_seen": 131072000,
      "step": 500
    },
    {
      "epoch": 0.002623513839035501,
      "grad_norm": 0.43464773893356323,
      "learning_rate": 0.001,
      "loss": 4.3379,
      "num_input_tokens_seen": 144179200,
      "step": 550
    },
    {
      "epoch": 0.0028620150971296375,
      "grad_norm": 0.49611660838127136,
      "learning_rate": 0.001,
      "loss": 4.1712,
      "num_input_tokens_seen": 157286400,
      "step": 600
    },
    {
      "epoch": 0.0031005163552237738,
      "grad_norm": 0.4060957729816437,
      "learning_rate": 0.001,
      "loss": 4.0436,
      "num_input_tokens_seen": 170393600,
      "step": 650
    },
    {
      "epoch": 0.0033390176133179105,
      "grad_norm": 0.37300577759742737,
      "learning_rate": 0.001,
      "loss": 3.9582,
      "num_input_tokens_seen": 183500800,
      "step": 700
    },
    {
      "epoch": 0.0035775188714120467,
      "grad_norm": 0.4117021858692169,
      "learning_rate": 0.001,
      "loss": 3.8674,
      "num_input_tokens_seen": 196608000,
      "step": 750
    },
    {
      "epoch": 0.003816020129506183,
      "grad_norm": 0.3335980772972107,
      "learning_rate": 0.001,
      "loss": 3.8031,
      "num_input_tokens_seen": 209715200,
      "step": 800
    },
    {
      "epoch": 0.004054521387600319,
      "grad_norm": 0.35943159461021423,
      "learning_rate": 0.001,
      "loss": 3.7534,
      "num_input_tokens_seen": 222822400,
      "step": 850
    },
    {
      "epoch": 0.004293022645694456,
      "grad_norm": 0.40000948309898376,
      "learning_rate": 0.001,
      "loss": 3.6867,
      "num_input_tokens_seen": 235929600,
      "step": 900
    },
    {
      "epoch": 0.0045315239037885926,
      "grad_norm": 0.3165877163410187,
      "learning_rate": 0.001,
      "loss": 3.6565,
      "num_input_tokens_seen": 249036800,
      "step": 950
    },
    {
      "epoch": 0.004770025161882729,
      "grad_norm": 0.3687070906162262,
      "learning_rate": 0.001,
      "loss": 3.6005,
      "num_input_tokens_seen": 262144000,
      "step": 1000
    },
    {
      "epoch": 0.004770025161882729,
      "eval_loss": 3.4853296279907227,
      "eval_runtime": 52.477,
      "eval_samples_per_second": 95.28,
      "eval_steps_per_second": 23.82,
      "num_input_tokens_seen": 262144000,
      "step": 1000
    },
    {
      "epoch": 0.005008526419976865,
      "grad_norm": 0.32389721274375916,
      "learning_rate": 0.001,
      "loss": 3.5663,
      "num_input_tokens_seen": 275251200,
      "step": 1050
    },
    {
      "epoch": 0.005247027678071002,
      "grad_norm": 0.3202049434185028,
      "learning_rate": 0.001,
      "loss": 3.5376,
      "num_input_tokens_seen": 288358400,
      "step": 1100
    },
    {
      "epoch": 0.005485528936165138,
      "grad_norm": 0.30287981033325195,
      "learning_rate": 0.001,
      "loss": 3.5135,
      "num_input_tokens_seen": 301465600,
      "step": 1150
    },
    {
      "epoch": 0.005724030194259275,
      "grad_norm": 0.3624540865421295,
      "learning_rate": 0.001,
      "loss": 3.4814,
      "num_input_tokens_seen": 314572800,
      "step": 1200
    },
    {
      "epoch": 0.005962531452353411,
      "grad_norm": 0.30017992854118347,
      "learning_rate": 0.001,
      "loss": 3.4476,
      "num_input_tokens_seen": 327680000,
      "step": 1250
    },
    {
      "epoch": 0.0062010327104475476,
      "grad_norm": 0.3169330060482025,
      "learning_rate": 0.001,
      "loss": 3.4179,
      "num_input_tokens_seen": 340787200,
      "step": 1300
    },
    {
      "epoch": 0.006439533968541684,
      "grad_norm": 0.2730589210987091,
      "learning_rate": 0.001,
      "loss": 3.4074,
      "num_input_tokens_seen": 353894400,
      "step": 1350
    },
    {
      "epoch": 0.006678035226635821,
      "grad_norm": 0.2927146553993225,
      "learning_rate": 0.001,
      "loss": 3.3757,
      "num_input_tokens_seen": 367001600,
      "step": 1400
    },
    {
      "epoch": 0.006916536484729957,
      "grad_norm": 0.34230080246925354,
      "learning_rate": 0.001,
      "loss": 3.3502,
      "num_input_tokens_seen": 380108800,
      "step": 1450
    },
    {
      "epoch": 0.007155037742824093,
      "grad_norm": 0.30472344160079956,
      "learning_rate": 0.001,
      "loss": 3.3639,
      "num_input_tokens_seen": 393216000,
      "step": 1500
    },
    {
      "epoch": 0.007155037742824093,
      "eval_loss": 3.2486374378204346,
      "eval_runtime": 52.482,
      "eval_samples_per_second": 95.271,
      "eval_steps_per_second": 23.818,
      "num_input_tokens_seen": 393216000,
      "step": 1500
    },
    {
      "epoch": 0.00739353900091823,
      "grad_norm": 0.26124337315559387,
      "learning_rate": 0.001,
      "loss": 3.3521,
      "num_input_tokens_seen": 406323200,
      "step": 1550
    },
    {
      "epoch": 0.007632040259012366,
      "grad_norm": 0.29117754101753235,
      "learning_rate": 0.001,
      "loss": 3.315,
      "num_input_tokens_seen": 419430400,
      "step": 1600
    },
    {
      "epoch": 0.007870541517106503,
      "grad_norm": 0.24080802500247955,
      "learning_rate": 0.001,
      "loss": 3.3103,
      "num_input_tokens_seen": 432537600,
      "step": 1650
    },
    {
      "epoch": 0.008109042775200638,
      "grad_norm": 0.29982003569602966,
      "learning_rate": 0.001,
      "loss": 3.2926,
      "num_input_tokens_seen": 445644800,
      "step": 1700
    },
    {
      "epoch": 0.008347544033294775,
      "grad_norm": 0.26795274019241333,
      "learning_rate": 0.001,
      "loss": 3.2843,
      "num_input_tokens_seen": 458752000,
      "step": 1750
    },
    {
      "epoch": 0.008586045291388912,
      "grad_norm": 0.252774715423584,
      "learning_rate": 0.001,
      "loss": 3.274,
      "num_input_tokens_seen": 471859200,
      "step": 1800
    },
    {
      "epoch": 0.008824546549483048,
      "grad_norm": 0.25432145595550537,
      "learning_rate": 0.001,
      "loss": 3.2533,
      "num_input_tokens_seen": 484966400,
      "step": 1850
    },
    {
      "epoch": 0.009063047807577185,
      "grad_norm": 0.25918108224868774,
      "learning_rate": 0.001,
      "loss": 3.2501,
      "num_input_tokens_seen": 498073600,
      "step": 1900
    },
    {
      "epoch": 0.009301549065671322,
      "grad_norm": 0.2482348382472992,
      "learning_rate": 0.001,
      "loss": 3.2541,
      "num_input_tokens_seen": 511180800,
      "step": 1950
    },
    {
      "epoch": 0.009540050323765458,
      "grad_norm": 0.2615273594856262,
      "learning_rate": 0.001,
      "loss": 3.2218,
      "num_input_tokens_seen": 524288000,
      "step": 2000
    },
    {
      "epoch": 0.009540050323765458,
      "eval_loss": 3.1193039417266846,
      "eval_runtime": 52.7955,
      "eval_samples_per_second": 94.705,
      "eval_steps_per_second": 23.676,
      "num_input_tokens_seen": 524288000,
      "step": 2000
    },
    {
      "epoch": 0.009778551581859595,
      "grad_norm": 0.2637729048728943,
      "learning_rate": 0.001,
      "loss": 3.2285,
      "num_input_tokens_seen": 537395200,
      "step": 2050
    },
    {
      "epoch": 0.01001705283995373,
      "grad_norm": 0.23936080932617188,
      "learning_rate": 0.001,
      "loss": 3.2119,
      "num_input_tokens_seen": 550502400,
      "step": 2100
    },
    {
      "epoch": 0.010255554098047867,
      "grad_norm": 0.2469020038843155,
      "learning_rate": 0.001,
      "loss": 3.2021,
      "num_input_tokens_seen": 563609600,
      "step": 2150
    },
    {
      "epoch": 0.010494055356142003,
      "grad_norm": 0.2304004430770874,
      "learning_rate": 0.001,
      "loss": 3.1874,
      "num_input_tokens_seen": 576716800,
      "step": 2200
    },
    {
      "epoch": 0.01073255661423614,
      "grad_norm": 0.232864648103714,
      "learning_rate": 0.001,
      "loss": 3.1897,
      "num_input_tokens_seen": 589824000,
      "step": 2250
    },
    {
      "epoch": 0.010971057872330277,
      "grad_norm": 0.23161470890045166,
      "learning_rate": 0.001,
      "loss": 3.1689,
      "num_input_tokens_seen": 602931200,
      "step": 2300
    },
    {
      "epoch": 0.011209559130424413,
      "grad_norm": 0.20868408679962158,
      "learning_rate": 0.001,
      "loss": 3.1615,
      "num_input_tokens_seen": 616038400,
      "step": 2350
    },
    {
      "epoch": 0.01144806038851855,
      "grad_norm": 0.23374608159065247,
      "learning_rate": 0.001,
      "loss": 3.1556,
      "num_input_tokens_seen": 629145600,
      "step": 2400
    },
    {
      "epoch": 0.011686561646612685,
      "grad_norm": 0.21716611087322235,
      "learning_rate": 0.001,
      "loss": 3.1463,
      "num_input_tokens_seen": 642252800,
      "step": 2450
    },
    {
      "epoch": 0.011925062904706822,
      "grad_norm": 0.23689670860767365,
      "learning_rate": 0.001,
      "loss": 3.1433,
      "num_input_tokens_seen": 655360000,
      "step": 2500
    },
    {
      "epoch": 0.011925062904706822,
      "eval_loss": 3.040046215057373,
      "eval_runtime": 52.9109,
      "eval_samples_per_second": 94.498,
      "eval_steps_per_second": 23.625,
      "num_input_tokens_seen": 655360000,
      "step": 2500
    },
    {
      "epoch": 0.012163564162800958,
      "grad_norm": 0.2245575189590454,
      "learning_rate": 0.001,
      "loss": 3.1445,
      "num_input_tokens_seen": 668467200,
      "step": 2550
    },
    {
      "epoch": 0.012402065420895095,
      "grad_norm": 0.20992259681224823,
      "learning_rate": 0.001,
      "loss": 3.1447,
      "num_input_tokens_seen": 681574400,
      "step": 2600
    },
    {
      "epoch": 0.012640566678989232,
      "grad_norm": 0.21792201697826385,
      "learning_rate": 0.001,
      "loss": 3.1323,
      "num_input_tokens_seen": 694681600,
      "step": 2650
    },
    {
      "epoch": 0.012879067937083368,
      "grad_norm": 0.243458554148674,
      "learning_rate": 0.001,
      "loss": 3.1084,
      "num_input_tokens_seen": 707788800,
      "step": 2700
    },
    {
      "epoch": 0.013117569195177505,
      "grad_norm": 0.21190515160560608,
      "learning_rate": 0.001,
      "loss": 3.1202,
      "num_input_tokens_seen": 720896000,
      "step": 2750
    },
    {
      "epoch": 0.013356070453271642,
      "grad_norm": 0.2461613118648529,
      "learning_rate": 0.001,
      "loss": 3.1007,
      "num_input_tokens_seen": 734003200,
      "step": 2800
    },
    {
      "epoch": 0.013594571711365777,
      "grad_norm": 0.1976248323917389,
      "learning_rate": 0.001,
      "loss": 3.1079,
      "num_input_tokens_seen": 747110400,
      "step": 2850
    },
    {
      "epoch": 0.013833072969459913,
      "grad_norm": 0.22097842395305634,
      "learning_rate": 0.001,
      "loss": 3.0846,
      "num_input_tokens_seen": 760217600,
      "step": 2900
    },
    {
      "epoch": 0.01407157422755405,
      "grad_norm": 0.20581132173538208,
      "learning_rate": 0.001,
      "loss": 3.0995,
      "num_input_tokens_seen": 773324800,
      "step": 2950
    },
    {
      "epoch": 0.014310075485648187,
      "grad_norm": 0.19790051877498627,
      "learning_rate": 0.001,
      "loss": 3.0977,
      "num_input_tokens_seen": 786432000,
      "step": 3000
    },
    {
      "epoch": 0.014310075485648187,
      "eval_loss": 2.9804909229278564,
      "eval_runtime": 53.1278,
      "eval_samples_per_second": 94.113,
      "eval_steps_per_second": 23.528,
      "num_input_tokens_seen": 786432000,
      "step": 3000
    },
    {
      "epoch": 0.014548576743742323,
      "grad_norm": 0.20328116416931152,
      "learning_rate": 0.001,
      "loss": 3.0872,
      "num_input_tokens_seen": 799539200,
      "step": 3050
    },
    {
      "epoch": 0.01478707800183646,
      "grad_norm": 0.21318025887012482,
      "learning_rate": 0.001,
      "loss": 3.0861,
      "num_input_tokens_seen": 812646400,
      "step": 3100
    },
    {
      "epoch": 0.015025579259930597,
      "grad_norm": 0.22170069813728333,
      "learning_rate": 0.001,
      "loss": 3.0618,
      "num_input_tokens_seen": 825753600,
      "step": 3150
    },
    {
      "epoch": 0.015264080518024732,
      "grad_norm": 0.21292312443256378,
      "learning_rate": 0.001,
      "loss": 3.0567,
      "num_input_tokens_seen": 838860800,
      "step": 3200
    },
    {
      "epoch": 0.015502581776118868,
      "grad_norm": 0.2331959754228592,
      "learning_rate": 0.001,
      "loss": 3.0714,
      "num_input_tokens_seen": 851968000,
      "step": 3250
    },
    {
      "epoch": 0.015741083034213007,
      "grad_norm": 0.19236011803150177,
      "learning_rate": 0.001,
      "loss": 3.059,
      "num_input_tokens_seen": 865075200,
      "step": 3300
    },
    {
      "epoch": 0.015979584292307142,
      "grad_norm": 0.19991376996040344,
      "learning_rate": 0.001,
      "loss": 3.0542,
      "num_input_tokens_seen": 878182400,
      "step": 3350
    },
    {
      "epoch": 0.016218085550401277,
      "grad_norm": 0.2042934149503708,
      "learning_rate": 0.001,
      "loss": 3.0517,
      "num_input_tokens_seen": 891289600,
      "step": 3400
    },
    {
      "epoch": 0.016456586808495415,
      "grad_norm": 0.19254428148269653,
      "learning_rate": 0.001,
      "loss": 3.0415,
      "num_input_tokens_seen": 904396800,
      "step": 3450
    },
    {
      "epoch": 0.01669508806658955,
      "grad_norm": 0.19211998581886292,
      "learning_rate": 0.001,
      "loss": 3.0253,
      "num_input_tokens_seen": 917504000,
      "step": 3500
    },
    {
      "epoch": 0.01669508806658955,
      "eval_loss": 2.937037944793701,
      "eval_runtime": 52.6773,
      "eval_samples_per_second": 94.918,
      "eval_steps_per_second": 23.729,
      "num_input_tokens_seen": 917504000,
      "step": 3500
    },
    {
      "epoch": 0.01693358932468369,
      "grad_norm": 0.19596482813358307,
      "learning_rate": 0.001,
      "loss": 3.053,
      "num_input_tokens_seen": 930611200,
      "step": 3550
    },
    {
      "epoch": 0.017172090582777823,
      "grad_norm": 0.20214103162288666,
      "learning_rate": 0.001,
      "loss": 3.0385,
      "num_input_tokens_seen": 943718400,
      "step": 3600
    },
    {
      "epoch": 0.017410591840871962,
      "grad_norm": 0.18580283224582672,
      "learning_rate": 0.001,
      "loss": 3.0354,
      "num_input_tokens_seen": 956825600,
      "step": 3650
    },
    {
      "epoch": 0.017649093098966097,
      "grad_norm": 0.18928515911102295,
      "learning_rate": 0.001,
      "loss": 3.0292,
      "num_input_tokens_seen": 969932800,
      "step": 3700
    },
    {
      "epoch": 0.017887594357060232,
      "grad_norm": 0.19066137075424194,
      "learning_rate": 0.001,
      "loss": 3.0206,
      "num_input_tokens_seen": 983040000,
      "step": 3750
    },
    {
      "epoch": 0.01812609561515437,
      "grad_norm": 0.20291416347026825,
      "learning_rate": 0.001,
      "loss": 3.0254,
      "num_input_tokens_seen": 996147200,
      "step": 3800
    },
    {
      "epoch": 0.018364596873248505,
      "grad_norm": 0.19991491734981537,
      "learning_rate": 0.001,
      "loss": 3.0212,
      "num_input_tokens_seen": 1009254400,
      "step": 3850
    },
    {
      "epoch": 0.018603098131342644,
      "grad_norm": 0.19553051888942719,
      "learning_rate": 0.001,
      "loss": 3.0229,
      "num_input_tokens_seen": 1022361600,
      "step": 3900
    },
    {
      "epoch": 0.01884159938943678,
      "grad_norm": 0.19302095472812653,
      "learning_rate": 0.001,
      "loss": 3.0137,
      "num_input_tokens_seen": 1035468800,
      "step": 3950
    },
    {
      "epoch": 0.019080100647530917,
      "grad_norm": 0.18680201470851898,
      "learning_rate": 0.001,
      "loss": 3.0106,
      "num_input_tokens_seen": 1048576000,
      "step": 4000
    },
    {
      "epoch": 0.019080100647530917,
      "eval_loss": 2.8984477519989014,
      "eval_runtime": 52.851,
      "eval_samples_per_second": 94.606,
      "eval_steps_per_second": 23.651,
      "num_input_tokens_seen": 1048576000,
      "step": 4000
    },
    {
      "epoch": 0.019318601905625052,
      "grad_norm": 0.18222174048423767,
      "learning_rate": 0.001,
      "loss": 3.0095,
      "num_input_tokens_seen": 1061683200,
      "step": 4050
    },
    {
      "epoch": 0.01955710316371919,
      "grad_norm": 0.1929137110710144,
      "learning_rate": 0.001,
      "loss": 3.0022,
      "num_input_tokens_seen": 1074790400,
      "step": 4100
    },
    {
      "epoch": 0.019795604421813325,
      "grad_norm": 0.19358602166175842,
      "learning_rate": 0.001,
      "loss": 2.9978,
      "num_input_tokens_seen": 1087897600,
      "step": 4150
    },
    {
      "epoch": 0.02003410567990746,
      "grad_norm": 0.19070614874362946,
      "learning_rate": 0.001,
      "loss": 3.0016,
      "num_input_tokens_seen": 1101004800,
      "step": 4200
    },
    {
      "epoch": 0.0202726069380016,
      "grad_norm": 0.17888160049915314,
      "learning_rate": 0.001,
      "loss": 2.9984,
      "num_input_tokens_seen": 1114112000,
      "step": 4250
    },
    {
      "epoch": 0.020511108196095734,
      "grad_norm": 0.1823708564043045,
      "learning_rate": 0.001,
      "loss": 3.004,
      "num_input_tokens_seen": 1127219200,
      "step": 4300
    },
    {
      "epoch": 0.020749609454189872,
      "grad_norm": 0.1753600388765335,
      "learning_rate": 0.001,
      "loss": 2.9814,
      "num_input_tokens_seen": 1140326400,
      "step": 4350
    },
    {
      "epoch": 0.020988110712284007,
      "grad_norm": 0.1710510551929474,
      "learning_rate": 0.001,
      "loss": 2.9597,
      "num_input_tokens_seen": 1153433600,
      "step": 4400
    },
    {
      "epoch": 0.021226611970378145,
      "grad_norm": 0.18727277219295502,
      "learning_rate": 0.001,
      "loss": 2.9695,
      "num_input_tokens_seen": 1166540800,
      "step": 4450
    },
    {
      "epoch": 0.02146511322847228,
      "grad_norm": 0.17773132026195526,
      "learning_rate": 0.001,
      "loss": 2.9664,
      "num_input_tokens_seen": 1179648000,
      "step": 4500
    },
    {
      "epoch": 0.02146511322847228,
      "eval_loss": 2.871137857437134,
      "eval_runtime": 51.4876,
      "eval_samples_per_second": 97.111,
      "eval_steps_per_second": 24.278,
      "num_input_tokens_seen": 1179648000,
      "step": 4500
    },
    {
      "epoch": 0.021703614486566415,
      "grad_norm": 0.1875799000263214,
      "learning_rate": 0.001,
      "loss": 2.9682,
      "num_input_tokens_seen": 1192755200,
      "step": 4550
    },
    {
      "epoch": 0.021942115744660554,
      "grad_norm": 0.18222226202487946,
      "learning_rate": 0.001,
      "loss": 2.9484,
      "num_input_tokens_seen": 1205862400,
      "step": 4600
    },
    {
      "epoch": 0.02218061700275469,
      "grad_norm": 0.191411092877388,
      "learning_rate": 0.001,
      "loss": 2.9637,
      "num_input_tokens_seen": 1218969600,
      "step": 4650
    },
    {
      "epoch": 0.022419118260848827,
      "grad_norm": 0.17608201503753662,
      "learning_rate": 0.001,
      "loss": 2.9792,
      "num_input_tokens_seen": 1232076800,
      "step": 4700
    },
    {
      "epoch": 0.022657619518942962,
      "grad_norm": 0.1718858927488327,
      "learning_rate": 0.001,
      "loss": 2.9674,
      "num_input_tokens_seen": 1245184000,
      "step": 4750
    },
    {
      "epoch": 0.0228961207770371,
      "grad_norm": 0.18428942561149597,
      "learning_rate": 0.001,
      "loss": 2.976,
      "num_input_tokens_seen": 1258291200,
      "step": 4800
    },
    {
      "epoch": 0.023134622035131235,
      "grad_norm": 0.16696259379386902,
      "learning_rate": 0.001,
      "loss": 2.9486,
      "num_input_tokens_seen": 1271398400,
      "step": 4850
    },
    {
      "epoch": 0.02337312329322537,
      "grad_norm": 0.18239040672779083,
      "learning_rate": 0.001,
      "loss": 2.956,
      "num_input_tokens_seen": 1284505600,
      "step": 4900
    },
    {
      "epoch": 0.02361162455131951,
      "grad_norm": 0.17167994379997253,
      "learning_rate": 0.001,
      "loss": 2.9449,
      "num_input_tokens_seen": 1297612800,
      "step": 4950
    },
    {
      "epoch": 0.023850125809413644,
      "grad_norm": 0.18532761931419373,
      "learning_rate": 0.001,
      "loss": 2.947,
      "num_input_tokens_seen": 1310720000,
      "step": 5000
    },
    {
      "epoch": 0.023850125809413644,
      "eval_loss": 2.8470754623413086,
      "eval_runtime": 51.04,
      "eval_samples_per_second": 97.962,
      "eval_steps_per_second": 24.491,
      "num_input_tokens_seen": 1310720000,
      "step": 5000
    },
    {
      "epoch": 0.024088627067507782,
      "grad_norm": 0.21697266399860382,
      "learning_rate": 0.001,
      "loss": 2.963,
      "num_input_tokens_seen": 1323827200,
      "step": 5050
    },
    {
      "epoch": 0.024327128325601917,
      "grad_norm": 0.17018833756446838,
      "learning_rate": 0.001,
      "loss": 2.9453,
      "num_input_tokens_seen": 1336934400,
      "step": 5100
    },
    {
      "epoch": 0.024565629583696055,
      "grad_norm": 0.17473167181015015,
      "learning_rate": 0.001,
      "loss": 2.9516,
      "num_input_tokens_seen": 1350041600,
      "step": 5150
    },
    {
      "epoch": 0.02480413084179019,
      "grad_norm": 0.18488293886184692,
      "learning_rate": 0.001,
      "loss": 2.9404,
      "num_input_tokens_seen": 1363148800,
      "step": 5200
    },
    {
      "epoch": 0.025042632099884325,
      "grad_norm": 0.17348967492580414,
      "learning_rate": 0.001,
      "loss": 2.9275,
      "num_input_tokens_seen": 1376256000,
      "step": 5250
    },
    {
      "epoch": 0.025281133357978464,
      "grad_norm": 0.16547563672065735,
      "learning_rate": 0.001,
      "loss": 2.9464,
      "num_input_tokens_seen": 1389363200,
      "step": 5300
    },
    {
      "epoch": 0.0255196346160726,
      "grad_norm": 0.17538361251354218,
      "learning_rate": 0.001,
      "loss": 2.94,
      "num_input_tokens_seen": 1402470400,
      "step": 5350
    },
    {
      "epoch": 0.025758135874166737,
      "grad_norm": 0.17068558931350708,
      "learning_rate": 0.001,
      "loss": 2.9382,
      "num_input_tokens_seen": 1415577600,
      "step": 5400
    },
    {
      "epoch": 0.025996637132260872,
      "grad_norm": 0.17389337718486786,
      "learning_rate": 0.001,
      "loss": 2.9254,
      "num_input_tokens_seen": 1428684800,
      "step": 5450
    },
    {
      "epoch": 0.02623513839035501,
      "grad_norm": 0.17620491981506348,
      "learning_rate": 0.001,
      "loss": 2.9221,
      "num_input_tokens_seen": 1441792000,
      "step": 5500
    },
    {
      "epoch": 0.02623513839035501,
      "eval_loss": 2.8246922492980957,
      "eval_runtime": 50.2832,
      "eval_samples_per_second": 99.437,
      "eval_steps_per_second": 24.859,
      "num_input_tokens_seen": 1441792000,
      "step": 5500
    },
    {
      "epoch": 0.026473639648449145,
      "grad_norm": 0.15889622271060944,
      "learning_rate": 0.001,
      "loss": 2.923,
      "num_input_tokens_seen": 1454899200,
      "step": 5550
    },
    {
      "epoch": 0.026712140906543284,
      "grad_norm": 0.17490123212337494,
      "learning_rate": 0.001,
      "loss": 2.9146,
      "num_input_tokens_seen": 1468006400,
      "step": 5600
    },
    {
      "epoch": 0.02695064216463742,
      "grad_norm": 0.17789559066295624,
      "learning_rate": 0.001,
      "loss": 2.9253,
      "num_input_tokens_seen": 1481113600,
      "step": 5650
    },
    {
      "epoch": 0.027189143422731554,
      "grad_norm": 0.17113780975341797,
      "learning_rate": 0.001,
      "loss": 2.9267,
      "num_input_tokens_seen": 1494220800,
      "step": 5700
    },
    {
      "epoch": 0.027427644680825692,
      "grad_norm": 0.1671907901763916,
      "learning_rate": 0.001,
      "loss": 2.9178,
      "num_input_tokens_seen": 1507328000,
      "step": 5750
    },
    {
      "epoch": 0.027666145938919827,
      "grad_norm": 0.17511603236198425,
      "learning_rate": 0.001,
      "loss": 2.9341,
      "num_input_tokens_seen": 1520435200,
      "step": 5800
    },
    {
      "epoch": 0.027904647197013965,
      "grad_norm": 0.1821524053812027,
      "learning_rate": 0.001,
      "loss": 2.9076,
      "num_input_tokens_seen": 1533542400,
      "step": 5850
    },
    {
      "epoch": 0.0281431484551081,
      "grad_norm": 0.16259051859378815,
      "learning_rate": 0.001,
      "loss": 2.9212,
      "num_input_tokens_seen": 1546649600,
      "step": 5900
    },
    {
      "epoch": 0.02838164971320224,
      "grad_norm": 0.18584352731704712,
      "learning_rate": 0.001,
      "loss": 2.927,
      "num_input_tokens_seen": 1559756800,
      "step": 5950
    },
    {
      "epoch": 0.028620150971296374,
      "grad_norm": 0.181602343916893,
      "learning_rate": 0.001,
      "loss": 2.9096,
      "num_input_tokens_seen": 1572864000,
      "step": 6000
    },
    {
      "epoch": 0.028620150971296374,
      "eval_loss": 2.8036017417907715,
      "eval_runtime": 50.543,
      "eval_samples_per_second": 98.926,
      "eval_steps_per_second": 24.731,
      "num_input_tokens_seen": 1572864000,
      "step": 6000
    },
    {
      "epoch": 0.02885865222939051,
      "grad_norm": 0.1653270423412323,
      "learning_rate": 0.001,
      "loss": 2.9214,
      "num_input_tokens_seen": 1585971200,
      "step": 6050
    },
    {
      "epoch": 0.029097153487484647,
      "grad_norm": 0.17030183970928192,
      "learning_rate": 0.001,
      "loss": 2.9081,
      "num_input_tokens_seen": 1599078400,
      "step": 6100
    },
    {
      "epoch": 0.029335654745578782,
      "grad_norm": 0.17734774947166443,
      "learning_rate": 0.001,
      "loss": 2.9128,
      "num_input_tokens_seen": 1612185600,
      "step": 6150
    },
    {
      "epoch": 0.02957415600367292,
      "grad_norm": 0.1664343774318695,
      "learning_rate": 0.001,
      "loss": 2.9084,
      "num_input_tokens_seen": 1625292800,
      "step": 6200
    },
    {
      "epoch": 0.029812657261767055,
      "grad_norm": 0.15939603745937347,
      "learning_rate": 0.001,
      "loss": 2.9049,
      "num_input_tokens_seen": 1638400000,
      "step": 6250
    },
    {
      "epoch": 0.030051158519861194,
      "grad_norm": 0.16107864677906036,
      "learning_rate": 0.001,
      "loss": 2.8889,
      "num_input_tokens_seen": 1651507200,
      "step": 6300
    },
    {
      "epoch": 0.03028965977795533,
      "grad_norm": 0.1734771579504013,
      "learning_rate": 0.001,
      "loss": 2.8951,
      "num_input_tokens_seen": 1664614400,
      "step": 6350
    },
    {
      "epoch": 0.030528161036049464,
      "grad_norm": 0.1804204136133194,
      "learning_rate": 0.001,
      "loss": 2.8877,
      "num_input_tokens_seen": 1677721600,
      "step": 6400
    },
    {
      "epoch": 0.030766662294143602,
      "grad_norm": 0.16369500756263733,
      "learning_rate": 0.001,
      "loss": 2.8764,
      "num_input_tokens_seen": 1690828800,
      "step": 6450
    },
    {
      "epoch": 0.031005163552237737,
      "grad_norm": 0.1704144924879074,
      "learning_rate": 0.001,
      "loss": 2.8965,
      "num_input_tokens_seen": 1703936000,
      "step": 6500
    },
    {
      "epoch": 0.031005163552237737,
      "eval_loss": 2.787343740463257,
      "eval_runtime": 50.3956,
      "eval_samples_per_second": 99.215,
      "eval_steps_per_second": 24.804,
      "num_input_tokens_seen": 1703936000,
      "step": 6500
    },
    {
      "epoch": 0.031243664810331875,
      "grad_norm": 0.17917555570602417,
      "learning_rate": 0.001,
      "loss": 2.8882,
      "num_input_tokens_seen": 1717043200,
      "step": 6550
    },
    {
      "epoch": 0.031482166068426014,
      "grad_norm": 0.18822412192821503,
      "learning_rate": 0.001,
      "loss": 2.8931,
      "num_input_tokens_seen": 1730150400,
      "step": 6600
    },
    {
      "epoch": 0.031720667326520145,
      "grad_norm": 0.1702752560377121,
      "learning_rate": 0.001,
      "loss": 2.8906,
      "num_input_tokens_seen": 1743257600,
      "step": 6650
    },
    {
      "epoch": 0.031959168584614284,
      "grad_norm": 0.16963082551956177,
      "learning_rate": 0.001,
      "loss": 2.8809,
      "num_input_tokens_seen": 1756364800,
      "step": 6700
    },
    {
      "epoch": 0.03219766984270842,
      "grad_norm": 0.17273569107055664,
      "learning_rate": 0.001,
      "loss": 2.9005,
      "num_input_tokens_seen": 1769472000,
      "step": 6750
    },
    {
      "epoch": 0.032436171100802554,
      "grad_norm": 0.21361888945102692,
      "learning_rate": 0.001,
      "loss": 2.8683,
      "num_input_tokens_seen": 1782579200,
      "step": 6800
    },
    {
      "epoch": 0.03267467235889669,
      "grad_norm": 0.16454364359378815,
      "learning_rate": 0.001,
      "loss": 2.8921,
      "num_input_tokens_seen": 1795686400,
      "step": 6850
    },
    {
      "epoch": 0.03291317361699083,
      "grad_norm": 0.1677432805299759,
      "learning_rate": 0.001,
      "loss": 2.8777,
      "num_input_tokens_seen": 1808793600,
      "step": 6900
    },
    {
      "epoch": 0.03315167487508497,
      "grad_norm": 0.17707760632038116,
      "learning_rate": 0.001,
      "loss": 2.8791,
      "num_input_tokens_seen": 1821900800,
      "step": 6950
    },
    {
      "epoch": 0.0333901761331791,
      "grad_norm": 0.1784796118736267,
      "learning_rate": 0.001,
      "loss": 2.8642,
      "num_input_tokens_seen": 1835008000,
      "step": 7000
    },
    {
      "epoch": 0.0333901761331791,
      "eval_loss": 2.7708475589752197,
      "eval_runtime": 50.9058,
      "eval_samples_per_second": 98.221,
      "eval_steps_per_second": 24.555,
      "num_input_tokens_seen": 1835008000,
      "step": 7000
    },
    {
      "epoch": 0.03362867739127324,
      "grad_norm": 0.15859876573085785,
      "learning_rate": 0.001,
      "loss": 2.8919,
      "num_input_tokens_seen": 1848115200,
      "step": 7050
    },
    {
      "epoch": 0.03386717864936738,
      "grad_norm": 0.17061467468738556,
      "learning_rate": 0.001,
      "loss": 2.868,
      "num_input_tokens_seen": 1861222400,
      "step": 7100
    },
    {
      "epoch": 0.03410567990746151,
      "grad_norm": 0.17118851840496063,
      "learning_rate": 0.001,
      "loss": 2.8677,
      "num_input_tokens_seen": 1874329600,
      "step": 7150
    },
    {
      "epoch": 0.03434418116555565,
      "grad_norm": 0.1561940759420395,
      "learning_rate": 0.001,
      "loss": 2.8701,
      "num_input_tokens_seen": 1887436800,
      "step": 7200
    },
    {
      "epoch": 0.034582682423649785,
      "grad_norm": 0.17568449676036835,
      "learning_rate": 0.001,
      "loss": 2.8652,
      "num_input_tokens_seen": 1900544000,
      "step": 7250
    },
    {
      "epoch": 0.034821183681743924,
      "grad_norm": 0.17471665143966675,
      "learning_rate": 0.001,
      "loss": 2.8614,
      "num_input_tokens_seen": 1913651200,
      "step": 7300
    },
    {
      "epoch": 0.035059684939838055,
      "grad_norm": 0.17949970066547394,
      "learning_rate": 0.001,
      "loss": 2.862,
      "num_input_tokens_seen": 1926758400,
      "step": 7350
    },
    {
      "epoch": 0.035298186197932194,
      "grad_norm": 0.17014376819133759,
      "learning_rate": 0.001,
      "loss": 2.8696,
      "num_input_tokens_seen": 1939865600,
      "step": 7400
    },
    {
      "epoch": 0.03553668745602633,
      "grad_norm": 0.166939839720726,
      "learning_rate": 0.001,
      "loss": 2.8679,
      "num_input_tokens_seen": 1952972800,
      "step": 7450
    },
    {
      "epoch": 0.035775188714120464,
      "grad_norm": 0.16403459012508392,
      "learning_rate": 0.001,
      "loss": 2.8692,
      "num_input_tokens_seen": 1966080000,
      "step": 7500
    },
    {
      "epoch": 0.035775188714120464,
      "eval_loss": 2.7581658363342285,
      "eval_runtime": 50.614,
      "eval_samples_per_second": 98.787,
      "eval_steps_per_second": 24.697,
      "num_input_tokens_seen": 1966080000,
      "step": 7500
    },
    {
      "epoch": 0.0360136899722146,
      "grad_norm": 0.16664361953735352,
      "learning_rate": 0.001,
      "loss": 2.8549,
      "num_input_tokens_seen": 1979187200,
      "step": 7550
    },
    {
      "epoch": 0.03625219123030874,
      "grad_norm": 0.165015310049057,
      "learning_rate": 0.001,
      "loss": 2.867,
      "num_input_tokens_seen": 1992294400,
      "step": 7600
    },
    {
      "epoch": 0.03649069248840288,
      "grad_norm": 0.17752580344676971,
      "learning_rate": 0.001,
      "loss": 2.8721,
      "num_input_tokens_seen": 2005401600,
      "step": 7650
    },
    {
      "epoch": 0.03672919374649701,
      "grad_norm": 0.1641317456960678,
      "learning_rate": 0.001,
      "loss": 2.8601,
      "num_input_tokens_seen": 2018508800,
      "step": 7700
    },
    {
      "epoch": 0.03696769500459115,
      "grad_norm": 0.1706378310918808,
      "learning_rate": 0.001,
      "loss": 2.8385,
      "num_input_tokens_seen": 2031616000,
      "step": 7750
    },
    {
      "epoch": 0.03720619626268529,
      "grad_norm": 0.18265438079833984,
      "learning_rate": 0.001,
      "loss": 2.8421,
      "num_input_tokens_seen": 2044723200,
      "step": 7800
    },
    {
      "epoch": 0.037444697520779426,
      "grad_norm": 0.17270897328853607,
      "learning_rate": 0.001,
      "loss": 2.8576,
      "num_input_tokens_seen": 2057830400,
      "step": 7850
    },
    {
      "epoch": 0.03768319877887356,
      "grad_norm": 0.17359280586242676,
      "learning_rate": 0.001,
      "loss": 2.8522,
      "num_input_tokens_seen": 2070937600,
      "step": 7900
    },
    {
      "epoch": 0.037921700036967695,
      "grad_norm": 0.1679411083459854,
      "learning_rate": 0.001,
      "loss": 2.854,
      "num_input_tokens_seen": 2084044800,
      "step": 7950
    },
    {
      "epoch": 0.038160201295061834,
      "grad_norm": 0.16735835373401642,
      "learning_rate": 0.001,
      "loss": 2.8494,
      "num_input_tokens_seen": 2097152000,
      "step": 8000
    },
    {
      "epoch": 0.038160201295061834,
      "eval_loss": 2.7443442344665527,
      "eval_runtime": 50.3387,
      "eval_samples_per_second": 99.327,
      "eval_steps_per_second": 24.832,
      "num_input_tokens_seen": 2097152000,
      "step": 8000
    },
    {
      "epoch": 0.038398702553155965,
      "grad_norm": 0.16059577465057373,
      "learning_rate": 0.001,
      "loss": 2.8495,
      "num_input_tokens_seen": 2110259200,
      "step": 8050
    },
    {
      "epoch": 0.038637203811250104,
      "grad_norm": 0.1842387169599533,
      "learning_rate": 0.001,
      "loss": 2.8526,
      "num_input_tokens_seen": 2123366400,
      "step": 8100
    },
    {
      "epoch": 0.03887570506934424,
      "grad_norm": 0.15922050178050995,
      "learning_rate": 0.001,
      "loss": 2.8312,
      "num_input_tokens_seen": 2136473600,
      "step": 8150
    },
    {
      "epoch": 0.03911420632743838,
      "grad_norm": 0.16642028093338013,
      "learning_rate": 0.001,
      "loss": 2.8452,
      "num_input_tokens_seen": 2149580800,
      "step": 8200
    },
    {
      "epoch": 0.03935270758553251,
      "grad_norm": 0.16174671053886414,
      "learning_rate": 0.001,
      "loss": 2.8471,
      "num_input_tokens_seen": 2162688000,
      "step": 8250
    },
    {
      "epoch": 0.03959120884362665,
      "grad_norm": 0.16786591708660126,
      "learning_rate": 0.001,
      "loss": 2.8435,
      "num_input_tokens_seen": 2175795200,
      "step": 8300
    },
    {
      "epoch": 0.03982971010172079,
      "grad_norm": 0.17107373476028442,
      "learning_rate": 0.001,
      "loss": 2.862,
      "num_input_tokens_seen": 2188902400,
      "step": 8350
    },
    {
      "epoch": 0.04006821135981492,
      "grad_norm": 0.17952118813991547,
      "learning_rate": 0.001,
      "loss": 2.8414,
      "num_input_tokens_seen": 2202009600,
      "step": 8400
    },
    {
      "epoch": 0.04030671261790906,
      "grad_norm": 0.16836482286453247,
      "learning_rate": 0.001,
      "loss": 2.8363,
      "num_input_tokens_seen": 2215116800,
      "step": 8450
    },
    {
      "epoch": 0.0405452138760032,
      "grad_norm": 0.16812962293624878,
      "learning_rate": 0.001,
      "loss": 2.844,
      "num_input_tokens_seen": 2228224000,
      "step": 8500
    },
    {
      "epoch": 0.0405452138760032,
      "eval_loss": 2.7306976318359375,
      "eval_runtime": 50.272,
      "eval_samples_per_second": 99.459,
      "eval_steps_per_second": 24.865,
      "num_input_tokens_seen": 2228224000,
      "step": 8500
    },
    {
      "epoch": 0.040783715134097336,
      "grad_norm": 0.1696135401725769,
      "learning_rate": 0.001,
      "loss": 2.8406,
      "num_input_tokens_seen": 2241331200,
      "step": 8550
    },
    {
      "epoch": 0.04102221639219147,
      "grad_norm": 0.16062459349632263,
      "learning_rate": 0.001,
      "loss": 2.8453,
      "num_input_tokens_seen": 2254438400,
      "step": 8600
    },
    {
      "epoch": 0.041260717650285605,
      "grad_norm": 0.17326433956623077,
      "learning_rate": 0.001,
      "loss": 2.8449,
      "num_input_tokens_seen": 2267545600,
      "step": 8650
    },
    {
      "epoch": 0.041499218908379744,
      "grad_norm": 0.16410672664642334,
      "learning_rate": 0.001,
      "loss": 2.8412,
      "num_input_tokens_seen": 2280652800,
      "step": 8700
    },
    {
      "epoch": 0.041737720166473875,
      "grad_norm": 0.16255012154579163,
      "learning_rate": 0.001,
      "loss": 2.8524,
      "num_input_tokens_seen": 2293760000,
      "step": 8750
    },
    {
      "epoch": 0.041976221424568014,
      "grad_norm": 0.163652241230011,
      "learning_rate": 0.001,
      "loss": 2.8528,
      "num_input_tokens_seen": 2306867200,
      "step": 8800
    },
    {
      "epoch": 0.04221472268266215,
      "grad_norm": 0.15598778426647186,
      "learning_rate": 0.001,
      "loss": 2.8255,
      "num_input_tokens_seen": 2319974400,
      "step": 8850
    },
    {
      "epoch": 0.04245322394075629,
      "grad_norm": 0.1740003079175949,
      "learning_rate": 0.001,
      "loss": 2.8278,
      "num_input_tokens_seen": 2333081600,
      "step": 8900
    },
    {
      "epoch": 0.04269172519885042,
      "grad_norm": 0.17225052416324615,
      "learning_rate": 0.001,
      "loss": 2.8334,
      "num_input_tokens_seen": 2346188800,
      "step": 8950
    },
    {
      "epoch": 0.04293022645694456,
      "grad_norm": 0.18005919456481934,
      "learning_rate": 0.001,
      "loss": 2.8044,
      "num_input_tokens_seen": 2359296000,
      "step": 9000
    },
    {
      "epoch": 0.04293022645694456,
      "eval_loss": 2.7220215797424316,
      "eval_runtime": 50.2706,
      "eval_samples_per_second": 99.462,
      "eval_steps_per_second": 24.865,
      "num_input_tokens_seen": 2359296000,
      "step": 9000
    },
    {
      "epoch": 0.0431687277150387,
      "grad_norm": 0.16554109752178192,
      "learning_rate": 0.001,
      "loss": 2.83,
      "num_input_tokens_seen": 2372403200,
      "step": 9050
    },
    {
      "epoch": 0.04340722897313283,
      "grad_norm": 0.17308101058006287,
      "learning_rate": 0.001,
      "loss": 2.8204,
      "num_input_tokens_seen": 2385510400,
      "step": 9100
    },
    {
      "epoch": 0.04364573023122697,
      "grad_norm": 0.16701756417751312,
      "learning_rate": 0.001,
      "loss": 2.836,
      "num_input_tokens_seen": 2398617600,
      "step": 9150
    },
    {
      "epoch": 0.04388423148932111,
      "grad_norm": 0.16220535337924957,
      "learning_rate": 0.001,
      "loss": 2.8194,
      "num_input_tokens_seen": 2411724800,
      "step": 9200
    },
    {
      "epoch": 0.044122732747415246,
      "grad_norm": 0.16643071174621582,
      "learning_rate": 0.001,
      "loss": 2.8157,
      "num_input_tokens_seen": 2424832000,
      "step": 9250
    },
    {
      "epoch": 0.04436123400550938,
      "grad_norm": 0.16293680667877197,
      "learning_rate": 0.001,
      "loss": 2.8147,
      "num_input_tokens_seen": 2437939200,
      "step": 9300
    },
    {
      "epoch": 0.044599735263603515,
      "grad_norm": 0.1914059966802597,
      "learning_rate": 0.001,
      "loss": 2.8164,
      "num_input_tokens_seen": 2451046400,
      "step": 9350
    },
    {
      "epoch": 0.044838236521697654,
      "grad_norm": 0.15867285430431366,
      "learning_rate": 0.001,
      "loss": 2.8063,
      "num_input_tokens_seen": 2464153600,
      "step": 9400
    },
    {
      "epoch": 0.045076737779791785,
      "grad_norm": 0.16319462656974792,
      "learning_rate": 0.001,
      "loss": 2.8096,
      "num_input_tokens_seen": 2477260800,
      "step": 9450
    },
    {
      "epoch": 0.045315239037885924,
      "grad_norm": 0.16578581929206848,
      "learning_rate": 0.001,
      "loss": 2.8106,
      "num_input_tokens_seen": 2490368000,
      "step": 9500
    },
    {
      "epoch": 0.045315239037885924,
      "eval_loss": 2.7105066776275635,
      "eval_runtime": 50.0838,
      "eval_samples_per_second": 99.833,
      "eval_steps_per_second": 24.958,
      "num_input_tokens_seen": 2490368000,
      "step": 9500
    },
    {
      "epoch": 0.04555374029598006,
      "grad_norm": 0.17125573754310608,
      "learning_rate": 0.001,
      "loss": 2.8259,
      "num_input_tokens_seen": 2503475200,
      "step": 9550
    },
    {
      "epoch": 0.0457922415540742,
      "grad_norm": 0.1661599725484848,
      "learning_rate": 0.001,
      "loss": 2.8109,
      "num_input_tokens_seen": 2516582400,
      "step": 9600
    },
    {
      "epoch": 0.04603074281216833,
      "grad_norm": 0.16203565895557404,
      "learning_rate": 0.001,
      "loss": 2.8198,
      "num_input_tokens_seen": 2529689600,
      "step": 9650
    },
    {
      "epoch": 0.04626924407026247,
      "grad_norm": 0.1869373619556427,
      "learning_rate": 0.001,
      "loss": 2.8163,
      "num_input_tokens_seen": 2542796800,
      "step": 9700
    },
    {
      "epoch": 0.04650774532835661,
      "grad_norm": 0.17401213943958282,
      "learning_rate": 0.001,
      "loss": 2.8209,
      "num_input_tokens_seen": 2555904000,
      "step": 9750
    },
    {
      "epoch": 0.04674624658645074,
      "grad_norm": 0.15835829079151154,
      "learning_rate": 0.001,
      "loss": 2.8032,
      "num_input_tokens_seen": 2569011200,
      "step": 9800
    },
    {
      "epoch": 0.04698474784454488,
      "grad_norm": 0.16554060578346252,
      "learning_rate": 0.001,
      "loss": 2.8072,
      "num_input_tokens_seen": 2582118400,
      "step": 9850
    },
    {
      "epoch": 0.04722324910263902,
      "grad_norm": 0.16941213607788086,
      "learning_rate": 0.001,
      "loss": 2.8076,
      "num_input_tokens_seen": 2595225600,
      "step": 9900
    },
    {
      "epoch": 0.047461750360733156,
      "grad_norm": 0.16324704885482788,
      "learning_rate": 0.001,
      "loss": 2.8097,
      "num_input_tokens_seen": 2608332800,
      "step": 9950
    },
    {
      "epoch": 0.04770025161882729,
      "grad_norm": 0.16865754127502441,
      "learning_rate": 0.001,
      "loss": 2.8051,
      "num_input_tokens_seen": 2621440000,
      "step": 10000
    },
    {
      "epoch": 0.04770025161882729,
      "eval_loss": 2.7000486850738525,
      "eval_runtime": 50.3365,
      "eval_samples_per_second": 99.331,
      "eval_steps_per_second": 24.833,
      "num_input_tokens_seen": 2621440000,
      "step": 10000
    },
    {
      "epoch": 0.047938752876921426,
      "grad_norm": 0.17076526582241058,
      "learning_rate": 0.001,
      "loss": 2.8148,
      "num_input_tokens_seen": 2634547200,
      "step": 10050
    },
    {
      "epoch": 0.048177254135015564,
      "grad_norm": 0.1610497534275055,
      "learning_rate": 0.001,
      "loss": 2.8012,
      "num_input_tokens_seen": 2647654400,
      "step": 10100
    },
    {
      "epoch": 0.048415755393109695,
      "grad_norm": 0.15984536707401276,
      "learning_rate": 0.001,
      "loss": 2.8086,
      "num_input_tokens_seen": 2660761600,
      "step": 10150
    },
    {
      "epoch": 0.048654256651203834,
      "grad_norm": 0.21775834262371063,
      "learning_rate": 0.001,
      "loss": 2.8021,
      "num_input_tokens_seen": 2673868800,
      "step": 10200
    },
    {
      "epoch": 0.04889275790929797,
      "grad_norm": 0.1841157227754593,
      "learning_rate": 0.001,
      "loss": 2.8152,
      "num_input_tokens_seen": 2686976000,
      "step": 10250
    },
    {
      "epoch": 0.04913125916739211,
      "grad_norm": 0.17025424540042877,
      "learning_rate": 0.001,
      "loss": 2.8131,
      "num_input_tokens_seen": 2700083200,
      "step": 10300
    },
    {
      "epoch": 0.04936976042548624,
      "grad_norm": 0.1992417722940445,
      "learning_rate": 0.001,
      "loss": 2.8039,
      "num_input_tokens_seen": 2713190400,
      "step": 10350
    },
    {
      "epoch": 0.04960826168358038,
      "grad_norm": 0.1680469959974289,
      "learning_rate": 0.001,
      "loss": 2.7921,
      "num_input_tokens_seen": 2726297600,
      "step": 10400
    },
    {
      "epoch": 0.04984676294167452,
      "grad_norm": 0.18296252191066742,
      "learning_rate": 0.001,
      "loss": 2.8036,
      "num_input_tokens_seen": 2739404800,
      "step": 10450
    },
    {
      "epoch": 0.05008526419976865,
      "grad_norm": 0.16041898727416992,
      "learning_rate": 0.001,
      "loss": 2.7979,
      "num_input_tokens_seen": 2752512000,
      "step": 10500
    },
    {
      "epoch": 0.05008526419976865,
      "eval_loss": 2.6893723011016846,
      "eval_runtime": 50.642,
      "eval_samples_per_second": 98.732,
      "eval_steps_per_second": 24.683,
      "num_input_tokens_seen": 2752512000,
      "step": 10500
    },
    {
      "epoch": 0.05032376545786279,
      "grad_norm": 0.16704030334949493,
      "learning_rate": 0.001,
      "loss": 2.79,
      "num_input_tokens_seen": 2765619200,
      "step": 10550
    },
    {
      "epoch": 0.05056226671595693,
      "grad_norm": 0.16553758084774017,
      "learning_rate": 0.001,
      "loss": 2.7964,
      "num_input_tokens_seen": 2778726400,
      "step": 10600
    },
    {
      "epoch": 0.050800767974051066,
      "grad_norm": 0.16027161478996277,
      "learning_rate": 0.001,
      "loss": 2.7937,
      "num_input_tokens_seen": 2791833600,
      "step": 10650
    },
    {
      "epoch": 0.0510392692321452,
      "grad_norm": 0.16177843511104584,
      "learning_rate": 0.001,
      "loss": 2.7957,
      "num_input_tokens_seen": 2804940800,
      "step": 10700
    },
    {
      "epoch": 0.051277770490239336,
      "grad_norm": 0.16713912785053253,
      "learning_rate": 0.001,
      "loss": 2.7949,
      "num_input_tokens_seen": 2818048000,
      "step": 10750
    },
    {
      "epoch": 0.051516271748333474,
      "grad_norm": 0.1815747618675232,
      "learning_rate": 0.001,
      "loss": 2.7915,
      "num_input_tokens_seen": 2831155200,
      "step": 10800
    },
    {
      "epoch": 0.05175477300642761,
      "grad_norm": 0.16732683777809143,
      "learning_rate": 0.001,
      "loss": 2.7994,
      "num_input_tokens_seen": 2844262400,
      "step": 10850
    },
    {
      "epoch": 0.051993274264521744,
      "grad_norm": 0.18305908143520355,
      "learning_rate": 0.001,
      "loss": 2.7888,
      "num_input_tokens_seen": 2857369600,
      "step": 10900
    },
    {
      "epoch": 0.05223177552261588,
      "grad_norm": 0.16450954973697662,
      "learning_rate": 0.001,
      "loss": 2.7834,
      "num_input_tokens_seen": 2870476800,
      "step": 10950
    },
    {
      "epoch": 0.05247027678071002,
      "grad_norm": 0.16485372185707092,
      "learning_rate": 0.001,
      "loss": 2.7976,
      "num_input_tokens_seen": 2883584000,
      "step": 11000
    },
    {
      "epoch": 0.05247027678071002,
      "eval_loss": 2.6825835704803467,
      "eval_runtime": 50.1016,
      "eval_samples_per_second": 99.797,
      "eval_steps_per_second": 24.949,
      "num_input_tokens_seen": 2883584000,
      "step": 11000
    },
    {
      "epoch": 0.05270877803880415,
      "grad_norm": 0.1733204573392868,
      "learning_rate": 0.001,
      "loss": 2.7959,
      "num_input_tokens_seen": 2896691200,
      "step": 11050
    },
    {
      "epoch": 0.05294727929689829,
      "grad_norm": 0.16432546079158783,
      "learning_rate": 0.001,
      "loss": 2.793,
      "num_input_tokens_seen": 2909798400,
      "step": 11100
    },
    {
      "epoch": 0.05318578055499243,
      "grad_norm": 0.18369582295417786,
      "learning_rate": 0.001,
      "loss": 2.8149,
      "num_input_tokens_seen": 2922905600,
      "step": 11150
    },
    {
      "epoch": 0.05342428181308657,
      "grad_norm": 0.17782896757125854,
      "learning_rate": 0.001,
      "loss": 2.7878,
      "num_input_tokens_seen": 2936012800,
      "step": 11200
    },
    {
      "epoch": 0.0536627830711807,
      "grad_norm": 0.18320836126804352,
      "learning_rate": 0.001,
      "loss": 2.8159,
      "num_input_tokens_seen": 2949120000,
      "step": 11250
    },
    {
      "epoch": 0.05390128432927484,
      "grad_norm": 0.1667925864458084,
      "learning_rate": 0.001,
      "loss": 2.795,
      "num_input_tokens_seen": 2962227200,
      "step": 11300
    },
    {
      "epoch": 0.054139785587368976,
      "grad_norm": 0.19831301271915436,
      "learning_rate": 0.001,
      "loss": 2.7907,
      "num_input_tokens_seen": 2975334400,
      "step": 11350
    },
    {
      "epoch": 0.05437828684546311,
      "grad_norm": 0.1610182225704193,
      "learning_rate": 0.001,
      "loss": 2.774,
      "num_input_tokens_seen": 2988441600,
      "step": 11400
    },
    {
      "epoch": 0.054616788103557246,
      "grad_norm": 0.15938150882720947,
      "learning_rate": 0.001,
      "loss": 2.7766,
      "num_input_tokens_seen": 3001548800,
      "step": 11450
    },
    {
      "epoch": 0.054855289361651384,
      "grad_norm": 0.15737415850162506,
      "learning_rate": 0.001,
      "loss": 2.783,
      "num_input_tokens_seen": 3014656000,
      "step": 11500
    },
    {
      "epoch": 0.054855289361651384,
      "eval_loss": 2.6739256381988525,
      "eval_runtime": 51.2462,
      "eval_samples_per_second": 97.568,
      "eval_steps_per_second": 24.392,
      "num_input_tokens_seen": 3014656000,
      "step": 11500
    },
    {
      "epoch": 0.05509379061974552,
      "grad_norm": 0.16538532078266144,
      "learning_rate": 0.001,
      "loss": 2.7966,
      "num_input_tokens_seen": 3027763200,
      "step": 11550
    },
    {
      "epoch": 0.055332291877839654,
      "grad_norm": 0.18035660684108734,
      "learning_rate": 0.001,
      "loss": 2.7789,
      "num_input_tokens_seen": 3040870400,
      "step": 11600
    },
    {
      "epoch": 0.05557079313593379,
      "grad_norm": 0.17831085622310638,
      "learning_rate": 0.001,
      "loss": 2.7962,
      "num_input_tokens_seen": 3053977600,
      "step": 11650
    },
    {
      "epoch": 0.05580929439402793,
      "grad_norm": 0.17723870277404785,
      "learning_rate": 0.001,
      "loss": 2.7791,
      "num_input_tokens_seen": 3067084800,
      "step": 11700
    },
    {
      "epoch": 0.05604779565212206,
      "grad_norm": 0.17663581669330597,
      "learning_rate": 0.001,
      "loss": 2.7696,
      "num_input_tokens_seen": 3080192000,
      "step": 11750
    },
    {
      "epoch": 0.0562862969102162,
      "grad_norm": 0.16684900224208832,
      "learning_rate": 0.001,
      "loss": 2.7762,
      "num_input_tokens_seen": 3093299200,
      "step": 11800
    },
    {
      "epoch": 0.05652479816831034,
      "grad_norm": 0.17407995462417603,
      "learning_rate": 0.001,
      "loss": 2.7767,
      "num_input_tokens_seen": 3106406400,
      "step": 11850
    },
    {
      "epoch": 0.05676329942640448,
      "grad_norm": 0.1750691831111908,
      "learning_rate": 0.001,
      "loss": 2.7785,
      "num_input_tokens_seen": 3119513600,
      "step": 11900
    },
    {
      "epoch": 0.05700180068449861,
      "grad_norm": 0.16576959192752838,
      "learning_rate": 0.001,
      "loss": 2.773,
      "num_input_tokens_seen": 3132620800,
      "step": 11950
    },
    {
      "epoch": 0.05724030194259275,
      "grad_norm": 0.16957831382751465,
      "learning_rate": 0.001,
      "loss": 2.7781,
      "num_input_tokens_seen": 3145728000,
      "step": 12000
    },
    {
      "epoch": 0.05724030194259275,
      "eval_loss": 2.6683335304260254,
      "eval_runtime": 50.6428,
      "eval_samples_per_second": 98.731,
      "eval_steps_per_second": 24.683,
      "num_input_tokens_seen": 3145728000,
      "step": 12000
    },
    {
      "epoch": 0.057478803200686886,
      "grad_norm": 0.1645338237285614,
      "learning_rate": 0.001,
      "loss": 2.7709,
      "num_input_tokens_seen": 3158835200,
      "step": 12050
    },
    {
      "epoch": 0.05771730445878102,
      "grad_norm": 0.15848694741725922,
      "learning_rate": 0.001,
      "loss": 2.7849,
      "num_input_tokens_seen": 3171942400,
      "step": 12100
    },
    {
      "epoch": 0.057955805716875156,
      "grad_norm": 0.20003071427345276,
      "learning_rate": 0.001,
      "loss": 2.7691,
      "num_input_tokens_seen": 3185049600,
      "step": 12150
    },
    {
      "epoch": 0.058194306974969294,
      "grad_norm": 0.19301050901412964,
      "learning_rate": 0.001,
      "loss": 2.7811,
      "num_input_tokens_seen": 3198156800,
      "step": 12200
    },
    {
      "epoch": 0.05843280823306343,
      "grad_norm": 0.171390563249588,
      "learning_rate": 0.001,
      "loss": 2.7712,
      "num_input_tokens_seen": 3211264000,
      "step": 12250
    },
    {
      "epoch": 0.058671309491157564,
      "grad_norm": 0.1654270589351654,
      "learning_rate": 0.001,
      "loss": 2.7788,
      "num_input_tokens_seen": 3224371200,
      "step": 12300
    },
    {
      "epoch": 0.0589098107492517,
      "grad_norm": 0.16559672355651855,
      "learning_rate": 0.001,
      "loss": 2.7839,
      "num_input_tokens_seen": 3237478400,
      "step": 12350
    },
    {
      "epoch": 0.05914831200734584,
      "grad_norm": 0.16773344576358795,
      "learning_rate": 0.001,
      "loss": 2.7896,
      "num_input_tokens_seen": 3250585600,
      "step": 12400
    },
    {
      "epoch": 0.05938681326543997,
      "grad_norm": 0.1639021933078766,
      "learning_rate": 0.001,
      "loss": 2.7704,
      "num_input_tokens_seen": 3263692800,
      "step": 12450
    },
    {
      "epoch": 0.05962531452353411,
      "grad_norm": 0.15584540367126465,
      "learning_rate": 0.001,
      "loss": 2.7687,
      "num_input_tokens_seen": 3276800000,
      "step": 12500
    },
    {
      "epoch": 0.05962531452353411,
      "eval_loss": 2.6606011390686035,
      "eval_runtime": 51.1636,
      "eval_samples_per_second": 97.726,
      "eval_steps_per_second": 24.431,
      "num_input_tokens_seen": 3276800000,
      "step": 12500
    },
    {
      "epoch": 0.05986381578162825,
      "grad_norm": 0.18144413828849792,
      "learning_rate": 0.001,
      "loss": 2.7711,
      "num_input_tokens_seen": 3289907200,
      "step": 12550
    },
    {
      "epoch": 0.06010231703972239,
      "grad_norm": 0.18225054442882538,
      "learning_rate": 0.001,
      "loss": 2.7675,
      "num_input_tokens_seen": 3303014400,
      "step": 12600
    },
    {
      "epoch": 0.06034081829781652,
      "grad_norm": 0.16542398929595947,
      "learning_rate": 0.001,
      "loss": 2.7563,
      "num_input_tokens_seen": 3316121600,
      "step": 12650
    },
    {
      "epoch": 0.06057931955591066,
      "grad_norm": 0.1765596568584442,
      "learning_rate": 0.001,
      "loss": 2.7807,
      "num_input_tokens_seen": 3329228800,
      "step": 12700
    },
    {
      "epoch": 0.060817820814004796,
      "grad_norm": 0.17469234764575958,
      "learning_rate": 0.001,
      "loss": 2.7532,
      "num_input_tokens_seen": 3342336000,
      "step": 12750
    },
    {
      "epoch": 0.06105632207209893,
      "grad_norm": 0.1841767281293869,
      "learning_rate": 0.001,
      "loss": 2.7824,
      "num_input_tokens_seen": 3355443200,
      "step": 12800
    },
    {
      "epoch": 0.061294823330193066,
      "grad_norm": 0.1667831838130951,
      "learning_rate": 0.001,
      "loss": 2.7648,
      "num_input_tokens_seen": 3368550400,
      "step": 12850
    },
    {
      "epoch": 0.061533324588287204,
      "grad_norm": 0.16561101377010345,
      "learning_rate": 0.001,
      "loss": 2.7798,
      "num_input_tokens_seen": 3381657600,
      "step": 12900
    },
    {
      "epoch": 0.06177182584638134,
      "grad_norm": 0.17370566725730896,
      "learning_rate": 0.001,
      "loss": 2.7755,
      "num_input_tokens_seen": 3394764800,
      "step": 12950
    },
    {
      "epoch": 0.062010327104475474,
      "grad_norm": 0.16871176660060883,
      "learning_rate": 0.001,
      "loss": 2.7676,
      "num_input_tokens_seen": 3407872000,
      "step": 13000
    },
    {
      "epoch": 0.062010327104475474,
      "eval_loss": 2.653367757797241,
      "eval_runtime": 50.8399,
      "eval_samples_per_second": 98.348,
      "eval_steps_per_second": 24.587,
      "num_input_tokens_seen": 3407872000,
      "step": 13000
    },
    {
      "epoch": 0.06224882836256961,
      "grad_norm": 0.17592230439186096,
      "learning_rate": 0.001,
      "loss": 2.7639,
      "num_input_tokens_seen": 3420979200,
      "step": 13050
    },
    {
      "epoch": 0.06248732962066375,
      "grad_norm": 0.1640375405550003,
      "learning_rate": 0.001,
      "loss": 2.7785,
      "num_input_tokens_seen": 3434086400,
      "step": 13100
    },
    {
      "epoch": 0.06272583087875788,
      "grad_norm": 0.16389331221580505,
      "learning_rate": 0.001,
      "loss": 2.7475,
      "num_input_tokens_seen": 3447193600,
      "step": 13150
    },
    {
      "epoch": 0.06296433213685203,
      "grad_norm": 0.1733655482530594,
      "learning_rate": 0.001,
      "loss": 2.7538,
      "num_input_tokens_seen": 3460300800,
      "step": 13200
    },
    {
      "epoch": 0.06320283339494616,
      "grad_norm": 0.19206473231315613,
      "learning_rate": 0.001,
      "loss": 2.7819,
      "num_input_tokens_seen": 3473408000,
      "step": 13250
    },
    {
      "epoch": 0.06344133465304029,
      "grad_norm": 0.1841450184583664,
      "learning_rate": 0.001,
      "loss": 2.7701,
      "num_input_tokens_seen": 3486515200,
      "step": 13300
    },
    {
      "epoch": 0.06367983591113444,
      "grad_norm": 0.1701631247997284,
      "learning_rate": 0.001,
      "loss": 2.7587,
      "num_input_tokens_seen": 3499622400,
      "step": 13350
    },
    {
      "epoch": 0.06391833716922857,
      "grad_norm": 0.17068499326705933,
      "learning_rate": 0.001,
      "loss": 2.7589,
      "num_input_tokens_seen": 3512729600,
      "step": 13400
    },
    {
      "epoch": 0.0641568384273227,
      "grad_norm": 0.17927715182304382,
      "learning_rate": 0.001,
      "loss": 2.764,
      "num_input_tokens_seen": 3525836800,
      "step": 13450
    },
    {
      "epoch": 0.06439533968541684,
      "grad_norm": 0.19105768203735352,
      "learning_rate": 0.001,
      "loss": 2.7593,
      "num_input_tokens_seen": 3538944000,
      "step": 13500
    },
    {
      "epoch": 0.06439533968541684,
      "eval_loss": 2.6473631858825684,
      "eval_runtime": 51.0846,
      "eval_samples_per_second": 97.877,
      "eval_steps_per_second": 24.469,
      "num_input_tokens_seen": 3538944000,
      "step": 13500
    },
    {
      "epoch": 0.06463384094351098,
      "grad_norm": 0.17262668907642365,
      "learning_rate": 0.001,
      "loss": 2.7522,
      "num_input_tokens_seen": 3552051200,
      "step": 13550
    },
    {
      "epoch": 0.06487234220160511,
      "grad_norm": 0.16810455918312073,
      "learning_rate": 0.001,
      "loss": 2.7664,
      "num_input_tokens_seen": 3565158400,
      "step": 13600
    },
    {
      "epoch": 0.06511084345969925,
      "grad_norm": 0.17312487959861755,
      "learning_rate": 0.001,
      "loss": 2.7557,
      "num_input_tokens_seen": 3578265600,
      "step": 13650
    },
    {
      "epoch": 0.06534934471779338,
      "grad_norm": 0.16985322535037994,
      "learning_rate": 0.001,
      "loss": 2.7449,
      "num_input_tokens_seen": 3591372800,
      "step": 13700
    },
    {
      "epoch": 0.06558784597588753,
      "grad_norm": 0.1812393069267273,
      "learning_rate": 0.001,
      "loss": 2.749,
      "num_input_tokens_seen": 3604480000,
      "step": 13750
    },
    {
      "epoch": 0.06582634723398166,
      "grad_norm": 0.183237224817276,
      "learning_rate": 0.001,
      "loss": 2.7637,
      "num_input_tokens_seen": 3617587200,
      "step": 13800
    },
    {
      "epoch": 0.06606484849207579,
      "grad_norm": 0.17770566046237946,
      "learning_rate": 0.001,
      "loss": 2.7602,
      "num_input_tokens_seen": 3630694400,
      "step": 13850
    },
    {
      "epoch": 0.06630334975016994,
      "grad_norm": 0.1678437739610672,
      "learning_rate": 0.001,
      "loss": 2.76,
      "num_input_tokens_seen": 3643801600,
      "step": 13900
    },
    {
      "epoch": 0.06654185100826407,
      "grad_norm": 0.16213107109069824,
      "learning_rate": 0.001,
      "loss": 2.7467,
      "num_input_tokens_seen": 3656908800,
      "step": 13950
    },
    {
      "epoch": 0.0667803522663582,
      "grad_norm": 0.17652907967567444,
      "learning_rate": 0.001,
      "loss": 2.7516,
      "num_input_tokens_seen": 3670016000,
| "step": 14000 |
| }, |
| { |
| "epoch": 0.0667803522663582, |
| "eval_loss": 2.6438815593719482, |
| "eval_runtime": 50.3233, |
| "eval_samples_per_second": 99.358, |
| "eval_steps_per_second": 24.839, |
| "num_input_tokens_seen": 3670016000, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.06701885352445235, |
| "grad_norm": 0.1785530298948288, |
| "learning_rate": 0.001, |
| "loss": 2.7475, |
| "num_input_tokens_seen": 3683123200, |
| "step": 14050 |
| }, |
| { |
| "epoch": 0.06725735478254648, |
| "grad_norm": 0.15644113719463348, |
| "learning_rate": 0.001, |
| "loss": 2.7541, |
| "num_input_tokens_seen": 3696230400, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.06749585604064061, |
| "grad_norm": 0.183272585272789, |
| "learning_rate": 0.001, |
| "loss": 2.7513, |
| "num_input_tokens_seen": 3709337600, |
| "step": 14150 |
| }, |
| { |
| "epoch": 0.06773435729873475, |
| "grad_norm": 0.17523212730884552, |
| "learning_rate": 0.001, |
| "loss": 2.7649, |
| "num_input_tokens_seen": 3722444800, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.06797285855682889, |
| "grad_norm": 0.1778247356414795, |
| "learning_rate": 0.001, |
| "loss": 2.7457, |
| "num_input_tokens_seen": 3735552000, |
| "step": 14250 |
| }, |
| { |
| "epoch": 0.06821135981492302, |
| "grad_norm": 0.18277810513973236, |
| "learning_rate": 0.001, |
| "loss": 2.7477, |
| "num_input_tokens_seen": 3748659200, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.06844986107301716, |
| "grad_norm": 0.17541366815567017, |
| "learning_rate": 0.001, |
| "loss": 2.7419, |
| "num_input_tokens_seen": 3761766400, |
| "step": 14350 |
| }, |
| { |
| "epoch": 0.0686883623311113, |
| "grad_norm": 0.1701425164937973, |
| "learning_rate": 0.001, |
| "loss": 2.7437, |
| "num_input_tokens_seen": 3774873600, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.06892686358920544, |
| "grad_norm": 0.16685517132282257, |
| "learning_rate": 0.001, |
| "loss": 2.7357, |
| "num_input_tokens_seen": 3787980800, |
| "step": 14450 |
| }, |
| { |
| "epoch": 0.06916536484729957, |
| "grad_norm": 0.1738167405128479, |
| "learning_rate": 0.001, |
| "loss": 2.7475, |
| "num_input_tokens_seen": 3801088000, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.06916536484729957, |
| "eval_loss": 2.635887622833252, |
| "eval_runtime": 50.4516, |
| "eval_samples_per_second": 99.105, |
| "eval_steps_per_second": 24.776, |
| "num_input_tokens_seen": 3801088000, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.0694038661053937, |
| "grad_norm": 0.18279027938842773, |
| "learning_rate": 0.001, |
| "loss": 2.7521, |
| "num_input_tokens_seen": 3814195200, |
| "step": 14550 |
| }, |
| { |
| "epoch": 0.06964236736348785, |
| "grad_norm": 0.1878173053264618, |
| "learning_rate": 0.001, |
| "loss": 2.7401, |
| "num_input_tokens_seen": 3827302400, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.06988086862158198, |
| "grad_norm": 0.17670077085494995, |
| "learning_rate": 0.001, |
| "loss": 2.7513, |
| "num_input_tokens_seen": 3840409600, |
| "step": 14650 |
| }, |
| { |
| "epoch": 0.07011936987967611, |
| "grad_norm": 0.17042580246925354, |
| "learning_rate": 0.001, |
| "loss": 2.7383, |
| "num_input_tokens_seen": 3853516800, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.07035787113777026, |
| "grad_norm": 0.17193050682544708, |
| "learning_rate": 0.001, |
| "loss": 2.7408, |
| "num_input_tokens_seen": 3866624000, |
| "step": 14750 |
| }, |
| { |
| "epoch": 0.07059637239586439, |
| "grad_norm": 0.16576342284679413, |
| "learning_rate": 0.001, |
| "loss": 2.7312, |
| "num_input_tokens_seen": 3879731200, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.07083487365395852, |
| "grad_norm": 0.18535619974136353, |
| "learning_rate": 0.001, |
| "loss": 2.756, |
| "num_input_tokens_seen": 3892838400, |
| "step": 14850 |
| }, |
| { |
| "epoch": 0.07107337491205266, |
| "grad_norm": 0.1729886531829834, |
| "learning_rate": 0.001, |
| "loss": 2.751, |
| "num_input_tokens_seen": 3905945600, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.0713118761701468, |
| "grad_norm": 0.16047705709934235, |
| "learning_rate": 0.001, |
| "loss": 2.7361, |
| "num_input_tokens_seen": 3919052800, |
| "step": 14950 |
| }, |
| { |
| "epoch": 0.07155037742824093, |
| "grad_norm": 0.17655611038208008, |
| "learning_rate": 0.001, |
| "loss": 2.7471, |
| "num_input_tokens_seen": 3932160000, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.07155037742824093, |
| "eval_loss": 2.6311256885528564, |
| "eval_runtime": 51.0361, |
| "eval_samples_per_second": 97.97, |
| "eval_steps_per_second": 24.492, |
| "num_input_tokens_seen": 3932160000, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.07178887868633507, |
| "grad_norm": 0.19243250787258148, |
| "learning_rate": 0.001, |
| "loss": 2.7551, |
| "num_input_tokens_seen": 3945267200, |
| "step": 15050 |
| }, |
| { |
| "epoch": 0.0720273799444292, |
| "grad_norm": 0.17328651249408722, |
| "learning_rate": 0.001, |
| "loss": 2.7346, |
| "num_input_tokens_seen": 3958374400, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.07226588120252335, |
| "grad_norm": 0.16357752680778503, |
| "learning_rate": 0.001, |
| "loss": 2.7523, |
| "num_input_tokens_seen": 3971481600, |
| "step": 15150 |
| }, |
| { |
| "epoch": 0.07250438246061748, |
| "grad_norm": 0.1726733148097992, |
| "learning_rate": 0.001, |
| "loss": 2.725, |
| "num_input_tokens_seen": 3984588800, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.07274288371871161, |
| "grad_norm": 0.16912953555583954, |
| "learning_rate": 0.001, |
| "loss": 2.738, |
| "num_input_tokens_seen": 3997696000, |
| "step": 15250 |
| }, |
| { |
| "epoch": 0.07298138497680576, |
| "grad_norm": 0.19751113653182983, |
| "learning_rate": 0.001, |
| "loss": 2.7532, |
| "num_input_tokens_seen": 4010803200, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.07321988623489989, |
| "grad_norm": 0.16762405633926392, |
| "learning_rate": 0.001, |
| "loss": 2.7413, |
| "num_input_tokens_seen": 4023910400, |
| "step": 15350 |
| }, |
| { |
| "epoch": 0.07345838749299402, |
| "grad_norm": 0.18106459081172943, |
| "learning_rate": 0.001, |
| "loss": 2.7411, |
| "num_input_tokens_seen": 4037017600, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.07369688875108817, |
| "grad_norm": 0.184165820479393, |
| "learning_rate": 0.001, |
| "loss": 2.7449, |
| "num_input_tokens_seen": 4050124800, |
| "step": 15450 |
| }, |
| { |
| "epoch": 0.0739353900091823, |
| "grad_norm": 0.16832765936851501, |
| "learning_rate": 0.001, |
| "loss": 2.7442, |
| "num_input_tokens_seen": 4063232000, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.0739353900091823, |
| "eval_loss": 2.6253247261047363, |
| "eval_runtime": 50.6964, |
| "eval_samples_per_second": 98.626, |
| "eval_steps_per_second": 24.657, |
| "num_input_tokens_seen": 4063232000, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.07417389126727643, |
| "grad_norm": 0.1663861721754074, |
| "learning_rate": 0.001, |
| "loss": 2.7461, |
| "num_input_tokens_seen": 4076339200, |
| "step": 15550 |
| }, |
| { |
| "epoch": 0.07441239252537057, |
| "grad_norm": 0.17217928171157837, |
| "learning_rate": 0.001, |
| "loss": 2.7394, |
| "num_input_tokens_seen": 4089446400, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.0746508937834647, |
| "grad_norm": 0.17169134318828583, |
| "learning_rate": 0.001, |
| "loss": 2.7474, |
| "num_input_tokens_seen": 4102553600, |
| "step": 15650 |
| }, |
| { |
| "epoch": 0.07488939504155885, |
| "grad_norm": 0.17074033617973328, |
| "learning_rate": 0.001, |
| "loss": 2.7405, |
| "num_input_tokens_seen": 4115660800, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.07512789629965298, |
| "grad_norm": 0.20199435949325562, |
| "learning_rate": 0.001, |
| "loss": 2.7412, |
| "num_input_tokens_seen": 4128768000, |
| "step": 15750 |
| }, |
| { |
| "epoch": 0.07536639755774711, |
| "grad_norm": 0.17569150030612946, |
| "learning_rate": 0.001, |
| "loss": 2.7279, |
| "num_input_tokens_seen": 4141875200, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.07560489881584126, |
| "grad_norm": 0.1753721386194229, |
| "learning_rate": 0.001, |
| "loss": 2.7442, |
| "num_input_tokens_seen": 4154982400, |
| "step": 15850 |
| }, |
| { |
| "epoch": 0.07584340007393539, |
| "grad_norm": 0.17356647551059723, |
| "learning_rate": 0.001, |
| "loss": 2.7447, |
| "num_input_tokens_seen": 4168089600, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.07608190133202952, |
| "grad_norm": 0.16931213438510895, |
| "learning_rate": 0.001, |
| "loss": 2.7194, |
| "num_input_tokens_seen": 4181196800, |
| "step": 15950 |
| }, |
| { |
| "epoch": 0.07632040259012367, |
| "grad_norm": 0.2109583616256714, |
| "learning_rate": 0.001, |
| "loss": 2.7271, |
| "num_input_tokens_seen": 4194304000, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.07632040259012367, |
| "eval_loss": 2.6222195625305176, |
| "eval_runtime": 50.2121, |
| "eval_samples_per_second": 99.578, |
| "eval_steps_per_second": 24.894, |
| "num_input_tokens_seen": 4194304000, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.0765589038482178, |
| "grad_norm": 0.1729741096496582, |
| "learning_rate": 0.001, |
| "loss": 2.7273, |
| "num_input_tokens_seen": 4207411200, |
| "step": 16050 |
| }, |
| { |
| "epoch": 0.07679740510631193, |
| "grad_norm": 0.178414985537529, |
| "learning_rate": 0.001, |
| "loss": 2.7119, |
| "num_input_tokens_seen": 4220518400, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.07703590636440608, |
| "grad_norm": 0.16985353827476501, |
| "learning_rate": 0.001, |
| "loss": 2.7329, |
| "num_input_tokens_seen": 4233625600, |
| "step": 16150 |
| }, |
| { |
| "epoch": 0.07727440762250021, |
| "grad_norm": 0.1792905330657959, |
| "learning_rate": 0.001, |
| "loss": 2.7331, |
| "num_input_tokens_seen": 4246732800, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.07751290888059434, |
| "grad_norm": 0.17052733898162842, |
| "learning_rate": 0.001, |
| "loss": 2.7438, |
| "num_input_tokens_seen": 4259840000, |
| "step": 16250 |
| }, |
| { |
| "epoch": 0.07775141013868848, |
| "grad_norm": 0.18520629405975342, |
| "learning_rate": 0.001, |
| "loss": 2.7292, |
| "num_input_tokens_seen": 4272947200, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.07798991139678262, |
| "grad_norm": 0.18607158958911896, |
| "learning_rate": 0.001, |
| "loss": 2.7305, |
| "num_input_tokens_seen": 4286054400, |
| "step": 16350 |
| }, |
| { |
| "epoch": 0.07822841265487676, |
| "grad_norm": 0.1774805337190628, |
| "learning_rate": 0.001, |
| "loss": 2.7237, |
| "num_input_tokens_seen": 4299161600, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.07846691391297089, |
| "grad_norm": 0.17118123173713684, |
| "learning_rate": 0.001, |
| "loss": 2.736, |
| "num_input_tokens_seen": 4312268800, |
| "step": 16450 |
| }, |
| { |
| "epoch": 0.07870541517106502, |
| "grad_norm": 0.18550898134708405, |
| "learning_rate": 0.001, |
| "loss": 2.7237, |
| "num_input_tokens_seen": 4325376000, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.07870541517106502, |
| "eval_loss": 2.6178503036499023, |
| "eval_runtime": 50.4959, |
| "eval_samples_per_second": 99.018, |
| "eval_steps_per_second": 24.754, |
| "num_input_tokens_seen": 4325376000, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.07894391642915917, |
| "grad_norm": 0.1839551031589508, |
| "learning_rate": 0.001, |
| "loss": 2.7312, |
| "num_input_tokens_seen": 4338483200, |
| "step": 16550 |
| }, |
| { |
| "epoch": 0.0791824176872533, |
| "grad_norm": 0.17430303990840912, |
| "learning_rate": 0.001, |
| "loss": 2.7138, |
| "num_input_tokens_seen": 4351590400, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.07942091894534743, |
| "grad_norm": 0.17208248376846313, |
| "learning_rate": 0.001, |
| "loss": 2.7459, |
| "num_input_tokens_seen": 4364697600, |
| "step": 16650 |
| }, |
| { |
| "epoch": 0.07965942020344158, |
| "grad_norm": 0.16932401061058044, |
| "learning_rate": 0.001, |
| "loss": 2.7358, |
| "num_input_tokens_seen": 4377804800, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.07989792146153571, |
| "grad_norm": 0.17707890272140503, |
| "learning_rate": 0.001, |
| "loss": 2.7169, |
| "num_input_tokens_seen": 4390912000, |
| "step": 16750 |
| }, |
| { |
| "epoch": 0.08013642271962984, |
| "grad_norm": 0.1669357717037201, |
| "learning_rate": 0.001, |
| "loss": 2.7296, |
| "num_input_tokens_seen": 4404019200, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.08037492397772399, |
| "grad_norm": 0.19266557693481445, |
| "learning_rate": 0.001, |
| "loss": 2.7187, |
| "num_input_tokens_seen": 4417126400, |
| "step": 16850 |
| }, |
| { |
| "epoch": 0.08061342523581812, |
| "grad_norm": 0.17670407891273499, |
| "learning_rate": 0.001, |
| "loss": 2.7339, |
| "num_input_tokens_seen": 4430233600, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.08085192649391225, |
| "grad_norm": 0.17866192758083344, |
| "learning_rate": 0.001, |
| "loss": 2.7167, |
| "num_input_tokens_seen": 4443340800, |
| "step": 16950 |
| }, |
| { |
| "epoch": 0.0810904277520064, |
| "grad_norm": 0.18247559666633606, |
| "learning_rate": 0.001, |
| "loss": 2.7151, |
| "num_input_tokens_seen": 4456448000, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.0810904277520064, |
| "eval_loss": 2.6127233505249023, |
| "eval_runtime": 50.7555, |
| "eval_samples_per_second": 98.512, |
| "eval_steps_per_second": 24.628, |
| "num_input_tokens_seen": 4456448000, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.08132892901010053, |
| "grad_norm": 0.17702773213386536, |
| "learning_rate": 0.001, |
| "loss": 2.7303, |
| "num_input_tokens_seen": 4469555200, |
| "step": 17050 |
| }, |
| { |
| "epoch": 0.08156743026819467, |
| "grad_norm": 0.18900151550769806, |
| "learning_rate": 0.001, |
| "loss": 2.7286, |
| "num_input_tokens_seen": 4482662400, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.0818059315262888, |
| "grad_norm": 0.18566136062145233, |
| "learning_rate": 0.001, |
| "loss": 2.7304, |
| "num_input_tokens_seen": 4495769600, |
| "step": 17150 |
| }, |
| { |
| "epoch": 0.08204443278438293, |
| "grad_norm": 0.1759686917066574, |
| "learning_rate": 0.001, |
| "loss": 2.7179, |
| "num_input_tokens_seen": 4508876800, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.08228293404247708, |
| "grad_norm": 0.15799184143543243, |
| "learning_rate": 0.001, |
| "loss": 2.7367, |
| "num_input_tokens_seen": 4521984000, |
| "step": 17250 |
| }, |
| { |
| "epoch": 0.08252143530057121, |
| "grad_norm": 0.18740351498126984, |
| "learning_rate": 0.001, |
| "loss": 2.72, |
| "num_input_tokens_seen": 4535091200, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.08275993655866534, |
| "grad_norm": 0.17688381671905518, |
| "learning_rate": 0.001, |
| "loss": 2.7115, |
| "num_input_tokens_seen": 4548198400, |
| "step": 17350 |
| }, |
| { |
| "epoch": 0.08299843781675949, |
| "grad_norm": 0.1807299256324768, |
| "learning_rate": 0.001, |
| "loss": 2.7346, |
| "num_input_tokens_seen": 4561305600, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.08323693907485362, |
| "grad_norm": 0.17570430040359497, |
| "learning_rate": 0.001, |
| "loss": 2.7141, |
| "num_input_tokens_seen": 4574412800, |
| "step": 17450 |
| }, |
| { |
| "epoch": 0.08347544033294775, |
| "grad_norm": 0.16912159323692322, |
| "learning_rate": 0.001, |
| "loss": 2.7164, |
| "num_input_tokens_seen": 4587520000, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.08347544033294775, |
| "eval_loss": 2.6085972785949707, |
| "eval_runtime": 50.619, |
| "eval_samples_per_second": 98.777, |
| "eval_steps_per_second": 24.694, |
| "num_input_tokens_seen": 4587520000, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.0837139415910419, |
| "grad_norm": 0.17684704065322876, |
| "learning_rate": 0.001, |
| "loss": 2.7293, |
| "num_input_tokens_seen": 4600627200, |
| "step": 17550 |
| }, |
| { |
| "epoch": 0.08395244284913603, |
| "grad_norm": 0.18020550906658173, |
| "learning_rate": 0.001, |
| "loss": 2.7124, |
| "num_input_tokens_seen": 4613734400, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.08419094410723016, |
| "grad_norm": 0.17311082780361176, |
| "learning_rate": 0.001, |
| "loss": 2.7047, |
| "num_input_tokens_seen": 4626841600, |
| "step": 17650 |
| }, |
| { |
| "epoch": 0.0844294453653243, |
| "grad_norm": 0.17366532981395721, |
| "learning_rate": 0.001, |
| "loss": 2.7316, |
| "num_input_tokens_seen": 4639948800, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.08466794662341844, |
| "grad_norm": 0.16526220738887787, |
| "learning_rate": 0.001, |
| "loss": 2.7212, |
| "num_input_tokens_seen": 4653056000, |
| "step": 17750 |
| }, |
| { |
| "epoch": 0.08490644788151258, |
| "grad_norm": 0.1746092140674591, |
| "learning_rate": 0.001, |
| "loss": 2.7288, |
| "num_input_tokens_seen": 4666163200, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.08514494913960671, |
| "grad_norm": 0.19404995441436768, |
| "learning_rate": 0.001, |
| "loss": 2.7129, |
| "num_input_tokens_seen": 4679270400, |
| "step": 17850 |
| }, |
| { |
| "epoch": 0.08538345039770084, |
| "grad_norm": 0.18850015103816986, |
| "learning_rate": 0.001, |
| "loss": 2.7161, |
| "num_input_tokens_seen": 4692377600, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.08562195165579499, |
| "grad_norm": 0.19126516580581665, |
| "learning_rate": 0.001, |
| "loss": 2.7206, |
| "num_input_tokens_seen": 4705484800, |
| "step": 17950 |
| }, |
| { |
| "epoch": 0.08586045291388912, |
| "grad_norm": 0.1802307665348053, |
| "learning_rate": 0.001, |
| "loss": 2.7163, |
| "num_input_tokens_seen": 4718592000, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.08586045291388912, |
| "eval_loss": 2.603686809539795, |
| "eval_runtime": 50.6488, |
| "eval_samples_per_second": 98.719, |
| "eval_steps_per_second": 24.68, |
| "num_input_tokens_seen": 4718592000, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.08609895417198325, |
| "grad_norm": 0.18276441097259521, |
| "learning_rate": 0.001, |
| "loss": 2.7299, |
| "num_input_tokens_seen": 4731699200, |
| "step": 18050 |
| }, |
| { |
| "epoch": 0.0863374554300774, |
| "grad_norm": 0.17280028760433197, |
| "learning_rate": 0.001, |
| "loss": 2.7175, |
| "num_input_tokens_seen": 4744806400, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.08657595668817153, |
| "grad_norm": 0.17224080860614777, |
| "learning_rate": 0.001, |
| "loss": 2.7089, |
| "num_input_tokens_seen": 4757913600, |
| "step": 18150 |
| }, |
| { |
| "epoch": 0.08681445794626566, |
| "grad_norm": 0.17205391824245453, |
| "learning_rate": 0.001, |
| "loss": 2.7072, |
| "num_input_tokens_seen": 4771020800, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.0870529592043598, |
| "grad_norm": 0.1829432249069214, |
| "learning_rate": 0.001, |
| "loss": 2.6959, |
| "num_input_tokens_seen": 4784128000, |
| "step": 18250 |
| }, |
| { |
| "epoch": 0.08729146046245394, |
| "grad_norm": 0.1669514924287796, |
| "learning_rate": 0.001, |
| "loss": 2.7209, |
| "num_input_tokens_seen": 4797235200, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.08752996172054807, |
| "grad_norm": 0.18273359537124634, |
| "learning_rate": 0.001, |
| "loss": 2.6935, |
| "num_input_tokens_seen": 4810342400, |
| "step": 18350 |
| }, |
| { |
| "epoch": 0.08776846297864221, |
| "grad_norm": 0.21061965823173523, |
| "learning_rate": 0.001, |
| "loss": 2.7204, |
| "num_input_tokens_seen": 4823449600, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.08800696423673635, |
| "grad_norm": 0.17710614204406738, |
| "learning_rate": 0.001, |
| "loss": 2.7231, |
| "num_input_tokens_seen": 4836556800, |
| "step": 18450 |
| }, |
| { |
| "epoch": 0.08824546549483049, |
| "grad_norm": 0.17370648682117462, |
| "learning_rate": 0.001, |
| "loss": 2.7064, |
| "num_input_tokens_seen": 4849664000, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.08824546549483049, |
| "eval_loss": 2.600076913833618, |
| "eval_runtime": 50.3596, |
| "eval_samples_per_second": 99.286, |
| "eval_steps_per_second": 24.822, |
| "num_input_tokens_seen": 4849664000, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.08848396675292462, |
| "grad_norm": 0.19398675858974457, |
| "learning_rate": 0.001, |
| "loss": 2.7118, |
| "num_input_tokens_seen": 4862771200, |
| "step": 18550 |
| }, |
| { |
| "epoch": 0.08872246801101875, |
| "grad_norm": 0.18522043526172638, |
| "learning_rate": 0.001, |
| "loss": 2.6998, |
| "num_input_tokens_seen": 4875878400, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.0889609692691129, |
| "grad_norm": 0.2682952880859375, |
| "learning_rate": 0.001, |
| "loss": 2.7057, |
| "num_input_tokens_seen": 4888985600, |
| "step": 18650 |
| }, |
| { |
| "epoch": 0.08919947052720703, |
| "grad_norm": 0.18555712699890137, |
| "learning_rate": 0.001, |
| "loss": 2.7127, |
| "num_input_tokens_seen": 4902092800, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.08943797178530116, |
| "grad_norm": 0.1940859854221344, |
| "learning_rate": 0.001, |
| "loss": 2.7054, |
| "num_input_tokens_seen": 4915200000, |
| "step": 18750 |
| }, |
| { |
| "epoch": 0.08967647304339531, |
| "grad_norm": 0.1800539344549179, |
| "learning_rate": 0.001, |
| "loss": 2.702, |
| "num_input_tokens_seen": 4928307200, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.08991497430148944, |
| "grad_norm": 0.19734695553779602, |
| "learning_rate": 0.001, |
| "loss": 2.7157, |
| "num_input_tokens_seen": 4941414400, |
| "step": 18850 |
| }, |
| { |
| "epoch": 0.09015347555958357, |
| "grad_norm": 0.16387026011943817, |
| "learning_rate": 0.001, |
| "loss": 2.7183, |
| "num_input_tokens_seen": 4954521600, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.09039197681767772, |
| "grad_norm": 0.19447770714759827, |
| "learning_rate": 0.001, |
| "loss": 2.7154, |
| "num_input_tokens_seen": 4967628800, |
| "step": 18950 |
| }, |
| { |
| "epoch": 0.09063047807577185, |
| "grad_norm": 0.17366571724414825, |
| "learning_rate": 0.001, |
| "loss": 2.6996, |
| "num_input_tokens_seen": 4980736000, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.09063047807577185, |
| "eval_loss": 2.5983569622039795, |
| "eval_runtime": 50.8055, |
| "eval_samples_per_second": 98.415, |
| "eval_steps_per_second": 24.604, |
| "num_input_tokens_seen": 4980736000, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.09086897933386599, |
| "grad_norm": 0.1770928055047989, |
| "learning_rate": 0.001, |
| "loss": 2.7171, |
| "num_input_tokens_seen": 4993843200, |
| "step": 19050 |
| }, |
| { |
| "epoch": 0.09110748059196012, |
| "grad_norm": 0.18122689425945282, |
| "learning_rate": 0.001, |
| "loss": 2.7064, |
| "num_input_tokens_seen": 5006950400, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.09134598185005426, |
| "grad_norm": 0.19320747256278992, |
| "learning_rate": 0.001, |
| "loss": 2.7105, |
| "num_input_tokens_seen": 5020057600, |
| "step": 19150 |
| }, |
| { |
| "epoch": 0.0915844831081484, |
| "grad_norm": 0.19556616246700287, |
| "learning_rate": 0.001, |
| "loss": 2.7018, |
| "num_input_tokens_seen": 5033164800, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.09182298436624253, |
| "grad_norm": 0.18251653015613556, |
| "learning_rate": 0.001, |
| "loss": 2.7067, |
| "num_input_tokens_seen": 5046272000, |
| "step": 19250 |
| }, |
| { |
| "epoch": 0.09206148562433666, |
| "grad_norm": 0.17226757109165192, |
| "learning_rate": 0.001, |
| "loss": 2.6803, |
| "num_input_tokens_seen": 5059379200, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.09229998688243081, |
| "grad_norm": 0.18007858097553253, |
| "learning_rate": 0.001, |
| "loss": 2.6998, |
| "num_input_tokens_seen": 5072486400, |
| "step": 19350 |
| }, |
| { |
| "epoch": 0.09253848814052494, |
| "grad_norm": 0.1664605736732483, |
| "learning_rate": 0.001, |
| "loss": 2.6985, |
| "num_input_tokens_seen": 5085593600, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.09277698939861907, |
| "grad_norm": 0.17898677289485931, |
| "learning_rate": 0.001, |
| "loss": 2.7034, |
| "num_input_tokens_seen": 5098700800, |
| "step": 19450 |
| }, |
| { |
| "epoch": 0.09301549065671322, |
| "grad_norm": 0.16403160989284515, |
| "learning_rate": 0.001, |
| "loss": 2.7012, |
| "num_input_tokens_seen": 5111808000, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.09301549065671322, |
| "eval_loss": 2.594251871109009, |
| "eval_runtime": 50.1924, |
| "eval_samples_per_second": 99.617, |
| "eval_steps_per_second": 24.904, |
| "num_input_tokens_seen": 5111808000, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.09325399191480735, |
| "grad_norm": 0.17973001301288605, |
| "learning_rate": 0.001, |
| "loss": 2.7039, |
| "num_input_tokens_seen": 5124915200, |
| "step": 19550 |
| }, |
| { |
| "epoch": 0.09349249317290148, |
| "grad_norm": 0.1667868047952652, |
| "learning_rate": 0.001, |
| "loss": 2.7038, |
| "num_input_tokens_seen": 5138022400, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.09373099443099563, |
| "grad_norm": 0.18338319659233093, |
| "learning_rate": 0.001, |
| "loss": 2.7138, |
| "num_input_tokens_seen": 5151129600, |
| "step": 19650 |
| }, |
| { |
| "epoch": 0.09396949568908976, |
| "grad_norm": 0.17962965369224548, |
| "learning_rate": 0.001, |
| "loss": 2.6994, |
| "num_input_tokens_seen": 5164236800, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.0942079969471839, |
| "grad_norm": 0.17233812808990479, |
| "learning_rate": 0.001, |
| "loss": 2.7073, |
| "num_input_tokens_seen": 5177344000, |
| "step": 19750 |
| }, |
| { |
| "epoch": 0.09444649820527803, |
| "grad_norm": 0.16720129549503326, |
| "learning_rate": 0.001, |
| "loss": 2.7146, |
| "num_input_tokens_seen": 5190451200, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.09468499946337217, |
| "grad_norm": 0.1732376664876938, |
| "learning_rate": 0.001, |
| "loss": 2.7126, |
| "num_input_tokens_seen": 5203558400, |
| "step": 19850 |
| }, |
| { |
| "epoch": 0.09492350072146631, |
| "grad_norm": 0.17245380580425262, |
| "learning_rate": 0.001, |
| "loss": 2.7054, |
| "num_input_tokens_seen": 5216665600, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.09516200197956044, |
| "grad_norm": 0.17415107786655426, |
| "learning_rate": 0.001, |
| "loss": 2.7027, |
| "num_input_tokens_seen": 5229772800, |
| "step": 19950 |
| }, |
| { |
| "epoch": 0.09540050323765457, |
| "grad_norm": 0.1747124344110489, |
| "learning_rate": 0.001, |
| "loss": 2.6975, |
| "num_input_tokens_seen": 5242880000, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.09540050323765457, |
| "eval_loss": 2.5900332927703857, |
| "eval_runtime": 50.5898, |
| "eval_samples_per_second": 98.834, |
| "eval_steps_per_second": 24.709, |
| "num_input_tokens_seen": 5242880000, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.09563900449574872, |
| "grad_norm": 0.17750214040279388, |
| "learning_rate": 0.001, |
| "loss": 2.7, |
| "num_input_tokens_seen": 5255987200, |
| "step": 20050 |
| }, |
| { |
| "epoch": 0.09587750575384285, |
| "grad_norm": 0.16490615904331207, |
| "learning_rate": 0.001, |
| "loss": 2.7188, |
| "num_input_tokens_seen": 5269094400, |
| "step": 20100 |
| }, |
| { |
| "epoch": 0.09611600701193698, |
| "grad_norm": 0.20347309112548828, |
| "learning_rate": 0.001, |
| "loss": 2.7034, |
| "num_input_tokens_seen": 5282201600, |
| "step": 20150 |
| }, |
| { |
| "epoch": 0.09635450827003113, |
| "grad_norm": 0.19717667996883392, |
| "learning_rate": 0.001, |
| "loss": 2.6864, |
| "num_input_tokens_seen": 5295308800, |
| "step": 20200 |
| }, |
| { |
| "epoch": 0.09659300952812526, |
| "grad_norm": 0.17054997384548187, |
| "learning_rate": 0.001, |
| "loss": 2.7068, |
| "num_input_tokens_seen": 5308416000, |
| "step": 20250 |
| }, |
| { |
| "epoch": 0.09683151078621939, |
| "grad_norm": 0.1771887093782425, |
| "learning_rate": 0.001, |
| "loss": 2.7037, |
| "num_input_tokens_seen": 5321523200, |
| "step": 20300 |
| }, |
| { |
| "epoch": 0.09707001204431354, |
| "grad_norm": 0.17556501924991608, |
| "learning_rate": 0.001, |
| "loss": 2.705, |
| "num_input_tokens_seen": 5334630400, |
| "step": 20350 |
| }, |
| { |
| "epoch": 0.09730851330240767, |
| "grad_norm": 0.1696256399154663, |
| "learning_rate": 0.001, |
| "loss": 2.7109, |
| "num_input_tokens_seen": 5347737600, |
| "step": 20400 |
| }, |
| { |
| "epoch": 0.09754701456050181, |
| "grad_norm": 0.18629619479179382, |
| "learning_rate": 0.001, |
| "loss": 2.7043, |
| "num_input_tokens_seen": 5360844800, |
| "step": 20450 |
| }, |
| { |
| "epoch": 0.09778551581859594, |
| "grad_norm": 0.18701018393039703, |
| "learning_rate": 0.001, |
| "loss": 2.7002, |
| "num_input_tokens_seen": 5373952000, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.09778551581859594, |
| "eval_loss": 2.587498903274536, |
| "eval_runtime": 51.1986, |
| "eval_samples_per_second": 97.659, |
| "eval_steps_per_second": 24.415, |
| "num_input_tokens_seen": 5373952000, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.09802401707669008, |
| "grad_norm": 0.1792842447757721, |
| "learning_rate": 0.001, |
| "loss": 2.7083, |
| "num_input_tokens_seen": 5387059200, |
| "step": 20550 |
| }, |
| { |
| "epoch": 0.09826251833478422, |
| "grad_norm": 0.18761058151721954, |
| "learning_rate": 0.001, |
| "loss": 2.6898, |
| "num_input_tokens_seen": 5400166400, |
| "step": 20600 |
| }, |
| { |
| "epoch": 0.09850101959287835, |
| "grad_norm": 0.1827591508626938, |
| "learning_rate": 0.001, |
| "loss": 2.6976, |
| "num_input_tokens_seen": 5413273600, |
| "step": 20650 |
| }, |
| { |
| "epoch": 0.09873952085097248, |
| "grad_norm": 0.16178373992443085, |
| "learning_rate": 0.001, |
| "loss": 2.7029, |
| "num_input_tokens_seen": 5426380800, |
| "step": 20700 |
| }, |
| { |
| "epoch": 0.09897802210906663, |
| "grad_norm": 0.1880313903093338, |
| "learning_rate": 0.001, |
| "loss": 2.6867, |
| "num_input_tokens_seen": 5439488000, |
| "step": 20750 |
| }, |
| { |
| "epoch": 0.09921652336716076, |
| "grad_norm": 0.17611584067344666, |
| "learning_rate": 0.001, |
| "loss": 2.6741, |
| "num_input_tokens_seen": 5452595200, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.09945502462525489, |
| "grad_norm": 0.17712561786174774, |
| "learning_rate": 0.001, |
| "loss": 2.7099, |
| "num_input_tokens_seen": 5465702400, |
| "step": 20850 |
| }, |
| { |
| "epoch": 0.09969352588334904, |
| "grad_norm": 0.18022434413433075, |
| "learning_rate": 0.001, |
| "loss": 2.6988, |
| "num_input_tokens_seen": 5478809600, |
| "step": 20900 |
| }, |
| { |
| "epoch": 0.09993202714144317, |
| "grad_norm": 0.17434161901474, |
| "learning_rate": 0.001, |
| "loss": 2.6869, |
| "num_input_tokens_seen": 5491916800, |
| "step": 20950 |
| }, |
| { |
| "epoch": 0.1001705283995373, |
| "grad_norm": 0.17802472412586212, |
| "learning_rate": 0.001, |
| "loss": 2.6935, |
| "num_input_tokens_seen": 5505024000, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.1001705283995373, |
| "eval_loss": 2.5838851928710938, |
| "eval_runtime": 50.1977, |
| "eval_samples_per_second": 99.606, |
| "eval_steps_per_second": 24.902, |
| "num_input_tokens_seen": 5505024000, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.10040902965763145, |
| "grad_norm": 0.1723284274339676, |
| "learning_rate": 0.001, |
| "loss": 2.694, |
| "num_input_tokens_seen": 5518131200, |
| "step": 21050 |
| }, |
| { |
| "epoch": 0.10064753091572558, |
| "grad_norm": 0.1627894937992096, |
| "learning_rate": 0.001, |
| "loss": 2.6866, |
| "num_input_tokens_seen": 5531238400, |
| "step": 21100 |
| }, |
| { |
| "epoch": 0.10088603217381972, |
| "grad_norm": 0.20949719846248627, |
| "learning_rate": 0.001, |
| "loss": 2.6915, |
| "num_input_tokens_seen": 5544345600, |
| "step": 21150 |
| }, |
| { |
| "epoch": 0.10112453343191385, |
| "grad_norm": 0.1980736404657364, |
| "learning_rate": 0.001, |
| "loss": 2.7076, |
| "num_input_tokens_seen": 5557452800, |
| "step": 21200 |
| }, |
| { |
| "epoch": 0.10136303469000799, |
| "grad_norm": 0.20961201190948486, |
| "learning_rate": 0.001, |
| "loss": 2.6978, |
| "num_input_tokens_seen": 5570560000, |
| "step": 21250 |
| }, |
| { |
| "epoch": 0.10160153594810213, |
| "grad_norm": 0.18137700855731964, |
| "learning_rate": 0.001, |
| "loss": 2.7029, |
| "num_input_tokens_seen": 5583667200, |
| "step": 21300 |
| }, |
| { |
| "epoch": 0.10184003720619626, |
| "grad_norm": 0.17235560715198517, |
| "learning_rate": 0.001, |
| "loss": 2.6979, |
| "num_input_tokens_seen": 5596774400, |
| "step": 21350 |
| }, |
| { |
| "epoch": 0.1020785384642904, |
| "grad_norm": 0.17818449437618256, |
| "learning_rate": 0.001, |
| "loss": 2.6987, |
| "num_input_tokens_seen": 5609881600, |
| "step": 21400 |
| }, |
| { |
| "epoch": 0.10231703972238454, |
| "grad_norm": 0.1798463761806488, |
| "learning_rate": 0.001, |
| "loss": 2.693, |
| "num_input_tokens_seen": 5622988800, |
| "step": 21450 |
| }, |
| { |
| "epoch": 0.10255554098047867, |
| "grad_norm": 0.19028444588184357, |
| "learning_rate": 0.001, |
| "loss": 2.7079, |
| "num_input_tokens_seen": 5636096000, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.10255554098047867, |
| "eval_loss": 2.5816380977630615, |
| "eval_runtime": 50.7808, |
| "eval_samples_per_second": 98.462, |
| "eval_steps_per_second": 24.616, |
| "num_input_tokens_seen": 5636096000, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.1027940422385728, |
| "grad_norm": 0.1831275075674057, |
| "learning_rate": 0.001, |
| "loss": 2.7038, |
| "num_input_tokens_seen": 5649203200, |
| "step": 21550 |
| }, |
| { |
| "epoch": 0.10303254349666695, |
| "grad_norm": 0.17404012382030487, |
| "learning_rate": 0.001, |
| "loss": 2.7071, |
| "num_input_tokens_seen": 5662310400, |
| "step": 21600 |
| }, |
| { |
| "epoch": 0.10327104475476108, |
| "grad_norm": 0.1652098149061203, |
| "learning_rate": 0.001, |
| "loss": 2.7033, |
| "num_input_tokens_seen": 5675417600, |
| "step": 21650 |
| }, |
| { |
| "epoch": 0.10350954601285522, |
| "grad_norm": 0.1914501190185547, |
| "learning_rate": 0.001, |
| "loss": 2.6844, |
| "num_input_tokens_seen": 5688524800, |
| "step": 21700 |
| }, |
| { |
| "epoch": 0.10374804727094936, |
| "grad_norm": 0.19169588387012482, |
| "learning_rate": 0.001, |
| "loss": 2.6793, |
| "num_input_tokens_seen": 5701632000, |
| "step": 21750 |
| }, |
| { |
| "epoch": 0.10398654852904349, |
| "grad_norm": 0.17937491834163666, |
| "learning_rate": 0.001, |
| "loss": 2.6972, |
| "num_input_tokens_seen": 5714739200, |
| "step": 21800 |
| }, |
| { |
| "epoch": 0.10422504978713763, |
| "grad_norm": 0.17515376210212708, |
| "learning_rate": 0.001, |
| "loss": 2.6995, |
| "num_input_tokens_seen": 5727846400, |
| "step": 21850 |
| }, |
| { |
| "epoch": 0.10446355104523176, |
| "grad_norm": 0.18881027400493622, |
| "learning_rate": 0.001, |
| "loss": 2.7097, |
| "num_input_tokens_seen": 5740953600, |
| "step": 21900 |
| }, |
| { |
| "epoch": 0.1047020523033259, |
| "grad_norm": 0.19030135869979858, |
| "learning_rate": 0.001, |
| "loss": 2.6801, |
| "num_input_tokens_seen": 5754060800, |
| "step": 21950 |
| }, |
| { |
| "epoch": 0.10494055356142004, |
| "grad_norm": 0.17325563728809357, |
| "learning_rate": 0.001, |
| "loss": 2.6803, |
| "num_input_tokens_seen": 5767168000, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.10494055356142004, |
| "eval_loss": 2.577341318130493, |
| "eval_runtime": 50.8482, |
| "eval_samples_per_second": 98.332, |
| "eval_steps_per_second": 24.583, |
| "num_input_tokens_seen": 5767168000, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.10517905481951417, |
| "grad_norm": 0.19298380613327026, |
| "learning_rate": 0.001, |
| "loss": 2.6966, |
| "num_input_tokens_seen": 5780275200, |
| "step": 22050 |
| }, |
| { |
| "epoch": 0.1054175560776083, |
| "grad_norm": 0.1772100180387497, |
| "learning_rate": 0.001, |
| "loss": 2.6851, |
| "num_input_tokens_seen": 5793382400, |
| "step": 22100 |
| }, |
| { |
| "epoch": 0.10565605733570245, |
| "grad_norm": 0.18548481166362762, |
| "learning_rate": 0.001, |
| "loss": 2.7028, |
| "num_input_tokens_seen": 5806489600, |
| "step": 22150 |
| }, |
| { |
| "epoch": 0.10589455859379658, |
| "grad_norm": 0.20102089643478394, |
| "learning_rate": 0.001, |
| "loss": 2.6915, |
| "num_input_tokens_seen": 5819596800, |
| "step": 22200 |
| }, |
| { |
| "epoch": 0.10613305985189071, |
| "grad_norm": 0.1833849996328354, |
| "learning_rate": 0.001, |
| "loss": 2.6872, |
| "num_input_tokens_seen": 5832704000, |
| "step": 22250 |
| }, |
| { |
| "epoch": 0.10637156110998486, |
| "grad_norm": 0.17730027437210083, |
| "learning_rate": 0.001, |
| "loss": 2.6811, |
| "num_input_tokens_seen": 5845811200, |
| "step": 22300 |
| }, |
| { |
| "epoch": 0.10661006236807899, |
| "grad_norm": 0.1818256825208664, |
| "learning_rate": 0.001, |
| "loss": 2.7, |
| "num_input_tokens_seen": 5858918400, |
| "step": 22350 |
| }, |
| { |
| "epoch": 0.10684856362617313, |
| "grad_norm": 0.16850312054157257, |
| "learning_rate": 0.001, |
| "loss": 2.6927, |
| "num_input_tokens_seen": 5872025600, |
| "step": 22400 |
| }, |
| { |
| "epoch": 0.10708706488426727, |
| "grad_norm": 0.209822878241539, |
| "learning_rate": 0.001, |
| "loss": 2.6881, |
| "num_input_tokens_seen": 5885132800, |
| "step": 22450 |
| }, |
| { |
| "epoch": 0.1073255661423614, |
| "grad_norm": 0.2131560891866684, |
| "learning_rate": 0.001, |
| "loss": 2.6797, |
| "num_input_tokens_seen": 5898240000, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.1073255661423614, |
| "eval_loss": 2.575388193130493, |
| "eval_runtime": 50.9696, |
| "eval_samples_per_second": 98.098, |
| "eval_steps_per_second": 24.524, |
| "num_input_tokens_seen": 5898240000, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.10756406740045554, |
| "grad_norm": 0.18200135231018066, |
| "learning_rate": 0.001, |
| "loss": 2.6837, |
| "num_input_tokens_seen": 5911347200, |
| "step": 22550 |
| }, |
| { |
| "epoch": 0.10780256865854967, |
| "grad_norm": 0.1830984354019165, |
| "learning_rate": 0.001, |
| "loss": 2.7159, |
| "num_input_tokens_seen": 5924454400, |
| "step": 22600 |
| }, |
| { |
| "epoch": 0.1080410699166438, |
| "grad_norm": 0.1700614094734192, |
| "learning_rate": 0.001, |
| "loss": 2.6852, |
| "num_input_tokens_seen": 5937561600, |
| "step": 22650 |
| }, |
| { |
| "epoch": 0.10827957117473795, |
| "grad_norm": 0.18473868072032928, |
| "learning_rate": 0.001, |
| "loss": 2.6857, |
| "num_input_tokens_seen": 5950668800, |
| "step": 22700 |
| }, |
| { |
| "epoch": 0.10851807243283208, |
| "grad_norm": 0.19345365464687347, |
| "learning_rate": 0.001, |
| "loss": 2.69, |
| "num_input_tokens_seen": 5963776000, |
| "step": 22750 |
| }, |
| { |
| "epoch": 0.10875657369092621, |
| "grad_norm": 0.18807141482830048, |
| "learning_rate": 0.001, |
| "loss": 2.6897, |
| "num_input_tokens_seen": 5976883200, |
| "step": 22800 |
| }, |
| { |
| "epoch": 0.10899507494902036, |
| "grad_norm": 0.18426446616649628, |
| "learning_rate": 0.001, |
| "loss": 2.6855, |
| "num_input_tokens_seen": 5989990400, |
| "step": 22850 |
| }, |
| { |
| "epoch": 0.10923357620711449, |
| "grad_norm": 0.19184571504592896, |
| "learning_rate": 0.001, |
| "loss": 2.6914, |
| "num_input_tokens_seen": 6003097600, |
| "step": 22900 |
| }, |
| { |
| "epoch": 0.10947207746520862, |
| "grad_norm": 0.22897471487522125, |
| "learning_rate": 0.001, |
| "loss": 2.6812, |
| "num_input_tokens_seen": 6016204800, |
| "step": 22950 |
| }, |
| { |
| "epoch": 0.10971057872330277, |
| "grad_norm": 0.1939724087715149, |
| "learning_rate": 0.001, |
| "loss": 2.6836, |
| "num_input_tokens_seen": 6029312000, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.10971057872330277, |
| "eval_loss": 2.570572853088379, |
| "eval_runtime": 50.0606, |
| "eval_samples_per_second": 99.879, |
| "eval_steps_per_second": 24.97, |
| "num_input_tokens_seen": 6029312000, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.1099490799813969, |
| "grad_norm": 0.17564797401428223, |
| "learning_rate": 0.001, |
| "loss": 2.6912, |
| "num_input_tokens_seen": 6042419200, |
| "step": 23050 |
| }, |
| { |
| "epoch": 0.11018758123949104, |
| "grad_norm": 0.17937473952770233, |
| "learning_rate": 0.001, |
| "loss": 2.6708, |
| "num_input_tokens_seen": 6055526400, |
| "step": 23100 |
| }, |
| { |
| "epoch": 0.11042608249758518, |
| "grad_norm": 0.18281136453151703, |
| "learning_rate": 0.001, |
| "loss": 2.6855, |
| "num_input_tokens_seen": 6068633600, |
| "step": 23150 |
| }, |
| { |
| "epoch": 0.11066458375567931, |
| "grad_norm": 0.18834726512432098, |
| "learning_rate": 0.001, |
| "loss": 2.6887, |
| "num_input_tokens_seen": 6081740800, |
| "step": 23200 |
| }, |
| { |
| "epoch": 0.11090308501377345, |
| "grad_norm": 0.2104720175266266, |
| "learning_rate": 0.001, |
| "loss": 2.6914, |
| "num_input_tokens_seen": 6094848000, |
| "step": 23250 |
| }, |
| { |
| "epoch": 0.11114158627186758, |
| "grad_norm": 0.18674172461032867, |
| "learning_rate": 0.001, |
| "loss": 2.6855, |
| "num_input_tokens_seen": 6107955200, |
| "step": 23300 |
| }, |
| { |
| "epoch": 0.11138008752996172, |
| "grad_norm": 0.19519701600074768, |
| "learning_rate": 0.001, |
| "loss": 2.6851, |
| "num_input_tokens_seen": 6121062400, |
| "step": 23350 |
| }, |
| { |
| "epoch": 0.11161858878805586, |
| "grad_norm": 0.1752537339925766, |
| "learning_rate": 0.001, |
| "loss": 2.692, |
| "num_input_tokens_seen": 6134169600, |
| "step": 23400 |
| }, |
| { |
| "epoch": 0.11185709004614999, |
| "grad_norm": 0.1786031723022461, |
| "learning_rate": 0.001, |
| "loss": 2.6785, |
| "num_input_tokens_seen": 6147276800, |
| "step": 23450 |
| }, |
| { |
| "epoch": 0.11209559130424412, |
| "grad_norm": 0.19057604670524597, |
| "learning_rate": 0.001, |
| "loss": 2.6798, |
| "num_input_tokens_seen": 6160384000, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.11209559130424412, |
| "eval_loss": 2.5710463523864746, |
| "eval_runtime": 50.3332, |
| "eval_samples_per_second": 99.338, |
| "eval_steps_per_second": 24.835, |
| "num_input_tokens_seen": 6160384000, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.11233409256233827, |
| "grad_norm": 0.18272963166236877, |
| "learning_rate": 0.001, |
| "loss": 2.6847, |
| "num_input_tokens_seen": 6173491200, |
| "step": 23550 |
| }, |
| { |
| "epoch": 0.1125725938204324, |
| "grad_norm": 0.1666375696659088, |
| "learning_rate": 0.001, |
| "loss": 2.6747, |
| "num_input_tokens_seen": 6186598400, |
| "step": 23600 |
| }, |
| { |
| "epoch": 0.11281109507852653, |
| "grad_norm": 0.1688246876001358, |
| "learning_rate": 0.001, |
| "loss": 2.6963, |
| "num_input_tokens_seen": 6199705600, |
| "step": 23650 |
| }, |
| { |
| "epoch": 0.11304959633662068, |
| "grad_norm": 0.1970459669828415, |
| "learning_rate": 0.001, |
| "loss": 2.6904, |
| "num_input_tokens_seen": 6212812800, |
| "step": 23700 |
| }, |
| { |
| "epoch": 0.11328809759471481, |
| "grad_norm": 0.19660720229148865, |
| "learning_rate": 0.001, |
| "loss": 2.6833, |
| "num_input_tokens_seen": 6225920000, |
| "step": 23750 |
| }, |
| { |
| "epoch": 0.11352659885280895, |
| "grad_norm": 0.18711698055267334, |
| "learning_rate": 0.001, |
| "loss": 2.6872, |
| "num_input_tokens_seen": 6239027200, |
| "step": 23800 |
| }, |
| { |
| "epoch": 0.11376510011090309, |
| "grad_norm": 0.1878872513771057, |
| "learning_rate": 0.001, |
| "loss": 2.6884, |
| "num_input_tokens_seen": 6252134400, |
| "step": 23850 |
| }, |
| { |
| "epoch": 0.11400360136899722, |
| "grad_norm": 0.1969616860151291, |
| "learning_rate": 0.001, |
| "loss": 2.6982, |
| "num_input_tokens_seen": 6265241600, |
| "step": 23900 |
| }, |
| { |
| "epoch": 0.11424210262709136, |
| "grad_norm": 0.19693812727928162, |
| "learning_rate": 0.001, |
| "loss": 2.6782, |
| "num_input_tokens_seen": 6278348800, |
| "step": 23950 |
| }, |
| { |
| "epoch": 0.1144806038851855, |
| "grad_norm": 0.1731441468000412, |
| "learning_rate": 0.001, |
| "loss": 2.6917, |
| "num_input_tokens_seen": 6291456000, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.1144806038851855, |
| "eval_loss": 2.5664987564086914, |
| "eval_runtime": 50.3452, |
| "eval_samples_per_second": 99.314, |
| "eval_steps_per_second": 24.829, |
| "num_input_tokens_seen": 6291456000, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.11471910514327963, |
| "grad_norm": 0.1724429428577423, |
| "learning_rate": 0.001, |
| "loss": 2.6806, |
| "num_input_tokens_seen": 6304563200, |
| "step": 24050 |
| }, |
| { |
| "epoch": 0.11495760640137377, |
| "grad_norm": 0.20449388027191162, |
| "learning_rate": 0.001, |
| "loss": 2.6873, |
| "num_input_tokens_seen": 6317670400, |
| "step": 24100 |
| }, |
| { |
| "epoch": 0.1151961076594679, |
| "grad_norm": 0.19024738669395447, |
| "learning_rate": 0.001, |
| "loss": 2.6811, |
| "num_input_tokens_seen": 6330777600, |
| "step": 24150 |
| }, |
| { |
| "epoch": 0.11543460891756203, |
| "grad_norm": 0.20510025322437286, |
| "learning_rate": 0.001, |
| "loss": 2.6643, |
| "num_input_tokens_seen": 6343884800, |
| "step": 24200 |
| }, |
| { |
| "epoch": 0.11567311017565618, |
| "grad_norm": 0.1783556044101715, |
| "learning_rate": 0.001, |
| "loss": 2.6709, |
| "num_input_tokens_seen": 6356992000, |
| "step": 24250 |
| }, |
| { |
| "epoch": 0.11591161143375031, |
| "grad_norm": 0.1771089732646942, |
| "learning_rate": 0.001, |
| "loss": 2.6677, |
| "num_input_tokens_seen": 6370099200, |
| "step": 24300 |
| }, |
| { |
| "epoch": 0.11615011269184444, |
| "grad_norm": 0.17016734182834625, |
| "learning_rate": 0.001, |
| "loss": 2.6681, |
| "num_input_tokens_seen": 6383206400, |
| "step": 24350 |
| }, |
| { |
| "epoch": 0.11638861394993859, |
| "grad_norm": 0.1901489496231079, |
| "learning_rate": 0.001, |
| "loss": 2.6811, |
| "num_input_tokens_seen": 6396313600, |
| "step": 24400 |
| }, |
| { |
| "epoch": 0.11662711520803272, |
| "grad_norm": 0.18185457587242126, |
| "learning_rate": 0.001, |
| "loss": 2.6787, |
| "num_input_tokens_seen": 6409420800, |
| "step": 24450 |
| }, |
| { |
| "epoch": 0.11686561646612686, |
| "grad_norm": 0.1789853274822235, |
| "learning_rate": 0.001, |
| "loss": 2.6657, |
| "num_input_tokens_seen": 6422528000, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.11686561646612686, |
| "eval_loss": 2.564084768295288, |
| "eval_runtime": 50.4559, |
| "eval_samples_per_second": 99.096, |
| "eval_steps_per_second": 24.774, |
| "num_input_tokens_seen": 6422528000, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.117104117724221, |
| "grad_norm": 0.17294436693191528, |
| "learning_rate": 0.001, |
| "loss": 2.6812, |
| "num_input_tokens_seen": 6435635200, |
| "step": 24550 |
| }, |
| { |
| "epoch": 0.11734261898231513, |
| "grad_norm": 0.1840251386165619, |
| "learning_rate": 0.001, |
| "loss": 2.6599, |
| "num_input_tokens_seen": 6448742400, |
| "step": 24600 |
| }, |
| { |
| "epoch": 0.11758112024040927, |
| "grad_norm": 0.17588932812213898, |
| "learning_rate": 0.001, |
| "loss": 2.6742, |
| "num_input_tokens_seen": 6461849600, |
| "step": 24650 |
| }, |
| { |
| "epoch": 0.1178196214985034, |
| "grad_norm": 0.1805667132139206, |
| "learning_rate": 0.001, |
| "loss": 2.6647, |
| "num_input_tokens_seen": 6474956800, |
| "step": 24700 |
| }, |
| { |
| "epoch": 0.11805812275659754, |
| "grad_norm": 0.17930665612220764, |
| "learning_rate": 0.001, |
| "loss": 2.6763, |
| "num_input_tokens_seen": 6488064000, |
| "step": 24750 |
| }, |
| { |
| "epoch": 0.11829662401469168, |
| "grad_norm": 0.19195732474327087, |
| "learning_rate": 0.001, |
| "loss": 2.6716, |
| "num_input_tokens_seen": 6501171200, |
| "step": 24800 |
| }, |
| { |
| "epoch": 0.11853512527278581, |
| "grad_norm": 0.19274356961250305, |
| "learning_rate": 0.001, |
| "loss": 2.6702, |
| "num_input_tokens_seen": 6514278400, |
| "step": 24850 |
| }, |
| { |
| "epoch": 0.11877362653087994, |
| "grad_norm": 0.17423510551452637, |
| "learning_rate": 0.001, |
| "loss": 2.6733, |
| "num_input_tokens_seen": 6527385600, |
| "step": 24900 |
| }, |
| { |
| "epoch": 0.11901212778897409, |
| "grad_norm": 0.20267954468727112, |
| "learning_rate": 0.001, |
| "loss": 2.6649, |
| "num_input_tokens_seen": 6540492800, |
| "step": 24950 |
| }, |
| { |
| "epoch": 0.11925062904706822, |
| "grad_norm": 0.1756502240896225, |
| "learning_rate": 0.001, |
| "loss": 2.6582, |
| "num_input_tokens_seen": 6553600000, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.11925062904706822, |
| "eval_loss": 2.562955379486084, |
| "eval_runtime": 50.0071, |
| "eval_samples_per_second": 99.986, |
| "eval_steps_per_second": 24.996, |
| "num_input_tokens_seen": 6553600000, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.11948913030516237, |
| "grad_norm": 0.19173742830753326, |
| "learning_rate": 0.001, |
| "loss": 2.6871, |
| "num_input_tokens_seen": 6566707200, |
| "step": 25050 |
| }, |
| { |
| "epoch": 0.1197276315632565, |
| "grad_norm": 0.1746075600385666, |
| "learning_rate": 0.001, |
| "loss": 2.7003, |
| "num_input_tokens_seen": 6579814400, |
| "step": 25100 |
| }, |
| { |
| "epoch": 0.11996613282135063, |
| "grad_norm": 0.17817530035972595, |
| "learning_rate": 0.001, |
| "loss": 2.6944, |
| "num_input_tokens_seen": 6592921600, |
| "step": 25150 |
| }, |
| { |
| "epoch": 0.12020463407944477, |
| "grad_norm": 0.201807901263237, |
| "learning_rate": 0.001, |
| "loss": 2.6766, |
| "num_input_tokens_seen": 6606028800, |
| "step": 25200 |
| }, |
| { |
| "epoch": 0.1204431353375389, |
| "grad_norm": 0.18620917201042175, |
| "learning_rate": 0.001, |
| "loss": 2.6802, |
| "num_input_tokens_seen": 6619136000, |
| "step": 25250 |
| }, |
| { |
| "epoch": 0.12068163659563304, |
| "grad_norm": 0.17383818328380585, |
| "learning_rate": 0.001, |
| "loss": 2.6698, |
| "num_input_tokens_seen": 6632243200, |
| "step": 25300 |
| }, |
| { |
| "epoch": 0.12092013785372718, |
| "grad_norm": 0.1766287237405777, |
| "learning_rate": 0.001, |
| "loss": 2.6705, |
| "num_input_tokens_seen": 6645350400, |
| "step": 25350 |
| }, |
| { |
| "epoch": 0.12115863911182131, |
| "grad_norm": 0.19551052153110504, |
| "learning_rate": 0.001, |
| "loss": 2.678, |
| "num_input_tokens_seen": 6658457600, |
| "step": 25400 |
| }, |
| { |
| "epoch": 0.12139714036991545, |
| "grad_norm": 0.18625982105731964, |
| "learning_rate": 0.001, |
| "loss": 2.6688, |
| "num_input_tokens_seen": 6671564800, |
| "step": 25450 |
| }, |
| { |
| "epoch": 0.12163564162800959, |
| "grad_norm": 0.18274050951004028, |
| "learning_rate": 0.001, |
| "loss": 2.6818, |
| "num_input_tokens_seen": 6684672000, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.12163564162800959, |
| "eval_loss": 2.5602569580078125, |
| "eval_runtime": 50.4187, |
| "eval_samples_per_second": 99.17, |
| "eval_steps_per_second": 24.792, |
| "num_input_tokens_seen": 6684672000, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.12187414288610372, |
| "grad_norm": 0.18547837436199188, |
| "learning_rate": 0.001, |
| "loss": 2.6754, |
| "num_input_tokens_seen": 6697779200, |
| "step": 25550 |
| }, |
| { |
| "epoch": 0.12211264414419785, |
| "grad_norm": 0.18558937311172485, |
| "learning_rate": 0.001, |
| "loss": 2.6767, |
| "num_input_tokens_seen": 6710886400, |
| "step": 25600 |
| }, |
| { |
| "epoch": 0.122351145402292, |
| "grad_norm": 0.17276135087013245, |
| "learning_rate": 0.001, |
| "loss": 2.6775, |
| "num_input_tokens_seen": 6723993600, |
| "step": 25650 |
| }, |
| { |
| "epoch": 0.12258964666038613, |
| "grad_norm": 0.18483039736747742, |
| "learning_rate": 0.001, |
| "loss": 2.6818, |
| "num_input_tokens_seen": 6737100800, |
| "step": 25700 |
| }, |
| { |
| "epoch": 0.12282814791848028, |
| "grad_norm": 0.18036937713623047, |
| "learning_rate": 0.001, |
| "loss": 2.6669, |
| "num_input_tokens_seen": 6750208000, |
| "step": 25750 |
| }, |
| { |
| "epoch": 0.12306664917657441, |
| "grad_norm": 0.1728815734386444, |
| "learning_rate": 0.001, |
| "loss": 2.6789, |
| "num_input_tokens_seen": 6763315200, |
| "step": 25800 |
| }, |
| { |
| "epoch": 0.12330515043466854, |
| "grad_norm": 0.19193877279758453, |
| "learning_rate": 0.001, |
| "loss": 2.6487, |
| "num_input_tokens_seen": 6776422400, |
| "step": 25850 |
| }, |
| { |
| "epoch": 0.12354365169276268, |
| "grad_norm": 0.1584886610507965, |
| "learning_rate": 0.001, |
| "loss": 2.6638, |
| "num_input_tokens_seen": 6789529600, |
| "step": 25900 |
| }, |
| { |
| "epoch": 0.12378215295085682, |
| "grad_norm": 0.18792498111724854, |
| "learning_rate": 0.001, |
| "loss": 2.6754, |
| "num_input_tokens_seen": 6802636800, |
| "step": 25950 |
| }, |
| { |
| "epoch": 0.12402065420895095, |
| "grad_norm": 0.1689581423997879, |
| "learning_rate": 0.001, |
| "loss": 2.6682, |
| "num_input_tokens_seen": 6815744000, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.12402065420895095, |
| "eval_loss": 2.5587522983551025, |
| "eval_runtime": 50.7858, |
| "eval_samples_per_second": 98.453, |
| "eval_steps_per_second": 24.613, |
| "num_input_tokens_seen": 6815744000, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.1242591554670451, |
| "grad_norm": 0.18573056161403656, |
| "learning_rate": 0.001, |
| "loss": 2.6565, |
| "num_input_tokens_seen": 6828851200, |
| "step": 26050 |
| }, |
| { |
| "epoch": 0.12449765672513922, |
| "grad_norm": 0.19160890579223633, |
| "learning_rate": 0.001, |
| "loss": 2.6797, |
| "num_input_tokens_seen": 6841958400, |
| "step": 26100 |
| }, |
| { |
| "epoch": 0.12473615798323336, |
| "grad_norm": 0.18323373794555664, |
| "learning_rate": 0.001, |
| "loss": 2.6602, |
| "num_input_tokens_seen": 6855065600, |
| "step": 26150 |
| }, |
| { |
| "epoch": 0.1249746592413275, |
| "grad_norm": 0.17691807448863983, |
| "learning_rate": 0.001, |
| "loss": 2.6676, |
| "num_input_tokens_seen": 6868172800, |
| "step": 26200 |
| }, |
| { |
| "epoch": 0.12521316049942163, |
| "grad_norm": 0.20718660950660706, |
| "learning_rate": 0.001, |
| "loss": 2.6588, |
| "num_input_tokens_seen": 6881280000, |
| "step": 26250 |
| }, |
| { |
| "epoch": 0.12545166175751576, |
| "grad_norm": 0.17811058461666107, |
| "learning_rate": 0.001, |
| "loss": 2.6754, |
| "num_input_tokens_seen": 6894387200, |
| "step": 26300 |
| }, |
| { |
| "epoch": 0.1256901630156099, |
| "grad_norm": 0.17490555346012115, |
| "learning_rate": 0.001, |
| "loss": 2.6605, |
| "num_input_tokens_seen": 6907494400, |
| "step": 26350 |
| }, |
| { |
| "epoch": 0.12592866427370406, |
| "grad_norm": 0.17391368746757507, |
| "learning_rate": 0.001, |
| "loss": 2.684, |
| "num_input_tokens_seen": 6920601600, |
| "step": 26400 |
| }, |
| { |
| "epoch": 0.1261671655317982, |
| "grad_norm": 0.16951416432857513, |
| "learning_rate": 0.001, |
| "loss": 2.6685, |
| "num_input_tokens_seen": 6933708800, |
| "step": 26450 |
| }, |
| { |
| "epoch": 0.12640566678989232, |
| "grad_norm": 0.17574581503868103, |
| "learning_rate": 0.001, |
| "loss": 2.6665, |
| "num_input_tokens_seen": 6946816000, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.12640566678989232, |
| "eval_loss": 2.556109666824341, |
| "eval_runtime": 50.8743, |
| "eval_samples_per_second": 98.281, |
| "eval_steps_per_second": 24.57, |
| "num_input_tokens_seen": 6946816000, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.12664416804798645, |
| "grad_norm": 0.19910745322704315, |
| "learning_rate": 0.001, |
| "loss": 2.6755, |
| "num_input_tokens_seen": 6959923200, |
| "step": 26550 |
| }, |
| { |
| "epoch": 0.12688266930608058, |
| "grad_norm": 0.20141273736953735, |
| "learning_rate": 0.001, |
| "loss": 2.6798, |
| "num_input_tokens_seen": 6973030400, |
| "step": 26600 |
| }, |
| { |
| "epoch": 0.1271211705641747, |
| "grad_norm": 0.1732529103755951, |
| "learning_rate": 0.001, |
| "loss": 2.6606, |
| "num_input_tokens_seen": 6986137600, |
| "step": 26650 |
| }, |
| { |
| "epoch": 0.12735967182226887, |
| "grad_norm": 0.17546698451042175, |
| "learning_rate": 0.001, |
| "loss": 2.6717, |
| "num_input_tokens_seen": 6999244800, |
| "step": 26700 |
| }, |
| { |
| "epoch": 0.127598173080363, |
| "grad_norm": 0.2186097502708435, |
| "learning_rate": 0.001, |
| "loss": 2.6702, |
| "num_input_tokens_seen": 7012352000, |
| "step": 26750 |
| }, |
| { |
| "epoch": 0.12783667433845713, |
| "grad_norm": 0.1735202819108963, |
| "learning_rate": 0.001, |
| "loss": 2.6795, |
| "num_input_tokens_seen": 7025459200, |
| "step": 26800 |
| }, |
| { |
| "epoch": 0.12807517559655127, |
| "grad_norm": 0.40701860189437866, |
| "learning_rate": 0.001, |
| "loss": 2.6591, |
| "num_input_tokens_seen": 7038566400, |
| "step": 26850 |
| }, |
| { |
| "epoch": 0.1283136768546454, |
| "grad_norm": 0.19710049033164978, |
| "learning_rate": 0.001, |
| "loss": 2.6841, |
| "num_input_tokens_seen": 7051673600, |
| "step": 26900 |
| }, |
| { |
| "epoch": 0.12855217811273956, |
| "grad_norm": 0.18638554215431213, |
| "learning_rate": 0.001, |
| "loss": 2.6718, |
| "num_input_tokens_seen": 7064780800, |
| "step": 26950 |
| }, |
| { |
| "epoch": 0.1287906793708337, |
| "grad_norm": 0.17546561360359192, |
| "learning_rate": 0.001, |
| "loss": 2.6547, |
| "num_input_tokens_seen": 7077888000, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.1287906793708337, |
| "eval_loss": 2.551922559738159, |
| "eval_runtime": 50.1864, |
| "eval_samples_per_second": 99.629, |
| "eval_steps_per_second": 24.907, |
| "num_input_tokens_seen": 7077888000, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.12902918062892782, |
| "grad_norm": 0.1790401190519333, |
| "learning_rate": 0.001, |
| "loss": 2.672, |
| "num_input_tokens_seen": 7090995200, |
| "step": 27050 |
| }, |
| { |
| "epoch": 0.12926768188702195, |
| "grad_norm": 0.18173836171627045, |
| "learning_rate": 0.001, |
| "loss": 2.6563, |
| "num_input_tokens_seen": 7104102400, |
| "step": 27100 |
| }, |
| { |
| "epoch": 0.12950618314511608, |
| "grad_norm": 0.1827983856201172, |
| "learning_rate": 0.001, |
| "loss": 2.665, |
| "num_input_tokens_seen": 7117209600, |
| "step": 27150 |
| }, |
| { |
| "epoch": 0.12974468440321021, |
| "grad_norm": 0.20252254605293274, |
| "learning_rate": 0.001, |
| "loss": 2.675, |
| "num_input_tokens_seen": 7130316800, |
| "step": 27200 |
| }, |
| { |
| "epoch": 0.12998318566130437, |
| "grad_norm": 0.18492095172405243, |
| "learning_rate": 0.001, |
| "loss": 2.6801, |
| "num_input_tokens_seen": 7143424000, |
| "step": 27250 |
| }, |
| { |
| "epoch": 0.1302216869193985, |
| "grad_norm": 0.1962280571460724, |
| "learning_rate": 0.001, |
| "loss": 2.6551, |
| "num_input_tokens_seen": 7156531200, |
| "step": 27300 |
| }, |
| { |
| "epoch": 0.13046018817749264, |
| "grad_norm": 0.18813727796077728, |
| "learning_rate": 0.001, |
| "loss": 2.6728, |
| "num_input_tokens_seen": 7169638400, |
| "step": 27350 |
| }, |
| { |
| "epoch": 0.13069868943558677, |
| "grad_norm": 0.18111565709114075, |
| "learning_rate": 0.001, |
| "loss": 2.6743, |
| "num_input_tokens_seen": 7182745600, |
| "step": 27400 |
| }, |
| { |
| "epoch": 0.1309371906936809, |
| "grad_norm": 0.1727459728717804, |
| "learning_rate": 0.001, |
| "loss": 2.6596, |
| "num_input_tokens_seen": 7195852800, |
| "step": 27450 |
| }, |
| { |
| "epoch": 0.13117569195177506, |
| "grad_norm": 0.20097768306732178, |
| "learning_rate": 0.001, |
| "loss": 2.6651, |
| "num_input_tokens_seen": 7208960000, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.13117569195177506, |
| "eval_loss": 2.5501132011413574, |
| "eval_runtime": 50.3677, |
| "eval_samples_per_second": 99.27, |
| "eval_steps_per_second": 24.817, |
| "num_input_tokens_seen": 7208960000, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.1314141932098692, |
| "grad_norm": 0.17329637706279755, |
| "learning_rate": 0.001, |
| "loss": 2.663, |
| "num_input_tokens_seen": 7222067200, |
| "step": 27550 |
| }, |
| { |
| "epoch": 0.13165269446796332, |
| "grad_norm": 0.16942919790744781, |
| "learning_rate": 0.001, |
| "loss": 2.6609, |
| "num_input_tokens_seen": 7235174400, |
| "step": 27600 |
| }, |
| { |
| "epoch": 0.13189119572605745, |
| "grad_norm": 0.19828958809375763, |
| "learning_rate": 0.001, |
| "loss": 2.6625, |
| "num_input_tokens_seen": 7248281600, |
| "step": 27650 |
| }, |
| { |
| "epoch": 0.13212969698415158, |
| "grad_norm": 0.1928141862154007, |
| "learning_rate": 0.001, |
| "loss": 2.6597, |
| "num_input_tokens_seen": 7261388800, |
| "step": 27700 |
| }, |
| { |
| "epoch": 0.13236819824224572, |
| "grad_norm": 0.1870756894350052, |
| "learning_rate": 0.001, |
| "loss": 2.6718, |
| "num_input_tokens_seen": 7274496000, |
| "step": 27750 |
| }, |
| { |
| "epoch": 0.13260669950033988, |
| "grad_norm": 0.1786762923002243, |
| "learning_rate": 0.001, |
| "loss": 2.6631, |
| "num_input_tokens_seen": 7287603200, |
| "step": 27800 |
| }, |
| { |
| "epoch": 0.132845200758434, |
| "grad_norm": 0.1710624396800995, |
| "learning_rate": 0.001, |
| "loss": 2.6717, |
| "num_input_tokens_seen": 7300710400, |
| "step": 27850 |
| }, |
| { |
| "epoch": 0.13308370201652814, |
| "grad_norm": 0.1805214285850525, |
| "learning_rate": 0.001, |
| "loss": 2.6669, |
| "num_input_tokens_seen": 7313817600, |
| "step": 27900 |
| }, |
| { |
| "epoch": 0.13332220327462227, |
| "grad_norm": 0.18169906735420227, |
| "learning_rate": 0.001, |
| "loss": 2.6659, |
| "num_input_tokens_seen": 7326924800, |
| "step": 27950 |
| }, |
| { |
| "epoch": 0.1335607045327164, |
| "grad_norm": 0.16959500312805176, |
| "learning_rate": 0.001, |
| "loss": 2.6623, |
| "num_input_tokens_seen": 7340032000, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.1335607045327164, |
| "eval_loss": 2.548675060272217, |
| "eval_runtime": 50.3022, |
| "eval_samples_per_second": 99.399, |
| "eval_steps_per_second": 24.85, |
| "num_input_tokens_seen": 7340032000, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.13379920579081056, |
| "grad_norm": 0.19409704208374023, |
| "learning_rate": 0.001, |
| "loss": 2.6776, |
| "num_input_tokens_seen": 7353139200, |
| "step": 28050 |
| }, |
| { |
| "epoch": 0.1340377070489047, |
| "grad_norm": 0.1712968647480011, |
| "learning_rate": 0.001, |
| "loss": 2.6679, |
| "num_input_tokens_seen": 7366246400, |
| "step": 28100 |
| }, |
| { |
| "epoch": 0.13427620830699882, |
| "grad_norm": 0.20586130023002625, |
| "learning_rate": 0.001, |
| "loss": 2.6633, |
| "num_input_tokens_seen": 7379353600, |
| "step": 28150 |
| }, |
| { |
| "epoch": 0.13451470956509295, |
| "grad_norm": 0.1776891052722931, |
| "learning_rate": 0.001, |
| "loss": 2.6683, |
| "num_input_tokens_seen": 7392460800, |
| "step": 28200 |
| }, |
| { |
| "epoch": 0.1347532108231871, |
| "grad_norm": 0.19293451309204102, |
| "learning_rate": 0.001, |
| "loss": 2.6645, |
| "num_input_tokens_seen": 7405568000, |
| "step": 28250 |
| }, |
| { |
| "epoch": 0.13499171208128122, |
| "grad_norm": 0.17754724621772766, |
| "learning_rate": 0.001, |
| "loss": 2.6685, |
| "num_input_tokens_seen": 7418675200, |
| "step": 28300 |
| }, |
| { |
| "epoch": 0.13523021333937538, |
| "grad_norm": 0.17739038169384003, |
| "learning_rate": 0.001, |
| "loss": 2.6607, |
| "num_input_tokens_seen": 7431782400, |
| "step": 28350 |
| }, |
| { |
| "epoch": 0.1354687145974695, |
| "grad_norm": 0.175009086728096, |
| "learning_rate": 0.001, |
| "loss": 2.6679, |
| "num_input_tokens_seen": 7444889600, |
| "step": 28400 |
| }, |
| { |
| "epoch": 0.13570721585556364, |
| "grad_norm": 0.2229124754667282, |
| "learning_rate": 0.001, |
| "loss": 2.6687, |
| "num_input_tokens_seen": 7457996800, |
| "step": 28450 |
| }, |
| { |
| "epoch": 0.13594571711365777, |
| "grad_norm": 0.1791590005159378, |
| "learning_rate": 0.001, |
| "loss": 2.6741, |
| "num_input_tokens_seen": 7471104000, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.13594571711365777, |
| "eval_loss": 2.5456056594848633, |
| "eval_runtime": 50.6342, |
| "eval_samples_per_second": 98.747, |
| "eval_steps_per_second": 24.687, |
| "num_input_tokens_seen": 7471104000, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.1361842183717519, |
| "grad_norm": 0.18920041620731354, |
| "learning_rate": 0.001, |
| "loss": 2.6612, |
| "num_input_tokens_seen": 7484211200, |
| "step": 28550 |
| }, |
| { |
| "epoch": 0.13642271962984603, |
| "grad_norm": 0.19247522950172424, |
| "learning_rate": 0.001, |
| "loss": 2.6597, |
| "num_input_tokens_seen": 7497318400, |
| "step": 28600 |
| }, |
| { |
| "epoch": 0.1366612208879402, |
| "grad_norm": 0.22499197721481323, |
| "learning_rate": 0.001, |
| "loss": 2.6583, |
| "num_input_tokens_seen": 7510425600, |
| "step": 28650 |
| }, |
| { |
| "epoch": 0.13689972214603432, |
| "grad_norm": 0.18946559727191925, |
| "learning_rate": 0.001, |
| "loss": 2.6612, |
| "num_input_tokens_seen": 7523532800, |
| "step": 28700 |
| }, |
| { |
| "epoch": 0.13713822340412846, |
| "grad_norm": 0.19621454179286957, |
| "learning_rate": 0.001, |
| "loss": 2.6425, |
| "num_input_tokens_seen": 7536640000, |
| "step": 28750 |
| }, |
| { |
| "epoch": 0.1373767246622226, |
| "grad_norm": 0.21594376862049103, |
| "learning_rate": 0.001, |
| "loss": 2.6564, |
| "num_input_tokens_seen": 7549747200, |
| "step": 28800 |
| }, |
| { |
| "epoch": 0.13761522592031672, |
| "grad_norm": 0.18186470866203308, |
| "learning_rate": 0.001, |
| "loss": 2.6728, |
| "num_input_tokens_seen": 7562854400, |
| "step": 28850 |
| }, |
| { |
| "epoch": 0.13785372717841088, |
| "grad_norm": 0.19369743764400482, |
| "learning_rate": 0.001, |
| "loss": 2.6585, |
| "num_input_tokens_seen": 7575961600, |
| "step": 28900 |
| }, |
| { |
| "epoch": 0.138092228436505, |
| "grad_norm": 0.1897999793291092, |
| "learning_rate": 0.001, |
| "loss": 2.6564, |
| "num_input_tokens_seen": 7589068800, |
| "step": 28950 |
| }, |
| { |
| "epoch": 0.13833072969459914, |
| "grad_norm": 0.18076784908771515, |
| "learning_rate": 0.001, |
| "loss": 2.6453, |
| "num_input_tokens_seen": 7602176000, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.13833072969459914, |
| "eval_loss": 2.54413104057312, |
| "eval_runtime": 50.9152, |
| "eval_samples_per_second": 98.202, |
| "eval_steps_per_second": 24.551, |
| "num_input_tokens_seen": 7602176000, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.13856923095269327, |
| "grad_norm": 0.18520566821098328, |
| "learning_rate": 0.001, |
| "loss": 2.6644, |
| "num_input_tokens_seen": 7615283200, |
| "step": 29050 |
| }, |
| { |
| "epoch": 0.1388077322107874, |
| "grad_norm": 0.22739861905574799, |
| "learning_rate": 0.001, |
| "loss": 2.6597, |
| "num_input_tokens_seen": 7628390400, |
| "step": 29100 |
| }, |
| { |
| "epoch": 0.13904623346888154, |
| "grad_norm": 0.18451730906963348, |
| "learning_rate": 0.001, |
| "loss": 2.6432, |
| "num_input_tokens_seen": 7641497600, |
| "step": 29150 |
| }, |
| { |
| "epoch": 0.1392847347269757, |
| "grad_norm": 0.1865098923444748, |
| "learning_rate": 0.001, |
| "loss": 2.6651, |
| "num_input_tokens_seen": 7654604800, |
| "step": 29200 |
| }, |
| { |
| "epoch": 0.13952323598506983, |
| "grad_norm": 0.18676789104938507, |
| "learning_rate": 0.001, |
| "loss": 2.6597, |
| "num_input_tokens_seen": 7667712000, |
| "step": 29250 |
| }, |
| { |
| "epoch": 0.13976173724316396, |
| "grad_norm": 0.17463742196559906, |
| "learning_rate": 0.001, |
| "loss": 2.6571, |
| "num_input_tokens_seen": 7680819200, |
| "step": 29300 |
| }, |
| { |
| "epoch": 0.1400002385012581, |
| "grad_norm": 0.21621429920196533, |
| "learning_rate": 0.001, |
| "loss": 2.6342, |
| "num_input_tokens_seen": 7693926400, |
| "step": 29350 |
| }, |
| { |
| "epoch": 0.14023873975935222, |
| "grad_norm": 0.17493990063667297, |
| "learning_rate": 0.001, |
| "loss": 2.6536, |
| "num_input_tokens_seen": 7707033600, |
| "step": 29400 |
| }, |
| { |
| "epoch": 0.14047724101744638, |
| "grad_norm": 0.17649762332439423, |
| "learning_rate": 0.001, |
| "loss": 2.6526, |
| "num_input_tokens_seen": 7720140800, |
| "step": 29450 |
| }, |
| { |
| "epoch": 0.1407157422755405, |
| "grad_norm": 0.18224874138832092, |
| "learning_rate": 0.001, |
| "loss": 2.6635, |
| "num_input_tokens_seen": 7733248000, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.1407157422755405, |
| "eval_loss": 2.5433554649353027, |
| "eval_runtime": 51.2973, |
| "eval_samples_per_second": 97.471, |
| "eval_steps_per_second": 24.368, |
| "num_input_tokens_seen": 7733248000, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.14095424353363464, |
| "grad_norm": 0.21109874546527863, |
| "learning_rate": 0.001, |
| "loss": 2.6788, |
| "num_input_tokens_seen": 7746355200, |
| "step": 29550 |
| }, |
| { |
| "epoch": 0.14119274479172877, |
| "grad_norm": 0.17663723230361938, |
| "learning_rate": 0.001, |
| "loss": 2.6578, |
| "num_input_tokens_seen": 7759462400, |
| "step": 29600 |
| }, |
| { |
| "epoch": 0.1414312460498229, |
| "grad_norm": 0.18385198712348938, |
| "learning_rate": 0.001, |
| "loss": 2.676, |
| "num_input_tokens_seen": 7772569600, |
| "step": 29650 |
| }, |
| { |
| "epoch": 0.14166974730791704, |
| "grad_norm": 0.1829567402601242, |
| "learning_rate": 0.001, |
| "loss": 2.6586, |
| "num_input_tokens_seen": 7785676800, |
| "step": 29700 |
| }, |
| { |
| "epoch": 0.1419082485660112, |
| "grad_norm": 0.1907297968864441, |
| "learning_rate": 0.001, |
| "loss": 2.6508, |
| "num_input_tokens_seen": 7798784000, |
| "step": 29750 |
| }, |
| { |
| "epoch": 0.14214674982410533, |
| "grad_norm": 0.2106500118970871, |
| "learning_rate": 0.001, |
| "loss": 2.6578, |
| "num_input_tokens_seen": 7811891200, |
| "step": 29800 |
| }, |
| { |
| "epoch": 0.14238525108219946, |
| "grad_norm": 0.18974357843399048, |
| "learning_rate": 0.001, |
| "loss": 2.6506, |
| "num_input_tokens_seen": 7824998400, |
| "step": 29850 |
| }, |
| { |
| "epoch": 0.1426237523402936, |
| "grad_norm": 0.18876343965530396, |
| "learning_rate": 0.001, |
| "loss": 2.6663, |
| "num_input_tokens_seen": 7838105600, |
| "step": 29900 |
| }, |
| { |
| "epoch": 0.14286225359838772, |
| "grad_norm": 0.17305608093738556, |
| "learning_rate": 0.001, |
| "loss": 2.657, |
| "num_input_tokens_seen": 7851212800, |
| "step": 29950 |
| }, |
| { |
| "epoch": 0.14310075485648185, |
| "grad_norm": 0.18900860846042633, |
| "learning_rate": 0.001, |
| "loss": 2.6502, |
| "num_input_tokens_seen": 7864320000, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.14310075485648185, |
| "eval_loss": 2.540076971054077, |
| "eval_runtime": 50.1464, |
| "eval_samples_per_second": 99.708, |
| "eval_steps_per_second": 24.927, |
| "num_input_tokens_seen": 7864320000, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.143339256114576, |
| "grad_norm": 0.16919030249118805, |
| "learning_rate": 0.001, |
| "loss": 2.6729, |
| "num_input_tokens_seen": 7877427200, |
| "step": 30050 |
| }, |
| { |
| "epoch": 0.14357775737267015, |
| "grad_norm": 0.17828898131847382, |
| "learning_rate": 0.001, |
| "loss": 2.647, |
| "num_input_tokens_seen": 7890534400, |
| "step": 30100 |
| }, |
| { |
| "epoch": 0.14381625863076428, |
| "grad_norm": 0.1790715903043747, |
| "learning_rate": 0.001, |
| "loss": 2.6639, |
| "num_input_tokens_seen": 7903641600, |
| "step": 30150 |
| }, |
| { |
| "epoch": 0.1440547598888584, |
| "grad_norm": 0.18818187713623047, |
| "learning_rate": 0.001, |
| "loss": 2.6485, |
| "num_input_tokens_seen": 7916748800, |
| "step": 30200 |
| }, |
| { |
| "epoch": 0.14429326114695254, |
| "grad_norm": 0.2171814739704132, |
| "learning_rate": 0.001, |
| "loss": 2.6577, |
| "num_input_tokens_seen": 7929856000, |
| "step": 30250 |
| }, |
| { |
| "epoch": 0.1445317624050467, |
| "grad_norm": 0.1844399869441986, |
| "learning_rate": 0.001, |
| "loss": 2.6473, |
| "num_input_tokens_seen": 7942963200, |
| "step": 30300 |
| }, |
| { |
| "epoch": 0.14477026366314083, |
| "grad_norm": 0.19607801735401154, |
| "learning_rate": 0.001, |
| "loss": 2.6576, |
| "num_input_tokens_seen": 7956070400, |
| "step": 30350 |
| }, |
| { |
| "epoch": 0.14500876492123496, |
| "grad_norm": 0.1967996209859848, |
| "learning_rate": 0.001, |
| "loss": 2.64, |
| "num_input_tokens_seen": 7969177600, |
| "step": 30400 |
| }, |
| { |
| "epoch": 0.1452472661793291, |
| "grad_norm": 0.2087596207857132, |
| "learning_rate": 0.001, |
| "loss": 2.6485, |
| "num_input_tokens_seen": 7982284800, |
| "step": 30450 |
| }, |
| { |
| "epoch": 0.14548576743742322, |
| "grad_norm": 0.1938595473766327, |
| "learning_rate": 0.001, |
| "loss": 2.654, |
| "num_input_tokens_seen": 7995392000, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.14548576743742322, |
| "eval_loss": 2.537402391433716, |
| "eval_runtime": 50.7304, |
| "eval_samples_per_second": 98.56, |
| "eval_steps_per_second": 24.64, |
| "num_input_tokens_seen": 7995392000, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.14572426869551736, |
| "grad_norm": 0.18282300233840942, |
| "learning_rate": 0.001, |
| "loss": 2.6592, |
| "num_input_tokens_seen": 8008499200, |
| "step": 30550 |
| }, |
| { |
| "epoch": 0.14596276995361152, |
| "grad_norm": 0.1829262375831604, |
| "learning_rate": 0.001, |
| "loss": 2.6618, |
| "num_input_tokens_seen": 8021606400, |
| "step": 30600 |
| }, |
| { |
| "epoch": 0.14620127121170565, |
| "grad_norm": 0.19001947343349457, |
| "learning_rate": 0.001, |
| "loss": 2.649, |
| "num_input_tokens_seen": 8034713600, |
| "step": 30650 |
| }, |
| { |
| "epoch": 0.14643977246979978, |
| "grad_norm": 0.19943153858184814, |
| "learning_rate": 0.001, |
| "loss": 2.6578, |
| "num_input_tokens_seen": 8047820800, |
| "step": 30700 |
| }, |
| { |
| "epoch": 0.1466782737278939, |
| "grad_norm": 0.18482360243797302, |
| "learning_rate": 0.001, |
| "loss": 2.6616, |
| "num_input_tokens_seen": 8060928000, |
| "step": 30750 |
| }, |
| { |
| "epoch": 0.14691677498598804, |
| "grad_norm": 0.20858009159564972, |
| "learning_rate": 0.001, |
| "loss": 2.6684, |
| "num_input_tokens_seen": 8074035200, |
| "step": 30800 |
| }, |
| { |
| "epoch": 0.1471552762440822, |
| "grad_norm": 0.2759605646133423, |
| "learning_rate": 0.001, |
| "loss": 2.713, |
| "num_input_tokens_seen": 8087142400, |
| "step": 30850 |
| }, |
| { |
| "epoch": 0.14739377750217633, |
| "grad_norm": 0.22366145253181458, |
| "learning_rate": 0.001, |
| "loss": 2.7065, |
| "num_input_tokens_seen": 8100249600, |
| "step": 30900 |
| }, |
| { |
| "epoch": 0.14763227876027046, |
| "grad_norm": 0.22143268585205078, |
| "learning_rate": 0.001, |
| "loss": 2.672, |
| "num_input_tokens_seen": 8113356800, |
| "step": 30950 |
| }, |
| { |
| "epoch": 0.1478707800183646, |
| "grad_norm": 0.25140002369880676, |
| "learning_rate": 0.001, |
| "loss": 2.6658, |
| "num_input_tokens_seen": 8126464000, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.1478707800183646, |
| "eval_loss": 2.5451457500457764, |
| "eval_runtime": 50.0622, |
| "eval_samples_per_second": 99.876, |
| "eval_steps_per_second": 24.969, |
| "num_input_tokens_seen": 8126464000, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.14810928127645873, |
| "grad_norm": 0.20207786560058594, |
| "learning_rate": 0.001, |
| "loss": 2.6493, |
| "num_input_tokens_seen": 8139571200, |
| "step": 31050 |
| }, |
| { |
| "epoch": 0.14834778253455286, |
| "grad_norm": 0.20135898888111115, |
| "learning_rate": 0.001, |
| "loss": 2.6555, |
| "num_input_tokens_seen": 8152678400, |
| "step": 31100 |
| }, |
| { |
| "epoch": 0.14858628379264702, |
| "grad_norm": 0.19284267723560333, |
| "learning_rate": 0.001, |
| "loss": 2.6637, |
| "num_input_tokens_seen": 8165785600, |
| "step": 31150 |
| }, |
| { |
| "epoch": 0.14882478505074115, |
| "grad_norm": 0.17214693129062653, |
| "learning_rate": 0.001, |
| "loss": 2.6663, |
| "num_input_tokens_seen": 8178892800, |
| "step": 31200 |
| }, |
| { |
| "epoch": 0.14906328630883528, |
| "grad_norm": 0.19444549083709717, |
| "learning_rate": 0.001, |
| "loss": 2.6541, |
| "num_input_tokens_seen": 8192000000, |
| "step": 31250 |
| }, |
| { |
| "epoch": 0.1493017875669294, |
| "grad_norm": 0.19992901384830475, |
| "learning_rate": 0.001, |
| "loss": 2.6419, |
| "num_input_tokens_seen": 8205107200, |
| "step": 31300 |
| }, |
| { |
| "epoch": 0.14954028882502354, |
| "grad_norm": 0.16732315719127655, |
| "learning_rate": 0.001, |
| "loss": 2.6559, |
| "num_input_tokens_seen": 8218214400, |
| "step": 31350 |
| }, |
| { |
| "epoch": 0.1497787900831177, |
| "grad_norm": 0.4210798442363739, |
| "learning_rate": 0.001, |
| "loss": 2.6478, |
| "num_input_tokens_seen": 8231321600, |
| "step": 31400 |
| }, |
| { |
| "epoch": 0.15001729134121183, |
| "grad_norm": 0.2139436900615692, |
| "learning_rate": 0.001, |
| "loss": 2.6753, |
| "num_input_tokens_seen": 8244428800, |
| "step": 31450 |
| }, |
| { |
| "epoch": 0.15025579259930597, |
| "grad_norm": 0.19131046533584595, |
| "learning_rate": 0.001, |
| "loss": 2.6675, |
| "num_input_tokens_seen": 8257536000, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.15025579259930597, |
| "eval_loss": 2.5402350425720215, |
| "eval_runtime": 50.477, |
| "eval_samples_per_second": 99.055, |
| "eval_steps_per_second": 24.764, |
| "num_input_tokens_seen": 8257536000, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.1504942938574001, |
| "grad_norm": 0.20711492002010345, |
| "learning_rate": 0.001, |
| "loss": 2.6654, |
| "num_input_tokens_seen": 8270643200, |
| "step": 31550 |
| }, |
| { |
| "epoch": 0.15073279511549423, |
| "grad_norm": 0.1888076812028885, |
| "learning_rate": 0.001, |
| "loss": 2.6603, |
| "num_input_tokens_seen": 8283750400, |
| "step": 31600 |
| }, |
| { |
| "epoch": 0.15097129637358836, |
| "grad_norm": 0.18534335494041443, |
| "learning_rate": 0.001, |
| "loss": 2.6539, |
| "num_input_tokens_seen": 8296857600, |
| "step": 31650 |
| }, |
| { |
| "epoch": 0.15120979763168252, |
| "grad_norm": 0.2024192214012146, |
| "learning_rate": 0.001, |
| "loss": 2.6514, |
| "num_input_tokens_seen": 8309964800, |
| "step": 31700 |
| }, |
| { |
| "epoch": 0.15144829888977665, |
| "grad_norm": 0.18967773020267487, |
| "learning_rate": 0.001, |
| "loss": 2.6457, |
| "num_input_tokens_seen": 8323072000, |
| "step": 31750 |
| }, |
| { |
| "epoch": 0.15168680014787078, |
| "grad_norm": 0.18823806941509247, |
| "learning_rate": 0.001, |
| "loss": 2.6579, |
| "num_input_tokens_seen": 8336179200, |
| "step": 31800 |
| }, |
| { |
| "epoch": 0.1519253014059649, |
| "grad_norm": 0.20198485255241394, |
| "learning_rate": 0.001, |
| "loss": 2.6623, |
| "num_input_tokens_seen": 8349286400, |
| "step": 31850 |
| }, |
| { |
| "epoch": 0.15216380266405904, |
| "grad_norm": 0.19362477958202362, |
| "learning_rate": 0.001, |
| "loss": 2.6473, |
| "num_input_tokens_seen": 8362393600, |
| "step": 31900 |
| }, |
| { |
| "epoch": 0.15240230392215318, |
| "grad_norm": 0.18454812467098236, |
| "learning_rate": 0.001, |
| "loss": 2.6411, |
| "num_input_tokens_seen": 8375500800, |
| "step": 31950 |
| }, |
| { |
| "epoch": 0.15264080518024734, |
| "grad_norm": 0.1968630850315094, |
| "learning_rate": 0.001, |
| "loss": 2.6405, |
| "num_input_tokens_seen": 8388608000, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.15264080518024734, |
| "eval_loss": 2.5325138568878174, |
| "eval_runtime": 51.0134, |
| "eval_samples_per_second": 98.013, |
| "eval_steps_per_second": 24.503, |
| "num_input_tokens_seen": 8388608000, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.15287930643834147, |
| "grad_norm": 0.180119588971138, |
| "learning_rate": 0.001, |
| "loss": 2.6558, |
| "num_input_tokens_seen": 8401715200, |
| "step": 32050 |
| }, |
| { |
| "epoch": 0.1531178076964356, |
| "grad_norm": 0.1952589452266693, |
| "learning_rate": 0.001, |
| "loss": 2.6465, |
| "num_input_tokens_seen": 8414822400, |
| "step": 32100 |
| }, |
| { |
| "epoch": 0.15335630895452973, |
| "grad_norm": 0.1845589131116867, |
| "learning_rate": 0.001, |
| "loss": 2.6297, |
| "num_input_tokens_seen": 8427929600, |
| "step": 32150 |
| }, |
| { |
| "epoch": 0.15359481021262386, |
| "grad_norm": 0.20116594433784485, |
| "learning_rate": 0.001, |
| "loss": 2.6422, |
| "num_input_tokens_seen": 8441036800, |
| "step": 32200 |
| }, |
| { |
| "epoch": 0.15383331147071802, |
| "grad_norm": 0.1932612508535385, |
| "learning_rate": 0.001, |
| "loss": 2.6494, |
| "num_input_tokens_seen": 8454144000, |
| "step": 32250 |
| }, |
| { |
| "epoch": 0.15407181272881215, |
| "grad_norm": 0.17934490740299225, |
| "learning_rate": 0.001, |
| "loss": 2.6474, |
| "num_input_tokens_seen": 8467251200, |
| "step": 32300 |
| }, |
| { |
| "epoch": 0.15431031398690628, |
| "grad_norm": 0.19273313879966736, |
| "learning_rate": 0.001, |
| "loss": 2.6447, |
| "num_input_tokens_seen": 8480358400, |
| "step": 32350 |
| }, |
| { |
| "epoch": 0.15454881524500041, |
| "grad_norm": 0.1921055018901825, |
| "learning_rate": 0.001, |
| "loss": 2.665, |
| "num_input_tokens_seen": 8493465600, |
| "step": 32400 |
| }, |
| { |
| "epoch": 0.15478731650309455, |
| "grad_norm": 0.37117844820022583, |
| "learning_rate": 0.001, |
| "loss": 2.6351, |
| "num_input_tokens_seen": 8506572800, |
| "step": 32450 |
| }, |
| { |
| "epoch": 0.15502581776118868, |
| "grad_norm": 0.1884016990661621, |
| "learning_rate": 0.001, |
| "loss": 2.6436, |
| "num_input_tokens_seen": 8519680000, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.15502581776118868, |
| "eval_loss": 2.5313448905944824, |
| "eval_runtime": 50.7051, |
| "eval_samples_per_second": 98.609, |
| "eval_steps_per_second": 24.652, |
| "num_input_tokens_seen": 8519680000, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.15526431901928284, |
| "grad_norm": 0.22205407917499542, |
| "learning_rate": 0.001, |
| "loss": 2.6464, |
| "num_input_tokens_seen": 8532787200, |
| "step": 32550 |
| }, |
| { |
| "epoch": 0.15550282027737697, |
| "grad_norm": 0.18515361845493317, |
| "learning_rate": 0.001, |
| "loss": 2.642, |
| "num_input_tokens_seen": 8545894400, |
| "step": 32600 |
| }, |
| { |
| "epoch": 0.1557413215354711, |
| "grad_norm": 0.18903231620788574, |
| "learning_rate": 0.001, |
| "loss": 2.6446, |
| "num_input_tokens_seen": 8559001600, |
| "step": 32650 |
| }, |
| { |
| "epoch": 0.15597982279356523, |
| "grad_norm": 0.1857556253671646, |
| "learning_rate": 0.001, |
| "loss": 2.6561, |
| "num_input_tokens_seen": 8572108800, |
| "step": 32700 |
| }, |
| { |
| "epoch": 0.15621832405165936, |
| "grad_norm": 0.45706707239151, |
| "learning_rate": 0.001, |
| "loss": 2.6487, |
| "num_input_tokens_seen": 8585216000, |
| "step": 32750 |
| }, |
| { |
| "epoch": 0.15645682530975352, |
| "grad_norm": 0.20191136002540588, |
| "learning_rate": 0.001, |
| "loss": 2.6593, |
| "num_input_tokens_seen": 8598323200, |
| "step": 32800 |
| }, |
| { |
| "epoch": 0.15669532656784765, |
| "grad_norm": 0.21191105246543884, |
| "learning_rate": 0.001, |
| "loss": 2.659, |
| "num_input_tokens_seen": 8611430400, |
| "step": 32850 |
| }, |
| { |
| "epoch": 0.15693382782594179, |
| "grad_norm": 0.20596672594547272, |
| "learning_rate": 0.001, |
| "loss": 2.6354, |
| "num_input_tokens_seen": 8624537600, |
| "step": 32900 |
| }, |
| { |
| "epoch": 0.15717232908403592, |
| "grad_norm": 0.2952199876308441, |
| "learning_rate": 0.001, |
| "loss": 2.6501, |
| "num_input_tokens_seen": 8637644800, |
| "step": 32950 |
| }, |
| { |
| "epoch": 0.15741083034213005, |
| "grad_norm": 0.2217044234275818, |
| "learning_rate": 0.001, |
| "loss": 2.6495, |
| "num_input_tokens_seen": 8650752000, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.15741083034213005, |
| "eval_loss": 2.5319430828094482, |
| "eval_runtime": 50.8413, |
| "eval_samples_per_second": 98.345, |
| "eval_steps_per_second": 24.586, |
| "num_input_tokens_seen": 8650752000, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.15764933160022418, |
| "grad_norm": 0.2384626269340515, |
| "learning_rate": 0.001, |
| "loss": 2.6503, |
| "num_input_tokens_seen": 8663859200, |
| "step": 33050 |
| }, |
| { |
| "epoch": 0.15788783285831834, |
| "grad_norm": 0.18387843668460846, |
| "learning_rate": 0.001, |
| "loss": 2.6469, |
| "num_input_tokens_seen": 8676966400, |
| "step": 33100 |
| }, |
| { |
| "epoch": 0.15812633411641247, |
| "grad_norm": 0.23530641198158264, |
| "learning_rate": 0.001, |
| "loss": 2.6484, |
| "num_input_tokens_seen": 8690073600, |
| "step": 33150 |
| }, |
| { |
| "epoch": 0.1583648353745066, |
| "grad_norm": 0.2027565985918045, |
| "learning_rate": 0.001, |
| "loss": 2.6564, |
| "num_input_tokens_seen": 8703180800, |
| "step": 33200 |
| }, |
| { |
| "epoch": 0.15860333663260073, |
| "grad_norm": 0.21472220122814178, |
| "learning_rate": 0.001, |
| "loss": 2.6543, |
| "num_input_tokens_seen": 8716288000, |
| "step": 33250 |
| }, |
| { |
| "epoch": 0.15884183789069486, |
| "grad_norm": 0.19012615084648132, |
| "learning_rate": 0.001, |
| "loss": 2.6378, |
| "num_input_tokens_seen": 8729395200, |
| "step": 33300 |
| }, |
| { |
| "epoch": 0.159080339148789, |
| "grad_norm": 0.18018738925457, |
| "learning_rate": 0.001, |
| "loss": 2.6461, |
| "num_input_tokens_seen": 8742502400, |
| "step": 33350 |
| }, |
| { |
| "epoch": 0.15931884040688316, |
| "grad_norm": 0.20139184594154358, |
| "learning_rate": 0.001, |
| "loss": 2.6419, |
| "num_input_tokens_seen": 8755609600, |
| "step": 33400 |
| }, |
| { |
| "epoch": 0.1595573416649773, |
| "grad_norm": 0.20734767615795135, |
| "learning_rate": 0.001, |
| "loss": 2.6299, |
| "num_input_tokens_seen": 8768716800, |
| "step": 33450 |
| }, |
| { |
| "epoch": 0.15979584292307142, |
| "grad_norm": 0.18958640098571777, |
| "learning_rate": 0.001, |
| "loss": 2.6525, |
| "num_input_tokens_seen": 8781824000, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.15979584292307142, |
| "eval_loss": 2.5301430225372314, |
| "eval_runtime": 64.6928, |
| "eval_samples_per_second": 77.288, |
| "eval_steps_per_second": 19.322, |
| "num_input_tokens_seen": 8781824000, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.16003434418116555, |
| "grad_norm": 0.20421727001667023, |
| "learning_rate": 0.001, |
| "loss": 2.6445, |
| "num_input_tokens_seen": 8794931200, |
| "step": 33550 |
| }, |
| { |
| "epoch": 0.16027284543925968, |
| "grad_norm": 0.18347379565238953, |
| "learning_rate": 0.001, |
| "loss": 2.6525, |
| "num_input_tokens_seen": 8808038400, |
| "step": 33600 |
| }, |
| { |
| "epoch": 0.16051134669735384, |
| "grad_norm": 0.19450639188289642, |
| "learning_rate": 0.001, |
| "loss": 2.6356, |
| "num_input_tokens_seen": 8821145600, |
| "step": 33650 |
| }, |
| { |
| "epoch": 0.16074984795544797, |
| "grad_norm": 0.17953775823116302, |
| "learning_rate": 0.001, |
| "loss": 2.6424, |
| "num_input_tokens_seen": 8834252800, |
| "step": 33700 |
| }, |
| { |
| "epoch": 0.1609883492135421, |
| "grad_norm": 0.1990649551153183, |
| "learning_rate": 0.001, |
| "loss": 2.6608, |
| "num_input_tokens_seen": 8847360000, |
| "step": 33750 |
| }, |
| { |
| "epoch": 0.16122685047163623, |
| "grad_norm": 0.19343194365501404, |
| "learning_rate": 0.001, |
| "loss": 2.6604, |
| "num_input_tokens_seen": 8860467200, |
| "step": 33800 |
| }, |
| { |
| "epoch": 0.16146535172973037, |
| "grad_norm": 0.19385921955108643, |
| "learning_rate": 0.001, |
| "loss": 2.6354, |
| "num_input_tokens_seen": 8873574400, |
| "step": 33850 |
| }, |
| { |
| "epoch": 0.1617038529878245, |
| "grad_norm": 0.1828273981809616, |
| "learning_rate": 0.001, |
| "loss": 2.6578, |
| "num_input_tokens_seen": 8886681600, |
| "step": 33900 |
| }, |
| { |
| "epoch": 0.16194235424591866, |
| "grad_norm": 0.216063991189003, |
| "learning_rate": 0.001, |
| "loss": 2.6575, |
| "num_input_tokens_seen": 8899788800, |
| "step": 33950 |
| }, |
| { |
| "epoch": 0.1621808555040128, |
| "grad_norm": 0.20358648896217346, |
| "learning_rate": 0.001, |
| "loss": 2.6499, |
| "num_input_tokens_seen": 8912896000, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.1621808555040128, |
| "eval_loss": 2.5330910682678223, |
| "eval_runtime": 50.7961, |
| "eval_samples_per_second": 98.433, |
| "eval_steps_per_second": 24.608, |
| "num_input_tokens_seen": 8912896000, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.16241935676210692, |
| "grad_norm": 0.1935052126646042, |
| "learning_rate": 0.001, |
| "loss": 2.6583, |
| "num_input_tokens_seen": 8926003200, |
| "step": 34050 |
| }, |
| { |
| "epoch": 0.16265785802020105, |
| "grad_norm": 0.7825157642364502, |
| "learning_rate": 0.001, |
| "loss": 2.6481, |
| "num_input_tokens_seen": 8939110400, |
| "step": 34100 |
| }, |
| { |
| "epoch": 0.16289635927829518, |
| "grad_norm": 0.23290683329105377, |
| "learning_rate": 0.001, |
| "loss": 2.6925, |
| "num_input_tokens_seen": 8952217600, |
| "step": 34150 |
| }, |
| { |
| "epoch": 0.16313486053638934, |
| "grad_norm": 0.23564130067825317, |
| "learning_rate": 0.001, |
| "loss": 2.6495, |
| "num_input_tokens_seen": 8965324800, |
| "step": 34200 |
| }, |
| { |
| "epoch": 0.16337336179448347, |
| "grad_norm": 0.19592130184173584, |
| "learning_rate": 0.001, |
| "loss": 2.6536, |
| "num_input_tokens_seen": 8978432000, |
| "step": 34250 |
| }, |
| { |
| "epoch": 0.1636118630525776, |
| "grad_norm": 0.23535041511058807, |
| "learning_rate": 0.001, |
| "loss": 2.6608, |
| "num_input_tokens_seen": 8991539200, |
| "step": 34300 |
| }, |
| { |
| "epoch": 0.16385036431067174, |
| "grad_norm": 0.1991938352584839, |
| "learning_rate": 0.001, |
| "loss": 2.6458, |
| "num_input_tokens_seen": 9004641856, |
| "step": 34350 |
| }, |
| { |
| "epoch": 0.16408886556876587, |
| "grad_norm": 0.19363388419151306, |
| "learning_rate": 0.001, |
| "loss": 2.6531, |
| "num_input_tokens_seen": 9017749056, |
| "step": 34400 |
| }, |
| { |
| "epoch": 0.16432736682686, |
| "grad_norm": 0.18500390648841858, |
| "learning_rate": 0.001, |
| "loss": 2.6391, |
| "num_input_tokens_seen": 9030856256, |
| "step": 34450 |
| }, |
| { |
| "epoch": 0.16456586808495416, |
| "grad_norm": 0.2774065434932709, |
| "learning_rate": 0.001, |
| "loss": 2.6619, |
| "num_input_tokens_seen": 9043963456, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.16456586808495416, |
| "eval_loss": 2.5325100421905518, |
| "eval_runtime": 51.5954, |
| "eval_samples_per_second": 96.908, |
| "eval_steps_per_second": 24.227, |
| "num_input_tokens_seen": 9043963456, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.1648043693430483, |
| "grad_norm": 0.1957511603832245, |
| "learning_rate": 0.001, |
| "loss": 2.6456, |
| "num_input_tokens_seen": 9057070656, |
| "step": 34550 |
| }, |
| { |
| "epoch": 0.16504287060114242, |
| "grad_norm": 0.20958378911018372, |
| "learning_rate": 0.001, |
| "loss": 2.6452, |
| "num_input_tokens_seen": 9070177856, |
| "step": 34600 |
| }, |
| { |
| "epoch": 0.16528137185923655, |
| "grad_norm": 0.206208735704422, |
| "learning_rate": 0.001, |
| "loss": 2.6548, |
| "num_input_tokens_seen": 9083285056, |
| "step": 34650 |
| }, |
| { |
| "epoch": 0.16551987311733068, |
| "grad_norm": 0.22349481284618378, |
| "learning_rate": 0.001, |
| "loss": 2.6653, |
| "num_input_tokens_seen": 9096392256, |
| "step": 34700 |
| }, |
| { |
| "epoch": 0.16575837437542484, |
| "grad_norm": 0.22599968314170837, |
| "learning_rate": 0.001, |
| "loss": 2.6329, |
| "num_input_tokens_seen": 9109499456, |
| "step": 34750 |
| }, |
| { |
| "epoch": 0.16599687563351898, |
| "grad_norm": 0.19219790399074554, |
| "learning_rate": 0.001, |
| "loss": 2.6404, |
| "num_input_tokens_seen": 9122606656, |
| "step": 34800 |
| }, |
| { |
| "epoch": 0.1662353768916131, |
| "grad_norm": 0.2006351351737976, |
| "learning_rate": 0.001, |
| "loss": 2.6522, |
| "num_input_tokens_seen": 9135713856, |
| "step": 34850 |
| }, |
| { |
| "epoch": 0.16647387814970724, |
| "grad_norm": 0.18393316864967346, |
| "learning_rate": 0.001, |
| "loss": 2.6464, |
| "num_input_tokens_seen": 9148821056, |
| "step": 34900 |
| }, |
| { |
| "epoch": 0.16671237940780137, |
| "grad_norm": 0.19820146262645721, |
| "learning_rate": 0.001, |
| "loss": 2.6402, |
| "num_input_tokens_seen": 9161928256, |
| "step": 34950 |
| }, |
| { |
| "epoch": 0.1669508806658955, |
| "grad_norm": 0.1995670199394226, |
| "learning_rate": 0.001, |
| "loss": 2.652, |
| "num_input_tokens_seen": 9175035456, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.1669508806658955, |
| "eval_loss": 2.5248045921325684, |
| "eval_runtime": 50.8205, |
| "eval_samples_per_second": 98.386, |
| "eval_steps_per_second": 24.596, |
| "num_input_tokens_seen": 9175035456, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.16718938192398966, |
| "grad_norm": 0.2099646031856537, |
| "learning_rate": 0.001, |
| "loss": 2.6307, |
| "num_input_tokens_seen": 9188142656, |
| "step": 35050 |
| }, |
| { |
| "epoch": 0.1674278831820838, |
| "grad_norm": 0.18913927674293518, |
| "learning_rate": 0.001, |
| "loss": 2.6368, |
| "num_input_tokens_seen": 9201249856, |
| "step": 35100 |
| }, |
| { |
| "epoch": 0.16766638444017792, |
| "grad_norm": 0.19193056225776672, |
| "learning_rate": 0.001, |
| "loss": 2.6325, |
| "num_input_tokens_seen": 9214357056, |
| "step": 35150 |
| }, |
| { |
| "epoch": 0.16790488569827206, |
| "grad_norm": 0.19911837577819824, |
| "learning_rate": 0.001, |
| "loss": 2.6543, |
| "num_input_tokens_seen": 9227464256, |
| "step": 35200 |
| }, |
| { |
| "epoch": 0.1681433869563662, |
| "grad_norm": 0.1985558718442917, |
| "learning_rate": 0.001, |
| "loss": 2.6518, |
| "num_input_tokens_seen": 9240571456, |
| "step": 35250 |
| }, |
| { |
| "epoch": 0.16838188821446032, |
| "grad_norm": 0.2079145759344101, |
| "learning_rate": 0.001, |
| "loss": 2.646, |
| "num_input_tokens_seen": 9253678656, |
| "step": 35300 |
| }, |
| { |
| "epoch": 0.16862038947255448, |
| "grad_norm": 0.18524424731731415, |
| "learning_rate": 0.001, |
| "loss": 2.6378, |
| "num_input_tokens_seen": 9266785856, |
| "step": 35350 |
| }, |
| { |
| "epoch": 0.1688588907306486, |
| "grad_norm": 0.19140370190143585, |
| "learning_rate": 0.001, |
| "loss": 2.6488, |
| "num_input_tokens_seen": 9279893056, |
| "step": 35400 |
| }, |
| { |
| "epoch": 0.16909739198874274, |
| "grad_norm": 0.18006138503551483, |
| "learning_rate": 0.001, |
| "loss": 2.6632, |
| "num_input_tokens_seen": 9293000256, |
| "step": 35450 |
| }, |
| { |
| "epoch": 0.16933589324683687, |
| "grad_norm": 0.18754282593727112, |
| "learning_rate": 0.001, |
| "loss": 2.6436, |
| "num_input_tokens_seen": 9306107456, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.16933589324683687, |
| "eval_loss": 2.5230932235717773, |
| "eval_runtime": 50.9895, |
| "eval_samples_per_second": 98.059, |
| "eval_steps_per_second": 24.515, |
| "num_input_tokens_seen": 9306107456, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.169574394504931, |
| "grad_norm": 0.18708109855651855, |
| "learning_rate": 0.001, |
| "loss": 2.6509, |
| "num_input_tokens_seen": 9319214656, |
| "step": 35550 |
| }, |
| { |
| "epoch": 0.16981289576302516, |
| "grad_norm": 0.2019611895084381, |
| "learning_rate": 0.001, |
| "loss": 2.6333, |
| "num_input_tokens_seen": 9332321856, |
| "step": 35600 |
| }, |
| { |
| "epoch": 0.1700513970211193, |
| "grad_norm": 0.22504755854606628, |
| "learning_rate": 0.001, |
| "loss": 2.6359, |
| "num_input_tokens_seen": 9345429056, |
| "step": 35650 |
| }, |
| { |
| "epoch": 0.17028989827921343, |
| "grad_norm": 0.1972053200006485, |
| "learning_rate": 0.001, |
| "loss": 2.6362, |
| "num_input_tokens_seen": 9358536256, |
| "step": 35700 |
| }, |
| { |
| "epoch": 0.17052839953730756, |
| "grad_norm": 0.21156789362430573, |
| "learning_rate": 0.001, |
| "loss": 2.637, |
| "num_input_tokens_seen": 9371643456, |
| "step": 35750 |
| }, |
| { |
| "epoch": 0.1707669007954017, |
| "grad_norm": 0.2680750787258148, |
| "learning_rate": 0.001, |
| "loss": 2.6332, |
| "num_input_tokens_seen": 9384750656, |
| "step": 35800 |
| }, |
| { |
| "epoch": 0.17100540205349582, |
| "grad_norm": 0.24413707852363586, |
| "learning_rate": 0.001, |
| "loss": 2.6366, |
| "num_input_tokens_seen": 9397857856, |
| "step": 35850 |
| }, |
| { |
| "epoch": 0.17124390331158998, |
| "grad_norm": 0.19973772764205933, |
| "learning_rate": 0.001, |
| "loss": 2.6381, |
| "num_input_tokens_seen": 9410965056, |
| "step": 35900 |
| }, |
| { |
| "epoch": 0.1714824045696841, |
| "grad_norm": 0.20807349681854248, |
| "learning_rate": 0.001, |
| "loss": 2.6416, |
| "num_input_tokens_seen": 9424072256, |
| "step": 35950 |
| }, |
| { |
| "epoch": 0.17172090582777824, |
| "grad_norm": 0.20126542448997498, |
| "learning_rate": 0.001, |
| "loss": 2.6377, |
| "num_input_tokens_seen": 9437179456, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.17172090582777824, |
| "eval_loss": 2.52500581741333, |
| "eval_runtime": 51.4353, |
| "eval_samples_per_second": 97.21, |
| "eval_steps_per_second": 24.302, |
| "num_input_tokens_seen": 9437179456, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.17195940708587237, |
| "grad_norm": 0.19696597754955292, |
| "learning_rate": 0.001, |
| "loss": 2.6521, |
| "num_input_tokens_seen": 9450286656, |
| "step": 36050 |
| }, |
| { |
| "epoch": 0.1721979083439665, |
| "grad_norm": 0.18839424848556519, |
| "learning_rate": 0.001, |
| "loss": 2.6484, |
| "num_input_tokens_seen": 9463393856, |
| "step": 36100 |
| }, |
| { |
| "epoch": 0.17243640960206066, |
| "grad_norm": 0.33748558163642883, |
| "learning_rate": 0.001, |
| "loss": 2.6496, |
| "num_input_tokens_seen": 9476501056, |
| "step": 36150 |
| }, |
| { |
| "epoch": 0.1726749108601548, |
| "grad_norm": 0.19529207050800323, |
| "learning_rate": 0.001, |
| "loss": 2.6484, |
| "num_input_tokens_seen": 9489608256, |
| "step": 36200 |
| }, |
| { |
| "epoch": 0.17291341211824893, |
| "grad_norm": 0.21542242169380188, |
| "learning_rate": 0.001, |
| "loss": 2.6572, |
| "num_input_tokens_seen": 9502715456, |
| "step": 36250 |
| }, |
| { |
| "epoch": 0.17315191337634306, |
| "grad_norm": 0.37017494440078735, |
| "learning_rate": 0.001, |
| "loss": 2.6517, |
| "num_input_tokens_seen": 9515822656, |
| "step": 36300 |
| }, |
| { |
| "epoch": 0.1733904146344372, |
| "grad_norm": 0.27284151315689087, |
| "learning_rate": 0.001, |
| "loss": 2.66, |
| "num_input_tokens_seen": 9528929856, |
| "step": 36350 |
| }, |
| { |
| "epoch": 0.17362891589253132, |
| "grad_norm": 0.4666242003440857, |
| "learning_rate": 0.001, |
| "loss": 2.6514, |
| "num_input_tokens_seen": 9542037056, |
| "step": 36400 |
| }, |
| { |
| "epoch": 0.17386741715062548, |
| "grad_norm": 0.2031467854976654, |
| "learning_rate": 0.001, |
| "loss": 2.6577, |
| "num_input_tokens_seen": 9555144256, |
| "step": 36450 |
| }, |
| { |
| "epoch": 0.1741059184087196, |
| "grad_norm": 0.2086576223373413, |
| "learning_rate": 0.001, |
| "loss": 2.6372, |
| "num_input_tokens_seen": 9568251456, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.1741059184087196, |
| "eval_loss": 2.5252223014831543, |
| "eval_runtime": 51.1282, |
| "eval_samples_per_second": 97.793, |
| "eval_steps_per_second": 24.448, |
| "num_input_tokens_seen": 9568251456, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.17434441966681374, |
| "grad_norm": 0.19739161431789398, |
| "learning_rate": 0.001, |
| "loss": 2.6184, |
| "num_input_tokens_seen": 9581358656, |
| "step": 36550 |
| }, |
| { |
| "epoch": 0.17458292092490788, |
| "grad_norm": 0.22384846210479736, |
| "learning_rate": 0.001, |
| "loss": 2.6504, |
| "num_input_tokens_seen": 9594465856, |
| "step": 36600 |
| }, |
| { |
| "epoch": 0.174821422183002, |
| "grad_norm": 0.2055511772632599, |
| "learning_rate": 0.001, |
| "loss": 2.6333, |
| "num_input_tokens_seen": 9607573056, |
| "step": 36650 |
| }, |
| { |
| "epoch": 0.17505992344109614, |
| "grad_norm": 0.18193551898002625, |
| "learning_rate": 0.001, |
| "loss": 2.6518, |
| "num_input_tokens_seen": 9620680256, |
| "step": 36700 |
| }, |
| { |
| "epoch": 0.1752984246991903, |
| "grad_norm": 0.1968860775232315, |
| "learning_rate": 0.001, |
| "loss": 2.6211, |
| "num_input_tokens_seen": 9633787456, |
| "step": 36750 |
| }, |
| { |
| "epoch": 0.17553692595728443, |
| "grad_norm": 0.20429988205432892, |
| "learning_rate": 0.001, |
| "loss": 2.6269, |
| "num_input_tokens_seen": 9646894656, |
| "step": 36800 |
| }, |
| { |
| "epoch": 0.17577542721537856, |
| "grad_norm": 0.18364110589027405, |
| "learning_rate": 0.001, |
| "loss": 2.6337, |
| "num_input_tokens_seen": 9660001856, |
| "step": 36850 |
| }, |
| { |
| "epoch": 0.1760139284734727, |
| "grad_norm": 0.2051621973514557, |
| "learning_rate": 0.001, |
| "loss": 2.6297, |
| "num_input_tokens_seen": 9673109056, |
| "step": 36900 |
| }, |
| { |
| "epoch": 0.17625242973156682, |
| "grad_norm": 0.25841349363327026, |
| "learning_rate": 0.001, |
| "loss": 2.6678, |
| "num_input_tokens_seen": 9686216256, |
| "step": 36950 |
| }, |
| { |
| "epoch": 0.17649093098966098, |
| "grad_norm": 0.198688805103302, |
| "learning_rate": 0.001, |
| "loss": 2.6521, |
| "num_input_tokens_seen": 9699323456, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.17649093098966098, |
| "eval_loss": 2.5248863697052, |
| "eval_runtime": 51.363, |
| "eval_samples_per_second": 97.346, |
| "eval_steps_per_second": 24.337, |
| "num_input_tokens_seen": 9699323456, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.1767294322477551, |
| "grad_norm": 0.2030065804719925, |
| "learning_rate": 0.001, |
| "loss": 2.6481, |
| "num_input_tokens_seen": 9712430656, |
| "step": 37050 |
| }, |
| { |
| "epoch": 0.17696793350584925, |
| "grad_norm": 0.20191729068756104, |
| "learning_rate": 0.001, |
| "loss": 2.6332, |
| "num_input_tokens_seen": 9725537856, |
| "step": 37100 |
| }, |
| { |
| "epoch": 0.17720643476394338, |
| "grad_norm": 0.19462484121322632, |
| "learning_rate": 0.001, |
| "loss": 2.6444, |
| "num_input_tokens_seen": 9738645056, |
| "step": 37150 |
| }, |
| { |
| "epoch": 0.1774449360220375, |
| "grad_norm": 0.27893325686454773, |
| "learning_rate": 0.001, |
| "loss": 2.6309, |
| "num_input_tokens_seen": 9751752256, |
| "step": 37200 |
| }, |
| { |
| "epoch": 0.17768343728013164, |
| "grad_norm": 0.20646531879901886, |
| "learning_rate": 0.001, |
| "loss": 2.646, |
| "num_input_tokens_seen": 9764859456, |
| "step": 37250 |
| }, |
| { |
| "epoch": 0.1779219385382258, |
| "grad_norm": 0.20815566182136536, |
| "learning_rate": 0.001, |
| "loss": 2.6374, |
| "num_input_tokens_seen": 9777966656, |
| "step": 37300 |
| }, |
| { |
| "epoch": 0.17816043979631993, |
| "grad_norm": 0.2194615602493286, |
| "learning_rate": 0.001, |
| "loss": 2.6313, |
| "num_input_tokens_seen": 9791073856, |
| "step": 37350 |
| }, |
| { |
| "epoch": 0.17839894105441406, |
| "grad_norm": 0.23223313689231873, |
| "learning_rate": 0.001, |
| "loss": 2.6435, |
| "num_input_tokens_seen": 9804181056, |
| "step": 37400 |
| }, |
| { |
| "epoch": 0.1786374423125082, |
| "grad_norm": 0.1731143593788147, |
| "learning_rate": 0.001, |
| "loss": 2.6397, |
| "num_input_tokens_seen": 9817288256, |
| "step": 37450 |
| }, |
| { |
| "epoch": 0.17887594357060232, |
| "grad_norm": 0.1929951161146164, |
| "learning_rate": 0.001, |
| "loss": 2.6406, |
| "num_input_tokens_seen": 9830395456, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.17887594357060232, |
| "eval_loss": 2.518928050994873, |
| "eval_runtime": 51.8733, |
| "eval_samples_per_second": 96.389, |
| "eval_steps_per_second": 24.097, |
| "num_input_tokens_seen": 9830395456, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.17911444482869648, |
| "grad_norm": 0.19979524612426758, |
| "learning_rate": 0.001, |
| "loss": 2.6363, |
| "num_input_tokens_seen": 9843502656, |
| "step": 37550 |
| }, |
| { |
| "epoch": 0.17935294608679062, |
| "grad_norm": 0.17963503301143646, |
| "learning_rate": 0.001, |
| "loss": 2.6423, |
| "num_input_tokens_seen": 9856609856, |
| "step": 37600 |
| }, |
| { |
| "epoch": 0.17959144734488475, |
| "grad_norm": 0.18216437101364136, |
| "learning_rate": 0.001, |
| "loss": 2.6351, |
| "num_input_tokens_seen": 9869717056, |
| "step": 37650 |
| }, |
| { |
| "epoch": 0.17982994860297888, |
| "grad_norm": 0.16782627999782562, |
| "learning_rate": 0.001, |
| "loss": 2.623, |
| "num_input_tokens_seen": 9882824256, |
| "step": 37700 |
| }, |
| { |
| "epoch": 0.180068449861073, |
| "grad_norm": 0.21884289383888245, |
| "learning_rate": 0.001, |
| "loss": 2.6418, |
| "num_input_tokens_seen": 9895931456, |
| "step": 37750 |
| }, |
| { |
| "epoch": 0.18030695111916714, |
| "grad_norm": 0.18940453231334686, |
| "learning_rate": 0.001, |
| "loss": 2.6371, |
| "num_input_tokens_seen": 9909038656, |
| "step": 37800 |
| }, |
| { |
| "epoch": 0.1805454523772613, |
| "grad_norm": 0.2075282484292984, |
| "learning_rate": 0.001, |
| "loss": 2.6347, |
| "num_input_tokens_seen": 9922145856, |
| "step": 37850 |
| }, |
| { |
| "epoch": 0.18078395363535543, |
| "grad_norm": 0.18504877388477325, |
| "learning_rate": 0.001, |
| "loss": 2.6391, |
| "num_input_tokens_seen": 9935253056, |
| "step": 37900 |
| }, |
| { |
| "epoch": 0.18102245489344956, |
| "grad_norm": 0.17926527559757233, |
| "learning_rate": 0.001, |
| "loss": 2.6358, |
| "num_input_tokens_seen": 9948360256, |
| "step": 37950 |
| }, |
| { |
| "epoch": 0.1812609561515437, |
| "grad_norm": 0.20022514462471008, |
| "learning_rate": 0.001, |
| "loss": 2.6369, |
| "num_input_tokens_seen": 9961467456, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.1812609561515437, |
| "eval_loss": 2.5171313285827637, |
| "eval_runtime": 51.617, |
| "eval_samples_per_second": 96.867, |
| "eval_steps_per_second": 24.217, |
| "num_input_tokens_seen": 9961467456, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.18149945740963783, |
| "grad_norm": 0.19376301765441895, |
| "learning_rate": 0.001, |
| "loss": 2.6274, |
| "num_input_tokens_seen": 9974574656, |
| "step": 38050 |
| }, |
| { |
| "epoch": 0.18173795866773199, |
| "grad_norm": 0.2077150195837021, |
| "learning_rate": 0.001, |
| "loss": 2.6303, |
| "num_input_tokens_seen": 9987681856, |
| "step": 38100 |
| }, |
| { |
| "epoch": 0.18197645992582612, |
| "grad_norm": 0.19407787919044495, |
| "learning_rate": 0.001, |
| "loss": 2.6246, |
| "num_input_tokens_seen": 10000789056, |
| "step": 38150 |
| }, |
| { |
| "epoch": 0.18221496118392025, |
| "grad_norm": 0.20558005571365356, |
| "learning_rate": 0.001, |
| "loss": 2.6291, |
| "num_input_tokens_seen": 10013896256, |
| "step": 38200 |
| }, |
| { |
| "epoch": 0.18245346244201438, |
| "grad_norm": 0.22928735613822937, |
| "learning_rate": 0.001, |
| "loss": 2.6336, |
| "num_input_tokens_seen": 10027003456, |
| "step": 38250 |
| }, |
| { |
| "epoch": 0.1826919637001085, |
| "grad_norm": 0.23481298983097076, |
| "learning_rate": 0.001, |
| "loss": 2.6412, |
| "num_input_tokens_seen": 10040110656, |
| "step": 38300 |
| }, |
| { |
| "epoch": 0.18293046495820264, |
| "grad_norm": 0.19808940589427948, |
| "learning_rate": 0.001, |
| "loss": 2.6395, |
| "num_input_tokens_seen": 10053217856, |
| "step": 38350 |
| }, |
| { |
| "epoch": 0.1831689662162968, |
| "grad_norm": 0.20152992010116577, |
| "learning_rate": 0.001, |
| "loss": 2.6224, |
| "num_input_tokens_seen": 10066325056, |
| "step": 38400 |
| }, |
| { |
| "epoch": 0.18340746747439093, |
| "grad_norm": 0.18065959215164185, |
| "learning_rate": 0.001, |
| "loss": 2.626, |
| "num_input_tokens_seen": 10079432256, |
| "step": 38450 |
| }, |
| { |
| "epoch": 0.18364596873248507, |
| "grad_norm": 0.20382963120937347, |
| "learning_rate": 0.001, |
| "loss": 2.6382, |
| "num_input_tokens_seen": 10092539456, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.18364596873248507, |
| "eval_loss": 2.5152854919433594, |
| "eval_runtime": 51.2566, |
| "eval_samples_per_second": 97.548, |
| "eval_steps_per_second": 24.387, |
| "num_input_tokens_seen": 10092539456, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.1838844699905792, |
| "grad_norm": 0.17728358507156372, |
| "learning_rate": 0.001, |
| "loss": 2.6293, |
| "num_input_tokens_seen": 10105646656, |
| "step": 38550 |
| }, |
| { |
| "epoch": 0.18412297124867333, |
| "grad_norm": 0.20164869725704193, |
| "learning_rate": 0.001, |
| "loss": 2.6372, |
| "num_input_tokens_seen": 10118753856, |
| "step": 38600 |
| }, |
| { |
| "epoch": 0.18436147250676746, |
| "grad_norm": 0.20125731825828552, |
| "learning_rate": 0.001, |
| "loss": 2.6326, |
| "num_input_tokens_seen": 10131861056, |
| "step": 38650 |
| }, |
| { |
| "epoch": 0.18459997376486162, |
| "grad_norm": 0.21193954348564148, |
| "learning_rate": 0.001, |
| "loss": 2.6249, |
| "num_input_tokens_seen": 10144968256, |
| "step": 38700 |
| }, |
| { |
| "epoch": 0.18483847502295575, |
| "grad_norm": 0.1925983726978302, |
| "learning_rate": 0.001, |
| "loss": 2.6424, |
| "num_input_tokens_seen": 10158075456, |
| "step": 38750 |
| }, |
| { |
| "epoch": 0.18507697628104988, |
| "grad_norm": 0.19814860820770264, |
| "learning_rate": 0.001, |
| "loss": 2.6431, |
| "num_input_tokens_seen": 10171182656, |
| "step": 38800 |
| }, |
| { |
| "epoch": 0.185315477539144, |
| "grad_norm": 0.1909031718969345, |
| "learning_rate": 0.001, |
| "loss": 2.6068, |
| "num_input_tokens_seen": 10184289856, |
| "step": 38850 |
| }, |
| { |
| "epoch": 0.18555397879723814, |
| "grad_norm": 0.20779775083065033, |
| "learning_rate": 0.001, |
| "loss": 2.625, |
| "num_input_tokens_seen": 10197397056, |
| "step": 38900 |
| }, |
| { |
| "epoch": 0.1857924800553323, |
| "grad_norm": 0.1768522411584854, |
| "learning_rate": 0.001, |
| "loss": 2.6112, |
| "num_input_tokens_seen": 10210504256, |
| "step": 38950 |
| }, |
| { |
| "epoch": 0.18603098131342644, |
| "grad_norm": 0.20275786519050598, |
| "learning_rate": 0.001, |
| "loss": 2.6284, |
| "num_input_tokens_seen": 10223611456, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.18603098131342644, |
| "eval_loss": 2.5149083137512207, |
| "eval_runtime": 51.6703, |
| "eval_samples_per_second": 96.767, |
| "eval_steps_per_second": 24.192, |
| "num_input_tokens_seen": 10223611456, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.18626948257152057, |
| "grad_norm": 0.19634057581424713, |
| "learning_rate": 0.001, |
| "loss": 2.6342, |
| "num_input_tokens_seen": 10236718656, |
| "step": 39050 |
| }, |
| { |
| "epoch": 0.1865079838296147, |
| "grad_norm": 0.19488537311553955, |
| "learning_rate": 0.001, |
| "loss": 2.6305, |
| "num_input_tokens_seen": 10249825856, |
| "step": 39100 |
| }, |
| { |
| "epoch": 0.18674648508770883, |
| "grad_norm": 0.2082369476556778, |
| "learning_rate": 0.001, |
| "loss": 2.6067, |
| "num_input_tokens_seen": 10262933056, |
| "step": 39150 |
| }, |
| { |
| "epoch": 0.18698498634580296, |
| "grad_norm": 0.21019776165485382, |
| "learning_rate": 0.001, |
| "loss": 2.628, |
| "num_input_tokens_seen": 10276040256, |
| "step": 39200 |
| }, |
| { |
| "epoch": 0.18722348760389712, |
| "grad_norm": 0.19929739832878113, |
| "learning_rate": 0.001, |
| "loss": 2.6256, |
| "num_input_tokens_seen": 10289147456, |
| "step": 39250 |
| }, |
| { |
| "epoch": 0.18746198886199125, |
| "grad_norm": 0.204230397939682, |
| "learning_rate": 0.001, |
| "loss": 2.6113, |
| "num_input_tokens_seen": 10302254656, |
| "step": 39300 |
| }, |
| { |
| "epoch": 0.18770049012008538, |
| "grad_norm": 0.2217213660478592, |
| "learning_rate": 0.001, |
| "loss": 2.6253, |
| "num_input_tokens_seen": 10315361856, |
| "step": 39350 |
| }, |
| { |
| "epoch": 0.18793899137817952, |
| "grad_norm": 0.19329366087913513, |
| "learning_rate": 0.001, |
| "loss": 2.6317, |
| "num_input_tokens_seen": 10328469056, |
| "step": 39400 |
| }, |
| { |
| "epoch": 0.18817749263627365, |
| "grad_norm": 0.18244336545467377, |
| "learning_rate": 0.001, |
| "loss": 2.6476, |
| "num_input_tokens_seen": 10341576256, |
| "step": 39450 |
| }, |
| { |
| "epoch": 0.1884159938943678, |
| "grad_norm": 0.1864692121744156, |
| "learning_rate": 0.001, |
| "loss": 2.642, |
| "num_input_tokens_seen": 10354683456, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.1884159938943678, |
| "eval_loss": 2.514141321182251, |
| "eval_runtime": 51.1111, |
| "eval_samples_per_second": 97.826, |
| "eval_steps_per_second": 24.457, |
| "num_input_tokens_seen": 10354683456, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.18865449515246194, |
| "grad_norm": 0.25003623962402344, |
| "learning_rate": 0.001, |
| "loss": 2.6299, |
| "num_input_tokens_seen": 10367790656, |
| "step": 39550 |
| }, |
| { |
| "epoch": 0.18889299641055607, |
| "grad_norm": 0.19642098248004913, |
| "learning_rate": 0.001, |
| "loss": 2.6412, |
| "num_input_tokens_seen": 10380897856, |
| "step": 39600 |
| }, |
| { |
| "epoch": 0.1891314976686502, |
| "grad_norm": 0.21947956085205078, |
| "learning_rate": 0.001, |
| "loss": 2.6211, |
| "num_input_tokens_seen": 10394005056, |
| "step": 39650 |
| }, |
| { |
| "epoch": 0.18936999892674433, |
| "grad_norm": 0.19838476181030273, |
| "learning_rate": 0.001, |
| "loss": 2.6451, |
| "num_input_tokens_seen": 10407112256, |
| "step": 39700 |
| }, |
| { |
| "epoch": 0.18960850018483846, |
| "grad_norm": 0.21131113171577454, |
| "learning_rate": 0.001, |
| "loss": 2.6375, |
| "num_input_tokens_seen": 10420219456, |
| "step": 39750 |
| }, |
| { |
| "epoch": 0.18984700144293262, |
| "grad_norm": 0.17576864361763, |
| "learning_rate": 0.001, |
| "loss": 2.6325, |
| "num_input_tokens_seen": 10433326656, |
| "step": 39800 |
| }, |
| { |
| "epoch": 0.19008550270102675, |
| "grad_norm": 0.2113037258386612, |
| "learning_rate": 0.001, |
| "loss": 2.6254, |
| "num_input_tokens_seen": 10446433856, |
| "step": 39850 |
| }, |
| { |
| "epoch": 0.19032400395912089, |
| "grad_norm": 0.1972583681344986, |
| "learning_rate": 0.001, |
| "loss": 2.6277, |
| "num_input_tokens_seen": 10459541056, |
| "step": 39900 |
| }, |
| { |
| "epoch": 0.19056250521721502, |
| "grad_norm": 0.43353378772735596, |
| "learning_rate": 0.001, |
| "loss": 2.6295, |
| "num_input_tokens_seen": 10472648256, |
| "step": 39950 |
| }, |
| { |
| "epoch": 0.19080100647530915, |
| "grad_norm": 0.22195081412792206, |
| "learning_rate": 0.001, |
| "loss": 2.6422, |
| "num_input_tokens_seen": 10485755456, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.19080100647530915, |
| "eval_loss": 2.5144667625427246, |
| "eval_runtime": 51.0006, |
| "eval_samples_per_second": 98.038, |
| "eval_steps_per_second": 24.51, |
| "num_input_tokens_seen": 10485755456, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.19103950773340328, |
| "grad_norm": 0.18717694282531738, |
| "learning_rate": 0.001, |
| "loss": 2.6512, |
| "num_input_tokens_seen": 10498862656, |
| "step": 40050 |
| }, |
| { |
| "epoch": 0.19127800899149744, |
| "grad_norm": 0.2009858638048172, |
| "learning_rate": 0.001, |
| "loss": 2.6289, |
| "num_input_tokens_seen": 10511969856, |
| "step": 40100 |
| }, |
| { |
| "epoch": 0.19151651024959157, |
| "grad_norm": 0.2515949010848999, |
| "learning_rate": 0.001, |
| "loss": 2.6342, |
| "num_input_tokens_seen": 10525077056, |
| "step": 40150 |
| }, |
| { |
| "epoch": 0.1917550115076857, |
| "grad_norm": 0.19864948093891144, |
| "learning_rate": 0.001, |
| "loss": 2.6191, |
| "num_input_tokens_seen": 10538184256, |
| "step": 40200 |
| }, |
| { |
| "epoch": 0.19199351276577983, |
| "grad_norm": 0.17704185843467712, |
| "learning_rate": 0.001, |
| "loss": 2.6176, |
| "num_input_tokens_seen": 10551291456, |
| "step": 40250 |
| }, |
| { |
| "epoch": 0.19223201402387396, |
| "grad_norm": 0.2097242772579193, |
| "learning_rate": 0.001, |
| "loss": 2.6509, |
| "num_input_tokens_seen": 10564398656, |
| "step": 40300 |
| }, |
| { |
| "epoch": 0.19247051528196812, |
| "grad_norm": 0.18630579113960266, |
| "learning_rate": 0.001, |
| "loss": 2.6273, |
| "num_input_tokens_seen": 10577505856, |
| "step": 40350 |
| }, |
| { |
| "epoch": 0.19270901654006226, |
| "grad_norm": 0.24162743985652924, |
| "learning_rate": 0.001, |
| "loss": 2.6405, |
| "num_input_tokens_seen": 10590613056, |
| "step": 40400 |
| }, |
| { |
| "epoch": 0.1929475177981564, |
| "grad_norm": 0.19576874375343323, |
| "learning_rate": 0.001, |
| "loss": 2.6403, |
| "num_input_tokens_seen": 10603720256, |
| "step": 40450 |
| }, |
| { |
| "epoch": 0.19318601905625052, |
| "grad_norm": 0.18408045172691345, |
| "learning_rate": 0.001, |
| "loss": 2.6149, |
| "num_input_tokens_seen": 10616827456, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.19318601905625052, |
| "eval_loss": 2.511899709701538, |
| "eval_runtime": 51.5326, |
| "eval_samples_per_second": 97.026, |
| "eval_steps_per_second": 24.257, |
| "num_input_tokens_seen": 10616827456, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.19342452031434465, |
| "grad_norm": 0.20845313370227814, |
| "learning_rate": 0.001, |
| "loss": 2.6242, |
| "num_input_tokens_seen": 10629934656, |
| "step": 40550 |
| }, |
| { |
| "epoch": 0.19366302157243878, |
| "grad_norm": 0.20603816211223602, |
| "learning_rate": 0.001, |
| "loss": 2.6305, |
| "num_input_tokens_seen": 10643041856, |
| "step": 40600 |
| }, |
| { |
| "epoch": 0.19390152283053294, |
| "grad_norm": 0.2180013507604599, |
| "learning_rate": 0.001, |
| "loss": 2.6271, |
| "num_input_tokens_seen": 10656149056, |
| "step": 40650 |
| }, |
| { |
| "epoch": 0.19414002408862707, |
| "grad_norm": 0.22217005491256714, |
| "learning_rate": 0.001, |
| "loss": 2.6407, |
| "num_input_tokens_seen": 10669256256, |
| "step": 40700 |
| }, |
| { |
| "epoch": 0.1943785253467212, |
| "grad_norm": 0.21379347145557404, |
| "learning_rate": 0.001, |
| "loss": 2.6209, |
| "num_input_tokens_seen": 10682363456, |
| "step": 40750 |
| }, |
| { |
| "epoch": 0.19461702660481534, |
| "grad_norm": 0.2011626958847046, |
| "learning_rate": 0.001, |
| "loss": 2.6471, |
| "num_input_tokens_seen": 10695470656, |
| "step": 40800 |
| }, |
| { |
| "epoch": 0.19485552786290947, |
| "grad_norm": 0.1946493685245514, |
| "learning_rate": 0.001, |
| "loss": 2.6267, |
| "num_input_tokens_seen": 10708577856, |
| "step": 40850 |
| }, |
| { |
| "epoch": 0.19509402912100363, |
| "grad_norm": 0.19157454371452332, |
| "learning_rate": 0.001, |
| "loss": 2.6362, |
| "num_input_tokens_seen": 10721685056, |
| "step": 40900 |
| }, |
| { |
| "epoch": 0.19533253037909776, |
| "grad_norm": 0.1978122442960739, |
| "learning_rate": 0.001, |
| "loss": 2.6448, |
| "num_input_tokens_seen": 10734792256, |
| "step": 40950 |
| }, |
| { |
| "epoch": 0.1955710316371919, |
| "grad_norm": 0.19996555149555206, |
| "learning_rate": 0.001, |
| "loss": 2.626, |
| "num_input_tokens_seen": 10747899456, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.1955710316371919, |
| "eval_loss": 2.5084941387176514, |
| "eval_runtime": 51.6987, |
| "eval_samples_per_second": 96.714, |
| "eval_steps_per_second": 24.179, |
| "num_input_tokens_seen": 10747899456, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.19580953289528602, |
| "grad_norm": 0.20298945903778076, |
| "learning_rate": 0.001, |
| "loss": 2.6233, |
| "num_input_tokens_seen": 10761006656, |
| "step": 41050 |
| }, |
| { |
| "epoch": 0.19604803415338015, |
| "grad_norm": 0.2280716896057129, |
| "learning_rate": 0.001, |
| "loss": 2.6427, |
| "num_input_tokens_seen": 10774113856, |
| "step": 41100 |
| }, |
| { |
| "epoch": 0.19628653541147428, |
| "grad_norm": 0.19223643839359283, |
| "learning_rate": 0.001, |
| "loss": 2.6263, |
| "num_input_tokens_seen": 10787221056, |
| "step": 41150 |
| }, |
| { |
| "epoch": 0.19652503666956844, |
| "grad_norm": 0.19221842288970947, |
| "learning_rate": 0.001, |
| "loss": 2.6401, |
| "num_input_tokens_seen": 10800328256, |
| "step": 41200 |
| }, |
| { |
| "epoch": 0.19676353792766257, |
| "grad_norm": 0.19479979574680328, |
| "learning_rate": 0.001, |
| "loss": 2.6269, |
| "num_input_tokens_seen": 10813435456, |
| "step": 41250 |
| }, |
| { |
| "epoch": 0.1970020391857567, |
| "grad_norm": 0.24501195549964905, |
| "learning_rate": 0.001, |
| "loss": 2.618, |
| "num_input_tokens_seen": 10826542656, |
| "step": 41300 |
| }, |
| { |
| "epoch": 0.19724054044385084, |
| "grad_norm": 0.1994044929742813, |
| "learning_rate": 0.001, |
| "loss": 2.64, |
| "num_input_tokens_seen": 10839649856, |
| "step": 41350 |
| }, |
| { |
| "epoch": 0.19747904170194497, |
| "grad_norm": 0.20831650495529175, |
| "learning_rate": 0.001, |
| "loss": 2.6513, |
| "num_input_tokens_seen": 10852757056, |
| "step": 41400 |
| }, |
| { |
| "epoch": 0.19771754296003913, |
| "grad_norm": 0.21919438242912292, |
| "learning_rate": 0.001, |
| "loss": 2.6379, |
| "num_input_tokens_seen": 10865864256, |
| "step": 41450 |
| }, |
| { |
| "epoch": 0.19795604421813326, |
| "grad_norm": 0.23088768124580383, |
| "learning_rate": 0.001, |
| "loss": 2.6449, |
| "num_input_tokens_seen": 10878971456, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.19795604421813326, |
| "eval_loss": 2.5156567096710205, |
| "eval_runtime": 51.6776, |
| "eval_samples_per_second": 96.754, |
| "eval_steps_per_second": 24.188, |
| "num_input_tokens_seen": 10878971456, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.1981945454762274, |
| "grad_norm": 0.1982518881559372, |
| "learning_rate": 0.001, |
| "loss": 2.6304, |
| "num_input_tokens_seen": 10892078656, |
| "step": 41550 |
| }, |
| { |
| "epoch": 0.19843304673432152, |
| "grad_norm": 0.2099853903055191, |
| "learning_rate": 0.001, |
| "loss": 2.6305, |
| "num_input_tokens_seen": 10905185856, |
| "step": 41600 |
| }, |
| { |
| "epoch": 0.19867154799241565, |
| "grad_norm": 0.19403131306171417, |
| "learning_rate": 0.001, |
| "loss": 2.6419, |
| "num_input_tokens_seen": 10918293056, |
| "step": 41650 |
| }, |
| { |
| "epoch": 0.19891004925050979, |
| "grad_norm": 0.20865993201732635, |
| "learning_rate": 0.001, |
| "loss": 2.6116, |
| "num_input_tokens_seen": 10931400256, |
| "step": 41700 |
| }, |
| { |
| "epoch": 0.19914855050860394, |
| "grad_norm": 0.19042626023292542, |
| "learning_rate": 0.001, |
| "loss": 2.6271, |
| "num_input_tokens_seen": 10944507456, |
| "step": 41750 |
| }, |
| { |
| "epoch": 0.19938705176669808, |
| "grad_norm": 0.20514579117298126, |
| "learning_rate": 0.001, |
| "loss": 2.6348, |
| "num_input_tokens_seen": 10957614656, |
| "step": 41800 |
| }, |
| { |
| "epoch": 0.1996255530247922, |
| "grad_norm": 0.21224668622016907, |
| "learning_rate": 0.001, |
| "loss": 2.6314, |
| "num_input_tokens_seen": 10970721856, |
| "step": 41850 |
| }, |
| { |
| "epoch": 0.19986405428288634, |
| "grad_norm": 0.18857082724571228, |
| "learning_rate": 0.001, |
| "loss": 2.6217, |
| "num_input_tokens_seen": 10983829056, |
| "step": 41900 |
| }, |
| { |
| "epoch": 0.20010255554098047, |
| "grad_norm": 0.18431074917316437, |
| "learning_rate": 0.001, |
| "loss": 2.6267, |
| "num_input_tokens_seen": 10996936256, |
| "step": 41950 |
| }, |
| { |
| "epoch": 0.2003410567990746, |
| "grad_norm": 0.20570099353790283, |
| "learning_rate": 0.001, |
| "loss": 2.6016, |
| "num_input_tokens_seen": 11010043456, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.2003410567990746, |
| "eval_loss": 2.506241798400879, |
| "eval_runtime": 51.5548, |
| "eval_samples_per_second": 96.984, |
| "eval_steps_per_second": 24.246, |
| "num_input_tokens_seen": 11010043456, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.20057955805716876, |
| "grad_norm": 0.17952106893062592, |
| "learning_rate": 0.001, |
| "loss": 2.6165, |
| "num_input_tokens_seen": 11023150656, |
| "step": 42050 |
| }, |
| { |
| "epoch": 0.2008180593152629, |
| "grad_norm": 0.20292694866657257, |
| "learning_rate": 0.001, |
| "loss": 2.6357, |
| "num_input_tokens_seen": 11036257856, |
| "step": 42100 |
| }, |
| { |
| "epoch": 0.20105656057335702, |
| "grad_norm": 0.19588933885097504, |
| "learning_rate": 0.001, |
| "loss": 2.6102, |
| "num_input_tokens_seen": 11049365056, |
| "step": 42150 |
| }, |
| { |
| "epoch": 0.20129506183145116, |
| "grad_norm": 0.1982785314321518, |
| "learning_rate": 0.001, |
| "loss": 2.6019, |
| "num_input_tokens_seen": 11062472256, |
| "step": 42200 |
| }, |
| { |
| "epoch": 0.2015335630895453, |
| "grad_norm": 0.18049876391887665, |
| "learning_rate": 0.001, |
| "loss": 2.6081, |
| "num_input_tokens_seen": 11075579456, |
| "step": 42250 |
| }, |
| { |
| "epoch": 0.20177206434763945, |
| "grad_norm": 0.2069908082485199, |
| "learning_rate": 0.001, |
| "loss": 2.6173, |
| "num_input_tokens_seen": 11088686656, |
| "step": 42300 |
| }, |
| { |
| "epoch": 0.20201056560573358, |
| "grad_norm": 0.2415982335805893, |
| "learning_rate": 0.001, |
| "loss": 2.6173, |
| "num_input_tokens_seen": 11101793856, |
| "step": 42350 |
| }, |
| { |
| "epoch": 0.2022490668638277, |
| "grad_norm": 0.20267252624034882, |
| "learning_rate": 0.001, |
| "loss": 2.6299, |
| "num_input_tokens_seen": 11114901056, |
| "step": 42400 |
| }, |
| { |
| "epoch": 0.20248756812192184, |
| "grad_norm": 0.20683065056800842, |
| "learning_rate": 0.001, |
| "loss": 2.6282, |
| "num_input_tokens_seen": 11128008256, |
| "step": 42450 |
| }, |
| { |
| "epoch": 0.20272606938001597, |
| "grad_norm": 0.22137881815433502, |
| "learning_rate": 0.001, |
| "loss": 2.6271, |
| "num_input_tokens_seen": 11141115456, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.20272606938001597, |
| "eval_loss": 2.5125572681427, |
| "eval_runtime": 51.794, |
| "eval_samples_per_second": 96.536, |
| "eval_steps_per_second": 24.134, |
| "num_input_tokens_seen": 11141115456, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.2029645706381101, |
| "grad_norm": 0.20610037446022034, |
| "learning_rate": 0.001, |
| "loss": 2.6255, |
| "num_input_tokens_seen": 11154222656, |
| "step": 42550 |
| }, |
| { |
| "epoch": 0.20320307189620426, |
| "grad_norm": 0.21218810975551605, |
| "learning_rate": 0.001, |
| "loss": 2.6149, |
| "num_input_tokens_seen": 11167329856, |
| "step": 42600 |
| }, |
| { |
| "epoch": 0.2034415731542984, |
| "grad_norm": 0.19685466587543488, |
| "learning_rate": 0.001, |
| "loss": 2.6208, |
| "num_input_tokens_seen": 11180437056, |
| "step": 42650 |
| }, |
| { |
| "epoch": 0.20368007441239253, |
| "grad_norm": 0.20507460832595825, |
| "learning_rate": 0.001, |
| "loss": 2.6227, |
| "num_input_tokens_seen": 11193544256, |
| "step": 42700 |
| }, |
| { |
| "epoch": 0.20391857567048666, |
| "grad_norm": 0.20014505088329315, |
| "learning_rate": 0.001, |
| "loss": 2.6238, |
| "num_input_tokens_seen": 11206651456, |
| "step": 42750 |
| }, |
| { |
| "epoch": 0.2041570769285808, |
| "grad_norm": 0.1907282918691635, |
| "learning_rate": 0.001, |
| "loss": 2.6157, |
| "num_input_tokens_seen": 11219758656, |
| "step": 42800 |
| }, |
| { |
| "epoch": 0.20439557818667495, |
| "grad_norm": 0.18553833663463593, |
| "learning_rate": 0.001, |
| "loss": 2.6123, |
| "num_input_tokens_seen": 11232865856, |
| "step": 42850 |
| }, |
| { |
| "epoch": 0.20463407944476908, |
| "grad_norm": 0.20382866263389587, |
| "learning_rate": 0.001, |
| "loss": 2.6163, |
| "num_input_tokens_seen": 11245973056, |
| "step": 42900 |
| }, |
| { |
| "epoch": 0.2048725807028632, |
| "grad_norm": 0.18923860788345337, |
| "learning_rate": 0.001, |
| "loss": 2.5981, |
| "num_input_tokens_seen": 11259080256, |
| "step": 42950 |
| }, |
| { |
| "epoch": 0.20511108196095734, |
| "grad_norm": 0.19230851531028748, |
| "learning_rate": 0.001, |
| "loss": 2.618, |
| "num_input_tokens_seen": 11272187456, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.20511108196095734, |
| "eval_loss": 2.5047237873077393, |
| "eval_runtime": 51.2959, |
| "eval_samples_per_second": 97.474, |
| "eval_steps_per_second": 24.368, |
| "num_input_tokens_seen": 11272187456, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.20534958321905147, |
| "grad_norm": 0.22746357321739197, |
| "learning_rate": 0.001, |
| "loss": 2.6281, |
| "num_input_tokens_seen": 11285294656, |
| "step": 43050 |
| }, |
| { |
| "epoch": 0.2055880844771456, |
| "grad_norm": 0.21107150614261627, |
| "learning_rate": 0.001, |
| "loss": 2.6154, |
| "num_input_tokens_seen": 11298401856, |
| "step": 43100 |
| }, |
| { |
| "epoch": 0.20582658573523976, |
| "grad_norm": 0.18025045096874237, |
| "learning_rate": 0.001, |
| "loss": 2.6141, |
| "num_input_tokens_seen": 11311509056, |
| "step": 43150 |
| }, |
| { |
| "epoch": 0.2060650869933339, |
| "grad_norm": 0.2009642869234085, |
| "learning_rate": 0.001, |
| "loss": 2.6133, |
| "num_input_tokens_seen": 11324616256, |
| "step": 43200 |
| }, |
| { |
| "epoch": 0.20630358825142803, |
| "grad_norm": 0.1872788518667221, |
| "learning_rate": 0.001, |
| "loss": 2.6197, |
| "num_input_tokens_seen": 11337723456, |
| "step": 43250 |
| }, |
| { |
| "epoch": 0.20654208950952216, |
| "grad_norm": 0.216310054063797, |
| "learning_rate": 0.001, |
| "loss": 2.6353, |
| "num_input_tokens_seen": 11350830656, |
| "step": 43300 |
| }, |
| { |
| "epoch": 0.2067805907676163, |
| "grad_norm": 0.2705513536930084, |
| "learning_rate": 0.001, |
| "loss": 2.6333, |
| "num_input_tokens_seen": 11363937856, |
| "step": 43350 |
| }, |
| { |
| "epoch": 0.20701909202571045, |
| "grad_norm": 0.3040550649166107, |
| "learning_rate": 0.001, |
| "loss": 2.6094, |
| "num_input_tokens_seen": 11377045056, |
| "step": 43400 |
| }, |
| { |
| "epoch": 0.20725759328380458, |
| "grad_norm": 0.2075599879026413, |
| "learning_rate": 0.001, |
| "loss": 2.6225, |
| "num_input_tokens_seen": 11390152256, |
| "step": 43450 |
| }, |
| { |
| "epoch": 0.2074960945418987, |
| "grad_norm": 0.22293590009212494, |
| "learning_rate": 0.001, |
| "loss": 2.6271, |
| "num_input_tokens_seen": 11403259456, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.2074960945418987, |
| "eval_loss": 2.5097975730895996, |
| "eval_runtime": 51.7037, |
| "eval_samples_per_second": 96.705, |
| "eval_steps_per_second": 24.176, |
| "num_input_tokens_seen": 11403259456, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.20773459579999284, |
| "grad_norm": 0.21221335232257843, |
| "learning_rate": 0.001, |
| "loss": 2.618, |
| "num_input_tokens_seen": 11416366656, |
| "step": 43550 |
| }, |
| { |
| "epoch": 0.20797309705808698, |
| "grad_norm": 0.19894948601722717, |
| "learning_rate": 0.001, |
| "loss": 2.6305, |
| "num_input_tokens_seen": 11429473856, |
| "step": 43600 |
| }, |
| { |
| "epoch": 0.2082115983161811, |
| "grad_norm": 0.29371336102485657, |
| "learning_rate": 0.001, |
| "loss": 2.6211, |
| "num_input_tokens_seen": 11442581056, |
| "step": 43650 |
| }, |
| { |
| "epoch": 0.20845009957427527, |
| "grad_norm": 0.19441936910152435, |
| "learning_rate": 0.001, |
| "loss": 2.6355, |
| "num_input_tokens_seen": 11455688256, |
| "step": 43700 |
| }, |
| { |
| "epoch": 0.2086886008323694, |
| "grad_norm": 0.19868114590644836, |
| "learning_rate": 0.001, |
| "loss": 2.6206, |
| "num_input_tokens_seen": 11468795456, |
| "step": 43750 |
| }, |
| { |
| "epoch": 0.20892710209046353, |
| "grad_norm": 0.19971340894699097, |
| "learning_rate": 0.001, |
| "loss": 2.6124, |
| "num_input_tokens_seen": 11481902656, |
| "step": 43800 |
| }, |
| { |
| "epoch": 0.20916560334855766, |
| "grad_norm": 0.22261051833629608, |
| "learning_rate": 0.001, |
| "loss": 2.623, |
| "num_input_tokens_seen": 11495009856, |
| "step": 43850 |
| }, |
| { |
| "epoch": 0.2094041046066518, |
| "grad_norm": 0.20982281863689423, |
| "learning_rate": 0.001, |
| "loss": 2.6182, |
| "num_input_tokens_seen": 11508117056, |
| "step": 43900 |
| }, |
| { |
| "epoch": 0.20964260586474592, |
| "grad_norm": 0.2216535359621048, |
| "learning_rate": 0.001, |
| "loss": 2.6086, |
| "num_input_tokens_seen": 11521224256, |
| "step": 43950 |
| }, |
| { |
| "epoch": 0.20988110712284008, |
| "grad_norm": 0.19298988580703735, |
| "learning_rate": 0.001, |
| "loss": 2.6364, |
| "num_input_tokens_seen": 11534331456, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.20988110712284008, |
| "eval_loss": 2.5009121894836426, |
| "eval_runtime": 51.4356, |
| "eval_samples_per_second": 97.209, |
| "eval_steps_per_second": 24.302, |
| "num_input_tokens_seen": 11534331456, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.21011960838093421, |
| "grad_norm": 0.19737008213996887, |
| "learning_rate": 0.001, |
| "loss": 2.6272, |
| "num_input_tokens_seen": 11547438656, |
| "step": 44050 |
| }, |
| { |
| "epoch": 0.21035810963902835, |
| "grad_norm": 0.1984977424144745, |
| "learning_rate": 0.001, |
| "loss": 2.6417, |
| "num_input_tokens_seen": 11560545856, |
| "step": 44100 |
| }, |
| { |
| "epoch": 0.21059661089712248, |
| "grad_norm": 0.19575904309749603, |
| "learning_rate": 0.001, |
| "loss": 2.6277, |
| "num_input_tokens_seen": 11573653056, |
| "step": 44150 |
| }, |
| { |
| "epoch": 0.2108351121552166, |
| "grad_norm": 0.19875651597976685, |
| "learning_rate": 0.001, |
| "loss": 2.6362, |
| "num_input_tokens_seen": 11586760256, |
| "step": 44200 |
| }, |
| { |
| "epoch": 0.21107361341331077, |
| "grad_norm": 0.20936185121536255, |
| "learning_rate": 0.001, |
| "loss": 2.6217, |
| "num_input_tokens_seen": 11599867456, |
| "step": 44250 |
| }, |
| { |
| "epoch": 0.2113121146714049, |
| "grad_norm": 0.19474463164806366, |
| "learning_rate": 0.001, |
| "loss": 2.6235, |
| "num_input_tokens_seen": 11612974656, |
| "step": 44300 |
| }, |
| { |
| "epoch": 0.21155061592949903, |
| "grad_norm": 0.20833207666873932, |
| "learning_rate": 0.001, |
| "loss": 2.6, |
| "num_input_tokens_seen": 11626081856, |
| "step": 44350 |
| }, |
| { |
| "epoch": 0.21178911718759316, |
| "grad_norm": 0.19269512593746185, |
| "learning_rate": 0.001, |
| "loss": 2.6211, |
| "num_input_tokens_seen": 11639189056, |
| "step": 44400 |
| }, |
| { |
| "epoch": 0.2120276184456873, |
| "grad_norm": 0.21018226444721222, |
| "learning_rate": 0.001, |
| "loss": 2.6294, |
| "num_input_tokens_seen": 11652296256, |
| "step": 44450 |
| }, |
| { |
| "epoch": 0.21226611970378143, |
| "grad_norm": 0.19836543500423431, |
| "learning_rate": 0.001, |
| "loss": 2.6051, |
| "num_input_tokens_seen": 11665403456, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.21226611970378143, |
| "eval_loss": 2.499817132949829, |
| "eval_runtime": 50.9003, |
| "eval_samples_per_second": 98.231, |
| "eval_steps_per_second": 24.558, |
| "num_input_tokens_seen": 11665403456, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.21250462096187558, |
| "grad_norm": 0.18411967158317566, |
| "learning_rate": 0.001, |
| "loss": 2.6228, |
| "num_input_tokens_seen": 11678510656, |
| "step": 44550 |
| }, |
| { |
| "epoch": 0.21274312221996972, |
| "grad_norm": 0.19387467205524445, |
| "learning_rate": 0.001, |
| "loss": 2.5902, |
| "num_input_tokens_seen": 11691617856, |
| "step": 44600 |
| }, |
| { |
| "epoch": 0.21298162347806385, |
| "grad_norm": 0.22076952457427979, |
| "learning_rate": 0.001, |
| "loss": 2.613, |
| "num_input_tokens_seen": 11704725056, |
| "step": 44650 |
| }, |
| { |
| "epoch": 0.21322012473615798, |
| "grad_norm": 0.33861082792282104, |
| "learning_rate": 0.001, |
| "loss": 2.6142, |
| "num_input_tokens_seen": 11717832256, |
| "step": 44700 |
| }, |
| { |
| "epoch": 0.2134586259942521, |
| "grad_norm": 0.20097902417182922, |
| "learning_rate": 0.001, |
| "loss": 2.6549, |
| "num_input_tokens_seen": 11730939456, |
| "step": 44750 |
| }, |
| { |
| "epoch": 0.21369712725234627, |
| "grad_norm": 0.24534635245800018, |
| "learning_rate": 0.001, |
| "loss": 2.6293, |
| "num_input_tokens_seen": 11744046656, |
| "step": 44800 |
| }, |
| { |
| "epoch": 0.2139356285104404, |
| "grad_norm": 0.2439020723104477, |
| "learning_rate": 0.001, |
| "loss": 2.635, |
| "num_input_tokens_seen": 11757153856, |
| "step": 44850 |
| }, |
| { |
| "epoch": 0.21417412976853453, |
| "grad_norm": 0.24259154498577118, |
| "learning_rate": 0.001, |
| "loss": 2.6232, |
| "num_input_tokens_seen": 11770261056, |
| "step": 44900 |
| }, |
| { |
| "epoch": 0.21441263102662866, |
| "grad_norm": 0.23554636538028717, |
| "learning_rate": 0.001, |
| "loss": 2.6061, |
| "num_input_tokens_seen": 11783368256, |
| "step": 44950 |
| }, |
| { |
| "epoch": 0.2146511322847228, |
| "grad_norm": 0.20377275347709656, |
| "learning_rate": 0.001, |
| "loss": 2.6156, |
| "num_input_tokens_seen": 11796475456, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.2146511322847228, |
| "eval_loss": 2.503781318664551, |
| "eval_runtime": 51.1656, |
| "eval_samples_per_second": 97.722, |
| "eval_steps_per_second": 24.43, |
| "num_input_tokens_seen": 11796475456, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.21488963354281693, |
| "grad_norm": 0.226406991481781, |
| "learning_rate": 0.001, |
| "loss": 2.626, |
| "num_input_tokens_seen": 11809582656, |
| "step": 45050 |
| }, |
| { |
| "epoch": 0.21512813480091109, |
| "grad_norm": 0.20505741238594055, |
| "learning_rate": 0.001, |
| "loss": 2.6095, |
| "num_input_tokens_seen": 11822689856, |
| "step": 45100 |
| }, |
| { |
| "epoch": 0.21536663605900522, |
| "grad_norm": 0.2917146682739258, |
| "learning_rate": 0.001, |
| "loss": 2.6439, |
| "num_input_tokens_seen": 11835797056, |
| "step": 45150 |
| }, |
| { |
| "epoch": 0.21560513731709935, |
| "grad_norm": 0.24030283093452454, |
| "learning_rate": 0.001, |
| "loss": 2.6386, |
| "num_input_tokens_seen": 11848904256, |
| "step": 45200 |
| }, |
| { |
| "epoch": 0.21584363857519348, |
| "grad_norm": 0.1799454241991043, |
| "learning_rate": 0.001, |
| "loss": 2.6344, |
| "num_input_tokens_seen": 11862011456, |
| "step": 45250 |
| }, |
| { |
| "epoch": 0.2160821398332876, |
| "grad_norm": 0.2093718945980072, |
| "learning_rate": 0.001, |
| "loss": 2.6152, |
| "num_input_tokens_seen": 11875118656, |
| "step": 45300 |
| }, |
| { |
| "epoch": 0.21632064109138174, |
| "grad_norm": 0.19477079808712006, |
| "learning_rate": 0.001, |
| "loss": 2.622, |
| "num_input_tokens_seen": 11888225856, |
| "step": 45350 |
| }, |
| { |
| "epoch": 0.2165591423494759, |
| "grad_norm": 0.2764741778373718, |
| "learning_rate": 0.001, |
| "loss": 2.5951, |
| "num_input_tokens_seen": 11901333056, |
| "step": 45400 |
| }, |
| { |
| "epoch": 0.21679764360757003, |
| "grad_norm": 0.2127208709716797, |
| "learning_rate": 0.001, |
| "loss": 2.6231, |
| "num_input_tokens_seen": 11914440256, |
| "step": 45450 |
| }, |
| { |
| "epoch": 0.21703614486566417, |
| "grad_norm": 0.21089383959770203, |
| "learning_rate": 0.001, |
| "loss": 2.6099, |
| "num_input_tokens_seen": 11927547456, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.21703614486566417, |
| "eval_loss": 2.502464771270752, |
| "eval_runtime": 50.946, |
| "eval_samples_per_second": 98.143, |
| "eval_steps_per_second": 24.536, |
| "num_input_tokens_seen": 11927547456, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.2172746461237583, |
| "grad_norm": 0.19550016522407532, |
| "learning_rate": 0.001, |
| "loss": 2.6365, |
| "num_input_tokens_seen": 11940654656, |
| "step": 45550 |
| }, |
| { |
| "epoch": 0.21751314738185243, |
| "grad_norm": 0.18284358084201813, |
| "learning_rate": 0.001, |
| "loss": 2.6358, |
| "num_input_tokens_seen": 11953761856, |
| "step": 45600 |
| }, |
| { |
| "epoch": 0.2177516486399466, |
| "grad_norm": 0.21821847558021545, |
| "learning_rate": 0.001, |
| "loss": 2.607, |
| "num_input_tokens_seen": 11966869056, |
| "step": 45650 |
| }, |
| { |
| "epoch": 0.21799014989804072, |
| "grad_norm": 0.2195073515176773, |
| "learning_rate": 0.001, |
| "loss": 2.6195, |
| "num_input_tokens_seen": 11979976256, |
| "step": 45700 |
| }, |
| { |
| "epoch": 0.21822865115613485, |
| "grad_norm": 0.19679750502109528, |
| "learning_rate": 0.001, |
| "loss": 2.6259, |
| "num_input_tokens_seen": 11993083456, |
| "step": 45750 |
| }, |
| { |
| "epoch": 0.21846715241422898, |
| "grad_norm": 0.1985604166984558, |
| "learning_rate": 0.001, |
| "loss": 2.6224, |
| "num_input_tokens_seen": 12006190656, |
| "step": 45800 |
| }, |
| { |
| "epoch": 0.2187056536723231, |
| "grad_norm": 0.18398787081241608, |
| "learning_rate": 0.001, |
| "loss": 2.6215, |
| "num_input_tokens_seen": 12019297856, |
| "step": 45850 |
| }, |
| { |
| "epoch": 0.21894415493041725, |
| "grad_norm": 0.2306145578622818, |
| "learning_rate": 0.001, |
| "loss": 2.6346, |
| "num_input_tokens_seen": 12032405056, |
| "step": 45900 |
| }, |
| { |
| "epoch": 0.2191826561885114, |
| "grad_norm": 0.21335257589817047, |
| "learning_rate": 0.001, |
| "loss": 2.6232, |
| "num_input_tokens_seen": 12045512256, |
| "step": 45950 |
| }, |
| { |
| "epoch": 0.21942115744660554, |
| "grad_norm": 0.22988814115524292, |
| "learning_rate": 0.001, |
| "loss": 2.6132, |
| "num_input_tokens_seen": 12058619456, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.21942115744660554, |
| "eval_loss": 2.499041795730591, |
| "eval_runtime": 50.6868, |
| "eval_samples_per_second": 98.645, |
| "eval_steps_per_second": 24.661, |
| "num_input_tokens_seen": 12058619456, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.21965965870469967, |
| "grad_norm": 0.19492709636688232, |
| "learning_rate": 0.001, |
| "loss": 2.6196, |
| "num_input_tokens_seen": 12071726656, |
| "step": 46050 |
| }, |
| { |
| "epoch": 0.2198981599627938, |
| "grad_norm": 0.19643568992614746, |
| "learning_rate": 0.001, |
| "loss": 2.6108, |
| "num_input_tokens_seen": 12084833856, |
| "step": 46100 |
| }, |
| { |
| "epoch": 0.22013666122088793, |
| "grad_norm": 0.18720099329948425, |
| "learning_rate": 0.001, |
| "loss": 2.6181, |
| "num_input_tokens_seen": 12097941056, |
| "step": 46150 |
| }, |
| { |
| "epoch": 0.2203751624789821, |
| "grad_norm": 0.1929876208305359, |
| "learning_rate": 0.001, |
| "loss": 2.6152, |
| "num_input_tokens_seen": 12111048256, |
| "step": 46200 |
| }, |
| { |
| "epoch": 0.22061366373707622, |
| "grad_norm": 0.19732603430747986, |
| "learning_rate": 0.001, |
| "loss": 2.6267, |
| "num_input_tokens_seen": 12124155456, |
| "step": 46250 |
| }, |
| { |
| "epoch": 0.22085216499517035, |
| "grad_norm": 0.1964132934808731, |
| "learning_rate": 0.001, |
| "loss": 2.605, |
| "num_input_tokens_seen": 12137262656, |
| "step": 46300 |
| }, |
| { |
| "epoch": 0.22109066625326448, |
| "grad_norm": 0.1927288919687271, |
| "learning_rate": 0.001, |
| "loss": 2.6178, |
| "num_input_tokens_seen": 12150369856, |
| "step": 46350 |
| }, |
| { |
| "epoch": 0.22132916751135862, |
| "grad_norm": 0.17873398959636688, |
| "learning_rate": 0.001, |
| "loss": 2.6033, |
| "num_input_tokens_seen": 12163477056, |
| "step": 46400 |
| }, |
| { |
| "epoch": 0.22156766876945275, |
| "grad_norm": 0.24716190993785858, |
| "learning_rate": 0.001, |
| "loss": 2.6141, |
| "num_input_tokens_seen": 12176584256, |
| "step": 46450 |
| }, |
| { |
| "epoch": 0.2218061700275469, |
| "grad_norm": 0.2021339386701584, |
| "learning_rate": 0.001, |
| "loss": 2.6259, |
| "num_input_tokens_seen": 12189691456, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.2218061700275469, |
| "eval_loss": 2.4975087642669678, |
| "eval_runtime": 50.8921, |
| "eval_samples_per_second": 98.247, |
| "eval_steps_per_second": 24.562, |
| "num_input_tokens_seen": 12189691456, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.22204467128564104, |
| "grad_norm": 0.20796166360378265, |
| "learning_rate": 0.001, |
| "loss": 2.6211, |
| "num_input_tokens_seen": 12202798656, |
| "step": 46550 |
| }, |
| { |
| "epoch": 0.22228317254373517, |
| "grad_norm": 0.20472556352615356, |
| "learning_rate": 0.001, |
| "loss": 2.6123, |
| "num_input_tokens_seen": 12215905856, |
| "step": 46600 |
| }, |
| { |
| "epoch": 0.2225216738018293, |
| "grad_norm": 0.20017485320568085, |
| "learning_rate": 0.001, |
| "loss": 2.6037, |
| "num_input_tokens_seen": 12229013056, |
| "step": 46650 |
| }, |
| { |
| "epoch": 0.22276017505992343, |
| "grad_norm": 0.2037762850522995, |
| "learning_rate": 0.001, |
| "loss": 2.6155, |
| "num_input_tokens_seen": 12242120256, |
| "step": 46700 |
| }, |
| { |
| "epoch": 0.2229986763180176, |
| "grad_norm": 0.19346804916858673, |
| "learning_rate": 0.001, |
| "loss": 2.601, |
| "num_input_tokens_seen": 12255227456, |
| "step": 46750 |
| }, |
| { |
| "epoch": 0.22323717757611172, |
| "grad_norm": 0.18640096485614777, |
| "learning_rate": 0.001, |
| "loss": 2.6168, |
| "num_input_tokens_seen": 12268334656, |
| "step": 46800 |
| }, |
| { |
| "epoch": 0.22347567883420585, |
| "grad_norm": 0.20295055210590363, |
| "learning_rate": 0.001, |
| "loss": 2.6221, |
| "num_input_tokens_seen": 12281441856, |
| "step": 46850 |
| }, |
| { |
| "epoch": 0.22371418009229999, |
| "grad_norm": 0.20705671608448029, |
| "learning_rate": 0.001, |
| "loss": 2.6202, |
| "num_input_tokens_seen": 12294549056, |
| "step": 46900 |
| }, |
| { |
| "epoch": 0.22395268135039412, |
| "grad_norm": 0.18724282085895538, |
| "learning_rate": 0.001, |
| "loss": 2.6061, |
| "num_input_tokens_seen": 12307656256, |
| "step": 46950 |
| }, |
| { |
| "epoch": 0.22419118260848825, |
| "grad_norm": 0.18210910260677338, |
| "learning_rate": 0.001, |
| "loss": 2.6045, |
| "num_input_tokens_seen": 12320763456, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.22419118260848825, |
| "eval_loss": 2.497344493865967, |
| "eval_runtime": 51.17, |
| "eval_samples_per_second": 97.713, |
| "eval_steps_per_second": 24.428, |
| "num_input_tokens_seen": 12320763456, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.2244296838665824, |
| "grad_norm": 0.18894509971141815, |
| "learning_rate": 0.001, |
| "loss": 2.6069, |
| "num_input_tokens_seen": 12333870656, |
| "step": 47050 |
| }, |
| { |
| "epoch": 0.22466818512467654, |
| "grad_norm": 0.23441652953624725, |
| "learning_rate": 0.001, |
| "loss": 2.6092, |
| "num_input_tokens_seen": 12346977856, |
| "step": 47100 |
| }, |
| { |
| "epoch": 0.22490668638277067, |
| "grad_norm": 0.20195326209068298, |
| "learning_rate": 0.001, |
| "loss": 2.6135, |
| "num_input_tokens_seen": 12360085056, |
| "step": 47150 |
| }, |
| { |
| "epoch": 0.2251451876408648, |
| "grad_norm": 0.22025838494300842, |
| "learning_rate": 0.001, |
| "loss": 2.6034, |
| "num_input_tokens_seen": 12373192256, |
| "step": 47200 |
| }, |
| { |
| "epoch": 0.22538368889895893, |
| "grad_norm": 0.19111979007720947, |
| "learning_rate": 0.001, |
| "loss": 2.6151, |
| "num_input_tokens_seen": 12386299456, |
| "step": 47250 |
| }, |
| { |
| "epoch": 0.22562219015705307, |
| "grad_norm": 0.2010103464126587, |
| "learning_rate": 0.001, |
| "loss": 2.6031, |
| "num_input_tokens_seen": 12399406656, |
| "step": 47300 |
| }, |
| { |
| "epoch": 0.22586069141514722, |
| "grad_norm": 0.21569807827472687, |
| "learning_rate": 0.001, |
| "loss": 2.6012, |
| "num_input_tokens_seen": 12412513856, |
| "step": 47350 |
| }, |
| { |
| "epoch": 0.22609919267324136, |
| "grad_norm": 0.18600653111934662, |
| "learning_rate": 0.001, |
| "loss": 2.6087, |
| "num_input_tokens_seen": 12425621056, |
| "step": 47400 |
| }, |
| { |
| "epoch": 0.2263376939313355, |
| "grad_norm": 0.19476164877414703, |
| "learning_rate": 0.001, |
| "loss": 2.6179, |
| "num_input_tokens_seen": 12438728256, |
| "step": 47450 |
| }, |
| { |
| "epoch": 0.22657619518942962, |
| "grad_norm": 0.19705821573734283, |
| "learning_rate": 0.001, |
| "loss": 2.5983, |
| "num_input_tokens_seen": 12451835456, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.22657619518942962, |
| "eval_loss": 2.495936393737793, |
| "eval_runtime": 51.8116, |
| "eval_samples_per_second": 96.504, |
| "eval_steps_per_second": 24.126, |
| "num_input_tokens_seen": 12451835456, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.22681469644752375, |
| "grad_norm": 0.23161695897579193, |
| "learning_rate": 0.001, |
| "loss": 2.5974, |
| "num_input_tokens_seen": 12464942656, |
| "step": 47550 |
| }, |
| { |
| "epoch": 0.2270531977056179, |
| "grad_norm": 0.2022540420293808, |
| "learning_rate": 0.001, |
| "loss": 2.6251, |
| "num_input_tokens_seen": 12478049856, |
| "step": 47600 |
| }, |
| { |
| "epoch": 0.22729169896371204, |
| "grad_norm": 1.0341856479644775, |
| "learning_rate": 0.001, |
| "loss": 2.5831, |
| "num_input_tokens_seen": 12491157056, |
| "step": 47650 |
| }, |
| { |
| "epoch": 0.22753020022180617, |
| "grad_norm": 0.3812394440174103, |
| "learning_rate": 0.001, |
| "loss": 2.6407, |
| "num_input_tokens_seen": 12504264256, |
| "step": 47700 |
| }, |
| { |
| "epoch": 0.2277687014799003, |
| "grad_norm": 0.27030590176582336, |
| "learning_rate": 0.001, |
| "loss": 2.6327, |
| "num_input_tokens_seen": 12517371456, |
| "step": 47750 |
| }, |
| { |
| "epoch": 0.22800720273799444, |
| "grad_norm": 1.3918724060058594, |
| "learning_rate": 0.001, |
| "loss": 2.6344, |
| "num_input_tokens_seen": 12530478656, |
| "step": 47800 |
| }, |
| { |
| "epoch": 0.22824570399608857, |
| "grad_norm": 0.22610582411289215, |
| "learning_rate": 0.001, |
| "loss": 2.6444, |
| "num_input_tokens_seen": 12543585856, |
| "step": 47850 |
| }, |
| { |
| "epoch": 0.22848420525418273, |
| "grad_norm": 0.21421480178833008, |
| "learning_rate": 0.001, |
| "loss": 2.6169, |
| "num_input_tokens_seen": 12556693056, |
| "step": 47900 |
| }, |
| { |
| "epoch": 0.22872270651227686, |
| "grad_norm": 0.20389467477798462, |
| "learning_rate": 0.001, |
| "loss": 2.6158, |
| "num_input_tokens_seen": 12569800256, |
| "step": 47950 |
| }, |
| { |
| "epoch": 0.228961207770371, |
| "grad_norm": 0.2265746295452118, |
| "learning_rate": 0.001, |
| "loss": 2.6101, |
| "num_input_tokens_seen": 12582907456, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.228961207770371, |
| "eval_loss": 2.4971351623535156, |
| "eval_runtime": 54.0453, |
| "eval_samples_per_second": 92.515, |
| "eval_steps_per_second": 23.129, |
| "num_input_tokens_seen": 12582907456, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.22919970902846512, |
| "grad_norm": 0.20247948169708252, |
| "learning_rate": 0.001, |
| "loss": 2.6122, |
| "num_input_tokens_seen": 12596014656, |
| "step": 48050 |
| }, |
| { |
| "epoch": 0.22943821028655925, |
| "grad_norm": 0.20237554609775543, |
| "learning_rate": 0.001, |
| "loss": 2.6235, |
| "num_input_tokens_seen": 12609121856, |
| "step": 48100 |
| }, |
| { |
| "epoch": 0.2296767115446534, |
| "grad_norm": 0.19862660765647888, |
| "learning_rate": 0.001, |
| "loss": 2.6264, |
| "num_input_tokens_seen": 12622229056, |
| "step": 48150 |
| }, |
| { |
| "epoch": 0.22991521280274754, |
| "grad_norm": 0.20839153230190277, |
| "learning_rate": 0.001, |
| "loss": 2.5915, |
| "num_input_tokens_seen": 12635336256, |
| "step": 48200 |
| }, |
| { |
| "epoch": 0.23015371406084167, |
| "grad_norm": 0.19385166466236115, |
| "learning_rate": 0.001, |
| "loss": 2.5979, |
| "num_input_tokens_seen": 12648443456, |
| "step": 48250 |
| }, |
| { |
| "epoch": 0.2303922153189358, |
| "grad_norm": 0.197597935795784, |
| "learning_rate": 0.001, |
| "loss": 2.6093, |
| "num_input_tokens_seen": 12661550656, |
| "step": 48300 |
| }, |
| { |
| "epoch": 0.23063071657702994, |
| "grad_norm": 0.20289985835552216, |
| "learning_rate": 0.001, |
| "loss": 2.6039, |
| "num_input_tokens_seen": 12674657856, |
| "step": 48350 |
| }, |
| { |
| "epoch": 0.23086921783512407, |
| "grad_norm": 0.1986515372991562, |
| "learning_rate": 0.001, |
| "loss": 2.6048, |
| "num_input_tokens_seen": 12687765056, |
| "step": 48400 |
| }, |
| { |
| "epoch": 0.23110771909321823, |
| "grad_norm": 0.19720982015132904, |
| "learning_rate": 0.001, |
| "loss": 2.6171, |
| "num_input_tokens_seen": 12700872256, |
| "step": 48450 |
| }, |
| { |
| "epoch": 0.23134622035131236, |
| "grad_norm": 0.24635523557662964, |
| "learning_rate": 0.001, |
| "loss": 2.6242, |
| "num_input_tokens_seen": 12713979456, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.23134622035131236, |
| "eval_loss": 2.495468854904175, |
| "eval_runtime": 53.4259, |
| "eval_samples_per_second": 93.588, |
| "eval_steps_per_second": 23.397, |
| "num_input_tokens_seen": 12713979456, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.2315847216094065, |
| "grad_norm": 0.5883195996284485, |
| "learning_rate": 0.001, |
| "loss": 2.6399, |
| "num_input_tokens_seen": 12727086656, |
| "step": 48550 |
| }, |
| { |
| "epoch": 0.23182322286750062, |
| "grad_norm": 0.20890024304389954, |
| "learning_rate": 0.001, |
| "loss": 2.6325, |
| "num_input_tokens_seen": 12740193856, |
| "step": 48600 |
| }, |
| { |
| "epoch": 0.23206172412559475, |
| "grad_norm": 0.21251678466796875, |
| "learning_rate": 0.001, |
| "loss": 2.6233, |
| "num_input_tokens_seen": 12753301056, |
| "step": 48650 |
| }, |
| { |
| "epoch": 0.23230022538368889, |
| "grad_norm": 0.20996986329555511, |
| "learning_rate": 0.001, |
| "loss": 2.6174, |
| "num_input_tokens_seen": 12766408256, |
| "step": 48700 |
| }, |
| { |
| "epoch": 0.23253872664178304, |
| "grad_norm": 0.23039382696151733, |
| "learning_rate": 0.001, |
| "loss": 2.6305, |
| "num_input_tokens_seen": 12779515456, |
| "step": 48750 |
| }, |
| { |
| "epoch": 0.23277722789987718, |
| "grad_norm": 0.23922136425971985, |
| "learning_rate": 0.001, |
| "loss": 2.6108, |
| "num_input_tokens_seen": 12792622656, |
| "step": 48800 |
| }, |
| { |
| "epoch": 0.2330157291579713, |
| "grad_norm": 0.22746366262435913, |
| "learning_rate": 0.001, |
| "loss": 2.6219, |
| "num_input_tokens_seen": 12805729856, |
| "step": 48850 |
| }, |
| { |
| "epoch": 0.23325423041606544, |
| "grad_norm": 0.22131897509098053, |
| "learning_rate": 0.001, |
| "loss": 2.6205, |
| "num_input_tokens_seen": 12818837056, |
| "step": 48900 |
| }, |
| { |
| "epoch": 0.23349273167415957, |
| "grad_norm": 0.25431814789772034, |
| "learning_rate": 0.001, |
| "loss": 2.6252, |
| "num_input_tokens_seen": 12831944256, |
| "step": 48950 |
| }, |
| { |
| "epoch": 0.23373123293225373, |
| "grad_norm": 0.2622738778591156, |
| "learning_rate": 0.001, |
| "loss": 2.6288, |
| "num_input_tokens_seen": 12845051456, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.23373123293225373, |
| "eval_loss": 2.498055934906006, |
| "eval_runtime": 53.8861, |
| "eval_samples_per_second": 92.788, |
| "eval_steps_per_second": 23.197, |
| "num_input_tokens_seen": 12845051456, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.23396973419034786, |
| "grad_norm": 0.209337517619133, |
| "learning_rate": 0.001, |
| "loss": 2.6348, |
| "num_input_tokens_seen": 12858158656, |
| "step": 49050 |
| }, |
| { |
| "epoch": 0.234208235448442, |
| "grad_norm": 0.1974038928747177, |
| "learning_rate": 0.001, |
| "loss": 2.6158, |
| "num_input_tokens_seen": 12871265856, |
| "step": 49100 |
| }, |
| { |
| "epoch": 0.23444673670653612, |
| "grad_norm": 0.28099164366722107, |
| "learning_rate": 0.001, |
| "loss": 2.6101, |
| "num_input_tokens_seen": 12884373056, |
| "step": 49150 |
| }, |
| { |
| "epoch": 0.23468523796463026, |
| "grad_norm": 0.2172873318195343, |
| "learning_rate": 0.001, |
| "loss": 2.596, |
| "num_input_tokens_seen": 12897480256, |
| "step": 49200 |
| }, |
| { |
| "epoch": 0.2349237392227244, |
| "grad_norm": 0.2120896875858307, |
| "learning_rate": 0.001, |
| "loss": 2.5994, |
| "num_input_tokens_seen": 12910587456, |
| "step": 49250 |
| }, |
| { |
| "epoch": 0.23516224048081855, |
| "grad_norm": 0.20109935104846954, |
| "learning_rate": 0.001, |
| "loss": 2.6101, |
| "num_input_tokens_seen": 12923694656, |
| "step": 49300 |
| }, |
| { |
| "epoch": 0.23540074173891268, |
| "grad_norm": 0.20735585689544678, |
| "learning_rate": 0.001, |
| "loss": 2.6142, |
| "num_input_tokens_seen": 12936801856, |
| "step": 49350 |
| }, |
| { |
| "epoch": 0.2356392429970068, |
| "grad_norm": 0.21295137703418732, |
| "learning_rate": 0.001, |
| "loss": 2.6226, |
| "num_input_tokens_seen": 12949909056, |
| "step": 49400 |
| }, |
| { |
| "epoch": 0.23587774425510094, |
| "grad_norm": 0.20560845732688904, |
| "learning_rate": 0.001, |
| "loss": 2.6027, |
| "num_input_tokens_seen": 12963016256, |
| "step": 49450 |
| }, |
| { |
| "epoch": 0.23611624551319507, |
| "grad_norm": 0.33747321367263794, |
| "learning_rate": 0.001, |
| "loss": 2.6231, |
| "num_input_tokens_seen": 12976123456, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.23611624551319507, |
| "eval_loss": 2.5008058547973633, |
| "eval_runtime": 54.2104, |
| "eval_samples_per_second": 92.233, |
| "eval_steps_per_second": 23.058, |
| "num_input_tokens_seen": 12976123456, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.23635474677128923, |
| "grad_norm": 0.24593485891819, |
| "learning_rate": 0.001, |
| "loss": 2.6336, |
| "num_input_tokens_seen": 12989230656, |
| "step": 49550 |
| }, |
| { |
| "epoch": 0.23659324802938336, |
| "grad_norm": 0.25253933668136597, |
| "learning_rate": 0.001, |
| "loss": 2.643, |
| "num_input_tokens_seen": 13002337856, |
| "step": 49600 |
| }, |
| { |
| "epoch": 0.2368317492874775, |
| "grad_norm": 0.24231670796871185, |
| "learning_rate": 0.001, |
| "loss": 2.6074, |
| "num_input_tokens_seen": 13015445056, |
| "step": 49650 |
| }, |
| { |
| "epoch": 0.23707025054557163, |
| "grad_norm": 0.2178962677717209, |
| "learning_rate": 0.001, |
| "loss": 2.6184, |
| "num_input_tokens_seen": 13028552256, |
| "step": 49700 |
| }, |
| { |
| "epoch": 0.23730875180366576, |
| "grad_norm": 0.2651260793209076, |
| "learning_rate": 0.001, |
| "loss": 2.6335, |
| "num_input_tokens_seen": 13041659456, |
| "step": 49750 |
| }, |
| { |
| "epoch": 0.2375472530617599, |
| "grad_norm": 0.1909639537334442, |
| "learning_rate": 0.001, |
| "loss": 2.61, |
| "num_input_tokens_seen": 13054766656, |
| "step": 49800 |
| }, |
| { |
| "epoch": 0.23778575431985405, |
| "grad_norm": 0.21107855439186096, |
| "learning_rate": 0.001, |
| "loss": 2.6333, |
| "num_input_tokens_seen": 13067873856, |
| "step": 49850 |
| }, |
| { |
| "epoch": 0.23802425557794818, |
| "grad_norm": 0.19366736710071564, |
| "learning_rate": 0.001, |
| "loss": 2.6068, |
| "num_input_tokens_seen": 13080981056, |
| "step": 49900 |
| }, |
| { |
| "epoch": 0.2382627568360423, |
| "grad_norm": 0.2851523458957672, |
| "learning_rate": 0.001, |
| "loss": 2.6183, |
| "num_input_tokens_seen": 13094088256, |
| "step": 49950 |
| }, |
| { |
| "epoch": 0.23850125809413644, |
| "grad_norm": 0.23617912828922272, |
| "learning_rate": 0.001, |
| "loss": 2.617, |
| "num_input_tokens_seen": 13107195456, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.23850125809413644, |
| "eval_loss": 2.497406005859375, |
| "eval_runtime": 53.6538, |
| "eval_samples_per_second": 93.19, |
| "eval_steps_per_second": 23.298, |
| "num_input_tokens_seen": 13107195456, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.23873975935223057, |
| "grad_norm": 0.5069316029548645, |
| "learning_rate": 0.001, |
| "loss": 2.6591, |
| "num_input_tokens_seen": 13120302656, |
| "step": 50050 |
| }, |
| { |
| "epoch": 0.23897826061032473, |
| "grad_norm": 0.21306034922599792, |
| "learning_rate": 0.001, |
| "loss": 2.6455, |
| "num_input_tokens_seen": 13133409856, |
| "step": 50100 |
| }, |
| { |
| "epoch": 0.23921676186841886, |
| "grad_norm": 0.2045888900756836, |
| "learning_rate": 0.001, |
| "loss": 2.6227, |
| "num_input_tokens_seen": 13146517056, |
| "step": 50150 |
| }, |
| { |
| "epoch": 0.239455263126513, |
| "grad_norm": 0.2335623949766159, |
| "learning_rate": 0.001, |
| "loss": 2.6097, |
| "num_input_tokens_seen": 13159624256, |
| "step": 50200 |
| }, |
| { |
| "epoch": 0.23969376438460713, |
| "grad_norm": 0.19884036481380463, |
| "learning_rate": 0.001, |
| "loss": 2.6189, |
| "num_input_tokens_seen": 13172731456, |
| "step": 50250 |
| }, |
| { |
| "epoch": 0.23993226564270126, |
| "grad_norm": 0.21080589294433594, |
| "learning_rate": 0.001, |
| "loss": 2.6057, |
| "num_input_tokens_seen": 13185838656, |
| "step": 50300 |
| }, |
| { |
| "epoch": 0.2401707669007954, |
| "grad_norm": 0.21613669395446777, |
| "learning_rate": 0.001, |
| "loss": 2.6045, |
| "num_input_tokens_seen": 13198945856, |
| "step": 50350 |
| }, |
| { |
| "epoch": 0.24040926815888955, |
| "grad_norm": 0.2029023915529251, |
| "learning_rate": 0.001, |
| "loss": 2.6127, |
| "num_input_tokens_seen": 13212053056, |
| "step": 50400 |
| }, |
| { |
| "epoch": 0.24064776941698368, |
| "grad_norm": 0.2275777906179428, |
| "learning_rate": 0.001, |
| "loss": 2.6149, |
| "num_input_tokens_seen": 13225160256, |
| "step": 50450 |
| }, |
| { |
| "epoch": 0.2408862706750778, |
| "grad_norm": 0.3332397937774658, |
| "learning_rate": 0.001, |
| "loss": 2.6013, |
| "num_input_tokens_seen": 13238267456, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.2408862706750778, |
| "eval_loss": 2.5022270679473877, |
| "eval_runtime": 53.5942, |
| "eval_samples_per_second": 93.294, |
| "eval_steps_per_second": 23.323, |
| "num_input_tokens_seen": 13238267456, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.24112477193317194, |
| "grad_norm": 0.2197851538658142, |
| "learning_rate": 0.001, |
| "loss": 2.6326, |
| "num_input_tokens_seen": 13251374656, |
| "step": 50550 |
| }, |
| { |
| "epoch": 0.24136327319126608, |
| "grad_norm": 0.2201780080795288, |
| "learning_rate": 0.001, |
| "loss": 2.6265, |
| "num_input_tokens_seen": 13264481856, |
| "step": 50600 |
| }, |
| { |
| "epoch": 0.2416017744493602, |
| "grad_norm": 0.2196362316608429, |
| "learning_rate": 0.001, |
| "loss": 2.6272, |
| "num_input_tokens_seen": 13277589056, |
| "step": 50650 |
| }, |
| { |
| "epoch": 0.24184027570745437, |
| "grad_norm": 0.2234160453081131, |
| "learning_rate": 0.001, |
| "loss": 2.6178, |
| "num_input_tokens_seen": 13290696256, |
| "step": 50700 |
| }, |
| { |
| "epoch": 0.2420787769655485, |
| "grad_norm": 0.24019016325473785, |
| "learning_rate": 0.001, |
| "loss": 2.6142, |
| "num_input_tokens_seen": 13303803456, |
| "step": 50750 |
| }, |
| { |
| "epoch": 0.24231727822364263, |
| "grad_norm": 0.21481236815452576, |
| "learning_rate": 0.001, |
| "loss": 2.6149, |
| "num_input_tokens_seen": 13316910656, |
| "step": 50800 |
| }, |
| { |
| "epoch": 0.24255577948173676, |
| "grad_norm": 0.20477178692817688, |
| "learning_rate": 0.001, |
| "loss": 2.5977, |
| "num_input_tokens_seen": 13330017856, |
| "step": 50850 |
| }, |
| { |
| "epoch": 0.2427942807398309, |
| "grad_norm": 0.20742499828338623, |
| "learning_rate": 0.001, |
| "loss": 2.6153, |
| "num_input_tokens_seen": 13343125056, |
| "step": 50900 |
| }, |
| { |
| "epoch": 0.24303278199792505, |
| "grad_norm": 0.21933062374591827, |
| "learning_rate": 0.001, |
| "loss": 2.5966, |
| "num_input_tokens_seen": 13356232256, |
| "step": 50950 |
| }, |
| { |
| "epoch": 0.24327128325601918, |
| "grad_norm": 0.3282420337200165, |
| "learning_rate": 0.001, |
| "loss": 2.6063, |
| "num_input_tokens_seen": 13369339456, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.24327128325601918, |
| "eval_loss": 2.4981296062469482, |
| "eval_runtime": 53.5536, |
| "eval_samples_per_second": 93.364, |
| "eval_steps_per_second": 23.341, |
| "num_input_tokens_seen": 13369339456, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.24350978451411331, |
| "grad_norm": 0.20502831041812897, |
| "learning_rate": 0.001, |
| "loss": 2.6059, |
| "num_input_tokens_seen": 13382446656, |
| "step": 51050 |
| }, |
| { |
| "epoch": 0.24374828577220745, |
| "grad_norm": 0.20750559866428375, |
| "learning_rate": 0.001, |
| "loss": 2.6056, |
| "num_input_tokens_seen": 13395553856, |
| "step": 51100 |
| }, |
| { |
| "epoch": 0.24398678703030158, |
| "grad_norm": 0.19882823526859283, |
| "learning_rate": 0.001, |
| "loss": 2.5983, |
| "num_input_tokens_seen": 13408661056, |
| "step": 51150 |
| }, |
| { |
| "epoch": 0.2442252882883957, |
| "grad_norm": 0.20900660753250122, |
| "learning_rate": 0.001, |
| "loss": 2.6087, |
| "num_input_tokens_seen": 13421768256, |
| "step": 51200 |
| }, |
| { |
| "epoch": 0.24446378954648987, |
| "grad_norm": 0.21428415179252625, |
| "learning_rate": 0.001, |
| "loss": 2.5901, |
| "num_input_tokens_seen": 13434875456, |
| "step": 51250 |
| }, |
| { |
| "epoch": 0.244702290804584, |
| "grad_norm": 0.19987250864505768, |
| "learning_rate": 0.001, |
| "loss": 2.5982, |
| "num_input_tokens_seen": 13447982656, |
| "step": 51300 |
| }, |
| { |
| "epoch": 0.24494079206267813, |
| "grad_norm": 0.2045862078666687, |
| "learning_rate": 0.001, |
| "loss": 2.6058, |
| "num_input_tokens_seen": 13461089856, |
| "step": 51350 |
| }, |
| { |
| "epoch": 0.24517929332077226, |
| "grad_norm": 0.22261273860931396, |
| "learning_rate": 0.001, |
| "loss": 2.5972, |
| "num_input_tokens_seen": 13474197056, |
| "step": 51400 |
| }, |
| { |
| "epoch": 0.2454177945788664, |
| "grad_norm": 0.20395706593990326, |
| "learning_rate": 0.001, |
| "loss": 2.6064, |
| "num_input_tokens_seen": 13487304256, |
| "step": 51450 |
| }, |
| { |
| "epoch": 0.24565629583696055, |
| "grad_norm": 0.21490858495235443, |
| "learning_rate": 0.001, |
| "loss": 2.5922, |
| "num_input_tokens_seen": 13500411456, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.24565629583696055, |
| "eval_loss": 2.488300085067749, |
| "eval_runtime": 53.7972, |
| "eval_samples_per_second": 92.942, |
| "eval_steps_per_second": 23.235, |
| "num_input_tokens_seen": 13500411456, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.24589479709505468, |
| "grad_norm": 0.2039102464914322, |
| "learning_rate": 0.001, |
| "loss": 2.5894, |
| "num_input_tokens_seen": 13513518656, |
| "step": 51550 |
| }, |
| { |
| "epoch": 0.24613329835314882, |
| "grad_norm": 0.21426360309123993, |
| "learning_rate": 0.001, |
| "loss": 2.6089, |
| "num_input_tokens_seen": 13526625856, |
| "step": 51600 |
| }, |
| { |
| "epoch": 0.24637179961124295, |
| "grad_norm": 0.194682314991951, |
| "learning_rate": 0.001, |
| "loss": 2.5932, |
| "num_input_tokens_seen": 13539733056, |
| "step": 51650 |
| }, |
| { |
| "epoch": 0.24661030086933708, |
| "grad_norm": 0.1901472508907318, |
| "learning_rate": 0.001, |
| "loss": 2.6031, |
| "num_input_tokens_seen": 13552840256, |
| "step": 51700 |
| }, |
| { |
| "epoch": 0.2468488021274312, |
| "grad_norm": 0.20517823100090027, |
| "learning_rate": 0.001, |
| "loss": 2.5978, |
| "num_input_tokens_seen": 13565947456, |
| "step": 51750 |
| }, |
| { |
| "epoch": 0.24708730338552537, |
| "grad_norm": 0.23713302612304688, |
| "learning_rate": 0.001, |
| "loss": 2.6061, |
| "num_input_tokens_seen": 13579054656, |
| "step": 51800 |
| }, |
| { |
| "epoch": 0.2473258046436195, |
| "grad_norm": 0.2431441992521286, |
| "learning_rate": 0.001, |
| "loss": 2.6062, |
| "num_input_tokens_seen": 13592161856, |
| "step": 51850 |
| }, |
| { |
| "epoch": 0.24756430590171363, |
| "grad_norm": 0.20358557999134064, |
| "learning_rate": 0.001, |
| "loss": 2.6161, |
| "num_input_tokens_seen": 13605269056, |
| "step": 51900 |
| }, |
| { |
| "epoch": 0.24780280715980776, |
| "grad_norm": 0.21245016157627106, |
| "learning_rate": 0.001, |
| "loss": 2.6166, |
| "num_input_tokens_seen": 13618376256, |
| "step": 51950 |
| }, |
| { |
| "epoch": 0.2480413084179019, |
| "grad_norm": 0.24295999109745026, |
| "learning_rate": 0.001, |
| "loss": 2.6139, |
| "num_input_tokens_seen": 13631483456, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.2480413084179019, |
| "eval_loss": 2.4932186603546143, |
| "eval_runtime": 53.6797, |
| "eval_samples_per_second": 93.145, |
| "eval_steps_per_second": 23.286, |
| "num_input_tokens_seen": 13631483456, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.24827980967599603, |
| "grad_norm": 0.22135989367961884, |
| "learning_rate": 0.001, |
| "loss": 2.5947, |
| "num_input_tokens_seen": 13644590656, |
| "step": 52050 |
| }, |
| { |
| "epoch": 0.2485183109340902, |
| "grad_norm": 0.3656958341598511, |
| "learning_rate": 0.001, |
| "loss": 2.6263, |
| "num_input_tokens_seen": 13657697856, |
| "step": 52100 |
| }, |
| { |
| "epoch": 0.24875681219218432, |
| "grad_norm": 0.2960817813873291, |
| "learning_rate": 0.001, |
| "loss": 2.6086, |
| "num_input_tokens_seen": 13670805056, |
| "step": 52150 |
| }, |
| { |
| "epoch": 0.24899531345027845, |
| "grad_norm": 0.2150612622499466, |
| "learning_rate": 0.001, |
| "loss": 2.6314, |
| "num_input_tokens_seen": 13683912256, |
| "step": 52200 |
| }, |
| { |
| "epoch": 0.24923381470837258, |
| "grad_norm": 0.23089592158794403, |
| "learning_rate": 0.001, |
| "loss": 2.6072, |
| "num_input_tokens_seen": 13697019456, |
| "step": 52250 |
| }, |
| { |
| "epoch": 0.2494723159664667, |
| "grad_norm": 0.19151148200035095, |
| "learning_rate": 0.001, |
| "loss": 2.6177, |
| "num_input_tokens_seen": 13710126656, |
| "step": 52300 |
| }, |
| { |
| "epoch": 0.24971081722456087, |
| "grad_norm": 0.47803962230682373, |
| "learning_rate": 0.001, |
| "loss": 2.6018, |
| "num_input_tokens_seen": 13723233856, |
| "step": 52350 |
| }, |
| { |
| "epoch": 0.249949318482655, |
| "grad_norm": 0.2346401810646057, |
| "learning_rate": 0.001, |
| "loss": 2.6068, |
| "num_input_tokens_seen": 13736341056, |
| "step": 52400 |
| }, |
| { |
| "epoch": 0.2501878197407491, |
| "grad_norm": 0.21514126658439636, |
| "learning_rate": 0.001, |
| "loss": 2.6186, |
| "num_input_tokens_seen": 13749448256, |
| "step": 52450 |
| }, |
| { |
| "epoch": 0.25042632099884327, |
| "grad_norm": 0.20311090350151062, |
| "learning_rate": 0.001, |
| "loss": 2.595, |
| "num_input_tokens_seen": 13762555456, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.25042632099884327, |
| "eval_loss": 2.490104913711548, |
| "eval_runtime": 53.8709, |
| "eval_samples_per_second": 92.814, |
| "eval_steps_per_second": 23.204, |
| "num_input_tokens_seen": 13762555456, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.2506648222569374, |
| "grad_norm": 0.2120152711868286, |
| "learning_rate": 0.001, |
| "loss": 2.6027, |
| "num_input_tokens_seen": 13775662656, |
| "step": 52550 |
| }, |
| { |
| "epoch": 0.25090332351503153, |
| "grad_norm": 0.3172776401042938, |
| "learning_rate": 0.001, |
| "loss": 2.6089, |
| "num_input_tokens_seen": 13788769856, |
| "step": 52600 |
| }, |
| { |
| "epoch": 0.2511418247731257, |
| "grad_norm": 0.24425551295280457, |
| "learning_rate": 0.001, |
| "loss": 2.611, |
| "num_input_tokens_seen": 13801877056, |
| "step": 52650 |
| }, |
| { |
| "epoch": 0.2513803260312198, |
| "grad_norm": 0.24523352086544037, |
| "learning_rate": 0.001, |
| "loss": 2.6066, |
| "num_input_tokens_seen": 13814984256, |
| "step": 52700 |
| }, |
| { |
| "epoch": 0.25161882728931395, |
| "grad_norm": 0.21642154455184937, |
| "learning_rate": 0.001, |
| "loss": 2.6069, |
| "num_input_tokens_seen": 13828091456, |
| "step": 52750 |
| }, |
| { |
| "epoch": 0.2518573285474081, |
| "grad_norm": 0.21867206692695618, |
| "learning_rate": 0.001, |
| "loss": 2.6163, |
| "num_input_tokens_seen": 13841198656, |
| "step": 52800 |
| }, |
| { |
| "epoch": 0.2520958298055022, |
| "grad_norm": 0.2124466449022293, |
| "learning_rate": 0.001, |
| "loss": 2.6045, |
| "num_input_tokens_seen": 13854305856, |
| "step": 52850 |
| }, |
| { |
| "epoch": 0.2523343310635964, |
| "grad_norm": 0.20598042011260986, |
| "learning_rate": 0.001, |
| "loss": 2.5881, |
| "num_input_tokens_seen": 13867413056, |
| "step": 52900 |
| }, |
| { |
| "epoch": 0.2525728323216905, |
| "grad_norm": 0.1949404776096344, |
| "learning_rate": 0.001, |
| "loss": 2.6051, |
| "num_input_tokens_seen": 13880520256, |
| "step": 52950 |
| }, |
| { |
| "epoch": 0.25281133357978464, |
| "grad_norm": 0.18877142667770386, |
| "learning_rate": 0.001, |
| "loss": 2.608, |
| "num_input_tokens_seen": 13893627456, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.25281133357978464, |
| "eval_loss": 2.485513210296631, |
| "eval_runtime": 53.7202, |
| "eval_samples_per_second": 93.075, |
| "eval_steps_per_second": 23.269, |
| "num_input_tokens_seen": 13893627456, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.2530498348378788, |
| "grad_norm": 0.20486177504062653, |
| "learning_rate": 0.001, |
| "loss": 2.5977, |
| "num_input_tokens_seen": 13906734656, |
| "step": 53050 |
| }, |
| { |
| "epoch": 0.2532883360959729, |
| "grad_norm": 0.18098385632038116, |
| "learning_rate": 0.001, |
| "loss": 2.5931, |
| "num_input_tokens_seen": 13919841856, |
| "step": 53100 |
| }, |
| { |
| "epoch": 0.25352683735406706, |
| "grad_norm": 0.1933833658695221, |
| "learning_rate": 0.001, |
| "loss": 2.6058, |
| "num_input_tokens_seen": 13932949056, |
| "step": 53150 |
| }, |
| { |
| "epoch": 0.25376533861216116, |
| "grad_norm": 0.29640141129493713, |
| "learning_rate": 0.001, |
| "loss": 2.5864, |
| "num_input_tokens_seen": 13946056256, |
| "step": 53200 |
| }, |
| { |
| "epoch": 0.2540038398702553, |
| "grad_norm": 0.2559553384780884, |
| "learning_rate": 0.001, |
| "loss": 2.6137, |
| "num_input_tokens_seen": 13959163456, |
| "step": 53250 |
| }, |
| { |
| "epoch": 0.2542423411283494, |
| "grad_norm": 0.21698619425296783, |
| "learning_rate": 0.001, |
| "loss": 2.6184, |
| "num_input_tokens_seen": 13972270656, |
| "step": 53300 |
| }, |
| { |
| "epoch": 0.2544808423864436, |
| "grad_norm": 0.19658173620700836, |
| "learning_rate": 0.001, |
| "loss": 2.5938, |
| "num_input_tokens_seen": 13985377856, |
| "step": 53350 |
| }, |
| { |
| "epoch": 0.25471934364453774, |
| "grad_norm": 0.2056342512369156, |
| "learning_rate": 0.001, |
| "loss": 2.5952, |
| "num_input_tokens_seen": 13998485056, |
| "step": 53400 |
| }, |
| { |
| "epoch": 0.25495784490263185, |
| "grad_norm": 0.1932424008846283, |
| "learning_rate": 0.001, |
| "loss": 2.6101, |
| "num_input_tokens_seen": 14011592256, |
| "step": 53450 |
| }, |
| { |
| "epoch": 0.255196346160726, |
| "grad_norm": 0.19347251951694489, |
| "learning_rate": 0.001, |
| "loss": 2.5976, |
| "num_input_tokens_seen": 14024699456, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.255196346160726, |
| "eval_loss": 2.4863245487213135, |
| "eval_runtime": 53.2426, |
| "eval_samples_per_second": 93.91, |
| "eval_steps_per_second": 23.477, |
| "num_input_tokens_seen": 14024699456, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.2554348474188201, |
| "grad_norm": 0.1986820101737976, |
| "learning_rate": 0.001, |
| "loss": 2.6066, |
| "num_input_tokens_seen": 14037806656, |
| "step": 53550 |
| }, |
| { |
| "epoch": 0.25567334867691427, |
| "grad_norm": 0.21295565366744995, |
| "learning_rate": 0.001, |
| "loss": 2.6107, |
| "num_input_tokens_seen": 14050913856, |
| "step": 53600 |
| }, |
| { |
| "epoch": 0.25591184993500843, |
| "grad_norm": 0.21585114300251007, |
| "learning_rate": 0.001, |
| "loss": 2.6077, |
| "num_input_tokens_seen": 14064021056, |
| "step": 53650 |
| }, |
| { |
| "epoch": 0.25615035119310253, |
| "grad_norm": 0.19424305856227875, |
| "learning_rate": 0.001, |
| "loss": 2.5931, |
| "num_input_tokens_seen": 14077128256, |
| "step": 53700 |
| }, |
| { |
| "epoch": 0.2563888524511967, |
| "grad_norm": 0.20265349745750427, |
| "learning_rate": 0.001, |
| "loss": 2.5901, |
| "num_input_tokens_seen": 14090235456, |
| "step": 53750 |
| }, |
| { |
| "epoch": 0.2566273537092908, |
| "grad_norm": 1.037636160850525, |
| "learning_rate": 0.001, |
| "loss": 2.5775, |
| "num_input_tokens_seen": 14103342656, |
| "step": 53800 |
| }, |
| { |
| "epoch": 0.25686585496738495, |
| "grad_norm": 0.32030293345451355, |
| "learning_rate": 0.001, |
| "loss": 2.6242, |
| "num_input_tokens_seen": 14116449856, |
| "step": 53850 |
| }, |
| { |
| "epoch": 0.2571043562254791, |
| "grad_norm": 0.2339978665113449, |
| "learning_rate": 0.001, |
| "loss": 2.6122, |
| "num_input_tokens_seen": 14129557056, |
| "step": 53900 |
| }, |
| { |
| "epoch": 0.2573428574835732, |
| "grad_norm": 0.22179783880710602, |
| "learning_rate": 0.001, |
| "loss": 2.6025, |
| "num_input_tokens_seen": 14142664256, |
| "step": 53950 |
| }, |
| { |
| "epoch": 0.2575813587416674, |
| "grad_norm": 0.22616736590862274, |
| "learning_rate": 0.001, |
| "loss": 2.5916, |
| "num_input_tokens_seen": 14155771456, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.2575813587416674, |
| "eval_loss": 2.4871394634246826, |
| "eval_runtime": 53.8695, |
| "eval_samples_per_second": 92.817, |
| "eval_steps_per_second": 23.204, |
| "num_input_tokens_seen": 14155771456, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.2578198599997615, |
| "grad_norm": 0.2028844654560089, |
| "learning_rate": 0.001, |
| "loss": 2.6039, |
| "num_input_tokens_seen": 14168878656, |
| "step": 54050 |
| }, |
| { |
| "epoch": 0.25805836125785564, |
| "grad_norm": 0.19936658442020416, |
| "learning_rate": 0.001, |
| "loss": 2.5985, |
| "num_input_tokens_seen": 14181985856, |
| "step": 54100 |
| }, |
| { |
| "epoch": 0.2582968625159498, |
| "grad_norm": 0.2087993025779724, |
| "learning_rate": 0.001, |
| "loss": 2.62, |
| "num_input_tokens_seen": 14195093056, |
| "step": 54150 |
| }, |
| { |
| "epoch": 0.2585353637740439, |
| "grad_norm": 0.18972960114479065, |
| "learning_rate": 0.001, |
| "loss": 2.5936, |
| "num_input_tokens_seen": 14208200256, |
| "step": 54200 |
| }, |
| { |
| "epoch": 0.25877386503213806, |
| "grad_norm": 0.2162945419549942, |
| "learning_rate": 0.001, |
| "loss": 2.6125, |
| "num_input_tokens_seen": 14221307456, |
| "step": 54250 |
| }, |
| { |
| "epoch": 0.25901236629023217, |
| "grad_norm": 0.2538411319255829, |
| "learning_rate": 0.001, |
| "loss": 2.6197, |
| "num_input_tokens_seen": 14234414656, |
| "step": 54300 |
| }, |
| { |
| "epoch": 0.2592508675483263, |
| "grad_norm": 0.28060850501060486, |
| "learning_rate": 0.001, |
| "loss": 2.6194, |
| "num_input_tokens_seen": 14247521856, |
| "step": 54350 |
| }, |
| { |
| "epoch": 0.25948936880642043, |
| "grad_norm": 0.21557608246803284, |
| "learning_rate": 0.001, |
| "loss": 2.623, |
| "num_input_tokens_seen": 14260629056, |
| "step": 54400 |
| }, |
| { |
| "epoch": 0.2597278700645146, |
| "grad_norm": 0.21628426015377045, |
| "learning_rate": 0.001, |
| "loss": 2.6077, |
| "num_input_tokens_seen": 14273736256, |
| "step": 54450 |
| }, |
| { |
| "epoch": 0.25996637132260875, |
| "grad_norm": 0.19123327732086182, |
| "learning_rate": 0.001, |
| "loss": 2.5991, |
| "num_input_tokens_seen": 14286843456, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.25996637132260875, |
| "eval_loss": 2.4861645698547363, |
| "eval_runtime": 53.6448, |
| "eval_samples_per_second": 93.206, |
| "eval_steps_per_second": 23.301, |
| "num_input_tokens_seen": 14286843456, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.26020487258070285, |
| "grad_norm": 0.20462968945503235, |
| "learning_rate": 0.001, |
| "loss": 2.5887, |
| "num_input_tokens_seen": 14299950656, |
| "step": 54550 |
| }, |
| { |
| "epoch": 0.260443373838797, |
| "grad_norm": 0.20952938497066498, |
| "learning_rate": 0.001, |
| "loss": 2.608, |
| "num_input_tokens_seen": 14313057856, |
| "step": 54600 |
| }, |
| { |
| "epoch": 0.2606818750968911, |
| "grad_norm": 0.2095402032136917, |
| "learning_rate": 0.001, |
| "loss": 2.6079, |
| "num_input_tokens_seen": 14326165056, |
| "step": 54650 |
| }, |
| { |
| "epoch": 0.2609203763549853, |
| "grad_norm": 0.2343517541885376, |
| "learning_rate": 0.001, |
| "loss": 2.6124, |
| "num_input_tokens_seen": 14339272256, |
| "step": 54700 |
| }, |
| { |
| "epoch": 0.26115887761307943, |
| "grad_norm": 0.23840700089931488, |
| "learning_rate": 0.001, |
| "loss": 2.6015, |
| "num_input_tokens_seen": 14352379456, |
| "step": 54750 |
| }, |
| { |
| "epoch": 0.26139737887117354, |
| "grad_norm": 0.22024671733379364, |
| "learning_rate": 0.001, |
| "loss": 2.5812, |
| "num_input_tokens_seen": 14365486656, |
| "step": 54800 |
| }, |
| { |
| "epoch": 0.2616358801292677, |
| "grad_norm": 0.19884246587753296, |
| "learning_rate": 0.001, |
| "loss": 2.6118, |
| "num_input_tokens_seen": 14378593856, |
| "step": 54850 |
| }, |
| { |
| "epoch": 0.2618743813873618, |
| "grad_norm": 0.46560585498809814, |
| "learning_rate": 0.001, |
| "loss": 2.6024, |
| "num_input_tokens_seen": 14391701056, |
| "step": 54900 |
| }, |
| { |
| "epoch": 0.26211288264545596, |
| "grad_norm": 0.2956256568431854, |
| "learning_rate": 0.001, |
| "loss": 2.6073, |
| "num_input_tokens_seen": 14404808256, |
| "step": 54950 |
| }, |
| { |
| "epoch": 0.2623513839035501, |
| "grad_norm": 0.286327064037323, |
| "learning_rate": 0.001, |
| "loss": 2.5946, |
| "num_input_tokens_seen": 14417915456, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.2623513839035501, |
| "eval_loss": 2.4892399311065674, |
| "eval_runtime": 53.3184, |
| "eval_samples_per_second": 93.776, |
| "eval_steps_per_second": 23.444, |
| "num_input_tokens_seen": 14417915456, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.2625898851616442, |
| "grad_norm": 0.22046101093292236, |
| "learning_rate": 0.001, |
| "loss": 2.6077, |
| "num_input_tokens_seen": 14431022656, |
| "step": 55050 |
| }, |
| { |
| "epoch": 0.2628283864197384, |
| "grad_norm": 0.4682837724685669, |
| "learning_rate": 0.001, |
| "loss": 2.6065, |
| "num_input_tokens_seen": 14444129856, |
| "step": 55100 |
| }, |
| { |
| "epoch": 0.2630668876778325, |
| "grad_norm": 0.21442484855651855, |
| "learning_rate": 0.001, |
| "loss": 2.6079, |
| "num_input_tokens_seen": 14457237056, |
| "step": 55150 |
| }, |
| { |
| "epoch": 0.26330538893592664, |
| "grad_norm": 0.2513403296470642, |
| "learning_rate": 0.001, |
| "loss": 2.6037, |
| "num_input_tokens_seen": 14470344256, |
| "step": 55200 |
| }, |
| { |
| "epoch": 0.26354389019402075, |
| "grad_norm": 0.21526487171649933, |
| "learning_rate": 0.001, |
| "loss": 2.6049, |
| "num_input_tokens_seen": 14483451456, |
| "step": 55250 |
| }, |
| { |
| "epoch": 0.2637823914521149, |
| "grad_norm": 0.22567112743854523, |
| "learning_rate": 0.001, |
| "loss": 2.5953, |
| "num_input_tokens_seen": 14496558656, |
| "step": 55300 |
| }, |
| { |
| "epoch": 0.26402089271020907, |
| "grad_norm": 0.20226064324378967, |
| "learning_rate": 0.001, |
| "loss": 2.609, |
| "num_input_tokens_seen": 14509665856, |
| "step": 55350 |
| }, |
| { |
| "epoch": 0.26425939396830317, |
| "grad_norm": 0.31736019253730774, |
| "learning_rate": 0.001, |
| "loss": 2.6174, |
| "num_input_tokens_seen": 14522773056, |
| "step": 55400 |
| }, |
| { |
| "epoch": 0.26449789522639733, |
| "grad_norm": 0.2573414146900177, |
| "learning_rate": 0.001, |
| "loss": 2.612, |
| "num_input_tokens_seen": 14535880256, |
| "step": 55450 |
| }, |
| { |
| "epoch": 0.26473639648449143, |
| "grad_norm": 0.278160959482193, |
| "learning_rate": 0.001, |
| "loss": 2.6713, |
| "num_input_tokens_seen": 14548987456, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.26473639648449143, |
| "eval_loss": 2.5104730129241943, |
| "eval_runtime": 54.2403, |
| "eval_samples_per_second": 92.182, |
| "eval_steps_per_second": 23.046, |
| "num_input_tokens_seen": 14548987456, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.2649748977425856, |
| "grad_norm": 0.25843819975852966, |
| "learning_rate": 0.001, |
| "loss": 2.6223, |
| "num_input_tokens_seen": 14562094656, |
| "step": 55550 |
| }, |
| { |
| "epoch": 0.26521339900067975, |
| "grad_norm": 0.42813193798065186, |
| "learning_rate": 0.001, |
| "loss": 2.6114, |
| "num_input_tokens_seen": 14575201856, |
| "step": 55600 |
| }, |
| { |
| "epoch": 0.26545190025877385, |
| "grad_norm": 0.23324181139469147, |
| "learning_rate": 0.001, |
| "loss": 2.6149, |
| "num_input_tokens_seen": 14588309056, |
| "step": 55650 |
| }, |
| { |
| "epoch": 0.265690401516868, |
| "grad_norm": 0.2795487940311432, |
| "learning_rate": 0.001, |
| "loss": 2.6067, |
| "num_input_tokens_seen": 14601416256, |
| "step": 55700 |
| }, |
| { |
| "epoch": 0.2659289027749621, |
| "grad_norm": 0.6856834888458252, |
| "learning_rate": 0.001, |
| "loss": 2.6135, |
| "num_input_tokens_seen": 14614523456, |
| "step": 55750 |
| }, |
| { |
| "epoch": 0.2661674040330563, |
| "grad_norm": 0.348906934261322, |
| "learning_rate": 0.001, |
| "loss": 2.6384, |
| "num_input_tokens_seen": 14627630656, |
| "step": 55800 |
| }, |
| { |
| "epoch": 0.26640590529115044, |
| "grad_norm": 0.2510247528553009, |
| "learning_rate": 0.001, |
| "loss": 2.6224, |
| "num_input_tokens_seen": 14640737856, |
| "step": 55850 |
| }, |
| { |
| "epoch": 0.26664440654924454, |
| "grad_norm": 0.34429189562797546, |
| "learning_rate": 0.001, |
| "loss": 2.6139, |
| "num_input_tokens_seen": 14653845056, |
| "step": 55900 |
| }, |
| { |
| "epoch": 0.2668829078073387, |
| "grad_norm": 0.25697243213653564, |
| "learning_rate": 0.001, |
| "loss": 2.6143, |
| "num_input_tokens_seen": 14666952256, |
| "step": 55950 |
| }, |
| { |
| "epoch": 0.2671214090654328, |
| "grad_norm": 0.2812611758708954, |
| "learning_rate": 0.001, |
| "loss": 2.6172, |
| "num_input_tokens_seen": 14680059456, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.2671214090654328, |
| "eval_loss": 2.492490291595459, |
| "eval_runtime": 53.3814, |
| "eval_samples_per_second": 93.666, |
| "eval_steps_per_second": 23.416, |
| "num_input_tokens_seen": 14680059456, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.26735991032352696, |
| "grad_norm": 0.22615984082221985, |
| "learning_rate": 0.0009999685283773503, |
| "loss": 2.5961, |
| "num_input_tokens_seen": 14693166656, |
| "step": 56050 |
| }, |
| { |
| "epoch": 0.2675984115816211, |
| "grad_norm": 0.2738794982433319, |
| "learning_rate": 0.0009998741174712534, |
| "loss": 2.612, |
| "num_input_tokens_seen": 14706273856, |
| "step": 56100 |
| }, |
| { |
| "epoch": 0.2678369128397152, |
| "grad_norm": 0.23470066487789154, |
| "learning_rate": 0.0009997167791667668, |
| "loss": 2.6071, |
| "num_input_tokens_seen": 14719381056, |
| "step": 56150 |
| }, |
| { |
| "epoch": 0.2680754140978094, |
| "grad_norm": 0.23558543622493744, |
| "learning_rate": 0.0009994965332706573, |
| "loss": 2.5956, |
| "num_input_tokens_seen": 14732488256, |
| "step": 56200 |
| }, |
| { |
| "epoch": 0.2683139153559035, |
| "grad_norm": 0.2274416983127594, |
| "learning_rate": 0.0009992134075089082, |
| "loss": 2.5873, |
| "num_input_tokens_seen": 14745595456, |
| "step": 56250 |
| }, |
| { |
| "epoch": 0.26855241661399765, |
| "grad_norm": 0.21609161794185638, |
| "learning_rate": 0.000998867437523228, |
| "loss": 2.6043, |
| "num_input_tokens_seen": 14758702656, |
| "step": 56300 |
| }, |
| { |
| "epoch": 0.26879091787209175, |
| "grad_norm": 0.2368565797805786, |
| "learning_rate": 0.000998458666866564, |
| "loss": 2.5952, |
| "num_input_tokens_seen": 14771809856, |
| "step": 56350 |
| }, |
| { |
| "epoch": 0.2690294191301859, |
| "grad_norm": 0.22180891036987305, |
| "learning_rate": 0.0009979871469976197, |
| "loss": 2.5934, |
| "num_input_tokens_seen": 14784917056, |
| "step": 56400 |
| }, |
| { |
| "epoch": 0.26926792038828007, |
| "grad_norm": 0.3060019910335541, |
| "learning_rate": 0.0009974529372743762, |
| "loss": 2.6224, |
| "num_input_tokens_seen": 14798024256, |
| "step": 56450 |
| }, |
| { |
| "epoch": 0.2695064216463742, |
| "grad_norm": 0.2387322634458542, |
| "learning_rate": 0.0009968561049466214, |
| "loss": 2.5905, |
| "num_input_tokens_seen": 14811131456, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.2695064216463742, |
| "eval_loss": 2.4835996627807617, |
| "eval_runtime": 53.8478, |
| "eval_samples_per_second": 92.854, |
| "eval_steps_per_second": 23.214, |
| "num_input_tokens_seen": 14811131456, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.26974492290446833, |
| "grad_norm": 0.22091372311115265, |
| "learning_rate": 0.0009961967251474822, |
| "loss": 2.6139, |
| "num_input_tokens_seen": 14824238656, |
| "step": 56550 |
| }, |
| { |
| "epoch": 0.26998342416256244, |
| "grad_norm": 0.2304680198431015, |
| "learning_rate": 0.0009954748808839674, |
| "loss": 2.6167, |
| "num_input_tokens_seen": 14837345856, |
| "step": 56600 |
| }, |
| { |
| "epoch": 0.2702219254206566, |
| "grad_norm": 0.19777421653270721, |
| "learning_rate": 0.0009946906630265184, |
| "loss": 2.6082, |
| "num_input_tokens_seen": 14850453056, |
| "step": 56650 |
| }, |
| { |
| "epoch": 0.27046042667875075, |
| "grad_norm": 0.2113979458808899, |
| "learning_rate": 0.0009938441702975688, |
| "loss": 2.5981, |
| "num_input_tokens_seen": 14863560256, |
| "step": 56700 |
| }, |
| { |
| "epoch": 0.27069892793684486, |
| "grad_norm": 0.19911637902259827, |
| "learning_rate": 0.0009929355092591179, |
| "loss": 2.5904, |
| "num_input_tokens_seen": 14876667456, |
| "step": 56750 |
| }, |
| { |
| "epoch": 0.270937429194939, |
| "grad_norm": 0.20081694424152374, |
| "learning_rate": 0.0009919647942993148, |
| "loss": 2.6012, |
| "num_input_tokens_seen": 14889774656, |
| "step": 56800 |
| }, |
| { |
| "epoch": 0.2711759304530331, |
| "grad_norm": 0.22752800583839417, |
| "learning_rate": 0.0009909321476180592, |
| "loss": 2.6017, |
| "num_input_tokens_seen": 14902881856, |
| "step": 56850 |
| }, |
| { |
| "epoch": 0.2714144317111273, |
| "grad_norm": 0.23174402117729187, |
| "learning_rate": 0.0009898376992116178, |
| "loss": 2.6012, |
| "num_input_tokens_seen": 14915989056, |
| "step": 56900 |
| }, |
| { |
| "epoch": 0.27165293296922144, |
| "grad_norm": 0.22149533033370972, |
| "learning_rate": 0.0009886815868562597, |
| "loss": 2.5881, |
| "num_input_tokens_seen": 14929096256, |
| "step": 56950 |
| }, |
| { |
| "epoch": 0.27189143422731554, |
| "grad_norm": 0.22576771676540375, |
| "learning_rate": 0.0009874639560909118, |
| "loss": 2.6021, |
| "num_input_tokens_seen": 14942203456, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.27189143422731554, |
| "eval_loss": 2.482896566390991, |
| "eval_runtime": 53.3773, |
| "eval_samples_per_second": 93.673, |
| "eval_steps_per_second": 23.418, |
| "num_input_tokens_seen": 14942203456, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.2721299354854097, |
| "grad_norm": 0.22044019401073456, |
| "learning_rate": 0.0009861849601988384, |
| "loss": 2.6119, |
| "num_input_tokens_seen": 14955310656, |
| "step": 57050 |
| }, |
| { |
| "epoch": 0.2723684367435038, |
| "grad_norm": 0.2155238389968872, |
| "learning_rate": 0.0009848447601883434, |
| "loss": 2.5869, |
| "num_input_tokens_seen": 14968417856, |
| "step": 57100 |
| }, |
| { |
| "epoch": 0.27260693800159796, |
| "grad_norm": 0.21131549775600433, |
| "learning_rate": 0.0009834435247725033, |
| "loss": 2.5988, |
| "num_input_tokens_seen": 14981525056, |
| "step": 57150 |
| }, |
| { |
| "epoch": 0.27284543925969207, |
| "grad_norm": 0.21247337758541107, |
| "learning_rate": 0.0009819814303479266, |
| "loss": 2.6198, |
| "num_input_tokens_seen": 14994632256, |
| "step": 57200 |
| }, |
| { |
| "epoch": 0.27308394051778623, |
| "grad_norm": 0.21916711330413818, |
| "learning_rate": 0.00098045866097255, |
| "loss": 2.6019, |
| "num_input_tokens_seen": 15007739456, |
| "step": 57250 |
| }, |
| { |
| "epoch": 0.2733224417758804, |
| "grad_norm": 0.1925441473722458, |
| "learning_rate": 0.0009788754083424652, |
| "loss": 2.6143, |
| "num_input_tokens_seen": 15020846656, |
| "step": 57300 |
| }, |
| { |
| "epoch": 0.2735609430339745, |
| "grad_norm": 0.38578665256500244, |
| "learning_rate": 0.0009772318717677904, |
| "loss": 2.6037, |
| "num_input_tokens_seen": 15033953856, |
| "step": 57350 |
| }, |
| { |
| "epoch": 0.27379944429206865, |
| "grad_norm": 0.19650611281394958, |
| "learning_rate": 0.0009755282581475768, |
| "loss": 2.5745, |
| "num_input_tokens_seen": 15047061056, |
| "step": 57400 |
| }, |
| { |
| "epoch": 0.27403794555016275, |
| "grad_norm": 0.2376088798046112, |
| "learning_rate": 0.0009737647819437645, |
| "loss": 2.5968, |
| "num_input_tokens_seen": 15060168256, |
| "step": 57450 |
| }, |
| { |
| "epoch": 0.2742764468082569, |
| "grad_norm": 0.21746863424777985, |
| "learning_rate": 0.0009719416651541838, |
| "loss": 2.5965, |
| "num_input_tokens_seen": 15073275456, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.2742764468082569, |
| "eval_loss": 2.483751058578491, |
| "eval_runtime": 53.9622, |
| "eval_samples_per_second": 92.657, |
| "eval_steps_per_second": 23.164, |
| "num_input_tokens_seen": 15073275456, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.27451494806635107, |
| "grad_norm": 0.2898815870285034, |
| "learning_rate": 0.0009700591372846095, |
| "loss": 2.6105, |
| "num_input_tokens_seen": 15086382656, |
| "step": 57550 |
| }, |
| { |
| "epoch": 0.2747534493244452, |
| "grad_norm": 0.24887384474277496, |
| "learning_rate": 0.0009681174353198686, |
| "loss": 2.6103, |
| "num_input_tokens_seen": 15099489856, |
| "step": 57600 |
| }, |
| { |
| "epoch": 0.27499195058253934, |
| "grad_norm": 0.26613715291023254, |
| "learning_rate": 0.0009661168036940071, |
| "loss": 2.6296, |
| "num_input_tokens_seen": 15112597056, |
| "step": 57650 |
| }, |
| { |
| "epoch": 0.27523045184063344, |
| "grad_norm": 0.23983849585056305, |
| "learning_rate": 0.0009640574942595195, |
| "loss": 2.6008, |
| "num_input_tokens_seen": 15125704256, |
| "step": 57700 |
| }, |
| { |
| "epoch": 0.2754689530987276, |
| "grad_norm": 0.23169022798538208, |
| "learning_rate": 0.0009619397662556434, |
| "loss": 2.596, |
| "num_input_tokens_seen": 15138811456, |
| "step": 57750 |
| }, |
| { |
| "epoch": 0.27570745435682176, |
| "grad_norm": 0.21353812515735626, |
| "learning_rate": 0.0009597638862757254, |
| "loss": 2.6039, |
| "num_input_tokens_seen": 15151918656, |
| "step": 57800 |
| }, |
| { |
| "epoch": 0.27594595561491586, |
| "grad_norm": 0.2561227083206177, |
| "learning_rate": 0.00095753012823366, |
| "loss": 2.6046, |
| "num_input_tokens_seen": 15165025856, |
| "step": 57850 |
| }, |
| { |
| "epoch": 0.27618445687301, |
| "grad_norm": 0.20380394160747528, |
| "learning_rate": 0.000955238773329408, |
| "loss": 2.5968, |
| "num_input_tokens_seen": 15178133056, |
| "step": 57900 |
| }, |
| { |
| "epoch": 0.2764229581311041, |
| "grad_norm": 0.26447024941444397, |
| "learning_rate": 0.000952890110013597, |
| "loss": 2.5848, |
| "num_input_tokens_seen": 15191240256, |
| "step": 57950 |
| }, |
| { |
| "epoch": 0.2766614593891983, |
| "grad_norm": 0.23530781269073486, |
| "learning_rate": 0.0009504844339512095, |
| "loss": 2.582, |
| "num_input_tokens_seen": 15204347456, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.2766614593891983, |
| "eval_loss": 2.482050895690918, |
| "eval_runtime": 53.5775, |
| "eval_samples_per_second": 93.323, |
| "eval_steps_per_second": 23.331, |
| "num_input_tokens_seen": 15204347456, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.2768999606472924, |
| "grad_norm": 0.2281644344329834, |
| "learning_rate": 0.0009480220479843627, |
| "loss": 2.6212, |
| "num_input_tokens_seen": 15217454656, |
| "step": 58050 |
| }, |
| { |
| "epoch": 0.27713846190538655, |
| "grad_norm": 0.2181713730096817, |
| "learning_rate": 0.0009455032620941839, |
| "loss": 2.5927, |
| "num_input_tokens_seen": 15230561856, |
| "step": 58100 |
| }, |
| { |
| "epoch": 0.2773769631634807, |
| "grad_norm": 0.21573083102703094, |
| "learning_rate": 0.00094292839336179, |
| "loss": 2.6112, |
| "num_input_tokens_seen": 15243669056, |
| "step": 58150 |
| }, |
| { |
| "epoch": 0.2776154644215748, |
| "grad_norm": 0.2686486840248108, |
| "learning_rate": 0.000940297765928369, |
| "loss": 2.6133, |
| "num_input_tokens_seen": 15256776256, |
| "step": 58200 |
| }, |
| { |
| "epoch": 0.27785396567966897, |
| "grad_norm": 0.2320137470960617, |
| "learning_rate": 0.0009376117109543769, |
| "loss": 2.6094, |
| "num_input_tokens_seen": 15269883456, |
| "step": 58250 |
| }, |
| { |
| "epoch": 0.27809246693776307, |
| "grad_norm": 0.22277672588825226, |
| "learning_rate": 0.0009348705665778478, |
| "loss": 2.5885, |
| "num_input_tokens_seen": 15282990656, |
| "step": 58300 |
| }, |
| { |
| "epoch": 0.27833096819585723, |
| "grad_norm": 0.22681231796741486, |
| "learning_rate": 0.0009320746778718274, |
| "loss": 2.6005, |
| "num_input_tokens_seen": 15296097856, |
| "step": 58350 |
| }, |
| { |
| "epoch": 0.2785694694539514, |
| "grad_norm": 0.25187453627586365, |
| "learning_rate": 0.000929224396800933, |
| "loss": 2.5944, |
| "num_input_tokens_seen": 15309205056, |
| "step": 58400 |
| }, |
| { |
| "epoch": 0.2788079707120455, |
| "grad_norm": 0.24962358176708221, |
| "learning_rate": 0.0009263200821770461, |
| "loss": 2.5888, |
| "num_input_tokens_seen": 15322312256, |
| "step": 58450 |
| }, |
| { |
| "epoch": 0.27904647197013965, |
| "grad_norm": 0.18929679691791534, |
| "learning_rate": 0.0009233620996141421, |
| "loss": 2.5927, |
| "num_input_tokens_seen": 15335419456, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.27904647197013965, |
| "eval_loss": 2.4754066467285156, |
| "eval_runtime": 53.7558, |
| "eval_samples_per_second": 93.013, |
| "eval_steps_per_second": 23.253, |
| "num_input_tokens_seen": 15335419456, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.27928497322823376, |
| "grad_norm": 0.22240912914276123, |
| "learning_rate": 0.0009203508214822651, |
| "loss": 2.5944, |
| "num_input_tokens_seen": 15348526656, |
| "step": 58550 |
| }, |
| { |
| "epoch": 0.2795234744863279, |
| "grad_norm": 0.2096235305070877, |
| "learning_rate": 0.0009172866268606513, |
| "loss": 2.5964, |
| "num_input_tokens_seen": 15361633856, |
| "step": 58600 |
| }, |
| { |
| "epoch": 0.2797619757444221, |
| "grad_norm": 0.2913396954536438, |
| "learning_rate": 0.0009141699014900082, |
| "loss": 2.5975, |
| "num_input_tokens_seen": 15374741056, |
| "step": 58650 |
| }, |
| { |
| "epoch": 0.2800004770025162, |
| "grad_norm": 0.21000444889068604, |
| "learning_rate": 0.0009110010377239551, |
| "loss": 2.5987, |
| "num_input_tokens_seen": 15387848256, |
| "step": 58700 |
| }, |
| { |
| "epoch": 0.28023897826061034, |
| "grad_norm": 0.18561489880084991, |
| "learning_rate": 0.0009077804344796301, |
| "loss": 2.5955, |
| "num_input_tokens_seen": 15400955456, |
| "step": 58750 |
| }, |
| { |
| "epoch": 0.28047747951870444, |
| "grad_norm": 0.330816388130188, |
| "learning_rate": 0.0009045084971874737, |
| "loss": 2.5837, |
| "num_input_tokens_seen": 15414062656, |
| "step": 58800 |
| }, |
| { |
| "epoch": 0.2807159807767986, |
| "grad_norm": 0.21823953092098236, |
| "learning_rate": 0.000901185637740189, |
| "loss": 2.5921, |
| "num_input_tokens_seen": 15427169856, |
| "step": 58850 |
| }, |
| { |
| "epoch": 0.28095448203489276, |
| "grad_norm": 0.28721505403518677, |
| "learning_rate": 0.0008978122744408905, |
| "loss": 2.5893, |
| "num_input_tokens_seen": 15440277056, |
| "step": 58900 |
| }, |
| { |
| "epoch": 0.28119298329298686, |
| "grad_norm": 0.2468225359916687, |
| "learning_rate": 0.0008943888319504456, |
| "loss": 2.5999, |
| "num_input_tokens_seen": 15453384256, |
| "step": 58950 |
| }, |
| { |
| "epoch": 0.281431484551081, |
| "grad_norm": 0.20486761629581451, |
| "learning_rate": 0.000890915741234015, |
| "loss": 2.6026, |
| "num_input_tokens_seen": 15466491456, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.281431484551081, |
| "eval_loss": 2.4756667613983154, |
| "eval_runtime": 53.3408, |
| "eval_samples_per_second": 93.737, |
| "eval_steps_per_second": 23.434, |
| "num_input_tokens_seen": 15466491456, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.2816699858091751, |
| "grad_norm": 0.3338637351989746, |
| "learning_rate": 0.0008873934395068005, |
| "loss": 2.587, |
| "num_input_tokens_seen": 15479598656, |
| "step": 59050 |
| }, |
| { |
| "epoch": 0.2819084870672693, |
| "grad_norm": 0.20848780870437622, |
| "learning_rate": 0.0008838223701790055, |
| "loss": 2.5989, |
| "num_input_tokens_seen": 15492705856, |
| "step": 59100 |
| }, |
| { |
| "epoch": 0.2821469883253634, |
| "grad_norm": 0.21479378640651703, |
| "learning_rate": 0.0008802029828000156, |
| "loss": 2.6052, |
| "num_input_tokens_seen": 15505813056, |
| "step": 59150 |
| }, |
| { |
| "epoch": 0.28238548958345755, |
| "grad_norm": 0.1944151073694229, |
| "learning_rate": 0.0008765357330018055, |
| "loss": 2.6044, |
| "num_input_tokens_seen": 15518920256, |
| "step": 59200 |
| }, |
| { |
| "epoch": 0.2826239908415517, |
| "grad_norm": 0.2078033685684204, |
| "learning_rate": 0.0008728210824415827, |
| "loss": 2.5929, |
| "num_input_tokens_seen": 15532027456, |
| "step": 59250 |
| }, |
| { |
| "epoch": 0.2828624920996458, |
| "grad_norm": 0.19340284168720245, |
| "learning_rate": 0.0008690594987436704, |
| "loss": 2.5875, |
| "num_input_tokens_seen": 15545134656, |
| "step": 59300 |
| }, |
| { |
| "epoch": 0.28310099335773997, |
| "grad_norm": 0.22354012727737427, |
| "learning_rate": 0.0008652514554406388, |
| "loss": 2.5976, |
| "num_input_tokens_seen": 15558241856, |
| "step": 59350 |
| }, |
| { |
| "epoch": 0.2833394946158341, |
| "grad_norm": 0.26784005761146545, |
| "learning_rate": 0.0008613974319136957, |
| "loss": 2.5868, |
| "num_input_tokens_seen": 15571349056, |
| "step": 59400 |
| }, |
| { |
| "epoch": 0.28357799587392823, |
| "grad_norm": 0.20749828219413757, |
| "learning_rate": 0.0008574979133323377, |
| "loss": 2.5784, |
| "num_input_tokens_seen": 15584456256, |
| "step": 59450 |
| }, |
| { |
| "epoch": 0.2838164971320224, |
| "grad_norm": 0.21545729041099548, |
| "learning_rate": 0.0008535533905932737, |
| "loss": 2.5939, |
| "num_input_tokens_seen": 15597563456, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.2838164971320224, |
| "eval_loss": 2.469989538192749, |
| "eval_runtime": 54.0784, |
| "eval_samples_per_second": 92.458, |
| "eval_steps_per_second": 23.115, |
| "num_input_tokens_seen": 15597563456, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.2840549983901165, |
| "grad_norm": 0.20836423337459564, |
| "learning_rate": 0.0008495643602586287, |
| "loss": 2.5858, |
| "num_input_tokens_seen": 15610670656, |
| "step": 59550 |
| }, |
| { |
| "epoch": 0.28429349964821066, |
| "grad_norm": 0.20427604019641876, |
| "learning_rate": 0.0008455313244934324, |
| "loss": 2.5781, |
| "num_input_tokens_seen": 15623777856, |
| "step": 59600 |
| }, |
| { |
| "epoch": 0.28453200090630476, |
| "grad_norm": 0.2341683804988861, |
| "learning_rate": 0.0008414547910024035, |
| "loss": 2.5713, |
| "num_input_tokens_seen": 15636885056, |
| "step": 59650 |
| }, |
| { |
| "epoch": 0.2847705021643989, |
| "grad_norm": 0.20808522403240204, |
| "learning_rate": 0.0008373352729660373, |
| "loss": 2.5751, |
| "num_input_tokens_seen": 15649992256, |
| "step": 59700 |
| }, |
| { |
| "epoch": 0.2850090034224931, |
| "grad_norm": 0.21032562851905823, |
| "learning_rate": 0.000833173288976002, |
| "loss": 2.5784, |
| "num_input_tokens_seen": 15663099456, |
| "step": 59750 |
| }, |
| { |
| "epoch": 0.2852475046805872, |
| "grad_norm": 0.23485584557056427, |
| "learning_rate": 0.0008289693629698564, |
| "loss": 2.5974, |
| "num_input_tokens_seen": 15676206656, |
| "step": 59800 |
| }, |
| { |
| "epoch": 0.28548600593868134, |
| "grad_norm": 0.2229880541563034, |
| "learning_rate": 0.0008247240241650918, |
| "loss": 2.5834, |
| "num_input_tokens_seen": 15689313856, |
| "step": 59850 |
| }, |
| { |
| "epoch": 0.28572450719677545, |
| "grad_norm": 0.21837118268013, |
| "learning_rate": 0.000820437806992512, |
| "loss": 2.5734, |
| "num_input_tokens_seen": 15702421056, |
| "step": 59900 |
| }, |
| { |
| "epoch": 0.2859630084548696, |
| "grad_norm": 0.2157929688692093, |
| "learning_rate": 0.0008161112510289549, |
| "loss": 2.587, |
| "num_input_tokens_seen": 15715528256, |
| "step": 59950 |
| }, |
| { |
| "epoch": 0.2862015097129637, |
| "grad_norm": 0.24053893983364105, |
| "learning_rate": 0.0008117449009293668, |
| "loss": 2.5853, |
| "num_input_tokens_seen": 15728635456, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.2862015097129637, |
| "eval_loss": 2.470459461212158, |
| "eval_runtime": 53.5859, |
| "eval_samples_per_second": 93.308, |
| "eval_steps_per_second": 23.327, |
| "num_input_tokens_seen": 15728635456, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.28644001097105787, |
| "grad_norm": 0.25951045751571655, |
| "learning_rate": 0.0008073393063582386, |
| "loss": 2.5946, |
| "num_input_tokens_seen": 15741742656, |
| "step": 60050 |
| }, |
| { |
| "epoch": 0.286678512229152, |
| "grad_norm": 0.22712726891040802, |
| "learning_rate": 0.00080289502192041, |
| "loss": 2.5882, |
| "num_input_tokens_seen": 15754849856, |
| "step": 60100 |
| }, |
| { |
| "epoch": 0.28691701348724613, |
| "grad_norm": 0.2236946076154709, |
| "learning_rate": 0.0007984126070912518, |
| "loss": 2.5854, |
| "num_input_tokens_seen": 15767957056, |
| "step": 60150 |
| }, |
| { |
| "epoch": 0.2871555147453403, |
| "grad_norm": 0.3175867795944214, |
| "learning_rate": 0.0007938926261462366, |
| "loss": 2.5855, |
| "num_input_tokens_seen": 15781064256, |
| "step": 60200 |
| }, |
| { |
| "epoch": 0.2873940160034344, |
| "grad_norm": 0.22954128682613373, |
| "learning_rate": 0.000789335648089903, |
| "loss": 2.595, |
| "num_input_tokens_seen": 15794171456, |
| "step": 60250 |
| }, |
| { |
| "epoch": 0.28763251726152855, |
| "grad_norm": 0.23379147052764893, |
| "learning_rate": 0.000784742246584226, |
| "loss": 2.5872, |
| "num_input_tokens_seen": 15807278656, |
| "step": 60300 |
| }, |
| { |
| "epoch": 0.2878710185196227, |
| "grad_norm": 0.22107115387916565, |
| "learning_rate": 0.0007801129998764014, |
| "loss": 2.5704, |
| "num_input_tokens_seen": 15820385856, |
| "step": 60350 |
| }, |
| { |
| "epoch": 0.2881095197777168, |
| "grad_norm": 0.21197494864463806, |
| "learning_rate": 0.0007754484907260512, |
| "loss": 2.5751, |
| "num_input_tokens_seen": 15833493056, |
| "step": 60400 |
| }, |
| { |
| "epoch": 0.288348021035811, |
| "grad_norm": 0.21372662484645844, |
| "learning_rate": 0.0007707493063318629, |
| "loss": 2.5901, |
| "num_input_tokens_seen": 15846600256, |
| "step": 60450 |
| }, |
| { |
| "epoch": 0.2885865222939051, |
| "grad_norm": 0.23300603032112122, |
| "learning_rate": 0.0007660160382576683, |
| "loss": 2.5888, |
| "num_input_tokens_seen": 15859707456, |
| "step": 60500 |
| }, |
| { |
| "epoch": 0.2885865222939051, |
| "eval_loss": 2.463745355606079, |
| "eval_runtime": 53.032, |
| "eval_samples_per_second": 94.283, |
| "eval_steps_per_second": 23.571, |
| "num_input_tokens_seen": 15859707456, |
| "step": 60500 |
| }, |
| { |
| "epoch": 0.28882502355199924, |
| "grad_norm": 0.2108684778213501, |
| "learning_rate": 0.0007612492823579744, |
| "loss": 2.5965, |
| "num_input_tokens_seen": 15872814656, |
| "step": 60550 |
| }, |
| { |
| "epoch": 0.2890635248100934, |
| "grad_norm": 0.20625820755958557, |
| "learning_rate": 0.0007564496387029531, |
| "loss": 2.5615, |
| "num_input_tokens_seen": 15885921856, |
| "step": 60600 |
| }, |
| { |
| "epoch": 0.2893020260681875, |
| "grad_norm": 0.22595694661140442, |
| "learning_rate": 0.0007516177115029001, |
| "loss": 2.5871, |
| "num_input_tokens_seen": 15899029056, |
| "step": 60650 |
| }, |
| { |
| "epoch": 0.28954052732628166, |
| "grad_norm": 0.2095574140548706, |
| "learning_rate": 0.0007467541090321735, |
| "loss": 2.5867, |
| "num_input_tokens_seen": 15912136256, |
| "step": 60700 |
| }, |
| { |
| "epoch": 0.28977902858437576, |
| "grad_norm": 0.1979990303516388, |
| "learning_rate": 0.00074185944355262, |
| "loss": 2.586, |
| "num_input_tokens_seen": 15925243456, |
| "step": 60750 |
| }, |
| { |
| "epoch": 0.2900175298424699, |
| "grad_norm": 0.3573000431060791, |
| "learning_rate": 0.0007369343312364993, |
| "loss": 2.5807, |
| "num_input_tokens_seen": 15938350656, |
| "step": 60800 |
| }, |
| { |
| "epoch": 0.2902560311005641, |
| "grad_norm": 0.2209523618221283, |
| "learning_rate": 0.0007319793920889171, |
| "loss": 2.5867, |
| "num_input_tokens_seen": 15951457856, |
| "step": 60850 |
| }, |
| { |
| "epoch": 0.2904945323586582, |
| "grad_norm": 0.1979866325855255, |
| "learning_rate": 0.0007269952498697733, |
| "loss": 2.5679, |
| "num_input_tokens_seen": 15964565056, |
| "step": 60900 |
| }, |
| { |
| "epoch": 0.29073303361675235, |
| "grad_norm": 0.2013344019651413, |
| "learning_rate": 0.0007219825320152411, |
| "loss": 2.5842, |
| "num_input_tokens_seen": 15977672256, |
| "step": 60950 |
| }, |
| { |
| "epoch": 0.29097153487484645, |
| "grad_norm": 0.20511233806610107, |
| "learning_rate": 0.0007169418695587791, |
| "loss": 2.5864, |
| "num_input_tokens_seen": 15990779456, |
| "step": 61000 |
| }, |
| { |
| "epoch": 0.29097153487484645, |
| "eval_loss": 2.4598097801208496, |
| "eval_runtime": 53.5493, |
| "eval_samples_per_second": 93.372, |
| "eval_steps_per_second": 23.343, |
| "num_input_tokens_seen": 15990779456, |
| "step": 61000 |
| }, |
| { |
| "epoch": 0.2912100361329406, |
| "grad_norm": 0.19767510890960693, |
| "learning_rate": 0.0007118738970516943, |
| "loss": 2.5963, |
| "num_input_tokens_seen": 16003886656, |
| "step": 61050 |
| }, |
| { |
| "epoch": 0.2914485373910347, |
| "grad_norm": 0.21463529765605927, |
| "learning_rate": 0.0007067792524832604, |
| "loss": 2.5825, |
| "num_input_tokens_seen": 16016993856, |
| "step": 61100 |
| }, |
| { |
| "epoch": 0.29168703864912887, |
| "grad_norm": 0.2011532485485077, |
| "learning_rate": 0.0007016585772004026, |
| "loss": 2.5783, |
| "num_input_tokens_seen": 16030101056, |
| "step": 61150 |
| }, |
| { |
| "epoch": 0.29192553990722303, |
| "grad_norm": 0.19351401925086975, |
| "learning_rate": 0.0006965125158269618, |
| "loss": 2.5619, |
| "num_input_tokens_seen": 16043208256, |
| "step": 61200 |
| }, |
| { |
| "epoch": 0.29216404116531713, |
| "grad_norm": 0.1988568902015686, |
| "learning_rate": 0.000691341716182545, |
| "loss": 2.6007, |
| "num_input_tokens_seen": 16056315456, |
| "step": 61250 |
| }, |
| { |
| "epoch": 0.2924025424234113, |
| "grad_norm": 0.20459413528442383, |
| "learning_rate": 0.0006861468292009726, |
| "loss": 2.5762, |
| "num_input_tokens_seen": 16069422656, |
| "step": 61300 |
| }, |
| { |
| "epoch": 0.2926410436815054, |
| "grad_norm": 0.1914205551147461, |
| "learning_rate": 0.0006809285088483361, |
| "loss": 2.5734, |
| "num_input_tokens_seen": 16082529856, |
| "step": 61350 |
| }, |
| { |
| "epoch": 0.29287954493959956, |
| "grad_norm": 0.194325253367424, |
| "learning_rate": 0.0006756874120406714, |
| "loss": 2.5874, |
| "num_input_tokens_seen": 16095637056, |
| "step": 61400 |
| }, |
| { |
| "epoch": 0.2931180461976937, |
| "grad_norm": 0.20854853093624115, |
| "learning_rate": 0.0006704241985612625, |
| "loss": 2.5865, |
| "num_input_tokens_seen": 16108744256, |
| "step": 61450 |
| }, |
| { |
| "epoch": 0.2933565474557878, |
| "grad_norm": 0.190395787358284, |
| "learning_rate": 0.0006651395309775837, |
| "loss": 2.5716, |
| "num_input_tokens_seen": 16121851456, |
| "step": 61500 |
| }, |
| { |
| "epoch": 0.2933565474557878, |
| "eval_loss": 2.4551966190338135, |
| "eval_runtime": 53.3343, |
| "eval_samples_per_second": 93.748, |
| "eval_steps_per_second": 23.437, |
| "num_input_tokens_seen": 16121851456, |
| "step": 61500 |
| }, |
| { |
| "epoch": 0.293595048713882, |
| "grad_norm": 0.20652073621749878, |
| "learning_rate": 0.0006598340745578908, |
| "loss": 2.5765, |
| "num_input_tokens_seen": 16134958656, |
| "step": 61550 |
| }, |
| { |
| "epoch": 0.2938335499719761, |
| "grad_norm": 0.20701836049556732, |
| "learning_rate": 0.0006545084971874737, |
| "loss": 2.5653, |
| "num_input_tokens_seen": 16148065856, |
| "step": 61600 |
| }, |
| { |
| "epoch": 0.29407205123007024, |
| "grad_norm": 0.1792392134666443, |
| "learning_rate": 0.000649163469284578, |
| "loss": 2.577, |
| "num_input_tokens_seen": 16161173056, |
| "step": 61650 |
| }, |
| { |
| "epoch": 0.2943105524881644, |
| "grad_norm": 0.21742790937423706, |
| "learning_rate": 0.0006437996637160086, |
| "loss": 2.574, |
| "num_input_tokens_seen": 16174280256, |
| "step": 61700 |
| }, |
| { |
| "epoch": 0.2945490537462585, |
| "grad_norm": 0.20747682452201843, |
| "learning_rate": 0.0006384177557124247, |
| "loss": 2.564, |
| "num_input_tokens_seen": 16187387456, |
| "step": 61750 |
| }, |
| { |
| "epoch": 0.29478755500435266, |
| "grad_norm": 0.19990311563014984, |
| "learning_rate": 0.0006330184227833376, |
| "loss": 2.5866, |
| "num_input_tokens_seen": 16200494656, |
| "step": 61800 |
| }, |
| { |
| "epoch": 0.29502605626244677, |
| "grad_norm": 0.20410317182540894, |
| "learning_rate": 0.0006276023446318213, |
| "loss": 2.5559, |
| "num_input_tokens_seen": 16213601856, |
| "step": 61850 |
| }, |
| { |
| "epoch": 0.2952645575205409, |
| "grad_norm": 0.19365034997463226, |
| "learning_rate": 0.000622170203068947, |
| "loss": 2.5705, |
| "num_input_tokens_seen": 16226709056, |
| "step": 61900 |
| }, |
| { |
| "epoch": 0.29550305877863503, |
| "grad_norm": 0.2115161269903183, |
| "learning_rate": 0.0006167226819279528, |
| "loss": 2.5621, |
| "num_input_tokens_seen": 16239816256, |
| "step": 61950 |
| }, |
| { |
| "epoch": 0.2957415600367292, |
| "grad_norm": 0.22992485761642456, |
| "learning_rate": 0.0006112604669781572, |
| "loss": 2.5587, |
| "num_input_tokens_seen": 16252923456, |
| "step": 62000 |
| }, |
| { |
| "epoch": 0.2957415600367292, |
| "eval_loss": 2.452096462249756, |
| "eval_runtime": 53.6354, |
| "eval_samples_per_second": 93.222, |
| "eval_steps_per_second": 23.306, |
| "num_input_tokens_seen": 16252923456, |
| "step": 62000 |
| }, |
| { |
| "epoch": 0.29598006129482335, |
| "grad_norm": 0.1945638656616211, |
| "learning_rate": 0.0006057842458386314, |
| "loss": 2.5582, |
| "num_input_tokens_seen": 16266030656, |
| "step": 62050 |
| }, |
| { |
| "epoch": 0.29621856255291745, |
| "grad_norm": 0.201882466673851, |
| "learning_rate": 0.0006002947078916364, |
| "loss": 2.5764, |
| "num_input_tokens_seen": 16279137856, |
| "step": 62100 |
| }, |
| { |
| "epoch": 0.2964570638110116, |
| "grad_norm": 0.2137998789548874, |
| "learning_rate": 0.0005947925441958392, |
| "loss": 2.5689, |
| "num_input_tokens_seen": 16292245056, |
| "step": 62150 |
| }, |
| { |
| "epoch": 0.2966955650691057, |
| "grad_norm": 0.18265672028064728, |
| "learning_rate": 0.0005892784473993184, |
| "loss": 2.5741, |
| "num_input_tokens_seen": 16305352256, |
| "step": 62200 |
| }, |
| { |
| "epoch": 0.2969340663271999, |
| "grad_norm": 0.16944251954555511, |
| "learning_rate": 0.0005837531116523682, |
| "loss": 2.5537, |
| "num_input_tokens_seen": 16318459456, |
| "step": 62250 |
| }, |
| { |
| "epoch": 0.29717256758529403, |
| "grad_norm": 0.20273485779762268, |
| "learning_rate": 0.0005782172325201155, |
| "loss": 2.5512, |
| "num_input_tokens_seen": 16331566656, |
| "step": 62300 |
| }, |
| { |
| "epoch": 0.29741106884338814, |
| "grad_norm": 0.19320476055145264, |
| "learning_rate": 0.0005726715068949564, |
| "loss": 2.5823, |
| "num_input_tokens_seen": 16344673856, |
| "step": 62350 |
| }, |
| { |
| "epoch": 0.2976495701014823, |
| "grad_norm": 0.21321871876716614, |
| "learning_rate": 0.0005671166329088278, |
| "loss": 2.5608, |
| "num_input_tokens_seen": 16357781056, |
| "step": 62400 |
| }, |
| { |
| "epoch": 0.2978880713595764, |
| "grad_norm": 0.2007117122411728, |
| "learning_rate": 0.0005615533098453215, |
| "loss": 2.5685, |
| "num_input_tokens_seen": 16370888256, |
| "step": 62450 |
| }, |
| { |
| "epoch": 0.29812657261767056, |
| "grad_norm": 0.1896267682313919, |
| "learning_rate": 0.0005559822380516539, |
| "loss": 2.56, |
| "num_input_tokens_seen": 16383995456, |
| "step": 62500 |
| }, |
| { |
| "epoch": 0.29812657261767056, |
| "eval_loss": 2.448042154312134, |
| "eval_runtime": 54.1994, |
| "eval_samples_per_second": 92.252, |
| "eval_steps_per_second": 23.063, |
| "num_input_tokens_seen": 16383995456, |
| "step": 62500 |
| }, |
| { |
| "epoch": 0.2983650738757647, |
| "grad_norm": 0.18581034243106842, |
| "learning_rate": 0.0005504041188505022, |
| "loss": 2.5691, |
| "num_input_tokens_seen": 16397102656, |
| "step": 62550 |
| }, |
| { |
| "epoch": 0.2986035751338588, |
| "grad_norm": 0.19272533059120178, |
| "learning_rate": 0.0005448196544517168, |
| "loss": 2.5635, |
| "num_input_tokens_seen": 16410209856, |
| "step": 62600 |
| }, |
| { |
| "epoch": 0.298842076391953, |
| "grad_norm": 0.19940300285816193, |
| "learning_rate": 0.0005392295478639225, |
| "loss": 2.5755, |
| "num_input_tokens_seen": 16423317056, |
| "step": 62650 |
| }, |
| { |
| "epoch": 0.2990805776500471, |
| "grad_norm": 0.18894875049591064, |
| "learning_rate": 0.0005336345028060199, |
| "loss": 2.5718, |
| "num_input_tokens_seen": 16436424256, |
| "step": 62700 |
| }, |
| { |
| "epoch": 0.29931907890814125, |
| "grad_norm": 0.19226962327957153, |
| "learning_rate": 0.0005280352236185959, |
| "loss": 2.563, |
| "num_input_tokens_seen": 16449531456, |
| "step": 62750 |
| }, |
| { |
| "epoch": 0.2995575801662354, |
| "grad_norm": 0.20716702938079834, |
| "learning_rate": 0.0005224324151752575, |
| "loss": 2.5532, |
| "num_input_tokens_seen": 16462638656, |
| "step": 62800 |
| }, |
| { |
| "epoch": 0.2997960814243295, |
| "grad_norm": 0.20232325792312622, |
| "learning_rate": 0.000516826782793897, |
| "loss": 2.5691, |
| "num_input_tokens_seen": 16475745856, |
| "step": 62850 |
| }, |
| { |
| "epoch": 0.30003458268242367, |
| "grad_norm": 0.19828926026821136, |
| "learning_rate": 0.0005112190321479025, |
| "loss": 2.5602, |
| "num_input_tokens_seen": 16488853056, |
| "step": 62900 |
| }, |
| { |
| "epoch": 0.30027308394051777, |
| "grad_norm": 0.22366905212402344, |
| "learning_rate": 0.000505609869177323, |
| "loss": 2.5556, |
| "num_input_tokens_seen": 16501960256, |
| "step": 62950 |
| }, |
| { |
| "epoch": 0.30051158519861193, |
| "grad_norm": 0.1883884221315384, |
| "learning_rate": 0.0005, |
| "loss": 2.5567, |
| "num_input_tokens_seen": 16515067456, |
| "step": 63000 |
| }, |
| { |
| "epoch": 0.30051158519861193, |
| "eval_loss": 2.4441678524017334, |
| "eval_runtime": 54.2448, |
| "eval_samples_per_second": 92.175, |
| "eval_steps_per_second": 23.044, |
| "num_input_tokens_seen": 16515067456, |
| "step": 63000 |
| }, |
| { |
| "epoch": 0.30075008645670603, |
| "grad_norm": 0.20152603089809418, |
| "learning_rate": 0.0004943901308226771, |
| "loss": 2.5562, |
| "num_input_tokens_seen": 16528174656, |
| "step": 63050 |
| }, |
| { |
| "epoch": 0.3009885877148002, |
| "grad_norm": 0.18534454703330994, |
| "learning_rate": 0.0004887809678520976, |
| "loss": 2.5559, |
| "num_input_tokens_seen": 16541281856, |
| "step": 63100 |
| }, |
| { |
| "epoch": 0.30122708897289435, |
| "grad_norm": 0.18770301342010498, |
| "learning_rate": 0.0004831732172061032, |
| "loss": 2.5538, |
| "num_input_tokens_seen": 16554389056, |
| "step": 63150 |
| }, |
| { |
| "epoch": 0.30146559023098846, |
| "grad_norm": 0.19565705955028534, |
| "learning_rate": 0.0004775675848247427, |
| "loss": 2.5593, |
| "num_input_tokens_seen": 16567496256, |
| "step": 63200 |
| }, |
| { |
| "epoch": 0.3017040914890826, |
| "grad_norm": 0.1954822540283203, |
| "learning_rate": 0.00047196477638140405, |
| "loss": 2.5694, |
| "num_input_tokens_seen": 16580603456, |
| "step": 63250 |
| }, |
| { |
| "epoch": 0.3019425927471767, |
| "grad_norm": 0.18120840191841125, |
| "learning_rate": 0.0004663654971939802, |
| "loss": 2.5622, |
| "num_input_tokens_seen": 16593710656, |
| "step": 63300 |
| }, |
| { |
| "epoch": 0.3021810940052709, |
| "grad_norm": 0.18100927770137787, |
| "learning_rate": 0.0004607704521360776, |
| "loss": 2.5437, |
| "num_input_tokens_seen": 16606817856, |
| "step": 63350 |
| }, |
| { |
| "epoch": 0.30241959526336504, |
| "grad_norm": 0.20565176010131836, |
| "learning_rate": 0.0004551803455482833, |
| "loss": 2.5463, |
| "num_input_tokens_seen": 16619925056, |
| "step": 63400 |
| }, |
| { |
| "epoch": 0.30265809652145914, |
| "grad_norm": 0.18989761173725128, |
| "learning_rate": 0.0004495958811494978, |
| "loss": 2.5609, |
| "num_input_tokens_seen": 16633032256, |
| "step": 63450 |
| }, |
| { |
| "epoch": 0.3028965977795533, |
| "grad_norm": 0.1870686262845993, |
| "learning_rate": 0.0004440177619483461, |
| "loss": 2.5554, |
| "num_input_tokens_seen": 16646139456, |
| "step": 63500 |
| }, |
| { |
| "epoch": 0.3028965977795533, |
| "eval_loss": 2.4395649433135986, |
| "eval_runtime": 53.4665, |
| "eval_samples_per_second": 93.516, |
| "eval_steps_per_second": 23.379, |
| "num_input_tokens_seen": 16646139456, |
| "step": 63500 |
| }, |
| { |
| "epoch": 0.3031350990376474, |
| "grad_norm": 0.1891048699617386, |
| "learning_rate": 0.00043844669015467863, |
| "loss": 2.5627, |
| "num_input_tokens_seen": 16659246656, |
| "step": 63550 |
| }, |
| { |
| "epoch": 0.30337360029574156, |
| "grad_norm": 0.18591411411762238, |
| "learning_rate": 0.0004328833670911724, |
| "loss": 2.5545, |
| "num_input_tokens_seen": 16672353856, |
| "step": 63600 |
| }, |
| { |
| "epoch": 0.3036121015538357, |
| "grad_norm": 0.18640951812267303, |
| "learning_rate": 0.0004273284931050438, |
| "loss": 2.5672, |
| "num_input_tokens_seen": 16685461056, |
| "step": 63650 |
| }, |
| { |
| "epoch": 0.3038506028119298, |
| "grad_norm": 0.1919756680727005, |
| "learning_rate": 0.0004217827674798845, |
| "loss": 2.5492, |
| "num_input_tokens_seen": 16698568256, |
| "step": 63700 |
| }, |
| { |
| "epoch": 0.304089104070024, |
| "grad_norm": 0.18388938903808594, |
| "learning_rate": 0.00041624688834763184, |
| "loss": 2.5487, |
| "num_input_tokens_seen": 16711675456, |
| "step": 63750 |
| }, |
| { |
| "epoch": 0.3043276053281181, |
| "grad_norm": 0.1851562261581421, |
| "learning_rate": 0.0004107215526006817, |
| "loss": 2.5539, |
| "num_input_tokens_seen": 16724782656, |
| "step": 63800 |
| }, |
| { |
| "epoch": 0.30456610658621225, |
| "grad_norm": 0.17315496504306793, |
| "learning_rate": 0.0004052074558041608, |
| "loss": 2.5544, |
| "num_input_tokens_seen": 16737889856, |
| "step": 63850 |
| }, |
| { |
| "epoch": 0.30480460784430635, |
| "grad_norm": 0.17985352873802185, |
| "learning_rate": 0.00039970529210836363, |
| "loss": 2.5511, |
| "num_input_tokens_seen": 16750997056, |
| "step": 63900 |
| }, |
| { |
| "epoch": 0.3050431091024005, |
| "grad_norm": 0.20455212891101837, |
| "learning_rate": 0.0003942157541613686, |
| "loss": 2.5593, |
| "num_input_tokens_seen": 16764104256, |
| "step": 63950 |
| }, |
| { |
| "epoch": 0.30528161036049467, |
| "grad_norm": 0.1965632140636444, |
| "learning_rate": 0.00038873953302184284, |
| "loss": 2.5599, |
| "num_input_tokens_seen": 16777211456, |
| "step": 64000 |
| }, |
| { |
| "epoch": 0.30528161036049467, |
| "eval_loss": 2.437380790710449, |
| "eval_runtime": 53.2524, |
| "eval_samples_per_second": 93.893, |
| "eval_steps_per_second": 23.473, |
| "num_input_tokens_seen": 16777211456, |
| "step": 64000 |
| }, |
| { |
| "epoch": 0.3055201116185888, |
| "grad_norm": 0.1703004688024521, |
| "learning_rate": 0.00038327731807204744, |
| "loss": 2.5506, |
| "num_input_tokens_seen": 16790318656, |
| "step": 64050 |
| }, |
| { |
| "epoch": 0.30575861287668293, |
| "grad_norm": 0.19769616425037384, |
| "learning_rate": 0.00037782979693105293, |
| "loss": 2.542, |
| "num_input_tokens_seen": 16803425856, |
| "step": 64100 |
| }, |
| { |
| "epoch": 0.30599711413477704, |
| "grad_norm": 0.20674961805343628, |
| "learning_rate": 0.00037239765536817873, |
| "loss": 2.539, |
| "num_input_tokens_seen": 16816533056, |
| "step": 64150 |
| }, |
| { |
| "epoch": 0.3062356153928712, |
| "grad_norm": 0.19121839106082916, |
| "learning_rate": 0.0003669815772166625, |
| "loss": 2.5573, |
| "num_input_tokens_seen": 16829640256, |
| "step": 64200 |
| }, |
| { |
| "epoch": 0.30647411665096536, |
| "grad_norm": 0.1734025925397873, |
| "learning_rate": 0.00036158224428757535, |
| "loss": 2.5416, |
| "num_input_tokens_seen": 16842747456, |
| "step": 64250 |
| }, |
| { |
| "epoch": 0.30671261790905946, |
| "grad_norm": 0.1857634037733078, |
| "learning_rate": 0.0003562003362839914, |
| "loss": 2.5652, |
| "num_input_tokens_seen": 16855854656, |
| "step": 64300 |
| }, |
| { |
| "epoch": 0.3069511191671536, |
| "grad_norm": 0.17733143270015717, |
| "learning_rate": 0.000350836530715422, |
| "loss": 2.5299, |
| "num_input_tokens_seen": 16868961856, |
| "step": 64350 |
| }, |
| { |
| "epoch": 0.3071896204252477, |
| "grad_norm": 0.18323005735874176, |
| "learning_rate": 0.00034549150281252633, |
| "loss": 2.5691, |
| "num_input_tokens_seen": 16882069056, |
| "step": 64400 |
| }, |
| { |
| "epoch": 0.3074281216833419, |
| "grad_norm": 0.18570365011692047, |
| "learning_rate": 0.00034016592544210936, |
| "loss": 2.5436, |
| "num_input_tokens_seen": 16895176256, |
| "step": 64450 |
| }, |
| { |
| "epoch": 0.30766662294143604, |
| "grad_norm": 0.18571798503398895, |
| "learning_rate": 0.00033486046902241664, |
| "loss": 2.5382, |
| "num_input_tokens_seen": 16908283456, |
| "step": 64500 |
| }, |
| { |
| "epoch": 0.30766662294143604, |
| "eval_loss": 2.4323015213012695, |
| "eval_runtime": 53.7237, |
| "eval_samples_per_second": 93.069, |
| "eval_steps_per_second": 23.267, |
| "num_input_tokens_seen": 16908283456, |
| "step": 64500 |
| }, |
| { |
| "epoch": 0.30790512419953014, |
| "grad_norm": 0.1829528957605362, |
| "learning_rate": 0.0003295758014387375, |
| "loss": 2.5453, |
| "num_input_tokens_seen": 16921390656, |
| "step": 64550 |
| }, |
| { |
| "epoch": 0.3081436254576243, |
| "grad_norm": 0.1703086644411087, |
| "learning_rate": 0.0003243125879593286, |
| "loss": 2.5441, |
| "num_input_tokens_seen": 16934497856, |
| "step": 64600 |
| }, |
| { |
| "epoch": 0.3083821267157184, |
| "grad_norm": 0.17826180160045624, |
| "learning_rate": 0.000319071491151664, |
| "loss": 2.545, |
| "num_input_tokens_seen": 16947605056, |
| "step": 64650 |
| }, |
| { |
| "epoch": 0.30862062797381257, |
| "grad_norm": 0.17889030277729034, |
| "learning_rate": 0.00031385317079902743, |
| "loss": 2.5405, |
| "num_input_tokens_seen": 16960712256, |
| "step": 64700 |
| }, |
| { |
| "epoch": 0.30885912923190667, |
| "grad_norm": 0.1711336225271225, |
| "learning_rate": 0.0003086582838174551, |
| "loss": 2.5222, |
| "num_input_tokens_seen": 16973819456, |
| "step": 64750 |
| }, |
| { |
| "epoch": 0.30909763049000083, |
| "grad_norm": 0.17962214350700378, |
| "learning_rate": 0.0003034874841730382, |
| "loss": 2.5376, |
| "num_input_tokens_seen": 16986926656, |
| "step": 64800 |
| }, |
| { |
| "epoch": 0.309336131748095, |
| "grad_norm": 0.1699627935886383, |
| "learning_rate": 0.0002983414227995975, |
| "loss": 2.5616, |
| "num_input_tokens_seen": 17000033856, |
| "step": 64850 |
| }, |
| { |
| "epoch": 0.3095746330061891, |
| "grad_norm": 0.18442535400390625, |
| "learning_rate": 0.00029322074751673977, |
| "loss": 2.5377, |
| "num_input_tokens_seen": 17013141056, |
| "step": 64900 |
| }, |
| { |
| "epoch": 0.30981313426428325, |
| "grad_norm": 0.17972196638584137, |
| "learning_rate": 0.0002881261029483057, |
| "loss": 2.5474, |
| "num_input_tokens_seen": 17026248256, |
| "step": 64950 |
| }, |
| { |
| "epoch": 0.31005163552237736, |
| "grad_norm": 0.1810217946767807, |
| "learning_rate": 0.00028305813044122096, |
| "loss": 2.5286, |
| "num_input_tokens_seen": 17039355456, |
| "step": 65000 |
| }, |
| { |
| "epoch": 0.31005163552237736, |
| "eval_loss": 2.4292306900024414, |
| "eval_runtime": 53.3956, |
| "eval_samples_per_second": 93.641, |
| "eval_steps_per_second": 23.41, |
| "num_input_tokens_seen": 17039355456, |
| "step": 65000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 70000, |
| "num_input_tokens_seen": 17039355456, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.5581938885892506e+18, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |