| { |
| "best_metric": 1.1948587894439697, |
| "best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/cds3-llama/checkpoint-6012", |
| "epoch": 0.9999812653390037, |
| "eval_steps": 668, |
| "global_step": 6672, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0014987728797047418, |
| "grad_norm": 4.475478649139404, |
| "learning_rate": 1.4999999999999999e-05, |
| "loss": 4.4935, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0029975457594094835, |
| "grad_norm": 3.118635416030884, |
| "learning_rate": 2.9999999999999997e-05, |
| "loss": 4.159, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.004496318639114225, |
| "grad_norm": 2.2133734226226807, |
| "learning_rate": 4.4999999999999996e-05, |
| "loss": 3.7894, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.005995091518818967, |
| "grad_norm": 1.8941925764083862, |
| "learning_rate": 5.9999999999999995e-05, |
| "loss": 3.56, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.007493864398523709, |
| "grad_norm": 1.5571386814117432, |
| "learning_rate": 7.5e-05, |
| "loss": 3.4336, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.00899263727822845, |
| "grad_norm": 1.125948190689087, |
| "learning_rate": 8.999999999999999e-05, |
| "loss": 3.2953, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.010491410157933192, |
| "grad_norm": 0.9792301058769226, |
| "learning_rate": 0.00010499999999999999, |
| "loss": 3.1549, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.011990183037637934, |
| "grad_norm": 1.3225536346435547, |
| "learning_rate": 0.00011999999999999999, |
| "loss": 2.9355, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.013488955917342676, |
| "grad_norm": 1.0157508850097656, |
| "learning_rate": 0.000135, |
| "loss": 2.7319, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.014987728797047418, |
| "grad_norm": 1.0096309185028076, |
| "learning_rate": 0.00015, |
| "loss": 2.6215, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.01648650167675216, |
| "grad_norm": 1.1225345134735107, |
| "learning_rate": 0.000165, |
| "loss": 2.5117, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0179852745564569, |
| "grad_norm": 1.4821795225143433, |
| "learning_rate": 0.00017999999999999998, |
| "loss": 2.448, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.019484047436161642, |
| "grad_norm": 1.3336493968963623, |
| "learning_rate": 0.000195, |
| "loss": 2.3504, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.020982820315866384, |
| "grad_norm": 1.2394251823425293, |
| "learning_rate": 0.00020999999999999998, |
| "loss": 2.2978, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.022481593195571126, |
| "grad_norm": 1.6399582624435425, |
| "learning_rate": 0.000225, |
| "loss": 2.2439, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.02398036607527587, |
| "grad_norm": 2.268030881881714, |
| "learning_rate": 0.00023999999999999998, |
| "loss": 2.1803, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.02547913895498061, |
| "grad_norm": 2.153691291809082, |
| "learning_rate": 0.00025499999999999996, |
| "loss": 2.1265, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.026977911834685352, |
| "grad_norm": 1.5142606496810913, |
| "learning_rate": 0.00027, |
| "loss": 2.0739, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.028476684714390094, |
| "grad_norm": 1.531332015991211, |
| "learning_rate": 0.000285, |
| "loss": 2.0148, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.029975457594094836, |
| "grad_norm": 1.551711916923523, |
| "learning_rate": 0.0003, |
| "loss": 1.9854, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.031474230473799575, |
| "grad_norm": 1.8056877851486206, |
| "learning_rate": 0.0002999982328104334, |
| "loss": 1.9536, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.03297300335350432, |
| "grad_norm": 1.5872079133987427, |
| "learning_rate": 0.00029999293128337313, |
| "loss": 1.9306, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.03447177623320906, |
| "grad_norm": 1.6114915609359741, |
| "learning_rate": 0.00029998409554373644, |
| "loss": 1.8826, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.0359705491129138, |
| "grad_norm": 1.188768744468689, |
| "learning_rate": 0.00029997172579971585, |
| "loss": 1.86, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.03746932199261854, |
| "grad_norm": 1.5296919345855713, |
| "learning_rate": 0.0002999558223427737, |
| "loss": 1.8221, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.038968094872323285, |
| "grad_norm": 1.196419596672058, |
| "learning_rate": 0.0002999363855476357, |
| "loss": 1.8225, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.04046686775202803, |
| "grad_norm": 1.453387975692749, |
| "learning_rate": 0.0002999134158722818, |
| "loss": 1.7898, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.04196564063173277, |
| "grad_norm": 1.1857067346572876, |
| "learning_rate": 0.00029988691385793553, |
| "loss": 1.7829, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.04346441351143751, |
| "grad_norm": 1.294594645500183, |
| "learning_rate": 0.00029985688012905155, |
| "loss": 1.7712, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.04496318639114225, |
| "grad_norm": 1.1869618892669678, |
| "learning_rate": 0.0002998233153933003, |
| "loss": 1.7575, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.046461959270846995, |
| "grad_norm": 1.1660012006759644, |
| "learning_rate": 0.00029978622044155175, |
| "loss": 1.7293, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.04796073215055174, |
| "grad_norm": 1.0317786931991577, |
| "learning_rate": 0.0002997455961478568, |
| "loss": 1.7288, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.04945950503025648, |
| "grad_norm": 1.1452827453613281, |
| "learning_rate": 0.0002997014434694265, |
| "loss": 1.7044, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.05095827790996122, |
| "grad_norm": 1.0942342281341553, |
| "learning_rate": 0.0002996537634466094, |
| "loss": 1.7116, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.05245705078966596, |
| "grad_norm": 1.1483399868011475, |
| "learning_rate": 0.00029960255720286755, |
| "loss": 1.6756, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.053955823669370705, |
| "grad_norm": 1.2330204248428345, |
| "learning_rate": 0.0002995478259447494, |
| "loss": 1.6669, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.05545459654907545, |
| "grad_norm": 1.1838743686676025, |
| "learning_rate": 0.00029948957096186167, |
| "loss": 1.6474, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.05695336942878019, |
| "grad_norm": 1.2730305194854736, |
| "learning_rate": 0.00029942779362683906, |
| "loss": 1.6409, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.05845214230848493, |
| "grad_norm": 1.1066051721572876, |
| "learning_rate": 0.00029936249539531175, |
| "loss": 1.6216, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.05995091518818967, |
| "grad_norm": 1.0314743518829346, |
| "learning_rate": 0.0002992936778058711, |
| "loss": 1.6263, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.061449688067894415, |
| "grad_norm": 1.0880593061447144, |
| "learning_rate": 0.00029922134248003344, |
| "loss": 1.6176, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.06294846094759915, |
| "grad_norm": 1.1548967361450195, |
| "learning_rate": 0.0002991454911222019, |
| "loss": 1.6251, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.0644472338273039, |
| "grad_norm": 1.1637712717056274, |
| "learning_rate": 0.0002990661255196261, |
| "loss": 1.607, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.06594600670700863, |
| "grad_norm": 1.0483553409576416, |
| "learning_rate": 0.00029898324754236037, |
| "loss": 1.6015, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.06744477958671338, |
| "grad_norm": 1.0421444177627563, |
| "learning_rate": 0.00029889685914321923, |
| "loss": 1.5941, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.06894355246641812, |
| "grad_norm": 1.091874361038208, |
| "learning_rate": 0.0002988069623577318, |
| "loss": 1.5884, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.07044232534612287, |
| "grad_norm": 1.1074745655059814, |
| "learning_rate": 0.00029871355930409353, |
| "loss": 1.6035, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.0719410982258276, |
| "grad_norm": 1.0914019346237183, |
| "learning_rate": 0.00029861665218311646, |
| "loss": 1.5799, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.07343987110553235, |
| "grad_norm": 0.9614572525024414, |
| "learning_rate": 0.00029851624327817725, |
| "loss": 1.5599, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.07493864398523709, |
| "grad_norm": 1.052742600440979, |
| "learning_rate": 0.0002984123349551635, |
| "loss": 1.5627, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.07643741686494183, |
| "grad_norm": 1.032119631767273, |
| "learning_rate": 0.00029830492966241795, |
| "loss": 1.5547, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.07793618974464657, |
| "grad_norm": 1.0201342105865479, |
| "learning_rate": 0.0002981940299306808, |
| "loss": 1.5422, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.07943496262435132, |
| "grad_norm": 1.087877631187439, |
| "learning_rate": 0.00029807963837303003, |
| "loss": 1.5543, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.08093373550405605, |
| "grad_norm": 1.0491427183151245, |
| "learning_rate": 0.00029796175768481974, |
| "loss": 1.5389, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.0824325083837608, |
| "grad_norm": 0.9637844562530518, |
| "learning_rate": 0.0002978403906436171, |
| "loss": 1.5477, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.08393128126346554, |
| "grad_norm": 0.9702052474021912, |
| "learning_rate": 0.0002977155401091362, |
| "loss": 1.5303, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.08543005414317027, |
| "grad_norm": 0.9769977331161499, |
| "learning_rate": 0.0002975872090231713, |
| "loss": 1.535, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.08692882702287502, |
| "grad_norm": 1.0341148376464844, |
| "learning_rate": 0.0002974554004095271, |
| "loss": 1.5356, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.08842759990257976, |
| "grad_norm": 0.995041012763977, |
| "learning_rate": 0.00029732011737394775, |
| "loss": 1.5232, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.0899263727822845, |
| "grad_norm": 1.0219428539276123, |
| "learning_rate": 0.0002971813631040434, |
| "loss": 1.5193, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.09142514566198924, |
| "grad_norm": 1.099050521850586, |
| "learning_rate": 0.0002970391408692154, |
| "loss": 1.535, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.09292391854169399, |
| "grad_norm": 1.247942566871643, |
| "learning_rate": 0.0002968934540205791, |
| "loss": 1.5252, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.09442269142139872, |
| "grad_norm": 0.9947668313980103, |
| "learning_rate": 0.0002967443059908849, |
| "loss": 1.5118, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.09592146430110347, |
| "grad_norm": 0.9500942230224609, |
| "learning_rate": 0.0002965917002944373, |
| "loss": 1.4768, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.09742023718080821, |
| "grad_norm": 0.9748952984809875, |
| "learning_rate": 0.0002964356405270123, |
| "loss": 1.4791, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.09891901006051296, |
| "grad_norm": 1.0195879936218262, |
| "learning_rate": 0.0002962761303657724, |
| "loss": 1.5021, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.10011802836427675, |
| "eval_loss": 1.5316129922866821, |
| "eval_runtime": 34.726, |
| "eval_samples_per_second": 719.921, |
| "eval_steps_per_second": 89.99, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.10041778294021769, |
| "grad_norm": 1.1510100364685059, |
| "learning_rate": 0.00029611317356918027, |
| "loss": 1.4856, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.10191655581992244, |
| "grad_norm": 0.8920977711677551, |
| "learning_rate": 0.00029594677397690975, |
| "loss": 1.4896, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.10341532869962718, |
| "grad_norm": 0.9080698490142822, |
| "learning_rate": 0.00029577693550975596, |
| "loss": 1.471, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.10491410157933193, |
| "grad_norm": 0.903261661529541, |
| "learning_rate": 0.0002956036621695424, |
| "loss": 1.5153, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.10641287445903666, |
| "grad_norm": 0.9598987698554993, |
| "learning_rate": 0.000295426958039027, |
| "loss": 1.478, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.10791164733874141, |
| "grad_norm": 0.9518589973449707, |
| "learning_rate": 0.00029524682728180565, |
| "loss": 1.4713, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.10941042021844614, |
| "grad_norm": 0.8936556577682495, |
| "learning_rate": 0.0002950632741422142, |
| "loss": 1.4658, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.1109091930981509, |
| "grad_norm": 0.9834211468696594, |
| "learning_rate": 0.0002948763029452287, |
| "loss": 1.4611, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.11240796597785563, |
| "grad_norm": 0.9097837805747986, |
| "learning_rate": 0.0002946859180963631, |
| "loss": 1.4696, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.11390673885756038, |
| "grad_norm": 0.8856462836265564, |
| "learning_rate": 0.00029449212408156554, |
| "loss": 1.4676, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.11540551173726511, |
| "grad_norm": 0.9628478288650513, |
| "learning_rate": 0.0002942949254671129, |
| "loss": 1.4582, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.11690428461696986, |
| "grad_norm": 0.8835513591766357, |
| "learning_rate": 0.000294094326899503, |
| "loss": 1.4547, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.1184030574966746, |
| "grad_norm": 0.9671273827552795, |
| "learning_rate": 0.00029389033310534517, |
| "loss": 1.4504, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.11990183037637935, |
| "grad_norm": 0.9154276847839355, |
| "learning_rate": 0.00029368294889124864, |
| "loss": 1.4387, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.12140060325608408, |
| "grad_norm": 0.8806156516075134, |
| "learning_rate": 0.0002934721791437098, |
| "loss": 1.4452, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.12289937613578883, |
| "grad_norm": 0.8779545426368713, |
| "learning_rate": 0.0002932580288289966, |
| "loss": 1.4425, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.12439814901549356, |
| "grad_norm": 0.9174330234527588, |
| "learning_rate": 0.0002930405029930317, |
| "loss": 1.4466, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.1258969218951983, |
| "grad_norm": 0.904965341091156, |
| "learning_rate": 0.00029281960676127365, |
| "loss": 1.4289, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.12739569477490303, |
| "grad_norm": 0.9320288300514221, |
| "learning_rate": 0.0002925953453385959, |
| "loss": 1.44, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.1288944676546078, |
| "grad_norm": 0.8714675307273865, |
| "learning_rate": 0.00029236772400916455, |
| "loss": 1.4153, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.13039324053431253, |
| "grad_norm": 0.9434481859207153, |
| "learning_rate": 0.0002921367481363134, |
| "loss": 1.421, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.13189201341401727, |
| "grad_norm": 0.9329891800880432, |
| "learning_rate": 0.00029190242316241773, |
| "loss": 1.4459, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.133390786293722, |
| "grad_norm": 0.9669409394264221, |
| "learning_rate": 0.0002916647546087663, |
| "loss": 1.4254, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.13488955917342677, |
| "grad_norm": 0.9065726399421692, |
| "learning_rate": 0.00029142374807543083, |
| "loss": 1.4255, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.1363883320531315, |
| "grad_norm": 0.9425346255302429, |
| "learning_rate": 0.0002911794092411345, |
| "loss": 1.4276, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.13788710493283624, |
| "grad_norm": 0.9356669783592224, |
| "learning_rate": 0.0002909317438631179, |
| "loss": 1.4211, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.13938587781254097, |
| "grad_norm": 0.9592692852020264, |
| "learning_rate": 0.0002906807577770031, |
| "loss": 1.4053, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.14088465069224573, |
| "grad_norm": 0.8722407817840576, |
| "learning_rate": 0.0002904264568966569, |
| "loss": 1.406, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.14238342357195047, |
| "grad_norm": 0.812244713306427, |
| "learning_rate": 0.0002901688472140507, |
| "loss": 1.4193, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.1438821964516552, |
| "grad_norm": 1.0006746053695679, |
| "learning_rate": 0.00028990793479911973, |
| "loss": 1.4092, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.14538096933135994, |
| "grad_norm": 0.9519333243370056, |
| "learning_rate": 0.00028964372579961997, |
| "loss": 1.4055, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.1468797422110647, |
| "grad_norm": 0.8306151628494263, |
| "learning_rate": 0.0002893762264409832, |
| "loss": 1.3887, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.14837851509076944, |
| "grad_norm": 0.9266255497932434, |
| "learning_rate": 0.00028910544302617055, |
| "loss": 1.4288, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.14987728797047417, |
| "grad_norm": 0.890106201171875, |
| "learning_rate": 0.0002888313819355236, |
| "loss": 1.4126, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1513760608501789, |
| "grad_norm": 0.8950091600418091, |
| "learning_rate": 0.0002885540496266144, |
| "loss": 1.4022, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.15287483372988367, |
| "grad_norm": 0.8897644877433777, |
| "learning_rate": 0.00028827345263409304, |
| "loss": 1.4005, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.1543736066095884, |
| "grad_norm": 0.8642116189002991, |
| "learning_rate": 0.000287989597569534, |
| "loss": 1.371, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.15587237948929314, |
| "grad_norm": 0.9173794984817505, |
| "learning_rate": 0.00028770249112128, |
| "loss": 1.3888, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.15737115236899787, |
| "grad_norm": 0.9254131317138672, |
| "learning_rate": 0.0002874121400542846, |
| "loss": 1.3762, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.15886992524870264, |
| "grad_norm": 0.8322397470474243, |
| "learning_rate": 0.00028711855120995284, |
| "loss": 1.4087, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.16036869812840737, |
| "grad_norm": 0.8882824182510376, |
| "learning_rate": 0.00028682173150598007, |
| "loss": 1.4032, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.1618674710081121, |
| "grad_norm": 0.8823337554931641, |
| "learning_rate": 0.00028652168793618857, |
| "loss": 1.3995, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.16336624388781684, |
| "grad_norm": 1.0256520509719849, |
| "learning_rate": 0.0002862184275703633, |
| "loss": 1.3702, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.1648650167675216, |
| "grad_norm": 0.8852070569992065, |
| "learning_rate": 0.00028591195755408504, |
| "loss": 1.3974, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.16636378964722634, |
| "grad_norm": 0.9529812335968018, |
| "learning_rate": 0.00028560228510856185, |
| "loss": 1.3999, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.16786256252693108, |
| "grad_norm": 0.9794729948043823, |
| "learning_rate": 0.0002852894175304594, |
| "loss": 1.3841, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.1693613354066358, |
| "grad_norm": 0.8799359798431396, |
| "learning_rate": 0.00028497336219172854, |
| "loss": 1.38, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.17086010828634055, |
| "grad_norm": 0.8865130543708801, |
| "learning_rate": 0.00028465412653943194, |
| "loss": 1.3698, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.1723588811660453, |
| "grad_norm": 0.8830373883247375, |
| "learning_rate": 0.00028433171809556844, |
| "loss": 1.3813, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.17385765404575004, |
| "grad_norm": 0.9166454672813416, |
| "learning_rate": 0.00028400614445689583, |
| "loss": 1.3681, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.17535642692545478, |
| "grad_norm": 0.9003053307533264, |
| "learning_rate": 0.000283677413294752, |
| "loss": 1.3547, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.1768551998051595, |
| "grad_norm": 0.8830690979957581, |
| "learning_rate": 0.0002833455323548741, |
| "loss": 1.3778, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.17835397268486428, |
| "grad_norm": 0.8167861104011536, |
| "learning_rate": 0.00028301050945721577, |
| "loss": 1.3754, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.179852745564569, |
| "grad_norm": 0.8883965015411377, |
| "learning_rate": 0.00028267235249576335, |
| "loss": 1.369, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.18135151844427375, |
| "grad_norm": 0.8578314185142517, |
| "learning_rate": 0.00028233106943834947, |
| "loss": 1.3535, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.18285029132397848, |
| "grad_norm": 0.9007347822189331, |
| "learning_rate": 0.0002819866683264657, |
| "loss": 1.3629, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.18434906420368324, |
| "grad_norm": 0.8820904493331909, |
| "learning_rate": 0.00028163915727507266, |
| "loss": 1.3603, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.18584783708338798, |
| "grad_norm": 0.8988449573516846, |
| "learning_rate": 0.00028128854447240903, |
| "loss": 1.363, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.18734660996309271, |
| "grad_norm": 0.9054597616195679, |
| "learning_rate": 0.0002809348381797988, |
| "loss": 1.3555, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.18884538284279745, |
| "grad_norm": 0.901901125907898, |
| "learning_rate": 0.000280578046731456, |
| "loss": 1.3636, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.1903441557225022, |
| "grad_norm": 0.8617561459541321, |
| "learning_rate": 0.0002802181785342892, |
| "loss": 1.3747, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.19184292860220695, |
| "grad_norm": 0.9050679802894592, |
| "learning_rate": 0.0002798552420677025, |
| "loss": 1.358, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.19334170148191168, |
| "grad_norm": 0.8624427318572998, |
| "learning_rate": 0.00027948924588339655, |
| "loss": 1.3676, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.19484047436161642, |
| "grad_norm": 0.9002446532249451, |
| "learning_rate": 0.00027912019860516644, |
| "loss": 1.3553, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.19633924724132118, |
| "grad_norm": 0.8707396388053894, |
| "learning_rate": 0.0002787481089286989, |
| "loss": 1.3593, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.19783802012102591, |
| "grad_norm": 0.8842359781265259, |
| "learning_rate": 0.0002783729856213671, |
| "loss": 1.3466, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.19933679300073065, |
| "grad_norm": 0.9021062850952148, |
| "learning_rate": 0.00027799483752202444, |
| "loss": 1.3603, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.2002360567285535, |
| "eval_loss": 1.3936063051223755, |
| "eval_runtime": 34.728, |
| "eval_samples_per_second": 719.881, |
| "eval_steps_per_second": 89.985, |
| "step": 1336 |
| }, |
| { |
| "epoch": 0.20083556588043538, |
| "grad_norm": 0.8968262672424316, |
| "learning_rate": 0.00027761367354079574, |
| "loss": 1.3579, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.20233433876014015, |
| "grad_norm": 0.8595730662345886, |
| "learning_rate": 0.00027722950265886796, |
| "loss": 1.3514, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.20383311163984488, |
| "grad_norm": 0.8064100742340088, |
| "learning_rate": 0.00027684233392827806, |
| "loss": 1.3599, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.20533188451954962, |
| "grad_norm": 0.9714745879173279, |
| "learning_rate": 0.0002764521764716999, |
| "loss": 1.3328, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.20683065739925435, |
| "grad_norm": 0.8713521957397461, |
| "learning_rate": 0.0002760590394822293, |
| "loss": 1.3375, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.20832943027895912, |
| "grad_norm": 0.9022751450538635, |
| "learning_rate": 0.00027566293222316734, |
| "loss": 1.34, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.20982820315866385, |
| "grad_norm": 0.8861002326011658, |
| "learning_rate": 0.0002752638640278024, |
| "loss": 1.3429, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.21132697603836859, |
| "grad_norm": 0.908525288105011, |
| "learning_rate": 0.0002748618442991897, |
| "loss": 1.3416, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.21282574891807332, |
| "grad_norm": 0.8318501114845276, |
| "learning_rate": 0.0002744568825099302, |
| "loss": 1.3453, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.21432452179777806, |
| "grad_norm": 0.8497442603111267, |
| "learning_rate": 0.00027404898820194724, |
| "loss": 1.3434, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.21582329467748282, |
| "grad_norm": 0.8935710191726685, |
| "learning_rate": 0.00027363817098626165, |
| "loss": 1.3443, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.21732206755718755, |
| "grad_norm": 0.8185181617736816, |
| "learning_rate": 0.00027322444054276543, |
| "loss": 1.345, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.2188208404368923, |
| "grad_norm": 0.8508570194244385, |
| "learning_rate": 0.00027280780661999353, |
| "loss": 1.3476, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.22031961331659702, |
| "grad_norm": 0.9571162462234497, |
| "learning_rate": 0.00027238827903489424, |
| "loss": 1.3536, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.2218183861963018, |
| "grad_norm": 0.8471242785453796, |
| "learning_rate": 0.0002719658676725979, |
| "loss": 1.3338, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.22331715907600652, |
| "grad_norm": 0.817711353302002, |
| "learning_rate": 0.00027154058248618376, |
| "loss": 1.3425, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.22481593195571126, |
| "grad_norm": 0.8738152980804443, |
| "learning_rate": 0.00027111243349644583, |
| "loss": 1.365, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.226314704835416, |
| "grad_norm": 0.8730055689811707, |
| "learning_rate": 0.0002706814307916565, |
| "loss": 1.3256, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.22781347771512075, |
| "grad_norm": 0.8565317392349243, |
| "learning_rate": 0.00027024758452732876, |
| "loss": 1.3305, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.2293122505948255, |
| "grad_norm": 0.8381545543670654, |
| "learning_rate": 0.0002698109049259773, |
| "loss": 1.3436, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.23081102347453022, |
| "grad_norm": 0.8256663680076599, |
| "learning_rate": 0.0002693714022768772, |
| "loss": 1.3357, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.23230979635423496, |
| "grad_norm": 0.8698524832725525, |
| "learning_rate": 0.00026892908693582166, |
| "loss": 1.3357, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.23380856923393972, |
| "grad_norm": 0.8229045271873474, |
| "learning_rate": 0.00026848396932487826, |
| "loss": 1.333, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.23530734211364446, |
| "grad_norm": 0.793002188205719, |
| "learning_rate": 0.00026803605993214283, |
| "loss": 1.3492, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.2368061149933492, |
| "grad_norm": 0.862382709980011, |
| "learning_rate": 0.0002675853693114929, |
| "loss": 1.3286, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.23830488787305393, |
| "grad_norm": 0.8608536124229431, |
| "learning_rate": 0.00026713190808233853, |
| "loss": 1.3212, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.2398036607527587, |
| "grad_norm": 0.7945990562438965, |
| "learning_rate": 0.00026667568692937245, |
| "loss": 1.3226, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.24130243363246343, |
| "grad_norm": 0.7914056777954102, |
| "learning_rate": 0.0002662167166023182, |
| "loss": 1.3323, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.24280120651216816, |
| "grad_norm": 0.8237165212631226, |
| "learning_rate": 0.0002657550079156767, |
| "loss": 1.3586, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.2442999793918729, |
| "grad_norm": 0.8703092932701111, |
| "learning_rate": 0.0002652905717484716, |
| "loss": 1.323, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.24579875227157766, |
| "grad_norm": 0.8693041205406189, |
| "learning_rate": 0.0002648234190439929, |
| "loss": 1.3182, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.2472975251512824, |
| "grad_norm": 0.7807459831237793, |
| "learning_rate": 0.00026435356080953916, |
| "loss": 1.3429, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.24879629803098713, |
| "grad_norm": 0.810480535030365, |
| "learning_rate": 0.00026388100811615785, |
| "loss": 1.3036, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.25029507091069186, |
| "grad_norm": 0.8039631247520447, |
| "learning_rate": 0.0002634057720983849, |
| "loss": 1.2937, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.2517938437903966, |
| "grad_norm": 0.9298089146614075, |
| "learning_rate": 0.000262927863953982, |
| "loss": 1.3453, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.25329261667010133, |
| "grad_norm": 0.8398979902267456, |
| "learning_rate": 0.00026244729494367307, |
| "loss": 1.3188, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.25479138954980607, |
| "grad_norm": 0.8078634142875671, |
| "learning_rate": 0.0002619640763908786, |
| "loss": 1.3268, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.25629016242951086, |
| "grad_norm": 0.7533845901489258, |
| "learning_rate": 0.000261478219681449, |
| "loss": 1.3005, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.2577889353092156, |
| "grad_norm": 0.852730929851532, |
| "learning_rate": 0.00026098973626339654, |
| "loss": 1.3106, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.25928770818892033, |
| "grad_norm": 0.9218925833702087, |
| "learning_rate": 0.0002604986376466251, |
| "loss": 1.3169, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.26078648106862506, |
| "grad_norm": 0.858396589756012, |
| "learning_rate": 0.00026000493540265934, |
| "loss": 1.3261, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.2622852539483298, |
| "grad_norm": 0.8293901085853577, |
| "learning_rate": 0.000259508641164372, |
| "loss": 1.3184, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.26378402682803453, |
| "grad_norm": 0.7981786131858826, |
| "learning_rate": 0.0002590097666257099, |
| "loss": 1.2922, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.26528279970773927, |
| "grad_norm": 0.881025493144989, |
| "learning_rate": 0.00025850832354141784, |
| "loss": 1.3039, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.266781572587444, |
| "grad_norm": 0.8400424718856812, |
| "learning_rate": 0.0002580043237267625, |
| "loss": 1.3208, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.2682803454671488, |
| "grad_norm": 0.8640297055244446, |
| "learning_rate": 0.00025749777905725336, |
| "loss": 1.2938, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.26977911834685353, |
| "grad_norm": 0.820582389831543, |
| "learning_rate": 0.00025698870146836315, |
| "loss": 1.3163, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.27127789122655827, |
| "grad_norm": 0.8231412172317505, |
| "learning_rate": 0.00025647710295524656, |
| "loss": 1.2841, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.272776664106263, |
| "grad_norm": 0.8273252248764038, |
| "learning_rate": 0.00025596299557245774, |
| "loss": 1.2995, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.27427543698596774, |
| "grad_norm": 0.9586191177368164, |
| "learning_rate": 0.0002554463914336659, |
| "loss": 1.3058, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.27577420986567247, |
| "grad_norm": 0.839583158493042, |
| "learning_rate": 0.0002549273027113704, |
| "loss": 1.3043, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.2772729827453772, |
| "grad_norm": 0.8683123588562012, |
| "learning_rate": 0.00025440574163661364, |
| "loss": 1.2954, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.27877175562508194, |
| "grad_norm": 0.7915263175964355, |
| "learning_rate": 0.0002538817204986926, |
| "loss": 1.3061, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.28027052850478673, |
| "grad_norm": 0.7897734045982361, |
| "learning_rate": 0.00025335525164487, |
| "loss": 1.3015, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.28176930138449147, |
| "grad_norm": 0.8129907250404358, |
| "learning_rate": 0.0002528263474800826, |
| "loss": 1.3008, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.2832680742641962, |
| "grad_norm": 0.8599696159362793, |
| "learning_rate": 0.0002522950204666494, |
| "loss": 1.2996, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.28476684714390094, |
| "grad_norm": 0.79160475730896, |
| "learning_rate": 0.00025176128312397774, |
| "loss": 1.2994, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.28626562002360567, |
| "grad_norm": 0.9080151915550232, |
| "learning_rate": 0.0002512251480282685, |
| "loss": 1.2902, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.2877643929033104, |
| "grad_norm": 0.8382280468940735, |
| "learning_rate": 0.00025068662781221966, |
| "loss": 1.2938, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.28926316578301514, |
| "grad_norm": 0.8656803369522095, |
| "learning_rate": 0.00025014573516472864, |
| "loss": 1.292, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.2907619386627199, |
| "grad_norm": 0.8192686438560486, |
| "learning_rate": 0.0002496024828305933, |
| "loss": 1.3002, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.29226071154242467, |
| "grad_norm": 0.7879363894462585, |
| "learning_rate": 0.0002490568836102118, |
| "loss": 1.3009, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.2937594844221294, |
| "grad_norm": 0.8016439080238342, |
| "learning_rate": 0.0002485089503592808, |
| "loss": 1.2998, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.29525825730183414, |
| "grad_norm": 0.8002453446388245, |
| "learning_rate": 0.0002479586959884926, |
| "loss": 1.291, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.2967570301815389, |
| "grad_norm": 0.8366382718086243, |
| "learning_rate": 0.00024740613346323095, |
| "loss": 1.3098, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.2982558030612436, |
| "grad_norm": 0.8672970533370972, |
| "learning_rate": 0.0002468512758032656, |
| "loss": 1.2958, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.29975457594094834, |
| "grad_norm": 0.8241642713546753, |
| "learning_rate": 0.0002462941360824454, |
| "loss": 1.3034, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.30035408509283024, |
| "eval_loss": 1.3332195281982422, |
| "eval_runtime": 34.8158, |
| "eval_samples_per_second": 718.065, |
| "eval_steps_per_second": 89.758, |
| "step": 2004 |
| }, |
| { |
| "epoch": 0.3012533488206531, |
| "grad_norm": 0.874294638633728, |
| "learning_rate": 0.00024573472742839053, |
| "loss": 1.2974, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.3027521217003578, |
| "grad_norm": 0.788142204284668, |
| "learning_rate": 0.0002451730630221827, |
| "loss": 1.3018, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.30425089458006255, |
| "grad_norm": 0.8617194294929504, |
| "learning_rate": 0.0002446091560980549, |
| "loss": 1.2913, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.30574966745976734, |
| "grad_norm": 0.7948266267776489, |
| "learning_rate": 0.00024404301994307968, |
| "loss": 1.2885, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.3072484403394721, |
| "grad_norm": 0.8624528050422668, |
| "learning_rate": 0.00024347466789685575, |
| "loss": 1.2823, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.3087472132191768, |
| "grad_norm": 0.8431428670883179, |
| "learning_rate": 0.00024290411335119386, |
| "loss": 1.2784, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.31024598609888154, |
| "grad_norm": 0.7940576672554016, |
| "learning_rate": 0.0002423313697498012, |
| "loss": 1.3008, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.3117447589785863, |
| "grad_norm": 0.7964574098587036, |
| "learning_rate": 0.0002417564505879647, |
| "loss": 1.292, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.313243531858291, |
| "grad_norm": 0.7948952317237854, |
| "learning_rate": 0.00024117936941223293, |
| "loss": 1.2897, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.31474230473799575, |
| "grad_norm": 0.7771734595298767, |
| "learning_rate": 0.00024060013982009695, |
| "loss": 1.2737, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.3162410776177005, |
| "grad_norm": 0.7866066098213196, |
| "learning_rate": 0.00024001877545967005, |
| "loss": 1.2908, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.3177398504974053, |
| "grad_norm": 0.8679277300834656, |
| "learning_rate": 0.00023943529002936595, |
| "loss": 1.2916, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.31923862337711, |
| "grad_norm": 0.7685953974723816, |
| "learning_rate": 0.0002388496972775762, |
| "loss": 1.2669, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.32073739625681474, |
| "grad_norm": 0.8774948716163635, |
| "learning_rate": 0.00023826201100234613, |
| "loss": 1.2834, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.3222361691365195, |
| "grad_norm": 0.7911577820777893, |
| "learning_rate": 0.00023767224505104984, |
| "loss": 1.2833, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.3237349420162242, |
| "grad_norm": 0.8205265402793884, |
| "learning_rate": 0.00023708041332006375, |
| "loss": 1.2902, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.32523371489592895, |
| "grad_norm": 0.7738403677940369, |
| "learning_rate": 0.00023648652975443937, |
| "loss": 1.2879, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.3267324877756337, |
| "grad_norm": 0.9220579862594604, |
| "learning_rate": 0.00023589060834757454, |
| "loss": 1.2841, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.3282312606553384, |
| "grad_norm": 0.7917136549949646, |
| "learning_rate": 0.00023529266314088388, |
| "loss": 1.2795, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.3297300335350432, |
| "grad_norm": 0.8733565211296082, |
| "learning_rate": 0.00023469270822346774, |
| "loss": 1.2896, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.33122880641474794, |
| "grad_norm": 0.874913215637207, |
| "learning_rate": 0.00023409075773178045, |
| "loss": 1.2607, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.3327275792944527, |
| "grad_norm": 0.7840304970741272, |
| "learning_rate": 0.00023348682584929702, |
| "loss": 1.2737, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.3342263521741574, |
| "grad_norm": 0.8352317810058594, |
| "learning_rate": 0.00023288092680617912, |
| "loss": 1.2804, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.33572512505386215, |
| "grad_norm": 0.8565024137496948, |
| "learning_rate": 0.00023227307487893957, |
| "loss": 1.2931, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.3372238979335669, |
| "grad_norm": 0.8791518211364746, |
| "learning_rate": 0.00023166328439010625, |
| "loss": 1.282, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.3387226708132716, |
| "grad_norm": 0.8080510497093201, |
| "learning_rate": 0.00023105156970788424, |
| "loss": 1.2986, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.34022144369297636, |
| "grad_norm": 0.8406448364257812, |
| "learning_rate": 0.0002304379452458177, |
| "loss": 1.2699, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.3417202165726811, |
| "grad_norm": 0.8735692501068115, |
| "learning_rate": 0.00022982242546244985, |
| "loss": 1.2762, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.3432189894523859, |
| "grad_norm": 0.8007855415344238, |
| "learning_rate": 0.00022920502486098262, |
| "loss": 1.27, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.3447177623320906, |
| "grad_norm": 0.7858870625495911, |
| "learning_rate": 0.0002285857579889346, |
| "loss": 1.2785, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.34621653521179535, |
| "grad_norm": 0.8488030433654785, |
| "learning_rate": 0.00022796463943779862, |
| "loss": 1.2687, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.3477153080915001, |
| "grad_norm": 0.8431515693664551, |
| "learning_rate": 0.0002273416838426976, |
| "loss": 1.2522, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.3492140809712048, |
| "grad_norm": 0.8310567140579224, |
| "learning_rate": 0.00022671690588203994, |
| "loss": 1.2826, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.35071285385090956, |
| "grad_norm": 0.7767550945281982, |
| "learning_rate": 0.00022609032027717357, |
| "loss": 1.2721, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.3522116267306143, |
| "grad_norm": 0.8902162909507751, |
| "learning_rate": 0.00022546194179203904, |
| "loss": 1.2795, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.353710399610319, |
| "grad_norm": 0.7950676083564758, |
| "learning_rate": 0.0002248317852328217, |
| "loss": 1.2703, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.3552091724900238, |
| "grad_norm": 0.8022187352180481, |
| "learning_rate": 0.00022419986544760284, |
| "loss": 1.2626, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.35670794536972855, |
| "grad_norm": 0.7801185846328735, |
| "learning_rate": 0.00022356619732600988, |
| "loss": 1.2781, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.3582067182494333, |
| "grad_norm": 0.7817099094390869, |
| "learning_rate": 0.0002229307957988653, |
| "loss": 1.2757, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.359705491129138, |
| "grad_norm": 0.8795964121818542, |
| "learning_rate": 0.0002222936758378352, |
| "loss": 1.2573, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.36120426400884276, |
| "grad_norm": 0.8010522723197937, |
| "learning_rate": 0.0002216548524550761, |
| "loss": 1.2747, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.3627030368885475, |
| "grad_norm": 0.8054040670394897, |
| "learning_rate": 0.0002210143407028817, |
| "loss": 1.2608, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.3642018097682522, |
| "grad_norm": 0.826788067817688, |
| "learning_rate": 0.00022037215567332767, |
| "loss": 1.2691, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.36570058264795696, |
| "grad_norm": 0.7943429350852966, |
| "learning_rate": 0.00021972831249791652, |
| "loss": 1.2683, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.36719935552766175, |
| "grad_norm": 0.7942007780075073, |
| "learning_rate": 0.00021908282634722082, |
| "loss": 1.2685, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.3686981284073665, |
| "grad_norm": 0.8077208995819092, |
| "learning_rate": 0.00021843571243052577, |
| "loss": 1.2548, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.3701969012870712, |
| "grad_norm": 0.8264975547790527, |
| "learning_rate": 0.00021778698599547088, |
| "loss": 1.2642, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.37169567416677596, |
| "grad_norm": 0.8238746523857117, |
| "learning_rate": 0.00021713666232769067, |
| "loss": 1.2619, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.3731944470464807, |
| "grad_norm": 0.8039039373397827, |
| "learning_rate": 0.00021648475675045445, |
| "loss": 1.2541, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.37469321992618543, |
| "grad_norm": 0.7852529883384705, |
| "learning_rate": 0.00021583128462430529, |
| "loss": 1.2595, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.37619199280589016, |
| "grad_norm": 0.7786864638328552, |
| "learning_rate": 0.00021517626134669824, |
| "loss": 1.264, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.3776907656855949, |
| "grad_norm": 0.7981049418449402, |
| "learning_rate": 0.0002145197023516374, |
| "loss": 1.2687, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.3791895385652997, |
| "grad_norm": 0.8835734128952026, |
| "learning_rate": 0.000213861623109312, |
| "loss": 1.2732, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.3806883114450044, |
| "grad_norm": 0.7418168187141418, |
| "learning_rate": 0.00021320203912573245, |
| "loss": 1.248, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.38218708432470916, |
| "grad_norm": 0.7426604628562927, |
| "learning_rate": 0.00021254096594236447, |
| "loss": 1.2472, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.3836858572044139, |
| "grad_norm": 0.7566102743148804, |
| "learning_rate": 0.00021187841913576324, |
| "loss": 1.2612, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.38518463008411863, |
| "grad_norm": 0.7827373743057251, |
| "learning_rate": 0.00021121441431720607, |
| "loss": 1.261, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.38668340296382336, |
| "grad_norm": 0.8316198587417603, |
| "learning_rate": 0.00021054896713232482, |
| "loss": 1.2649, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.3881821758435281, |
| "grad_norm": 0.7967325448989868, |
| "learning_rate": 0.00020988209326073713, |
| "loss": 1.2588, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.38968094872323283, |
| "grad_norm": 0.7973827719688416, |
| "learning_rate": 0.00020921380841567702, |
| "loss": 1.2534, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.39117972160293757, |
| "grad_norm": 0.8205732703208923, |
| "learning_rate": 0.00020854412834362445, |
| "loss": 1.2608, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.39267849448264236, |
| "grad_norm": 0.7526434063911438, |
| "learning_rate": 0.00020787306882393464, |
| "loss": 1.2517, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.3941772673623471, |
| "grad_norm": 0.8194977045059204, |
| "learning_rate": 0.00020720064566846603, |
| "loss": 1.2303, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.39567604024205183, |
| "grad_norm": 0.790014386177063, |
| "learning_rate": 0.0002065268747212077, |
| "loss": 1.2579, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.39717481312175656, |
| "grad_norm": 0.8768025040626526, |
| "learning_rate": 0.00020585177185790618, |
| "loss": 1.2728, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.3986735860014613, |
| "grad_norm": 0.7993418574333191, |
| "learning_rate": 0.00020517535298569134, |
| "loss": 1.2738, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.40017235888116603, |
| "grad_norm": 0.8214737176895142, |
| "learning_rate": 0.00020449763404270136, |
| "loss": 1.2519, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.400472113457107, |
| "eval_loss": 1.2892086505889893, |
| "eval_runtime": 34.8937, |
| "eval_samples_per_second": 716.462, |
| "eval_steps_per_second": 89.558, |
| "step": 2672 |
| }, |
| { |
| "epoch": 0.40167113176087077, |
| "grad_norm": 0.8561115264892578, |
| "learning_rate": 0.00020381863099770768, |
| "loss": 1.2384, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.4031699046405755, |
| "grad_norm": 0.7806345224380493, |
| "learning_rate": 0.00020313835984973815, |
| "loss": 1.2698, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.4046686775202803, |
| "grad_norm": 0.8666788935661316, |
| "learning_rate": 0.00020245683662770047, |
| "loss": 1.2461, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.40616745039998503, |
| "grad_norm": 0.8963791728019714, |
| "learning_rate": 0.0002017740773900043, |
| "loss": 1.2401, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.40766622327968977, |
| "grad_norm": 0.8115370273590088, |
| "learning_rate": 0.00020109009822418311, |
| "loss": 1.252, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.4091649961593945, |
| "grad_norm": 0.7880274653434753, |
| "learning_rate": 0.0002004049152465147, |
| "loss": 1.2622, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.41066376903909924, |
| "grad_norm": 0.8020321726799011, |
| "learning_rate": 0.0001997185446016419, |
| "loss": 1.2632, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.41216254191880397, |
| "grad_norm": 0.8517338037490845, |
| "learning_rate": 0.00019903100246219198, |
| "loss": 1.2514, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.4136613147985087, |
| "grad_norm": 0.7712230682373047, |
| "learning_rate": 0.00019834230502839548, |
| "loss": 1.2572, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.41516008767821344, |
| "grad_norm": 0.7825449705123901, |
| "learning_rate": 0.0001976524685277047, |
| "loss": 1.2391, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.41665886055791823, |
| "grad_norm": 0.8370509147644043, |
| "learning_rate": 0.00019696150921441125, |
| "loss": 1.2505, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.41815763343762297, |
| "grad_norm": 0.8516371846199036, |
| "learning_rate": 0.0001962694433692629, |
| "loss": 1.239, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.4196564063173277, |
| "grad_norm": 0.8301183581352234, |
| "learning_rate": 0.0001955762872990803, |
| "loss": 1.2399, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.42115517919703244, |
| "grad_norm": 0.7094582319259644, |
| "learning_rate": 0.00019488205733637234, |
| "loss": 1.2636, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.42265395207673717, |
| "grad_norm": 0.8317636847496033, |
| "learning_rate": 0.00019418676983895167, |
| "loss": 1.2369, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.4241527249564419, |
| "grad_norm": 0.8261407017707825, |
| "learning_rate": 0.00019349044118954916, |
| "loss": 1.2454, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.42565149783614664, |
| "grad_norm": 0.8200167417526245, |
| "learning_rate": 0.00019279308779542782, |
| "loss": 1.2505, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.4271502707158514, |
| "grad_norm": 0.8785801529884338, |
| "learning_rate": 0.00019209472608799604, |
| "loss": 1.2341, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.4286490435955561, |
| "grad_norm": 0.8413455486297607, |
| "learning_rate": 0.000191395372522421, |
| "loss": 1.2524, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.4301478164752609, |
| "grad_norm": 0.782440721988678, |
| "learning_rate": 0.00019069504357724024, |
| "loss": 1.2673, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.43164658935496564, |
| "grad_norm": 0.7999353408813477, |
| "learning_rate": 0.00018999375575397387, |
| "loss": 1.251, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.4331453622346704, |
| "grad_norm": 0.832058310508728, |
| "learning_rate": 0.00018929152557673555, |
| "loss": 1.253, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.4346441351143751, |
| "grad_norm": 0.7583170533180237, |
| "learning_rate": 0.0001885883695918432, |
| "loss": 1.2449, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.43614290799407984, |
| "grad_norm": 0.890659749507904, |
| "learning_rate": 0.000187884304367429, |
| "loss": 1.221, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.4376416808737846, |
| "grad_norm": 0.7491910457611084, |
| "learning_rate": 0.0001871793464930493, |
| "loss": 1.2231, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.4391404537534893, |
| "grad_norm": 0.8058717250823975, |
| "learning_rate": 0.0001864735125792934, |
| "loss": 1.2362, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.44063922663319405, |
| "grad_norm": 0.8197429776191711, |
| "learning_rate": 0.00018576681925739234, |
| "loss": 1.2177, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.44213799951289884, |
| "grad_norm": 0.8527374267578125, |
| "learning_rate": 0.00018505928317882696, |
| "loss": 1.2395, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.4436367723926036, |
| "grad_norm": 0.7480415105819702, |
| "learning_rate": 0.00018435092101493569, |
| "loss": 1.2462, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.4451355452723083, |
| "grad_norm": 0.7680370807647705, |
| "learning_rate": 0.00018364174945652146, |
| "loss": 1.2358, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.44663431815201304, |
| "grad_norm": 0.8246043920516968, |
| "learning_rate": 0.00018293178521345868, |
| "loss": 1.2222, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.4481330910317178, |
| "grad_norm": 0.7838782668113708, |
| "learning_rate": 0.0001822210450142994, |
| "loss": 1.2292, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.4496318639114225, |
| "grad_norm": 0.7519893646240234, |
| "learning_rate": 0.00018150954560587913, |
| "loss": 1.2536, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.45113063679112725, |
| "grad_norm": 0.8429172039031982, |
| "learning_rate": 0.00018079730375292232, |
| "loss": 1.2141, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.452629409670832, |
| "grad_norm": 0.8938582539558411, |
| "learning_rate": 0.00018008433623764721, |
| "loss": 1.2318, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.4541281825505368, |
| "grad_norm": 0.7177508473396301, |
| "learning_rate": 0.00017937065985937055, |
| "loss": 1.2494, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.4556269554302415, |
| "grad_norm": 0.8130243420600891, |
| "learning_rate": 0.00017865629143411162, |
| "loss": 1.2409, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.45712572830994624, |
| "grad_norm": 0.8147348165512085, |
| "learning_rate": 0.0001779412477941962, |
| "loss": 1.2347, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.458624501189651, |
| "grad_norm": 0.8635284900665283, |
| "learning_rate": 0.00017722554578785972, |
| "loss": 1.24, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.4601232740693557, |
| "grad_norm": 0.8101358413696289, |
| "learning_rate": 0.00017650920227885045, |
| "loss": 1.2441, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.46162204694906045, |
| "grad_norm": 0.7738655209541321, |
| "learning_rate": 0.00017579223414603202, |
| "loss": 1.2395, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.4631208198287652, |
| "grad_norm": 0.7867494225502014, |
| "learning_rate": 0.00017507465828298587, |
| "loss": 1.2334, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.4646195927084699, |
| "grad_norm": 0.8157032132148743, |
| "learning_rate": 0.00017435649159761298, |
| "loss": 1.232, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.46611836558817465, |
| "grad_norm": 0.7872126698493958, |
| "learning_rate": 0.0001736377510117357, |
| "loss": 1.231, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.46761713846787945, |
| "grad_norm": 0.8418169021606445, |
| "learning_rate": 0.00017291845346069888, |
| "loss": 1.238, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.4691159113475842, |
| "grad_norm": 0.8292982578277588, |
| "learning_rate": 0.00017219861589297083, |
| "loss": 1.2445, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.4706146842272889, |
| "grad_norm": 0.8435372710227966, |
| "learning_rate": 0.00017147825526974417, |
| "loss": 1.2397, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.47211345710699365, |
| "grad_norm": 0.8216134309768677, |
| "learning_rate": 0.0001707573885645359, |
| "loss": 1.2427, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.4736122299866984, |
| "grad_norm": 0.7630952000617981, |
| "learning_rate": 0.00017003603276278764, |
| "loss": 1.2406, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.4751110028664031, |
| "grad_norm": 0.7559238076210022, |
| "learning_rate": 0.0001693142048614653, |
| "loss": 1.2268, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.47660977574610786, |
| "grad_norm": 0.7821029424667358, |
| "learning_rate": 0.00016859192186865875, |
| "loss": 1.2235, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.4781085486258126, |
| "grad_norm": 0.7591744065284729, |
| "learning_rate": 0.00016786920080318085, |
| "loss": 1.2291, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.4796073215055174, |
| "grad_norm": 0.8490301966667175, |
| "learning_rate": 0.00016714605869416668, |
| "loss": 1.2494, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.4811060943852221, |
| "grad_norm": 0.8557929396629333, |
| "learning_rate": 0.00016642251258067205, |
| "loss": 1.2428, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.48260486726492685, |
| "grad_norm": 0.7579270601272583, |
| "learning_rate": 0.0001656985795112722, |
| "loss": 1.2354, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.4841036401446316, |
| "grad_norm": 0.7294782996177673, |
| "learning_rate": 0.0001649742765436601, |
| "loss": 1.2302, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.4856024130243363, |
| "grad_norm": 0.9022475481033325, |
| "learning_rate": 0.0001642496207442443, |
| "loss": 1.2174, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.48710118590404106, |
| "grad_norm": 0.8305564522743225, |
| "learning_rate": 0.0001635246291877471, |
| "loss": 1.2357, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.4885999587837458, |
| "grad_norm": 0.808020293712616, |
| "learning_rate": 0.000162799318956802, |
| "loss": 1.2283, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.4900987316634505, |
| "grad_norm": 0.8041689991950989, |
| "learning_rate": 0.00016207370714155128, |
| "loss": 1.223, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.4915975045431553, |
| "grad_norm": 0.7986804246902466, |
| "learning_rate": 0.0001613478108392434, |
| "loss": 1.2296, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.49309627742286005, |
| "grad_norm": 0.8282227516174316, |
| "learning_rate": 0.00016062164715382988, |
| "loss": 1.2211, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.4945950503025648, |
| "grad_norm": 0.7985265254974365, |
| "learning_rate": 0.00015989523319556265, |
| "loss": 1.2028, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.4960938231822695, |
| "grad_norm": 0.8276951909065247, |
| "learning_rate": 0.00015916858608059058, |
| "loss": 1.2534, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.49759259606197426, |
| "grad_norm": 0.8164412379264832, |
| "learning_rate": 0.00015844172293055637, |
| "loss": 1.2222, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.499091368941679, |
| "grad_norm": 0.7917633652687073, |
| "learning_rate": 0.000157714660872193, |
| "loss": 1.2351, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.5005901418213837, |
| "grad_norm": 0.8134817481040955, |
| "learning_rate": 0.00015698741703692025, |
| "loss": 1.2266, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.5005901418213837, |
| "eval_loss": 1.2627006769180298, |
| "eval_runtime": 34.7715, |
| "eval_samples_per_second": 718.979, |
| "eval_steps_per_second": 89.872, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.5020889147010885, |
| "grad_norm": 0.7944011092185974, |
| "learning_rate": 0.00015626000856044106, |
| "loss": 1.2145, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.5035876875807932, |
| "grad_norm": 0.8224478363990784, |
| "learning_rate": 0.00015553245258233763, |
| "loss": 1.2451, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.505086460460498, |
| "grad_norm": 0.7887598276138306, |
| "learning_rate": 0.0001548047662456678, |
| "loss": 1.2217, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.5065852333402027, |
| "grad_norm": 0.800266444683075, |
| "learning_rate": 0.00015407696669656091, |
| "loss": 1.2187, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.5080840062199075, |
| "grad_norm": 0.8337759971618652, |
| "learning_rate": 0.0001533490710838139, |
| "loss": 1.2172, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.5095827790996121, |
| "grad_norm": 0.7768263816833496, |
| "learning_rate": 0.0001526210965584872, |
| "loss": 1.2252, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.5110815519793169, |
| "grad_norm": 0.767500638961792, |
| "learning_rate": 0.00015189306027350063, |
| "loss": 1.2322, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.5125803248590217, |
| "grad_norm": 0.7581789493560791, |
| "learning_rate": 0.00015116497938322913, |
| "loss": 1.2376, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.5140790977387264, |
| "grad_norm": 0.7668277621269226, |
| "learning_rate": 0.00015043687104309886, |
| "loss": 1.2384, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.5155778706184312, |
| "grad_norm": 0.8178005218505859, |
| "learning_rate": 0.00014970875240918262, |
| "loss": 1.2252, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.5170766434981359, |
| "grad_norm": 0.7495589256286621, |
| "learning_rate": 0.00014898064063779574, |
| "loss": 1.2225, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.5185754163778407, |
| "grad_norm": 0.8504411578178406, |
| "learning_rate": 0.00014825255288509193, |
| "loss": 1.21, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.5200741892575453, |
| "grad_norm": 0.8137525916099548, |
| "learning_rate": 0.00014752450630665893, |
| "loss": 1.2015, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.5215729621372501, |
| "grad_norm": 0.7947141528129578, |
| "learning_rate": 0.00014679651805711428, |
| "loss": 1.2131, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.5230717350169549, |
| "grad_norm": 0.765978217124939, |
| "learning_rate": 0.00014606860528970116, |
| "loss": 1.225, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.5245705078966596, |
| "grad_norm": 0.8446719646453857, |
| "learning_rate": 0.00014534078515588425, |
| "loss": 1.2174, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.5260692807763644, |
| "grad_norm": 0.8156980276107788, |
| "learning_rate": 0.00014461307480494553, |
| "loss": 1.2255, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.5275680536560691, |
| "grad_norm": 0.747254490852356, |
| "learning_rate": 0.00014388549138358007, |
| "loss": 1.2382, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.5290668265357739, |
| "grad_norm": 0.8017699122428894, |
| "learning_rate": 0.0001431580520354924, |
| "loss": 1.2271, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.5305655994154785, |
| "grad_norm": 0.8388494849205017, |
| "learning_rate": 0.00014243077390099218, |
| "loss": 1.211, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.5320643722951833, |
| "grad_norm": 0.7686325311660767, |
| "learning_rate": 0.00014170367411659048, |
| "loss": 1.2073, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.533563145174888, |
| "grad_norm": 0.8303519487380981, |
| "learning_rate": 0.00014097676981459598, |
| "loss": 1.2087, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.5350619180545928, |
| "grad_norm": 0.7963255047798157, |
| "learning_rate": 0.0001402500781227114, |
| "loss": 1.2108, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.5365606909342976, |
| "grad_norm": 0.8091217875480652, |
| "learning_rate": 0.00013952361616362968, |
| "loss": 1.2041, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.5380594638140023, |
| "grad_norm": 0.8334858417510986, |
| "learning_rate": 0.00013879740105463074, |
| "loss": 1.2113, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.5395582366937071, |
| "grad_norm": 0.7692158818244934, |
| "learning_rate": 0.00013807144990717816, |
| "loss": 1.2095, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.5410570095734117, |
| "grad_norm": 0.8363123536109924, |
| "learning_rate": 0.00013734577982651584, |
| "loss": 1.2195, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.5425557824531165, |
| "grad_norm": 0.8173678517341614, |
| "learning_rate": 0.00013662040791126502, |
| "loss": 1.1964, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.5440545553328212, |
| "grad_norm": 0.7436100244522095, |
| "learning_rate": 0.0001358953512530215, |
| "loss": 1.2124, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.545553328212526, |
| "grad_norm": 0.7557327747344971, |
| "learning_rate": 0.00013517062693595266, |
| "loss": 1.208, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.5470521010922308, |
| "grad_norm": 0.7731929421424866, |
| "learning_rate": 0.00013444625203639531, |
| "loss": 1.2023, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.5485508739719355, |
| "grad_norm": 0.8247756958007812, |
| "learning_rate": 0.0001337222436224529, |
| "loss": 1.2056, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.5500496468516403, |
| "grad_norm": 0.8017778992652893, |
| "learning_rate": 0.00013299861875359367, |
| "loss": 1.2057, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.5515484197313449, |
| "grad_norm": 0.84889817237854, |
| "learning_rate": 0.00013227539448024855, |
| "loss": 1.2148, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.5530471926110497, |
| "grad_norm": 0.808515191078186, |
| "learning_rate": 0.00013155258784340934, |
| "loss": 1.2029, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.5545459654907544, |
| "grad_norm": 0.7231589555740356, |
| "learning_rate": 0.00013083021587422737, |
| "loss": 1.1948, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.5560447383704592, |
| "grad_norm": 0.8200321793556213, |
| "learning_rate": 0.0001301082955936121, |
| "loss": 1.2122, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.5575435112501639, |
| "grad_norm": 0.7973082065582275, |
| "learning_rate": 0.00012938684401183, |
| "loss": 1.2214, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.5590422841298687, |
| "grad_norm": 0.7436721324920654, |
| "learning_rate": 0.00012866587812810384, |
| "loss": 1.2124, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.5605410570095735, |
| "grad_norm": 0.794087290763855, |
| "learning_rate": 0.00012794541493021217, |
| "loss": 1.2057, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.5620398298892781, |
| "grad_norm": 0.815791666507721, |
| "learning_rate": 0.0001272254713940889, |
| "loss": 1.2078, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.5635386027689829, |
| "grad_norm": 0.8807167410850525, |
| "learning_rate": 0.0001265060644834235, |
| "loss": 1.228, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.5650373756486876, |
| "grad_norm": 0.8193700313568115, |
| "learning_rate": 0.00012578721114926098, |
| "loss": 1.1931, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.5665361485283924, |
| "grad_norm": 0.7985761761665344, |
| "learning_rate": 0.00012506892832960296, |
| "loss": 1.2146, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.5680349214080971, |
| "grad_norm": 0.7901711463928223, |
| "learning_rate": 0.00012435123294900815, |
| "loss": 1.1876, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.5695336942878019, |
| "grad_norm": 0.8111531138420105, |
| "learning_rate": 0.00012363414191819368, |
| "loss": 1.1815, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.5710324671675066, |
| "grad_norm": 0.7858944535255432, |
| "learning_rate": 0.00012291767213363678, |
| "loss": 1.2132, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.5725312400472113, |
| "grad_norm": 0.8336951732635498, |
| "learning_rate": 0.00012220184047717647, |
| "loss": 1.1849, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.5740300129269161, |
| "grad_norm": 0.8228222131729126, |
| "learning_rate": 0.00012148666381561589, |
| "loss": 1.2119, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.5755287858066208, |
| "grad_norm": 0.7975912690162659, |
| "learning_rate": 0.0001207721590003248, |
| "loss": 1.1882, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.5770275586863256, |
| "grad_norm": 0.816520631313324, |
| "learning_rate": 0.00012005834286684263, |
| "loss": 1.2164, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.5785263315660303, |
| "grad_norm": 0.8109871745109558, |
| "learning_rate": 0.00011934523223448168, |
| "loss": 1.1933, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.5800251044457351, |
| "grad_norm": 0.8165502548217773, |
| "learning_rate": 0.00011863284390593089, |
| "loss": 1.2, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.5815238773254398, |
| "grad_norm": 0.7544243931770325, |
| "learning_rate": 0.00011792119466685983, |
| "loss": 1.2173, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.5830226502051445, |
| "grad_norm": 0.822040855884552, |
| "learning_rate": 0.00011721030128552338, |
| "loss": 1.205, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.5845214230848493, |
| "grad_norm": 0.8056491017341614, |
| "learning_rate": 0.0001165001805123664, |
| "loss": 1.2011, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.586020195964554, |
| "grad_norm": 0.8770938515663147, |
| "learning_rate": 0.00011579084907962914, |
| "loss": 1.1903, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.5875189688442588, |
| "grad_norm": 0.833992600440979, |
| "learning_rate": 0.0001150823237009531, |
| "loss": 1.193, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.5890177417239635, |
| "grad_norm": 0.8358356356620789, |
| "learning_rate": 0.00011437462107098694, |
| "loss": 1.191, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.5905165146036683, |
| "grad_norm": 0.9014224410057068, |
| "learning_rate": 0.00011366775786499347, |
| "loss": 1.2004, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.592015287483373, |
| "grad_norm": 0.8393827080726624, |
| "learning_rate": 0.00011296175073845642, |
| "loss": 1.2118, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.5935140603630777, |
| "grad_norm": 0.8268956542015076, |
| "learning_rate": 0.00011225661632668815, |
| "loss": 1.2083, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.5950128332427824, |
| "grad_norm": 0.8107516169548035, |
| "learning_rate": 0.00011155237124443766, |
| "loss": 1.2094, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.5965116061224872, |
| "grad_norm": 0.8380411267280579, |
| "learning_rate": 0.00011084903208549916, |
| "loss": 1.2159, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.598010379002192, |
| "grad_norm": 0.8165388107299805, |
| "learning_rate": 0.00011014661542232089, |
| "loss": 1.2024, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.5995091518818967, |
| "grad_norm": 0.8068062663078308, |
| "learning_rate": 0.00010944513780561495, |
| "loss": 1.1826, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.6007081701856605, |
| "eval_loss": 1.2374228239059448, |
| "eval_runtime": 34.9516, |
| "eval_samples_per_second": 715.274, |
| "eval_steps_per_second": 89.409, |
| "step": 4008 |
| }, |
| { |
| "epoch": 0.6010079247616015, |
| "grad_norm": 0.7994382381439209, |
| "learning_rate": 0.00010874461576396688, |
| "loss": 1.2043, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.6025066976413062, |
| "grad_norm": 0.8711692690849304, |
| "learning_rate": 0.00010804506580344664, |
| "loss": 1.2145, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.604005470521011, |
| "grad_norm": 0.8061762452125549, |
| "learning_rate": 0.00010734650440721944, |
| "loss": 1.1931, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.6055042434007156, |
| "grad_norm": 0.8066831827163696, |
| "learning_rate": 0.00010664894803515744, |
| "loss": 1.1848, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.6070030162804204, |
| "grad_norm": 0.8515334725379944, |
| "learning_rate": 0.00010595241312345186, |
| "loss": 1.2201, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.6085017891601251, |
| "grad_norm": 0.8231693506240845, |
| "learning_rate": 0.00010525691608422577, |
| "loss": 1.2027, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.6100005620398299, |
| "grad_norm": 0.8122180104255676, |
| "learning_rate": 0.00010456247330514733, |
| "loss": 1.1939, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.6114993349195347, |
| "grad_norm": 0.8237882852554321, |
| "learning_rate": 0.00010386910114904364, |
| "loss": 1.1879, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.6129981077992394, |
| "grad_norm": 0.8404508829116821, |
| "learning_rate": 0.00010317681595351525, |
| "loss": 1.201, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.6144968806789441, |
| "grad_norm": 0.7942978143692017, |
| "learning_rate": 0.00010248563403055112, |
| "loss": 1.1978, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.6159956535586488, |
| "grad_norm": 0.8365768194198608, |
| "learning_rate": 0.00010179557166614439, |
| "loss": 1.1903, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.6174944264383536, |
| "grad_norm": 0.8552048802375793, |
| "learning_rate": 0.00010110664511990852, |
| "loss": 1.1894, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.6189931993180583, |
| "grad_norm": 0.7810205817222595, |
| "learning_rate": 0.00010041887062469425, |
| "loss": 1.2134, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.6204919721977631, |
| "grad_norm": 0.8970271348953247, |
| "learning_rate": 9.973226438620703e-05, |
| "loss": 1.1936, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.6219907450774679, |
| "grad_norm": 0.8107681274414062, |
| "learning_rate": 9.904684258262535e-05, |
| "loss": 1.1842, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.6234895179571726, |
| "grad_norm": 0.7766697406768799, |
| "learning_rate": 9.836262136421924e-05, |
| "loss": 1.2083, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.6249882908368773, |
| "grad_norm": 0.818970799446106, |
| "learning_rate": 9.767961685297012e-05, |
| "loss": 1.2042, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.626487063716582, |
| "grad_norm": 0.8687489628791809, |
| "learning_rate": 9.699784514219056e-05, |
| "loss": 1.2028, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.6279858365962868, |
| "grad_norm": 0.8318779468536377, |
| "learning_rate": 9.631732229614529e-05, |
| "loss": 1.1856, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.6294846094759915, |
| "grad_norm": 0.8384169340133667, |
| "learning_rate": 9.56380643496726e-05, |
| "loss": 1.1887, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.6309833823556963, |
| "grad_norm": 0.8248523473739624, |
| "learning_rate": 9.496008730780657e-05, |
| "loss": 1.1771, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.632482155235401, |
| "grad_norm": 0.8473740220069885, |
| "learning_rate": 9.428340714539999e-05, |
| "loss": 1.1899, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.6339809281151058, |
| "grad_norm": 0.8694414496421814, |
| "learning_rate": 9.360803980674773e-05, |
| "loss": 1.2023, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.6354797009948105, |
| "grad_norm": 0.7950302958488464, |
| "learning_rate": 9.29340012052114e-05, |
| "loss": 1.1996, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.6369784738745152, |
| "grad_norm": 0.8233305215835571, |
| "learning_rate": 9.226130722284413e-05, |
| "loss": 1.1922, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.63847724675422, |
| "grad_norm": 0.894221842288971, |
| "learning_rate": 9.158997371001634e-05, |
| "loss": 1.1839, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.6399760196339247, |
| "grad_norm": 0.8127320408821106, |
| "learning_rate": 9.092001648504245e-05, |
| "loss": 1.186, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.6414747925136295, |
| "grad_norm": 0.8138620257377625, |
| "learning_rate": 9.025145133380806e-05, |
| "loss": 1.1845, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.6429735653933342, |
| "grad_norm": 0.7820717692375183, |
| "learning_rate": 8.958429400939794e-05, |
| "loss": 1.2, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.644472338273039, |
| "grad_norm": 0.8710906505584717, |
| "learning_rate": 8.891856023172496e-05, |
| "loss": 1.1838, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.6459711111527436, |
| "grad_norm": 0.8574820756912231, |
| "learning_rate": 8.825426568715958e-05, |
| "loss": 1.1876, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.6474698840324484, |
| "grad_norm": 0.8116622567176819, |
| "learning_rate": 8.759142602816032e-05, |
| "loss": 1.1908, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.6489686569121532, |
| "grad_norm": 0.82753586769104, |
| "learning_rate": 8.693005687290486e-05, |
| "loss": 1.1915, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.6504674297918579, |
| "grad_norm": 0.8335912227630615, |
| "learning_rate": 8.627017380492228e-05, |
| "loss": 1.191, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.6519662026715627, |
| "grad_norm": 0.8342856168746948, |
| "learning_rate": 8.561179237272537e-05, |
| "loss": 1.2059, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.6534649755512674, |
| "grad_norm": 0.8831748366355896, |
| "learning_rate": 8.495492808944492e-05, |
| "loss": 1.1895, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.6549637484309722, |
| "grad_norm": 0.7899265885353088, |
| "learning_rate": 8.429959643246359e-05, |
| "loss": 1.1913, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.6564625213106768, |
| "grad_norm": 0.7902125716209412, |
| "learning_rate": 8.364581284305171e-05, |
| "loss": 1.1883, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.6579612941903816, |
| "grad_norm": 0.8503438830375671, |
| "learning_rate": 8.299359272600301e-05, |
| "loss": 1.1887, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.6594600670700864, |
| "grad_norm": 0.8556089997291565, |
| "learning_rate": 8.234295144927204e-05, |
| "loss": 1.1751, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.6609588399497911, |
| "grad_norm": 0.837721586227417, |
| "learning_rate": 8.169390434361184e-05, |
| "loss": 1.1833, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.6624576128294959, |
| "grad_norm": 0.8476043343544006, |
| "learning_rate": 8.104646670221263e-05, |
| "loss": 1.2005, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.6639563857092006, |
| "grad_norm": 0.8832587599754333, |
| "learning_rate": 8.040065378034176e-05, |
| "loss": 1.1833, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.6654551585889054, |
| "grad_norm": 0.8358840346336365, |
| "learning_rate": 7.975648079498393e-05, |
| "loss": 1.1691, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.66695393146861, |
| "grad_norm": 0.8542131185531616, |
| "learning_rate": 7.911396292448295e-05, |
| "loss": 1.1891, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.6684527043483148, |
| "grad_norm": 0.7880673408508301, |
| "learning_rate": 7.847311530818372e-05, |
| "loss": 1.1668, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.6699514772280195, |
| "grad_norm": 0.8573721051216125, |
| "learning_rate": 7.783395304607596e-05, |
| "loss": 1.1895, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.6714502501077243, |
| "grad_norm": 0.8283008337020874, |
| "learning_rate": 7.719649119843801e-05, |
| "loss": 1.1685, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.6729490229874291, |
| "grad_norm": 0.8286172151565552, |
| "learning_rate": 7.656074478548231e-05, |
| "loss": 1.1723, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.6744477958671338, |
| "grad_norm": 0.9178438782691956, |
| "learning_rate": 7.592672878700118e-05, |
| "loss": 1.1861, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.6759465687468386, |
| "grad_norm": 0.8172318339347839, |
| "learning_rate": 7.529445814201399e-05, |
| "loss": 1.1899, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.6774453416265432, |
| "grad_norm": 0.8141478896141052, |
| "learning_rate": 7.466394774841536e-05, |
| "loss": 1.1707, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.678944114506248, |
| "grad_norm": 0.8715807795524597, |
| "learning_rate": 7.40352124626237e-05, |
| "loss": 1.1672, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.6804428873859527, |
| "grad_norm": 0.909372091293335, |
| "learning_rate": 7.340826709923161e-05, |
| "loss": 1.1773, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.6819416602656575, |
| "grad_norm": 0.7548136115074158, |
| "learning_rate": 7.278312643065637e-05, |
| "loss": 1.1886, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.6834404331453622, |
| "grad_norm": 0.8248059749603271, |
| "learning_rate": 7.215980518679235e-05, |
| "loss": 1.1779, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.684939206025067, |
| "grad_norm": 0.8389328122138977, |
| "learning_rate": 7.153831805466337e-05, |
| "loss": 1.1894, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.6864379789047718, |
| "grad_norm": 0.8137337565422058, |
| "learning_rate": 7.091867967807722e-05, |
| "loss": 1.1864, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.6879367517844764, |
| "grad_norm": 0.884548008441925, |
| "learning_rate": 7.030090465728023e-05, |
| "loss": 1.198, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.6894355246641812, |
| "grad_norm": 0.803050696849823, |
| "learning_rate": 6.968500754861329e-05, |
| "loss": 1.1778, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.6909342975438859, |
| "grad_norm": 0.870222270488739, |
| "learning_rate": 6.907100286416906e-05, |
| "loss": 1.174, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.6924330704235907, |
| "grad_norm": 0.8954461812973022, |
| "learning_rate": 6.845890507144973e-05, |
| "loss": 1.1967, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.6939318433032954, |
| "grad_norm": 0.8398353457450867, |
| "learning_rate": 6.784872859302653e-05, |
| "loss": 1.1678, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.6954306161830002, |
| "grad_norm": 0.8398712277412415, |
| "learning_rate": 6.724048780619943e-05, |
| "loss": 1.1912, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.696929389062705, |
| "grad_norm": 0.7871512770652771, |
| "learning_rate": 6.663419704265887e-05, |
| "loss": 1.1626, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.6984281619424096, |
| "grad_norm": 0.7826485633850098, |
| "learning_rate": 6.602987058814751e-05, |
| "loss": 1.1594, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.6999269348221144, |
| "grad_norm": 0.8265202641487122, |
| "learning_rate": 6.542752268212422e-05, |
| "loss": 1.1572, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.7008261985499372, |
| "eval_loss": 1.2169182300567627, |
| "eval_runtime": 34.6978, |
| "eval_samples_per_second": 720.507, |
| "eval_steps_per_second": 90.063, |
| "step": 4676 |
| }, |
| { |
| "epoch": 0.7014257077018191, |
| "grad_norm": 0.8427773714065552, |
| "learning_rate": 6.482716751742804e-05, |
| "loss": 1.1973, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.7029244805815239, |
| "grad_norm": 0.886134684085846, |
| "learning_rate": 6.422881923994411e-05, |
| "loss": 1.1851, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.7044232534612286, |
| "grad_norm": 0.8166589140892029, |
| "learning_rate": 6.363249194827026e-05, |
| "loss": 1.1851, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.7059220263409334, |
| "grad_norm": 0.9108282923698425, |
| "learning_rate": 6.303819969338465e-05, |
| "loss": 1.1689, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.707420799220638, |
| "grad_norm": 0.927238404750824, |
| "learning_rate": 6.2445956478315e-05, |
| "loss": 1.1975, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.7089195721003428, |
| "grad_norm": 0.8622543811798096, |
| "learning_rate": 6.185577625780826e-05, |
| "loss": 1.1834, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.7104183449800476, |
| "grad_norm": 0.8211912512779236, |
| "learning_rate": 6.126767293800227e-05, |
| "loss": 1.1775, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.7119171178597523, |
| "grad_norm": 0.8494730591773987, |
| "learning_rate": 6.0681660376097654e-05, |
| "loss": 1.1705, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.7134158907394571, |
| "grad_norm": 0.8458199501037598, |
| "learning_rate": 6.00977523800315e-05, |
| "loss": 1.1944, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.7149146636191618, |
| "grad_norm": 0.8529589176177979, |
| "learning_rate": 5.951596270815212e-05, |
| "loss": 1.1913, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.7164134364988666, |
| "grad_norm": 0.8020589351654053, |
| "learning_rate": 5.893630506889463e-05, |
| "loss": 1.1746, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.7179122093785713, |
| "grad_norm": 0.8732789158821106, |
| "learning_rate": 5.835879312045821e-05, |
| "loss": 1.1859, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.719410982258276, |
| "grad_norm": 0.8340930342674255, |
| "learning_rate": 5.7783440470483965e-05, |
| "loss": 1.1795, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.7209097551379807, |
| "grad_norm": 0.816852867603302, |
| "learning_rate": 5.7210260675734656e-05, |
| "loss": 1.1825, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.7224085280176855, |
| "grad_norm": 0.9594895243644714, |
| "learning_rate": 5.663926724177489e-05, |
| "loss": 1.1665, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.7239073008973903, |
| "grad_norm": 0.8372864127159119, |
| "learning_rate": 5.6070473622653293e-05, |
| "loss": 1.1694, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.725406073777095, |
| "grad_norm": 0.8622620105743408, |
| "learning_rate": 5.5503893220585096e-05, |
| "loss": 1.1676, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.7269048466567998, |
| "grad_norm": 0.8314744234085083, |
| "learning_rate": 5.493953938563666e-05, |
| "loss": 1.1847, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.7284036195365045, |
| "grad_norm": 0.7822853326797485, |
| "learning_rate": 5.437742541541085e-05, |
| "loss": 1.1617, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.7299023924162092, |
| "grad_norm": 0.8349812030792236, |
| "learning_rate": 5.381756455473346e-05, |
| "loss": 1.167, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.7314011652959139, |
| "grad_norm": 0.8560863733291626, |
| "learning_rate": 5.3259969995341535e-05, |
| "loss": 1.1723, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.7328999381756187, |
| "grad_norm": 0.8179499506950378, |
| "learning_rate": 5.270465487557218e-05, |
| "loss": 1.1685, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.7343987110553235, |
| "grad_norm": 0.8827424645423889, |
| "learning_rate": 5.215163228005328e-05, |
| "loss": 1.1701, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.7358974839350282, |
| "grad_norm": 0.847288191318512, |
| "learning_rate": 5.16009152393949e-05, |
| "loss": 1.1827, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.737396256814733, |
| "grad_norm": 0.8526114225387573, |
| "learning_rate": 5.105251672988256e-05, |
| "loss": 1.2011, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.7388950296944377, |
| "grad_norm": 0.9031111598014832, |
| "learning_rate": 5.050644967317117e-05, |
| "loss": 1.1769, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.7403938025741424, |
| "grad_norm": 0.8632171154022217, |
| "learning_rate": 4.996272693598088e-05, |
| "loss": 1.1855, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.7418925754538471, |
| "grad_norm": 0.8957777619361877, |
| "learning_rate": 4.9421361329793593e-05, |
| "loss": 1.162, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.7433913483335519, |
| "grad_norm": 0.8075733184814453, |
| "learning_rate": 4.888236561055135e-05, |
| "loss": 1.1684, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.7448901212132566, |
| "grad_norm": 0.8080265522003174, |
| "learning_rate": 4.834575247835571e-05, |
| "loss": 1.1775, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.7463888940929614, |
| "grad_norm": 0.8255054950714111, |
| "learning_rate": 4.7811534577168265e-05, |
| "loss": 1.2002, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.7478876669726662, |
| "grad_norm": 0.8013522624969482, |
| "learning_rate": 4.7279724494513196e-05, |
| "loss": 1.1784, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.7493864398523709, |
| "grad_norm": 0.8615186810493469, |
| "learning_rate": 4.675033476118002e-05, |
| "loss": 1.169, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.7508852127320756, |
| "grad_norm": 0.8255153894424438, |
| "learning_rate": 4.622337785092908e-05, |
| "loss": 1.1701, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.7523839856117803, |
| "grad_norm": 0.9061243534088135, |
| "learning_rate": 4.569886618019698e-05, |
| "loss": 1.1844, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.7538827584914851, |
| "grad_norm": 0.8674840927124023, |
| "learning_rate": 4.517681210780446e-05, |
| "loss": 1.1731, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.7553815313711898, |
| "grad_norm": 0.8248318433761597, |
| "learning_rate": 4.465722793466503e-05, |
| "loss": 1.1787, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.7568803042508946, |
| "grad_norm": 0.8240404725074768, |
| "learning_rate": 4.414012590349503e-05, |
| "loss": 1.1802, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.7583790771305994, |
| "grad_norm": 0.9083341956138611, |
| "learning_rate": 4.362551819852536e-05, |
| "loss": 1.1687, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.7598778500103041, |
| "grad_norm": 0.929366409778595, |
| "learning_rate": 4.3113416945214186e-05, |
| "loss": 1.2057, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.7613766228900088, |
| "grad_norm": 0.8615067005157471, |
| "learning_rate": 4.26038342099615e-05, |
| "loss": 1.172, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.7628753957697135, |
| "grad_norm": 0.8247563242912292, |
| "learning_rate": 4.209678199982441e-05, |
| "loss": 1.188, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.7643741686494183, |
| "grad_norm": 0.9103371500968933, |
| "learning_rate": 4.1592272262234714e-05, |
| "loss": 1.1673, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.765872941529123, |
| "grad_norm": 0.9434977769851685, |
| "learning_rate": 4.109031688471692e-05, |
| "loss": 1.1764, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.7673717144088278, |
| "grad_norm": 0.912162721157074, |
| "learning_rate": 4.059092769460852e-05, |
| "loss": 1.1589, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.7688704872885325, |
| "grad_norm": 0.9519455432891846, |
| "learning_rate": 4.009411645878097e-05, |
| "loss": 1.1769, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.7703692601682373, |
| "grad_norm": 0.8289220929145813, |
| "learning_rate": 3.9599894883362757e-05, |
| "loss": 1.1705, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.771868033047942, |
| "grad_norm": 0.8658756613731384, |
| "learning_rate": 3.910827461346339e-05, |
| "loss": 1.1668, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.7733668059276467, |
| "grad_norm": 0.8500393629074097, |
| "learning_rate": 3.8619267232898974e-05, |
| "loss": 1.1676, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.7748655788073515, |
| "grad_norm": 0.8148017525672913, |
| "learning_rate": 3.813288426391946e-05, |
| "loss": 1.1767, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.7763643516870562, |
| "grad_norm": 0.868813693523407, |
| "learning_rate": 3.7649137166936865e-05, |
| "loss": 1.1652, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.777863124566761, |
| "grad_norm": 0.8340365290641785, |
| "learning_rate": 3.716803734025559e-05, |
| "loss": 1.1581, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.7793618974464657, |
| "grad_norm": 0.8384237885475159, |
| "learning_rate": 3.668959611980345e-05, |
| "loss": 1.1673, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.7808606703261705, |
| "grad_norm": 0.9455569386482239, |
| "learning_rate": 3.6213824778865e-05, |
| "loss": 1.1678, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.7823594432058751, |
| "grad_norm": 0.8394315838813782, |
| "learning_rate": 3.574073452781544e-05, |
| "loss": 1.1683, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.7838582160855799, |
| "grad_norm": 0.8814610242843628, |
| "learning_rate": 3.527033651385699e-05, |
| "loss": 1.1647, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.7853569889652847, |
| "grad_norm": 0.8106175661087036, |
| "learning_rate": 3.480264182075573e-05, |
| "loss": 1.1664, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.7868557618449894, |
| "grad_norm": 0.8509274125099182, |
| "learning_rate": 3.4337661468580715e-05, |
| "loss": 1.1634, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.7883545347246942, |
| "grad_norm": 0.8374130725860596, |
| "learning_rate": 3.387540641344441e-05, |
| "loss": 1.1955, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.7898533076043989, |
| "grad_norm": 0.8402609825134277, |
| "learning_rate": 3.34158875472442e-05, |
| "loss": 1.1601, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.7913520804841037, |
| "grad_norm": 0.8734246492385864, |
| "learning_rate": 3.29591156974061e-05, |
| "loss": 1.1602, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.7928508533638083, |
| "grad_norm": 0.9936115741729736, |
| "learning_rate": 3.250510162662933e-05, |
| "loss": 1.1824, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.7943496262435131, |
| "grad_norm": 0.8631161451339722, |
| "learning_rate": 3.2053856032633016e-05, |
| "loss": 1.1679, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.7958483991232179, |
| "grad_norm": 0.8633742928504944, |
| "learning_rate": 3.160538954790385e-05, |
| "loss": 1.1603, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.7973471720029226, |
| "grad_norm": 0.8703827857971191, |
| "learning_rate": 3.1159712739445755e-05, |
| "loss": 1.1476, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.7988459448826274, |
| "grad_norm": 0.9506652355194092, |
| "learning_rate": 3.071683610853085e-05, |
| "loss": 1.1796, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.8003447177623321, |
| "grad_norm": 0.8602906465530396, |
| "learning_rate": 3.0276770090451873e-05, |
| "loss": 1.1754, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.800944226914214, |
| "eval_loss": 1.2033511400222778, |
| "eval_runtime": 34.2855, |
| "eval_samples_per_second": 729.172, |
| "eval_steps_per_second": 91.146, |
| "step": 5344 |
| }, |
| { |
| "epoch": 0.8018434906420369, |
| "grad_norm": 0.8773970007896423, |
| "learning_rate": 2.983952505427659e-05, |
| "loss": 1.1674, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.8033422635217415, |
| "grad_norm": 0.91343754529953, |
| "learning_rate": 2.940511130260314e-05, |
| "loss": 1.1703, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.8048410364014463, |
| "grad_norm": 0.8615610003471375, |
| "learning_rate": 2.8973539071317558e-05, |
| "loss": 1.158, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.806339809281151, |
| "grad_norm": 0.8163581490516663, |
| "learning_rate": 2.8544818529352408e-05, |
| "loss": 1.1588, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.8078385821608558, |
| "grad_norm": 0.8749719262123108, |
| "learning_rate": 2.8118959778447318e-05, |
| "loss": 1.1594, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.8093373550405606, |
| "grad_norm": 0.8937957882881165, |
| "learning_rate": 2.7695972852910774e-05, |
| "loss": 1.1598, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.8108361279202653, |
| "grad_norm": 0.8792014122009277, |
| "learning_rate": 2.7275867719383954e-05, |
| "loss": 1.1771, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.8123349007999701, |
| "grad_norm": 0.9492007493972778, |
| "learning_rate": 2.6858654276605536e-05, |
| "loss": 1.1764, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.8138336736796747, |
| "grad_norm": 0.8084315061569214, |
| "learning_rate": 2.6444342355178816e-05, |
| "loss": 1.1667, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.8153324465593795, |
| "grad_norm": 0.8339678049087524, |
| "learning_rate": 2.6032941717339882e-05, |
| "loss": 1.1612, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.8168312194390842, |
| "grad_norm": 0.8784351348876953, |
| "learning_rate": 2.5624462056727563e-05, |
| "loss": 1.1567, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.818329992318789, |
| "grad_norm": 0.8871691226959229, |
| "learning_rate": 2.521891299815515e-05, |
| "loss": 1.1672, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.8198287651984937, |
| "grad_norm": 0.8839700222015381, |
| "learning_rate": 2.4816304097383462e-05, |
| "loss": 1.1592, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.8213275380781985, |
| "grad_norm": 0.931260883808136, |
| "learning_rate": 2.4416644840895912e-05, |
| "loss": 1.1625, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.8228263109579033, |
| "grad_norm": 0.9054010510444641, |
| "learning_rate": 2.4019944645674595e-05, |
| "loss": 1.165, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.8243250838376079, |
| "grad_norm": 0.9103521704673767, |
| "learning_rate": 2.3626212858978894e-05, |
| "loss": 1.1598, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.8258238567173127, |
| "grad_norm": 0.8303549885749817, |
| "learning_rate": 2.3235458758124876e-05, |
| "loss": 1.1667, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.8273226295970174, |
| "grad_norm": 0.9100333452224731, |
| "learning_rate": 2.284769155026678e-05, |
| "loss": 1.1634, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.8288214024767222, |
| "grad_norm": 0.841582715511322, |
| "learning_rate": 2.2462920372180154e-05, |
| "loss": 1.1561, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.8303201753564269, |
| "grad_norm": 0.8073222637176514, |
| "learning_rate": 2.2081154290046445e-05, |
| "loss": 1.1584, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.8318189482361317, |
| "grad_norm": 0.8581207990646362, |
| "learning_rate": 2.170240229923954e-05, |
| "loss": 1.1547, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.8333177211158365, |
| "grad_norm": 0.859908401966095, |
| "learning_rate": 2.1326673324113603e-05, |
| "loss": 1.1783, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.8348164939955411, |
| "grad_norm": 0.8941123485565186, |
| "learning_rate": 2.0953976217792995e-05, |
| "loss": 1.1543, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.8363152668752459, |
| "grad_norm": 0.8776352405548096, |
| "learning_rate": 2.0584319761963532e-05, |
| "loss": 1.1656, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.8378140397549506, |
| "grad_norm": 0.8063647150993347, |
| "learning_rate": 2.021771266666568e-05, |
| "loss": 1.1668, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.8393128126346554, |
| "grad_norm": 0.856473445892334, |
| "learning_rate": 1.9854163570089175e-05, |
| "loss": 1.1535, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.8408115855143601, |
| "grad_norm": 0.7813337445259094, |
| "learning_rate": 1.9493681038369634e-05, |
| "loss": 1.163, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.8423103583940649, |
| "grad_norm": 0.854742705821991, |
| "learning_rate": 1.9136273565386674e-05, |
| "loss": 1.1579, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.8438091312737696, |
| "grad_norm": 0.8952386975288391, |
| "learning_rate": 1.8781949572563682e-05, |
| "loss": 1.167, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.8453079041534743, |
| "grad_norm": 0.8518572449684143, |
| "learning_rate": 1.843071740866957e-05, |
| "loss": 1.1531, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.8468066770331791, |
| "grad_norm": 0.9791853427886963, |
| "learning_rate": 1.808258534962179e-05, |
| "loss": 1.1657, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.8483054499128838, |
| "grad_norm": 0.8777291178703308, |
| "learning_rate": 1.7737561598291644e-05, |
| "loss": 1.1598, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.8498042227925886, |
| "grad_norm": 0.9136359095573425, |
| "learning_rate": 1.7395654284310743e-05, |
| "loss": 1.1567, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.8513029956722933, |
| "grad_norm": 0.8480714559555054, |
| "learning_rate": 1.7056871463879616e-05, |
| "loss": 1.1711, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.8528017685519981, |
| "grad_norm": 0.9030656814575195, |
| "learning_rate": 1.6721221119577778e-05, |
| "loss": 1.1501, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.8543005414317028, |
| "grad_norm": 0.845726490020752, |
| "learning_rate": 1.6388711160175744e-05, |
| "loss": 1.1734, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.8557993143114075, |
| "grad_norm": 0.8673616647720337, |
| "learning_rate": 1.6059349420448566e-05, |
| "loss": 1.1442, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.8572980871911122, |
| "grad_norm": 0.8604286909103394, |
| "learning_rate": 1.5733143660991354e-05, |
| "loss": 1.1631, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.858796860070817, |
| "grad_norm": 0.9111164808273315, |
| "learning_rate": 1.5410101568036266e-05, |
| "loss": 1.1671, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.8602956329505218, |
| "grad_norm": 0.8710470199584961, |
| "learning_rate": 1.509023075327151e-05, |
| "loss": 1.1632, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.8617944058302265, |
| "grad_norm": 1.0082249641418457, |
| "learning_rate": 1.4773538753662006e-05, |
| "loss": 1.1686, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.8632931787099313, |
| "grad_norm": 0.8322862386703491, |
| "learning_rate": 1.4460033031271707e-05, |
| "loss": 1.1677, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.864791951589636, |
| "grad_norm": 0.897473156452179, |
| "learning_rate": 1.4149720973087814e-05, |
| "loss": 1.1712, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.8662907244693407, |
| "grad_norm": 0.8400557041168213, |
| "learning_rate": 1.3842609890846795e-05, |
| "loss": 1.1481, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.8677894973490454, |
| "grad_norm": 0.8493039608001709, |
| "learning_rate": 1.353870702086195e-05, |
| "loss": 1.1645, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.8692882702287502, |
| "grad_norm": 0.9090917110443115, |
| "learning_rate": 1.3238019523853043e-05, |
| "loss": 1.1598, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.870787043108455, |
| "grad_norm": 0.8982146978378296, |
| "learning_rate": 1.2940554484777498e-05, |
| "loss": 1.1804, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.8722858159881597, |
| "grad_norm": 0.8175159096717834, |
| "learning_rate": 1.2646318912663522e-05, |
| "loss": 1.1681, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.8737845888678645, |
| "grad_norm": 0.7891288995742798, |
| "learning_rate": 1.235531974044484e-05, |
| "loss": 1.1586, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.8752833617475692, |
| "grad_norm": 0.8779332041740417, |
| "learning_rate": 1.2067563824797516e-05, |
| "loss": 1.1588, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.876782134627274, |
| "grad_norm": 0.882290244102478, |
| "learning_rate": 1.1783057945978203e-05, |
| "loss": 1.1521, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.8782809075069786, |
| "grad_norm": 0.8421468138694763, |
| "learning_rate": 1.1501808807664547e-05, |
| "loss": 1.1295, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.8797796803866834, |
| "grad_norm": 0.8814191222190857, |
| "learning_rate": 1.122382303679708e-05, |
| "loss": 1.1588, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.8812784532663881, |
| "grad_norm": 0.819195568561554, |
| "learning_rate": 1.0949107183423205e-05, |
| "loss": 1.1536, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.8827772261460929, |
| "grad_norm": 0.9204806685447693, |
| "learning_rate": 1.067766772054281e-05, |
| "loss": 1.1613, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.8842759990257977, |
| "grad_norm": 0.9718281030654907, |
| "learning_rate": 1.0409511043955664e-05, |
| "loss": 1.1609, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.8857747719055024, |
| "grad_norm": 0.8659542202949524, |
| "learning_rate": 1.0144643472110919e-05, |
| "loss": 1.1701, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.8872735447852071, |
| "grad_norm": 0.8663217425346375, |
| "learning_rate": 9.883071245957964e-06, |
| "loss": 1.1524, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.8887723176649118, |
| "grad_norm": 0.8552528023719788, |
| "learning_rate": 9.624800528799648e-06, |
| "loss": 1.1732, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.8902710905446166, |
| "grad_norm": 0.8265079855918884, |
| "learning_rate": 9.369837406146802e-06, |
| "loss": 1.1497, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.8917698634243213, |
| "grad_norm": 0.9253767132759094, |
| "learning_rate": 9.118187885575096e-06, |
| "loss": 1.1591, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.8932686363040261, |
| "grad_norm": 0.8842642307281494, |
| "learning_rate": 8.869857896583204e-06, |
| "loss": 1.1541, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.8947674091837308, |
| "grad_norm": 0.8671165108680725, |
| "learning_rate": 8.624853290453438e-06, |
| "loss": 1.1563, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.8962661820634356, |
| "grad_norm": 0.8760147094726562, |
| "learning_rate": 8.383179840113497e-06, |
| "loss": 1.1505, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.8977649549431403, |
| "grad_norm": 0.8540050983428955, |
| "learning_rate": 8.144843240000737e-06, |
| "loss": 1.144, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.899263727822845, |
| "grad_norm": 0.9016329050064087, |
| "learning_rate": 7.909849105927907e-06, |
| "loss": 1.1496, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.9007625007025498, |
| "grad_norm": 0.8828489780426025, |
| "learning_rate": 7.678202974950687e-06, |
| "loss": 1.1526, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.9010622552784907, |
| "eval_loss": 1.1948587894439697, |
| "eval_runtime": 36.3071, |
| "eval_samples_per_second": 688.57, |
| "eval_steps_per_second": 86.071, |
| "step": 6012 |
| }, |
| { |
| "epoch": 0.9022612735822545, |
| "grad_norm": 0.8661078810691833, |
| "learning_rate": 7.4499103052374945e-06, |
| "loss": 1.1705, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.9037600464619593, |
| "grad_norm": 0.8815616369247437, |
| "learning_rate": 7.224976475940603e-06, |
| "loss": 1.1633, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.905258819341664, |
| "grad_norm": 0.8972934484481812, |
| "learning_rate": 7.00340678706961e-06, |
| "loss": 1.153, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.9067575922213688, |
| "grad_norm": 0.8003550171852112, |
| "learning_rate": 6.785206459366355e-06, |
| "loss": 1.168, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.9082563651010735, |
| "grad_norm": 0.856256902217865, |
| "learning_rate": 6.570380634182098e-06, |
| "loss": 1.156, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.9097551379807782, |
| "grad_norm": 0.8487430214881897, |
| "learning_rate": 6.3589343733563055e-06, |
| "loss": 1.1568, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.911253910860483, |
| "grad_norm": 0.8809577226638794, |
| "learning_rate": 6.150872659097255e-06, |
| "loss": 1.1517, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.9127526837401877, |
| "grad_norm": 0.8905763626098633, |
| "learning_rate": 5.946200393864886e-06, |
| "loss": 1.1632, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.9142514566198925, |
| "grad_norm": 0.8179495334625244, |
| "learning_rate": 5.74492240025502e-06, |
| "loss": 1.1528, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.9157502294995972, |
| "grad_norm": 0.9552949070930481, |
| "learning_rate": 5.547043420886005e-06, |
| "loss": 1.1442, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.917249002379302, |
| "grad_norm": 0.9348698854446411, |
| "learning_rate": 5.352568118286671e-06, |
| "loss": 1.1548, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.9187477752590066, |
| "grad_norm": 0.8429141640663147, |
| "learning_rate": 5.16150107478675e-06, |
| "loss": 1.1619, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.9202465481387114, |
| "grad_norm": 0.8613331913948059, |
| "learning_rate": 4.973846792408681e-06, |
| "loss": 1.1575, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.9217453210184162, |
| "grad_norm": 0.8871680498123169, |
| "learning_rate": 4.7896096927616925e-06, |
| "loss": 1.1686, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.9232440938981209, |
| "grad_norm": 0.850810170173645, |
| "learning_rate": 4.608794116937487e-06, |
| "loss": 1.1697, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.9247428667778257, |
| "grad_norm": 0.8758169412612915, |
| "learning_rate": 4.4314043254080725e-06, |
| "loss": 1.1813, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.9262416396575304, |
| "grad_norm": 0.8653439879417419, |
| "learning_rate": 4.257444497925328e-06, |
| "loss": 1.1517, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.9277404125372352, |
| "grad_norm": 0.8698475360870361, |
| "learning_rate": 4.086918733422429e-06, |
| "loss": 1.1531, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.9292391854169398, |
| "grad_norm": 0.9468777179718018, |
| "learning_rate": 3.919831049917444e-06, |
| "loss": 1.1491, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.9307379582966446, |
| "grad_norm": 0.8888046741485596, |
| "learning_rate": 3.7561853844185084e-06, |
| "loss": 1.1593, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.9322367311763493, |
| "grad_norm": 0.926652729511261, |
| "learning_rate": 3.595985592831102e-06, |
| "loss": 1.1669, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.9337355040560541, |
| "grad_norm": 0.8611651659011841, |
| "learning_rate": 3.43923544986725e-06, |
| "loss": 1.1625, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.9352342769357589, |
| "grad_norm": 0.8281158804893494, |
| "learning_rate": 3.285938648956482e-06, |
| "loss": 1.1468, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.9367330498154636, |
| "grad_norm": 0.833707332611084, |
| "learning_rate": 3.1360988021589483e-06, |
| "loss": 1.1656, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.9382318226951684, |
| "grad_norm": 0.912096381187439, |
| "learning_rate": 2.989719440080124e-06, |
| "loss": 1.1436, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.939730595574873, |
| "grad_norm": 0.8979214429855347, |
| "learning_rate": 2.8468040117878065e-06, |
| "loss": 1.1524, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.9412293684545778, |
| "grad_norm": 0.8215987682342529, |
| "learning_rate": 2.70735588473065e-06, |
| "loss": 1.157, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.9427281413342825, |
| "grad_norm": 0.8943039774894714, |
| "learning_rate": 2.571378344659042e-06, |
| "loss": 1.16, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.9442269142139873, |
| "grad_norm": 0.8807237148284912, |
| "learning_rate": 2.438874595547485e-06, |
| "loss": 1.1391, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.9457256870936921, |
| "grad_norm": 0.8931160569190979, |
| "learning_rate": 2.3098477595192566e-06, |
| "loss": 1.1675, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.9472244599733968, |
| "grad_norm": 0.8743401765823364, |
| "learning_rate": 2.1843008767726823e-06, |
| "loss": 1.1542, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.9487232328531016, |
| "grad_norm": 0.8389233946800232, |
| "learning_rate": 2.062236905509712e-06, |
| "loss": 1.1517, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.9502220057328062, |
| "grad_norm": 0.911375880241394, |
| "learning_rate": 1.9436587218659593e-06, |
| "loss": 1.1294, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.951720778612511, |
| "grad_norm": 0.8729566931724548, |
| "learning_rate": 1.828569119843204e-06, |
| "loss": 1.1672, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.9532195514922157, |
| "grad_norm": 0.9174323081970215, |
| "learning_rate": 1.716970811243329e-06, |
| "loss": 1.1851, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.9547183243719205, |
| "grad_norm": 0.8884388208389282, |
| "learning_rate": 1.6088664256045713e-06, |
| "loss": 1.1473, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.9562170972516252, |
| "grad_norm": 0.8815491795539856, |
| "learning_rate": 1.5042585101395055e-06, |
| "loss": 1.1601, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.95771587013133, |
| "grad_norm": 0.869717538356781, |
| "learning_rate": 1.4031495296749906e-06, |
| "loss": 1.1522, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.9592146430110348, |
| "grad_norm": 1.0628986358642578, |
| "learning_rate": 1.3055418665942009e-06, |
| "loss": 1.1279, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.9607134158907394, |
| "grad_norm": 0.8930190801620483, |
| "learning_rate": 1.21143782078037e-06, |
| "loss": 1.1602, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.9622121887704442, |
| "grad_norm": 1.1220424175262451, |
| "learning_rate": 1.1208396095626682e-06, |
| "loss": 1.1703, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.9637109616501489, |
| "grad_norm": 0.8779110312461853, |
| "learning_rate": 1.0337493676639442e-06, |
| "loss": 1.1473, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.9652097345298537, |
| "grad_norm": 0.9182150363922119, |
| "learning_rate": 9.501691471504146e-07, |
| "loss": 1.164, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.9667085074095584, |
| "grad_norm": 0.8875131607055664, |
| "learning_rate": 8.70100917383354e-07, |
| "loss": 1.1555, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.9682072802892632, |
| "grad_norm": 0.9799543619155884, |
| "learning_rate": 7.935465649726136e-07, |
| "loss": 1.1604, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.969706053168968, |
| "grad_norm": 0.8576260805130005, |
| "learning_rate": 7.205078937322417e-07, |
| "loss": 1.1585, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.9712048260486726, |
| "grad_norm": 0.9388023614883423, |
| "learning_rate": 6.50986624637917e-07, |
| "loss": 1.1675, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.9727035989283774, |
| "grad_norm": 0.8880642056465149, |
| "learning_rate": 5.849843957864808e-07, |
| "loss": 1.1591, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.9742023718080821, |
| "grad_norm": 0.9257229566574097, |
| "learning_rate": 5.225027623572686e-07, |
| "loss": 1.1445, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.9757011446877869, |
| "grad_norm": 0.9189469814300537, |
| "learning_rate": 4.635431965754888e-07, |
| "loss": 1.143, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.9771999175674916, |
| "grad_norm": 0.9432305693626404, |
| "learning_rate": 4.081070876775172e-07, |
| "loss": 1.1525, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.9786986904471964, |
| "grad_norm": 0.915924072265625, |
| "learning_rate": 3.5619574187822354e-07, |
| "loss": 1.1687, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.980197463326901, |
| "grad_norm": 0.8820152878761292, |
| "learning_rate": 3.078103823401123e-07, |
| "loss": 1.1697, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.9816962362066058, |
| "grad_norm": 0.8938290476799011, |
| "learning_rate": 2.629521491445463e-07, |
| "loss": 1.1666, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.9831950090863106, |
| "grad_norm": 0.8603911995887756, |
| "learning_rate": 2.216220992648843e-07, |
| "loss": 1.1634, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.9846937819660153, |
| "grad_norm": 0.8677682280540466, |
| "learning_rate": 1.8382120654156785e-07, |
| "loss": 1.1641, |
| "step": 6570 |
| }, |
| { |
| "epoch": 0.9861925548457201, |
| "grad_norm": 0.8882606625556946, |
| "learning_rate": 1.495503616591731e-07, |
| "loss": 1.1415, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.9876913277254248, |
| "grad_norm": 0.807101309299469, |
| "learning_rate": 1.1881037212542744e-07, |
| "loss": 1.1574, |
| "step": 6590 |
| }, |
| { |
| "epoch": 0.9891901006051296, |
| "grad_norm": 0.8506379723548889, |
| "learning_rate": 9.160196225217465e-08, |
| "loss": 1.1653, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.9906888734848343, |
| "grad_norm": 0.871025800704956, |
| "learning_rate": 6.792577313833868e-08, |
| "loss": 1.1458, |
| "step": 6610 |
| }, |
| { |
| "epoch": 0.992187646364539, |
| "grad_norm": 0.8364571928977966, |
| "learning_rate": 4.778236265475244e-08, |
| "loss": 1.1727, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.9936864192442437, |
| "grad_norm": 0.8232174515724182, |
| "learning_rate": 3.117220543110144e-08, |
| "loss": 1.1693, |
| "step": 6630 |
| }, |
| { |
| "epoch": 0.9951851921239485, |
| "grad_norm": 0.8296674489974976, |
| "learning_rate": 1.8095692844649625e-08, |
| "loss": 1.1582, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.9966839650036533, |
| "grad_norm": 0.8697881698608398, |
| "learning_rate": 8.553133011113267e-09, |
| "loss": 1.161, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.998182737883358, |
| "grad_norm": 0.9544343948364258, |
| "learning_rate": 2.544750777316862e-09, |
| "loss": 1.1713, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.9996815107630628, |
| "grad_norm": 0.9088115096092224, |
| "learning_rate": 7.068771591400845e-11, |
| "loss": 1.1531, |
| "step": 6670 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 6672, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 668, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2428429941080064.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|