| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.999818820886328, | |
| "eval_steps": 1000, | |
| "global_step": 34495, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007247164546871037, | |
| "grad_norm": 3.8405699729919434, | |
| "learning_rate": 4.9928975213799106e-05, | |
| "loss": 1.4877, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.014494329093742073, | |
| "grad_norm": 4.489353179931641, | |
| "learning_rate": 4.9856500942165534e-05, | |
| "loss": 1.1021, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.021741493640613112, | |
| "grad_norm": 5.1422295570373535, | |
| "learning_rate": 4.978402667053196e-05, | |
| "loss": 1.0659, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.028988658187484147, | |
| "grad_norm": 4.703744411468506, | |
| "learning_rate": 4.971155239889839e-05, | |
| "loss": 1.0636, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03623582273435518, | |
| "grad_norm": 3.330397605895996, | |
| "learning_rate": 4.963907812726482e-05, | |
| "loss": 1.0025, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.043482987281226224, | |
| "grad_norm": 3.511565685272217, | |
| "learning_rate": 4.9566603855631256e-05, | |
| "loss": 0.9945, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05073015182809726, | |
| "grad_norm": 3.470792055130005, | |
| "learning_rate": 4.9494129583997684e-05, | |
| "loss": 1.0228, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.057977316374968293, | |
| "grad_norm": 3.1396453380584717, | |
| "learning_rate": 4.942165531236411e-05, | |
| "loss": 0.9935, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06522448092183933, | |
| "grad_norm": 3.4228012561798096, | |
| "learning_rate": 4.934918104073054e-05, | |
| "loss": 0.9904, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.07247164546871036, | |
| "grad_norm": 3.423574447631836, | |
| "learning_rate": 4.927670676909698e-05, | |
| "loss": 0.9884, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0797188100155814, | |
| "grad_norm": 2.9020352363586426, | |
| "learning_rate": 4.9204232497463405e-05, | |
| "loss": 0.9093, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.08696597456245245, | |
| "grad_norm": 3.0173707008361816, | |
| "learning_rate": 4.9131758225829834e-05, | |
| "loss": 0.9374, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09421313910932348, | |
| "grad_norm": 3.4516496658325195, | |
| "learning_rate": 4.905928395419626e-05, | |
| "loss": 0.9798, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.10146030365619452, | |
| "grad_norm": 2.8202695846557617, | |
| "learning_rate": 4.898680968256269e-05, | |
| "loss": 0.9293, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.10870746820306555, | |
| "grad_norm": 3.3415849208831787, | |
| "learning_rate": 4.8914335410929126e-05, | |
| "loss": 0.9622, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.11595463274993659, | |
| "grad_norm": 3.4417173862457275, | |
| "learning_rate": 4.8841861139295555e-05, | |
| "loss": 0.9363, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12320179729680762, | |
| "grad_norm": 3.051495313644409, | |
| "learning_rate": 4.8769386867661983e-05, | |
| "loss": 0.9086, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.13044896184367866, | |
| "grad_norm": 2.5990054607391357, | |
| "learning_rate": 4.869691259602841e-05, | |
| "loss": 0.8976, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1376961263905497, | |
| "grad_norm": 3.3123867511749268, | |
| "learning_rate": 4.862443832439485e-05, | |
| "loss": 0.9568, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.14494329093742073, | |
| "grad_norm": 2.7175092697143555, | |
| "learning_rate": 4.8551964052761276e-05, | |
| "loss": 0.8665, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.14494329093742073, | |
| "eval_loss": 0.855056643486023, | |
| "eval_runtime": 245.3157, | |
| "eval_samples_per_second": 140.615, | |
| "eval_steps_per_second": 14.064, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.15219045548429178, | |
| "grad_norm": 2.541044235229492, | |
| "learning_rate": 4.848093926656037e-05, | |
| "loss": 0.887, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.1594376200311628, | |
| "grad_norm": 3.0265707969665527, | |
| "learning_rate": 4.840846499492681e-05, | |
| "loss": 0.8987, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.16668478457803385, | |
| "grad_norm": 3.479360818862915, | |
| "learning_rate": 4.8335990723293237e-05, | |
| "loss": 0.8744, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.1739319491249049, | |
| "grad_norm": 3.381410598754883, | |
| "learning_rate": 4.8263516451659665e-05, | |
| "loss": 0.869, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.18117911367177592, | |
| "grad_norm": 3.526550769805908, | |
| "learning_rate": 4.8191042180026094e-05, | |
| "loss": 0.8753, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.18842627821864696, | |
| "grad_norm": 3.5922884941101074, | |
| "learning_rate": 4.811856790839252e-05, | |
| "loss": 0.8661, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.19567344276551799, | |
| "grad_norm": 3.390925168991089, | |
| "learning_rate": 4.804609363675896e-05, | |
| "loss": 0.865, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.20292060731238903, | |
| "grad_norm": 3.6313884258270264, | |
| "learning_rate": 4.7973619365125386e-05, | |
| "loss": 0.9077, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.21016777185926006, | |
| "grad_norm": 3.6675050258636475, | |
| "learning_rate": 4.7901145093491815e-05, | |
| "loss": 0.8724, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.2174149364061311, | |
| "grad_norm": 2.9119815826416016, | |
| "learning_rate": 4.782867082185824e-05, | |
| "loss": 0.887, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.22466210095300213, | |
| "grad_norm": 3.978513479232788, | |
| "learning_rate": 4.775619655022467e-05, | |
| "loss": 0.909, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.23190926549987317, | |
| "grad_norm": 2.6962685585021973, | |
| "learning_rate": 4.76837222785911e-05, | |
| "loss": 0.9013, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.23915643004674422, | |
| "grad_norm": 3.688558578491211, | |
| "learning_rate": 4.761124800695753e-05, | |
| "loss": 0.8777, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.24640359459361524, | |
| "grad_norm": 2.6610682010650635, | |
| "learning_rate": 4.753877373532396e-05, | |
| "loss": 0.854, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2536507591404863, | |
| "grad_norm": 2.8136966228485107, | |
| "learning_rate": 4.746629946369039e-05, | |
| "loss": 0.8523, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.2608979236873573, | |
| "grad_norm": 2.660381555557251, | |
| "learning_rate": 4.739382519205682e-05, | |
| "loss": 0.8764, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.26814508823422833, | |
| "grad_norm": 2.8120713233947754, | |
| "learning_rate": 4.732135092042325e-05, | |
| "loss": 0.8401, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2753922527810994, | |
| "grad_norm": 3.686016321182251, | |
| "learning_rate": 4.724887664878968e-05, | |
| "loss": 0.8891, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.28263941732797043, | |
| "grad_norm": 3.8460748195648193, | |
| "learning_rate": 4.717640237715611e-05, | |
| "loss": 0.8909, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.28988658187484145, | |
| "grad_norm": 3.1707077026367188, | |
| "learning_rate": 4.710392810552254e-05, | |
| "loss": 0.8564, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.28988658187484145, | |
| "eval_loss": 0.8084499835968018, | |
| "eval_runtime": 245.4746, | |
| "eval_samples_per_second": 140.524, | |
| "eval_steps_per_second": 14.054, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.29713374642171253, | |
| "grad_norm": 2.892038106918335, | |
| "learning_rate": 4.703145383388897e-05, | |
| "loss": 0.8496, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.30438091096858355, | |
| "grad_norm": 3.043569803237915, | |
| "learning_rate": 4.69589795622554e-05, | |
| "loss": 0.9013, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3116280755154546, | |
| "grad_norm": 2.6177148818969727, | |
| "learning_rate": 4.688650529062183e-05, | |
| "loss": 0.8438, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3188752400623256, | |
| "grad_norm": 3.0475521087646484, | |
| "learning_rate": 4.6814031018988264e-05, | |
| "loss": 0.8892, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.32612240460919667, | |
| "grad_norm": 3.0972824096679688, | |
| "learning_rate": 4.674155674735469e-05, | |
| "loss": 0.8019, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.3333695691560677, | |
| "grad_norm": 3.916921854019165, | |
| "learning_rate": 4.666908247572112e-05, | |
| "loss": 0.8278, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.3406167337029387, | |
| "grad_norm": 2.8206372261047363, | |
| "learning_rate": 4.659660820408755e-05, | |
| "loss": 0.8532, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3478638982498098, | |
| "grad_norm": 3.05037260055542, | |
| "learning_rate": 4.6524133932453985e-05, | |
| "loss": 0.8516, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.3551110627966808, | |
| "grad_norm": 2.940920829772949, | |
| "learning_rate": 4.645165966082041e-05, | |
| "loss": 0.8326, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.36235822734355183, | |
| "grad_norm": 2.752655267715454, | |
| "learning_rate": 4.637918538918684e-05, | |
| "loss": 0.8705, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.36960539189042285, | |
| "grad_norm": 2.369166851043701, | |
| "learning_rate": 4.630671111755327e-05, | |
| "loss": 0.8111, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.37685255643729393, | |
| "grad_norm": 2.991446018218994, | |
| "learning_rate": 4.62342368459197e-05, | |
| "loss": 0.8379, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.38409972098416495, | |
| "grad_norm": 2.6964468955993652, | |
| "learning_rate": 4.6161762574286134e-05, | |
| "loss": 0.8459, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.39134688553103597, | |
| "grad_norm": 2.8385136127471924, | |
| "learning_rate": 4.608928830265256e-05, | |
| "loss": 0.8301, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.398594050077907, | |
| "grad_norm": 2.3763439655303955, | |
| "learning_rate": 4.601681403101899e-05, | |
| "loss": 0.7899, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.40584121462477807, | |
| "grad_norm": 3.157857894897461, | |
| "learning_rate": 4.594433975938542e-05, | |
| "loss": 0.8118, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4130883791716491, | |
| "grad_norm": 3.2475242614746094, | |
| "learning_rate": 4.5871865487751855e-05, | |
| "loss": 0.8096, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.4203355437185201, | |
| "grad_norm": 2.474079132080078, | |
| "learning_rate": 4.5799391216118284e-05, | |
| "loss": 0.8164, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.4275827082653912, | |
| "grad_norm": 2.8297600746154785, | |
| "learning_rate": 4.572691694448471e-05, | |
| "loss": 0.7803, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.4348298728122622, | |
| "grad_norm": 3.616157054901123, | |
| "learning_rate": 4.565444267285114e-05, | |
| "loss": 0.7867, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.4348298728122622, | |
| "eval_loss": 0.7722809910774231, | |
| "eval_runtime": 245.4414, | |
| "eval_samples_per_second": 140.543, | |
| "eval_steps_per_second": 14.056, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.44207703735913323, | |
| "grad_norm": 2.828176736831665, | |
| "learning_rate": 4.558196840121757e-05, | |
| "loss": 0.8496, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.44932420190600425, | |
| "grad_norm": 3.1643614768981934, | |
| "learning_rate": 4.5509494129584e-05, | |
| "loss": 0.7921, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.4565713664528753, | |
| "grad_norm": 3.158106803894043, | |
| "learning_rate": 4.543701985795043e-05, | |
| "loss": 0.8128, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.46381853099974635, | |
| "grad_norm": 3.462926149368286, | |
| "learning_rate": 4.536454558631686e-05, | |
| "loss": 0.7772, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.47106569554661737, | |
| "grad_norm": 2.819894552230835, | |
| "learning_rate": 4.529207131468329e-05, | |
| "loss": 0.7911, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.47831286009348845, | |
| "grad_norm": 2.3228747844696045, | |
| "learning_rate": 4.521959704304972e-05, | |
| "loss": 0.7974, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.48556002464035947, | |
| "grad_norm": 2.4810431003570557, | |
| "learning_rate": 4.514712277141615e-05, | |
| "loss": 0.8064, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.4928071891872305, | |
| "grad_norm": 2.91050124168396, | |
| "learning_rate": 4.5074648499782577e-05, | |
| "loss": 0.8269, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5000543537341016, | |
| "grad_norm": 2.8636369705200195, | |
| "learning_rate": 4.5002174228149005e-05, | |
| "loss": 0.8139, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5073015182809726, | |
| "grad_norm": 2.4558308124542236, | |
| "learning_rate": 4.4929699956515434e-05, | |
| "loss": 0.7364, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5145486828278436, | |
| "grad_norm": 2.5896005630493164, | |
| "learning_rate": 4.485722568488187e-05, | |
| "loss": 0.7858, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5217958473747146, | |
| "grad_norm": 2.745985746383667, | |
| "learning_rate": 4.47847514132483e-05, | |
| "loss": 0.8279, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5290430119215856, | |
| "grad_norm": 3.3217363357543945, | |
| "learning_rate": 4.4712277141614726e-05, | |
| "loss": 0.7929, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5362901764684567, | |
| "grad_norm": 2.981140375137329, | |
| "learning_rate": 4.4639802869981155e-05, | |
| "loss": 0.8282, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5435373410153278, | |
| "grad_norm": 3.112213611602783, | |
| "learning_rate": 4.456732859834759e-05, | |
| "loss": 0.7823, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.5507845055621988, | |
| "grad_norm": 2.5669238567352295, | |
| "learning_rate": 4.449485432671402e-05, | |
| "loss": 0.8118, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.5580316701090698, | |
| "grad_norm": 2.7261078357696533, | |
| "learning_rate": 4.442238005508045e-05, | |
| "loss": 0.8098, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5652788346559409, | |
| "grad_norm": 2.5646941661834717, | |
| "learning_rate": 4.4349905783446876e-05, | |
| "loss": 0.8, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5725259992028119, | |
| "grad_norm": 2.627681016921997, | |
| "learning_rate": 4.427743151181331e-05, | |
| "loss": 0.7732, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.5797731637496829, | |
| "grad_norm": 2.443345069885254, | |
| "learning_rate": 4.420495724017974e-05, | |
| "loss": 0.7949, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5797731637496829, | |
| "eval_loss": 0.7400076389312744, | |
| "eval_runtime": 245.51, | |
| "eval_samples_per_second": 140.503, | |
| "eval_steps_per_second": 14.052, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5870203282965539, | |
| "grad_norm": 2.263333559036255, | |
| "learning_rate": 4.413248296854617e-05, | |
| "loss": 0.7523, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.5942674928434251, | |
| "grad_norm": 2.655808210372925, | |
| "learning_rate": 4.40600086969126e-05, | |
| "loss": 0.7489, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6015146573902961, | |
| "grad_norm": 3.2610058784484863, | |
| "learning_rate": 4.3987534425279025e-05, | |
| "loss": 0.7853, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.6087618219371671, | |
| "grad_norm": 2.82106876373291, | |
| "learning_rate": 4.391506015364546e-05, | |
| "loss": 0.7688, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6160089864840381, | |
| "grad_norm": 1.9377267360687256, | |
| "learning_rate": 4.384258588201189e-05, | |
| "loss": 0.7887, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6232561510309091, | |
| "grad_norm": 2.4087536334991455, | |
| "learning_rate": 4.377011161037832e-05, | |
| "loss": 0.8098, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6305033155777802, | |
| "grad_norm": 5.600491046905518, | |
| "learning_rate": 4.3697637338744747e-05, | |
| "loss": 0.7683, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.6377504801246512, | |
| "grad_norm": 3.0500433444976807, | |
| "learning_rate": 4.362516306711118e-05, | |
| "loss": 0.7995, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6449976446715223, | |
| "grad_norm": 1.7417362928390503, | |
| "learning_rate": 4.355268879547761e-05, | |
| "loss": 0.7822, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.6522448092183933, | |
| "grad_norm": 2.1057236194610596, | |
| "learning_rate": 4.348021452384404e-05, | |
| "loss": 0.7812, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6594919737652644, | |
| "grad_norm": 2.780358076095581, | |
| "learning_rate": 4.341063922307581e-05, | |
| "loss": 0.849, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.6667391383121354, | |
| "grad_norm": 2.8940329551696777, | |
| "learning_rate": 4.333816495144224e-05, | |
| "loss": 0.7883, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.6739863028590064, | |
| "grad_norm": 2.65120530128479, | |
| "learning_rate": 4.326569067980867e-05, | |
| "loss": 0.8011, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.6812334674058774, | |
| "grad_norm": 3.0340774059295654, | |
| "learning_rate": 4.31932164081751e-05, | |
| "loss": 0.7784, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.6884806319527484, | |
| "grad_norm": 3.227555990219116, | |
| "learning_rate": 4.312074213654153e-05, | |
| "loss": 0.7798, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.6957277964996196, | |
| "grad_norm": 2.9701457023620605, | |
| "learning_rate": 4.304826786490796e-05, | |
| "loss": 0.7623, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7029749610464906, | |
| "grad_norm": 3.183608293533325, | |
| "learning_rate": 4.297579359327439e-05, | |
| "loss": 0.7591, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.7102221255933616, | |
| "grad_norm": 3.034790277481079, | |
| "learning_rate": 4.290331932164082e-05, | |
| "loss": 0.75, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7174692901402326, | |
| "grad_norm": 2.8872621059417725, | |
| "learning_rate": 4.283084505000725e-05, | |
| "loss": 0.7672, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.7247164546871037, | |
| "grad_norm": 2.583475112915039, | |
| "learning_rate": 4.275837077837368e-05, | |
| "loss": 0.7396, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7247164546871037, | |
| "eval_loss": 0.7259831428527832, | |
| "eval_runtime": 245.9324, | |
| "eval_samples_per_second": 140.262, | |
| "eval_steps_per_second": 14.028, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7319636192339747, | |
| "grad_norm": 2.8117451667785645, | |
| "learning_rate": 4.268589650674011e-05, | |
| "loss": 0.7471, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.7392107837808457, | |
| "grad_norm": 2.556171178817749, | |
| "learning_rate": 4.261342223510654e-05, | |
| "loss": 0.734, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.7464579483277168, | |
| "grad_norm": 2.494264602661133, | |
| "learning_rate": 4.2540947963472974e-05, | |
| "loss": 0.7657, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.7537051128745879, | |
| "grad_norm": 2.5196897983551025, | |
| "learning_rate": 4.24684736918394e-05, | |
| "loss": 0.7373, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.7609522774214589, | |
| "grad_norm": 1.9920554161071777, | |
| "learning_rate": 4.239599942020583e-05, | |
| "loss": 0.7603, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.7681994419683299, | |
| "grad_norm": 3.2272536754608154, | |
| "learning_rate": 4.232352514857226e-05, | |
| "loss": 0.798, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.7754466065152009, | |
| "grad_norm": 2.447430372238159, | |
| "learning_rate": 4.225105087693869e-05, | |
| "loss": 0.7918, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.7826937710620719, | |
| "grad_norm": 2.375603199005127, | |
| "learning_rate": 4.217857660530512e-05, | |
| "loss": 0.7578, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.789940935608943, | |
| "grad_norm": 3.0463624000549316, | |
| "learning_rate": 4.210610233367155e-05, | |
| "loss": 0.749, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.797188100155814, | |
| "grad_norm": 1.8498198986053467, | |
| "learning_rate": 4.203362806203798e-05, | |
| "loss": 0.7619, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8044352647026851, | |
| "grad_norm": 3.3921091556549072, | |
| "learning_rate": 4.196115379040441e-05, | |
| "loss": 0.7655, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.8116824292495561, | |
| "grad_norm": 3.1778969764709473, | |
| "learning_rate": 4.188867951877084e-05, | |
| "loss": 0.7119, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8189295937964272, | |
| "grad_norm": 2.292695999145508, | |
| "learning_rate": 4.1816205247137266e-05, | |
| "loss": 0.7378, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.8261767583432982, | |
| "grad_norm": 2.388732671737671, | |
| "learning_rate": 4.1743730975503695e-05, | |
| "loss": 0.7376, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.8334239228901692, | |
| "grad_norm": 2.4461729526519775, | |
| "learning_rate": 4.1671256703870124e-05, | |
| "loss": 0.7656, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.8406710874370402, | |
| "grad_norm": 3.55210280418396, | |
| "learning_rate": 4.159878243223656e-05, | |
| "loss": 0.7814, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.8479182519839112, | |
| "grad_norm": 2.640709400177002, | |
| "learning_rate": 4.152630816060299e-05, | |
| "loss": 0.7541, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.8551654165307824, | |
| "grad_norm": 2.837186336517334, | |
| "learning_rate": 4.1453833888969416e-05, | |
| "loss": 0.7492, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.8624125810776534, | |
| "grad_norm": 3.748387336730957, | |
| "learning_rate": 4.1381359617335845e-05, | |
| "loss": 0.7317, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.8696597456245244, | |
| "grad_norm": 2.7219724655151367, | |
| "learning_rate": 4.130888534570227e-05, | |
| "loss": 0.7546, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8696597456245244, | |
| "eval_loss": 0.7048457860946655, | |
| "eval_runtime": 245.6596, | |
| "eval_samples_per_second": 140.418, | |
| "eval_steps_per_second": 14.044, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8769069101713954, | |
| "grad_norm": 3.017869234085083, | |
| "learning_rate": 4.123641107406871e-05, | |
| "loss": 0.7354, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.8841540747182665, | |
| "grad_norm": 2.677828550338745, | |
| "learning_rate": 4.116393680243514e-05, | |
| "loss": 0.746, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.8914012392651375, | |
| "grad_norm": 2.0534462928771973, | |
| "learning_rate": 4.1091462530801566e-05, | |
| "loss": 0.7614, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.8986484038120085, | |
| "grad_norm": 3.0294041633605957, | |
| "learning_rate": 4.1018988259167994e-05, | |
| "loss": 0.7256, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9058955683588796, | |
| "grad_norm": 2.640902042388916, | |
| "learning_rate": 4.094651398753443e-05, | |
| "loss": 0.7276, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.9131427329057507, | |
| "grad_norm": 2.773237466812134, | |
| "learning_rate": 4.087403971590086e-05, | |
| "loss": 0.7404, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9203898974526217, | |
| "grad_norm": 2.8102800846099854, | |
| "learning_rate": 4.080156544426729e-05, | |
| "loss": 0.7631, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.9276370619994927, | |
| "grad_norm": 2.767054796218872, | |
| "learning_rate": 4.0729091172633715e-05, | |
| "loss": 0.764, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.9348842265463637, | |
| "grad_norm": 2.9610891342163086, | |
| "learning_rate": 4.065661690100015e-05, | |
| "loss": 0.7536, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.9421313910932347, | |
| "grad_norm": 3.0401785373687744, | |
| "learning_rate": 4.058414262936658e-05, | |
| "loss": 0.7824, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9493785556401058, | |
| "grad_norm": 2.3925185203552246, | |
| "learning_rate": 4.051166835773301e-05, | |
| "loss": 0.7277, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.9566257201869769, | |
| "grad_norm": 2.707064390182495, | |
| "learning_rate": 4.0439194086099436e-05, | |
| "loss": 0.7133, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.9638728847338479, | |
| "grad_norm": 2.639535665512085, | |
| "learning_rate": 4.0366719814465865e-05, | |
| "loss": 0.7198, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.9711200492807189, | |
| "grad_norm": 2.1093649864196777, | |
| "learning_rate": 4.02942455428323e-05, | |
| "loss": 0.7417, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.97836721382759, | |
| "grad_norm": 4.515178203582764, | |
| "learning_rate": 4.022177127119873e-05, | |
| "loss": 0.7299, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.985614378374461, | |
| "grad_norm": 2.626970052719116, | |
| "learning_rate": 4.014929699956516e-05, | |
| "loss": 0.741, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.992861542921332, | |
| "grad_norm": 3.5896389484405518, | |
| "learning_rate": 4.0076822727931586e-05, | |
| "loss": 0.7568, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.0001087074682031, | |
| "grad_norm": 5.62574577331543, | |
| "learning_rate": 4.000434845629802e-05, | |
| "loss": 0.7374, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.0073558720150741, | |
| "grad_norm": 3.0508015155792236, | |
| "learning_rate": 3.993187418466445e-05, | |
| "loss": 0.5983, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.0146030365619452, | |
| "grad_norm": 1.7448738813400269, | |
| "learning_rate": 3.985939991303088e-05, | |
| "loss": 0.5915, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0146030365619452, | |
| "eval_loss": 0.696858823299408, | |
| "eval_runtime": 245.4484, | |
| "eval_samples_per_second": 140.539, | |
| "eval_steps_per_second": 14.056, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0218502011088162, | |
| "grad_norm": 3.0310137271881104, | |
| "learning_rate": 3.978692564139731e-05, | |
| "loss": 0.5752, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.0290973656556872, | |
| "grad_norm": 2.422286033630371, | |
| "learning_rate": 3.9714451369763736e-05, | |
| "loss": 0.6085, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.0363445302025582, | |
| "grad_norm": 2.385861396789551, | |
| "learning_rate": 3.9641977098130164e-05, | |
| "loss": 0.5922, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.0435916947494293, | |
| "grad_norm": 2.8933980464935303, | |
| "learning_rate": 3.956950282649659e-05, | |
| "loss": 0.6146, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.0508388592963003, | |
| "grad_norm": 2.2987074851989746, | |
| "learning_rate": 3.949702855486303e-05, | |
| "loss": 0.6061, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.0580860238431713, | |
| "grad_norm": 2.174790859222412, | |
| "learning_rate": 3.942455428322946e-05, | |
| "loss": 0.6033, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.0653331883900423, | |
| "grad_norm": 2.604837417602539, | |
| "learning_rate": 3.9352080011595885e-05, | |
| "loss": 0.6027, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.0725803529369133, | |
| "grad_norm": 3.116417646408081, | |
| "learning_rate": 3.9279605739962314e-05, | |
| "loss": 0.5783, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.0798275174837846, | |
| "grad_norm": 2.4891812801361084, | |
| "learning_rate": 3.920713146832874e-05, | |
| "loss": 0.6108, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.0870746820306556, | |
| "grad_norm": 2.293124198913574, | |
| "learning_rate": 3.913465719669517e-05, | |
| "loss": 0.5853, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.0943218465775266, | |
| "grad_norm": 2.6642067432403564, | |
| "learning_rate": 3.90621829250616e-05, | |
| "loss": 0.569, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.1015690111243976, | |
| "grad_norm": 2.5504302978515625, | |
| "learning_rate": 3.8989708653428035e-05, | |
| "loss": 0.6019, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.1088161756712687, | |
| "grad_norm": 2.672874927520752, | |
| "learning_rate": 3.8917234381794463e-05, | |
| "loss": 0.5799, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.1160633402181397, | |
| "grad_norm": 2.6319384574890137, | |
| "learning_rate": 3.884476011016089e-05, | |
| "loss": 0.6357, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.1233105047650107, | |
| "grad_norm": 3.3530545234680176, | |
| "learning_rate": 3.877228583852732e-05, | |
| "loss": 0.5737, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.1305576693118817, | |
| "grad_norm": 2.6204335689544678, | |
| "learning_rate": 3.8699811566893756e-05, | |
| "loss": 0.6139, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.1378048338587528, | |
| "grad_norm": 2.570143938064575, | |
| "learning_rate": 3.8627337295260185e-05, | |
| "loss": 0.6038, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.1450519984056238, | |
| "grad_norm": 2.826798439025879, | |
| "learning_rate": 3.855486302362661e-05, | |
| "loss": 0.5985, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.1522991629524948, | |
| "grad_norm": 1.774380087852478, | |
| "learning_rate": 3.848238875199304e-05, | |
| "loss": 0.6042, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.1595463274993658, | |
| "grad_norm": 2.9775238037109375, | |
| "learning_rate": 3.840991448035948e-05, | |
| "loss": 0.5887, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.1595463274993658, | |
| "eval_loss": 0.69095379114151, | |
| "eval_runtime": 245.6192, | |
| "eval_samples_per_second": 140.441, | |
| "eval_steps_per_second": 14.046, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.1667934920462368, | |
| "grad_norm": 2.4235680103302, | |
| "learning_rate": 3.8337440208725906e-05, | |
| "loss": 0.6397, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.1740406565931079, | |
| "grad_norm": 2.372230291366577, | |
| "learning_rate": 3.8264965937092334e-05, | |
| "loss": 0.6083, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.1812878211399789, | |
| "grad_norm": 2.4329166412353516, | |
| "learning_rate": 3.819249166545876e-05, | |
| "loss": 0.5956, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.1885349856868501, | |
| "grad_norm": 3.0749635696411133, | |
| "learning_rate": 3.812001739382519e-05, | |
| "loss": 0.6162, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.1957821502337211, | |
| "grad_norm": 2.3049559593200684, | |
| "learning_rate": 3.804754312219163e-05, | |
| "loss": 0.593, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.2030293147805922, | |
| "grad_norm": 2.3846399784088135, | |
| "learning_rate": 3.7975068850558055e-05, | |
| "loss": 0.6, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.2102764793274632, | |
| "grad_norm": 2.4246721267700195, | |
| "learning_rate": 3.7902594578924484e-05, | |
| "loss": 0.6299, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.2175236438743342, | |
| "grad_norm": 3.0294981002807617, | |
| "learning_rate": 3.783012030729091e-05, | |
| "loss": 0.5957, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.2247708084212052, | |
| "grad_norm": 2.184633255004883, | |
| "learning_rate": 3.775764603565735e-05, | |
| "loss": 0.5781, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.2320179729680762, | |
| "grad_norm": 2.423351287841797, | |
| "learning_rate": 3.7685171764023776e-05, | |
| "loss": 0.571, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.2392651375149473, | |
| "grad_norm": 3.334449529647827, | |
| "learning_rate": 3.7612697492390205e-05, | |
| "loss": 0.579, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.2465123020618183, | |
| "grad_norm": 2.2588014602661133, | |
| "learning_rate": 3.7540223220756633e-05, | |
| "loss": 0.5874, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.2537594666086893, | |
| "grad_norm": 3.5975542068481445, | |
| "learning_rate": 3.746774894912306e-05, | |
| "loss": 0.5633, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.2610066311555603, | |
| "grad_norm": 2.7754578590393066, | |
| "learning_rate": 3.73952746774895e-05, | |
| "loss": 0.6276, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.2682537957024314, | |
| "grad_norm": 2.410757064819336, | |
| "learning_rate": 3.7322800405855926e-05, | |
| "loss": 0.6118, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.2755009602493024, | |
| "grad_norm": 2.5323429107666016, | |
| "learning_rate": 3.7250326134222355e-05, | |
| "loss": 0.5945, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.2827481247961736, | |
| "grad_norm": 2.5905425548553467, | |
| "learning_rate": 3.717785186258878e-05, | |
| "loss": 0.5903, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.2899952893430444, | |
| "grad_norm": 2.042097568511963, | |
| "learning_rate": 3.710537759095521e-05, | |
| "loss": 0.5997, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.2972424538899157, | |
| "grad_norm": 2.1564393043518066, | |
| "learning_rate": 3.703290331932164e-05, | |
| "loss": 0.5697, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.3044896184367867, | |
| "grad_norm": 2.8595190048217773, | |
| "learning_rate": 3.696042904768807e-05, | |
| "loss": 0.6026, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.3044896184367867, | |
| "eval_loss": 0.6820585131645203, | |
| "eval_runtime": 245.5719, | |
| "eval_samples_per_second": 140.468, | |
| "eval_steps_per_second": 14.049, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.3117367829836577, | |
| "grad_norm": 2.401057243347168, | |
| "learning_rate": 3.6887954776054504e-05, | |
| "loss": 0.615, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.3189839475305287, | |
| "grad_norm": 2.1161983013153076, | |
| "learning_rate": 3.681548050442093e-05, | |
| "loss": 0.6468, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.3262311120773997, | |
| "grad_norm": 2.9642062187194824, | |
| "learning_rate": 3.674300623278736e-05, | |
| "loss": 0.5908, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.3334782766242708, | |
| "grad_norm": 2.200223445892334, | |
| "learning_rate": 3.667053196115379e-05, | |
| "loss": 0.5493, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.3407254411711418, | |
| "grad_norm": 2.8368406295776367, | |
| "learning_rate": 3.659805768952022e-05, | |
| "loss": 0.6157, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.3479726057180128, | |
| "grad_norm": 2.500457286834717, | |
| "learning_rate": 3.652558341788665e-05, | |
| "loss": 0.6217, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.3552197702648838, | |
| "grad_norm": 2.435392379760742, | |
| "learning_rate": 3.645310914625308e-05, | |
| "loss": 0.5988, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.3624669348117548, | |
| "grad_norm": 2.5148136615753174, | |
| "learning_rate": 3.638063487461951e-05, | |
| "loss": 0.6043, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.3697140993586259, | |
| "grad_norm": 2.5917887687683105, | |
| "learning_rate": 3.630816060298594e-05, | |
| "loss": 0.6048, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.3769612639054971, | |
| "grad_norm": 3.429553747177124, | |
| "learning_rate": 3.623568633135237e-05, | |
| "loss": 0.5958, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.384208428452368, | |
| "grad_norm": 3.326967239379883, | |
| "learning_rate": 3.6163212059718803e-05, | |
| "loss": 0.5629, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.3914555929992392, | |
| "grad_norm": 2.0327517986297607, | |
| "learning_rate": 3.609073778808523e-05, | |
| "loss": 0.6238, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.39870275754611, | |
| "grad_norm": 2.4792556762695312, | |
| "learning_rate": 3.601826351645166e-05, | |
| "loss": 0.5662, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.4059499220929812, | |
| "grad_norm": 2.692080497741699, | |
| "learning_rate": 3.594578924481809e-05, | |
| "loss": 0.6257, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.4131970866398522, | |
| "grad_norm": 3.3779165744781494, | |
| "learning_rate": 3.587331497318452e-05, | |
| "loss": 0.5654, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.4204442511867232, | |
| "grad_norm": 2.2963967323303223, | |
| "learning_rate": 3.580084070155095e-05, | |
| "loss": 0.6, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.4276914157335943, | |
| "grad_norm": 2.7266321182250977, | |
| "learning_rate": 3.572836642991738e-05, | |
| "loss": 0.6333, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.4349385802804653, | |
| "grad_norm": 1.9801080226898193, | |
| "learning_rate": 3.565589215828381e-05, | |
| "loss": 0.583, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.4421857448273363, | |
| "grad_norm": 2.2797017097473145, | |
| "learning_rate": 3.558341788665024e-05, | |
| "loss": 0.5918, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.4494329093742073, | |
| "grad_norm": 2.325143575668335, | |
| "learning_rate": 3.5510943615016674e-05, | |
| "loss": 0.601, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.4494329093742073, | |
| "eval_loss": 0.6719304919242859, | |
| "eval_runtime": 245.6786, | |
| "eval_samples_per_second": 140.407, | |
| "eval_steps_per_second": 14.043, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.4566800739210783, | |
| "grad_norm": 2.4414122104644775, | |
| "learning_rate": 3.54384693433831e-05, | |
| "loss": 0.5557, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.4639272384679494, | |
| "grad_norm": 3.4254276752471924, | |
| "learning_rate": 3.536599507174953e-05, | |
| "loss": 0.5961, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.4711744030148204, | |
| "grad_norm": 1.9034006595611572, | |
| "learning_rate": 3.529352080011596e-05, | |
| "loss": 0.5716, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.4784215675616914, | |
| "grad_norm": 1.9632625579833984, | |
| "learning_rate": 3.5221046528482395e-05, | |
| "loss": 0.6656, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.4856687321085627, | |
| "grad_norm": 2.634131669998169, | |
| "learning_rate": 3.5148572256848824e-05, | |
| "loss": 0.605, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.4929158966554334, | |
| "grad_norm": 2.171637535095215, | |
| "learning_rate": 3.507609798521525e-05, | |
| "loss": 0.5942, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.5001630612023047, | |
| "grad_norm": 2.2301058769226074, | |
| "learning_rate": 3.500362371358168e-05, | |
| "loss": 0.5776, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.5074102257491755, | |
| "grad_norm": 2.1996707916259766, | |
| "learning_rate": 3.493114944194811e-05, | |
| "loss": 0.6082, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.5146573902960467, | |
| "grad_norm": 2.6578609943389893, | |
| "learning_rate": 3.4858675170314545e-05, | |
| "loss": 0.637, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.5219045548429178, | |
| "grad_norm": 2.902642011642456, | |
| "learning_rate": 3.4786200898680973e-05, | |
| "loss": 0.5898, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.5291517193897888, | |
| "grad_norm": 2.331738233566284, | |
| "learning_rate": 3.47137266270474e-05, | |
| "loss": 0.6314, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.5363988839366598, | |
| "grad_norm": 2.1168956756591797, | |
| "learning_rate": 3.464125235541383e-05, | |
| "loss": 0.6306, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.5436460484835308, | |
| "grad_norm": 3.4085164070129395, | |
| "learning_rate": 3.456877808378026e-05, | |
| "loss": 0.6065, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.5508932130304018, | |
| "grad_norm": 3.533252000808716, | |
| "learning_rate": 3.449630381214669e-05, | |
| "loss": 0.5818, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.5581403775772729, | |
| "grad_norm": 2.3345284461975098, | |
| "learning_rate": 3.4423829540513116e-05, | |
| "loss": 0.5762, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.5653875421241439, | |
| "grad_norm": 2.5388126373291016, | |
| "learning_rate": 3.4351355268879545e-05, | |
| "loss": 0.5967, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.572634706671015, | |
| "grad_norm": 2.583822250366211, | |
| "learning_rate": 3.4278880997245973e-05, | |
| "loss": 0.6388, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.5798818712178861, | |
| "grad_norm": 2.8442914485931396, | |
| "learning_rate": 3.420640672561241e-05, | |
| "loss": 0.6049, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.587129035764757, | |
| "grad_norm": 2.080157518386841, | |
| "learning_rate": 3.413393245397884e-05, | |
| "loss": 0.5681, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.5943762003116282, | |
| "grad_norm": 3.178879976272583, | |
| "learning_rate": 3.4061458182345266e-05, | |
| "loss": 0.5877, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.5943762003116282, | |
| "eval_loss": 0.6599454283714294, | |
| "eval_runtime": 245.7578, | |
| "eval_samples_per_second": 140.362, | |
| "eval_steps_per_second": 14.038, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.601623364858499, | |
| "grad_norm": 2.6149051189422607, | |
| "learning_rate": 3.3988983910711694e-05, | |
| "loss": 0.5706, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.6088705294053702, | |
| "grad_norm": 2.2884621620178223, | |
| "learning_rate": 3.391650963907813e-05, | |
| "loss": 0.5929, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.616117693952241, | |
| "grad_norm": 2.5482795238494873, | |
| "learning_rate": 3.384403536744456e-05, | |
| "loss": 0.5772, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.6233648584991123, | |
| "grad_norm": 2.499694347381592, | |
| "learning_rate": 3.377156109581099e-05, | |
| "loss": 0.589, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.6306120230459833, | |
| "grad_norm": 3.6115667819976807, | |
| "learning_rate": 3.3699086824177416e-05, | |
| "loss": 0.5824, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.6378591875928543, | |
| "grad_norm": 3.2503881454467773, | |
| "learning_rate": 3.3626612552543844e-05, | |
| "loss": 0.5768, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.6451063521397253, | |
| "grad_norm": 2.821199893951416, | |
| "learning_rate": 3.355413828091028e-05, | |
| "loss": 0.5685, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.6523535166865964, | |
| "grad_norm": 2.602804660797119, | |
| "learning_rate": 3.348166400927671e-05, | |
| "loss": 0.595, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.6596006812334674, | |
| "grad_norm": 3.2898738384246826, | |
| "learning_rate": 3.340918973764314e-05, | |
| "loss": 0.5737, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.6668478457803384, | |
| "grad_norm": 3.5453619956970215, | |
| "learning_rate": 3.3336715466009565e-05, | |
| "loss": 0.6154, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.6740950103272096, | |
| "grad_norm": 2.276144027709961, | |
| "learning_rate": 3.3264241194376e-05, | |
| "loss": 0.5962, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.6813421748740804, | |
| "grad_norm": 2.5161609649658203, | |
| "learning_rate": 3.319176692274243e-05, | |
| "loss": 0.5486, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.6885893394209517, | |
| "grad_norm": 2.3692619800567627, | |
| "learning_rate": 3.311929265110886e-05, | |
| "loss": 0.5959, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.6958365039678225, | |
| "grad_norm": 2.2731165885925293, | |
| "learning_rate": 3.3046818379475286e-05, | |
| "loss": 0.5484, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.7030836685146937, | |
| "grad_norm": 2.9670755863189697, | |
| "learning_rate": 3.297434410784172e-05, | |
| "loss": 0.5648, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.7103308330615645, | |
| "grad_norm": 2.7299599647521973, | |
| "learning_rate": 3.290186983620815e-05, | |
| "loss": 0.6107, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.7175779976084358, | |
| "grad_norm": 2.5043752193450928, | |
| "learning_rate": 3.282939556457458e-05, | |
| "loss": 0.5998, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 1.7248251621553068, | |
| "grad_norm": 2.849438428878784, | |
| "learning_rate": 3.275692129294101e-05, | |
| "loss": 0.5729, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.7320723267021778, | |
| "grad_norm": 2.4421026706695557, | |
| "learning_rate": 3.2684447021307436e-05, | |
| "loss": 0.5821, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 1.7393194912490488, | |
| "grad_norm": 2.8539223670959473, | |
| "learning_rate": 3.261197274967387e-05, | |
| "loss": 0.566, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7393194912490488, | |
| "eval_loss": 0.6502553820610046, | |
| "eval_runtime": 247.5286, | |
| "eval_samples_per_second": 139.358, | |
| "eval_steps_per_second": 13.938, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7465666557959199, | |
| "grad_norm": 2.71627140045166, | |
| "learning_rate": 3.25394984780403e-05, | |
| "loss": 0.6029, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 1.7538138203427909, | |
| "grad_norm": 2.3385589122772217, | |
| "learning_rate": 3.246702420640673e-05, | |
| "loss": 0.5514, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.761060984889662, | |
| "grad_norm": 2.6175739765167236, | |
| "learning_rate": 3.239454993477316e-05, | |
| "loss": 0.6304, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 1.768308149436533, | |
| "grad_norm": 2.545201301574707, | |
| "learning_rate": 3.232207566313959e-05, | |
| "loss": 0.5833, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.775555313983404, | |
| "grad_norm": 3.0050761699676514, | |
| "learning_rate": 3.225105087693869e-05, | |
| "loss": 0.5914, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 1.7828024785302752, | |
| "grad_norm": 2.7314724922180176, | |
| "learning_rate": 3.217857660530512e-05, | |
| "loss": 0.5652, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.790049643077146, | |
| "grad_norm": 2.4256742000579834, | |
| "learning_rate": 3.210610233367155e-05, | |
| "loss": 0.5891, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 1.7972968076240172, | |
| "grad_norm": 3.057068109512329, | |
| "learning_rate": 3.203362806203798e-05, | |
| "loss": 0.5873, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.804543972170888, | |
| "grad_norm": 2.393101215362549, | |
| "learning_rate": 3.196115379040441e-05, | |
| "loss": 0.5735, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 1.8117911367177593, | |
| "grad_norm": 2.644343614578247, | |
| "learning_rate": 3.188867951877084e-05, | |
| "loss": 0.569, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.81903830126463, | |
| "grad_norm": 2.401324987411499, | |
| "learning_rate": 3.181620524713727e-05, | |
| "loss": 0.5358, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 1.8262854658115013, | |
| "grad_norm": 2.5138838291168213, | |
| "learning_rate": 3.17437309755037e-05, | |
| "loss": 0.5348, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.8335326303583723, | |
| "grad_norm": 1.8828959465026855, | |
| "learning_rate": 3.167125670387013e-05, | |
| "loss": 0.5899, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 1.8407797949052433, | |
| "grad_norm": 2.3968441486358643, | |
| "learning_rate": 3.159878243223656e-05, | |
| "loss": 0.5385, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.8480269594521144, | |
| "grad_norm": 2.0079128742218018, | |
| "learning_rate": 3.152630816060299e-05, | |
| "loss": 0.5341, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 1.8552741239989854, | |
| "grad_norm": 1.722406029701233, | |
| "learning_rate": 3.145383388896942e-05, | |
| "loss": 0.5592, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.8625212885458564, | |
| "grad_norm": 2.9170126914978027, | |
| "learning_rate": 3.1381359617335845e-05, | |
| "loss": 0.5637, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 1.8697684530927274, | |
| "grad_norm": 3.159693479537964, | |
| "learning_rate": 3.1308885345702274e-05, | |
| "loss": 0.6119, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.8770156176395985, | |
| "grad_norm": 1.9590739011764526, | |
| "learning_rate": 3.12364110740687e-05, | |
| "loss": 0.6167, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 1.8842627821864695, | |
| "grad_norm": 2.080077886581421, | |
| "learning_rate": 3.116393680243514e-05, | |
| "loss": 0.5713, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.8842627821864695, | |
| "eval_loss": 0.6442924737930298, | |
| "eval_runtime": 246.2742, | |
| "eval_samples_per_second": 140.067, | |
| "eval_steps_per_second": 14.009, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.8915099467333407, | |
| "grad_norm": 3.022768974304199, | |
| "learning_rate": 3.1091462530801567e-05, | |
| "loss": 0.5889, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 1.8987571112802115, | |
| "grad_norm": 2.6845195293426514, | |
| "learning_rate": 3.1018988259167995e-05, | |
| "loss": 0.5536, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.9060042758270828, | |
| "grad_norm": 3.178252696990967, | |
| "learning_rate": 3.0946513987534424e-05, | |
| "loss": 0.6397, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.9132514403739536, | |
| "grad_norm": 3.4886860847473145, | |
| "learning_rate": 3.087403971590085e-05, | |
| "loss": 0.5945, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.9204986049208248, | |
| "grad_norm": 2.4418983459472656, | |
| "learning_rate": 3.080156544426729e-05, | |
| "loss": 0.6039, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.9277457694676956, | |
| "grad_norm": 2.7599799633026123, | |
| "learning_rate": 3.0729091172633716e-05, | |
| "loss": 0.5496, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.9349929340145668, | |
| "grad_norm": 2.276477098464966, | |
| "learning_rate": 3.0656616901000145e-05, | |
| "loss": 0.5836, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.9422400985614379, | |
| "grad_norm": 2.3959388732910156, | |
| "learning_rate": 3.058414262936657e-05, | |
| "loss": 0.603, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.9494872631083089, | |
| "grad_norm": 2.58145809173584, | |
| "learning_rate": 3.051166835773301e-05, | |
| "loss": 0.5671, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.95673442765518, | |
| "grad_norm": 1.9884346723556519, | |
| "learning_rate": 3.0439194086099437e-05, | |
| "loss": 0.5766, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.963981592202051, | |
| "grad_norm": 2.8933162689208984, | |
| "learning_rate": 3.0366719814465866e-05, | |
| "loss": 0.5529, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.971228756748922, | |
| "grad_norm": 3.837768316268921, | |
| "learning_rate": 3.0294245542832294e-05, | |
| "loss": 0.5698, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.978475921295793, | |
| "grad_norm": 5.475804805755615, | |
| "learning_rate": 3.0221771271198723e-05, | |
| "loss": 0.5887, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.9857230858426642, | |
| "grad_norm": 2.9813613891601562, | |
| "learning_rate": 3.014929699956516e-05, | |
| "loss": 0.5685, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.992970250389535, | |
| "grad_norm": 2.6293349266052246, | |
| "learning_rate": 3.0076822727931587e-05, | |
| "loss": 0.5773, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 2.0002174149364063, | |
| "grad_norm": 2.593838691711426, | |
| "learning_rate": 3.0004348456298015e-05, | |
| "loss": 0.5628, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.007464579483277, | |
| "grad_norm": 1.6289088726043701, | |
| "learning_rate": 2.9931874184664444e-05, | |
| "loss": 0.4418, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 2.0147117440301483, | |
| "grad_norm": 2.8888840675354004, | |
| "learning_rate": 2.985939991303088e-05, | |
| "loss": 0.4652, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.021958908577019, | |
| "grad_norm": 2.572951316833496, | |
| "learning_rate": 2.9786925641397308e-05, | |
| "loss": 0.4685, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 2.0292060731238903, | |
| "grad_norm": 3.1881916522979736, | |
| "learning_rate": 2.9714451369763737e-05, | |
| "loss": 0.4583, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.0292060731238903, | |
| "eval_loss": 0.6516901254653931, | |
| "eval_runtime": 247.3517, | |
| "eval_samples_per_second": 139.457, | |
| "eval_steps_per_second": 13.948, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.036453237670761, | |
| "grad_norm": 2.9465830326080322, | |
| "learning_rate": 2.9641977098130165e-05, | |
| "loss": 0.4494, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 2.0437004022176324, | |
| "grad_norm": 1.7231141328811646, | |
| "learning_rate": 2.9569502826496597e-05, | |
| "loss": 0.4183, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.050947566764503, | |
| "grad_norm": 3.4768612384796143, | |
| "learning_rate": 2.9497028554863026e-05, | |
| "loss": 0.4223, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 2.0581947313113744, | |
| "grad_norm": 2.902104139328003, | |
| "learning_rate": 2.9424554283229454e-05, | |
| "loss": 0.4569, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.0654418958582457, | |
| "grad_norm": 2.661472797393799, | |
| "learning_rate": 2.9352080011595883e-05, | |
| "loss": 0.4551, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 2.0726890604051165, | |
| "grad_norm": 2.9054114818573, | |
| "learning_rate": 2.927960573996231e-05, | |
| "loss": 0.4279, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.0799362249519877, | |
| "grad_norm": 2.9954373836517334, | |
| "learning_rate": 2.9207131468328747e-05, | |
| "loss": 0.4291, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 2.0871833894988585, | |
| "grad_norm": 2.9396731853485107, | |
| "learning_rate": 2.9134657196695175e-05, | |
| "loss": 0.4392, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.0944305540457298, | |
| "grad_norm": 2.4928243160247803, | |
| "learning_rate": 2.9062182925061604e-05, | |
| "loss": 0.402, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 2.1016777185926006, | |
| "grad_norm": 3.2848997116088867, | |
| "learning_rate": 2.8989708653428032e-05, | |
| "loss": 0.4189, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.108924883139472, | |
| "grad_norm": 3.1870994567871094, | |
| "learning_rate": 2.8917234381794468e-05, | |
| "loss": 0.4421, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 2.1161720476863426, | |
| "grad_norm": 2.7032647132873535, | |
| "learning_rate": 2.8844760110160896e-05, | |
| "loss": 0.4415, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.123419212233214, | |
| "grad_norm": 3.0945403575897217, | |
| "learning_rate": 2.8772285838527325e-05, | |
| "loss": 0.4515, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 2.1306663767800846, | |
| "grad_norm": 2.7250170707702637, | |
| "learning_rate": 2.8699811566893753e-05, | |
| "loss": 0.4374, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.137913541326956, | |
| "grad_norm": 3.7444469928741455, | |
| "learning_rate": 2.8627337295260182e-05, | |
| "loss": 0.4738, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 2.1451607058738267, | |
| "grad_norm": 3.042750597000122, | |
| "learning_rate": 2.8554863023626614e-05, | |
| "loss": 0.444, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.152407870420698, | |
| "grad_norm": 2.8017637729644775, | |
| "learning_rate": 2.8482388751993046e-05, | |
| "loss": 0.4425, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 2.159655034967569, | |
| "grad_norm": 2.268974781036377, | |
| "learning_rate": 2.8409914480359475e-05, | |
| "loss": 0.4594, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.16690219951444, | |
| "grad_norm": 2.553354263305664, | |
| "learning_rate": 2.8337440208725903e-05, | |
| "loss": 0.451, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 2.174149364061311, | |
| "grad_norm": 3.928313970565796, | |
| "learning_rate": 2.8264965937092335e-05, | |
| "loss": 0.4609, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.174149364061311, | |
| "eval_loss": 0.6510897278785706, | |
| "eval_runtime": 245.6483, | |
| "eval_samples_per_second": 140.424, | |
| "eval_steps_per_second": 14.044, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.181396528608182, | |
| "grad_norm": 2.9727916717529297, | |
| "learning_rate": 2.8192491665458764e-05, | |
| "loss": 0.445, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 2.1886436931550532, | |
| "grad_norm": 2.9330101013183594, | |
| "learning_rate": 2.8120017393825192e-05, | |
| "loss": 0.4395, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.195890857701924, | |
| "grad_norm": 2.5062899589538574, | |
| "learning_rate": 2.804754312219162e-05, | |
| "loss": 0.4293, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 2.2031380222487953, | |
| "grad_norm": 3.488398313522339, | |
| "learning_rate": 2.797506885055805e-05, | |
| "loss": 0.4382, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.210385186795666, | |
| "grad_norm": 2.6354928016662598, | |
| "learning_rate": 2.7902594578924485e-05, | |
| "loss": 0.4348, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 2.2176323513425373, | |
| "grad_norm": 2.0676591396331787, | |
| "learning_rate": 2.7830120307290913e-05, | |
| "loss": 0.4506, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.224879515889408, | |
| "grad_norm": 3.5719449520111084, | |
| "learning_rate": 2.7757646035657342e-05, | |
| "loss": 0.4718, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 2.2321266804362794, | |
| "grad_norm": 2.755336284637451, | |
| "learning_rate": 2.768517176402377e-05, | |
| "loss": 0.4934, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.23937384498315, | |
| "grad_norm": 2.0585694313049316, | |
| "learning_rate": 2.7612697492390206e-05, | |
| "loss": 0.4199, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 2.2466210095300214, | |
| "grad_norm": 3.553250551223755, | |
| "learning_rate": 2.7540223220756634e-05, | |
| "loss": 0.4414, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.253868174076892, | |
| "grad_norm": 2.6371939182281494, | |
| "learning_rate": 2.7467748949123063e-05, | |
| "loss": 0.4238, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 2.2611153386237635, | |
| "grad_norm": 2.935482978820801, | |
| "learning_rate": 2.739527467748949e-05, | |
| "loss": 0.4421, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.2683625031706347, | |
| "grad_norm": 2.8056271076202393, | |
| "learning_rate": 2.7322800405855923e-05, | |
| "loss": 0.4838, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 2.2756096677175055, | |
| "grad_norm": 1.6445329189300537, | |
| "learning_rate": 2.7250326134222352e-05, | |
| "loss": 0.4727, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.2828568322643767, | |
| "grad_norm": 2.901073694229126, | |
| "learning_rate": 2.7177851862588784e-05, | |
| "loss": 0.451, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 2.2901039968112475, | |
| "grad_norm": 3.6521453857421875, | |
| "learning_rate": 2.7105377590955213e-05, | |
| "loss": 0.4441, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.297351161358119, | |
| "grad_norm": 3.1472737789154053, | |
| "learning_rate": 2.703290331932164e-05, | |
| "loss": 0.4436, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 2.3045983259049896, | |
| "grad_norm": 2.873993396759033, | |
| "learning_rate": 2.6960429047688073e-05, | |
| "loss": 0.4555, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.311845490451861, | |
| "grad_norm": 2.5647237300872803, | |
| "learning_rate": 2.68879547760545e-05, | |
| "loss": 0.4621, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 2.3190926549987316, | |
| "grad_norm": 2.7584807872772217, | |
| "learning_rate": 2.681548050442093e-05, | |
| "loss": 0.4497, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.3190926549987316, | |
| "eval_loss": 0.644283652305603, | |
| "eval_runtime": 245.6009, | |
| "eval_samples_per_second": 140.451, | |
| "eval_steps_per_second": 14.047, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.326339819545603, | |
| "grad_norm": 3.315335750579834, | |
| "learning_rate": 2.674300623278736e-05, | |
| "loss": 0.4505, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 2.3335869840924737, | |
| "grad_norm": 2.8071439266204834, | |
| "learning_rate": 2.6670531961153794e-05, | |
| "loss": 0.4457, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.340834148639345, | |
| "grad_norm": 2.4625017642974854, | |
| "learning_rate": 2.6598057689520223e-05, | |
| "loss": 0.418, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 2.3480813131862157, | |
| "grad_norm": 3.5972659587860107, | |
| "learning_rate": 2.652558341788665e-05, | |
| "loss": 0.4188, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.355328477733087, | |
| "grad_norm": 2.557579517364502, | |
| "learning_rate": 2.645310914625308e-05, | |
| "loss": 0.4161, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 2.3625756422799578, | |
| "grad_norm": 2.0252676010131836, | |
| "learning_rate": 2.638063487461951e-05, | |
| "loss": 0.4504, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.369822806826829, | |
| "grad_norm": 2.324404001235962, | |
| "learning_rate": 2.6308160602985944e-05, | |
| "loss": 0.454, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 2.3770699713737002, | |
| "grad_norm": 2.3498804569244385, | |
| "learning_rate": 2.6235686331352372e-05, | |
| "loss": 0.4782, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.384317135920571, | |
| "grad_norm": 2.642273187637329, | |
| "learning_rate": 2.61632120597188e-05, | |
| "loss": 0.4531, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 2.3915643004674423, | |
| "grad_norm": 3.5636932849884033, | |
| "learning_rate": 2.609073778808523e-05, | |
| "loss": 0.4545, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.398811465014313, | |
| "grad_norm": 3.15792179107666, | |
| "learning_rate": 2.601826351645166e-05, | |
| "loss": 0.4827, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 2.4060586295611843, | |
| "grad_norm": 2.4860501289367676, | |
| "learning_rate": 2.594578924481809e-05, | |
| "loss": 0.4629, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.413305794108055, | |
| "grad_norm": 5.471420764923096, | |
| "learning_rate": 2.5873314973184522e-05, | |
| "loss": 0.4376, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 2.4205529586549264, | |
| "grad_norm": 3.098515033721924, | |
| "learning_rate": 2.580084070155095e-05, | |
| "loss": 0.4371, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.427800123201797, | |
| "grad_norm": 2.5739009380340576, | |
| "learning_rate": 2.5728366429917383e-05, | |
| "loss": 0.4339, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 2.4350472877486684, | |
| "grad_norm": 3.0826826095581055, | |
| "learning_rate": 2.565589215828381e-05, | |
| "loss": 0.4541, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.442294452295539, | |
| "grad_norm": 3.0049545764923096, | |
| "learning_rate": 2.558341788665024e-05, | |
| "loss": 0.4492, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 2.4495416168424105, | |
| "grad_norm": 3.194600820541382, | |
| "learning_rate": 2.5510943615016668e-05, | |
| "loss": 0.4509, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.4567887813892813, | |
| "grad_norm": 2.9486989974975586, | |
| "learning_rate": 2.5438469343383097e-05, | |
| "loss": 0.4104, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 2.4640359459361525, | |
| "grad_norm": 2.8794190883636475, | |
| "learning_rate": 2.5365995071749532e-05, | |
| "loss": 0.4308, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.4640359459361525, | |
| "eval_loss": 0.6368168592453003, | |
| "eval_runtime": 245.6078, | |
| "eval_samples_per_second": 140.447, | |
| "eval_steps_per_second": 14.047, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.4712831104830233, | |
| "grad_norm": 2.2457711696624756, | |
| "learning_rate": 2.529352080011596e-05, | |
| "loss": 0.4467, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 2.4785302750298945, | |
| "grad_norm": 3.0438215732574463, | |
| "learning_rate": 2.522104652848239e-05, | |
| "loss": 0.4748, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 2.485777439576766, | |
| "grad_norm": 3.1453700065612793, | |
| "learning_rate": 2.5148572256848818e-05, | |
| "loss": 0.4594, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 2.4930246041236366, | |
| "grad_norm": 2.476498603820801, | |
| "learning_rate": 2.5076097985215253e-05, | |
| "loss": 0.4637, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 2.500271768670508, | |
| "grad_norm": 2.5618162155151367, | |
| "learning_rate": 2.5003623713581682e-05, | |
| "loss": 0.4272, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 2.5075189332173786, | |
| "grad_norm": 2.719830274581909, | |
| "learning_rate": 2.4932598927380782e-05, | |
| "loss": 0.4436, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 2.51476609776425, | |
| "grad_norm": 2.393423557281494, | |
| "learning_rate": 2.486012465574721e-05, | |
| "loss": 0.5006, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 2.5220132623111207, | |
| "grad_norm": 2.518490791320801, | |
| "learning_rate": 2.4787650384113642e-05, | |
| "loss": 0.4378, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 2.529260426857992, | |
| "grad_norm": 3.152761697769165, | |
| "learning_rate": 2.471517611248007e-05, | |
| "loss": 0.4415, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 2.5365075914048627, | |
| "grad_norm": 2.689821243286133, | |
| "learning_rate": 2.46427018408465e-05, | |
| "loss": 0.4736, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.543754755951734, | |
| "grad_norm": 2.7041850090026855, | |
| "learning_rate": 2.457022756921293e-05, | |
| "loss": 0.4619, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 2.5510019204986047, | |
| "grad_norm": 2.8072993755340576, | |
| "learning_rate": 2.449775329757936e-05, | |
| "loss": 0.443, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 2.558249085045476, | |
| "grad_norm": 2.700951099395752, | |
| "learning_rate": 2.4425279025945792e-05, | |
| "loss": 0.4338, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 2.5654962495923472, | |
| "grad_norm": 2.2559311389923096, | |
| "learning_rate": 2.435280475431222e-05, | |
| "loss": 0.4371, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 2.572743414139218, | |
| "grad_norm": 2.7183778285980225, | |
| "learning_rate": 2.4280330482678653e-05, | |
| "loss": 0.4479, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 2.579990578686089, | |
| "grad_norm": 2.337385654449463, | |
| "learning_rate": 2.420785621104508e-05, | |
| "loss": 0.4703, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 2.58723774323296, | |
| "grad_norm": 2.985521078109741, | |
| "learning_rate": 2.413538193941151e-05, | |
| "loss": 0.44, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 2.5944849077798313, | |
| "grad_norm": 2.26230788230896, | |
| "learning_rate": 2.406290766777794e-05, | |
| "loss": 0.4485, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 2.601732072326702, | |
| "grad_norm": 3.0831480026245117, | |
| "learning_rate": 2.399043339614437e-05, | |
| "loss": 0.439, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 2.6089792368735734, | |
| "grad_norm": 3.325925827026367, | |
| "learning_rate": 2.39179591245108e-05, | |
| "loss": 0.4809, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.6089792368735734, | |
| "eval_loss": 0.6339168548583984, | |
| "eval_runtime": 245.517, | |
| "eval_samples_per_second": 140.499, | |
| "eval_steps_per_second": 14.052, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.616226401420444, | |
| "grad_norm": 3.043121099472046, | |
| "learning_rate": 2.3845484852877227e-05, | |
| "loss": 0.4382, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 2.6234735659673154, | |
| "grad_norm": 2.225372314453125, | |
| "learning_rate": 2.377301058124366e-05, | |
| "loss": 0.4379, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 2.630720730514186, | |
| "grad_norm": 3.059220790863037, | |
| "learning_rate": 2.3700536309610088e-05, | |
| "loss": 0.4475, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 2.6379678950610574, | |
| "grad_norm": 3.8372066020965576, | |
| "learning_rate": 2.362806203797652e-05, | |
| "loss": 0.4272, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 2.6452150596079282, | |
| "grad_norm": 1.4763438701629639, | |
| "learning_rate": 2.355558776634295e-05, | |
| "loss": 0.4629, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 2.6524622241547995, | |
| "grad_norm": 3.0369021892547607, | |
| "learning_rate": 2.348311349470938e-05, | |
| "loss": 0.438, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 2.6597093887016703, | |
| "grad_norm": 2.6003599166870117, | |
| "learning_rate": 2.341063922307581e-05, | |
| "loss": 0.4205, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 2.6669565532485415, | |
| "grad_norm": 2.9726734161376953, | |
| "learning_rate": 2.333816495144224e-05, | |
| "loss": 0.4453, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 2.6742037177954128, | |
| "grad_norm": 3.3538806438446045, | |
| "learning_rate": 2.326569067980867e-05, | |
| "loss": 0.4237, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 2.6814508823422836, | |
| "grad_norm": 2.160491704940796, | |
| "learning_rate": 2.31932164081751e-05, | |
| "loss": 0.4433, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.6886980468891544, | |
| "grad_norm": 3.117530345916748, | |
| "learning_rate": 2.312074213654153e-05, | |
| "loss": 0.4219, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 2.6959452114360256, | |
| "grad_norm": 2.787057638168335, | |
| "learning_rate": 2.304826786490796e-05, | |
| "loss": 0.4391, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.703192375982897, | |
| "grad_norm": 3.635530471801758, | |
| "learning_rate": 2.2975793593274387e-05, | |
| "loss": 0.461, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 2.7104395405297677, | |
| "grad_norm": 2.3657073974609375, | |
| "learning_rate": 2.290331932164082e-05, | |
| "loss": 0.4692, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.717686705076639, | |
| "grad_norm": 2.611757516860962, | |
| "learning_rate": 2.2830845050007248e-05, | |
| "loss": 0.4479, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 2.7249338696235097, | |
| "grad_norm": 2.967528820037842, | |
| "learning_rate": 2.2758370778373676e-05, | |
| "loss": 0.424, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.732181034170381, | |
| "grad_norm": 5.77213191986084, | |
| "learning_rate": 2.268589650674011e-05, | |
| "loss": 0.4389, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 2.7394281987172517, | |
| "grad_norm": 2.8429954051971436, | |
| "learning_rate": 2.2613422235106537e-05, | |
| "loss": 0.4829, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.746675363264123, | |
| "grad_norm": 2.9622368812561035, | |
| "learning_rate": 2.254094796347297e-05, | |
| "loss": 0.4781, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 2.7539225278109942, | |
| "grad_norm": 3.898066997528076, | |
| "learning_rate": 2.2468473691839397e-05, | |
| "loss": 0.4847, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.7539225278109942, | |
| "eval_loss": 0.6265138983726501, | |
| "eval_runtime": 245.5072, | |
| "eval_samples_per_second": 140.505, | |
| "eval_steps_per_second": 14.053, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.761169692357865, | |
| "grad_norm": 2.6696014404296875, | |
| "learning_rate": 2.239599942020583e-05, | |
| "loss": 0.4233, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 2.768416856904736, | |
| "grad_norm": 2.9069409370422363, | |
| "learning_rate": 2.2323525148572258e-05, | |
| "loss": 0.4632, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.775664021451607, | |
| "grad_norm": 3.3168020248413086, | |
| "learning_rate": 2.2251050876938687e-05, | |
| "loss": 0.4483, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 2.7829111859984783, | |
| "grad_norm": 2.685267686843872, | |
| "learning_rate": 2.217857660530512e-05, | |
| "loss": 0.4321, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.790158350545349, | |
| "grad_norm": 3.203169584274292, | |
| "learning_rate": 2.2106102333671547e-05, | |
| "loss": 0.4187, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 2.79740551509222, | |
| "grad_norm": 2.244285821914673, | |
| "learning_rate": 2.203362806203798e-05, | |
| "loss": 0.4336, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.804652679639091, | |
| "grad_norm": 2.8344221115112305, | |
| "learning_rate": 2.1961153790404408e-05, | |
| "loss": 0.4469, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 2.8118998441859624, | |
| "grad_norm": 2.193204879760742, | |
| "learning_rate": 2.188867951877084e-05, | |
| "loss": 0.4394, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.819147008732833, | |
| "grad_norm": 2.7660672664642334, | |
| "learning_rate": 2.1816205247137268e-05, | |
| "loss": 0.4646, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 2.8263941732797044, | |
| "grad_norm": 1.7778669595718384, | |
| "learning_rate": 2.1743730975503697e-05, | |
| "loss": 0.4176, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.8336413378265752, | |
| "grad_norm": 2.1182384490966797, | |
| "learning_rate": 2.1671256703870125e-05, | |
| "loss": 0.4266, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 2.8408885023734465, | |
| "grad_norm": 3.023648262023926, | |
| "learning_rate": 2.1598782432236557e-05, | |
| "loss": 0.4378, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.8481356669203173, | |
| "grad_norm": 3.217515707015991, | |
| "learning_rate": 2.1526308160602986e-05, | |
| "loss": 0.4328, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 2.8553828314671885, | |
| "grad_norm": 2.5275774002075195, | |
| "learning_rate": 2.1453833888969414e-05, | |
| "loss": 0.3978, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.8626299960140598, | |
| "grad_norm": 3.3163788318634033, | |
| "learning_rate": 2.1381359617335846e-05, | |
| "loss": 0.4594, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.8698771605609306, | |
| "grad_norm": 3.3472440242767334, | |
| "learning_rate": 2.1308885345702275e-05, | |
| "loss": 0.4368, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.8771243251078014, | |
| "grad_norm": 3.1864593029022217, | |
| "learning_rate": 2.1236411074068707e-05, | |
| "loss": 0.4715, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 2.8843714896546726, | |
| "grad_norm": 2.73544979095459, | |
| "learning_rate": 2.1163936802435135e-05, | |
| "loss": 0.443, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.891618654201544, | |
| "grad_norm": 3.4767727851867676, | |
| "learning_rate": 2.1091462530801567e-05, | |
| "loss": 0.4429, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 2.8988658187484146, | |
| "grad_norm": 2.7174811363220215, | |
| "learning_rate": 2.1018988259167996e-05, | |
| "loss": 0.4067, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.8988658187484146, | |
| "eval_loss": 0.622748076915741, | |
| "eval_runtime": 245.5244, | |
| "eval_samples_per_second": 140.495, | |
| "eval_steps_per_second": 14.052, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.9061129832952854, | |
| "grad_norm": 2.311414957046509, | |
| "learning_rate": 2.0947963472967096e-05, | |
| "loss": 0.451, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 2.9133601478421567, | |
| "grad_norm": 2.9489965438842773, | |
| "learning_rate": 2.0875489201333528e-05, | |
| "loss": 0.4621, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.920607312389028, | |
| "grad_norm": 3.1284987926483154, | |
| "learning_rate": 2.0803014929699957e-05, | |
| "loss": 0.4455, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 2.9278544769358987, | |
| "grad_norm": 2.579033613204956, | |
| "learning_rate": 2.073054065806639e-05, | |
| "loss": 0.4555, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.93510164148277, | |
| "grad_norm": 2.4020378589630127, | |
| "learning_rate": 2.0658066386432817e-05, | |
| "loss": 0.4531, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.9423488060296408, | |
| "grad_norm": 2.3254435062408447, | |
| "learning_rate": 2.0585592114799246e-05, | |
| "loss": 0.4434, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.949595970576512, | |
| "grad_norm": 2.742219924926758, | |
| "learning_rate": 2.0513117843165678e-05, | |
| "loss": 0.444, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 2.956843135123383, | |
| "grad_norm": 2.3544771671295166, | |
| "learning_rate": 2.0440643571532106e-05, | |
| "loss": 0.4576, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.964090299670254, | |
| "grad_norm": 2.842970609664917, | |
| "learning_rate": 2.0368169299898538e-05, | |
| "loss": 0.4416, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 2.9713374642171253, | |
| "grad_norm": 2.2534501552581787, | |
| "learning_rate": 2.0295695028264967e-05, | |
| "loss": 0.4348, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.978584628763996, | |
| "grad_norm": 2.976383924484253, | |
| "learning_rate": 2.02232207566314e-05, | |
| "loss": 0.4237, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 2.985831793310867, | |
| "grad_norm": 3.3762621879577637, | |
| "learning_rate": 2.0150746484997827e-05, | |
| "loss": 0.4506, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.993078957857738, | |
| "grad_norm": 3.0444388389587402, | |
| "learning_rate": 2.0078272213364256e-05, | |
| "loss": 0.4357, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 3.0003261224046094, | |
| "grad_norm": 3.294370412826538, | |
| "learning_rate": 2.0005797941730688e-05, | |
| "loss": 0.4358, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 3.00757328695148, | |
| "grad_norm": 2.1435303688049316, | |
| "learning_rate": 1.9933323670097116e-05, | |
| "loss": 0.343, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 3.0148204514983514, | |
| "grad_norm": 2.5070137977600098, | |
| "learning_rate": 1.9860849398463545e-05, | |
| "loss": 0.3397, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 3.0220676160452222, | |
| "grad_norm": 2.5345394611358643, | |
| "learning_rate": 1.9788375126829974e-05, | |
| "loss": 0.3221, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 3.0293147805920935, | |
| "grad_norm": 2.4468677043914795, | |
| "learning_rate": 1.9715900855196406e-05, | |
| "loss": 0.3404, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 3.0365619451389643, | |
| "grad_norm": 3.5916829109191895, | |
| "learning_rate": 1.964487606899551e-05, | |
| "loss": 0.3412, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 3.0438091096858355, | |
| "grad_norm": 1.7320780754089355, | |
| "learning_rate": 1.9572401797361938e-05, | |
| "loss": 0.3573, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.0438091096858355, | |
| "eval_loss": 0.6396881937980652, | |
| "eval_runtime": 245.0839, | |
| "eval_samples_per_second": 140.748, | |
| "eval_steps_per_second": 14.077, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.0510562742327063, | |
| "grad_norm": 2.4702775478363037, | |
| "learning_rate": 1.9499927525728366e-05, | |
| "loss": 0.3485, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 3.0583034387795776, | |
| "grad_norm": 3.04402232170105, | |
| "learning_rate": 1.9427453254094795e-05, | |
| "loss": 0.3575, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 3.0655506033264484, | |
| "grad_norm": 2.487696886062622, | |
| "learning_rate": 1.9354978982461227e-05, | |
| "loss": 0.3178, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 3.0727977678733196, | |
| "grad_norm": 2.565079689025879, | |
| "learning_rate": 1.9282504710827655e-05, | |
| "loss": 0.3311, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 3.0800449324201904, | |
| "grad_norm": 3.043264865875244, | |
| "learning_rate": 1.9210030439194087e-05, | |
| "loss": 0.3201, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 3.0872920969670616, | |
| "grad_norm": 3.086071729660034, | |
| "learning_rate": 1.9137556167560516e-05, | |
| "loss": 0.3385, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 3.094539261513933, | |
| "grad_norm": 1.5752780437469482, | |
| "learning_rate": 1.9065081895926948e-05, | |
| "loss": 0.305, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 3.1017864260608037, | |
| "grad_norm": 2.464972496032715, | |
| "learning_rate": 1.8992607624293376e-05, | |
| "loss": 0.3421, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 3.109033590607675, | |
| "grad_norm": 2.98641037940979, | |
| "learning_rate": 1.892013335265981e-05, | |
| "loss": 0.3581, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 3.1162807551545457, | |
| "grad_norm": 2.293949842453003, | |
| "learning_rate": 1.8847659081026237e-05, | |
| "loss": 0.3241, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.123527919701417, | |
| "grad_norm": 2.361656427383423, | |
| "learning_rate": 1.8775184809392666e-05, | |
| "loss": 0.3154, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 3.1307750842482878, | |
| "grad_norm": 3.095930576324463, | |
| "learning_rate": 1.8702710537759097e-05, | |
| "loss": 0.3043, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 3.138022248795159, | |
| "grad_norm": 2.254836320877075, | |
| "learning_rate": 1.8630236266125526e-05, | |
| "loss": 0.3381, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 3.14526941334203, | |
| "grad_norm": 3.281912088394165, | |
| "learning_rate": 1.8557761994491958e-05, | |
| "loss": 0.3292, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 3.152516577888901, | |
| "grad_norm": 2.4811136722564697, | |
| "learning_rate": 1.8485287722858387e-05, | |
| "loss": 0.3221, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 3.159763742435772, | |
| "grad_norm": 2.5498745441436768, | |
| "learning_rate": 1.8412813451224815e-05, | |
| "loss": 0.3548, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 3.167010906982643, | |
| "grad_norm": 2.9773364067077637, | |
| "learning_rate": 1.8340339179591247e-05, | |
| "loss": 0.3199, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 3.174258071529514, | |
| "grad_norm": 2.239015579223633, | |
| "learning_rate": 1.8267864907957676e-05, | |
| "loss": 0.3258, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 3.181505236076385, | |
| "grad_norm": 2.470496416091919, | |
| "learning_rate": 1.8195390636324104e-05, | |
| "loss": 0.3199, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 3.1887524006232564, | |
| "grad_norm": 2.567301034927368, | |
| "learning_rate": 1.8122916364690536e-05, | |
| "loss": 0.3859, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.1887524006232564, | |
| "eval_loss": 0.6378082036972046, | |
| "eval_runtime": 245.7126, | |
| "eval_samples_per_second": 140.388, | |
| "eval_steps_per_second": 14.041, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.195999565170127, | |
| "grad_norm": 3.044546127319336, | |
| "learning_rate": 1.8050442093056965e-05, | |
| "loss": 0.3116, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 3.2032467297169984, | |
| "grad_norm": 2.832991600036621, | |
| "learning_rate": 1.7977967821423393e-05, | |
| "loss": 0.3343, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 3.210493894263869, | |
| "grad_norm": 2.9658920764923096, | |
| "learning_rate": 1.7905493549789825e-05, | |
| "loss": 0.3576, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 3.2177410588107405, | |
| "grad_norm": 3.258549213409424, | |
| "learning_rate": 1.7833019278156254e-05, | |
| "loss": 0.3396, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 3.2249882233576113, | |
| "grad_norm": 3.3627419471740723, | |
| "learning_rate": 1.7760545006522686e-05, | |
| "loss": 0.3206, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 3.2322353879044825, | |
| "grad_norm": 2.9110984802246094, | |
| "learning_rate": 1.7688070734889114e-05, | |
| "loss": 0.3612, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 3.2394825524513533, | |
| "grad_norm": 2.8267645835876465, | |
| "learning_rate": 1.7615596463255546e-05, | |
| "loss": 0.3477, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 3.2467297169982245, | |
| "grad_norm": 2.8990402221679688, | |
| "learning_rate": 1.7543122191621975e-05, | |
| "loss": 0.3301, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 3.2539768815450953, | |
| "grad_norm": 2.5321147441864014, | |
| "learning_rate": 1.7470647919988407e-05, | |
| "loss": 0.3212, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 3.2612240460919666, | |
| "grad_norm": 3.9064178466796875, | |
| "learning_rate": 1.7398173648354835e-05, | |
| "loss": 0.3752, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 3.2684712106388374, | |
| "grad_norm": 2.5248515605926514, | |
| "learning_rate": 1.7325699376721267e-05, | |
| "loss": 0.3467, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 3.2757183751857086, | |
| "grad_norm": 2.442370653152466, | |
| "learning_rate": 1.7253225105087696e-05, | |
| "loss": 0.3171, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 3.2829655397325794, | |
| "grad_norm": 2.523216485977173, | |
| "learning_rate": 1.7180750833454125e-05, | |
| "loss": 0.3422, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 3.2902127042794507, | |
| "grad_norm": 3.633876323699951, | |
| "learning_rate": 1.7108276561820553e-05, | |
| "loss": 0.3236, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 3.297459868826322, | |
| "grad_norm": 3.5989325046539307, | |
| "learning_rate": 1.7035802290186985e-05, | |
| "loss": 0.3562, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 3.3047070333731927, | |
| "grad_norm": 2.8764047622680664, | |
| "learning_rate": 1.6963328018553414e-05, | |
| "loss": 0.3432, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 3.311954197920064, | |
| "grad_norm": 2.8960604667663574, | |
| "learning_rate": 1.6890853746919842e-05, | |
| "loss": 0.339, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 3.3192013624669348, | |
| "grad_norm": 3.1542296409606934, | |
| "learning_rate": 1.6818379475286274e-05, | |
| "loss": 0.3578, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 3.326448527013806, | |
| "grad_norm": 3.966387987136841, | |
| "learning_rate": 1.6745905203652703e-05, | |
| "loss": 0.3437, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 3.333695691560677, | |
| "grad_norm": 2.1080222129821777, | |
| "learning_rate": 1.6673430932019135e-05, | |
| "loss": 0.3476, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.333695691560677, | |
| "eval_loss": 0.6372683644294739, | |
| "eval_runtime": 245.8192, | |
| "eval_samples_per_second": 140.327, | |
| "eval_steps_per_second": 14.035, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.340942856107548, | |
| "grad_norm": 2.4654605388641357, | |
| "learning_rate": 1.6600956660385563e-05, | |
| "loss": 0.3582, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 3.348190020654419, | |
| "grad_norm": 3.2088425159454346, | |
| "learning_rate": 1.6528482388751995e-05, | |
| "loss": 0.3121, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 3.35543718520129, | |
| "grad_norm": 2.5280325412750244, | |
| "learning_rate": 1.6456008117118424e-05, | |
| "loss": 0.3233, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 3.362684349748161, | |
| "grad_norm": 2.5772511959075928, | |
| "learning_rate": 1.6383533845484852e-05, | |
| "loss": 0.3277, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 3.369931514295032, | |
| "grad_norm": 2.4769599437713623, | |
| "learning_rate": 1.6311059573851284e-05, | |
| "loss": 0.3349, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 3.377178678841903, | |
| "grad_norm": 3.1656038761138916, | |
| "learning_rate": 1.6238585302217713e-05, | |
| "loss": 0.3601, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 3.384425843388774, | |
| "grad_norm": 3.1911141872406006, | |
| "learning_rate": 1.6166111030584145e-05, | |
| "loss": 0.3664, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 3.391673007935645, | |
| "grad_norm": 3.001246213912964, | |
| "learning_rate": 1.6093636758950574e-05, | |
| "loss": 0.3547, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 3.398920172482516, | |
| "grad_norm": 2.2357707023620605, | |
| "learning_rate": 1.6021162487317005e-05, | |
| "loss": 0.3581, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 3.4061673370293875, | |
| "grad_norm": 2.813751459121704, | |
| "learning_rate": 1.5948688215683434e-05, | |
| "loss": 0.3436, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 3.4134145015762583, | |
| "grad_norm": 2.3340563774108887, | |
| "learning_rate": 1.5876213944049863e-05, | |
| "loss": 0.3424, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 3.4206616661231295, | |
| "grad_norm": 3.3509624004364014, | |
| "learning_rate": 1.580373967241629e-05, | |
| "loss": 0.365, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 3.4279088306700003, | |
| "grad_norm": 2.6918370723724365, | |
| "learning_rate": 1.5731265400782723e-05, | |
| "loss": 0.3396, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 3.4351559952168715, | |
| "grad_norm": 3.110868215560913, | |
| "learning_rate": 1.5658791129149152e-05, | |
| "loss": 0.3303, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 3.4424031597637423, | |
| "grad_norm": 3.6813771724700928, | |
| "learning_rate": 1.558631685751558e-05, | |
| "loss": 0.3726, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 3.4496503243106136, | |
| "grad_norm": 2.564406633377075, | |
| "learning_rate": 1.5513842585882012e-05, | |
| "loss": 0.3507, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 3.4568974888574844, | |
| "grad_norm": 2.496525526046753, | |
| "learning_rate": 1.544136831424844e-05, | |
| "loss": 0.3348, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 3.4641446534043556, | |
| "grad_norm": 3.00034761428833, | |
| "learning_rate": 1.5368894042614873e-05, | |
| "loss": 0.35, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 3.4713918179512264, | |
| "grad_norm": 2.688913106918335, | |
| "learning_rate": 1.52964197709813e-05, | |
| "loss": 0.3123, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 3.4786389824980977, | |
| "grad_norm": 3.100461721420288, | |
| "learning_rate": 1.5223945499347733e-05, | |
| "loss": 0.3559, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.4786389824980977, | |
| "eval_loss": 0.6312422156333923, | |
| "eval_runtime": 246.9159, | |
| "eval_samples_per_second": 139.703, | |
| "eval_steps_per_second": 13.972, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.4858861470449685, | |
| "grad_norm": 2.221304178237915, | |
| "learning_rate": 1.5151471227714162e-05, | |
| "loss": 0.3437, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 3.4931333115918397, | |
| "grad_norm": 2.807159662246704, | |
| "learning_rate": 1.5078996956080594e-05, | |
| "loss": 0.3284, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 3.5003804761387105, | |
| "grad_norm": 2.910870313644409, | |
| "learning_rate": 1.5006522684447022e-05, | |
| "loss": 0.3436, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 3.5076276406855817, | |
| "grad_norm": 2.7148754596710205, | |
| "learning_rate": 1.4934048412813451e-05, | |
| "loss": 0.364, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 3.514874805232453, | |
| "grad_norm": 3.629567861557007, | |
| "learning_rate": 1.4861574141179881e-05, | |
| "loss": 0.3293, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 3.522121969779324, | |
| "grad_norm": 1.6957030296325684, | |
| "learning_rate": 1.478909986954631e-05, | |
| "loss": 0.3418, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 3.529369134326195, | |
| "grad_norm": 2.671588659286499, | |
| "learning_rate": 1.4716625597912742e-05, | |
| "loss": 0.3054, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 3.536616298873066, | |
| "grad_norm": 2.832435369491577, | |
| "learning_rate": 1.464415132627917e-05, | |
| "loss": 0.3457, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 3.543863463419937, | |
| "grad_norm": 3.806084156036377, | |
| "learning_rate": 1.4571677054645602e-05, | |
| "loss": 0.3366, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 3.551110627966808, | |
| "grad_norm": 3.169780731201172, | |
| "learning_rate": 1.4499202783012031e-05, | |
| "loss": 0.3337, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 3.558357792513679, | |
| "grad_norm": 2.461219310760498, | |
| "learning_rate": 1.4426728511378463e-05, | |
| "loss": 0.3394, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 3.56560495706055, | |
| "grad_norm": 2.458402633666992, | |
| "learning_rate": 1.4354254239744891e-05, | |
| "loss": 0.3245, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 3.572852121607421, | |
| "grad_norm": 2.172034740447998, | |
| "learning_rate": 1.4281779968111322e-05, | |
| "loss": 0.3335, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 3.580099286154292, | |
| "grad_norm": 2.7269339561462402, | |
| "learning_rate": 1.420930569647775e-05, | |
| "loss": 0.3268, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 3.587346450701163, | |
| "grad_norm": 3.2520856857299805, | |
| "learning_rate": 1.4136831424844179e-05, | |
| "loss": 0.3282, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 3.5945936152480344, | |
| "grad_norm": 3.6039845943450928, | |
| "learning_rate": 1.406435715321061e-05, | |
| "loss": 0.3375, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 3.6018407797949052, | |
| "grad_norm": 2.7368907928466797, | |
| "learning_rate": 1.399188288157704e-05, | |
| "loss": 0.3368, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 3.609087944341776, | |
| "grad_norm": 3.1287124156951904, | |
| "learning_rate": 1.3919408609943471e-05, | |
| "loss": 0.3517, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 3.6163351088886473, | |
| "grad_norm": 3.3379523754119873, | |
| "learning_rate": 1.38469343383099e-05, | |
| "loss": 0.3334, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 3.6235822734355185, | |
| "grad_norm": 2.828714609146118, | |
| "learning_rate": 1.3774460066676332e-05, | |
| "loss": 0.3148, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 3.6235822734355185, | |
| "eval_loss": 0.6326374411582947, | |
| "eval_runtime": 245.5715, | |
| "eval_samples_per_second": 140.468, | |
| "eval_steps_per_second": 14.049, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 3.6308294379823893, | |
| "grad_norm": 2.7624423503875732, | |
| "learning_rate": 1.370198579504276e-05, | |
| "loss": 0.3333, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 3.6380766025292606, | |
| "grad_norm": 2.4403984546661377, | |
| "learning_rate": 1.362951152340919e-05, | |
| "loss": 0.3306, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 3.6453237670761314, | |
| "grad_norm": 3.4315109252929688, | |
| "learning_rate": 1.355703725177562e-05, | |
| "loss": 0.3428, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 3.6525709316230026, | |
| "grad_norm": 4.232142925262451, | |
| "learning_rate": 1.3484562980142051e-05, | |
| "loss": 0.3437, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 3.6598180961698734, | |
| "grad_norm": 2.562215805053711, | |
| "learning_rate": 1.341208870850848e-05, | |
| "loss": 0.3437, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 3.6670652607167447, | |
| "grad_norm": 2.4503726959228516, | |
| "learning_rate": 1.3339614436874908e-05, | |
| "loss": 0.3018, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 3.6743124252636155, | |
| "grad_norm": 2.709066390991211, | |
| "learning_rate": 1.3268589650674012e-05, | |
| "loss": 0.3582, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 3.6815595898104867, | |
| "grad_norm": 2.3442864418029785, | |
| "learning_rate": 1.319611537904044e-05, | |
| "loss": 0.3224, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 3.6888067543573575, | |
| "grad_norm": 4.138051509857178, | |
| "learning_rate": 1.312364110740687e-05, | |
| "loss": 0.3791, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 3.6960539189042287, | |
| "grad_norm": 3.238833427429199, | |
| "learning_rate": 1.3051166835773301e-05, | |
| "loss": 0.3191, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 3.7033010834511, | |
| "grad_norm": 3.0697717666625977, | |
| "learning_rate": 1.297869256413973e-05, | |
| "loss": 0.32, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 3.710548247997971, | |
| "grad_norm": 2.4563581943511963, | |
| "learning_rate": 1.2906218292506162e-05, | |
| "loss": 0.3269, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 3.7177954125448416, | |
| "grad_norm": 2.1714043617248535, | |
| "learning_rate": 1.283374402087259e-05, | |
| "loss": 0.3085, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 3.725042577091713, | |
| "grad_norm": 2.205698013305664, | |
| "learning_rate": 1.2761269749239022e-05, | |
| "loss": 0.3558, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 3.732289741638584, | |
| "grad_norm": 2.4091830253601074, | |
| "learning_rate": 1.268879547760545e-05, | |
| "loss": 0.3631, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 3.739536906185455, | |
| "grad_norm": 1.7875028848648071, | |
| "learning_rate": 1.2616321205971881e-05, | |
| "loss": 0.3241, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 3.746784070732326, | |
| "grad_norm": 3.206444501876831, | |
| "learning_rate": 1.2545296419770983e-05, | |
| "loss": 0.3237, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 3.754031235279197, | |
| "grad_norm": 2.4100027084350586, | |
| "learning_rate": 1.2472822148137411e-05, | |
| "loss": 0.3116, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 3.761278399826068, | |
| "grad_norm": 3.0889265537261963, | |
| "learning_rate": 1.2400347876503842e-05, | |
| "loss": 0.3599, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 3.768525564372939, | |
| "grad_norm": 2.965827703475952, | |
| "learning_rate": 1.2327873604870272e-05, | |
| "loss": 0.3477, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 3.768525564372939, | |
| "eval_loss": 0.6264123320579529, | |
| "eval_runtime": 245.5751, | |
| "eval_samples_per_second": 140.466, | |
| "eval_steps_per_second": 14.049, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 3.77577272891981, | |
| "grad_norm": 3.572783946990967, | |
| "learning_rate": 1.2255399333236702e-05, | |
| "loss": 0.3519, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 3.7830198934666814, | |
| "grad_norm": 3.2291600704193115, | |
| "learning_rate": 1.218292506160313e-05, | |
| "loss": 0.3377, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 3.7902670580135522, | |
| "grad_norm": 3.1608381271362305, | |
| "learning_rate": 1.2110450789969561e-05, | |
| "loss": 0.3306, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 3.797514222560423, | |
| "grad_norm": 2.326995611190796, | |
| "learning_rate": 1.2037976518335991e-05, | |
| "loss": 0.3425, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 3.8047613871072943, | |
| "grad_norm": 2.580730438232422, | |
| "learning_rate": 1.1965502246702421e-05, | |
| "loss": 0.3544, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 3.8120085516541655, | |
| "grad_norm": 3.1434969902038574, | |
| "learning_rate": 1.1893027975068852e-05, | |
| "loss": 0.3475, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 3.8192557162010363, | |
| "grad_norm": 2.08758282661438, | |
| "learning_rate": 1.1820553703435282e-05, | |
| "loss": 0.3287, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 3.826502880747907, | |
| "grad_norm": 2.8469362258911133, | |
| "learning_rate": 1.174807943180171e-05, | |
| "loss": 0.3236, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 3.8337500452947784, | |
| "grad_norm": 3.5601906776428223, | |
| "learning_rate": 1.1675605160168141e-05, | |
| "loss": 0.3263, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 3.8409972098416496, | |
| "grad_norm": 3.756640911102295, | |
| "learning_rate": 1.1603130888534571e-05, | |
| "loss": 0.3382, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 3.8482443743885204, | |
| "grad_norm": 2.394885778427124, | |
| "learning_rate": 1.1530656616901e-05, | |
| "loss": 0.3373, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 3.8554915389353916, | |
| "grad_norm": 2.798363208770752, | |
| "learning_rate": 1.145818234526743e-05, | |
| "loss": 0.3259, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 3.8627387034822624, | |
| "grad_norm": 2.411869764328003, | |
| "learning_rate": 1.138570807363386e-05, | |
| "loss": 0.3418, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 3.8699858680291337, | |
| "grad_norm": 3.126814603805542, | |
| "learning_rate": 1.131323380200029e-05, | |
| "loss": 0.3334, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 3.8772330325760045, | |
| "grad_norm": 3.4210116863250732, | |
| "learning_rate": 1.124075953036672e-05, | |
| "loss": 0.3249, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 3.8844801971228757, | |
| "grad_norm": 2.846679925918579, | |
| "learning_rate": 1.1168285258733151e-05, | |
| "loss": 0.3377, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 3.891727361669747, | |
| "grad_norm": 3.338003635406494, | |
| "learning_rate": 1.1095810987099581e-05, | |
| "loss": 0.3626, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 3.8989745262166178, | |
| "grad_norm": 3.7777626514434814, | |
| "learning_rate": 1.102333671546601e-05, | |
| "loss": 0.3385, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 3.9062216907634886, | |
| "grad_norm": 2.5645010471343994, | |
| "learning_rate": 1.095086244383244e-05, | |
| "loss": 0.3485, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 3.91346885531036, | |
| "grad_norm": 2.8242435455322266, | |
| "learning_rate": 1.0878388172198869e-05, | |
| "loss": 0.353, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 3.91346885531036, | |
| "eval_loss": 0.6270226836204529, | |
| "eval_runtime": 245.4756, | |
| "eval_samples_per_second": 140.523, | |
| "eval_steps_per_second": 14.054, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 3.920716019857231, | |
| "grad_norm": 3.0427374839782715, | |
| "learning_rate": 1.0805913900565299e-05, | |
| "loss": 0.3625, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 3.927963184404102, | |
| "grad_norm": 3.5145153999328613, | |
| "learning_rate": 1.073343962893173e-05, | |
| "loss": 0.3377, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 3.9352103489509727, | |
| "grad_norm": 3.0157294273376465, | |
| "learning_rate": 1.066096535729816e-05, | |
| "loss": 0.3231, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 3.942457513497844, | |
| "grad_norm": 3.3504700660705566, | |
| "learning_rate": 1.058849108566459e-05, | |
| "loss": 0.3407, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 3.949704678044715, | |
| "grad_norm": 2.313544988632202, | |
| "learning_rate": 1.051601681403102e-05, | |
| "loss": 0.3252, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 3.956951842591586, | |
| "grad_norm": 3.034682035446167, | |
| "learning_rate": 1.044354254239745e-05, | |
| "loss": 0.319, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 3.964199007138457, | |
| "grad_norm": 2.8075735569000244, | |
| "learning_rate": 1.037106827076388e-05, | |
| "loss": 0.3371, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 3.971446171685328, | |
| "grad_norm": 2.394465446472168, | |
| "learning_rate": 1.0298593999130309e-05, | |
| "loss": 0.3286, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 3.9786933362321992, | |
| "grad_norm": 2.6049180030822754, | |
| "learning_rate": 1.0226119727496738e-05, | |
| "loss": 0.3295, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 3.98594050077907, | |
| "grad_norm": 2.6557512283325195, | |
| "learning_rate": 1.0153645455863168e-05, | |
| "loss": 0.3566, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 3.9931876653259413, | |
| "grad_norm": 2.997840166091919, | |
| "learning_rate": 1.0081171184229598e-05, | |
| "loss": 0.326, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 4.0004348298728125, | |
| "grad_norm": 2.4651620388031006, | |
| "learning_rate": 1.0008696912596029e-05, | |
| "loss": 0.3328, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 4.007681994419683, | |
| "grad_norm": 2.93027400970459, | |
| "learning_rate": 9.936222640962459e-06, | |
| "loss": 0.2705, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 4.014929158966554, | |
| "grad_norm": 3.154695987701416, | |
| "learning_rate": 9.863748369328889e-06, | |
| "loss": 0.251, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 4.022176323513426, | |
| "grad_norm": 2.877485990524292, | |
| "learning_rate": 9.79127409769532e-06, | |
| "loss": 0.2557, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 4.029423488060297, | |
| "grad_norm": 2.5868325233459473, | |
| "learning_rate": 9.71879982606175e-06, | |
| "loss": 0.2481, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 4.036670652607167, | |
| "grad_norm": 4.68599271774292, | |
| "learning_rate": 9.646325554428178e-06, | |
| "loss": 0.2704, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 4.043917817154038, | |
| "grad_norm": 2.302772045135498, | |
| "learning_rate": 9.573851282794608e-06, | |
| "loss": 0.2537, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 4.05116498170091, | |
| "grad_norm": 2.2476446628570557, | |
| "learning_rate": 9.501377011161039e-06, | |
| "loss": 0.2488, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 4.058412146247781, | |
| "grad_norm": 1.8352832794189453, | |
| "learning_rate": 9.428902739527467e-06, | |
| "loss": 0.2693, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 4.058412146247781, | |
| "eval_loss": 0.6433804631233215, | |
| "eval_runtime": 245.0452, | |
| "eval_samples_per_second": 140.77, | |
| "eval_steps_per_second": 14.079, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 4.0656593107946515, | |
| "grad_norm": 2.861711263656616, | |
| "learning_rate": 9.356428467893898e-06, | |
| "loss": 0.2552, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 4.072906475341522, | |
| "grad_norm": 2.669003963470459, | |
| "learning_rate": 9.283954196260328e-06, | |
| "loss": 0.2737, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 4.080153639888394, | |
| "grad_norm": 3.106980323791504, | |
| "learning_rate": 9.211479924626758e-06, | |
| "loss": 0.265, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 4.087400804435265, | |
| "grad_norm": 2.7630670070648193, | |
| "learning_rate": 9.139005652993188e-06, | |
| "loss": 0.2457, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 4.094647968982136, | |
| "grad_norm": 2.5765066146850586, | |
| "learning_rate": 9.066531381359619e-06, | |
| "loss": 0.2815, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 4.101895133529006, | |
| "grad_norm": 2.980583667755127, | |
| "learning_rate": 8.994057109726047e-06, | |
| "loss": 0.2763, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 4.109142298075878, | |
| "grad_norm": 2.6509013175964355, | |
| "learning_rate": 8.921582838092477e-06, | |
| "loss": 0.2454, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 4.116389462622749, | |
| "grad_norm": 2.8553245067596436, | |
| "learning_rate": 8.849108566458908e-06, | |
| "loss": 0.2749, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 4.12363662716962, | |
| "grad_norm": 2.524636745452881, | |
| "learning_rate": 8.776634294825338e-06, | |
| "loss": 0.2537, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 4.130883791716491, | |
| "grad_norm": 3.848393440246582, | |
| "learning_rate": 8.704160023191767e-06, | |
| "loss": 0.2635, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 4.138130956263362, | |
| "grad_norm": 3.296485662460327, | |
| "learning_rate": 8.631685751558197e-06, | |
| "loss": 0.2499, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 4.145378120810233, | |
| "grad_norm": 3.0012335777282715, | |
| "learning_rate": 8.559211479924627e-06, | |
| "loss": 0.2698, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 4.152625285357104, | |
| "grad_norm": 1.961544156074524, | |
| "learning_rate": 8.486737208291057e-06, | |
| "loss": 0.23, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 4.159872449903975, | |
| "grad_norm": 3.157874822616577, | |
| "learning_rate": 8.414262936657488e-06, | |
| "loss": 0.2493, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 4.167119614450846, | |
| "grad_norm": 2.372300624847412, | |
| "learning_rate": 8.341788665023916e-06, | |
| "loss": 0.2874, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 4.174366778997717, | |
| "grad_norm": 1.9763847589492798, | |
| "learning_rate": 8.269314393390346e-06, | |
| "loss": 0.2508, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 4.181613943544588, | |
| "grad_norm": 3.00111985206604, | |
| "learning_rate": 8.196840121756777e-06, | |
| "loss": 0.2732, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 4.1888611080914595, | |
| "grad_norm": 3.2600185871124268, | |
| "learning_rate": 8.124365850123207e-06, | |
| "loss": 0.2718, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 4.19610827263833, | |
| "grad_norm": 2.755221366882324, | |
| "learning_rate": 8.051891578489637e-06, | |
| "loss": 0.2401, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 4.203355437185201, | |
| "grad_norm": 3.3103065490722656, | |
| "learning_rate": 7.979417306856067e-06, | |
| "loss": 0.2898, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 4.203355437185201, | |
| "eval_loss": 0.6491243243217468, | |
| "eval_runtime": 245.4343, | |
| "eval_samples_per_second": 140.547, | |
| "eval_steps_per_second": 14.057, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 4.210602601732072, | |
| "grad_norm": 2.783529281616211, | |
| "learning_rate": 7.906943035222496e-06, | |
| "loss": 0.2774, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 4.217849766278944, | |
| "grad_norm": 2.3917746543884277, | |
| "learning_rate": 7.834468763588926e-06, | |
| "loss": 0.2896, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 4.225096930825814, | |
| "grad_norm": 3.132794141769409, | |
| "learning_rate": 7.761994491955357e-06, | |
| "loss": 0.2704, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 4.232344095372685, | |
| "grad_norm": 2.8275017738342285, | |
| "learning_rate": 7.689520220321785e-06, | |
| "loss": 0.2695, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 4.239591259919557, | |
| "grad_norm": 3.1233084201812744, | |
| "learning_rate": 7.617045948688216e-06, | |
| "loss": 0.2677, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 4.246838424466428, | |
| "grad_norm": 3.1787428855895996, | |
| "learning_rate": 7.544571677054646e-06, | |
| "loss": 0.2611, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 4.2540855890132985, | |
| "grad_norm": 3.4065091609954834, | |
| "learning_rate": 7.472097405421076e-06, | |
| "loss": 0.285, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 4.261332753560169, | |
| "grad_norm": 2.7599704265594482, | |
| "learning_rate": 7.399623133787506e-06, | |
| "loss": 0.2586, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 4.268579918107041, | |
| "grad_norm": 2.2776358127593994, | |
| "learning_rate": 7.3271488621539365e-06, | |
| "loss": 0.2532, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 4.275827082653912, | |
| "grad_norm": 3.391362428665161, | |
| "learning_rate": 7.254674590520366e-06, | |
| "loss": 0.257, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 4.283074247200783, | |
| "grad_norm": 4.152310371398926, | |
| "learning_rate": 7.182200318886796e-06, | |
| "loss": 0.2655, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 4.290321411747653, | |
| "grad_norm": 1.8384218215942383, | |
| "learning_rate": 7.109726047253225e-06, | |
| "loss": 0.2556, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 4.297568576294525, | |
| "grad_norm": 2.8006668090820312, | |
| "learning_rate": 7.037251775619655e-06, | |
| "loss": 0.252, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 4.304815740841396, | |
| "grad_norm": 2.0686655044555664, | |
| "learning_rate": 6.964777503986085e-06, | |
| "loss": 0.2626, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 4.312062905388267, | |
| "grad_norm": 4.304172515869141, | |
| "learning_rate": 6.892303232352515e-06, | |
| "loss": 0.2566, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 4.319310069935138, | |
| "grad_norm": 3.3372154235839844, | |
| "learning_rate": 6.819828960718945e-06, | |
| "loss": 0.256, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 4.326557234482009, | |
| "grad_norm": 2.2065439224243164, | |
| "learning_rate": 6.747354689085375e-06, | |
| "loss": 0.251, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 4.33380439902888, | |
| "grad_norm": 3.629650354385376, | |
| "learning_rate": 6.6748804174518055e-06, | |
| "loss": 0.2696, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 4.341051563575751, | |
| "grad_norm": 2.2397236824035645, | |
| "learning_rate": 6.602406145818235e-06, | |
| "loss": 0.2359, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 4.348298728122622, | |
| "grad_norm": 3.494893789291382, | |
| "learning_rate": 6.529931874184665e-06, | |
| "loss": 0.2714, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 4.348298728122622, | |
| "eval_loss": 0.6459131836891174, | |
| "eval_runtime": 245.3544, | |
| "eval_samples_per_second": 140.593, | |
| "eval_steps_per_second": 14.061, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 4.355545892669493, | |
| "grad_norm": 2.9454917907714844, | |
| "learning_rate": 6.4574576025510955e-06, | |
| "loss": 0.2539, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 4.362793057216364, | |
| "grad_norm": 2.9951882362365723, | |
| "learning_rate": 6.384983330917525e-06, | |
| "loss": 0.2687, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 4.370040221763235, | |
| "grad_norm": 3.583976984024048, | |
| "learning_rate": 6.312509059283954e-06, | |
| "loss": 0.2879, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 4.3772873863101065, | |
| "grad_norm": 3.201929807662964, | |
| "learning_rate": 6.2400347876503846e-06, | |
| "loss": 0.2531, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 4.384534550856977, | |
| "grad_norm": 2.60980486869812, | |
| "learning_rate": 6.167560516016814e-06, | |
| "loss": 0.2681, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 4.391781715403848, | |
| "grad_norm": 2.1033682823181152, | |
| "learning_rate": 6.095086244383244e-06, | |
| "loss": 0.2774, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 4.399028879950719, | |
| "grad_norm": 2.239474058151245, | |
| "learning_rate": 6.022611972749674e-06, | |
| "loss": 0.2676, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 4.406276044497591, | |
| "grad_norm": 3.176302671432495, | |
| "learning_rate": 5.950137701116104e-06, | |
| "loss": 0.2722, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 4.413523209044461, | |
| "grad_norm": 2.5901739597320557, | |
| "learning_rate": 5.877663429482534e-06, | |
| "loss": 0.2523, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 4.420770373591332, | |
| "grad_norm": 2.8084895610809326, | |
| "learning_rate": 5.8051891578489645e-06, | |
| "loss": 0.2487, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 4.428017538138203, | |
| "grad_norm": 3.332167387008667, | |
| "learning_rate": 5.732714886215394e-06, | |
| "loss": 0.2523, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 4.435264702685075, | |
| "grad_norm": 2.872776746749878, | |
| "learning_rate": 5.660240614581823e-06, | |
| "loss": 0.2739, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 4.4425118672319455, | |
| "grad_norm": 1.9077197313308716, | |
| "learning_rate": 5.5877663429482536e-06, | |
| "loss": 0.2476, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 4.449759031778816, | |
| "grad_norm": 2.676182270050049, | |
| "learning_rate": 5.515292071314684e-06, | |
| "loss": 0.2579, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 4.457006196325688, | |
| "grad_norm": 3.4579455852508545, | |
| "learning_rate": 5.442817799681114e-06, | |
| "loss": 0.2686, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 4.464253360872559, | |
| "grad_norm": 2.6098556518554688, | |
| "learning_rate": 5.370343528047543e-06, | |
| "loss": 0.2578, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 4.4715005254194296, | |
| "grad_norm": 3.4452645778656006, | |
| "learning_rate": 5.297869256413973e-06, | |
| "loss": 0.2868, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 4.4787476899663, | |
| "grad_norm": 3.4186015129089355, | |
| "learning_rate": 5.225394984780403e-06, | |
| "loss": 0.2676, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 4.485994854513172, | |
| "grad_norm": 3.0700855255126953, | |
| "learning_rate": 5.1529207131468335e-06, | |
| "loss": 0.2642, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 4.493242019060043, | |
| "grad_norm": 2.718798875808716, | |
| "learning_rate": 5.080446441513263e-06, | |
| "loss": 0.2488, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 4.493242019060043, | |
| "eval_loss": 0.6449950337409973, | |
| "eval_runtime": 245.4001, | |
| "eval_samples_per_second": 140.566, | |
| "eval_steps_per_second": 14.059, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 4.500489183606914, | |
| "grad_norm": 3.136319875717163, | |
| "learning_rate": 5.007972169879692e-06, | |
| "loss": 0.2831, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 4.507736348153784, | |
| "grad_norm": 2.020862340927124, | |
| "learning_rate": 4.935497898246123e-06, | |
| "loss": 0.2312, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 4.514983512700656, | |
| "grad_norm": 4.0119948387146, | |
| "learning_rate": 4.863023626612553e-06, | |
| "loss": 0.2583, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 4.522230677247527, | |
| "grad_norm": 3.341949462890625, | |
| "learning_rate": 4.790549354978983e-06, | |
| "loss": 0.2777, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 4.529477841794398, | |
| "grad_norm": 3.6001293659210205, | |
| "learning_rate": 4.7180750833454125e-06, | |
| "loss": 0.251, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 4.536725006341269, | |
| "grad_norm": 2.4010775089263916, | |
| "learning_rate": 4.645600811711843e-06, | |
| "loss": 0.2679, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 4.54397217088814, | |
| "grad_norm": 2.33186674118042, | |
| "learning_rate": 4.573126540078272e-06, | |
| "loss": 0.2702, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 4.551219335435011, | |
| "grad_norm": 3.366321325302124, | |
| "learning_rate": 4.5006522684447025e-06, | |
| "loss": 0.2383, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 4.558466499981882, | |
| "grad_norm": 2.606224298477173, | |
| "learning_rate": 4.428177996811132e-06, | |
| "loss": 0.2709, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 4.5657136645287535, | |
| "grad_norm": 3.850285053253174, | |
| "learning_rate": 4.355703725177562e-06, | |
| "loss": 0.2712, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 4.572960829075624, | |
| "grad_norm": 2.4807302951812744, | |
| "learning_rate": 4.2832294535439924e-06, | |
| "loss": 0.267, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 4.580207993622495, | |
| "grad_norm": 1.4953159093856812, | |
| "learning_rate": 4.210755181910422e-06, | |
| "loss": 0.2599, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 4.587455158169366, | |
| "grad_norm": 2.457629680633545, | |
| "learning_rate": 4.138280910276852e-06, | |
| "loss": 0.2802, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 4.594702322716238, | |
| "grad_norm": 2.150555372238159, | |
| "learning_rate": 4.0658066386432815e-06, | |
| "loss": 0.2564, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 4.601949487263108, | |
| "grad_norm": 1.8131722211837769, | |
| "learning_rate": 3.993332367009712e-06, | |
| "loss": 0.2805, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 4.609196651809979, | |
| "grad_norm": 3.3967912197113037, | |
| "learning_rate": 3.920858095376142e-06, | |
| "loss": 0.2812, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 4.616443816356851, | |
| "grad_norm": 2.5398590564727783, | |
| "learning_rate": 3.8483838237425715e-06, | |
| "loss": 0.2787, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 4.623690980903722, | |
| "grad_norm": 2.45865535736084, | |
| "learning_rate": 3.7759095521090013e-06, | |
| "loss": 0.2333, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 4.6309381454505925, | |
| "grad_norm": 3.3966212272644043, | |
| "learning_rate": 3.704884765908103e-06, | |
| "loss": 0.2619, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 4.638185309997463, | |
| "grad_norm": 2.421985149383545, | |
| "learning_rate": 3.6324104942745326e-06, | |
| "loss": 0.2851, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 4.638185309997463, | |
| "eval_loss": 0.6421298980712891, | |
| "eval_runtime": 245.3379, | |
| "eval_samples_per_second": 140.602, | |
| "eval_steps_per_second": 14.062, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 4.645432474544334, | |
| "grad_norm": 2.5105528831481934, | |
| "learning_rate": 3.5599362226409624e-06, | |
| "loss": 0.2595, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 4.652679639091206, | |
| "grad_norm": 2.653074026107788, | |
| "learning_rate": 3.4874619510073923e-06, | |
| "loss": 0.2478, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 4.6599268036380765, | |
| "grad_norm": 1.6157690286636353, | |
| "learning_rate": 3.4149876793738225e-06, | |
| "loss": 0.2563, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 4.667173968184947, | |
| "grad_norm": 2.220090866088867, | |
| "learning_rate": 3.3425134077402524e-06, | |
| "loss": 0.2825, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 4.674421132731819, | |
| "grad_norm": 3.191338062286377, | |
| "learning_rate": 3.2700391361066826e-06, | |
| "loss": 0.2421, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 4.68166829727869, | |
| "grad_norm": 2.6636569499969482, | |
| "learning_rate": 3.1990143499057836e-06, | |
| "loss": 0.274, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 4.688915461825561, | |
| "grad_norm": 2.6303908824920654, | |
| "learning_rate": 3.126540078272214e-06, | |
| "loss": 0.2763, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 4.696162626372431, | |
| "grad_norm": 2.2153165340423584, | |
| "learning_rate": 3.0540658066386433e-06, | |
| "loss": 0.2512, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 4.703409790919303, | |
| "grad_norm": 2.291551351547241, | |
| "learning_rate": 2.9815915350050736e-06, | |
| "loss": 0.2739, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 4.710656955466174, | |
| "grad_norm": 3.2897346019744873, | |
| "learning_rate": 2.909117263371503e-06, | |
| "loss": 0.239, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 4.717904120013045, | |
| "grad_norm": 3.1026740074157715, | |
| "learning_rate": 2.8366429917379333e-06, | |
| "loss": 0.2806, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 4.7251512845599155, | |
| "grad_norm": 4.166581153869629, | |
| "learning_rate": 2.764168720104363e-06, | |
| "loss": 0.2597, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 4.732398449106787, | |
| "grad_norm": 3.2309772968292236, | |
| "learning_rate": 2.691694448470793e-06, | |
| "loss": 0.2748, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 4.739645613653658, | |
| "grad_norm": 3.379218816757202, | |
| "learning_rate": 2.619220176837223e-06, | |
| "loss": 0.2665, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 4.746892778200529, | |
| "grad_norm": 2.8024582862854004, | |
| "learning_rate": 2.5467459052036526e-06, | |
| "loss": 0.2571, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 4.7541399427474005, | |
| "grad_norm": 1.905375361442566, | |
| "learning_rate": 2.474271633570083e-06, | |
| "loss": 0.2547, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 4.761387107294271, | |
| "grad_norm": 2.8878672122955322, | |
| "learning_rate": 2.4017973619365127e-06, | |
| "loss": 0.2633, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 4.768634271841142, | |
| "grad_norm": 2.940661907196045, | |
| "learning_rate": 2.3293230903029426e-06, | |
| "loss": 0.2752, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 4.775881436388013, | |
| "grad_norm": 2.747434139251709, | |
| "learning_rate": 2.2568488186693724e-06, | |
| "loss": 0.2762, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 4.783128600934885, | |
| "grad_norm": 3.6419451236724854, | |
| "learning_rate": 2.1843745470358027e-06, | |
| "loss": 0.2685, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 4.783128600934885, | |
| "eval_loss": 0.6418930888175964, | |
| "eval_runtime": 245.4849, | |
| "eval_samples_per_second": 140.518, | |
| "eval_steps_per_second": 14.054, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 4.790375765481755, | |
| "grad_norm": 2.916613817214966, | |
| "learning_rate": 2.111900275402232e-06, | |
| "loss": 0.2846, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 4.797622930028626, | |
| "grad_norm": 2.95576548576355, | |
| "learning_rate": 2.0394260037686624e-06, | |
| "loss": 0.2626, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 4.804870094575497, | |
| "grad_norm": 2.053476333618164, | |
| "learning_rate": 1.966951732135092e-06, | |
| "loss": 0.2619, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 4.812117259122369, | |
| "grad_norm": 2.7688095569610596, | |
| "learning_rate": 1.894477460501522e-06, | |
| "loss": 0.2974, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 4.8193644236692395, | |
| "grad_norm": 2.67800235748291, | |
| "learning_rate": 1.8220031888679521e-06, | |
| "loss": 0.2676, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 4.82661158821611, | |
| "grad_norm": 3.279421806335449, | |
| "learning_rate": 1.7495289172343818e-06, | |
| "loss": 0.2689, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 4.833858752762982, | |
| "grad_norm": 1.616542100906372, | |
| "learning_rate": 1.6770546456008118e-06, | |
| "loss": 0.2555, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 4.841105917309853, | |
| "grad_norm": 3.103170156478882, | |
| "learning_rate": 1.6045803739672419e-06, | |
| "loss": 0.2551, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 4.8483530818567235, | |
| "grad_norm": 2.5930793285369873, | |
| "learning_rate": 1.5321061023336715e-06, | |
| "loss": 0.2618, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 4.855600246403594, | |
| "grad_norm": 3.2237420082092285, | |
| "learning_rate": 1.4596318307001016e-06, | |
| "loss": 0.2461, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 4.862847410950465, | |
| "grad_norm": 2.2981555461883545, | |
| "learning_rate": 1.3871575590665314e-06, | |
| "loss": 0.2707, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 4.870094575497337, | |
| "grad_norm": 1.4708250761032104, | |
| "learning_rate": 1.3146832874329614e-06, | |
| "loss": 0.254, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 4.877341740044208, | |
| "grad_norm": 2.699856758117676, | |
| "learning_rate": 1.2422090157993913e-06, | |
| "loss": 0.2593, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 4.884588904591078, | |
| "grad_norm": 2.0948593616485596, | |
| "learning_rate": 1.1697347441658213e-06, | |
| "loss": 0.2583, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 4.89183606913795, | |
| "grad_norm": 2.787429094314575, | |
| "learning_rate": 1.0972604725322512e-06, | |
| "loss": 0.2381, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 4.899083233684821, | |
| "grad_norm": 2.4767441749572754, | |
| "learning_rate": 1.024786200898681e-06, | |
| "loss": 0.2474, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 4.906330398231692, | |
| "grad_norm": 3.3083810806274414, | |
| "learning_rate": 9.523119292651109e-07, | |
| "loss": 0.2647, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 4.9135775627785625, | |
| "grad_norm": 2.8774940967559814, | |
| "learning_rate": 8.798376576315409e-07, | |
| "loss": 0.2622, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 4.920824727325434, | |
| "grad_norm": 2.5657265186309814, | |
| "learning_rate": 8.073633859979708e-07, | |
| "loss": 0.2537, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 4.928071891872305, | |
| "grad_norm": 1.964735984802246, | |
| "learning_rate": 7.348891143644006e-07, | |
| "loss": 0.2646, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 4.928071891872305, | |
| "eval_loss": 0.6408438682556152, | |
| "eval_runtime": 245.4737, | |
| "eval_samples_per_second": 140.524, | |
| "eval_steps_per_second": 14.054, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 4.935319056419176, | |
| "grad_norm": 2.6487998962402344, | |
| "learning_rate": 6.624148427308306e-07, | |
| "loss": 0.2799, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 4.942566220966047, | |
| "grad_norm": 2.05784010887146, | |
| "learning_rate": 5.899405710972605e-07, | |
| "loss": 0.2657, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 4.949813385512918, | |
| "grad_norm": 2.4890284538269043, | |
| "learning_rate": 5.174662994636905e-07, | |
| "loss": 0.2477, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 4.957060550059789, | |
| "grad_norm": 2.277297258377075, | |
| "learning_rate": 4.449920278301203e-07, | |
| "loss": 0.2664, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 4.96430771460666, | |
| "grad_norm": 3.3281660079956055, | |
| "learning_rate": 3.7251775619655025e-07, | |
| "loss": 0.2582, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 4.971554879153532, | |
| "grad_norm": 3.9353535175323486, | |
| "learning_rate": 3.0004348456298015e-07, | |
| "loss": 0.2622, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 4.978802043700402, | |
| "grad_norm": 2.5661704540252686, | |
| "learning_rate": 2.275692129294101e-07, | |
| "loss": 0.2562, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 4.986049208247273, | |
| "grad_norm": 3.4376327991485596, | |
| "learning_rate": 1.5509494129583997e-07, | |
| "loss": 0.2647, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 4.993296372794144, | |
| "grad_norm": 3.3473730087280273, | |
| "learning_rate": 8.26206696622699e-08, | |
| "loss": 0.2737, | |
| "step": 34450 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 34495, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5981403722758554e+18, | |
| "train_batch_size": 10, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |