{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 9851,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00010152026598309688,
      "grad_norm": 20.906557083129883,
      "learning_rate": 0.0,
      "loss": 13.5217,
      "step": 1
    },
    {
      "epoch": 0.010152026598309688,
      "grad_norm": 33.71811294555664,
      "learning_rate": 2.008113590263692e-05,
      "loss": 10.8681,
      "step": 100
    },
    {
      "epoch": 0.020304053196619376,
      "grad_norm": 27.14719581604004,
      "learning_rate": 4.036511156186613e-05,
      "loss": 7.3949,
      "step": 200
    },
    {
      "epoch": 0.030456079794929064,
      "grad_norm": 22.181039810180664,
      "learning_rate": 6.064908722109534e-05,
      "loss": 5.9034,
      "step": 300
    },
    {
      "epoch": 0.04060810639323875,
      "grad_norm": 14.685601234436035,
      "learning_rate": 8.093306288032456e-05,
      "loss": 5.0405,
      "step": 400
    },
    {
      "epoch": 0.05076013299154844,
      "grad_norm": 12.026344299316406,
      "learning_rate": 9.9935883735841e-05,
      "loss": 4.4838,
      "step": 500
    },
    {
      "epoch": 0.06091215958985813,
      "grad_norm": 14.332210540771484,
      "learning_rate": 9.886727933319086e-05,
      "loss": 4.0977,
      "step": 600
    },
    {
      "epoch": 0.07106418618816782,
      "grad_norm": 10.582870483398438,
      "learning_rate": 9.779867493054072e-05,
      "loss": 3.8085,
      "step": 700
    },
    {
      "epoch": 0.0812162127864775,
      "grad_norm": 8.66326904296875,
      "learning_rate": 9.673007052789058e-05,
      "loss": 3.6189,
      "step": 800
    },
    {
      "epoch": 0.09136823938478719,
      "grad_norm": 8.398235321044922,
      "learning_rate": 9.566146612524043e-05,
      "loss": 3.4115,
      "step": 900
    },
    {
      "epoch": 0.10152026598309688,
      "grad_norm": 8.620134353637695,
      "learning_rate": 9.45928617225903e-05,
      "loss": 3.3107,
      "step": 1000
    },
    {
      "epoch": 0.11167229258140657,
      "grad_norm": 8.263737678527832,
      "learning_rate": 9.352425731994017e-05,
      "loss": 3.2162,
      "step": 1100
    },
    {
      "epoch": 0.12182431917971626,
      "grad_norm": 7.5817646980285645,
      "learning_rate": 9.245565291729003e-05,
      "loss": 3.14,
      "step": 1200
    },
    {
      "epoch": 0.13197634577802594,
      "grad_norm": 8.10513973236084,
      "learning_rate": 9.138704851463988e-05,
      "loss": 3.0052,
      "step": 1300
    },
    {
      "epoch": 0.14212837237633563,
      "grad_norm": 9.334915161132812,
      "learning_rate": 9.031844411198974e-05,
      "loss": 2.9633,
      "step": 1400
    },
    {
      "epoch": 0.15228039897464532,
      "grad_norm": 6.445065975189209,
      "learning_rate": 8.924983970933962e-05,
      "loss": 2.9094,
      "step": 1500
    },
    {
      "epoch": 0.162432425572955,
      "grad_norm": 7.780778884887695,
      "learning_rate": 8.818123530668947e-05,
      "loss": 2.8677,
      "step": 1600
    },
    {
      "epoch": 0.1725844521712647,
      "grad_norm": 7.130796432495117,
      "learning_rate": 8.711263090403933e-05,
      "loss": 2.7924,
      "step": 1700
    },
    {
      "epoch": 0.18273647876957438,
      "grad_norm": 6.419895648956299,
      "learning_rate": 8.604402650138918e-05,
      "loss": 2.7478,
      "step": 1800
    },
    {
      "epoch": 0.19288850536788407,
      "grad_norm": 7.287320613861084,
      "learning_rate": 8.497542209873905e-05,
      "loss": 2.6924,
      "step": 1900
    },
    {
      "epoch": 0.20304053196619376,
      "grad_norm": 9.301968574523926,
      "learning_rate": 8.390681769608892e-05,
      "loss": 2.7025,
      "step": 2000
    },
    {
      "epoch": 0.21319255856450345,
      "grad_norm": 6.076135635375977,
      "learning_rate": 8.283821329343877e-05,
      "loss": 2.6538,
      "step": 2100
    },
    {
      "epoch": 0.22334458516281314,
      "grad_norm": 8.065519332885742,
      "learning_rate": 8.176960889078863e-05,
      "loss": 2.5985,
      "step": 2200
    },
    {
      "epoch": 0.23349661176112282,
      "grad_norm": 6.776027679443359,
      "learning_rate": 8.07010044881385e-05,
      "loss": 2.5731,
      "step": 2300
    },
    {
      "epoch": 0.2436486383594325,
      "grad_norm": 7.288259029388428,
      "learning_rate": 7.963240008548835e-05,
      "loss": 2.5714,
      "step": 2400
    },
    {
      "epoch": 0.2538006649577422,
      "grad_norm": 7.614097595214844,
      "learning_rate": 7.856379568283822e-05,
      "loss": 2.5819,
      "step": 2500
    },
    {
      "epoch": 0.2639526915560519,
      "grad_norm": 6.544124603271484,
      "learning_rate": 7.749519128018808e-05,
      "loss": 2.5085,
      "step": 2600
    },
    {
      "epoch": 0.27410471815436155,
      "grad_norm": 6.303622722625732,
      "learning_rate": 7.642658687753794e-05,
      "loss": 2.4782,
      "step": 2700
    },
    {
      "epoch": 0.28425674475267126,
      "grad_norm": 6.230086803436279,
      "learning_rate": 7.53579824748878e-05,
      "loss": 2.4933,
      "step": 2800
    },
    {
      "epoch": 0.2944087713509809,
      "grad_norm": 7.580053806304932,
      "learning_rate": 7.428937807223765e-05,
      "loss": 2.4438,
      "step": 2900
    },
    {
      "epoch": 0.30456079794929064,
      "grad_norm": 7.0785675048828125,
      "learning_rate": 7.322077366958752e-05,
      "loss": 2.4569,
      "step": 3000
    },
    {
      "epoch": 0.3147128245476003,
      "grad_norm": 7.264654159545898,
      "learning_rate": 7.215216926693739e-05,
      "loss": 2.348,
      "step": 3100
    },
    {
      "epoch": 0.32486485114591,
      "grad_norm": 8.996079444885254,
      "learning_rate": 7.108356486428725e-05,
      "loss": 2.4008,
      "step": 3200
    },
    {
      "epoch": 0.3350168777442197,
      "grad_norm": 7.104641914367676,
      "learning_rate": 7.00149604616371e-05,
      "loss": 2.3889,
      "step": 3300
    },
    {
      "epoch": 0.3451689043425294,
      "grad_norm": 6.1596221923828125,
      "learning_rate": 6.894635605898697e-05,
      "loss": 2.3832,
      "step": 3400
    },
    {
      "epoch": 0.35532093094083905,
      "grad_norm": 6.6536712646484375,
      "learning_rate": 6.787775165633682e-05,
      "loss": 2.3509,
      "step": 3500
    },
    {
      "epoch": 0.36547295753914877,
      "grad_norm": 8.533254623413086,
      "learning_rate": 6.680914725368669e-05,
      "loss": 2.3616,
      "step": 3600
    },
    {
      "epoch": 0.37562498413745843,
      "grad_norm": 6.878735542297363,
      "learning_rate": 6.574054285103655e-05,
      "loss": 2.2491,
      "step": 3700
    },
    {
      "epoch": 0.38577701073576814,
      "grad_norm": 6.529376029968262,
      "learning_rate": 6.467193844838642e-05,
      "loss": 2.239,
      "step": 3800
    },
    {
      "epoch": 0.3959290373340778,
      "grad_norm": 8.117202758789062,
      "learning_rate": 6.360333404573627e-05,
      "loss": 2.311,
      "step": 3900
    },
    {
      "epoch": 0.4060810639323875,
      "grad_norm": 7.16500186920166,
      "learning_rate": 6.253472964308614e-05,
      "loss": 2.2687,
      "step": 4000
    },
    {
      "epoch": 0.4162330905306972,
      "grad_norm": 7.756877899169922,
      "learning_rate": 6.1466125240436e-05,
      "loss": 2.2649,
      "step": 4100
    },
    {
      "epoch": 0.4263851171290069,
      "grad_norm": 6.9812469482421875,
      "learning_rate": 6.039752083778586e-05,
      "loss": 2.2575,
      "step": 4200
    },
    {
      "epoch": 0.43653714372731656,
      "grad_norm": 7.193202972412109,
      "learning_rate": 5.932891643513572e-05,
      "loss": 2.2156,
      "step": 4300
    },
    {
      "epoch": 0.4466891703256263,
      "grad_norm": 6.245234489440918,
      "learning_rate": 5.826031203248558e-05,
      "loss": 2.2088,
      "step": 4400
    },
    {
      "epoch": 0.45684119692393593,
      "grad_norm": 7.823598861694336,
      "learning_rate": 5.719170762983543e-05,
      "loss": 2.2095,
      "step": 4500
    },
    {
      "epoch": 0.46699322352224565,
      "grad_norm": 9.34554386138916,
      "learning_rate": 5.61231032271853e-05,
      "loss": 2.2101,
      "step": 4600
    },
    {
      "epoch": 0.4771452501205553,
      "grad_norm": 5.98472261428833,
      "learning_rate": 5.505449882453516e-05,
      "loss": 2.2042,
      "step": 4700
    },
    {
      "epoch": 0.487297276718865,
      "grad_norm": 6.595754146575928,
      "learning_rate": 5.398589442188502e-05,
      "loss": 2.1979,
      "step": 4800
    },
    {
      "epoch": 0.4974493033171747,
      "grad_norm": 6.549368858337402,
      "learning_rate": 5.291729001923488e-05,
      "loss": 2.1515,
      "step": 4900
    },
    {
      "epoch": 0.5076013299154843,
      "grad_norm": 6.540451526641846,
      "learning_rate": 5.184868561658475e-05,
      "loss": 2.1269,
      "step": 5000
    },
    {
      "epoch": 0.5177533565137941,
      "grad_norm": 5.891973972320557,
      "learning_rate": 5.07800812139346e-05,
      "loss": 2.145,
      "step": 5100
    },
    {
      "epoch": 0.5279053831121038,
      "grad_norm": 7.685993194580078,
      "learning_rate": 4.9711476811284465e-05,
      "loss": 2.141,
      "step": 5200
    },
    {
      "epoch": 0.5380574097104135,
      "grad_norm": 7.306082725524902,
      "learning_rate": 4.8642872408634326e-05,
      "loss": 2.1274,
      "step": 5300
    },
    {
      "epoch": 0.5482094363087231,
      "grad_norm": 5.640676498413086,
      "learning_rate": 4.757426800598419e-05,
      "loss": 2.129,
      "step": 5400
    },
    {
      "epoch": 0.5583614629070328,
      "grad_norm": 5.850844860076904,
      "learning_rate": 4.650566360333405e-05,
      "loss": 2.0921,
      "step": 5500
    },
    {
      "epoch": 0.5685134895053425,
      "grad_norm": 7.10348653793335,
      "learning_rate": 4.5437059200683905e-05,
      "loss": 2.135,
      "step": 5600
    },
    {
      "epoch": 0.5786655161036522,
      "grad_norm": 4.819798946380615,
      "learning_rate": 4.436845479803377e-05,
      "loss": 2.0731,
      "step": 5700
    },
    {
      "epoch": 0.5888175427019618,
      "grad_norm": 8.465038299560547,
      "learning_rate": 4.329985039538363e-05,
      "loss": 2.0882,
      "step": 5800
    },
    {
      "epoch": 0.5989695693002716,
      "grad_norm": 6.31635046005249,
      "learning_rate": 4.223124599273349e-05,
      "loss": 2.1084,
      "step": 5900
    },
    {
      "epoch": 0.6091215958985813,
      "grad_norm": 7.215450286865234,
      "learning_rate": 4.116264159008335e-05,
      "loss": 2.0788,
      "step": 6000
    },
    {
      "epoch": 0.619273622496891,
      "grad_norm": 6.831523895263672,
      "learning_rate": 4.009403718743321e-05,
      "loss": 2.0606,
      "step": 6100
    },
    {
      "epoch": 0.6294256490952006,
      "grad_norm": 6.920884132385254,
      "learning_rate": 3.9025432784783075e-05,
      "loss": 2.0504,
      "step": 6200
    },
    {
      "epoch": 0.6395776756935103,
      "grad_norm": 5.9327802658081055,
      "learning_rate": 3.7956828382132936e-05,
      "loss": 2.061,
      "step": 6300
    },
    {
      "epoch": 0.64972970229182,
      "grad_norm": 7.0381550788879395,
      "learning_rate": 3.68882239794828e-05,
      "loss": 2.021,
      "step": 6400
    },
    {
      "epoch": 0.6598817288901296,
      "grad_norm": 5.229250431060791,
      "learning_rate": 3.581961957683266e-05,
      "loss": 2.0095,
      "step": 6500
    },
    {
      "epoch": 0.6700337554884394,
      "grad_norm": 5.998141288757324,
      "learning_rate": 3.475101517418252e-05,
      "loss": 2.0756,
      "step": 6600
    },
    {
      "epoch": 0.6801857820867491,
      "grad_norm": 7.094350814819336,
      "learning_rate": 3.368241077153238e-05,
      "loss": 2.019,
      "step": 6700
    },
    {
      "epoch": 0.6903378086850588,
      "grad_norm": 6.863796710968018,
      "learning_rate": 3.2613806368882245e-05,
      "loss": 2.0057,
      "step": 6800
    },
    {
      "epoch": 0.7004898352833684,
      "grad_norm": 7.739531993865967,
      "learning_rate": 3.15452019662321e-05,
      "loss": 2.0452,
      "step": 6900
    },
    {
      "epoch": 0.7106418618816781,
      "grad_norm": 6.940340995788574,
      "learning_rate": 3.0476597563581964e-05,
      "loss": 2.0123,
      "step": 7000
    },
    {
      "epoch": 0.7207938884799878,
      "grad_norm": 6.965139865875244,
      "learning_rate": 2.9407993160931823e-05,
      "loss": 2.0076,
      "step": 7100
    },
    {
      "epoch": 0.7309459150782975,
      "grad_norm": 6.2129411697387695,
      "learning_rate": 2.8339388758281688e-05,
      "loss": 2.0221,
      "step": 7200
    },
    {
      "epoch": 0.7410979416766071,
      "grad_norm": 6.450573921203613,
      "learning_rate": 2.7270784355631546e-05,
      "loss": 1.9899,
      "step": 7300
    },
    {
      "epoch": 0.7512499682749169,
      "grad_norm": 6.273707866668701,
      "learning_rate": 2.6202179952981408e-05,
      "loss": 1.9801,
      "step": 7400
    },
    {
      "epoch": 0.7614019948732266,
      "grad_norm": 6.587959289550781,
      "learning_rate": 2.5133575550331266e-05,
      "loss": 1.973,
      "step": 7500
    },
    {
      "epoch": 0.7715540214715363,
      "grad_norm": 7.1678948402404785,
      "learning_rate": 2.4064971147681128e-05,
      "loss": 2.0192,
      "step": 7600
    },
    {
      "epoch": 0.7817060480698459,
      "grad_norm": 6.3077874183654785,
      "learning_rate": 2.299636674503099e-05,
      "loss": 1.9815,
      "step": 7700
    },
    {
      "epoch": 0.7918580746681556,
      "grad_norm": 7.233158111572266,
      "learning_rate": 2.192776234238085e-05,
      "loss": 2.0093,
      "step": 7800
    },
    {
      "epoch": 0.8020101012664653,
      "grad_norm": 6.688365459442139,
      "learning_rate": 2.0859157939730713e-05,
      "loss": 1.9382,
      "step": 7900
    },
    {
      "epoch": 0.812162127864775,
      "grad_norm": 6.085184574127197,
      "learning_rate": 1.9790553537080574e-05,
      "loss": 1.967,
      "step": 8000
    },
    {
      "epoch": 0.8223141544630846,
      "grad_norm": 6.654317378997803,
      "learning_rate": 1.8721949134430436e-05,
      "loss": 1.9463,
      "step": 8100
    },
    {
      "epoch": 0.8324661810613944,
      "grad_norm": 6.44837760925293,
      "learning_rate": 1.7653344731780298e-05,
      "loss": 1.9443,
      "step": 8200
    },
    {
      "epoch": 0.8426182076597041,
      "grad_norm": 6.596092224121094,
      "learning_rate": 1.658474032913016e-05,
      "loss": 1.9514,
      "step": 8300
    },
    {
      "epoch": 0.8527702342580138,
      "grad_norm": 6.20723295211792,
      "learning_rate": 1.5516135926480017e-05,
      "loss": 1.9623,
      "step": 8400
    },
    {
      "epoch": 0.8629222608563234,
      "grad_norm": 6.767848491668701,
      "learning_rate": 1.4447531523829879e-05,
      "loss": 1.959,
      "step": 8500
    },
    {
      "epoch": 0.8730742874546331,
      "grad_norm": 7.135315418243408,
      "learning_rate": 1.337892712117974e-05,
      "loss": 1.9661,
      "step": 8600
    },
    {
      "epoch": 0.8832263140529428,
      "grad_norm": 6.851596832275391,
      "learning_rate": 1.2310322718529602e-05,
      "loss": 1.8859,
      "step": 8700
    },
    {
      "epoch": 0.8933783406512525,
      "grad_norm": 5.696228504180908,
      "learning_rate": 1.1241718315879462e-05,
      "loss": 1.9249,
      "step": 8800
    },
    {
      "epoch": 0.9035303672495621,
      "grad_norm": 7.561517238616943,
      "learning_rate": 1.0173113913229324e-05,
      "loss": 1.8969,
      "step": 8900
    },
    {
      "epoch": 0.9136823938478719,
      "grad_norm": 5.516517639160156,
      "learning_rate": 9.104509510579184e-06,
      "loss": 1.952,
      "step": 9000
    },
    {
      "epoch": 0.9238344204461816,
      "grad_norm": 6.028636932373047,
      "learning_rate": 8.035905107929046e-06,
      "loss": 1.957,
      "step": 9100
    },
    {
      "epoch": 0.9339864470444913,
      "grad_norm": 5.458266735076904,
      "learning_rate": 6.9673007052789065e-06,
      "loss": 1.9445,
      "step": 9200
    },
    {
      "epoch": 0.9441384736428009,
      "grad_norm": 7.229398727416992,
      "learning_rate": 5.898696302628767e-06,
      "loss": 1.8976,
      "step": 9300
    },
    {
      "epoch": 0.9542905002411106,
      "grad_norm": 5.801294803619385,
      "learning_rate": 4.830091899978628e-06,
      "loss": 1.9468,
      "step": 9400
    },
    {
      "epoch": 0.9644425268394203,
      "grad_norm": 4.766051769256592,
      "learning_rate": 3.7614874973284893e-06,
      "loss": 1.8822,
      "step": 9500
    },
    {
      "epoch": 0.97459455343773,
      "grad_norm": 5.5660200119018555,
      "learning_rate": 2.69288309467835e-06,
      "loss": 1.8705,
      "step": 9600
    },
    {
      "epoch": 0.9847465800360397,
      "grad_norm": 7.402261734008789,
      "learning_rate": 1.6242786920282113e-06,
      "loss": 1.9297,
      "step": 9700
    },
    {
      "epoch": 0.9948986066343494,
      "grad_norm": 8.760640144348145,
      "learning_rate": 5.556742893780722e-07,
      "loss": 1.8924,
      "step": 9800
    }
  ],
  "logging_steps": 100,
  "max_steps": 9851,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5355046335700656.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}