| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 81375, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.018433179723502304, | |
| "grad_norm": 2.7722034454345703, | |
| "learning_rate": 4.969339477726575e-05, | |
| "loss": 5.2387, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03686635944700461, | |
| "grad_norm": 3.414055347442627, | |
| "learning_rate": 4.938617511520738e-05, | |
| "loss": 4.3585, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.055299539170506916, | |
| "grad_norm": 2.91390061378479, | |
| "learning_rate": 4.9078955453149006e-05, | |
| "loss": 4.1208, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.07373271889400922, | |
| "grad_norm": 2.3701863288879395, | |
| "learning_rate": 4.877173579109063e-05, | |
| "loss": 3.9869, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.09216589861751152, | |
| "grad_norm": 2.441763162612915, | |
| "learning_rate": 4.846451612903226e-05, | |
| "loss": 3.8932, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.11059907834101383, | |
| "grad_norm": 2.8250324726104736, | |
| "learning_rate": 4.815729646697389e-05, | |
| "loss": 3.8175, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.12903225806451613, | |
| "grad_norm": 2.9016897678375244, | |
| "learning_rate": 4.7850076804915513e-05, | |
| "loss": 3.7022, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.14746543778801843, | |
| "grad_norm": 2.2199313640594482, | |
| "learning_rate": 4.7542857142857146e-05, | |
| "loss": 3.6935, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.16589861751152074, | |
| "grad_norm": 2.4812119007110596, | |
| "learning_rate": 4.723563748079877e-05, | |
| "loss": 3.6745, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.18433179723502305, | |
| "grad_norm": 2.6783430576324463, | |
| "learning_rate": 4.69284178187404e-05, | |
| "loss": 3.5583, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.20276497695852536, | |
| "grad_norm": 2.524801254272461, | |
| "learning_rate": 4.662119815668203e-05, | |
| "loss": 3.5784, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.22119815668202766, | |
| "grad_norm": 2.821859836578369, | |
| "learning_rate": 4.6313978494623653e-05, | |
| "loss": 3.5481, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.23963133640552994, | |
| "grad_norm": 2.7526915073394775, | |
| "learning_rate": 4.6006758832565286e-05, | |
| "loss": 3.5201, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.25806451612903225, | |
| "grad_norm": 3.1275274753570557, | |
| "learning_rate": 4.569953917050692e-05, | |
| "loss": 3.4428, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.2764976958525346, | |
| "grad_norm": 2.565107583999634, | |
| "learning_rate": 4.539231950844854e-05, | |
| "loss": 3.4385, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.29493087557603687, | |
| "grad_norm": 2.554239511489868, | |
| "learning_rate": 4.5085099846390175e-05, | |
| "loss": 3.4189, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.31336405529953915, | |
| "grad_norm": 2.702221155166626, | |
| "learning_rate": 4.47778801843318e-05, | |
| "loss": 3.413, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.3317972350230415, | |
| "grad_norm": 2.413268804550171, | |
| "learning_rate": 4.4470660522273425e-05, | |
| "loss": 3.3684, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.35023041474654376, | |
| "grad_norm": 2.424586296081543, | |
| "learning_rate": 4.416344086021506e-05, | |
| "loss": 3.3702, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.3686635944700461, | |
| "grad_norm": 2.7243025302886963, | |
| "learning_rate": 4.385622119815668e-05, | |
| "loss": 3.2839, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.3870967741935484, | |
| "grad_norm": 2.2430036067962646, | |
| "learning_rate": 4.354900153609831e-05, | |
| "loss": 3.3245, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.4055299539170507, | |
| "grad_norm": 2.5448081493377686, | |
| "learning_rate": 4.324178187403994e-05, | |
| "loss": 3.2824, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.423963133640553, | |
| "grad_norm": 2.6062798500061035, | |
| "learning_rate": 4.293456221198157e-05, | |
| "loss": 3.2928, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.4423963133640553, | |
| "grad_norm": 2.51362681388855, | |
| "learning_rate": 4.26273425499232e-05, | |
| "loss": 3.2731, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.4608294930875576, | |
| "grad_norm": 2.2453083992004395, | |
| "learning_rate": 4.232012288786483e-05, | |
| "loss": 3.2556, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.4792626728110599, | |
| "grad_norm": 2.3285653591156006, | |
| "learning_rate": 4.2012903225806455e-05, | |
| "loss": 3.2397, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.4976958525345622, | |
| "grad_norm": 2.4322783946990967, | |
| "learning_rate": 4.170568356374808e-05, | |
| "loss": 3.2348, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.5161290322580645, | |
| "grad_norm": 2.180086374282837, | |
| "learning_rate": 4.139846390168971e-05, | |
| "loss": 3.2059, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.5345622119815668, | |
| "grad_norm": 2.293834686279297, | |
| "learning_rate": 4.109124423963134e-05, | |
| "loss": 3.2066, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.5529953917050692, | |
| "grad_norm": 2.4870762825012207, | |
| "learning_rate": 4.078402457757296e-05, | |
| "loss": 3.1875, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 2.4512012004852295, | |
| "learning_rate": 4.0476804915514595e-05, | |
| "loss": 3.1519, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.5898617511520737, | |
| "grad_norm": 3.0072903633117676, | |
| "learning_rate": 4.016958525345622e-05, | |
| "loss": 3.194, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.6082949308755761, | |
| "grad_norm": 2.5981032848358154, | |
| "learning_rate": 3.986236559139785e-05, | |
| "loss": 3.1575, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.6267281105990783, | |
| "grad_norm": 2.6231231689453125, | |
| "learning_rate": 3.9555145929339484e-05, | |
| "loss": 3.1602, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 2.6723060607910156, | |
| "learning_rate": 3.924792626728111e-05, | |
| "loss": 3.1713, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.663594470046083, | |
| "grad_norm": 2.222766876220703, | |
| "learning_rate": 3.8940706605222735e-05, | |
| "loss": 3.1258, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.6820276497695853, | |
| "grad_norm": 2.3424344062805176, | |
| "learning_rate": 3.863348694316437e-05, | |
| "loss": 3.1091, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.7004608294930875, | |
| "grad_norm": 2.849412679672241, | |
| "learning_rate": 3.832626728110599e-05, | |
| "loss": 3.1179, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.7188940092165899, | |
| "grad_norm": 2.475759267807007, | |
| "learning_rate": 3.8019047619047624e-05, | |
| "loss": 3.1026, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.7373271889400922, | |
| "grad_norm": 2.421753168106079, | |
| "learning_rate": 3.771182795698925e-05, | |
| "loss": 3.0828, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.7557603686635944, | |
| "grad_norm": 2.5588021278381348, | |
| "learning_rate": 3.7404608294930875e-05, | |
| "loss": 3.1001, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.7741935483870968, | |
| "grad_norm": 2.294607400894165, | |
| "learning_rate": 3.709738863287251e-05, | |
| "loss": 3.0791, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.7926267281105991, | |
| "grad_norm": 2.657045841217041, | |
| "learning_rate": 3.679016897081413e-05, | |
| "loss": 3.0481, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.8110599078341014, | |
| "grad_norm": 2.426490068435669, | |
| "learning_rate": 3.648294930875576e-05, | |
| "loss": 3.0674, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.8294930875576036, | |
| "grad_norm": 2.5447800159454346, | |
| "learning_rate": 3.617572964669739e-05, | |
| "loss": 3.075, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.847926267281106, | |
| "grad_norm": 2.820953130722046, | |
| "learning_rate": 3.586850998463902e-05, | |
| "loss": 3.068, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.8663594470046083, | |
| "grad_norm": 2.321009397506714, | |
| "learning_rate": 3.556129032258065e-05, | |
| "loss": 3.0588, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.8847926267281107, | |
| "grad_norm": 2.54306697845459, | |
| "learning_rate": 3.525407066052228e-05, | |
| "loss": 3.0475, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.9032258064516129, | |
| "grad_norm": 2.3935065269470215, | |
| "learning_rate": 3.4946850998463904e-05, | |
| "loss": 3.0174, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.9216589861751152, | |
| "grad_norm": 2.3906099796295166, | |
| "learning_rate": 3.463963133640553e-05, | |
| "loss": 3.0189, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.9400921658986175, | |
| "grad_norm": 2.480583906173706, | |
| "learning_rate": 3.433241167434716e-05, | |
| "loss": 3.0467, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.9585253456221198, | |
| "grad_norm": 2.1853816509246826, | |
| "learning_rate": 3.402519201228879e-05, | |
| "loss": 3.0324, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.9769585253456221, | |
| "grad_norm": 2.525022506713867, | |
| "learning_rate": 3.371797235023041e-05, | |
| "loss": 3.0046, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.9953917050691244, | |
| "grad_norm": 2.310753345489502, | |
| "learning_rate": 3.3410752688172044e-05, | |
| "loss": 3.0123, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.0138248847926268, | |
| "grad_norm": 2.827805757522583, | |
| "learning_rate": 3.3103533026113676e-05, | |
| "loss": 3.0073, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.032258064516129, | |
| "grad_norm": 2.4869062900543213, | |
| "learning_rate": 3.27963133640553e-05, | |
| "loss": 2.9999, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.0506912442396312, | |
| "grad_norm": 2.9428353309631348, | |
| "learning_rate": 3.2489093701996933e-05, | |
| "loss": 3.0065, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.0691244239631337, | |
| "grad_norm": 2.4092352390289307, | |
| "learning_rate": 3.218187403993856e-05, | |
| "loss": 2.97, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.087557603686636, | |
| "grad_norm": 2.185153007507324, | |
| "learning_rate": 3.1874654377880184e-05, | |
| "loss": 2.9744, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.1059907834101383, | |
| "grad_norm": 2.547611713409424, | |
| "learning_rate": 3.1567434715821816e-05, | |
| "loss": 2.9755, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.1244239631336406, | |
| "grad_norm": 2.3823814392089844, | |
| "learning_rate": 3.126021505376344e-05, | |
| "loss": 2.9597, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.1428571428571428, | |
| "grad_norm": 2.282871961593628, | |
| "learning_rate": 3.095299539170507e-05, | |
| "loss": 2.9871, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.1612903225806452, | |
| "grad_norm": 2.517770767211914, | |
| "learning_rate": 3.06457757296467e-05, | |
| "loss": 2.9971, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.1797235023041475, | |
| "grad_norm": 2.8500301837921143, | |
| "learning_rate": 3.0338556067588324e-05, | |
| "loss": 2.9692, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.1981566820276497, | |
| "grad_norm": 2.3024988174438477, | |
| "learning_rate": 3.0031336405529953e-05, | |
| "loss": 2.9512, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.2165898617511521, | |
| "grad_norm": 2.4389448165893555, | |
| "learning_rate": 2.9724116743471585e-05, | |
| "loss": 2.9743, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.2350230414746544, | |
| "grad_norm": 2.6087846755981445, | |
| "learning_rate": 2.9416897081413213e-05, | |
| "loss": 2.9634, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.2534562211981566, | |
| "grad_norm": 2.1963679790496826, | |
| "learning_rate": 2.9109677419354842e-05, | |
| "loss": 2.9408, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.271889400921659, | |
| "grad_norm": 2.6434950828552246, | |
| "learning_rate": 2.880245775729647e-05, | |
| "loss": 2.9334, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.2903225806451613, | |
| "grad_norm": 2.5725350379943848, | |
| "learning_rate": 2.8495238095238096e-05, | |
| "loss": 2.9443, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.3087557603686637, | |
| "grad_norm": 2.343334674835205, | |
| "learning_rate": 2.8188018433179725e-05, | |
| "loss": 2.9587, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.327188940092166, | |
| "grad_norm": 2.673114776611328, | |
| "learning_rate": 2.7880798771121353e-05, | |
| "loss": 2.956, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.3456221198156681, | |
| "grad_norm": 2.481757640838623, | |
| "learning_rate": 2.757357910906298e-05, | |
| "loss": 2.9332, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.3640552995391704, | |
| "grad_norm": 3.0299792289733887, | |
| "learning_rate": 2.7266359447004607e-05, | |
| "loss": 2.947, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.3824884792626728, | |
| "grad_norm": 3.3357937335968018, | |
| "learning_rate": 2.6959139784946236e-05, | |
| "loss": 2.9236, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.400921658986175, | |
| "grad_norm": 2.214954376220703, | |
| "learning_rate": 2.6651920122887865e-05, | |
| "loss": 2.9334, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.4193548387096775, | |
| "grad_norm": 2.7208831310272217, | |
| "learning_rate": 2.6344700460829497e-05, | |
| "loss": 2.9001, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.4377880184331797, | |
| "grad_norm": 2.822230577468872, | |
| "learning_rate": 2.6037480798771125e-05, | |
| "loss": 2.9563, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.456221198156682, | |
| "grad_norm": 2.5907464027404785, | |
| "learning_rate": 2.573026113671275e-05, | |
| "loss": 2.9352, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.4746543778801844, | |
| "grad_norm": 2.509422540664673, | |
| "learning_rate": 2.542304147465438e-05, | |
| "loss": 2.9388, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.4930875576036866, | |
| "grad_norm": 2.8918466567993164, | |
| "learning_rate": 2.5115821812596008e-05, | |
| "loss": 2.9072, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.511520737327189, | |
| "grad_norm": 2.3461146354675293, | |
| "learning_rate": 2.4808602150537637e-05, | |
| "loss": 2.9013, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.5299539170506913, | |
| "grad_norm": 2.3494646549224854, | |
| "learning_rate": 2.4501382488479262e-05, | |
| "loss": 2.8947, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.5483870967741935, | |
| "grad_norm": 2.2246921062469482, | |
| "learning_rate": 2.419416282642089e-05, | |
| "loss": 2.9193, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.5668202764976957, | |
| "grad_norm": 2.4895882606506348, | |
| "learning_rate": 2.3886943164362523e-05, | |
| "loss": 2.9209, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.5852534562211982, | |
| "grad_norm": 2.234105110168457, | |
| "learning_rate": 2.3579723502304148e-05, | |
| "loss": 2.9104, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.6036866359447006, | |
| "grad_norm": 2.2471518516540527, | |
| "learning_rate": 2.3272503840245777e-05, | |
| "loss": 2.8924, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.6221198156682028, | |
| "grad_norm": 2.6903395652770996, | |
| "learning_rate": 2.2965284178187405e-05, | |
| "loss": 2.9108, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.640552995391705, | |
| "grad_norm": 2.5113911628723145, | |
| "learning_rate": 2.265806451612903e-05, | |
| "loss": 2.9167, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.6589861751152073, | |
| "grad_norm": 2.295367956161499, | |
| "learning_rate": 2.2350844854070663e-05, | |
| "loss": 2.91, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.6774193548387095, | |
| "grad_norm": 2.705887794494629, | |
| "learning_rate": 2.204362519201229e-05, | |
| "loss": 2.9193, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.695852534562212, | |
| "grad_norm": 2.490004777908325, | |
| "learning_rate": 2.1736405529953917e-05, | |
| "loss": 2.8773, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.7142857142857144, | |
| "grad_norm": 2.564751148223877, | |
| "learning_rate": 2.1429185867895545e-05, | |
| "loss": 2.8979, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.7327188940092166, | |
| "grad_norm": 2.5527286529541016, | |
| "learning_rate": 2.1121966205837174e-05, | |
| "loss": 2.9022, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.7511520737327189, | |
| "grad_norm": 2.6402347087860107, | |
| "learning_rate": 2.0814746543778803e-05, | |
| "loss": 2.8751, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.769585253456221, | |
| "grad_norm": 2.415748357772827, | |
| "learning_rate": 2.050752688172043e-05, | |
| "loss": 2.8786, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.7880184331797235, | |
| "grad_norm": 2.6750245094299316, | |
| "learning_rate": 2.020030721966206e-05, | |
| "loss": 2.8853, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.8064516129032258, | |
| "grad_norm": 2.4245858192443848, | |
| "learning_rate": 1.989308755760369e-05, | |
| "loss": 2.875, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.8248847926267282, | |
| "grad_norm": 2.660170078277588, | |
| "learning_rate": 1.9585867895545314e-05, | |
| "loss": 2.8737, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.8433179723502304, | |
| "grad_norm": 2.4194977283477783, | |
| "learning_rate": 1.9278648233486943e-05, | |
| "loss": 2.8654, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.8617511520737327, | |
| "grad_norm": 2.4706435203552246, | |
| "learning_rate": 1.8971428571428575e-05, | |
| "loss": 2.8302, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.8801843317972349, | |
| "grad_norm": 2.8965485095977783, | |
| "learning_rate": 1.86642089093702e-05, | |
| "loss": 2.874, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.8986175115207373, | |
| "grad_norm": 2.812009811401367, | |
| "learning_rate": 1.835698924731183e-05, | |
| "loss": 2.8833, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.9170506912442398, | |
| "grad_norm": 2.7252895832061768, | |
| "learning_rate": 1.8049769585253457e-05, | |
| "loss": 2.8639, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.935483870967742, | |
| "grad_norm": 2.5407986640930176, | |
| "learning_rate": 1.7742549923195083e-05, | |
| "loss": 2.8637, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.9539170506912442, | |
| "grad_norm": 2.5381031036376953, | |
| "learning_rate": 1.7435330261136715e-05, | |
| "loss": 2.8636, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.9723502304147464, | |
| "grad_norm": 2.5974888801574707, | |
| "learning_rate": 1.7128110599078343e-05, | |
| "loss": 2.8563, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.9907834101382489, | |
| "grad_norm": 2.5476796627044678, | |
| "learning_rate": 1.682089093701997e-05, | |
| "loss": 2.8651, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.0092165898617513, | |
| "grad_norm": 2.4536616802215576, | |
| "learning_rate": 1.6513671274961597e-05, | |
| "loss": 2.8788, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 2.0276497695852536, | |
| "grad_norm": 3.109189510345459, | |
| "learning_rate": 1.6206451612903226e-05, | |
| "loss": 2.8355, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.046082949308756, | |
| "grad_norm": 2.727445363998413, | |
| "learning_rate": 1.5899231950844855e-05, | |
| "loss": 2.8582, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 2.064516129032258, | |
| "grad_norm": 2.809833288192749, | |
| "learning_rate": 1.5592012288786483e-05, | |
| "loss": 2.8763, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.0829493087557602, | |
| "grad_norm": 2.9285683631896973, | |
| "learning_rate": 1.5284792626728112e-05, | |
| "loss": 2.8524, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 2.1013824884792625, | |
| "grad_norm": 2.599776268005371, | |
| "learning_rate": 1.4977572964669739e-05, | |
| "loss": 2.8235, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.119815668202765, | |
| "grad_norm": 2.367570638656616, | |
| "learning_rate": 1.4670353302611368e-05, | |
| "loss": 2.8624, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 2.1382488479262673, | |
| "grad_norm": 2.971496820449829, | |
| "learning_rate": 1.4363133640552995e-05, | |
| "loss": 2.8542, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.1566820276497696, | |
| "grad_norm": 2.530744791030884, | |
| "learning_rate": 1.4055913978494625e-05, | |
| "loss": 2.878, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 2.175115207373272, | |
| "grad_norm": 2.3559248447418213, | |
| "learning_rate": 1.3748694316436254e-05, | |
| "loss": 2.8378, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.193548387096774, | |
| "grad_norm": 2.6017301082611084, | |
| "learning_rate": 1.344147465437788e-05, | |
| "loss": 2.8526, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 2.2119815668202767, | |
| "grad_norm": 2.727224349975586, | |
| "learning_rate": 1.313425499231951e-05, | |
| "loss": 2.8457, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 2.230414746543779, | |
| "grad_norm": 2.7515804767608643, | |
| "learning_rate": 1.2827035330261136e-05, | |
| "loss": 2.8422, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 2.248847926267281, | |
| "grad_norm": 2.1259450912475586, | |
| "learning_rate": 1.2519815668202767e-05, | |
| "loss": 2.8552, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 2.2672811059907834, | |
| "grad_norm": 2.3828954696655273, | |
| "learning_rate": 1.2212596006144395e-05, | |
| "loss": 2.833, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 2.2857142857142856, | |
| "grad_norm": 2.588263988494873, | |
| "learning_rate": 1.1905376344086022e-05, | |
| "loss": 2.8259, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 2.3041474654377883, | |
| "grad_norm": 2.4910430908203125, | |
| "learning_rate": 1.159815668202765e-05, | |
| "loss": 2.8322, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 2.3225806451612905, | |
| "grad_norm": 2.4441442489624023, | |
| "learning_rate": 1.129093701996928e-05, | |
| "loss": 2.8368, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 2.3410138248847927, | |
| "grad_norm": 2.8292665481567383, | |
| "learning_rate": 1.0983717357910907e-05, | |
| "loss": 2.8541, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 2.359447004608295, | |
| "grad_norm": 3.0737709999084473, | |
| "learning_rate": 1.0676497695852535e-05, | |
| "loss": 2.8302, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 2.377880184331797, | |
| "grad_norm": 2.6240386962890625, | |
| "learning_rate": 1.0369278033794164e-05, | |
| "loss": 2.8581, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 2.3963133640552994, | |
| "grad_norm": 2.7089595794677734, | |
| "learning_rate": 1.0062058371735791e-05, | |
| "loss": 2.8291, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 2.4147465437788016, | |
| "grad_norm": 2.5590097904205322, | |
| "learning_rate": 9.754838709677421e-06, | |
| "loss": 2.8423, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 2.4331797235023043, | |
| "grad_norm": 2.529129981994629, | |
| "learning_rate": 9.447619047619048e-06, | |
| "loss": 2.8313, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 2.4516129032258065, | |
| "grad_norm": 2.3878464698791504, | |
| "learning_rate": 9.140399385560675e-06, | |
| "loss": 2.8396, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 2.4700460829493087, | |
| "grad_norm": 2.324528217315674, | |
| "learning_rate": 8.833179723502306e-06, | |
| "loss": 2.8406, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 2.488479262672811, | |
| "grad_norm": 2.531818389892578, | |
| "learning_rate": 8.525960061443933e-06, | |
| "loss": 2.8283, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 2.506912442396313, | |
| "grad_norm": 2.370063066482544, | |
| "learning_rate": 8.218740399385561e-06, | |
| "loss": 2.8218, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 2.525345622119816, | |
| "grad_norm": 2.7173168659210205, | |
| "learning_rate": 7.91152073732719e-06, | |
| "loss": 2.8193, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 2.543778801843318, | |
| "grad_norm": 2.893047571182251, | |
| "learning_rate": 7.604301075268818e-06, | |
| "loss": 2.8789, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 2.5622119815668203, | |
| "grad_norm": 2.3326709270477295, | |
| "learning_rate": 7.297081413210446e-06, | |
| "loss": 2.82, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 2.5806451612903225, | |
| "grad_norm": 2.6681976318359375, | |
| "learning_rate": 6.989861751152074e-06, | |
| "loss": 2.8304, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 2.5990783410138247, | |
| "grad_norm": 2.398226261138916, | |
| "learning_rate": 6.682642089093702e-06, | |
| "loss": 2.8588, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 2.6175115207373274, | |
| "grad_norm": 2.898515462875366, | |
| "learning_rate": 6.375422427035331e-06, | |
| "loss": 2.8197, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 2.6359447004608296, | |
| "grad_norm": 2.739598035812378, | |
| "learning_rate": 6.0682027649769585e-06, | |
| "loss": 2.8118, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 2.654377880184332, | |
| "grad_norm": 2.643958806991577, | |
| "learning_rate": 5.760983102918587e-06, | |
| "loss": 2.8237, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 2.672811059907834, | |
| "grad_norm": 2.4359323978424072, | |
| "learning_rate": 5.453763440860216e-06, | |
| "loss": 2.849, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 2.6912442396313363, | |
| "grad_norm": 2.789459228515625, | |
| "learning_rate": 5.146543778801844e-06, | |
| "loss": 2.8295, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 2.709677419354839, | |
| "grad_norm": 2.7510650157928467, | |
| "learning_rate": 4.8393241167434715e-06, | |
| "loss": 2.8261, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 2.7281105990783407, | |
| "grad_norm": 2.687920570373535, | |
| "learning_rate": 4.5321044546851e-06, | |
| "loss": 2.8247, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 2.7465437788018434, | |
| "grad_norm": 2.563568592071533, | |
| "learning_rate": 4.224884792626729e-06, | |
| "loss": 2.8077, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 2.7649769585253456, | |
| "grad_norm": 2.5233335494995117, | |
| "learning_rate": 3.917665130568357e-06, | |
| "loss": 2.8249, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 2.783410138248848, | |
| "grad_norm": 2.1638145446777344, | |
| "learning_rate": 3.610445468509985e-06, | |
| "loss": 2.8117, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 2.80184331797235, | |
| "grad_norm": 2.3863344192504883, | |
| "learning_rate": 3.303225806451613e-06, | |
| "loss": 2.8267, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 2.8202764976958523, | |
| "grad_norm": 2.6310081481933594, | |
| "learning_rate": 2.9960061443932414e-06, | |
| "loss": 2.7984, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 2.838709677419355, | |
| "grad_norm": 2.5833451747894287, | |
| "learning_rate": 2.6887864823348697e-06, | |
| "loss": 2.828, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 2.796090602874756, | |
| "learning_rate": 2.381566820276498e-06, | |
| "loss": 2.8328, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 2.8755760368663594, | |
| "grad_norm": 2.4523568153381348, | |
| "learning_rate": 2.074347158218126e-06, | |
| "loss": 2.8262, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 2.8940092165898617, | |
| "grad_norm": 2.7820804119110107, | |
| "learning_rate": 1.7671274961597542e-06, | |
| "loss": 2.8492, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 2.912442396313364, | |
| "grad_norm": 2.6446750164031982, | |
| "learning_rate": 1.4599078341013825e-06, | |
| "loss": 2.8004, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 2.9308755760368665, | |
| "grad_norm": 2.688173532485962, | |
| "learning_rate": 1.1526881720430107e-06, | |
| "loss": 2.8324, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 2.9493087557603688, | |
| "grad_norm": 2.5094845294952393, | |
| "learning_rate": 8.454685099846391e-07, | |
| "loss": 2.8299, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 2.967741935483871, | |
| "grad_norm": 3.0129997730255127, | |
| "learning_rate": 5.382488479262673e-07, | |
| "loss": 2.8397, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 2.986175115207373, | |
| "grad_norm": 2.273484706878662, | |
| "learning_rate": 2.3102918586789556e-07, | |
| "loss": 2.8117, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 81375, | |
| "total_flos": 3692222600970240.0, | |
| "train_loss": 3.0354640246975806, | |
| "train_runtime": 19788.2132, | |
| "train_samples_per_second": 131.59, | |
| "train_steps_per_second": 4.112 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 81375, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3692222600970240.0, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } |