| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.429204003563381, |
| "eval_steps": 1000000, |
| "global_step": 49143, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0043668885046027, |
| "grad_norm": 2.018752336502075, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 10.1855, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0087337770092054, |
| "grad_norm": 2.0094995498657227, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 9.2041, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.013100665513808101, |
| "grad_norm": 1.4977174997329712, |
| "learning_rate": 1.5e-06, |
| "loss": 8.6586, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.0174675540184108, |
| "grad_norm": 1.407737135887146, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 8.1201, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.021834442523013503, |
| "grad_norm": 1.2318576574325562, |
| "learning_rate": 2.5e-06, |
| "loss": 7.6276, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.026201331027616202, |
| "grad_norm": 1.2920515537261963, |
| "learning_rate": 3e-06, |
| "loss": 7.2474, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.030568219532218905, |
| "grad_norm": 1.1667238473892212, |
| "learning_rate": 3.5e-06, |
| "loss": 7.0004, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.0349351080368216, |
| "grad_norm": 1.1691466569900513, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 6.8215, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.03930199654142431, |
| "grad_norm": 1.065576195716858, |
| "learning_rate": 4.5e-06, |
| "loss": 6.652, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.043668885046027006, |
| "grad_norm": 1.477279782295227, |
| "learning_rate": 5e-06, |
| "loss": 6.5107, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.048035773550629705, |
| "grad_norm": 1.583247184753418, |
| "learning_rate": 5.500000000000001e-06, |
| "loss": 6.3804, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.052402662055232405, |
| "grad_norm": 1.5069547891616821, |
| "learning_rate": 6e-06, |
| "loss": 6.2661, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.056769550559835104, |
| "grad_norm": 1.7084009647369385, |
| "learning_rate": 6.5000000000000004e-06, |
| "loss": 6.1654, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.06113643906443781, |
| "grad_norm": 1.6150327920913696, |
| "learning_rate": 7e-06, |
| "loss": 6.0775, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.06550332756904051, |
| "grad_norm": 1.9077385663986206, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 5.9847, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.0698702160736432, |
| "grad_norm": 2.038984537124634, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 5.8919, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.07423710457824591, |
| "grad_norm": 2.5044028759002686, |
| "learning_rate": 8.5e-06, |
| "loss": 5.8066, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.07860399308284861, |
| "grad_norm": 2.200798511505127, |
| "learning_rate": 9e-06, |
| "loss": 5.7103, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.0829708815874513, |
| "grad_norm": 2.3363890647888184, |
| "learning_rate": 9.5e-06, |
| "loss": 5.6353, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.08733777009205401, |
| "grad_norm": 2.091174602508545, |
| "learning_rate": 1e-05, |
| "loss": 5.5577, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.0917046585966567, |
| "grad_norm": 2.0803627967834473, |
| "learning_rate": 9.999562929421844e-06, |
| "loss": 5.4809, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.09607154710125941, |
| "grad_norm": 2.083531141281128, |
| "learning_rate": 9.999125858843687e-06, |
| "loss": 5.4045, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.10043843560586212, |
| "grad_norm": 1.9286231994628906, |
| "learning_rate": 9.99868878826553e-06, |
| "loss": 5.3344, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.10480532411046481, |
| "grad_norm": 2.1751465797424316, |
| "learning_rate": 9.998251717687372e-06, |
| "loss": 5.2562, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.10917221261506752, |
| "grad_norm": 2.120579957962036, |
| "learning_rate": 9.997814647109217e-06, |
| "loss": 5.2069, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.11353910111967021, |
| "grad_norm": 2.0757505893707275, |
| "learning_rate": 9.997377576531058e-06, |
| "loss": 5.1558, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.11790598962427291, |
| "grad_norm": 2.013015031814575, |
| "learning_rate": 9.996940505952902e-06, |
| "loss": 5.0984, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.12227287812887562, |
| "grad_norm": 1.910936951637268, |
| "learning_rate": 9.996503435374745e-06, |
| "loss": 5.0394, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.1266397666334783, |
| "grad_norm": 2.150876045227051, |
| "learning_rate": 9.996066364796588e-06, |
| "loss": 5.0009, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.13100665513808102, |
| "grad_norm": 1.9558886289596558, |
| "learning_rate": 9.995629294218431e-06, |
| "loss": 4.9463, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.13537354364268372, |
| "grad_norm": 1.999248743057251, |
| "learning_rate": 9.995192223640275e-06, |
| "loss": 4.9029, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.1397404321472864, |
| "grad_norm": 2.1909689903259277, |
| "learning_rate": 9.994755153062116e-06, |
| "loss": 4.843, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.1441073206518891, |
| "grad_norm": 2.090623140335083, |
| "learning_rate": 9.994318082483961e-06, |
| "loss": 4.7988, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.14847420915649182, |
| "grad_norm": 2.2268435955047607, |
| "learning_rate": 9.993881011905804e-06, |
| "loss": 4.7431, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.15284109766109452, |
| "grad_norm": 2.164546251296997, |
| "learning_rate": 9.993443941327646e-06, |
| "loss": 4.6919, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.15720798616569723, |
| "grad_norm": 2.255798101425171, |
| "learning_rate": 9.993006870749489e-06, |
| "loss": 4.6517, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.1615748746702999, |
| "grad_norm": 2.169243812561035, |
| "learning_rate": 9.992569800171332e-06, |
| "loss": 4.6128, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.1659417631749026, |
| "grad_norm": 2.106949806213379, |
| "learning_rate": 9.992132729593176e-06, |
| "loss": 4.5726, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.17030865167950532, |
| "grad_norm": 2.143815040588379, |
| "learning_rate": 9.991695659015019e-06, |
| "loss": 4.5344, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.17467554018410802, |
| "grad_norm": 2.406649589538574, |
| "learning_rate": 9.991258588436862e-06, |
| "loss": 4.5041, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.17904242868871073, |
| "grad_norm": 2.092935085296631, |
| "learning_rate": 9.990821517858704e-06, |
| "loss": 4.4631, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.1834093171933134, |
| "grad_norm": 2.0865073204040527, |
| "learning_rate": 9.990384447280548e-06, |
| "loss": 4.4408, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.18777620569791612, |
| "grad_norm": 2.061974287033081, |
| "learning_rate": 9.98994737670239e-06, |
| "loss": 4.4113, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.19214309420251882, |
| "grad_norm": 1.916175365447998, |
| "learning_rate": 9.989510306124233e-06, |
| "loss": 4.3833, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.19650998270712153, |
| "grad_norm": 1.9595962762832642, |
| "learning_rate": 9.989073235546078e-06, |
| "loss": 4.353, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.20087687121172423, |
| "grad_norm": 2.0971903800964355, |
| "learning_rate": 9.98863616496792e-06, |
| "loss": 4.3333, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.2052437597163269, |
| "grad_norm": 2.0486457347869873, |
| "learning_rate": 9.988199094389763e-06, |
| "loss": 4.3109, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.20961064822092962, |
| "grad_norm": 1.9522242546081543, |
| "learning_rate": 9.987762023811606e-06, |
| "loss": 4.279, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.21397753672553232, |
| "grad_norm": 2.1979501247406006, |
| "learning_rate": 9.98732495323345e-06, |
| "loss": 4.2567, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.21834442523013503, |
| "grad_norm": 1.9632526636123657, |
| "learning_rate": 9.986887882655293e-06, |
| "loss": 4.2416, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.22271131373473774, |
| "grad_norm": 2.0721206665039062, |
| "learning_rate": 9.986450812077136e-06, |
| "loss": 4.22, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.22707820223934042, |
| "grad_norm": 2.1473758220672607, |
| "learning_rate": 9.986013741498977e-06, |
| "loss": 4.2079, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.23144509074394312, |
| "grad_norm": 2.040027379989624, |
| "learning_rate": 9.98557667092082e-06, |
| "loss": 4.182, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.23581197924854583, |
| "grad_norm": 2.2831156253814697, |
| "learning_rate": 9.985139600342664e-06, |
| "loss": 4.1655, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.24017886775314853, |
| "grad_norm": 1.9741929769515991, |
| "learning_rate": 9.984702529764507e-06, |
| "loss": 4.1594, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.24454575625775124, |
| "grad_norm": 2.0276150703430176, |
| "learning_rate": 9.98426545918635e-06, |
| "loss": 4.1374, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.24891264476235392, |
| "grad_norm": 1.9253956079483032, |
| "learning_rate": 9.983828388608193e-06, |
| "loss": 4.1206, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.2532795332669566, |
| "grad_norm": 1.9739083051681519, |
| "learning_rate": 9.983391318030037e-06, |
| "loss": 4.1049, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.25764642177155933, |
| "grad_norm": 2.0716798305511475, |
| "learning_rate": 9.98295424745188e-06, |
| "loss": 4.0768, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.26201331027616204, |
| "grad_norm": 2.0090582370758057, |
| "learning_rate": 9.982517176873723e-06, |
| "loss": 4.0771, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.26638019878076474, |
| "grad_norm": 1.9497123956680298, |
| "learning_rate": 9.982080106295565e-06, |
| "loss": 4.0663, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.27074708728536745, |
| "grad_norm": 2.1742944717407227, |
| "learning_rate": 9.98164303571741e-06, |
| "loss": 4.0483, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.27511397578997016, |
| "grad_norm": 1.954126238822937, |
| "learning_rate": 9.981205965139251e-06, |
| "loss": 4.0313, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.2794808642945728, |
| "grad_norm": 2.0533246994018555, |
| "learning_rate": 9.980768894561094e-06, |
| "loss": 4.0169, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.2838477527991755, |
| "grad_norm": 1.8938665390014648, |
| "learning_rate": 9.980331823982938e-06, |
| "loss": 4.0087, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.2882146413037782, |
| "grad_norm": 1.9633103609085083, |
| "learning_rate": 9.97989475340478e-06, |
| "loss": 3.9909, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.2925815298083809, |
| "grad_norm": 1.903270959854126, |
| "learning_rate": 9.979457682826622e-06, |
| "loss": 3.9817, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.29694841831298363, |
| "grad_norm": 2.009631395339966, |
| "learning_rate": 9.979020612248467e-06, |
| "loss": 3.9712, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.30131530681758634, |
| "grad_norm": 1.9002183675765991, |
| "learning_rate": 9.97858354167031e-06, |
| "loss": 3.9701, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.30568219532218904, |
| "grad_norm": 1.9432848691940308, |
| "learning_rate": 9.978146471092152e-06, |
| "loss": 3.9536, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.31004908382679175, |
| "grad_norm": 1.9657421112060547, |
| "learning_rate": 9.977709400513997e-06, |
| "loss": 3.9392, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.31441597233139446, |
| "grad_norm": 1.9874509572982788, |
| "learning_rate": 9.977272329935838e-06, |
| "loss": 3.9299, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.31878286083599716, |
| "grad_norm": 2.0522308349609375, |
| "learning_rate": 9.976835259357682e-06, |
| "loss": 3.9203, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.3231497493405998, |
| "grad_norm": 1.9851490259170532, |
| "learning_rate": 9.976398188779525e-06, |
| "loss": 3.9125, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.3275166378452025, |
| "grad_norm": 2.00964093208313, |
| "learning_rate": 9.975961118201368e-06, |
| "loss": 3.9056, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.3318835263498052, |
| "grad_norm": 1.8827855587005615, |
| "learning_rate": 9.975524047623211e-06, |
| "loss": 3.8967, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.33625041485440793, |
| "grad_norm": 1.8730061054229736, |
| "learning_rate": 9.975086977045055e-06, |
| "loss": 3.891, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.34061730335901064, |
| "grad_norm": 1.9370964765548706, |
| "learning_rate": 9.974649906466896e-06, |
| "loss": 3.8769, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.34498419186361334, |
| "grad_norm": 1.959948182106018, |
| "learning_rate": 9.97421283588874e-06, |
| "loss": 3.8707, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.34935108036821605, |
| "grad_norm": 1.9862849712371826, |
| "learning_rate": 9.973775765310583e-06, |
| "loss": 3.8568, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.35371796887281876, |
| "grad_norm": 1.9760117530822754, |
| "learning_rate": 9.973338694732426e-06, |
| "loss": 3.8536, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.35808485737742146, |
| "grad_norm": 2.0359582901000977, |
| "learning_rate": 9.972901624154269e-06, |
| "loss": 3.8466, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.36245174588202417, |
| "grad_norm": 1.9113123416900635, |
| "learning_rate": 9.972464553576112e-06, |
| "loss": 3.8345, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.3668186343866268, |
| "grad_norm": 1.9586379528045654, |
| "learning_rate": 9.972027482997956e-06, |
| "loss": 3.8306, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.3711855228912295, |
| "grad_norm": 1.887161374092102, |
| "learning_rate": 9.971590412419799e-06, |
| "loss": 3.8178, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.37555241139583223, |
| "grad_norm": 1.8756746053695679, |
| "learning_rate": 9.971153341841642e-06, |
| "loss": 3.8145, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.37991929990043494, |
| "grad_norm": 1.9797776937484741, |
| "learning_rate": 9.970716271263484e-06, |
| "loss": 3.8106, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.38428618840503764, |
| "grad_norm": 1.9709391593933105, |
| "learning_rate": 9.970279200685328e-06, |
| "loss": 3.7974, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.38865307690964035, |
| "grad_norm": 1.8535213470458984, |
| "learning_rate": 9.96984213010717e-06, |
| "loss": 3.7866, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.39301996541424306, |
| "grad_norm": 1.8140771389007568, |
| "learning_rate": 9.969405059529013e-06, |
| "loss": 3.788, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.39738685391884576, |
| "grad_norm": 1.8980203866958618, |
| "learning_rate": 9.968967988950856e-06, |
| "loss": 3.7813, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.40175374242344847, |
| "grad_norm": 1.88387131690979, |
| "learning_rate": 9.9685309183727e-06, |
| "loss": 3.7782, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.4061206309280511, |
| "grad_norm": 2.057882785797119, |
| "learning_rate": 9.968093847794543e-06, |
| "loss": 3.7688, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.4104875194326538, |
| "grad_norm": 1.9363012313842773, |
| "learning_rate": 9.967656777216386e-06, |
| "loss": 3.7582, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.41485440793725653, |
| "grad_norm": 1.8989619016647339, |
| "learning_rate": 9.96721970663823e-06, |
| "loss": 3.7643, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.41922129644185924, |
| "grad_norm": 1.9946751594543457, |
| "learning_rate": 9.966782636060071e-06, |
| "loss": 3.7483, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.42358818494646194, |
| "grad_norm": 1.8525508642196655, |
| "learning_rate": 9.966345565481916e-06, |
| "loss": 3.7442, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.42795507345106465, |
| "grad_norm": 1.8581886291503906, |
| "learning_rate": 9.965908494903757e-06, |
| "loss": 3.7361, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.429204003563381, |
| "step": 49143, |
| "total_flos": 1.2327047290505134e+18, |
| "train_loss": 4.777673066357365, |
| "train_runtime": 46797.1012, |
| "train_samples_per_second": 11744.031, |
| "train_steps_per_second": 244.669 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 11449800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 100, |
| "save_steps": 1000000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2327047290505134e+18, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |