| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9400921658986174, |
| "eval_steps": 500, |
| "global_step": 108, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.027649769585253458, |
| "grad_norm": 6.218597135492551, |
| "learning_rate": 0.0, |
| "loss": 0.3225, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.055299539170506916, |
| "grad_norm": 6.047780940751843, |
| "learning_rate": 9.090909090909091e-07, |
| "loss": 0.3102, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.08294930875576037, |
| "grad_norm": 5.725628334215099, |
| "learning_rate": 1.8181818181818183e-06, |
| "loss": 0.3092, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.11059907834101383, |
| "grad_norm": 5.291280324265542, |
| "learning_rate": 2.7272727272727272e-06, |
| "loss": 0.3167, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.1382488479262673, |
| "grad_norm": 2.7599992875130943, |
| "learning_rate": 3.6363636363636366e-06, |
| "loss": 0.2646, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.16589861751152074, |
| "grad_norm": 3.4307446243159427, |
| "learning_rate": 4.5454545454545455e-06, |
| "loss": 0.2347, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.1935483870967742, |
| "grad_norm": 2.743173006209666, |
| "learning_rate": 5.4545454545454545e-06, |
| "loss": 0.2243, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.22119815668202766, |
| "grad_norm": 4.332341499527876, |
| "learning_rate": 6.363636363636364e-06, |
| "loss": 0.2741, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.2488479262672811, |
| "grad_norm": 2.150601224938458, |
| "learning_rate": 7.272727272727273e-06, |
| "loss": 0.2436, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.2764976958525346, |
| "grad_norm": 1.5762454338091747, |
| "learning_rate": 8.181818181818183e-06, |
| "loss": 0.2042, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.30414746543778803, |
| "grad_norm": 2.456237511748212, |
| "learning_rate": 9.090909090909091e-06, |
| "loss": 0.1996, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.3317972350230415, |
| "grad_norm": 1.9843238243820491, |
| "learning_rate": 1e-05, |
| "loss": 0.1839, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.35944700460829493, |
| "grad_norm": 2.7062318993272605, |
| "learning_rate": 9.997377845227577e-06, |
| "loss": 0.1578, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.3870967741935484, |
| "grad_norm": 1.3941400459380622, |
| "learning_rate": 9.98951413118856e-06, |
| "loss": 0.1593, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.4147465437788018, |
| "grad_norm": 1.1610695697921565, |
| "learning_rate": 9.97641710583307e-06, |
| "loss": 0.1675, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.4423963133640553, |
| "grad_norm": 1.0523290062211754, |
| "learning_rate": 9.958100506132127e-06, |
| "loss": 0.1726, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.4700460829493088, |
| "grad_norm": 1.0801667737571008, |
| "learning_rate": 9.934583543669454e-06, |
| "loss": 0.1829, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.4976958525345622, |
| "grad_norm": 1.3571254664283856, |
| "learning_rate": 9.905890884491196e-06, |
| "loss": 0.1527, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.5253456221198156, |
| "grad_norm": 1.4608505233360805, |
| "learning_rate": 9.872052623234632e-06, |
| "loss": 0.147, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.5529953917050692, |
| "grad_norm": 0.9738980854967625, |
| "learning_rate": 9.833104251563058e-06, |
| "loss": 0.1717, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.5806451612903226, |
| "grad_norm": 0.9511695004905902, |
| "learning_rate": 9.789086620939936e-06, |
| "loss": 0.154, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.6082949308755761, |
| "grad_norm": 0.9492770790673514, |
| "learning_rate": 9.740045899781353e-06, |
| "loss": 0.1486, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.6359447004608295, |
| "grad_norm": 1.662083098498058, |
| "learning_rate": 9.68603352503172e-06, |
| "loss": 0.1423, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.663594470046083, |
| "grad_norm": 1.0620441015348367, |
| "learning_rate": 9.627106148213521e-06, |
| "loss": 0.1414, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.6912442396313364, |
| "grad_norm": 1.5593131122473323, |
| "learning_rate": 9.563325576007702e-06, |
| "loss": 0.1274, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.7188940092165899, |
| "grad_norm": 0.913058028692136, |
| "learning_rate": 9.494758705426978e-06, |
| "loss": 0.1414, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.7465437788018433, |
| "grad_norm": 2.508172363862381, |
| "learning_rate": 9.421477453650118e-06, |
| "loss": 0.1629, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.7741935483870968, |
| "grad_norm": 2.7393725430348574, |
| "learning_rate": 9.343558682590757e-06, |
| "loss": 0.1356, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.8018433179723502, |
| "grad_norm": 6.864836486057846, |
| "learning_rate": 9.261084118279846e-06, |
| "loss": 0.1591, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.8294930875576036, |
| "grad_norm": 0.8774514371920219, |
| "learning_rate": 9.174140265146356e-06, |
| "loss": 0.1423, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.8571428571428571, |
| "grad_norm": 1.0409147093308013, |
| "learning_rate": 9.082818315286054e-06, |
| "loss": 0.1397, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.8847926267281107, |
| "grad_norm": 0.7615257724651999, |
| "learning_rate": 8.987214052813605e-06, |
| "loss": 0.157, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.9124423963133641, |
| "grad_norm": 0.8128642416450813, |
| "learning_rate": 8.887427753398249e-06, |
| "loss": 0.1489, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.9400921658986175, |
| "grad_norm": 0.6930096403019158, |
| "learning_rate": 8.783564079088478e-06, |
| "loss": 0.1432, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 0.8773700569916818, |
| "learning_rate": 8.675731968536004e-06, |
| "loss": 0.1292, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.9953917050691244, |
| "grad_norm": 1.0254335401965728, |
| "learning_rate": 8.564044522734147e-06, |
| "loss": 0.1403, |
| "step": 36 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.0254335401965728, |
| "learning_rate": 8.448618886390523e-06, |
| "loss": 0.1451, |
| "step": 37 |
| }, |
| { |
| "epoch": 1.0276497695852536, |
| "grad_norm": 1.8000941773216486, |
| "learning_rate": 8.329576125058406e-06, |
| "loss": 0.0861, |
| "step": 38 |
| }, |
| { |
| "epoch": 1.055299539170507, |
| "grad_norm": 2.1523838043667936, |
| "learning_rate": 8.207041098155701e-06, |
| "loss": 0.093, |
| "step": 39 |
| }, |
| { |
| "epoch": 1.0829493087557605, |
| "grad_norm": 0.8728738735253629, |
| "learning_rate": 8.081142328004638e-06, |
| "loss": 0.1009, |
| "step": 40 |
| }, |
| { |
| "epoch": 1.1105990783410138, |
| "grad_norm": 0.8446208855675313, |
| "learning_rate": 7.952011865029614e-06, |
| "loss": 0.0962, |
| "step": 41 |
| }, |
| { |
| "epoch": 1.1382488479262673, |
| "grad_norm": 0.583782311676644, |
| "learning_rate": 7.819785149254534e-06, |
| "loss": 0.0873, |
| "step": 42 |
| }, |
| { |
| "epoch": 1.1658986175115207, |
| "grad_norm": 0.7773678602540318, |
| "learning_rate": 7.68460086824492e-06, |
| "loss": 0.0984, |
| "step": 43 |
| }, |
| { |
| "epoch": 1.1935483870967742, |
| "grad_norm": 0.5920912387076327, |
| "learning_rate": 7.546600811643816e-06, |
| "loss": 0.0961, |
| "step": 44 |
| }, |
| { |
| "epoch": 1.2211981566820276, |
| "grad_norm": 0.6278156435849434, |
| "learning_rate": 7.405929722454026e-06, |
| "loss": 0.0808, |
| "step": 45 |
| }, |
| { |
| "epoch": 1.2488479262672811, |
| "grad_norm": 0.6735110117536915, |
| "learning_rate": 7.262735145222696e-06, |
| "loss": 0.09, |
| "step": 46 |
| }, |
| { |
| "epoch": 1.2764976958525347, |
| "grad_norm": 0.6721245903010009, |
| "learning_rate": 7.117167271287453e-06, |
| "loss": 0.0901, |
| "step": 47 |
| }, |
| { |
| "epoch": 1.304147465437788, |
| "grad_norm": 0.6581773002499689, |
| "learning_rate": 6.969378781246436e-06, |
| "loss": 0.0802, |
| "step": 48 |
| }, |
| { |
| "epoch": 1.3317972350230414, |
| "grad_norm": 0.6154758737738647, |
| "learning_rate": 6.819524684817439e-06, |
| "loss": 0.0773, |
| "step": 49 |
| }, |
| { |
| "epoch": 1.359447004608295, |
| "grad_norm": 0.6078121382769383, |
| "learning_rate": 6.667762158254104e-06, |
| "loss": 0.0796, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.3870967741935485, |
| "grad_norm": 0.5916042033246923, |
| "learning_rate": 6.514250379489754e-06, |
| "loss": 0.0797, |
| "step": 51 |
| }, |
| { |
| "epoch": 1.4147465437788018, |
| "grad_norm": 0.6742233018841318, |
| "learning_rate": 6.3591503611817155e-06, |
| "loss": 0.0877, |
| "step": 52 |
| }, |
| { |
| "epoch": 1.4423963133640554, |
| "grad_norm": 0.6254587000513546, |
| "learning_rate": 6.202624781831269e-06, |
| "loss": 0.0852, |
| "step": 53 |
| }, |
| { |
| "epoch": 1.4700460829493087, |
| "grad_norm": 0.6461870128180857, |
| "learning_rate": 6.044837815156377e-06, |
| "loss": 0.0943, |
| "step": 54 |
| }, |
| { |
| "epoch": 1.4976958525345623, |
| "grad_norm": 0.6189619126092755, |
| "learning_rate": 5.885954957896115e-06, |
| "loss": 0.0941, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.5253456221198156, |
| "grad_norm": 0.6656207681608218, |
| "learning_rate": 5.726142856227453e-06, |
| "loss": 0.0933, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.5529953917050692, |
| "grad_norm": 0.641201435235512, |
| "learning_rate": 5.5655691309764225e-06, |
| "loss": 0.0827, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.5806451612903225, |
| "grad_norm": 0.5520132793031652, |
| "learning_rate": 5.404402201807022e-06, |
| "loss": 0.0749, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.608294930875576, |
| "grad_norm": 0.5710686877367372, |
| "learning_rate": 5.242811110572243e-06, |
| "loss": 0.0778, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.6359447004608296, |
| "grad_norm": 0.6110053501356016, |
| "learning_rate": 5.080965344012509e-06, |
| "loss": 0.0844, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.663594470046083, |
| "grad_norm": 0.5836120939696657, |
| "learning_rate": 4.919034655987493e-06, |
| "loss": 0.0767, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.6912442396313363, |
| "grad_norm": 0.6258155158697727, |
| "learning_rate": 4.757188889427761e-06, |
| "loss": 0.0886, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.7188940092165899, |
| "grad_norm": 0.5949010863382941, |
| "learning_rate": 4.59559779819298e-06, |
| "loss": 0.0982, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.7465437788018434, |
| "grad_norm": 0.5665380658610257, |
| "learning_rate": 4.434430869023579e-06, |
| "loss": 0.0811, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.7741935483870968, |
| "grad_norm": 0.5999744212269682, |
| "learning_rate": 4.27385714377255e-06, |
| "loss": 0.0857, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.80184331797235, |
| "grad_norm": 0.5851633715161859, |
| "learning_rate": 4.1140450421038865e-06, |
| "loss": 0.07, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.8294930875576036, |
| "grad_norm": 0.5602466431740857, |
| "learning_rate": 3.955162184843625e-06, |
| "loss": 0.0868, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.8571428571428572, |
| "grad_norm": 0.6003026453668614, |
| "learning_rate": 3.7973752181687336e-06, |
| "loss": 0.0897, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.8847926267281108, |
| "grad_norm": 0.5828634522878124, |
| "learning_rate": 3.6408496388182857e-06, |
| "loss": 0.0811, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.912442396313364, |
| "grad_norm": 0.606858679369741, |
| "learning_rate": 3.4857496205102475e-06, |
| "loss": 0.085, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.9400921658986174, |
| "grad_norm": 0.6071679638417599, |
| "learning_rate": 3.3322378417458985e-06, |
| "loss": 0.089, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.967741935483871, |
| "grad_norm": 0.5798347707839837, |
| "learning_rate": 3.180475315182563e-06, |
| "loss": 0.069, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.9953917050691246, |
| "grad_norm": 0.485811481914557, |
| "learning_rate": 3.0306212187535653e-06, |
| "loss": 0.074, |
| "step": 73 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.485811481914557, |
| "learning_rate": 2.882832728712551e-06, |
| "loss": 0.0988, |
| "step": 74 |
| }, |
| { |
| "epoch": 2.0276497695852536, |
| "grad_norm": 1.5333123289167132, |
| "learning_rate": 2.7372648547773063e-06, |
| "loss": 0.0533, |
| "step": 75 |
| }, |
| { |
| "epoch": 2.055299539170507, |
| "grad_norm": 0.47653360582102766, |
| "learning_rate": 2.594070277545975e-06, |
| "loss": 0.0456, |
| "step": 76 |
| }, |
| { |
| "epoch": 2.0829493087557602, |
| "grad_norm": 0.44653376480891205, |
| "learning_rate": 2.4533991883561868e-06, |
| "loss": 0.0433, |
| "step": 77 |
| }, |
| { |
| "epoch": 2.110599078341014, |
| "grad_norm": 0.45184100467810834, |
| "learning_rate": 2.315399131755081e-06, |
| "loss": 0.0434, |
| "step": 78 |
| }, |
| { |
| "epoch": 2.1382488479262673, |
| "grad_norm": 0.46406715803678933, |
| "learning_rate": 2.1802148507454675e-06, |
| "loss": 0.0475, |
| "step": 79 |
| }, |
| { |
| "epoch": 2.165898617511521, |
| "grad_norm": 0.44470042182732816, |
| "learning_rate": 2.0479881349703885e-06, |
| "loss": 0.0485, |
| "step": 80 |
| }, |
| { |
| "epoch": 2.193548387096774, |
| "grad_norm": 0.4541836919405326, |
| "learning_rate": 1.9188576719953635e-06, |
| "loss": 0.0453, |
| "step": 81 |
| }, |
| { |
| "epoch": 2.2211981566820276, |
| "grad_norm": 0.41945408404533757, |
| "learning_rate": 1.7929589018443016e-06, |
| "loss": 0.0476, |
| "step": 82 |
| }, |
| { |
| "epoch": 2.248847926267281, |
| "grad_norm": 0.44239519120852955, |
| "learning_rate": 1.6704238749415958e-06, |
| "loss": 0.0471, |
| "step": 83 |
| }, |
| { |
| "epoch": 2.2764976958525347, |
| "grad_norm": 0.4072713113001334, |
| "learning_rate": 1.5513811136094786e-06, |
| "loss": 0.0441, |
| "step": 84 |
| }, |
| { |
| "epoch": 2.3041474654377883, |
| "grad_norm": 0.40529452752820916, |
| "learning_rate": 1.4359554772658551e-06, |
| "loss": 0.048, |
| "step": 85 |
| }, |
| { |
| "epoch": 2.3317972350230414, |
| "grad_norm": 0.3932405667255203, |
| "learning_rate": 1.3242680314639995e-06, |
| "loss": 0.0392, |
| "step": 86 |
| }, |
| { |
| "epoch": 2.359447004608295, |
| "grad_norm": 0.4117890616435653, |
| "learning_rate": 1.2164359209115235e-06, |
| "loss": 0.0409, |
| "step": 87 |
| }, |
| { |
| "epoch": 2.3870967741935485, |
| "grad_norm": 0.46974554587042566, |
| "learning_rate": 1.1125722466017547e-06, |
| "loss": 0.0393, |
| "step": 88 |
| }, |
| { |
| "epoch": 2.4147465437788016, |
| "grad_norm": 0.4350219213576534, |
| "learning_rate": 1.012785947186397e-06, |
| "loss": 0.0475, |
| "step": 89 |
| }, |
| { |
| "epoch": 2.442396313364055, |
| "grad_norm": 0.49090032041418447, |
| "learning_rate": 9.171816847139447e-07, |
| "loss": 0.0424, |
| "step": 90 |
| }, |
| { |
| "epoch": 2.4700460829493087, |
| "grad_norm": 0.42408815275335165, |
| "learning_rate": 8.258597348536452e-07, |
| "loss": 0.0413, |
| "step": 91 |
| }, |
| { |
| "epoch": 2.4976958525345623, |
| "grad_norm": 0.40399622782019035, |
| "learning_rate": 7.389158817201541e-07, |
| "loss": 0.0388, |
| "step": 92 |
| }, |
| { |
| "epoch": 2.525345622119816, |
| "grad_norm": 0.38833269245723195, |
| "learning_rate": 6.564413174092443e-07, |
| "loss": 0.037, |
| "step": 93 |
| }, |
| { |
| "epoch": 2.5529953917050694, |
| "grad_norm": 0.389374030767258, |
| "learning_rate": 5.785225463498828e-07, |
| "loss": 0.0404, |
| "step": 94 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 0.42570497393280116, |
| "learning_rate": 5.05241294573024e-07, |
| "loss": 0.0398, |
| "step": 95 |
| }, |
| { |
| "epoch": 2.608294930875576, |
| "grad_norm": 0.4547617441003299, |
| "learning_rate": 4.3667442399229985e-07, |
| "loss": 0.0364, |
| "step": 96 |
| }, |
| { |
| "epoch": 2.6359447004608296, |
| "grad_norm": 0.4076559649282273, |
| "learning_rate": 3.728938517864794e-07, |
| "loss": 0.0448, |
| "step": 97 |
| }, |
| { |
| "epoch": 2.6635944700460827, |
| "grad_norm": 0.46055342435769453, |
| "learning_rate": 3.1396647496828245e-07, |
| "loss": 0.0365, |
| "step": 98 |
| }, |
| { |
| "epoch": 2.6912442396313363, |
| "grad_norm": 0.4056353196574725, |
| "learning_rate": 2.599541002186479e-07, |
| "loss": 0.0387, |
| "step": 99 |
| }, |
| { |
| "epoch": 2.71889400921659, |
| "grad_norm": 0.3970285169859869, |
| "learning_rate": 2.109133790600648e-07, |
| "loss": 0.0453, |
| "step": 100 |
| }, |
| { |
| "epoch": 2.7465437788018434, |
| "grad_norm": 0.39424163292147957, |
| "learning_rate": 1.6689574843694433e-07, |
| "loss": 0.0353, |
| "step": 101 |
| }, |
| { |
| "epoch": 2.774193548387097, |
| "grad_norm": 0.4031654414089018, |
| "learning_rate": 1.2794737676536993e-07, |
| "loss": 0.0397, |
| "step": 102 |
| }, |
| { |
| "epoch": 2.80184331797235, |
| "grad_norm": 0.40508729456084114, |
| "learning_rate": 9.410911550880474e-08, |
| "loss": 0.0391, |
| "step": 103 |
| }, |
| { |
| "epoch": 2.8294930875576036, |
| "grad_norm": 0.3826393754876196, |
| "learning_rate": 6.54164563305465e-08, |
| "loss": 0.038, |
| "step": 104 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.4337313927895487, |
| "learning_rate": 4.189949386787462e-08, |
| "loss": 0.0418, |
| "step": 105 |
| }, |
| { |
| "epoch": 2.8847926267281108, |
| "grad_norm": 0.37776947123435684, |
| "learning_rate": 2.358289416693027e-08, |
| "loss": 0.0355, |
| "step": 106 |
| }, |
| { |
| "epoch": 2.912442396313364, |
| "grad_norm": 0.36823889024056433, |
| "learning_rate": 1.0485868811441757e-08, |
| "loss": 0.0384, |
| "step": 107 |
| }, |
| { |
| "epoch": 2.9400921658986174, |
| "grad_norm": 0.4725582160373693, |
| "learning_rate": 2.6221547724253337e-09, |
| "loss": 0.0385, |
| "step": 108 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 108, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 20, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2693023331503309e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|