| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.994675460468109, | |
| "eval_steps": 500, | |
| "global_step": 62000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016086222150727903, | |
| "grad_norm": 0.5078127384185791, | |
| "learning_rate": 4.959864230101023e-05, | |
| "loss": 2.1432, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.032172444301455806, | |
| "grad_norm": 0.4508506655693054, | |
| "learning_rate": 4.9196480277974395e-05, | |
| "loss": 1.9093, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.048258666452183706, | |
| "grad_norm": 0.4430558979511261, | |
| "learning_rate": 4.879431825493855e-05, | |
| "loss": 1.8418, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.06434488860291161, | |
| "grad_norm": 0.4775325059890747, | |
| "learning_rate": 4.8392156231902713e-05, | |
| "loss": 1.7771, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.08043111075363951, | |
| "grad_norm": 0.49685001373291016, | |
| "learning_rate": 4.7989994208866876e-05, | |
| "loss": 1.7226, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.09651733290436741, | |
| "grad_norm": 0.5552434325218201, | |
| "learning_rate": 4.7587832185831025e-05, | |
| "loss": 1.6767, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.11260355505509531, | |
| "grad_norm": 0.6779139637947083, | |
| "learning_rate": 4.718567016279519e-05, | |
| "loss": 1.6588, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.12868977720582322, | |
| "grad_norm": 0.5552022457122803, | |
| "learning_rate": 4.6783508139759344e-05, | |
| "loss": 1.603, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.1447759993565511, | |
| "grad_norm": 0.5302042365074158, | |
| "learning_rate": 4.638134611672351e-05, | |
| "loss": 1.5776, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.16086222150727902, | |
| "grad_norm": 0.5810815691947937, | |
| "learning_rate": 4.597918409368766e-05, | |
| "loss": 1.5333, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1769484436580069, | |
| "grad_norm": 0.5819700956344604, | |
| "learning_rate": 4.5577022070651826e-05, | |
| "loss": 1.5168, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.19303466580873482, | |
| "grad_norm": 0.6134072542190552, | |
| "learning_rate": 4.517486004761599e-05, | |
| "loss": 1.4748, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.2091208879594627, | |
| "grad_norm": 0.5746152400970459, | |
| "learning_rate": 4.4772698024580144e-05, | |
| "loss": 1.4622, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.22520711011019062, | |
| "grad_norm": 0.7663710713386536, | |
| "learning_rate": 4.437053600154431e-05, | |
| "loss": 1.4767, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.24129333226091854, | |
| "grad_norm": 0.7993176579475403, | |
| "learning_rate": 4.396837397850846e-05, | |
| "loss": 1.4527, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.25737955441164645, | |
| "grad_norm": 0.6892676949501038, | |
| "learning_rate": 4.3566211955472626e-05, | |
| "loss": 1.4325, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.2734657765623743, | |
| "grad_norm": 0.6928556561470032, | |
| "learning_rate": 4.316404993243678e-05, | |
| "loss": 1.4038, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.2895519987131022, | |
| "grad_norm": 0.7578593492507935, | |
| "learning_rate": 4.2761887909400944e-05, | |
| "loss": 1.3945, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.30563822086383013, | |
| "grad_norm": 0.7504703402519226, | |
| "learning_rate": 4.23597258863651e-05, | |
| "loss": 1.3644, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.32172444301455805, | |
| "grad_norm": 0.8370710611343384, | |
| "learning_rate": 4.1957563863329256e-05, | |
| "loss": 1.3619, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.3378106651652859, | |
| "grad_norm": 0.8501142263412476, | |
| "learning_rate": 4.155540184029342e-05, | |
| "loss": 1.3448, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.3538968873160138, | |
| "grad_norm": 0.9001900553703308, | |
| "learning_rate": 4.1153239817257575e-05, | |
| "loss": 1.3004, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.36998310946674173, | |
| "grad_norm": 1.0658681392669678, | |
| "learning_rate": 4.075107779422174e-05, | |
| "loss": 1.2789, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.38606933161746965, | |
| "grad_norm": 1.1038371324539185, | |
| "learning_rate": 4.0348915771185894e-05, | |
| "loss": 1.2651, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.40215555376819756, | |
| "grad_norm": 1.2004213333129883, | |
| "learning_rate": 3.994755807219613e-05, | |
| "loss": 1.2216, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.4182417759189254, | |
| "grad_norm": 1.235543966293335, | |
| "learning_rate": 3.9545396049160286e-05, | |
| "loss": 1.1955, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.43432799806965333, | |
| "grad_norm": 1.5088828802108765, | |
| "learning_rate": 3.914323402612445e-05, | |
| "loss": 1.1836, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.45041422022038125, | |
| "grad_norm": 1.264153242111206, | |
| "learning_rate": 3.8741072003088605e-05, | |
| "loss": 1.1658, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.46650044237110916, | |
| "grad_norm": 1.3023343086242676, | |
| "learning_rate": 3.833971430409884e-05, | |
| "loss": 1.1481, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.48258666452183707, | |
| "grad_norm": 1.3824670314788818, | |
| "learning_rate": 3.7938356605109064e-05, | |
| "loss": 1.1221, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.49867288667256493, | |
| "grad_norm": 1.4364969730377197, | |
| "learning_rate": 3.75369989061193e-05, | |
| "loss": 1.1057, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.5147591088232929, | |
| "grad_norm": 2.051701545715332, | |
| "learning_rate": 3.7134836883083456e-05, | |
| "loss": 1.0873, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.5308453309740208, | |
| "grad_norm": 1.4329720735549927, | |
| "learning_rate": 3.673267486004762e-05, | |
| "loss": 1.0607, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.5469315531247486, | |
| "grad_norm": 1.4981014728546143, | |
| "learning_rate": 3.6330512837011775e-05, | |
| "loss": 1.0516, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.5630177752754766, | |
| "grad_norm": 1.3012079000473022, | |
| "learning_rate": 3.592835081397594e-05, | |
| "loss": 1.0317, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.5791039974262044, | |
| "grad_norm": 1.401825189590454, | |
| "learning_rate": 3.552699311498617e-05, | |
| "loss": 1.0183, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.5951902195769324, | |
| "grad_norm": 2.0783369541168213, | |
| "learning_rate": 3.512483109195033e-05, | |
| "loss": 0.9985, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.6112764417276603, | |
| "grad_norm": 2.3940794467926025, | |
| "learning_rate": 3.4722669068914486e-05, | |
| "loss": 0.9698, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.6273626638783881, | |
| "grad_norm": 1.4747998714447021, | |
| "learning_rate": 3.432050704587865e-05, | |
| "loss": 0.9657, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.6434488860291161, | |
| "grad_norm": 3.0782012939453125, | |
| "learning_rate": 3.391914934688888e-05, | |
| "loss": 0.9379, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.659535108179844, | |
| "grad_norm": 2.4914307594299316, | |
| "learning_rate": 3.3516987323853034e-05, | |
| "loss": 0.915, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.6756213303305718, | |
| "grad_norm": 2.772120237350464, | |
| "learning_rate": 3.3115629624863264e-05, | |
| "loss": 0.9047, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.6917075524812998, | |
| "grad_norm": 2.519575595855713, | |
| "learning_rate": 3.271346760182743e-05, | |
| "loss": 0.8688, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.7077937746320276, | |
| "grad_norm": 4.085098743438721, | |
| "learning_rate": 3.231130557879158e-05, | |
| "loss": 0.8581, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.7238799967827556, | |
| "grad_norm": 1.4670002460479736, | |
| "learning_rate": 3.1909143555755745e-05, | |
| "loss": 0.8354, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.7399662189334835, | |
| "grad_norm": 2.4749488830566406, | |
| "learning_rate": 3.1507785856765975e-05, | |
| "loss": 0.8108, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.7560524410842113, | |
| "grad_norm": 1.8635029792785645, | |
| "learning_rate": 3.110562383373014e-05, | |
| "loss": 0.7773, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.7721386632349393, | |
| "grad_norm": 3.5713748931884766, | |
| "learning_rate": 3.0703461810694294e-05, | |
| "loss": 0.756, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.7882248853856672, | |
| "grad_norm": 1.8903526067733765, | |
| "learning_rate": 3.0301299787658456e-05, | |
| "loss": 0.7326, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.8043111075363951, | |
| "grad_norm": 8.286703109741211, | |
| "learning_rate": 2.9899942088668686e-05, | |
| "loss": 0.6948, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.820397329687123, | |
| "grad_norm": 2.2209272384643555, | |
| "learning_rate": 2.9497780065632845e-05, | |
| "loss": 0.6914, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.8364835518378508, | |
| "grad_norm": 2.2284536361694336, | |
| "learning_rate": 2.9095618042597e-05, | |
| "loss": 0.6585, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.8525697739885788, | |
| "grad_norm": 3.4615938663482666, | |
| "learning_rate": 2.869345601956116e-05, | |
| "loss": 0.633, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.8686559961393067, | |
| "grad_norm": 3.1158838272094727, | |
| "learning_rate": 2.829209832057139e-05, | |
| "loss": 0.6181, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.8847422182900346, | |
| "grad_norm": 2.3320417404174805, | |
| "learning_rate": 2.7889936297535553e-05, | |
| "loss": 0.5993, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.9008284404407625, | |
| "grad_norm": 1.8331427574157715, | |
| "learning_rate": 2.7487774274499712e-05, | |
| "loss": 0.5839, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.9169146625914903, | |
| "grad_norm": 3.2398369312286377, | |
| "learning_rate": 2.708561225146387e-05, | |
| "loss": 0.562, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.9330008847422183, | |
| "grad_norm": 1.6575061082839966, | |
| "learning_rate": 2.66842545524741e-05, | |
| "loss": 0.5313, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.9490871068929462, | |
| "grad_norm": 2.1604230403900146, | |
| "learning_rate": 2.6282092529438264e-05, | |
| "loss": 0.5203, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.9651733290436741, | |
| "grad_norm": 3.3743808269500732, | |
| "learning_rate": 2.5879930506402423e-05, | |
| "loss": 0.4938, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.981259551194402, | |
| "grad_norm": 3.766514301300049, | |
| "learning_rate": 2.5477768483366583e-05, | |
| "loss": 0.4724, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.9973457733451299, | |
| "grad_norm": 2.26712703704834, | |
| "learning_rate": 2.5075606460330742e-05, | |
| "loss": 0.4656, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.26554691791534424, | |
| "eval_runtime": 1917.4803, | |
| "eval_samples_per_second": 345.81, | |
| "eval_steps_per_second": 43.227, | |
| "step": 31083 | |
| }, | |
| { | |
| "epoch": 1.0134159092737072, | |
| "grad_norm": 2.1041958332061768, | |
| "learning_rate": 2.467424876134097e-05, | |
| "loss": 0.4381, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.029502131424435, | |
| "grad_norm": 1.7629106044769287, | |
| "learning_rate": 2.427208673830513e-05, | |
| "loss": 0.4298, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.0455883535751629, | |
| "grad_norm": 2.5032904148101807, | |
| "learning_rate": 2.386992471526929e-05, | |
| "loss": 0.4188, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.0616745757258907, | |
| "grad_norm": 1.6467881202697754, | |
| "learning_rate": 2.3467762692233446e-05, | |
| "loss": 0.3986, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.0777607978766186, | |
| "grad_norm": 1.957220435142517, | |
| "learning_rate": 2.3065600669197606e-05, | |
| "loss": 0.382, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.0938470200273467, | |
| "grad_norm": 1.6566946506500244, | |
| "learning_rate": 2.2663438646161765e-05, | |
| "loss": 0.3689, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.1099332421780745, | |
| "grad_norm": 2.081613540649414, | |
| "learning_rate": 2.2261276623125928e-05, | |
| "loss": 0.3603, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.1260194643288024, | |
| "grad_norm": 2.155226945877075, | |
| "learning_rate": 2.1859918924136157e-05, | |
| "loss": 0.3478, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.1421056864795303, | |
| "grad_norm": 1.9459590911865234, | |
| "learning_rate": 2.1457756901100317e-05, | |
| "loss": 0.3315, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.1581919086302581, | |
| "grad_norm": 2.3381567001342773, | |
| "learning_rate": 2.1055594878064476e-05, | |
| "loss": 0.3259, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.1742781307809862, | |
| "grad_norm": 1.4302254915237427, | |
| "learning_rate": 2.0653432855028635e-05, | |
| "loss": 0.3168, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.190364352931714, | |
| "grad_norm": 1.1770597696304321, | |
| "learning_rate": 2.0251270831992795e-05, | |
| "loss": 0.3082, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.206450575082442, | |
| "grad_norm": 1.7475298643112183, | |
| "learning_rate": 1.9849913133003024e-05, | |
| "loss": 0.3014, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.2225367972331698, | |
| "grad_norm": 1.2397468090057373, | |
| "learning_rate": 1.9447751109967187e-05, | |
| "loss": 0.288, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.2386230193838976, | |
| "grad_norm": 1.6603740453720093, | |
| "learning_rate": 1.9045589086931343e-05, | |
| "loss": 0.2797, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.2547092415346257, | |
| "grad_norm": 1.7009538412094116, | |
| "learning_rate": 1.8643427063895502e-05, | |
| "loss": 0.275, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.2707954636853536, | |
| "grad_norm": 1.4941717386245728, | |
| "learning_rate": 1.8241265040859662e-05, | |
| "loss": 0.2623, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.2868816858360814, | |
| "grad_norm": 1.941115140914917, | |
| "learning_rate": 1.7839907341869895e-05, | |
| "loss": 0.2572, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.3029679079868093, | |
| "grad_norm": 1.487726092338562, | |
| "learning_rate": 1.7437745318834054e-05, | |
| "loss": 0.2502, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.3190541301375371, | |
| "grad_norm": 1.4628674983978271, | |
| "learning_rate": 1.7035583295798213e-05, | |
| "loss": 0.2437, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.3351403522882652, | |
| "grad_norm": 1.401607632637024, | |
| "learning_rate": 1.663342127276237e-05, | |
| "loss": 0.2421, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.351226574438993, | |
| "grad_norm": 1.1497563123703003, | |
| "learning_rate": 1.623125924972653e-05, | |
| "loss": 0.231, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.367312796589721, | |
| "grad_norm": 1.322836995124817, | |
| "learning_rate": 1.5829097226690688e-05, | |
| "loss": 0.2261, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.3833990187404488, | |
| "grad_norm": 1.5328525304794312, | |
| "learning_rate": 1.542773952770092e-05, | |
| "loss": 0.2177, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.3994852408911767, | |
| "grad_norm": 1.7748241424560547, | |
| "learning_rate": 1.502557750466508e-05, | |
| "loss": 0.2186, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.4155714630419047, | |
| "grad_norm": 1.6542141437530518, | |
| "learning_rate": 1.4623415481629241e-05, | |
| "loss": 0.2138, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.4316576851926326, | |
| "grad_norm": 1.3098843097686768, | |
| "learning_rate": 1.4221253458593397e-05, | |
| "loss": 0.211, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.4477439073433604, | |
| "grad_norm": 1.345651626586914, | |
| "learning_rate": 1.3819091435557557e-05, | |
| "loss": 0.2027, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.4638301294940883, | |
| "grad_norm": 1.4520297050476074, | |
| "learning_rate": 1.3416929412521718e-05, | |
| "loss": 0.2039, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.4799163516448162, | |
| "grad_norm": 1.5913499593734741, | |
| "learning_rate": 1.3014767389485877e-05, | |
| "loss": 0.1939, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.4960025737955442, | |
| "grad_norm": 1.1803226470947266, | |
| "learning_rate": 1.2612605366450037e-05, | |
| "loss": 0.1887, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.5120887959462719, | |
| "grad_norm": 1.1462236642837524, | |
| "learning_rate": 1.2210443343414194e-05, | |
| "loss": 0.1883, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.528175018097, | |
| "grad_norm": 0.8483968377113342, | |
| "learning_rate": 1.1808281320378355e-05, | |
| "loss": 0.1809, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.5442612402477278, | |
| "grad_norm": 1.1205823421478271, | |
| "learning_rate": 1.1406119297342515e-05, | |
| "loss": 0.1813, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.5603474623984557, | |
| "grad_norm": 1.417622447013855, | |
| "learning_rate": 1.1003957274306672e-05, | |
| "loss": 0.1788, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.5764336845491838, | |
| "grad_norm": 1.179103970527649, | |
| "learning_rate": 1.0602599575316904e-05, | |
| "loss": 0.1809, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.5925199066999114, | |
| "grad_norm": 1.1092889308929443, | |
| "learning_rate": 1.0200437552281065e-05, | |
| "loss": 0.1734, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.6086061288506395, | |
| "grad_norm": 1.0196574926376343, | |
| "learning_rate": 9.798275529245222e-06, | |
| "loss": 0.1688, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.6246923510013673, | |
| "grad_norm": 1.1376862525939941, | |
| "learning_rate": 9.396113506209382e-06, | |
| "loss": 0.1703, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.6407785731520952, | |
| "grad_norm": 0.8885149955749512, | |
| "learning_rate": 8.995560131265685e-06, | |
| "loss": 0.1691, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.6568647953028233, | |
| "grad_norm": 1.2574944496154785, | |
| "learning_rate": 8.593398108229844e-06, | |
| "loss": 0.1615, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.672951017453551, | |
| "grad_norm": 1.2620723247528076, | |
| "learning_rate": 8.191236085194004e-06, | |
| "loss": 0.1593, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.689037239604279, | |
| "grad_norm": 1.551480770111084, | |
| "learning_rate": 7.789074062158163e-06, | |
| "loss": 0.1639, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.7051234617550068, | |
| "grad_norm": 1.5938962697982788, | |
| "learning_rate": 7.386912039122322e-06, | |
| "loss": 0.1587, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.7212096839057347, | |
| "grad_norm": 1.0503953695297241, | |
| "learning_rate": 6.984750016086482e-06, | |
| "loss": 0.1599, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.7372959060564628, | |
| "grad_norm": 1.1205036640167236, | |
| "learning_rate": 6.583392317096712e-06, | |
| "loss": 0.1541, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.7533821282071904, | |
| "grad_norm": 0.7524433732032776, | |
| "learning_rate": 6.181230294060872e-06, | |
| "loss": 0.1521, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 1.7694683503579185, | |
| "grad_norm": 0.9619775414466858, | |
| "learning_rate": 5.779068271025031e-06, | |
| "loss": 0.1521, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.7855545725086464, | |
| "grad_norm": 0.9406844973564148, | |
| "learning_rate": 5.37690624798919e-06, | |
| "loss": 0.1509, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 1.8016407946593742, | |
| "grad_norm": 0.9363726377487183, | |
| "learning_rate": 4.975548548999421e-06, | |
| "loss": 0.1513, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.8177270168101023, | |
| "grad_norm": 0.9941402673721313, | |
| "learning_rate": 4.573386525963581e-06, | |
| "loss": 0.1484, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 1.83381323896083, | |
| "grad_norm": 1.3756345510482788, | |
| "learning_rate": 4.17122450292774e-06, | |
| "loss": 0.1509, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.849899461111558, | |
| "grad_norm": 1.0644595623016357, | |
| "learning_rate": 3.7690624798918986e-06, | |
| "loss": 0.1486, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.8659856832622859, | |
| "grad_norm": 1.070890188217163, | |
| "learning_rate": 3.3669004568560584e-06, | |
| "loss": 0.1462, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.8820719054130137, | |
| "grad_norm": 1.3034768104553223, | |
| "learning_rate": 2.9647384338202173e-06, | |
| "loss": 0.1481, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 1.8981581275637418, | |
| "grad_norm": 1.127517580986023, | |
| "learning_rate": 2.5625764107843767e-06, | |
| "loss": 0.1451, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.9142443497144694, | |
| "grad_norm": 0.9431403279304504, | |
| "learning_rate": 2.1604143877485364e-06, | |
| "loss": 0.1458, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 1.9303305718651975, | |
| "grad_norm": 1.271483302116394, | |
| "learning_rate": 1.7590566887587673e-06, | |
| "loss": 0.1463, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.9464167940159254, | |
| "grad_norm": 0.7327952980995178, | |
| "learning_rate": 1.3568946657229264e-06, | |
| "loss": 0.1434, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 1.9625030161666532, | |
| "grad_norm": 1.0670543909072876, | |
| "learning_rate": 9.547326426870858e-07, | |
| "loss": 0.1424, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 1.9785892383173813, | |
| "grad_norm": 1.2705425024032593, | |
| "learning_rate": 5.525706196512451e-07, | |
| "loss": 0.1431, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 1.994675460468109, | |
| "grad_norm": 0.9267213344573975, | |
| "learning_rate": 1.5040859661540443e-07, | |
| "loss": 0.1418, | |
| "step": 62000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 62164, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3425693542685082e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |