| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9932279909706545, | |
| "eval_steps": 500, | |
| "global_step": 996, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.015048908954100828, | |
| "grad_norm": 1.2988319396972656, | |
| "learning_rate": 4.9996890990217804e-05, | |
| "loss": 2.4707, | |
| "num_input_tokens_seen": 5864, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.030097817908201655, | |
| "grad_norm": 1.8058427572250366, | |
| "learning_rate": 4.9987564734146566e-05, | |
| "loss": 2.2509, | |
| "num_input_tokens_seen": 11432, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.045146726862302484, | |
| "grad_norm": 0.8231738209724426, | |
| "learning_rate": 4.997202355141999e-05, | |
| "loss": 1.6895, | |
| "num_input_tokens_seen": 17000, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06019563581640331, | |
| "grad_norm": 0.7266705632209778, | |
| "learning_rate": 4.995027130745321e-05, | |
| "loss": 1.4876, | |
| "num_input_tokens_seen": 22840, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07524454477050414, | |
| "grad_norm": 1.1722582578659058, | |
| "learning_rate": 4.992231341248137e-05, | |
| "loss": 1.4812, | |
| "num_input_tokens_seen": 28984, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.09029345372460497, | |
| "grad_norm": 0.9262341260910034, | |
| "learning_rate": 4.9888156820213974e-05, | |
| "loss": 1.3642, | |
| "num_input_tokens_seen": 34856, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1053423626787058, | |
| "grad_norm": 0.8832902908325195, | |
| "learning_rate": 4.9847810026105394e-05, | |
| "loss": 1.3651, | |
| "num_input_tokens_seen": 41216, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.12039127163280662, | |
| "grad_norm": 0.8503655791282654, | |
| "learning_rate": 4.980128306524183e-05, | |
| "loss": 1.1321, | |
| "num_input_tokens_seen": 47304, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.13544018058690746, | |
| "grad_norm": 1.348948359489441, | |
| "learning_rate": 4.97485875098454e-05, | |
| "loss": 1.3012, | |
| "num_input_tokens_seen": 53184, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.1504890895410083, | |
| "grad_norm": 0.7177269458770752, | |
| "learning_rate": 4.968973646639589e-05, | |
| "loss": 0.9827, | |
| "num_input_tokens_seen": 59024, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1655379984951091, | |
| "grad_norm": 0.6005258560180664, | |
| "learning_rate": 4.9624744572370865e-05, | |
| "loss": 1.2313, | |
| "num_input_tokens_seen": 64816, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.18058690744920994, | |
| "grad_norm": 0.6153081059455872, | |
| "learning_rate": 4.9553627992605066e-05, | |
| "loss": 1.0347, | |
| "num_input_tokens_seen": 70848, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.19563581640331076, | |
| "grad_norm": 0.7796200513839722, | |
| "learning_rate": 4.947640441526989e-05, | |
| "loss": 1.0422, | |
| "num_input_tokens_seen": 76888, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2106847253574116, | |
| "grad_norm": 0.7273033857345581, | |
| "learning_rate": 4.939309304747391e-05, | |
| "loss": 0.9996, | |
| "num_input_tokens_seen": 82840, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.22573363431151242, | |
| "grad_norm": 0.7943289875984192, | |
| "learning_rate": 4.930371461048571e-05, | |
| "loss": 1.0755, | |
| "num_input_tokens_seen": 88824, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.24078254326561324, | |
| "grad_norm": 0.6128024458885193, | |
| "learning_rate": 4.9208291334580104e-05, | |
| "loss": 1.026, | |
| "num_input_tokens_seen": 94264, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2558314522197141, | |
| "grad_norm": 0.7087495923042297, | |
| "learning_rate": 4.910684695350895e-05, | |
| "loss": 1.1307, | |
| "num_input_tokens_seen": 99896, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2708803611738149, | |
| "grad_norm": 0.711476743221283, | |
| "learning_rate": 4.8999406698598074e-05, | |
| "loss": 1.0221, | |
| "num_input_tokens_seen": 105640, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.28592927012791575, | |
| "grad_norm": 0.5772566795349121, | |
| "learning_rate": 4.8885997292471774e-05, | |
| "loss": 1.012, | |
| "num_input_tokens_seen": 111280, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.3009781790820166, | |
| "grad_norm": 0.6769325137138367, | |
| "learning_rate": 4.87666469424063e-05, | |
| "loss": 1.0151, | |
| "num_input_tokens_seen": 116640, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3160270880361174, | |
| "grad_norm": 0.679373025894165, | |
| "learning_rate": 4.86413853333141e-05, | |
| "loss": 1.0028, | |
| "num_input_tokens_seen": 121864, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.3310759969902182, | |
| "grad_norm": 0.9181504845619202, | |
| "learning_rate": 4.851024362036064e-05, | |
| "loss": 1.143, | |
| "num_input_tokens_seen": 127384, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.34612490594431905, | |
| "grad_norm": 0.7842696905136108, | |
| "learning_rate": 4.837325442121538e-05, | |
| "loss": 0.9695, | |
| "num_input_tokens_seen": 133008, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3611738148984199, | |
| "grad_norm": 0.6459535360336304, | |
| "learning_rate": 4.8230451807939135e-05, | |
| "loss": 0.9017, | |
| "num_input_tokens_seen": 139144, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3762227238525207, | |
| "grad_norm": 0.6695935726165771, | |
| "learning_rate": 4.808187129850963e-05, | |
| "loss": 1.035, | |
| "num_input_tokens_seen": 144848, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.3912716328066215, | |
| "grad_norm": 0.9289236664772034, | |
| "learning_rate": 4.792754984798745e-05, | |
| "loss": 1.0128, | |
| "num_input_tokens_seen": 150480, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.40632054176072235, | |
| "grad_norm": 0.6192979216575623, | |
| "learning_rate": 4.776752583932454e-05, | |
| "loss": 0.9432, | |
| "num_input_tokens_seen": 156336, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.4213694507148232, | |
| "grad_norm": 0.7946303486824036, | |
| "learning_rate": 4.760183907381757e-05, | |
| "loss": 1.0344, | |
| "num_input_tokens_seen": 162440, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.436418359668924, | |
| "grad_norm": 0.6548484563827515, | |
| "learning_rate": 4.7430530761208494e-05, | |
| "loss": 0.9452, | |
| "num_input_tokens_seen": 168304, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.45146726862302483, | |
| "grad_norm": 0.9075986742973328, | |
| "learning_rate": 4.725364350943492e-05, | |
| "loss": 0.9559, | |
| "num_input_tokens_seen": 173984, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.46651617757712566, | |
| "grad_norm": 0.8047800660133362, | |
| "learning_rate": 4.707122131403251e-05, | |
| "loss": 0.9726, | |
| "num_input_tokens_seen": 179896, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.4815650865312265, | |
| "grad_norm": 0.6954847574234009, | |
| "learning_rate": 4.6883309547192476e-05, | |
| "loss": 0.9344, | |
| "num_input_tokens_seen": 185296, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4966139954853273, | |
| "grad_norm": 0.7912609577178955, | |
| "learning_rate": 4.668995494647653e-05, | |
| "loss": 0.9497, | |
| "num_input_tokens_seen": 190928, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.5116629044394282, | |
| "grad_norm": 0.7360678315162659, | |
| "learning_rate": 4.649120560319225e-05, | |
| "loss": 1.057, | |
| "num_input_tokens_seen": 197352, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.526711813393529, | |
| "grad_norm": 0.7325194478034973, | |
| "learning_rate": 4.6287110950431865e-05, | |
| "loss": 0.9847, | |
| "num_input_tokens_seen": 203216, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5417607223476298, | |
| "grad_norm": 0.7140082120895386, | |
| "learning_rate": 4.607772175077711e-05, | |
| "loss": 1.001, | |
| "num_input_tokens_seen": 208624, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5568096313017307, | |
| "grad_norm": 0.9454194903373718, | |
| "learning_rate": 4.586309008367359e-05, | |
| "loss": 0.9384, | |
| "num_input_tokens_seen": 214552, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5718585402558315, | |
| "grad_norm": 0.9370235800743103, | |
| "learning_rate": 4.564326933247752e-05, | |
| "loss": 1.0312, | |
| "num_input_tokens_seen": 220704, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5869074492099323, | |
| "grad_norm": 0.7274216413497925, | |
| "learning_rate": 4.541831417117815e-05, | |
| "loss": 0.9112, | |
| "num_input_tokens_seen": 226480, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.6019563581640331, | |
| "grad_norm": 0.9026529788970947, | |
| "learning_rate": 4.518828055079925e-05, | |
| "loss": 0.9967, | |
| "num_input_tokens_seen": 232136, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.617005267118134, | |
| "grad_norm": 0.9668667316436768, | |
| "learning_rate": 4.4953225685482904e-05, | |
| "loss": 1.0905, | |
| "num_input_tokens_seen": 238072, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6320541760722348, | |
| "grad_norm": 0.7728851437568665, | |
| "learning_rate": 4.471320803825915e-05, | |
| "loss": 0.9487, | |
| "num_input_tokens_seen": 243680, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6471030850263356, | |
| "grad_norm": 0.7141396999359131, | |
| "learning_rate": 4.4468287306505045e-05, | |
| "loss": 0.8675, | |
| "num_input_tokens_seen": 249376, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6621519939804364, | |
| "grad_norm": 0.7524191737174988, | |
| "learning_rate": 4.421852440709666e-05, | |
| "loss": 0.8624, | |
| "num_input_tokens_seen": 255288, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6772009029345373, | |
| "grad_norm": 1.1502355337142944, | |
| "learning_rate": 4.39639814612578e-05, | |
| "loss": 1.0489, | |
| "num_input_tokens_seen": 261592, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6922498118886381, | |
| "grad_norm": 0.7467320561408997, | |
| "learning_rate": 4.370472177910914e-05, | |
| "loss": 0.9139, | |
| "num_input_tokens_seen": 267192, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7072987208427389, | |
| "grad_norm": 0.6400129795074463, | |
| "learning_rate": 4.3440809843921725e-05, | |
| "loss": 0.9905, | |
| "num_input_tokens_seen": 272712, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.7223476297968398, | |
| "grad_norm": 0.6654481291770935, | |
| "learning_rate": 4.3172311296078595e-05, | |
| "loss": 0.8974, | |
| "num_input_tokens_seen": 278720, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7373965387509406, | |
| "grad_norm": 0.7487585544586182, | |
| "learning_rate": 4.28992929167487e-05, | |
| "loss": 0.999, | |
| "num_input_tokens_seen": 284584, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7524454477050414, | |
| "grad_norm": 0.6885581612586975, | |
| "learning_rate": 4.2621822611277e-05, | |
| "loss": 0.9916, | |
| "num_input_tokens_seen": 290408, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7674943566591422, | |
| "grad_norm": 0.774027407169342, | |
| "learning_rate": 4.233996939229502e-05, | |
| "loss": 0.9242, | |
| "num_input_tokens_seen": 295776, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.782543265613243, | |
| "grad_norm": 0.8608073592185974, | |
| "learning_rate": 4.205380336255594e-05, | |
| "loss": 1.0426, | |
| "num_input_tokens_seen": 301736, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7975921745673439, | |
| "grad_norm": 0.6539498567581177, | |
| "learning_rate": 4.176339569749865e-05, | |
| "loss": 0.8625, | |
| "num_input_tokens_seen": 307224, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.8126410835214447, | |
| "grad_norm": 0.8432996273040771, | |
| "learning_rate": 4.1468818627544845e-05, | |
| "loss": 0.9959, | |
| "num_input_tokens_seen": 313040, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8276899924755455, | |
| "grad_norm": 0.877001166343689, | |
| "learning_rate": 4.11701454201339e-05, | |
| "loss": 0.939, | |
| "num_input_tokens_seen": 319112, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8427389014296464, | |
| "grad_norm": 0.9003238081932068, | |
| "learning_rate": 4.08674503614997e-05, | |
| "loss": 0.9741, | |
| "num_input_tokens_seen": 325040, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8577878103837472, | |
| "grad_norm": 0.8585950136184692, | |
| "learning_rate": 4.0560808738194114e-05, | |
| "loss": 0.98, | |
| "num_input_tokens_seen": 330904, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.872836719337848, | |
| "grad_norm": 0.8015385270118713, | |
| "learning_rate": 4.0250296818361647e-05, | |
| "loss": 0.8898, | |
| "num_input_tokens_seen": 336392, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8878856282919488, | |
| "grad_norm": 0.8380082845687866, | |
| "learning_rate": 3.993599183277001e-05, | |
| "loss": 0.953, | |
| "num_input_tokens_seen": 342832, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.9029345372460497, | |
| "grad_norm": 0.8890098929405212, | |
| "learning_rate": 3.961797195560118e-05, | |
| "loss": 0.9311, | |
| "num_input_tokens_seen": 348944, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9179834462001505, | |
| "grad_norm": 0.9356483221054077, | |
| "learning_rate": 3.9296316285007887e-05, | |
| "loss": 0.9114, | |
| "num_input_tokens_seen": 354680, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.9330323551542513, | |
| "grad_norm": 0.8241044878959656, | |
| "learning_rate": 3.897110482344024e-05, | |
| "loss": 0.9674, | |
| "num_input_tokens_seen": 361008, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9480812641083521, | |
| "grad_norm": 0.7882922887802124, | |
| "learning_rate": 3.864241845774746e-05, | |
| "loss": 0.9582, | |
| "num_input_tokens_seen": 366760, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.963130173062453, | |
| "grad_norm": 0.7503064274787903, | |
| "learning_rate": 3.8310338939059644e-05, | |
| "loss": 0.9863, | |
| "num_input_tokens_seen": 372448, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9781790820165538, | |
| "grad_norm": 0.6487952470779419, | |
| "learning_rate": 3.797494886245456e-05, | |
| "loss": 0.906, | |
| "num_input_tokens_seen": 378520, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.9932279909706546, | |
| "grad_norm": 0.8584316968917847, | |
| "learning_rate": 3.7636331646414524e-05, | |
| "loss": 0.8958, | |
| "num_input_tokens_seen": 384272, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.0060195635816402, | |
| "grad_norm": 0.8825767040252686, | |
| "learning_rate": 3.7294571512078506e-05, | |
| "loss": 0.8349, | |
| "num_input_tokens_seen": 389280, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.021068472535741, | |
| "grad_norm": 0.8422874808311462, | |
| "learning_rate": 3.694975346229458e-05, | |
| "loss": 0.8507, | |
| "num_input_tokens_seen": 394944, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.036117381489842, | |
| "grad_norm": 0.8337146639823914, | |
| "learning_rate": 3.6601963260477924e-05, | |
| "loss": 0.9287, | |
| "num_input_tokens_seen": 400800, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.0511662904439427, | |
| "grad_norm": 0.936469316482544, | |
| "learning_rate": 3.625128740927971e-05, | |
| "loss": 0.9107, | |
| "num_input_tokens_seen": 406728, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0662151993980435, | |
| "grad_norm": 0.8475446105003357, | |
| "learning_rate": 3.589781312907207e-05, | |
| "loss": 0.952, | |
| "num_input_tokens_seen": 412656, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.0812641083521444, | |
| "grad_norm": 0.7245047092437744, | |
| "learning_rate": 3.55416283362546e-05, | |
| "loss": 0.9526, | |
| "num_input_tokens_seen": 418488, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0963130173062452, | |
| "grad_norm": 1.0173735618591309, | |
| "learning_rate": 3.518282162138772e-05, | |
| "loss": 0.8775, | |
| "num_input_tokens_seen": 424192, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.111361926260346, | |
| "grad_norm": 0.9992531538009644, | |
| "learning_rate": 3.482148222715835e-05, | |
| "loss": 0.883, | |
| "num_input_tokens_seen": 430312, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.1264108352144468, | |
| "grad_norm": 1.0938397645950317, | |
| "learning_rate": 3.4457700026183374e-05, | |
| "loss": 1.0032, | |
| "num_input_tokens_seen": 436128, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.141459744168548, | |
| "grad_norm": 0.8988808989524841, | |
| "learning_rate": 3.409156549865654e-05, | |
| "loss": 0.943, | |
| "num_input_tokens_seen": 441928, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.1565086531226485, | |
| "grad_norm": 0.9952559471130371, | |
| "learning_rate": 3.3723169709844026e-05, | |
| "loss": 0.801, | |
| "num_input_tokens_seen": 447560, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.1715575620767495, | |
| "grad_norm": 0.7556662559509277, | |
| "learning_rate": 3.335260428743475e-05, | |
| "loss": 0.9294, | |
| "num_input_tokens_seen": 453296, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.1866064710308502, | |
| "grad_norm": 0.8362197279930115, | |
| "learning_rate": 3.297996139875055e-05, | |
| "loss": 0.9528, | |
| "num_input_tokens_seen": 459336, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.2016553799849512, | |
| "grad_norm": 0.9389665722846985, | |
| "learning_rate": 3.260533372782234e-05, | |
| "loss": 0.8981, | |
| "num_input_tokens_seen": 464944, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2167042889390518, | |
| "grad_norm": 1.1821860074996948, | |
| "learning_rate": 3.222881445233759e-05, | |
| "loss": 0.9823, | |
| "num_input_tokens_seen": 470992, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.2317531978931529, | |
| "grad_norm": 1.0015898942947388, | |
| "learning_rate": 3.185049722046516e-05, | |
| "loss": 0.9047, | |
| "num_input_tokens_seen": 476216, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.2468021068472535, | |
| "grad_norm": 0.8765709400177002, | |
| "learning_rate": 3.147047612756302e-05, | |
| "loss": 0.8582, | |
| "num_input_tokens_seen": 481824, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.2618510158013545, | |
| "grad_norm": 0.9712916612625122, | |
| "learning_rate": 3.10888456927748e-05, | |
| "loss": 0.8787, | |
| "num_input_tokens_seen": 487576, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.276899924755455, | |
| "grad_norm": 1.1555066108703613, | |
| "learning_rate": 3.0705700835520895e-05, | |
| "loss": 0.8729, | |
| "num_input_tokens_seen": 493336, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.2919488337095562, | |
| "grad_norm": 1.1198400259017944, | |
| "learning_rate": 3.0321136851890036e-05, | |
| "loss": 0.8772, | |
| "num_input_tokens_seen": 499760, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.3069977426636568, | |
| "grad_norm": 1.1468943357467651, | |
| "learning_rate": 2.9935249390937183e-05, | |
| "loss": 0.9451, | |
| "num_input_tokens_seen": 505400, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.3220466516177578, | |
| "grad_norm": 0.8468641042709351, | |
| "learning_rate": 2.9548134430893604e-05, | |
| "loss": 0.8202, | |
| "num_input_tokens_seen": 511760, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.3370955605718584, | |
| "grad_norm": 1.3206151723861694, | |
| "learning_rate": 2.9159888255295116e-05, | |
| "loss": 0.9773, | |
| "num_input_tokens_seen": 517616, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.3521444695259595, | |
| "grad_norm": 1.1996040344238281, | |
| "learning_rate": 2.8770607429034352e-05, | |
| "loss": 0.9101, | |
| "num_input_tokens_seen": 522744, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.36719337848006, | |
| "grad_norm": 1.1539313793182373, | |
| "learning_rate": 2.8380388774343047e-05, | |
| "loss": 0.9633, | |
| "num_input_tokens_seen": 528648, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.382242287434161, | |
| "grad_norm": 1.021848440170288, | |
| "learning_rate": 2.7989329346710375e-05, | |
| "loss": 0.8886, | |
| "num_input_tokens_seen": 534000, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.3972911963882617, | |
| "grad_norm": 0.8612179160118103, | |
| "learning_rate": 2.759752641074322e-05, | |
| "loss": 0.9258, | |
| "num_input_tokens_seen": 539688, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.4123401053423628, | |
| "grad_norm": 1.0109293460845947, | |
| "learning_rate": 2.7205077415974416e-05, | |
| "loss": 0.9039, | |
| "num_input_tokens_seen": 545112, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.4273890142964636, | |
| "grad_norm": 1.1920832395553589, | |
| "learning_rate": 2.6812079972625077e-05, | |
| "loss": 1.0116, | |
| "num_input_tokens_seen": 551328, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.4424379232505644, | |
| "grad_norm": 1.0512142181396484, | |
| "learning_rate": 2.6418631827326857e-05, | |
| "loss": 0.8218, | |
| "num_input_tokens_seen": 556816, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.4574868322046652, | |
| "grad_norm": 1.146946907043457, | |
| "learning_rate": 2.602483083881035e-05, | |
| "loss": 0.8604, | |
| "num_input_tokens_seen": 562552, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.472535741158766, | |
| "grad_norm": 1.1064790487289429, | |
| "learning_rate": 2.563077495356561e-05, | |
| "loss": 0.8044, | |
| "num_input_tokens_seen": 568480, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.487584650112867, | |
| "grad_norm": 0.9678347110748291, | |
| "learning_rate": 2.5236562181480794e-05, | |
| "loss": 0.9198, | |
| "num_input_tokens_seen": 574072, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.5026335590669677, | |
| "grad_norm": 0.9460956454277039, | |
| "learning_rate": 2.484229057146507e-05, | |
| "loss": 0.9181, | |
| "num_input_tokens_seen": 580040, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.5176824680210683, | |
| "grad_norm": 1.175920844078064, | |
| "learning_rate": 2.4448058187061835e-05, | |
| "loss": 0.8644, | |
| "num_input_tokens_seen": 586128, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.5327313769751694, | |
| "grad_norm": 1.2150397300720215, | |
| "learning_rate": 2.4053963082058244e-05, | |
| "loss": 1.0127, | |
| "num_input_tokens_seen": 592256, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.54778028592927, | |
| "grad_norm": 0.9520708918571472, | |
| "learning_rate": 2.3660103276097232e-05, | |
| "loss": 0.7937, | |
| "num_input_tokens_seen": 597704, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.562829194883371, | |
| "grad_norm": 1.0742231607437134, | |
| "learning_rate": 2.3266576730297956e-05, | |
| "loss": 0.9806, | |
| "num_input_tokens_seen": 603240, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5778781038374716, | |
| "grad_norm": 1.0484352111816406, | |
| "learning_rate": 2.2873481322890862e-05, | |
| "loss": 0.934, | |
| "num_input_tokens_seen": 609616, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.5929270127915727, | |
| "grad_norm": 0.8829598426818848, | |
| "learning_rate": 2.2480914824873297e-05, | |
| "loss": 0.9288, | |
| "num_input_tokens_seen": 615520, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.6079759217456733, | |
| "grad_norm": 0.9222884178161621, | |
| "learning_rate": 2.2088974875691863e-05, | |
| "loss": 0.8597, | |
| "num_input_tokens_seen": 621208, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.6230248306997743, | |
| "grad_norm": 0.894801914691925, | |
| "learning_rate": 2.1697758958957448e-05, | |
| "loss": 0.8817, | |
| "num_input_tokens_seen": 627176, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.6380737396538751, | |
| "grad_norm": 1.1703195571899414, | |
| "learning_rate": 2.1307364378199005e-05, | |
| "loss": 0.777, | |
| "num_input_tokens_seen": 633248, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.653122648607976, | |
| "grad_norm": 1.0596733093261719, | |
| "learning_rate": 2.0917888232662196e-05, | |
| "loss": 0.798, | |
| "num_input_tokens_seen": 639000, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.6681715575620768, | |
| "grad_norm": 1.0426228046417236, | |
| "learning_rate": 2.0529427393158705e-05, | |
| "loss": 0.9104, | |
| "num_input_tokens_seen": 645280, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.6832204665161776, | |
| "grad_norm": 1.3300392627716064, | |
| "learning_rate": 2.014207847797256e-05, | |
| "loss": 0.8293, | |
| "num_input_tokens_seen": 651760, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.6982693754702785, | |
| "grad_norm": 1.2664028406143188, | |
| "learning_rate": 1.9755937828829067e-05, | |
| "loss": 0.8821, | |
| "num_input_tokens_seen": 657272, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.7133182844243793, | |
| "grad_norm": 0.9889734983444214, | |
| "learning_rate": 1.937110148693265e-05, | |
| "loss": 0.8253, | |
| "num_input_tokens_seen": 663336, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.72836719337848, | |
| "grad_norm": 1.0789241790771484, | |
| "learning_rate": 1.8987665169079454e-05, | |
| "loss": 0.9391, | |
| "num_input_tokens_seen": 668936, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.743416102332581, | |
| "grad_norm": 1.2337504625320435, | |
| "learning_rate": 1.8605724243850502e-05, | |
| "loss": 0.8711, | |
| "num_input_tokens_seen": 675000, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.7584650112866818, | |
| "grad_norm": 0.905838668346405, | |
| "learning_rate": 1.822537370789163e-05, | |
| "loss": 0.8346, | |
| "num_input_tokens_seen": 680584, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.7735139202407826, | |
| "grad_norm": 1.1633321046829224, | |
| "learning_rate": 1.7846708162285785e-05, | |
| "loss": 0.8275, | |
| "num_input_tokens_seen": 686416, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.7885628291948834, | |
| "grad_norm": 0.9946597814559937, | |
| "learning_rate": 1.7469821789023815e-05, | |
| "loss": 0.9435, | |
| "num_input_tokens_seen": 692016, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.8036117381489842, | |
| "grad_norm": 1.0259568691253662, | |
| "learning_rate": 1.70948083275794e-05, | |
| "loss": 0.8584, | |
| "num_input_tokens_seen": 697984, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.818660647103085, | |
| "grad_norm": 1.0644334554672241, | |
| "learning_rate": 1.672176105159417e-05, | |
| "loss": 0.88, | |
| "num_input_tokens_seen": 704056, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.8337095560571859, | |
| "grad_norm": 1.0443474054336548, | |
| "learning_rate": 1.635077274567854e-05, | |
| "loss": 0.8825, | |
| "num_input_tokens_seen": 709760, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.8487584650112867, | |
| "grad_norm": 1.0267105102539062, | |
| "learning_rate": 1.5981935682334264e-05, | |
| "loss": 0.9978, | |
| "num_input_tokens_seen": 715872, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.8638073739653875, | |
| "grad_norm": 1.3127869367599487, | |
| "learning_rate": 1.561534159900441e-05, | |
| "loss": 0.9626, | |
| "num_input_tokens_seen": 722184, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.8788562829194884, | |
| "grad_norm": 1.2093840837478638, | |
| "learning_rate": 1.525108167525624e-05, | |
| "loss": 0.9308, | |
| "num_input_tokens_seen": 727776, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.8939051918735892, | |
| "grad_norm": 0.982764482498169, | |
| "learning_rate": 1.4889246510103077e-05, | |
| "loss": 0.9757, | |
| "num_input_tokens_seen": 733760, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.90895410082769, | |
| "grad_norm": 1.111680507659912, | |
| "learning_rate": 1.4529926099470348e-05, | |
| "loss": 0.767, | |
| "num_input_tokens_seen": 740024, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.9240030097817908, | |
| "grad_norm": 1.218017578125, | |
| "learning_rate": 1.4173209813811788e-05, | |
| "loss": 0.9272, | |
| "num_input_tokens_seen": 745480, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.9390519187358917, | |
| "grad_norm": 1.3443623781204224, | |
| "learning_rate": 1.381918637588112e-05, | |
| "loss": 0.7941, | |
| "num_input_tokens_seen": 751384, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.9541008276899925, | |
| "grad_norm": 0.9702039361000061, | |
| "learning_rate": 1.3467943838664863e-05, | |
| "loss": 0.8408, | |
| "num_input_tokens_seen": 756920, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.9691497366440933, | |
| "grad_norm": 1.1215064525604248, | |
| "learning_rate": 1.311956956348177e-05, | |
| "loss": 0.8459, | |
| "num_input_tokens_seen": 762424, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.9841986455981941, | |
| "grad_norm": 1.3830626010894775, | |
| "learning_rate": 1.277415019825417e-05, | |
| "loss": 1.0117, | |
| "num_input_tokens_seen": 768224, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.999247554552295, | |
| "grad_norm": 1.028895616531372, | |
| "learning_rate": 1.2431771655956925e-05, | |
| "loss": 0.9665, | |
| "num_input_tokens_seen": 773568, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 2.0120391271632805, | |
| "grad_norm": 1.1555911302566528, | |
| "learning_rate": 1.2092519093248988e-05, | |
| "loss": 0.7625, | |
| "num_input_tokens_seen": 778672, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.0270880361173815, | |
| "grad_norm": 1.037429690361023, | |
| "learning_rate": 1.1756476889293269e-05, | |
| "loss": 0.8667, | |
| "num_input_tokens_seen": 784488, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 2.042136945071482, | |
| "grad_norm": 1.053051471710205, | |
| "learning_rate": 1.1423728624769695e-05, | |
| "loss": 0.8297, | |
| "num_input_tokens_seen": 790304, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.057185854025583, | |
| "grad_norm": 1.0523649454116821, | |
| "learning_rate": 1.1094357061087033e-05, | |
| "loss": 0.8774, | |
| "num_input_tokens_seen": 796192, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 2.072234762979684, | |
| "grad_norm": 1.0367976427078247, | |
| "learning_rate": 1.0768444119798357e-05, | |
| "loss": 0.8476, | |
| "num_input_tokens_seen": 802144, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.087283671933785, | |
| "grad_norm": 1.4130756855010986, | |
| "learning_rate": 1.0446070862225463e-05, | |
| "loss": 0.8641, | |
| "num_input_tokens_seen": 807768, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 2.1023325808878854, | |
| "grad_norm": 1.1584120988845825, | |
| "learning_rate": 1.0127317469297277e-05, | |
| "loss": 0.8383, | |
| "num_input_tokens_seen": 813712, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.1173814898419865, | |
| "grad_norm": 1.2318339347839355, | |
| "learning_rate": 9.812263221607112e-06, | |
| "loss": 0.9123, | |
| "num_input_tokens_seen": 819360, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 2.132430398796087, | |
| "grad_norm": 1.6237512826919556, | |
| "learning_rate": 9.500986479694036e-06, | |
| "loss": 0.9635, | |
| "num_input_tokens_seen": 824584, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.147479307750188, | |
| "grad_norm": 1.106604814529419, | |
| "learning_rate": 9.19356466455287e-06, | |
| "loss": 0.9221, | |
| "num_input_tokens_seen": 830600, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 2.1625282167042887, | |
| "grad_norm": 0.8615310788154602, | |
| "learning_rate": 8.890074238378074e-06, | |
| "loss": 0.8757, | |
| "num_input_tokens_seen": 836856, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.17757712565839, | |
| "grad_norm": 0.8537486791610718, | |
| "learning_rate": 8.590590685545946e-06, | |
| "loss": 0.7958, | |
| "num_input_tokens_seen": 842872, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 2.1926260346124904, | |
| "grad_norm": 0.8556107878684998, | |
| "learning_rate": 8.295188493840104e-06, | |
| "loss": 0.7993, | |
| "num_input_tokens_seen": 848664, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.2076749435665914, | |
| "grad_norm": 1.093944787979126, | |
| "learning_rate": 8.003941135924858e-06, | |
| "loss": 0.8436, | |
| "num_input_tokens_seen": 854712, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 2.222723852520692, | |
| "grad_norm": 1.2639975547790527, | |
| "learning_rate": 7.71692105107098e-06, | |
| "loss": 0.896, | |
| "num_input_tokens_seen": 860648, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.237772761474793, | |
| "grad_norm": 1.177778720855713, | |
| "learning_rate": 7.434199627138602e-06, | |
| "loss": 0.8948, | |
| "num_input_tokens_seen": 866080, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 2.2528216704288937, | |
| "grad_norm": 0.9701932668685913, | |
| "learning_rate": 7.155847182821523e-06, | |
| "loss": 0.8546, | |
| "num_input_tokens_seen": 871560, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.2678705793829947, | |
| "grad_norm": 1.0232161283493042, | |
| "learning_rate": 6.881932950157538e-06, | |
| "loss": 0.8494, | |
| "num_input_tokens_seen": 877568, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 2.282919488337096, | |
| "grad_norm": 1.119441270828247, | |
| "learning_rate": 6.612525057308949e-06, | |
| "loss": 0.7723, | |
| "num_input_tokens_seen": 883808, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.2979683972911964, | |
| "grad_norm": 1.5488731861114502, | |
| "learning_rate": 6.347690511617693e-06, | |
| "loss": 0.9168, | |
| "num_input_tokens_seen": 889296, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 2.313017306245297, | |
| "grad_norm": 1.2143895626068115, | |
| "learning_rate": 6.0874951829392234e-06, | |
| "loss": 0.8831, | |
| "num_input_tokens_seen": 895120, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.328066215199398, | |
| "grad_norm": 1.157663106918335, | |
| "learning_rate": 5.832003787259327e-06, | |
| "loss": 0.854, | |
| "num_input_tokens_seen": 900320, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 2.343115124153499, | |
| "grad_norm": 1.4496403932571411, | |
| "learning_rate": 5.581279870597867e-06, | |
| "loss": 0.8843, | |
| "num_input_tokens_seen": 905928, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.3581640331075997, | |
| "grad_norm": 0.8820686936378479, | |
| "learning_rate": 5.335385793203604e-06, | |
| "loss": 0.862, | |
| "num_input_tokens_seen": 911976, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 2.3732129420617003, | |
| "grad_norm": 1.622916579246521, | |
| "learning_rate": 5.094382714043907e-06, | |
| "loss": 0.985, | |
| "num_input_tokens_seen": 917840, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.3882618510158014, | |
| "grad_norm": 1.0603710412979126, | |
| "learning_rate": 4.85833057559322e-06, | |
| "loss": 0.7679, | |
| "num_input_tokens_seen": 923168, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 2.4033107599699024, | |
| "grad_norm": 1.0989526510238647, | |
| "learning_rate": 4.627288088924156e-06, | |
| "loss": 0.8198, | |
| "num_input_tokens_seen": 928720, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.418359668924003, | |
| "grad_norm": 0.9745952486991882, | |
| "learning_rate": 4.401312719104802e-06, | |
| "loss": 0.7773, | |
| "num_input_tokens_seen": 934568, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 2.4334085778781036, | |
| "grad_norm": 1.529707670211792, | |
| "learning_rate": 4.180460670905978e-06, | |
| "loss": 0.9312, | |
| "num_input_tokens_seen": 940264, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.4484574868322047, | |
| "grad_norm": 1.2537649869918823, | |
| "learning_rate": 3.964786874821955e-06, | |
| "loss": 0.8497, | |
| "num_input_tokens_seen": 946128, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 2.4635063957863057, | |
| "grad_norm": 1.0871232748031616, | |
| "learning_rate": 3.754344973408064e-06, | |
| "loss": 0.782, | |
| "num_input_tokens_seen": 952032, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.4785553047404063, | |
| "grad_norm": 1.2940268516540527, | |
| "learning_rate": 3.5491873079387256e-06, | |
| "loss": 0.8937, | |
| "num_input_tokens_seen": 957960, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 2.493604213694507, | |
| "grad_norm": 1.2327598333358765, | |
| "learning_rate": 3.3493649053890326e-06, | |
| "loss": 0.7039, | |
| "num_input_tokens_seen": 964336, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.508653122648608, | |
| "grad_norm": 1.516093373298645, | |
| "learning_rate": 3.1549274657433375e-06, | |
| "loss": 0.9265, | |
| "num_input_tokens_seen": 970168, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 2.523702031602709, | |
| "grad_norm": 1.1418204307556152, | |
| "learning_rate": 2.9659233496337786e-06, | |
| "loss": 0.8669, | |
| "num_input_tokens_seen": 975752, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.5387509405568096, | |
| "grad_norm": 1.3584462404251099, | |
| "learning_rate": 2.7823995663120327e-06, | |
| "loss": 0.9174, | |
| "num_input_tokens_seen": 981672, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 2.55379984951091, | |
| "grad_norm": 1.1911269426345825, | |
| "learning_rate": 2.6044017619571065e-06, | |
| "loss": 0.8718, | |
| "num_input_tokens_seen": 987560, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.5688487584650113, | |
| "grad_norm": 1.3048710823059082, | |
| "learning_rate": 2.431974208322191e-06, | |
| "loss": 0.8634, | |
| "num_input_tokens_seen": 993200, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 2.5838976674191123, | |
| "grad_norm": 1.1356749534606934, | |
| "learning_rate": 2.265159791723373e-06, | |
| "loss": 0.845, | |
| "num_input_tokens_seen": 999192, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.598946576373213, | |
| "grad_norm": 1.2655149698257446, | |
| "learning_rate": 2.104000002372886e-06, | |
| "loss": 0.8008, | |
| "num_input_tokens_seen": 1004576, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 2.6139954853273135, | |
| "grad_norm": 1.354706048965454, | |
| "learning_rate": 1.9485349240596613e-06, | |
| "loss": 0.8797, | |
| "num_input_tokens_seen": 1010352, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.6290443942814146, | |
| "grad_norm": 1.0957777500152588, | |
| "learning_rate": 1.7988032241796376e-06, | |
| "loss": 0.946, | |
| "num_input_tokens_seen": 1016272, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 2.6440933032355156, | |
| "grad_norm": 1.3322904109954834, | |
| "learning_rate": 1.6548421441183875e-06, | |
| "loss": 0.8032, | |
| "num_input_tokens_seen": 1021896, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.659142212189616, | |
| "grad_norm": 1.1363080739974976, | |
| "learning_rate": 1.5166874899884053e-06, | |
| "loss": 0.8892, | |
| "num_input_tokens_seen": 1027704, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 2.674191121143717, | |
| "grad_norm": 1.2706754207611084, | |
| "learning_rate": 1.3843736237233784e-06, | |
| "loss": 0.856, | |
| "num_input_tokens_seen": 1033800, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.689240030097818, | |
| "grad_norm": 1.1934438943862915, | |
| "learning_rate": 1.2579334545316733e-06, | |
| "loss": 0.8617, | |
| "num_input_tokens_seen": 1040008, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 2.704288939051919, | |
| "grad_norm": 1.4581674337387085, | |
| "learning_rate": 1.137398430711123e-06, | |
| "loss": 0.9117, | |
| "num_input_tokens_seen": 1046272, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.7193378480060195, | |
| "grad_norm": 1.080992579460144, | |
| "learning_rate": 1.0227985318271682e-06, | |
| "loss": 0.7855, | |
| "num_input_tokens_seen": 1052032, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 2.73438675696012, | |
| "grad_norm": 1.0012861490249634, | |
| "learning_rate": 9.141622612563571e-07, | |
| "loss": 0.8212, | |
| "num_input_tokens_seen": 1057584, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.749435665914221, | |
| "grad_norm": 1.1472314596176147, | |
| "learning_rate": 8.115166390969125e-07, | |
| "loss": 0.8404, | |
| "num_input_tokens_seen": 1063760, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 2.764484574868322, | |
| "grad_norm": 1.2558523416519165, | |
| "learning_rate": 7.148871954483105e-07, | |
| "loss": 0.7782, | |
| "num_input_tokens_seen": 1069544, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.779533483822423, | |
| "grad_norm": 1.1380338668823242, | |
| "learning_rate": 6.242979640613933e-07, | |
| "loss": 0.7847, | |
| "num_input_tokens_seen": 1075472, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 2.7945823927765234, | |
| "grad_norm": 0.972878098487854, | |
| "learning_rate": 5.397714763606843e-07, | |
| "loss": 0.8857, | |
| "num_input_tokens_seen": 1081464, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.8096313017306245, | |
| "grad_norm": 1.2546579837799072, | |
| "learning_rate": 4.613287558403512e-07, | |
| "loss": 0.8029, | |
| "num_input_tokens_seen": 1087464, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 2.8246802106847255, | |
| "grad_norm": 1.1165034770965576, | |
| "learning_rate": 3.8898931283523344e-07, | |
| "loss": 0.8154, | |
| "num_input_tokens_seen": 1092888, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.839729119638826, | |
| "grad_norm": 1.3924362659454346, | |
| "learning_rate": 3.227711396682015e-07, | |
| "loss": 0.8791, | |
| "num_input_tokens_seen": 1098808, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 2.854778028592927, | |
| "grad_norm": 1.021448016166687, | |
| "learning_rate": 2.626907061751116e-07, | |
| "loss": 0.787, | |
| "num_input_tokens_seen": 1104688, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.869826937547028, | |
| "grad_norm": 1.3344382047653198, | |
| "learning_rate": 2.0876295560839364e-07, | |
| "loss": 0.8831, | |
| "num_input_tokens_seen": 1110960, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 2.884875846501129, | |
| "grad_norm": 1.3956490755081177, | |
| "learning_rate": 1.6100130092037703e-07, | |
| "loss": 0.7677, | |
| "num_input_tokens_seen": 1116800, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.8999247554552294, | |
| "grad_norm": 1.1644206047058105, | |
| "learning_rate": 1.194176214271897e-07, | |
| "loss": 0.7567, | |
| "num_input_tokens_seen": 1122248, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 2.9149736644093305, | |
| "grad_norm": 1.2540746927261353, | |
| "learning_rate": 8.402225985413848e-08, | |
| "loss": 0.8944, | |
| "num_input_tokens_seen": 1127928, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.930022573363431, | |
| "grad_norm": 1.1684881448745728, | |
| "learning_rate": 5.4824019763252685e-08, | |
| "loss": 0.9737, | |
| "num_input_tokens_seen": 1133336, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.945071482317532, | |
| "grad_norm": 1.072198510169983, | |
| "learning_rate": 3.1830163363655296e-08, | |
| "loss": 0.8965, | |
| "num_input_tokens_seen": 1139048, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.9601203912716327, | |
| "grad_norm": 1.7171086072921753, | |
| "learning_rate": 1.504640970531046e-08, | |
| "loss": 0.837, | |
| "num_input_tokens_seen": 1144456, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 2.975169300225734, | |
| "grad_norm": 1.4984806776046753, | |
| "learning_rate": 4.4769332565558485e-09, | |
| "loss": 0.7812, | |
| "num_input_tokens_seen": 1150160, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.9902182091798344, | |
| "grad_norm": 1.2322272062301636, | |
| "learning_rate": 1.2436286584982527e-10, | |
| "loss": 0.8613, | |
| "num_input_tokens_seen": 1156704, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 2.9932279909706545, | |
| "num_input_tokens_seen": 1157808, | |
| "step": 996, | |
| "total_flos": 1.3788411572404224e+16, | |
| "train_loss": 0.939127180590687, | |
| "train_runtime": 10484.6402, | |
| "train_samples_per_second": 0.761, | |
| "train_steps_per_second": 0.095 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 996, | |
| "num_input_tokens_seen": 1157808, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3788411572404224e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |