| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.998003992015968, |
| "eval_steps": 50, |
| "global_step": 375, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01330671989354624, |
| "grad_norm": 38.0893895691, |
| "learning_rate": 2.631578947368421e-06, |
| "loss": 9.4719, |
| "mean_token_accuracy": 0.6992475613951683, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02661343978709248, |
| "grad_norm": 53.38062892110183, |
| "learning_rate": 5.263157894736842e-06, |
| "loss": 9.2617, |
| "mean_token_accuracy": 0.7013110458850861, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03992015968063872, |
| "grad_norm": 65.59379133263475, |
| "learning_rate": 7.894736842105265e-06, |
| "loss": 8.8052, |
| "mean_token_accuracy": 0.7065529704093934, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05322687957418496, |
| "grad_norm": 30.823233386013126, |
| "learning_rate": 1.0526315789473684e-05, |
| "loss": 8.1585, |
| "mean_token_accuracy": 0.719629879295826, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0665335994677312, |
| "grad_norm": 12.66541979902211, |
| "learning_rate": 1.3157894736842108e-05, |
| "loss": 7.3748, |
| "mean_token_accuracy": 0.740014499425888, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.07984031936127745, |
| "grad_norm": 10.709230045785015, |
| "learning_rate": 1.578947368421053e-05, |
| "loss": 7.1481, |
| "mean_token_accuracy": 0.7436378166079521, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.09314703925482369, |
| "grad_norm": 7.432960167563066, |
| "learning_rate": 1.8421052631578947e-05, |
| "loss": 6.9424, |
| "mean_token_accuracy": 0.7486263796687126, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.10645375914836992, |
| "grad_norm": 10.045989605681791, |
| "learning_rate": 1.9998261969639324e-05, |
| "loss": 6.9028, |
| "mean_token_accuracy": 0.7475503668189049, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.11976047904191617, |
| "grad_norm": 15.965640756107303, |
| "learning_rate": 1.9978716065702566e-05, |
| "loss": 7.2718, |
| "mean_token_accuracy": 0.7349641278386116, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.1330671989354624, |
| "grad_norm": 27.77458817629002, |
| "learning_rate": 1.9937494319239112e-05, |
| "loss": 7.3623, |
| "mean_token_accuracy": 0.7310615047812462, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1330671989354624, |
| "eval_loss": 1.008537769317627, |
| "eval_mean_token_accuracy": 0.7182760106192695, |
| "eval_runtime": 42.3882, |
| "eval_samples_per_second": 3.397, |
| "eval_steps_per_second": 0.425, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.14637391882900866, |
| "grad_norm": 30.81470758273484, |
| "learning_rate": 1.9874686272438467e-05, |
| "loss": 7.2943, |
| "mean_token_accuracy": 0.7337011635303498, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1596806387225549, |
| "grad_norm": 22.17371294352614, |
| "learning_rate": 1.979042835741503e-05, |
| "loss": 7.104, |
| "mean_token_accuracy": 0.7383959114551544, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.17298735861610112, |
| "grad_norm": 27.90713548690936, |
| "learning_rate": 1.968490359984923e-05, |
| "loss": 7.1833, |
| "mean_token_accuracy": 0.7363018915057182, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.18629407850964738, |
| "grad_norm": 9.001486401205879, |
| "learning_rate": 1.9558341221417744e-05, |
| "loss": 7.05, |
| "mean_token_accuracy": 0.740586844086647, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1996007984031936, |
| "grad_norm": 13.898951307519155, |
| "learning_rate": 1.9411016141876438e-05, |
| "loss": 7.0786, |
| "mean_token_accuracy": 0.7393299728631973, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.21290751829673984, |
| "grad_norm": 44.17413162889863, |
| "learning_rate": 1.9243248381877605e-05, |
| "loss": 7.513, |
| "mean_token_accuracy": 0.7232646465301513, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2262142381902861, |
| "grad_norm": 43.276281867208816, |
| "learning_rate": 1.9055402367818673e-05, |
| "loss": 7.2214, |
| "mean_token_accuracy": 0.7344184964895248, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.23952095808383234, |
| "grad_norm": 22.007621479836395, |
| "learning_rate": 1.8847886140232438e-05, |
| "loss": 7.1625, |
| "mean_token_accuracy": 0.735144229233265, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2528276779773786, |
| "grad_norm": 77.4740488466291, |
| "learning_rate": 1.862115046743831e-05, |
| "loss": 7.5932, |
| "mean_token_accuracy": 0.722845695912838, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.2661343978709248, |
| "grad_norm": 1066.7215003063964, |
| "learning_rate": 1.8375687866379988e-05, |
| "loss": 7.4423, |
| "mean_token_accuracy": 0.7269001781940461, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2661343978709248, |
| "eval_loss": 1.0586838722229004, |
| "eval_mean_token_accuracy": 0.6958943770991431, |
| "eval_runtime": 42.4569, |
| "eval_samples_per_second": 3.392, |
| "eval_steps_per_second": 0.424, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.27944111776447106, |
| "grad_norm": 923.4336165050305, |
| "learning_rate": 1.811203153277641e-05, |
| "loss": 8.4501, |
| "mean_token_accuracy": 0.6929014056921006, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.2927478376580173, |
| "grad_norm": 10253.700005495137, |
| "learning_rate": 1.7830754182909985e-05, |
| "loss": 11.581, |
| "mean_token_accuracy": 0.6142643451690674, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3060545575515635, |
| "grad_norm": 11508.216656593022, |
| "learning_rate": 1.753246680956795e-05, |
| "loss": 15.7105, |
| "mean_token_accuracy": 0.5149824447929859, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.3193612774451098, |
| "grad_norm": 6036.292381336853, |
| "learning_rate": 1.721781735483921e-05, |
| "loss": 26.2876, |
| "mean_token_accuracy": 0.33059981614351275, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.33266799733865604, |
| "grad_norm": 26623.625140159445, |
| "learning_rate": 1.6887489302649657e-05, |
| "loss": 30.2414, |
| "mean_token_accuracy": 0.26836080476641655, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.34597471723220224, |
| "grad_norm": 213666.71326672958, |
| "learning_rate": 1.654220019409317e-05, |
| "loss": 36.7917, |
| "mean_token_accuracy": 0.20126449912786484, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.3592814371257485, |
| "grad_norm": 281204.55298403726, |
| "learning_rate": 1.6182700068783463e-05, |
| "loss": 53.894, |
| "mean_token_accuracy": 0.08266483591869474, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.37258815701929476, |
| "grad_norm": 274791.48096197617, |
| "learning_rate": 1.580976983561235e-05, |
| "loss": 58.3125, |
| "mean_token_accuracy": 0.06163843311369419, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.38589487691284097, |
| "grad_norm": 9564.75792140504, |
| "learning_rate": 1.5424219576453526e-05, |
| "loss": 45.3478, |
| "mean_token_accuracy": 0.12734813932329417, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3992015968063872, |
| "grad_norm": 6120.107227530501, |
| "learning_rate": 1.5026886786496624e-05, |
| "loss": 42.2261, |
| "mean_token_accuracy": 0.1591544572263956, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3992015968063872, |
| "eval_loss": 4.92734432220459, |
| "eval_mean_token_accuracy": 0.1774691359864341, |
| "eval_runtime": 42.2395, |
| "eval_samples_per_second": 3.409, |
| "eval_steps_per_second": 0.426, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4125083166999335, |
| "grad_norm": 13441.595281708549, |
| "learning_rate": 1.46186345550338e-05, |
| "loss": 32.8561, |
| "mean_token_accuracy": 0.23304792679846287, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.4258150365934797, |
| "grad_norm": 4366.951799070855, |
| "learning_rate": 1.4200349690650654e-05, |
| "loss": 26.1181, |
| "mean_token_accuracy": 0.3177220694720745, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.43912175648702595, |
| "grad_norm": 8486.995137743008, |
| "learning_rate": 1.3772940794893916e-05, |
| "loss": 28.5985, |
| "mean_token_accuracy": 0.28858516551554203, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.4524284763805722, |
| "grad_norm": 4760.402708498517, |
| "learning_rate": 1.3337336288600297e-05, |
| "loss": 24.7618, |
| "mean_token_accuracy": 0.3370695985853672, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.4657351962741184, |
| "grad_norm": 1237.143262994326, |
| "learning_rate": 1.2894482395173695e-05, |
| "loss": 17.015, |
| "mean_token_accuracy": 0.4780235022306442, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.47904191616766467, |
| "grad_norm": 408.8332305447992, |
| "learning_rate": 1.24453410851916e-05, |
| "loss": 15.019, |
| "mean_token_accuracy": 0.516390411555767, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.49234863606121093, |
| "grad_norm": 311.65890181237427, |
| "learning_rate": 1.1990887986805295e-05, |
| "loss": 13.0538, |
| "mean_token_accuracy": 0.5649969473481178, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.5056553559547572, |
| "grad_norm": 218.78464454962847, |
| "learning_rate": 1.1532110266473026e-05, |
| "loss": 11.4017, |
| "mean_token_accuracy": 0.6076564386487007, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5189620758483033, |
| "grad_norm": 248.3901525774538, |
| "learning_rate": 1.1070004484629543e-05, |
| "loss": 10.3675, |
| "mean_token_accuracy": 0.6390743300318718, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.5322687957418496, |
| "grad_norm": 68.79121302231147, |
| "learning_rate": 1.0605574430949983e-05, |
| "loss": 9.2733, |
| "mean_token_accuracy": 0.6695673123002053, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5322687957418496, |
| "eval_loss": 1.2745658159255981, |
| "eval_mean_token_accuracy": 0.6473477118545108, |
| "eval_runtime": 42.3799, |
| "eval_samples_per_second": 3.398, |
| "eval_steps_per_second": 0.425, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5455755156353959, |
| "grad_norm": 61.494613995295225, |
| "learning_rate": 1.0139828943910358e-05, |
| "loss": 8.6282, |
| "mean_token_accuracy": 0.6900610521435737, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.5588822355289421, |
| "grad_norm": 223.47361816320114, |
| "learning_rate": 9.673779719380967e-06, |
| "loss": 8.8734, |
| "mean_token_accuracy": 0.6839690148830414, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5721889554224884, |
| "grad_norm": 311.485197048925, |
| "learning_rate": 9.208439113012984e-06, |
| "loss": 9.6346, |
| "mean_token_accuracy": 0.6596978038549424, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.5854956753160346, |
| "grad_norm": 208.95277149419533, |
| "learning_rate": 8.744817941191862e-06, |
| "loss": 9.8742, |
| "mean_token_accuracy": 0.6521520212292671, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5988023952095808, |
| "grad_norm": 99.55141840914388, |
| "learning_rate": 8.283923285334304e-06, |
| "loss": 10.0645, |
| "mean_token_accuracy": 0.6457211509346962, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.612109115103127, |
| "grad_norm": 113.01850407325877, |
| "learning_rate": 7.826756304298428e-06, |
| "loss": 9.6991, |
| "mean_token_accuracy": 0.6567307710647583, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.6254158349966733, |
| "grad_norm": 122.00372512450222, |
| "learning_rate": 7.3743100596589e-06, |
| "loss": 9.3977, |
| "mean_token_accuracy": 0.6660859316587449, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.6387225548902196, |
| "grad_norm": 141.28653030104314, |
| "learning_rate": 6.92756735857107e-06, |
| "loss": 9.916, |
| "mean_token_accuracy": 0.6507371798157692, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6520292747837658, |
| "grad_norm": 807.7008160726642, |
| "learning_rate": 6.487498618909845e-06, |
| "loss": 9.8794, |
| "mean_token_accuracy": 0.6521420940756798, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.6653359946773121, |
| "grad_norm": 12179.703084325536, |
| "learning_rate": 6.0550597613206205e-06, |
| "loss": 10.3914, |
| "mean_token_accuracy": 0.6389522299170494, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6653359946773121, |
| "eval_loss": 1.7774240970611572, |
| "eval_mean_token_accuracy": 0.5449769298235575, |
| "eval_runtime": 42.2375, |
| "eval_samples_per_second": 3.409, |
| "eval_steps_per_second": 0.426, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6786427145708582, |
| "grad_norm": 1054.76927310761, |
| "learning_rate": 5.631190132761247e-06, |
| "loss": 11.7133, |
| "mean_token_accuracy": 0.5997588485479355, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.6919494344644045, |
| "grad_norm": 1424.9333125346188, |
| "learning_rate": 5.216810466045448e-06, |
| "loss": 12.5735, |
| "mean_token_accuracy": 0.5747481673955918, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.7052561543579507, |
| "grad_norm": 798.7973283165478, |
| "learning_rate": 4.812820879820034e-06, |
| "loss": 13.4974, |
| "mean_token_accuracy": 0.5521100461483002, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.718562874251497, |
| "grad_norm": 1790.9039210107235, |
| "learning_rate": 4.420098923320378e-06, |
| "loss": 14.4898, |
| "mean_token_accuracy": 0.5296176724135876, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.7318695941450433, |
| "grad_norm": 829.5583452937811, |
| "learning_rate": 4.0394976701513235e-06, |
| "loss": 14.5873, |
| "mean_token_accuracy": 0.5268749997019768, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.7451763140385895, |
| "grad_norm": 6150.368162782374, |
| "learning_rate": 3.671843865234238e-06, |
| "loss": 14.6091, |
| "mean_token_accuracy": 0.5281791850924492, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7584830339321357, |
| "grad_norm": 29207.957135047905, |
| "learning_rate": 3.3179361289454694e-06, |
| "loss": 16.9682, |
| "mean_token_accuracy": 0.48243742287158964, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.7717897538256819, |
| "grad_norm": 54201.045662727825, |
| "learning_rate": 2.978543222347076e-06, |
| "loss": 20.7529, |
| "mean_token_accuracy": 0.4138728640973568, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.7850964737192282, |
| "grad_norm": 23857.53870983074, |
| "learning_rate": 2.6544023772782736e-06, |
| "loss": 21.3047, |
| "mean_token_accuracy": 0.403152472525835, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.7984031936127745, |
| "grad_norm": 11423.97832396548, |
| "learning_rate": 2.346217694934847e-06, |
| "loss": 20.4021, |
| "mean_token_accuracy": 0.41206730976700784, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7984031936127745, |
| "eval_loss": 3.0904834270477295, |
| "eval_mean_token_accuracy": 0.34269434379206765, |
| "eval_runtime": 42.4109, |
| "eval_samples_per_second": 3.395, |
| "eval_steps_per_second": 0.424, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8117099135063207, |
| "grad_norm": 6555.658931606807, |
| "learning_rate": 2.0546586164151827e-06, |
| "loss": 19.3343, |
| "mean_token_accuracy": 0.42890567928552625, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.825016633399867, |
| "grad_norm": 8596.180267406271, |
| "learning_rate": 1.7803584685552877e-06, |
| "loss": 19.1283, |
| "mean_token_accuracy": 0.4296626977622509, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.8383233532934131, |
| "grad_norm": 7559.729843914057, |
| "learning_rate": 1.523913088211415e-06, |
| "loss": 19.9312, |
| "mean_token_accuracy": 0.4130441091954708, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.8516300731869594, |
| "grad_norm": 6283.0641437588, |
| "learning_rate": 1.2858795279787517e-06, |
| "loss": 20.0128, |
| "mean_token_accuracy": 0.40999465957283976, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.8649367930805056, |
| "grad_norm": 4627.199331551124, |
| "learning_rate": 1.0667748461575544e-06, |
| "loss": 20.2021, |
| "mean_token_accuracy": 0.40563797727227213, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.8782435129740519, |
| "grad_norm": 3896.885861785845, |
| "learning_rate": 8.670749835951964e-07, |
| "loss": 20.3633, |
| "mean_token_accuracy": 0.40211157202720643, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.8915502328675982, |
| "grad_norm": 3649.4556728802945, |
| "learning_rate": 6.872137298438653e-07, |
| "loss": 20.6856, |
| "mean_token_accuracy": 0.39633470848202706, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.9048569527611444, |
| "grad_norm": 5894.774814516487, |
| "learning_rate": 5.275817808796013e-07, |
| "loss": 21.1202, |
| "mean_token_accuracy": 0.38578067943453787, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.9181636726546906, |
| "grad_norm": 4756.426577271269, |
| "learning_rate": 3.885258904295575e-07, |
| "loss": 21.8543, |
| "mean_token_accuracy": 0.37522283270955087, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.9314703925482368, |
| "grad_norm": 11889.44146408838, |
| "learning_rate": 2.703481167509281e-07, |
| "loss": 22.5845, |
| "mean_token_accuracy": 0.36378281489014624, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9314703925482368, |
| "eval_loss": 3.6812663078308105, |
| "eval_mean_token_accuracy": 0.2753951284620497, |
| "eval_runtime": 42.3376, |
| "eval_samples_per_second": 3.401, |
| "eval_steps_per_second": 0.425, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9447771124417831, |
| "grad_norm": 6803.573866074052, |
| "learning_rate": 1.73305166497707e-07, |
| "loss": 22.5139, |
| "mean_token_accuracy": 0.36300976797938345, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.9580838323353293, |
| "grad_norm": 7457.921947626093, |
| "learning_rate": 9.760783710056176e-08, |
| "loss": 22.5036, |
| "mean_token_accuracy": 0.3649313189089298, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.9713905522288756, |
| "grad_norm": 8489.33440106548, |
| "learning_rate": 4.3420558871060116e-08, |
| "loss": 22.7636, |
| "mean_token_accuracy": 0.35811431556940077, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.9846972721224219, |
| "grad_norm": 10404.8253118789, |
| "learning_rate": 1.0861037824896337e-08, |
| "loss": 22.8268, |
| "mean_token_accuracy": 0.3570818044245243, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.998003992015968, |
| "grad_norm": 7227.984198697766, |
| "learning_rate": 0.0, |
| "loss": 22.9385, |
| "mean_token_accuracy": 0.3560729533433914, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.998003992015968, |
| "step": 375, |
| "total_flos": 3.616227566899167e+18, |
| "train_loss": 16.595592213948567, |
| "train_runtime": 29081.677, |
| "train_samples_per_second": 0.827, |
| "train_steps_per_second": 0.013 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 375, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.616227566899167e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|