| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.7220420251022422, |
| "eval_steps": 500, |
| "global_step": 1280, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0028204766605556338, |
| "grad_norm": 12.768223762512207, |
| "learning_rate": 1.8691588785046728e-06, |
| "loss": 1.538864517211914, |
| "mean_token_accuracy": 0.6813269466161728, |
| "num_tokens": 5217337.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0056409533211112676, |
| "grad_norm": 3.0813796520233154, |
| "learning_rate": 4.205607476635514e-06, |
| "loss": 1.3301036834716797, |
| "mean_token_accuracy": 0.6994958072900772, |
| "num_tokens": 10428364.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.008461429981666902, |
| "grad_norm": 3.0398967266082764, |
| "learning_rate": 6.542056074766355e-06, |
| "loss": 1.0892114639282227, |
| "mean_token_accuracy": 0.7210752069950104, |
| "num_tokens": 15653051.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.011281906642222535, |
| "grad_norm": 1.0566803216934204, |
| "learning_rate": 8.878504672897196e-06, |
| "loss": 1.0293330192565917, |
| "mean_token_accuracy": 0.726907679438591, |
| "num_tokens": 20879685.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.01410238330277817, |
| "grad_norm": 0.6501312255859375, |
| "learning_rate": 1.1214953271028037e-05, |
| "loss": 0.980504322052002, |
| "mean_token_accuracy": 0.7339713454246521, |
| "num_tokens": 26070702.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.016922859963333804, |
| "grad_norm": 0.5271162390708923, |
| "learning_rate": 1.3551401869158877e-05, |
| "loss": 0.9426581382751464, |
| "mean_token_accuracy": 0.7401848375797272, |
| "num_tokens": 31272328.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.019743336623889437, |
| "grad_norm": 0.46021828055381775, |
| "learning_rate": 1.588785046728972e-05, |
| "loss": 0.9113862991333008, |
| "mean_token_accuracy": 0.7477431446313858, |
| "num_tokens": 36488067.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.02256381328444507, |
| "grad_norm": 0.3989814817905426, |
| "learning_rate": 1.822429906542056e-05, |
| "loss": 0.9023752212524414, |
| "mean_token_accuracy": 0.748314619064331, |
| "num_tokens": 41717937.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.025384289945000704, |
| "grad_norm": 0.3570704460144043, |
| "learning_rate": 2.05607476635514e-05, |
| "loss": 0.8852872848510742, |
| "mean_token_accuracy": 0.7515153616666794, |
| "num_tokens": 46942356.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.02820476660555634, |
| "grad_norm": 0.3649837076663971, |
| "learning_rate": 2.2897196261682244e-05, |
| "loss": 0.8809089660644531, |
| "mean_token_accuracy": 0.752334040403366, |
| "num_tokens": 52171635.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.031025243266111974, |
| "grad_norm": 0.3402465879917145, |
| "learning_rate": 2.5233644859813084e-05, |
| "loss": 0.8583005905151367, |
| "mean_token_accuracy": 0.7575921297073365, |
| "num_tokens": 57376988.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.03384571992666761, |
| "grad_norm": 0.3836044669151306, |
| "learning_rate": 2.7570093457943924e-05, |
| "loss": 0.8582182884216308, |
| "mean_token_accuracy": 0.7565034329891205, |
| "num_tokens": 62588530.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.03666619658722324, |
| "grad_norm": 0.5010499954223633, |
| "learning_rate": 2.9906542056074764e-05, |
| "loss": 0.8590673446655274, |
| "mean_token_accuracy": 0.7561922013759613, |
| "num_tokens": 67790875.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.039486673247778874, |
| "grad_norm": 0.46556833386421204, |
| "learning_rate": 3.224299065420561e-05, |
| "loss": 0.8575173377990722, |
| "mean_token_accuracy": 0.756476667523384, |
| "num_tokens": 72993985.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.04230714990833451, |
| "grad_norm": 0.5273818373680115, |
| "learning_rate": 3.457943925233645e-05, |
| "loss": 0.8381841659545899, |
| "mean_token_accuracy": 0.7609980225563049, |
| "num_tokens": 78201847.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.04512762656889014, |
| "grad_norm": 0.3794383704662323, |
| "learning_rate": 3.691588785046729e-05, |
| "loss": 0.8465839385986328, |
| "mean_token_accuracy": 0.7580922782421112, |
| "num_tokens": 83386756.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.047948103229445774, |
| "grad_norm": 0.436827689409256, |
| "learning_rate": 3.925233644859813e-05, |
| "loss": 0.8467378616333008, |
| "mean_token_accuracy": 0.7579659283161163, |
| "num_tokens": 88612512.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.05076857989000141, |
| "grad_norm": 0.5298004746437073, |
| "learning_rate": 4.1588785046728974e-05, |
| "loss": 0.8391071319580078, |
| "mean_token_accuracy": 0.7598621785640717, |
| "num_tokens": 93826207.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.05358905655055705, |
| "grad_norm": 0.7562506198883057, |
| "learning_rate": 4.392523364485982e-05, |
| "loss": 0.8302077293395996, |
| "mean_token_accuracy": 0.7620529294013977, |
| "num_tokens": 99052664.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.05640953321111268, |
| "grad_norm": 0.7646096348762512, |
| "learning_rate": 4.6261682242990654e-05, |
| "loss": 0.8623744964599609, |
| "mean_token_accuracy": 0.7532553136348724, |
| "num_tokens": 104260632.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.059230009871668314, |
| "grad_norm": 0.758683979511261, |
| "learning_rate": 4.85981308411215e-05, |
| "loss": 0.8187331199645996, |
| "mean_token_accuracy": 0.764651182293892, |
| "num_tokens": 109476727.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.06205048653222395, |
| "grad_norm": 0.7841841578483582, |
| "learning_rate": 4.9999822205123904e-05, |
| "loss": 0.8433048248291015, |
| "mean_token_accuracy": 0.7578113496303558, |
| "num_tokens": 114695617.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.06487096319277957, |
| "grad_norm": 0.41629040241241455, |
| "learning_rate": 4.999782204181027e-05, |
| "loss": 0.8195880889892578, |
| "mean_token_accuracy": 0.7641658455133438, |
| "num_tokens": 119922657.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.06769143985333521, |
| "grad_norm": 0.5381537079811096, |
| "learning_rate": 4.999359964998888e-05, |
| "loss": 0.8174989700317383, |
| "mean_token_accuracy": 0.7642888396978378, |
| "num_tokens": 125118655.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.07051191651389085, |
| "grad_norm": 0.5138171315193176, |
| "learning_rate": 4.99871554050172e-05, |
| "loss": 0.8351571083068847, |
| "mean_token_accuracy": 0.7588716298341751, |
| "num_tokens": 130323342.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.07333239317444648, |
| "grad_norm": 0.41433724761009216, |
| "learning_rate": 4.997848987976854e-05, |
| "loss": 0.8338793754577637, |
| "mean_token_accuracy": 0.7594460546970367, |
| "num_tokens": 135520292.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.07615286983500212, |
| "grad_norm": 0.45325490832328796, |
| "learning_rate": 4.99676038445811e-05, |
| "loss": 0.8269057273864746, |
| "mean_token_accuracy": 0.7616348803043366, |
| "num_tokens": 140739854.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.07897334649555775, |
| "grad_norm": 0.5383926033973694, |
| "learning_rate": 4.995449826718951e-05, |
| "loss": 0.8304360389709473, |
| "mean_token_accuracy": 0.7600398540496827, |
| "num_tokens": 145944767.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.08179382315611339, |
| "grad_norm": 0.37939369678497314, |
| "learning_rate": 4.993917431263875e-05, |
| "loss": 0.8136066436767578, |
| "mean_token_accuracy": 0.764724999666214, |
| "num_tokens": 151174397.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.08461429981666901, |
| "grad_norm": 0.48885253071784973, |
| "learning_rate": 4.9921633343180654e-05, |
| "loss": 0.8315029144287109, |
| "mean_token_accuracy": 0.7593274593353272, |
| "num_tokens": 156395789.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.08743477647722465, |
| "grad_norm": 0.552020788192749, |
| "learning_rate": 4.9901876918152766e-05, |
| "loss": 0.8077513694763183, |
| "mean_token_accuracy": 0.7656220257282257, |
| "num_tokens": 161596039.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.09025525313778028, |
| "grad_norm": 0.4785487949848175, |
| "learning_rate": 4.9879906793839725e-05, |
| "loss": 0.8288763046264649, |
| "mean_token_accuracy": 0.7598470091819763, |
| "num_tokens": 166823527.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.09307572979833592, |
| "grad_norm": 0.5288258194923401, |
| "learning_rate": 4.985572492331715e-05, |
| "loss": 0.8152419090270996, |
| "mean_token_accuracy": 0.7636227786540986, |
| "num_tokens": 172045905.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.09589620645889155, |
| "grad_norm": 0.39211970567703247, |
| "learning_rate": 4.9829333456277985e-05, |
| "loss": 0.8113995552062988, |
| "mean_token_accuracy": 0.7643435508012771, |
| "num_tokens": 177218911.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.09871668311944719, |
| "grad_norm": 0.39555636048316956, |
| "learning_rate": 4.980073473884145e-05, |
| "loss": 0.8115266799926758, |
| "mean_token_accuracy": 0.7643254607915878, |
| "num_tokens": 182443958.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.10153715978000281, |
| "grad_norm": 0.4641428589820862, |
| "learning_rate": 4.976993131334443e-05, |
| "loss": 0.8158926010131836, |
| "mean_token_accuracy": 0.7630140006542205, |
| "num_tokens": 187669516.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.10435763644055845, |
| "grad_norm": 0.4861908555030823, |
| "learning_rate": 4.973692591811549e-05, |
| "loss": 0.8149589538574219, |
| "mean_token_accuracy": 0.7634222060441971, |
| "num_tokens": 192876653.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.1071781131011141, |
| "grad_norm": 0.4445413649082184, |
| "learning_rate": 4.970172148723146e-05, |
| "loss": 0.813404655456543, |
| "mean_token_accuracy": 0.762807947397232, |
| "num_tokens": 198051477.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.10999858976166972, |
| "grad_norm": 0.34663382172584534, |
| "learning_rate": 4.966432115025658e-05, |
| "loss": 0.8136863708496094, |
| "mean_token_accuracy": 0.7636684775352478, |
| "num_tokens": 203269094.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.11281906642222536, |
| "grad_norm": 0.45725521445274353, |
| "learning_rate": 4.9624728231964285e-05, |
| "loss": 0.798128318786621, |
| "mean_token_accuracy": 0.7675120651721954, |
| "num_tokens": 208457917.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.11563954308278099, |
| "grad_norm": 0.38955095410346985, |
| "learning_rate": 4.958294625204168e-05, |
| "loss": 0.7989690780639649, |
| "mean_token_accuracy": 0.767362242937088, |
| "num_tokens": 213637288.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.11846001974333663, |
| "grad_norm": 0.3952544629573822, |
| "learning_rate": 4.9538978924776634e-05, |
| "loss": 0.8036691665649414, |
| "mean_token_accuracy": 0.7659778416156768, |
| "num_tokens": 218849412.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.12128049640389225, |
| "grad_norm": 0.4218426048755646, |
| "learning_rate": 4.949283015872757e-05, |
| "loss": 0.7949204921722413, |
| "mean_token_accuracy": 0.7678728461265564, |
| "num_tokens": 224068354.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.1241009730644479, |
| "grad_norm": 0.36846643686294556, |
| "learning_rate": 4.944450405637602e-05, |
| "loss": 0.8046231269836426, |
| "mean_token_accuracy": 0.7653253704309464, |
| "num_tokens": 229284008.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.12692144972500352, |
| "grad_norm": 0.43659818172454834, |
| "learning_rate": 4.939400491376195e-05, |
| "loss": 0.7970304965972901, |
| "mean_token_accuracy": 0.7672561138868332, |
| "num_tokens": 234509307.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.12974192638555915, |
| "grad_norm": 0.5250388979911804, |
| "learning_rate": 4.934133722010183e-05, |
| "loss": 0.806855583190918, |
| "mean_token_accuracy": 0.7648405969142914, |
| "num_tokens": 239715541.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.1325624030461148, |
| "grad_norm": 0.36886003613471985, |
| "learning_rate": 4.928650565738955e-05, |
| "loss": 0.7958408832550049, |
| "mean_token_accuracy": 0.7676242113113403, |
| "num_tokens": 244915777.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.13538287970667043, |
| "grad_norm": 0.3311154544353485, |
| "learning_rate": 4.922951509998023e-05, |
| "loss": 0.8041030883789062, |
| "mean_token_accuracy": 0.7651675373315812, |
| "num_tokens": 250143741.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.13820335636722605, |
| "grad_norm": 0.35112565755844116, |
| "learning_rate": 4.9170370614156896e-05, |
| "loss": 0.7944831848144531, |
| "mean_token_accuracy": 0.7675338089466095, |
| "num_tokens": 255364058.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.1410238330277817, |
| "grad_norm": 0.35654959082603455, |
| "learning_rate": 4.910907745768009e-05, |
| "loss": 0.7977266311645508, |
| "mean_token_accuracy": 0.7664741456508637, |
| "num_tokens": 260587004.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.14384430968833734, |
| "grad_norm": 0.3724896311759949, |
| "learning_rate": 4.9045641079320484e-05, |
| "loss": 0.7852715492248535, |
| "mean_token_accuracy": 0.7702670186758042, |
| "num_tokens": 265753862.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.14666478634889296, |
| "grad_norm": 0.43565183877944946, |
| "learning_rate": 4.898006711837449e-05, |
| "loss": 0.800434684753418, |
| "mean_token_accuracy": 0.7659019708633423, |
| "num_tokens": 270958142.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.1494852630094486, |
| "grad_norm": 0.43222641944885254, |
| "learning_rate": 4.8912361404162987e-05, |
| "loss": 0.7894124984741211, |
| "mean_token_accuracy": 0.7686454892158509, |
| "num_tokens": 276173659.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.15230573967000424, |
| "grad_norm": 0.37449970841407776, |
| "learning_rate": 4.884252995551305e-05, |
| "loss": 0.777103042602539, |
| "mean_token_accuracy": 0.7725787729024887, |
| "num_tokens": 281389672.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.15512621633055987, |
| "grad_norm": 0.38641855120658875, |
| "learning_rate": 4.877057898022291e-05, |
| "loss": 0.7899458408355713, |
| "mean_token_accuracy": 0.7684644788503647, |
| "num_tokens": 286579538.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.1579466929911155, |
| "grad_norm": 0.42743316292762756, |
| "learning_rate": 4.8696514874510156e-05, |
| "loss": 0.7985178470611572, |
| "mean_token_accuracy": 0.7661768645048141, |
| "num_tokens": 291773512.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.16076716965167112, |
| "grad_norm": 0.4252716302871704, |
| "learning_rate": 4.862034422244305e-05, |
| "loss": 0.7951089859008789, |
| "mean_token_accuracy": 0.7667285829782486, |
| "num_tokens": 296940105.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.16358764631222678, |
| "grad_norm": 0.35900887846946716, |
| "learning_rate": 4.8542073795355294e-05, |
| "loss": 0.7855204582214356, |
| "mean_token_accuracy": 0.7693159490823746, |
| "num_tokens": 302158270.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.1664081229727824, |
| "grad_norm": 0.691571056842804, |
| "learning_rate": 4.846171055124401e-05, |
| "loss": 0.8179656982421875, |
| "mean_token_accuracy": 0.7643016219139099, |
| "num_tokens": 307356395.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.16922859963333803, |
| "grad_norm": 0.37687426805496216, |
| "learning_rate": 4.837926163415128e-05, |
| "loss": 0.795097827911377, |
| "mean_token_accuracy": 0.7669901877641678, |
| "num_tokens": 312563111.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.17204907629389365, |
| "grad_norm": 0.3831689655780792, |
| "learning_rate": 4.8294734373528983e-05, |
| "loss": 0.7758209228515625, |
| "mean_token_accuracy": 0.7721955150365829, |
| "num_tokens": 317750361.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.1748695529544493, |
| "grad_norm": 0.4321906864643097, |
| "learning_rate": 4.820813628358727e-05, |
| "loss": 0.7786943435668945, |
| "mean_token_accuracy": 0.771857762336731, |
| "num_tokens": 322952581.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.17769002961500494, |
| "grad_norm": 0.3875954747200012, |
| "learning_rate": 4.811947506262657e-05, |
| "loss": 0.7797961235046387, |
| "mean_token_accuracy": 0.7704864501953125, |
| "num_tokens": 328156016.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.18051050627556056, |
| "grad_norm": 0.5029323697090149, |
| "learning_rate": 4.802875859235325e-05, |
| "loss": 0.7656207084655762, |
| "mean_token_accuracy": 0.7749809384346008, |
| "num_tokens": 333374308.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.18333098293611622, |
| "grad_norm": 0.42617279291152954, |
| "learning_rate": 4.793599493717891e-05, |
| "loss": 0.7736545085906983, |
| "mean_token_accuracy": 0.7725418835878373, |
| "num_tokens": 338564854.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.18615145959667184, |
| "grad_norm": 0.3626423180103302, |
| "learning_rate": 4.784119234350353e-05, |
| "loss": 0.7666655540466308, |
| "mean_token_accuracy": 0.7740924268960953, |
| "num_tokens": 343758549.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.18897193625722747, |
| "grad_norm": 0.35442817211151123, |
| "learning_rate": 4.774435923898235e-05, |
| "loss": 0.8000862121582031, |
| "mean_token_accuracy": 0.7652783513069152, |
| "num_tokens": 348980481.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.1917924129177831, |
| "grad_norm": 0.32683518528938293, |
| "learning_rate": 4.764550423177673e-05, |
| "loss": 0.7713173389434814, |
| "mean_token_accuracy": 0.773258313536644, |
| "num_tokens": 354190860.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.19461288957833875, |
| "grad_norm": 0.31471505761146545, |
| "learning_rate": 4.754463610978886e-05, |
| "loss": 0.7849064826965332, |
| "mean_token_accuracy": 0.7690862119197845, |
| "num_tokens": 359399443.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.19743336623889438, |
| "grad_norm": 0.3361181914806366, |
| "learning_rate": 4.744176383988055e-05, |
| "loss": 0.7590707302093506, |
| "mean_token_accuracy": 0.7761936277151108, |
| "num_tokens": 364594796.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.20025384289945, |
| "grad_norm": 0.3417888581752777, |
| "learning_rate": 4.733689656707615e-05, |
| "loss": 0.7630936622619628, |
| "mean_token_accuracy": 0.7746737778186799, |
| "num_tokens": 369800137.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.20307431956000563, |
| "grad_norm": 0.3212883174419403, |
| "learning_rate": 4.723004361374953e-05, |
| "loss": 0.7731470108032227, |
| "mean_token_accuracy": 0.7723031550645828, |
| "num_tokens": 375005470.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.20589479622056128, |
| "grad_norm": 0.3806549310684204, |
| "learning_rate": 4.7121214478795386e-05, |
| "loss": 0.7695651054382324, |
| "mean_token_accuracy": 0.7732091188430786, |
| "num_tokens": 380204022.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.2087152728811169, |
| "grad_norm": 0.3326440751552582, |
| "learning_rate": 4.7010418836784786e-05, |
| "loss": 0.7776393890380859, |
| "mean_token_accuracy": 0.7707630872726441, |
| "num_tokens": 385430442.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.21153574954167254, |
| "grad_norm": 0.33390548825263977, |
| "learning_rate": 4.689766653710517e-05, |
| "loss": 0.7794651985168457, |
| "mean_token_accuracy": 0.7707834959030151, |
| "num_tokens": 390660313.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.2143562262022282, |
| "grad_norm": 0.4141367971897125, |
| "learning_rate": 4.678296760308474e-05, |
| "loss": 0.7713262557983398, |
| "mean_token_accuracy": 0.773048859834671, |
| "num_tokens": 395866254.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.21717670286278382, |
| "grad_norm": 0.4335712492465973, |
| "learning_rate": 4.666633223110142e-05, |
| "loss": 0.7711901664733887, |
| "mean_token_accuracy": 0.7728887468576431, |
| "num_tokens": 401075996.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.21999717952333944, |
| "grad_norm": 0.3701898455619812, |
| "learning_rate": 4.6547770789676436e-05, |
| "loss": 0.7711897850036621, |
| "mean_token_accuracy": 0.7721995264291763, |
| "num_tokens": 406281030.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.22281765618389507, |
| "grad_norm": 0.38962966203689575, |
| "learning_rate": 4.642729381855262e-05, |
| "loss": 0.7722814083099365, |
| "mean_token_accuracy": 0.7722497761249543, |
| "num_tokens": 411505168.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.22563813284445072, |
| "grad_norm": 0.334955632686615, |
| "learning_rate": 4.630491202775739e-05, |
| "loss": 0.7845487594604492, |
| "mean_token_accuracy": 0.768746617436409, |
| "num_tokens": 416686758.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.22845860950500635, |
| "grad_norm": 0.28480038046836853, |
| "learning_rate": 4.618063629665069e-05, |
| "loss": 0.7675346374511719, |
| "mean_token_accuracy": 0.7733120858669281, |
| "num_tokens": 421911969.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.23127908616556198, |
| "grad_norm": 0.32578566670417786, |
| "learning_rate": 4.605447767295791e-05, |
| "loss": 0.7733859062194824, |
| "mean_token_accuracy": 0.7722859561443329, |
| "num_tokens": 427132233.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2340995628261176, |
| "grad_norm": 0.37755486369132996, |
| "learning_rate": 4.592644737178769e-05, |
| "loss": 0.7817283153533936, |
| "mean_token_accuracy": 0.7701420873403549, |
| "num_tokens": 432359875.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.23692003948667326, |
| "grad_norm": 0.38393500447273254, |
| "learning_rate": 4.5796556774634955e-05, |
| "loss": 0.7568459987640381, |
| "mean_token_accuracy": 0.776387557387352, |
| "num_tokens": 437582569.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.23974051614722888, |
| "grad_norm": 0.3589465320110321, |
| "learning_rate": 4.5664817428369176e-05, |
| "loss": 0.7752207756042481, |
| "mean_token_accuracy": 0.7715168982744217, |
| "num_tokens": 442808705.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.2425609928077845, |
| "grad_norm": 0.34019097685813904, |
| "learning_rate": 4.553124104420784e-05, |
| "loss": 0.7655211448669433, |
| "mean_token_accuracy": 0.774545556306839, |
| "num_tokens": 448007126.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.24538146946834016, |
| "grad_norm": 0.38820672035217285, |
| "learning_rate": 4.5395839496675404e-05, |
| "loss": 0.7734439849853516, |
| "mean_token_accuracy": 0.771916925907135, |
| "num_tokens": 453196680.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.2482019461288958, |
| "grad_norm": 0.38175490498542786, |
| "learning_rate": 4.525862482254764e-05, |
| "loss": 0.7811372756958008, |
| "mean_token_accuracy": 0.7695781767368317, |
| "num_tokens": 458395676.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.2510224227894514, |
| "grad_norm": 0.366558700799942, |
| "learning_rate": 4.511960921978163e-05, |
| "loss": 0.7817493438720703, |
| "mean_token_accuracy": 0.7694737076759338, |
| "num_tokens": 463597764.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.25384289945000704, |
| "grad_norm": 0.32293614745140076, |
| "learning_rate": 4.4978805046431416e-05, |
| "loss": 0.7695838928222656, |
| "mean_token_accuracy": 0.7725834578275681, |
| "num_tokens": 468808142.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.25666337611056267, |
| "grad_norm": 0.35432401299476624, |
| "learning_rate": 4.483622481954938e-05, |
| "loss": 0.7757863521575927, |
| "mean_token_accuracy": 0.7709449827671051, |
| "num_tokens": 474013365.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.2594838527711183, |
| "grad_norm": 0.3135406970977783, |
| "learning_rate": 4.469188121407353e-05, |
| "loss": 0.780633544921875, |
| "mean_token_accuracy": 0.7696676045656204, |
| "num_tokens": 479237358.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.262304329431674, |
| "grad_norm": 0.3338964283466339, |
| "learning_rate": 4.454578706170075e-05, |
| "loss": 0.7923246383666992, |
| "mean_token_accuracy": 0.7662366658449173, |
| "num_tokens": 484429691.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.2651248060922296, |
| "grad_norm": 0.3694622814655304, |
| "learning_rate": 4.439795534974607e-05, |
| "loss": 0.770482349395752, |
| "mean_token_accuracy": 0.7723766535520553, |
| "num_tokens": 489651036.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.26794528275278523, |
| "grad_norm": 0.3651314973831177, |
| "learning_rate": 4.424839921998819e-05, |
| "loss": 0.7813654899597168, |
| "mean_token_accuracy": 0.7697213411331176, |
| "num_tokens": 494857597.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.27076575941334086, |
| "grad_norm": 0.38329413533210754, |
| "learning_rate": 4.4097131967501124e-05, |
| "loss": 0.7724425315856933, |
| "mean_token_accuracy": 0.7712427735328674, |
| "num_tokens": 500063588.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.2735862360738965, |
| "grad_norm": 0.3533983528614044, |
| "learning_rate": 4.394416703947243e-05, |
| "loss": 0.760558795928955, |
| "mean_token_accuracy": 0.7749832183122635, |
| "num_tokens": 505284578.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.2764067127344521, |
| "grad_norm": 0.2940215468406677, |
| "learning_rate": 4.378951803400768e-05, |
| "loss": 0.7766805648803711, |
| "mean_token_accuracy": 0.770464363694191, |
| "num_tokens": 510489061.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.27922718939500774, |
| "grad_norm": 0.3216915726661682, |
| "learning_rate": 4.3633198698921724e-05, |
| "loss": 0.7665189743041992, |
| "mean_token_accuracy": 0.773473608493805, |
| "num_tokens": 515682890.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.2820476660555634, |
| "grad_norm": 0.3060475289821625, |
| "learning_rate": 4.347522293051648e-05, |
| "loss": 0.7670203685760498, |
| "mean_token_accuracy": 0.7731666147708893, |
| "num_tokens": 520901566.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.28486814271611904, |
| "grad_norm": 0.3380279242992401, |
| "learning_rate": 4.331560477234565e-05, |
| "loss": 0.7661257266998291, |
| "mean_token_accuracy": 0.7734242618083954, |
| "num_tokens": 526113564.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.28768861937667467, |
| "grad_norm": 0.3771665692329407, |
| "learning_rate": 4.315435841396626e-05, |
| "loss": 0.7718358516693116, |
| "mean_token_accuracy": 0.7719818025827407, |
| "num_tokens": 531308013.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.2905090960372303, |
| "grad_norm": 0.3177052438259125, |
| "learning_rate": 4.299149818967726e-05, |
| "loss": 0.7608656406402587, |
| "mean_token_accuracy": 0.7754600733518601, |
| "num_tokens": 536536429.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.2933295726977859, |
| "grad_norm": 0.3333839774131775, |
| "learning_rate": 4.282703857724527e-05, |
| "loss": 0.7573532104492188, |
| "mean_token_accuracy": 0.7759276896715164, |
| "num_tokens": 541757187.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.29615004935834155, |
| "grad_norm": 0.3042449355125427, |
| "learning_rate": 4.2660994196617496e-05, |
| "loss": 0.7559514999389648, |
| "mean_token_accuracy": 0.7758087396621705, |
| "num_tokens": 546980793.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.2989705260188972, |
| "grad_norm": 0.3035680651664734, |
| "learning_rate": 4.249337980862215e-05, |
| "loss": 0.7580223083496094, |
| "mean_token_accuracy": 0.7753052890300751, |
| "num_tokens": 552180186.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.3017910026794528, |
| "grad_norm": 0.2809864580631256, |
| "learning_rate": 4.2324210313656176e-05, |
| "loss": 0.769060754776001, |
| "mean_token_accuracy": 0.7719310760498047, |
| "num_tokens": 557388834.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.3046114793400085, |
| "grad_norm": 0.307574987411499, |
| "learning_rate": 4.215350075036067e-05, |
| "loss": 0.7691577434539795, |
| "mean_token_accuracy": 0.7722433179616928, |
| "num_tokens": 562585314.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.3074319560005641, |
| "grad_norm": 0.31300097703933716, |
| "learning_rate": 4.198126629428406e-05, |
| "loss": 0.7613607406616211, |
| "mean_token_accuracy": 0.7739471793174744, |
| "num_tokens": 567787568.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.31025243266111974, |
| "grad_norm": 0.37344154715538025, |
| "learning_rate": 4.180752225653292e-05, |
| "loss": 0.783112621307373, |
| "mean_token_accuracy": 0.768501365184784, |
| "num_tokens": 572993740.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.31307290932167536, |
| "grad_norm": 0.401102751493454, |
| "learning_rate": 4.1632284082410994e-05, |
| "loss": 0.7505324363708497, |
| "mean_token_accuracy": 0.7774714142084121, |
| "num_tokens": 578210343.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.315893385982231, |
| "grad_norm": 0.33030402660369873, |
| "learning_rate": 4.145556735004606e-05, |
| "loss": 0.7742326736450196, |
| "mean_token_accuracy": 0.7704325795173645, |
| "num_tokens": 583420811.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3187138626427866, |
| "grad_norm": 0.31906676292419434, |
| "learning_rate": 4.127738776900513e-05, |
| "loss": 0.7598705291748047, |
| "mean_token_accuracy": 0.7744769185781479, |
| "num_tokens": 588629077.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.32153433930334224, |
| "grad_norm": 0.3493998646736145, |
| "learning_rate": 4.109776117889789e-05, |
| "loss": 0.7727458000183105, |
| "mean_token_accuracy": 0.7708871066570282, |
| "num_tokens": 593833594.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.3243548159638979, |
| "grad_norm": 0.43525683879852295, |
| "learning_rate": 4.091670354796866e-05, |
| "loss": 0.7558047294616699, |
| "mean_token_accuracy": 0.7755980044603348, |
| "num_tokens": 599042413.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.32717529262445355, |
| "grad_norm": 0.4609135687351227, |
| "learning_rate": 4.073423097167681e-05, |
| "loss": 0.7683779239654541, |
| "mean_token_accuracy": 0.7725673496723175, |
| "num_tokens": 604263432.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.3299957692850092, |
| "grad_norm": 0.4085586965084076, |
| "learning_rate": 4.055035967126592e-05, |
| "loss": 0.7682935237884522, |
| "mean_token_accuracy": 0.7724095970392227, |
| "num_tokens": 609481905.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.3328162459455648, |
| "grad_norm": 0.4387555420398712, |
| "learning_rate": 4.036510599232183e-05, |
| "loss": 0.7553305625915527, |
| "mean_token_accuracy": 0.7757373869419097, |
| "num_tokens": 614695205.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.33563672260612043, |
| "grad_norm": 0.42048364877700806, |
| "learning_rate": 4.01784864033195e-05, |
| "loss": 0.7646809577941894, |
| "mean_token_accuracy": 0.7734672874212265, |
| "num_tokens": 619906924.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.33845719926667606, |
| "grad_norm": 0.3200714886188507, |
| "learning_rate": 3.999051749415905e-05, |
| "loss": 0.7549895286560059, |
| "mean_token_accuracy": 0.7760183870792389, |
| "num_tokens": 625118843.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3412776759272317, |
| "grad_norm": 0.2978014051914215, |
| "learning_rate": 3.980121597469096e-05, |
| "loss": 0.7442358016967774, |
| "mean_token_accuracy": 0.7792284607887268, |
| "num_tokens": 630325802.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.3440981525877873, |
| "grad_norm": 0.3026927709579468, |
| "learning_rate": 3.96105986732306e-05, |
| "loss": 0.7637527465820313, |
| "mean_token_accuracy": 0.7731468260288239, |
| "num_tokens": 635526166.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.346918629248343, |
| "grad_norm": 0.2822380065917969, |
| "learning_rate": 3.941868253506227e-05, |
| "loss": 0.7543724536895752, |
| "mean_token_accuracy": 0.7762434631586075, |
| "num_tokens": 640729993.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.3497391059088986, |
| "grad_norm": 0.28617018461227417, |
| "learning_rate": 3.9225484620932805e-05, |
| "loss": 0.7670584678649902, |
| "mean_token_accuracy": 0.7725878387689591, |
| "num_tokens": 645951079.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.35255958256945424, |
| "grad_norm": 0.3128873109817505, |
| "learning_rate": 3.9031022105534945e-05, |
| "loss": 0.7662755489349365, |
| "mean_token_accuracy": 0.7721714347600936, |
| "num_tokens": 651176154.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.35538005923000987, |
| "grad_norm": 0.27455100417137146, |
| "learning_rate": 3.8835312275980516e-05, |
| "loss": 0.748596477508545, |
| "mean_token_accuracy": 0.7781710177659988, |
| "num_tokens": 656395834.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.3582005358905655, |
| "grad_norm": 0.33018219470977783, |
| "learning_rate": 3.8638372530263715e-05, |
| "loss": 0.7739686489105224, |
| "mean_token_accuracy": 0.7703105211257935, |
| "num_tokens": 661600206.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.3610210125511211, |
| "grad_norm": 0.3159162402153015, |
| "learning_rate": 3.844022037571443e-05, |
| "loss": 0.7523816108703614, |
| "mean_token_accuracy": 0.7764188557863235, |
| "num_tokens": 666811188.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.36384148921167675, |
| "grad_norm": 0.3154295086860657, |
| "learning_rate": 3.824087342744195e-05, |
| "loss": 0.754487133026123, |
| "mean_token_accuracy": 0.7756473571062088, |
| "num_tokens": 672029336.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.36666196587223243, |
| "grad_norm": 0.2579188644886017, |
| "learning_rate": 3.804034940676894e-05, |
| "loss": 0.7692998886108399, |
| "mean_token_accuracy": 0.7718347430229187, |
| "num_tokens": 677251262.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.36948244253278806, |
| "grad_norm": 0.32802215218544006, |
| "learning_rate": 3.783866613965622e-05, |
| "loss": 0.7555614471435547, |
| "mean_token_accuracy": 0.7755881071090698, |
| "num_tokens": 682467688.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.3723029191933437, |
| "grad_norm": 0.31287023425102234, |
| "learning_rate": 3.763584155511794e-05, |
| "loss": 0.7655069351196289, |
| "mean_token_accuracy": 0.7732323706150055, |
| "num_tokens": 687666025.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.3751233958538993, |
| "grad_norm": 0.294041246175766, |
| "learning_rate": 3.743189368362784e-05, |
| "loss": 0.7489017486572266, |
| "mean_token_accuracy": 0.7772963434457779, |
| "num_tokens": 692873846.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.37794387251445494, |
| "grad_norm": 0.2788609564304352, |
| "learning_rate": 3.722684065551638e-05, |
| "loss": 0.750185203552246, |
| "mean_token_accuracy": 0.776657447218895, |
| "num_tokens": 698100240.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.38076434917501056, |
| "grad_norm": 0.2861092984676361, |
| "learning_rate": 3.702070069935898e-05, |
| "loss": 0.745637035369873, |
| "mean_token_accuracy": 0.7782793074846268, |
| "num_tokens": 703289295.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.3835848258355662, |
| "grad_norm": 0.31763726472854614, |
| "learning_rate": 3.6813492140355596e-05, |
| "loss": 0.7542277336120605, |
| "mean_token_accuracy": 0.775793018937111, |
| "num_tokens": 708493173.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.38640530249612187, |
| "grad_norm": 0.2676416337490082, |
| "learning_rate": 3.660523339870164e-05, |
| "loss": 0.7405709266662598, |
| "mean_token_accuracy": 0.7790880471467971, |
| "num_tokens": 713701339.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.3892257791566775, |
| "grad_norm": 0.2737492322921753, |
| "learning_rate": 3.639594298795048e-05, |
| "loss": 0.7666029930114746, |
| "mean_token_accuracy": 0.7718399643898011, |
| "num_tokens": 718920064.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.3920462558172331, |
| "grad_norm": 0.26979735493659973, |
| "learning_rate": 3.6185639513367656e-05, |
| "loss": 0.7548455238342285, |
| "mean_token_accuracy": 0.7755049705505371, |
| "num_tokens": 724132875.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.39486673247778875, |
| "grad_norm": 0.30082252621650696, |
| "learning_rate": 3.597434167027695e-05, |
| "loss": 0.7691206455230712, |
| "mean_token_accuracy": 0.7716991513967514, |
| "num_tokens": 729335465.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.3976872091383444, |
| "grad_norm": 0.2921012341976166, |
| "learning_rate": 3.5762068242398393e-05, |
| "loss": 0.751345443725586, |
| "mean_token_accuracy": 0.7767131477594376, |
| "num_tokens": 734543658.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.4005076857989, |
| "grad_norm": 0.31780368089675903, |
| "learning_rate": 3.554883810017844e-05, |
| "loss": 0.7625522613525391, |
| "mean_token_accuracy": 0.7733297258615494, |
| "num_tokens": 739758180.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.40332816245945563, |
| "grad_norm": 0.34423163533210754, |
| "learning_rate": 3.533467019911252e-05, |
| "loss": 0.7443047046661377, |
| "mean_token_accuracy": 0.7784169435501098, |
| "num_tokens": 744978491.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.40614863912001126, |
| "grad_norm": 0.3304055333137512, |
| "learning_rate": 3.5119583578059846e-05, |
| "loss": 0.7599642753601075, |
| "mean_token_accuracy": 0.7740559220314026, |
| "num_tokens": 750173836.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.40896911578056694, |
| "grad_norm": 0.30678245425224304, |
| "learning_rate": 3.490359735755102e-05, |
| "loss": 0.764622974395752, |
| "mean_token_accuracy": 0.7724904954433441, |
| "num_tokens": 755357042.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.41178959244112256, |
| "grad_norm": 0.2555997669696808, |
| "learning_rate": 3.468673073808822e-05, |
| "loss": 0.7338571548461914, |
| "mean_token_accuracy": 0.780937722325325, |
| "num_tokens": 760569986.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.4146100691016782, |
| "grad_norm": 0.2800235450267792, |
| "learning_rate": 3.4469002998438335e-05, |
| "loss": 0.7382781028747558, |
| "mean_token_accuracy": 0.7798346370458603, |
| "num_tokens": 765783266.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.4174305457622338, |
| "grad_norm": 0.29841479659080505, |
| "learning_rate": 3.425043349391918e-05, |
| "loss": 0.7705670356750488, |
| "mean_token_accuracy": 0.7709584295749664, |
| "num_tokens": 771011379.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.42025102242278944, |
| "grad_norm": 0.25931602716445923, |
| "learning_rate": 3.403104165467883e-05, |
| "loss": 0.748842716217041, |
| "mean_token_accuracy": 0.7773065626621246, |
| "num_tokens": 776228741.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.42307149908334507, |
| "grad_norm": 0.27186912298202515, |
| "learning_rate": 3.381084698396835e-05, |
| "loss": 0.7525691032409668, |
| "mean_token_accuracy": 0.7757814288139343, |
| "num_tokens": 781410976.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.4258919757439007, |
| "grad_norm": 0.26616016030311584, |
| "learning_rate": 3.358986905640802e-05, |
| "loss": 0.7436333656311035, |
| "mean_token_accuracy": 0.7787700086832047, |
| "num_tokens": 786617338.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.4287124524044564, |
| "grad_norm": 0.2676887810230255, |
| "learning_rate": 3.336812751624723e-05, |
| "loss": 0.7410964965820312, |
| "mean_token_accuracy": 0.7793578028678894, |
| "num_tokens": 791825315.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.431532929065012, |
| "grad_norm": 0.2672156095504761, |
| "learning_rate": 3.314564207561816e-05, |
| "loss": 0.7534364700317383, |
| "mean_token_accuracy": 0.7755870819091797, |
| "num_tokens": 797048164.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.43435340572556763, |
| "grad_norm": 0.2551250457763672, |
| "learning_rate": 3.2922432512783395e-05, |
| "loss": 0.7435198783874511, |
| "mean_token_accuracy": 0.7779635220766068, |
| "num_tokens": 802233191.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.43717388238612326, |
| "grad_norm": 0.3182034492492676, |
| "learning_rate": 3.269851867037774e-05, |
| "loss": 0.7505601406097412, |
| "mean_token_accuracy": 0.7762654155492783, |
| "num_tokens": 807443805.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.4399943590466789, |
| "grad_norm": 0.29518380761146545, |
| "learning_rate": 3.247392045364426e-05, |
| "loss": 0.7480457782745361, |
| "mean_token_accuracy": 0.7769211769104004, |
| "num_tokens": 812656912.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.4428148357072345, |
| "grad_norm": 0.2449672371149063, |
| "learning_rate": 3.224865782866478e-05, |
| "loss": 0.7489072799682617, |
| "mean_token_accuracy": 0.7767190963029862, |
| "num_tokens": 817867809.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.44563531236779014, |
| "grad_norm": 0.28610652685165405, |
| "learning_rate": 3.202275082058492e-05, |
| "loss": 0.7520014762878418, |
| "mean_token_accuracy": 0.7751724421977997, |
| "num_tokens": 823079833.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.44845578902834576, |
| "grad_norm": 0.27258017659187317, |
| "learning_rate": 3.179621951183397e-05, |
| "loss": 0.7486692428588867, |
| "mean_token_accuracy": 0.7766971349716186, |
| "num_tokens": 828303959.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.45127626568890145, |
| "grad_norm": 0.26820626854896545, |
| "learning_rate": 3.156908404033961e-05, |
| "loss": 0.7468665599822998, |
| "mean_token_accuracy": 0.7771286904811859, |
| "num_tokens": 833510973.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.45409674234945707, |
| "grad_norm": 0.28003305196762085, |
| "learning_rate": 3.1341364597737686e-05, |
| "loss": 0.7600772380828857, |
| "mean_token_accuracy": 0.7737066566944122, |
| "num_tokens": 838698396.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.4569172190100127, |
| "grad_norm": 0.29965445399284363, |
| "learning_rate": 3.111308142757728e-05, |
| "loss": 0.7347710132598877, |
| "mean_token_accuracy": 0.7805228114128113, |
| "num_tokens": 843923030.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.4597376956705683, |
| "grad_norm": 0.2517057955265045, |
| "learning_rate": 3.088425482352107e-05, |
| "loss": 0.7391749382019043, |
| "mean_token_accuracy": 0.7790901213884354, |
| "num_tokens": 849142004.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.46255817233112395, |
| "grad_norm": 0.3115026652812958, |
| "learning_rate": 3.0654905127541326e-05, |
| "loss": 0.7473933219909668, |
| "mean_token_accuracy": 0.7775959491729736, |
| "num_tokens": 854357908.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.4653786489916796, |
| "grad_norm": 0.2688276767730713, |
| "learning_rate": 3.0425052728111585e-05, |
| "loss": 0.7278037071228027, |
| "mean_token_accuracy": 0.7819948852062225, |
| "num_tokens": 859531077.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.4681991256522352, |
| "grad_norm": 0.32312244176864624, |
| "learning_rate": 3.0194718058394123e-05, |
| "loss": 0.7427204132080079, |
| "mean_token_accuracy": 0.777895525097847, |
| "num_tokens": 864750610.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.4710196023127909, |
| "grad_norm": 0.3414735496044159, |
| "learning_rate": 2.996392159442355e-05, |
| "loss": 0.7428229331970215, |
| "mean_token_accuracy": 0.7787304818630219, |
| "num_tokens": 869931763.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.4738400789733465, |
| "grad_norm": 0.29070428013801575, |
| "learning_rate": 2.973268385328655e-05, |
| "loss": 0.7371402740478515, |
| "mean_token_accuracy": 0.7795629620552063, |
| "num_tokens": 875134610.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.47666055563390214, |
| "grad_norm": 0.2716343104839325, |
| "learning_rate": 2.9501025391297976e-05, |
| "loss": 0.7384316444396972, |
| "mean_token_accuracy": 0.7791845291852951, |
| "num_tokens": 880349943.0, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.47948103229445777, |
| "grad_norm": 0.3179316222667694, |
| "learning_rate": 2.9268966802173436e-05, |
| "loss": 0.7503187179565429, |
| "mean_token_accuracy": 0.7757825434207917, |
| "num_tokens": 885547239.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.4823015089550134, |
| "grad_norm": 0.2789818048477173, |
| "learning_rate": 2.903652871519863e-05, |
| "loss": 0.7283576011657715, |
| "mean_token_accuracy": 0.7821285218000412, |
| "num_tokens": 890745008.0, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.485121985615569, |
| "grad_norm": 0.2598032057285309, |
| "learning_rate": 2.88037317933954e-05, |
| "loss": 0.731390380859375, |
| "mean_token_accuracy": 0.7811041116714478, |
| "num_tokens": 895969819.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.48794246227612464, |
| "grad_norm": 0.24372799694538116, |
| "learning_rate": 2.8570596731684895e-05, |
| "loss": 0.742131233215332, |
| "mean_token_accuracy": 0.778620821237564, |
| "num_tokens": 901171390.0, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.4907629389366803, |
| "grad_norm": 0.2416885793209076, |
| "learning_rate": 2.833714425504786e-05, |
| "loss": 0.7391448974609375, |
| "mean_token_accuracy": 0.779407599568367, |
| "num_tokens": 906394256.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.49358341559723595, |
| "grad_norm": 0.2647145092487335, |
| "learning_rate": 2.810339511668223e-05, |
| "loss": 0.7384161472320556, |
| "mean_token_accuracy": 0.7792135775089264, |
| "num_tokens": 911613856.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.4964038922577916, |
| "grad_norm": 0.26926189661026, |
| "learning_rate": 2.786937009615824e-05, |
| "loss": 0.7412851333618165, |
| "mean_token_accuracy": 0.7784785836935043, |
| "num_tokens": 916834932.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.4992243689183472, |
| "grad_norm": 0.25262531638145447, |
| "learning_rate": 2.7635089997571196e-05, |
| "loss": 0.7453501701354981, |
| "mean_token_accuracy": 0.777086952328682, |
| "num_tokens": 922044065.0, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.5020448455789028, |
| "grad_norm": 0.27210497856140137, |
| "learning_rate": 2.7400575647692046e-05, |
| "loss": 0.7517458438873291, |
| "mean_token_accuracy": 0.7750386208295822, |
| "num_tokens": 927241056.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.5048653222394585, |
| "grad_norm": 0.25430935621261597, |
| "learning_rate": 2.7165847894115953e-05, |
| "loss": 0.7491694450378418, |
| "mean_token_accuracy": 0.7764867752790451, |
| "num_tokens": 932456368.0, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.5076857989000141, |
| "grad_norm": 0.2582172155380249, |
| "learning_rate": 2.693092760340899e-05, |
| "loss": 0.7363146305084228, |
| "mean_token_accuracy": 0.7792346268892288, |
| "num_tokens": 937679075.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5105062755605697, |
| "grad_norm": 0.23149679601192474, |
| "learning_rate": 2.66958356592532e-05, |
| "loss": 0.7403749942779541, |
| "mean_token_accuracy": 0.7784234285354614, |
| "num_tokens": 942885883.0, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.5133267522211253, |
| "grad_norm": 0.2690688669681549, |
| "learning_rate": 2.6460592960590064e-05, |
| "loss": 0.7586381912231446, |
| "mean_token_accuracy": 0.7731125712394714, |
| "num_tokens": 948067461.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.516147228881681, |
| "grad_norm": 0.24865229427814484, |
| "learning_rate": 2.622522041976269e-05, |
| "loss": 0.7361614227294921, |
| "mean_token_accuracy": 0.7801000714302063, |
| "num_tokens": 953267500.0, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.5189677055422366, |
| "grad_norm": 0.3116484582424164, |
| "learning_rate": 2.598973896065674e-05, |
| "loss": 0.7414368629455567, |
| "mean_token_accuracy": 0.7783037513494492, |
| "num_tokens": 958474033.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.5217881822027922, |
| "grad_norm": 0.23773521184921265, |
| "learning_rate": 2.5754169516840355e-05, |
| "loss": 0.7330810546875, |
| "mean_token_accuracy": 0.7805785417556763, |
| "num_tokens": 963684645.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.524608658863348, |
| "grad_norm": 0.2488972693681717, |
| "learning_rate": 2.5518533029703274e-05, |
| "loss": 0.752569580078125, |
| "mean_token_accuracy": 0.7753267168998719, |
| "num_tokens": 968887491.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.5274291355239036, |
| "grad_norm": 0.22092726826667786, |
| "learning_rate": 2.5282850446595158e-05, |
| "loss": 0.7525276184082031, |
| "mean_token_accuracy": 0.7744133800268174, |
| "num_tokens": 974082901.0, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.5302496121844592, |
| "grad_norm": 0.2564944326877594, |
| "learning_rate": 2.504714271896345e-05, |
| "loss": 0.7492488861083985, |
| "mean_token_accuracy": 0.7756923973560333, |
| "num_tokens": 979313348.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.5330700888450148, |
| "grad_norm": 0.24430619180202484, |
| "learning_rate": 2.4811430800490885e-05, |
| "loss": 0.7475570678710938, |
| "mean_token_accuracy": 0.7765256404876709, |
| "num_tokens": 984536065.0, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.5358905655055705, |
| "grad_norm": 0.2511347830295563, |
| "learning_rate": 2.4575735645232743e-05, |
| "loss": 0.7428129196166993, |
| "mean_token_accuracy": 0.7775215625762939, |
| "num_tokens": 989758496.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.5387110421661261, |
| "grad_norm": 0.2539248764514923, |
| "learning_rate": 2.43400782057541e-05, |
| "loss": 0.7449743747711182, |
| "mean_token_accuracy": 0.7767373085021972, |
| "num_tokens": 994981945.0, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.5415315188266817, |
| "grad_norm": 0.2781333029270172, |
| "learning_rate": 2.4104479431267196e-05, |
| "loss": 0.7502236366271973, |
| "mean_token_accuracy": 0.7762497693300248, |
| "num_tokens": 1000202855.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.5443519954872373, |
| "grad_norm": 0.251446932554245, |
| "learning_rate": 2.38689602657692e-05, |
| "loss": 0.7517457962036133, |
| "mean_token_accuracy": 0.7751468151807785, |
| "num_tokens": 1005415062.0, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.547172472147793, |
| "grad_norm": 0.2488507777452469, |
| "learning_rate": 2.363354164618022e-05, |
| "loss": 0.729612922668457, |
| "mean_token_accuracy": 0.7817542374134063, |
| "num_tokens": 1010615295.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.5499929488083486, |
| "grad_norm": 0.2578504681587219, |
| "learning_rate": 2.339824450048218e-05, |
| "loss": 0.725861930847168, |
| "mean_token_accuracy": 0.782097339630127, |
| "num_tokens": 1015841988.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.5528134254689042, |
| "grad_norm": 0.23905125260353088, |
| "learning_rate": 2.3163089745858357e-05, |
| "loss": 0.7432829856872558, |
| "mean_token_accuracy": 0.7776479661464691, |
| "num_tokens": 1021052219.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.5556339021294598, |
| "grad_norm": 0.30154407024383545, |
| "learning_rate": 2.292809828683388e-05, |
| "loss": 0.7325653553009033, |
| "mean_token_accuracy": 0.7808756172657013, |
| "num_tokens": 1026266561.0, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.5584543787900155, |
| "grad_norm": 0.2418544590473175, |
| "learning_rate": 2.2693291013417453e-05, |
| "loss": 0.772521686553955, |
| "mean_token_accuracy": 0.7760232150554657, |
| "num_tokens": 1031469779.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.5612748554505711, |
| "grad_norm": 0.24888356029987335, |
| "learning_rate": 2.2458688799244205e-05, |
| "loss": 0.7518490314483642, |
| "mean_token_accuracy": 0.7753332704305649, |
| "num_tokens": 1036679049.0, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.5640953321111268, |
| "grad_norm": 0.24516652524471283, |
| "learning_rate": 2.222431249972015e-05, |
| "loss": 0.7229015350341796, |
| "mean_token_accuracy": 0.7828704863786697, |
| "num_tokens": 1041896384.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.5669158087716825, |
| "grad_norm": 0.2515292763710022, |
| "learning_rate": 2.199018295016822e-05, |
| "loss": 0.7194217205047607, |
| "mean_token_accuracy": 0.7844615399837493, |
| "num_tokens": 1047071005.0, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.5697362854322381, |
| "grad_norm": 0.2749954164028168, |
| "learning_rate": 2.1756320963976012e-05, |
| "loss": 0.7305520057678223, |
| "mean_token_accuracy": 0.7813587754964828, |
| "num_tokens": 1052273340.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.5725567620927937, |
| "grad_norm": 0.24928732216358185, |
| "learning_rate": 2.152274733074558e-05, |
| "loss": 0.7407473564147949, |
| "mean_token_accuracy": 0.7781451612710952, |
| "num_tokens": 1057495565.0, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.5753772387533493, |
| "grad_norm": 0.2335939258337021, |
| "learning_rate": 2.128948281444532e-05, |
| "loss": 0.7340809822082519, |
| "mean_token_accuracy": 0.7799234807491302, |
| "num_tokens": 1062704187.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.578197715413905, |
| "grad_norm": 0.21696053445339203, |
| "learning_rate": 2.1056548151564063e-05, |
| "loss": 0.7354939460754395, |
| "mean_token_accuracy": 0.7797737270593643, |
| "num_tokens": 1067931721.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.5810181920744606, |
| "grad_norm": 0.2403183877468109, |
| "learning_rate": 2.0823964049267723e-05, |
| "loss": 0.7496252059936523, |
| "mean_token_accuracy": 0.7757533907890319, |
| "num_tokens": 1073116435.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.5838386687350162, |
| "grad_norm": 0.26338285207748413, |
| "learning_rate": 2.0591751183558468e-05, |
| "loss": 0.7384109497070312, |
| "mean_token_accuracy": 0.7789325386285781, |
| "num_tokens": 1078344450.0, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.5866591453955718, |
| "grad_norm": 0.26449069380760193, |
| "learning_rate": 2.035993019743666e-05, |
| "loss": 0.7206357955932617, |
| "mean_token_accuracy": 0.7838350623846054, |
| "num_tokens": 1083570977.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.5894796220561275, |
| "grad_norm": 0.26153990626335144, |
| "learning_rate": 2.012852169906584e-05, |
| "loss": 0.7402269840240479, |
| "mean_token_accuracy": 0.7779274940490722, |
| "num_tokens": 1088779432.0, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.5923000987166831, |
| "grad_norm": 0.2370881587266922, |
| "learning_rate": 1.9897546259940618e-05, |
| "loss": 0.7502132415771484, |
| "mean_token_accuracy": 0.7757242441177368, |
| "num_tokens": 1093978609.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.5951205753772387, |
| "grad_norm": 0.23634928464889526, |
| "learning_rate": 1.9667024413058028e-05, |
| "loss": 0.7406221389770508, |
| "mean_token_accuracy": 0.7780844628810882, |
| "num_tokens": 1099183883.0, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.5979410520377944, |
| "grad_norm": 0.21724943816661835, |
| "learning_rate": 1.9436976651092144e-05, |
| "loss": 0.7378547668457032, |
| "mean_token_accuracy": 0.7793454140424728, |
| "num_tokens": 1104381541.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.60076152869835, |
| "grad_norm": 0.25824031233787537, |
| "learning_rate": 1.9207423424572366e-05, |
| "loss": 0.7385224342346192, |
| "mean_token_accuracy": 0.7781225651502609, |
| "num_tokens": 1109596846.0, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.6035820053589056, |
| "grad_norm": 0.2292436957359314, |
| "learning_rate": 1.8978385140065453e-05, |
| "loss": 0.7486650943756104, |
| "mean_token_accuracy": 0.7761229813098908, |
| "num_tokens": 1114802727.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.6064024820194613, |
| "grad_norm": 0.24457287788391113, |
| "learning_rate": 1.874988215836141e-05, |
| "loss": 0.7344676971435546, |
| "mean_token_accuracy": 0.779693141579628, |
| "num_tokens": 1120030959.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.609222958680017, |
| "grad_norm": 0.2467031627893448, |
| "learning_rate": 1.8521934792663477e-05, |
| "loss": 0.7370716571807862, |
| "mean_token_accuracy": 0.7789741307497025, |
| "num_tokens": 1125247092.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.6120434353405726, |
| "grad_norm": 0.27388912439346313, |
| "learning_rate": 1.8294563306782396e-05, |
| "loss": 0.7286103248596192, |
| "mean_token_accuracy": 0.7814449548721314, |
| "num_tokens": 1130468945.0, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.6148639120011282, |
| "grad_norm": 0.2156301885843277, |
| "learning_rate": 1.8067787913334944e-05, |
| "loss": 0.745603609085083, |
| "mean_token_accuracy": 0.7774240404367447, |
| "num_tokens": 1135680292.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.6176843886616838, |
| "grad_norm": 0.2245538830757141, |
| "learning_rate": 1.784162877194719e-05, |
| "loss": 0.7460683345794678, |
| "mean_token_accuracy": 0.7766919553279876, |
| "num_tokens": 1140892783.0, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.6205048653222395, |
| "grad_norm": 0.23326575756072998, |
| "learning_rate": 1.761610598746226e-05, |
| "loss": 0.7598372459411621, |
| "mean_token_accuracy": 0.772556483745575, |
| "num_tokens": 1146107545.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6233253419827951, |
| "grad_norm": 0.2319810688495636, |
| "learning_rate": 1.7391239608153163e-05, |
| "loss": 0.7281291007995605, |
| "mean_token_accuracy": 0.78159399330616, |
| "num_tokens": 1151321289.0, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.6261458186433507, |
| "grad_norm": 0.21929802000522614, |
| "learning_rate": 1.7167049623940557e-05, |
| "loss": 0.7409855365753174, |
| "mean_token_accuracy": 0.7778546661138535, |
| "num_tokens": 1156512899.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.6289662953039064, |
| "grad_norm": 0.23836582899093628, |
| "learning_rate": 1.694355596461562e-05, |
| "loss": 0.7418097972869873, |
| "mean_token_accuracy": 0.7773305416107178, |
| "num_tokens": 1161722642.0, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.631786771964462, |
| "grad_norm": 0.21553729474544525, |
| "learning_rate": 1.6720778498068465e-05, |
| "loss": 0.7374235153198242, |
| "mean_token_accuracy": 0.7792312622070312, |
| "num_tokens": 1166914936.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.6346072486250176, |
| "grad_norm": 0.21120575070381165, |
| "learning_rate": 1.649873702852189e-05, |
| "loss": 0.728809118270874, |
| "mean_token_accuracy": 0.781609109044075, |
| "num_tokens": 1172103530.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.6374277252855732, |
| "grad_norm": 0.22732919454574585, |
| "learning_rate": 1.6277451294770834e-05, |
| "loss": 0.7273163795471191, |
| "mean_token_accuracy": 0.7819408357143403, |
| "num_tokens": 1177324382.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.6402482019461289, |
| "grad_norm": 0.2159194052219391, |
| "learning_rate": 1.60569409684277e-05, |
| "loss": 0.7394785404205322, |
| "mean_token_accuracy": 0.7781968146562577, |
| "num_tokens": 1182546421.0, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.6430686786066845, |
| "grad_norm": 0.21402783691883087, |
| "learning_rate": 1.5837225652173587e-05, |
| "loss": 0.752212381362915, |
| "mean_token_accuracy": 0.7745260059833526, |
| "num_tokens": 1187746895.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.6458891552672401, |
| "grad_norm": 0.21710145473480225, |
| "learning_rate": 1.561832487801565e-05, |
| "loss": 0.7255624294281006, |
| "mean_token_accuracy": 0.7824052214622498, |
| "num_tokens": 1192961255.0, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.6487096319277958, |
| "grad_norm": 0.21105757355690002, |
| "learning_rate": 1.5400258105550813e-05, |
| "loss": 0.7374918460845947, |
| "mean_token_accuracy": 0.7783012241125107, |
| "num_tokens": 1198188396.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.6515301085883515, |
| "grad_norm": 0.21594274044036865, |
| "learning_rate": 1.5183044720235834e-05, |
| "loss": 0.7187402248382568, |
| "mean_token_accuracy": 0.7842638313770294, |
| "num_tokens": 1203372436.0, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.6543505852489071, |
| "grad_norm": 0.23616893589496613, |
| "learning_rate": 1.4966704031664026e-05, |
| "loss": 0.7164999008178711, |
| "mean_token_accuracy": 0.7841055691242218, |
| "num_tokens": 1208563379.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.6571710619094627, |
| "grad_norm": 0.22016112506389618, |
| "learning_rate": 1.4751255271848662e-05, |
| "loss": 0.7453357696533203, |
| "mean_token_accuracy": 0.7763175398111344, |
| "num_tokens": 1213791014.0, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.6599915385700184, |
| "grad_norm": 0.21811263263225555, |
| "learning_rate": 1.453671759351334e-05, |
| "loss": 0.7320215225219726, |
| "mean_token_accuracy": 0.7802564471960067, |
| "num_tokens": 1219001273.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.662812015230574, |
| "grad_norm": 0.21295255422592163, |
| "learning_rate": 1.4323110068389358e-05, |
| "loss": 0.71702880859375, |
| "mean_token_accuracy": 0.7841863363981247, |
| "num_tokens": 1224225308.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.6656324918911296, |
| "grad_norm": 0.2202758491039276, |
| "learning_rate": 1.4110451685520265e-05, |
| "loss": 0.7342299938201904, |
| "mean_token_accuracy": 0.7799709439277649, |
| "num_tokens": 1229436976.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.6684529685516852, |
| "grad_norm": 0.22536863386631012, |
| "learning_rate": 1.3898761349573841e-05, |
| "loss": 0.7358542919158936, |
| "mean_token_accuracy": 0.7793794482946396, |
| "num_tokens": 1234660399.0, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.6712734452122409, |
| "grad_norm": 0.21837250888347626, |
| "learning_rate": 1.368805787916152e-05, |
| "loss": 0.729494047164917, |
| "mean_token_accuracy": 0.7810665190219879, |
| "num_tokens": 1239877232.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.6740939218727965, |
| "grad_norm": 0.21181219816207886, |
| "learning_rate": 1.3478360005165432e-05, |
| "loss": 0.7322878837585449, |
| "mean_token_accuracy": 0.7803588449954987, |
| "num_tokens": 1245083794.0, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.6769143985333521, |
| "grad_norm": 0.24090248346328735, |
| "learning_rate": 1.3269686369073347e-05, |
| "loss": 0.7103838920593262, |
| "mean_token_accuracy": 0.7863076359033585, |
| "num_tokens": 1250308093.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.6797348751939077, |
| "grad_norm": 0.19208142161369324, |
| "learning_rate": 1.306205552132147e-05, |
| "loss": 0.723546314239502, |
| "mean_token_accuracy": 0.7827136069536209, |
| "num_tokens": 1255522176.0, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.6825553518544634, |
| "grad_norm": 0.19714190065860748, |
| "learning_rate": 1.2855485919645355e-05, |
| "loss": 0.7245129585266114, |
| "mean_token_accuracy": 0.7823726564645768, |
| "num_tokens": 1260734730.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.685375828515019, |
| "grad_norm": 0.2126917988061905, |
| "learning_rate": 1.26499959274391e-05, |
| "loss": 0.7271872520446777, |
| "mean_token_accuracy": 0.781196317076683, |
| "num_tokens": 1265950168.0, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.6881963051755746, |
| "grad_norm": 0.21914449334144592, |
| "learning_rate": 1.2445603812122886e-05, |
| "loss": 0.7242794513702393, |
| "mean_token_accuracy": 0.7824690848588943, |
| "num_tokens": 1271176340.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.6910167818361304, |
| "grad_norm": 0.20978210866451263, |
| "learning_rate": 1.224232774351906e-05, |
| "loss": 0.7303569793701172, |
| "mean_token_accuracy": 0.7805366754531861, |
| "num_tokens": 1276388870.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.693837258496686, |
| "grad_norm": 0.20040655136108398, |
| "learning_rate": 1.2040185792236874e-05, |
| "loss": 0.7304568290710449, |
| "mean_token_accuracy": 0.7806206464767456, |
| "num_tokens": 1281614500.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.6966577351572416, |
| "grad_norm": 0.22813160717487335, |
| "learning_rate": 1.1839195928066102e-05, |
| "loss": 0.7257880687713623, |
| "mean_token_accuracy": 0.7822084277868271, |
| "num_tokens": 1286829234.0, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.6994782118177972, |
| "grad_norm": 0.19202908873558044, |
| "learning_rate": 1.1639376018379566e-05, |
| "loss": 0.7248349189758301, |
| "mean_token_accuracy": 0.7827732414007187, |
| "num_tokens": 1292054878.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.7022986884783529, |
| "grad_norm": 0.19687888026237488, |
| "learning_rate": 1.1440743826544753e-05, |
| "loss": 0.7293760776519775, |
| "mean_token_accuracy": 0.7805086255073548, |
| "num_tokens": 1297267890.0, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.7051191651389085, |
| "grad_norm": 0.22136810421943665, |
| "learning_rate": 1.1243317010344759e-05, |
| "loss": 0.7223714828491211, |
| "mean_token_accuracy": 0.7826942443847656, |
| "num_tokens": 1302488964.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.7079396417994641, |
| "grad_norm": 0.21676434576511383, |
| "learning_rate": 1.1047113120408537e-05, |
| "loss": 0.7311611652374268, |
| "mean_token_accuracy": 0.7800871402025222, |
| "num_tokens": 1307701202.0, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.7107601184600197, |
| "grad_norm": 0.2193623036146164, |
| "learning_rate": 1.0852149598650684e-05, |
| "loss": 0.7414857387542725, |
| "mean_token_accuracy": 0.7775449156761169, |
| "num_tokens": 1312918390.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.7135805951205754, |
| "grad_norm": 0.2889624238014221, |
| "learning_rate": 1.0658443776720956e-05, |
| "loss": 0.7338351249694824, |
| "mean_token_accuracy": 0.7796378195285797, |
| "num_tokens": 1318143275.0, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.716401071781131, |
| "grad_norm": 0.19788454473018646, |
| "learning_rate": 1.0466012874463507e-05, |
| "loss": 0.7364720821380615, |
| "mean_token_accuracy": 0.7789324551820755, |
| "num_tokens": 1323318690.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.7192215484416866, |
| "grad_norm": 0.21078291535377502, |
| "learning_rate": 1.0274873998386083e-05, |
| "loss": 0.7365177154541016, |
| "mean_token_accuracy": 0.7786802440881729, |
| "num_tokens": 1328546048.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.7220420251022422, |
| "grad_norm": 0.2020423710346222, |
| "learning_rate": 1.0085044140139353e-05, |
| "loss": 0.7265225410461426, |
| "mean_token_accuracy": 0.7812680572271347, |
| "num_tokens": 1333747221.0, |
| "step": 1280 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1773, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 320, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.269279683030548e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|