{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2094240,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009550003820001528,
      "grad_norm": 1.1688205003738403,
      "learning_rate": 0.00028647758200830823,
      "loss": 4.7726,
      "mean_token_accuracy": 0.29613289506579993,
      "num_tokens": 716800000.0,
      "step": 20000
    },
    {
      "epoch": 0.019100007640003056,
      "grad_norm": 1.0122355222702026,
      "learning_rate": 0.00029724265264455594,
      "loss": 3.5064,
      "mean_token_accuracy": 0.39106567760258915,
      "num_tokens": 1433600000.0,
      "step": 40000
    },
    {
      "epoch": 0.028650011460004583,
      "grad_norm": 0.9957022070884705,
      "learning_rate": 0.0002943487112555509,
      "loss": 3.4122,
      "mean_token_accuracy": 0.40039802242666483,
      "num_tokens": 2150400000.0,
      "step": 60000
    },
    {
      "epoch": 0.03820001528000611,
      "grad_norm": 1.2025322914123535,
      "learning_rate": 0.00029145476986654586,
      "loss": 3.3766,
      "mean_token_accuracy": 0.404039168266952,
      "num_tokens": 2867200000.0,
      "step": 80000
    },
    {
      "epoch": 0.04775001910000764,
      "grad_norm": 1.0380007028579712,
      "learning_rate": 0.0002885608284775408,
      "loss": 3.3565,
      "mean_token_accuracy": 0.40606449462026356,
      "num_tokens": 3584000000.0,
      "step": 100000
    },
    {
      "epoch": 0.057300022920009165,
      "grad_norm": 1.1203994750976562,
      "learning_rate": 0.0002856668870885358,
      "loss": 3.3421,
      "mean_token_accuracy": 0.40754339938014744,
      "num_tokens": 4300800000.0,
      "step": 120000
    },
    {
      "epoch": 0.0668500267400107,
      "grad_norm": 1.1507455110549927,
      "learning_rate": 0.0002827729456995307,
      "loss": 3.3327,
      "mean_token_accuracy": 0.40846395269036295,
      "num_tokens": 5017600000.0,
      "step": 140000
    },
    {
      "epoch": 0.07640003056001222,
      "grad_norm": 1.2616394758224487,
      "learning_rate": 0.0002798790043105257,
      "loss": 3.3239,
      "mean_token_accuracy": 0.40932240092903377,
      "num_tokens": 5734400000.0,
      "step": 160000
    },
    {
      "epoch": 0.08595003438001375,
      "grad_norm": 1.2600022554397583,
      "learning_rate": 0.00027698506292152063,
      "loss": 3.3172,
      "mean_token_accuracy": 0.41004598908573386,
      "num_tokens": 6451200000.0,
      "step": 180000
    },
    {
      "epoch": 0.09550003820001528,
      "grad_norm": 1.1764442920684814,
      "learning_rate": 0.00027409112153251557,
      "loss": 3.3119,
      "mean_token_accuracy": 0.410561589974165,
      "num_tokens": 7168000000.0,
      "step": 200000
    },
    {
      "epoch": 0.1050500420200168,
      "grad_norm": 1.2408899068832397,
      "learning_rate": 0.00027119718014351055,
      "loss": 3.3061,
      "mean_token_accuracy": 0.411199274918437,
      "num_tokens": 7884800000.0,
      "step": 220000
    },
    {
      "epoch": 0.11460004584001833,
      "grad_norm": 1.2953583002090454,
      "learning_rate": 0.0002683032387545055,
      "loss": 3.3012,
      "mean_token_accuracy": 0.41174386738538743,
      "num_tokens": 8601600000.0,
      "step": 240000
    },
    {
      "epoch": 0.12415004966001987,
      "grad_norm": 1.3780796527862549,
      "learning_rate": 0.0002654092973655004,
      "loss": 3.2974,
      "mean_token_accuracy": 0.41211788419932127,
      "num_tokens": 9318400000.0,
      "step": 260000
    },
    {
      "epoch": 0.1337000534800214,
      "grad_norm": 1.2816623449325562,
      "learning_rate": 0.0002625153559764954,
      "loss": 3.2952,
      "mean_token_accuracy": 0.4122900270193815,
      "num_tokens": 10035200000.0,
      "step": 280000
    },
    {
      "epoch": 0.1432500573000229,
      "grad_norm": 1.462896466255188,
      "learning_rate": 0.00025962141458749034,
      "loss": 3.2898,
      "mean_token_accuracy": 0.4128769779801369,
      "num_tokens": 10752000000.0,
      "step": 300000
    },
    {
      "epoch": 0.15280006112002445,
      "grad_norm": 1.4050343036651611,
      "learning_rate": 0.0002567274731984853,
      "loss": 3.2866,
      "mean_token_accuracy": 0.4132985157236457,
      "num_tokens": 11468800000.0,
      "step": 320000
    },
    {
      "epoch": 0.162350064940026,
      "grad_norm": 1.3888232707977295,
      "learning_rate": 0.00025383353180948026,
      "loss": 3.2839,
      "mean_token_accuracy": 0.41356196113973853,
      "num_tokens": 12185600000.0,
      "step": 340000
    },
    {
      "epoch": 0.1719000687600275,
      "grad_norm": 1.3985786437988281,
      "learning_rate": 0.0002509395904204752,
      "loss": 3.2824,
      "mean_token_accuracy": 0.4136822006031871,
      "num_tokens": 12902400000.0,
      "step": 360000
    },
    {
      "epoch": 0.18145007258002904,
      "grad_norm": 1.5853444337844849,
      "learning_rate": 0.00024804564903147013,
      "loss": 3.2796,
      "mean_token_accuracy": 0.41403463195711376,
      "num_tokens": 13619200000.0,
      "step": 380000
    },
    {
      "epoch": 0.19100007640003056,
      "grad_norm": 1.5639410018920898,
      "learning_rate": 0.00024515170764246506,
      "loss": 3.2773,
      "mean_token_accuracy": 0.4142179542243481,
      "num_tokens": 14336000000.0,
      "step": 400000
    },
    {
      "epoch": 0.2005500802200321,
      "grad_norm": 1.4351433515548706,
      "learning_rate": 0.00024225776625346005,
      "loss": 3.2752,
      "mean_token_accuracy": 0.41444926087111233,
      "num_tokens": 15052800000.0,
      "step": 420000
    },
    {
      "epoch": 0.2101000840400336,
      "grad_norm": 1.3414918184280396,
      "learning_rate": 0.00023936382486445498,
      "loss": 3.2725,
      "mean_token_accuracy": 0.41476476649940014,
      "num_tokens": 15769600000.0,
      "step": 440000
    },
    {
      "epoch": 0.21965008786003515,
      "grad_norm": 1.3302001953125,
      "learning_rate": 0.00023646988347544994,
      "loss": 3.2696,
      "mean_token_accuracy": 0.4150806698143482,
      "num_tokens": 16486400000.0,
      "step": 460000
    },
    {
      "epoch": 0.22920009168003666,
      "grad_norm": 1.3756541013717651,
      "learning_rate": 0.00023357594208644488,
      "loss": 3.2683,
      "mean_token_accuracy": 0.4152164765149355,
      "num_tokens": 17203200000.0,
      "step": 480000
    },
    {
      "epoch": 0.2387500955000382,
      "grad_norm": 1.330269694328308,
      "learning_rate": 0.00023068200069743984,
      "loss": 3.2665,
      "mean_token_accuracy": 0.4154003126785159,
      "num_tokens": 17920000000.0,
      "step": 500000
    },
    {
      "epoch": 0.24830009932003974,
      "grad_norm": 1.7756025791168213,
      "learning_rate": 0.00022778805930843483,
      "loss": 3.2648,
      "mean_token_accuracy": 0.4155686768323183,
      "num_tokens": 18636800000.0,
      "step": 520000
    },
    {
      "epoch": 0.25785010314004125,
      "grad_norm": 1.6679662466049194,
      "learning_rate": 0.00022489411791942976,
      "loss": 3.2617,
      "mean_token_accuracy": 0.41595828180462124,
      "num_tokens": 19353600000.0,
      "step": 540000
    },
    {
      "epoch": 0.2674001069600428,
      "grad_norm": 1.5160603523254395,
      "learning_rate": 0.00022200017653042472,
      "loss": 3.2601,
      "mean_token_accuracy": 0.4160811348050833,
      "num_tokens": 716800000.0,
      "step": 560000
    },
    {
      "epoch": 0.27695011078004433,
      "grad_norm": 2.128943681716919,
      "learning_rate": 0.00021910623514141968,
      "loss": 3.259,
      "mean_token_accuracy": 0.41622272021621465,
      "num_tokens": 1433600000.0,
      "step": 580000
    },
    {
      "epoch": 0.2865001146000458,
      "grad_norm": 1.550946593284607,
      "learning_rate": 0.00021621229375241461,
      "loss": 3.2555,
      "mean_token_accuracy": 0.4166563056409359,
      "num_tokens": 2150400000.0,
      "step": 600000
    },
    {
      "epoch": 0.29605011842004736,
      "grad_norm": 1.5084576606750488,
      "learning_rate": 0.00021331835236340957,
      "loss": 3.2543,
      "mean_token_accuracy": 0.4167668910384178,
      "num_tokens": 2867200000.0,
      "step": 620000
    },
    {
      "epoch": 0.3056001222400489,
      "grad_norm": 1.7308917045593262,
      "learning_rate": 0.00021042441097440454,
      "loss": 3.2531,
      "mean_token_accuracy": 0.416923209066689,
      "num_tokens": 3584000000.0,
      "step": 640000
    },
    {
      "epoch": 0.31515012606005044,
      "grad_norm": 1.6496219635009766,
      "learning_rate": 0.00020753046958539947,
      "loss": 3.2503,
      "mean_token_accuracy": 0.4171900423392653,
      "num_tokens": 4300800000.0,
      "step": 660000
    },
    {
      "epoch": 0.324700129880052,
      "grad_norm": 1.911421537399292,
      "learning_rate": 0.00020463652819639443,
      "loss": 3.2494,
      "mean_token_accuracy": 0.4172973266944289,
      "num_tokens": 5017600000.0,
      "step": 680000
    },
    {
      "epoch": 0.33425013370005346,
      "grad_norm": 1.625441074371338,
      "learning_rate": 0.00020174258680738936,
      "loss": 3.2485,
      "mean_token_accuracy": 0.4173892762258649,
      "num_tokens": 5734400000.0,
      "step": 700000
    },
    {
      "epoch": 0.343800137520055,
      "grad_norm": 1.570096731185913,
      "learning_rate": 0.00019884864541838432,
      "loss": 3.2467,
      "mean_token_accuracy": 0.4176161937117577,
      "num_tokens": 6451200000.0,
      "step": 720000
    },
    {
      "epoch": 0.35335014134005654,
      "grad_norm": 1.5831801891326904,
      "learning_rate": 0.00019595470402937928,
      "loss": 3.2441,
      "mean_token_accuracy": 0.41793804090470077,
      "num_tokens": 7168000000.0,
      "step": 740000
    },
    {
      "epoch": 0.3629001451600581,
      "grad_norm": 1.6359102725982666,
      "learning_rate": 0.00019306076264037422,
      "loss": 3.2439,
      "mean_token_accuracy": 0.41790355779081584,
      "num_tokens": 7884800000.0,
      "step": 760000
    },
    {
      "epoch": 0.37245014898005957,
      "grad_norm": 1.9125442504882812,
      "learning_rate": 0.00019016682125136918,
      "loss": 3.2416,
      "mean_token_accuracy": 0.4181554499194026,
      "num_tokens": 8601600000.0,
      "step": 780000
    },
    {
      "epoch": 0.3820001528000611,
      "grad_norm": 1.6356027126312256,
      "learning_rate": 0.0001872728798623641,
      "loss": 3.2392,
      "mean_token_accuracy": 0.41842390780746935,
      "num_tokens": 9318400000.0,
      "step": 800000
    },
    {
      "epoch": 0.39155015662006265,
      "grad_norm": 1.69579017162323,
      "learning_rate": 0.00018437893847335907,
      "loss": 3.2381,
      "mean_token_accuracy": 0.41857809690237047,
      "num_tokens": 10035200000.0,
      "step": 820000
    },
    {
      "epoch": 0.4011001604400642,
      "grad_norm": 1.7878586053848267,
      "learning_rate": 0.00018148499708435403,
      "loss": 3.2375,
      "mean_token_accuracy": 0.41865148201435803,
      "num_tokens": 10752000000.0,
      "step": 840000
    },
    {
      "epoch": 0.41065016426006573,
      "grad_norm": 1.7767938375473022,
      "learning_rate": 0.00017859105569534897,
      "loss": 3.2353,
      "mean_token_accuracy": 0.41889262205660344,
      "num_tokens": 11468800000.0,
      "step": 860000
    },
    {
      "epoch": 0.4202001680800672,
      "grad_norm": 1.6678400039672852,
      "learning_rate": 0.00017569711430634393,
      "loss": 3.2337,
      "mean_token_accuracy": 0.41908056329786775,
      "num_tokens": 12185600000.0,
      "step": 880000
    },
    {
      "epoch": 0.42975017190006876,
      "grad_norm": 1.862349033355713,
      "learning_rate": 0.00017280317291733891,
      "loss": 3.2329,
      "mean_token_accuracy": 0.4191736038953066,
      "num_tokens": 12902400000.0,
      "step": 900000
    },
    {
      "epoch": 0.4393001757200703,
      "grad_norm": 1.8526560068130493,
      "learning_rate": 0.00016990923152833385,
      "loss": 3.232,
      "mean_token_accuracy": 0.4192189232364297,
      "num_tokens": 13619200000.0,
      "step": 920000
    },
    {
      "epoch": 0.44885017954007184,
      "grad_norm": 1.7189236879348755,
      "learning_rate": 0.0001670152901393288,
      "loss": 3.2302,
      "mean_token_accuracy": 0.4194715451017022,
      "num_tokens": 14336000000.0,
      "step": 940000
    },
    {
      "epoch": 0.4584001833600733,
      "grad_norm": 2.0065693855285645,
      "learning_rate": 0.00016412134875032377,
      "loss": 3.2277,
      "mean_token_accuracy": 0.419769259378314,
      "num_tokens": 15052800000.0,
      "step": 960000
    },
    {
      "epoch": 0.46795018718007486,
      "grad_norm": 1.647645354270935,
      "learning_rate": 0.0001612274073613187,
      "loss": 3.2272,
      "mean_token_accuracy": 0.41980642325282097,
      "num_tokens": 15769600000.0,
      "step": 980000
    },
    {
      "epoch": 0.4775001910000764,
      "grad_norm": 1.8431377410888672,
      "learning_rate": 0.00015833346597231366,
      "loss": 3.2244,
      "mean_token_accuracy": 0.42018424404114485,
      "num_tokens": 16486400000.0,
      "step": 1000000
    },
    {
      "epoch": 0.48705019482007794,
      "grad_norm": 1.8152481317520142,
      "learning_rate": 0.0001554395245833086,
      "loss": 3.2239,
      "mean_token_accuracy": 0.42016951968967914,
      "num_tokens": 17203200000.0,
      "step": 1020000
    },
    {
      "epoch": 0.4966001986400795,
      "grad_norm": 1.979134440422058,
      "learning_rate": 0.00015254558319430356,
      "loss": 3.2226,
      "mean_token_accuracy": 0.42036232096105813,
      "num_tokens": 17920000000.0,
      "step": 1040000
    },
    {
      "epoch": 0.506150202460081,
      "grad_norm": 2.2059521675109863,
      "learning_rate": 0.00014965164180529852,
      "loss": 3.2199,
      "mean_token_accuracy": 0.4206514175161719,
      "num_tokens": 18636800000.0,
      "step": 1060000
    },
    {
      "epoch": 0.5157002062800825,
      "grad_norm": 2.0843093395233154,
      "learning_rate": 0.00014675770041629345,
      "loss": 3.2192,
      "mean_token_accuracy": 0.42072975924313066,
      "num_tokens": 19353600000.0,
      "step": 1080000
    },
    {
      "epoch": 0.525250210100084,
      "grad_norm": 1.7924293279647827,
      "learning_rate": 0.0001438637590272884,
      "loss": 3.2178,
      "mean_token_accuracy": 0.4209353769227862,
      "num_tokens": 20070400000.0,
      "step": 1100000
    },
    {
      "epoch": 0.5348002139200856,
      "grad_norm": 1.8186842203140259,
      "learning_rate": 0.00014096981763828334,
      "loss": 3.2166,
      "mean_token_accuracy": 0.42104669906646014,
      "num_tokens": 20787200000.0,
      "step": 1120000
    },
    {
      "epoch": 0.5443502177400871,
      "grad_norm": 2.0781736373901367,
      "learning_rate": 0.0001380758762492783,
      "loss": 3.2147,
      "mean_token_accuracy": 0.42130602815449236,
      "num_tokens": 21504000000.0,
      "step": 1140000
    },
    {
      "epoch": 0.5539002215600887,
      "grad_norm": 2.1371679306030273,
      "learning_rate": 0.00013518193486027327,
      "loss": 3.2135,
      "mean_token_accuracy": 0.421437505723536,
      "num_tokens": 22220800000.0,
      "step": 1160000
    },
    {
      "epoch": 0.5634502253800902,
      "grad_norm": 1.9272387027740479,
      "learning_rate": 0.00013228799347126823,
      "loss": 3.2129,
      "mean_token_accuracy": 0.42148565844893454,
      "num_tokens": 22937600000.0,
      "step": 1180000
    },
    {
      "epoch": 0.5730002292000916,
      "grad_norm": 1.975583553314209,
      "learning_rate": 0.00012939405208226316,
      "loss": 3.2101,
      "mean_token_accuracy": 0.42179442739486694,
      "num_tokens": 23654400000.0,
      "step": 1200000
    },
    {
      "epoch": 0.5825502330200932,
      "grad_norm": 1.8845446109771729,
      "learning_rate": 0.00012650011069325812,
      "loss": 3.209,
      "mean_token_accuracy": 0.42191650020480154,
      "num_tokens": 24371200000.0,
      "step": 1220000
    },
    {
      "epoch": 0.5921002368400947,
      "grad_norm": 1.8266512155532837,
      "learning_rate": 0.00012360616930425308,
      "loss": 3.2075,
      "mean_token_accuracy": 0.4220881850525737,
      "num_tokens": 25088000000.0,
      "step": 1240000
    },
    {
      "epoch": 0.6016502406600963,
      "grad_norm": 1.9238523244857788,
      "learning_rate": 0.00012071222791524803,
      "loss": 3.2051,
      "mean_token_accuracy": 0.42236345537304876,
      "num_tokens": 25804800000.0,
      "step": 1260000
    },
    {
      "epoch": 0.6112002444800978,
      "grad_norm": 1.8870298862457275,
      "learning_rate": 0.00011781828652624297,
      "loss": 3.2044,
      "mean_token_accuracy": 0.4224274314776063,
      "num_tokens": 26521600000.0,
      "step": 1280000
    },
    {
      "epoch": 0.6207502483000993,
      "grad_norm": 2.10017991065979,
      "learning_rate": 0.00011492434513723792,
      "loss": 3.2038,
      "mean_token_accuracy": 0.4225288334354758,
      "num_tokens": 27238400000.0,
      "step": 1300000
    },
    {
      "epoch": 0.6303002521201009,
      "grad_norm": 2.2103898525238037,
      "learning_rate": 0.00011203040374823287,
      "loss": 3.2006,
      "mean_token_accuracy": 0.42296220604628326,
      "num_tokens": 27955200000.0,
      "step": 1320000
    },
    {
      "epoch": 0.6398502559401024,
      "grad_norm": 2.1420845985412598,
      "learning_rate": 0.00010913646235922783,
      "loss": 3.1995,
      "mean_token_accuracy": 0.42305554908663034,
      "num_tokens": 28672000000.0,
      "step": 1340000
    },
    {
      "epoch": 0.649400259760104,
      "grad_norm": 2.468029737472534,
      "learning_rate": 0.00010624252097022279,
      "loss": 3.1994,
      "mean_token_accuracy": 0.4230162718191743,
      "num_tokens": 29388800000.0,
      "step": 1360000
    },
    {
      "epoch": 0.6589502635801054,
      "grad_norm": 2.0718960762023926,
      "learning_rate": 0.00010334857958121774,
      "loss": 3.1964,
      "mean_token_accuracy": 0.4234230771496892,
      "num_tokens": 30105600000.0,
      "step": 1380000
    },
    {
      "epoch": 0.6685002674001069,
      "grad_norm": 2.7991926670074463,
      "learning_rate": 0.00010045463819221268,
      "loss": 3.1943,
      "mean_token_accuracy": 0.4237000042423606,
      "num_tokens": 30822400000.0,
      "step": 1400000
    },
    {
      "epoch": 0.6780502712201085,
      "grad_norm": 2.4362847805023193,
      "learning_rate": 9.756069680320764e-05,
      "loss": 3.1922,
      "mean_token_accuracy": 0.4239600421443582,
      "num_tokens": 31539200000.0,
      "step": 1420000
    },
    {
      "epoch": 0.68760027504011,
      "grad_norm": 2.591127872467041,
      "learning_rate": 9.466675541420259e-05,
      "loss": 3.191,
      "mean_token_accuracy": 0.4240850882053375,
      "num_tokens": 32256000000.0,
      "step": 1440000
    },
    {
      "epoch": 0.6971502788601115,
      "grad_norm": 2.234511375427246,
      "learning_rate": 9.177281402519754e-05,
      "loss": 3.1897,
      "mean_token_accuracy": 0.4242406051442027,
      "num_tokens": 32972800000.0,
      "step": 1460000
    },
    {
      "epoch": 0.7067002826801131,
      "grad_norm": 2.165247917175293,
      "learning_rate": 8.887887263619248e-05,
      "loss": 3.1876,
      "mean_token_accuracy": 0.42447070305049417,
      "num_tokens": 33689600000.0,
      "step": 1480000
    },
    {
      "epoch": 0.7162502865001146,
      "grad_norm": 2.373894453048706,
      "learning_rate": 8.598493124718745e-05,
      "loss": 3.1858,
      "mean_token_accuracy": 0.4246869832932949,
      "num_tokens": 34406400000.0,
      "step": 1500000
    },
    {
      "epoch": 0.7258002903201162,
      "grad_norm": 2.232844352722168,
      "learning_rate": 8.309098985818239e-05,
      "loss": 3.1842,
      "mean_token_accuracy": 0.424876312276721,
      "num_tokens": 35123200000.0,
      "step": 1520000
    },
    {
      "epoch": 0.7353502941401177,
      "grad_norm": 2.4994523525238037,
      "learning_rate": 8.019704846917734e-05,
      "loss": 3.1827,
      "mean_token_accuracy": 0.42511233622431754,
      "num_tokens": 35840000000.0,
      "step": 1540000
    },
    {
      "epoch": 0.7449002979601191,
      "grad_norm": 2.4633235931396484,
      "learning_rate": 7.73031070801723e-05,
      "loss": 3.1798,
      "mean_token_accuracy": 0.4254420336738229,
      "num_tokens": 36556800000.0,
      "step": 1560000
    },
    {
      "epoch": 0.7544503017801207,
      "grad_norm": 2.386373281478882,
      "learning_rate": 7.440916569116725e-05,
      "loss": 3.1786,
      "mean_token_accuracy": 0.4255216441363096,
      "num_tokens": 37273600000.0,
      "step": 1580000
    },
    {
      "epoch": 0.7640003056001222,
      "grad_norm": 2.4472737312316895,
      "learning_rate": 7.151522430216221e-05,
      "loss": 3.1767,
      "mean_token_accuracy": 0.4258053430899978,
      "num_tokens": 37990400000.0,
      "step": 1600000
    },
    {
      "epoch": 0.7735503094201238,
      "grad_norm": 2.5622825622558594,
      "learning_rate": 6.862128291315715e-05,
      "loss": 3.1746,
      "mean_token_accuracy": 0.4260344804123044,
      "num_tokens": 38707200000.0,
      "step": 1620000
    },
    {
      "epoch": 0.7831003132401253,
      "grad_norm": 2.4821906089782715,
      "learning_rate": 6.57273415241521e-05,
      "loss": 3.1723,
      "mean_token_accuracy": 0.4263139396473765,
      "num_tokens": 39424000000.0,
      "step": 1640000
    },
    {
      "epoch": 0.7926503170601268,
      "grad_norm": 2.5140678882598877,
      "learning_rate": 6.283340013514706e-05,
      "loss": 3.1701,
      "mean_token_accuracy": 0.42658785569369795,
      "num_tokens": 40140800000.0,
      "step": 1660000
    },
    {
      "epoch": 0.8022003208801284,
      "grad_norm": 2.634791851043701,
      "learning_rate": 5.9939458746142e-05,
      "loss": 3.1678,
      "mean_token_accuracy": 0.42689967503100634,
      "num_tokens": 40857600000.0,
      "step": 1680000
    },
    {
      "epoch": 0.8117503247001299,
      "grad_norm": 3.0459752082824707,
      "learning_rate": 5.704551735713696e-05,
      "loss": 3.1672,
      "mean_token_accuracy": 0.42699285422861577,
      "num_tokens": 41574400000.0,
      "step": 1700000
    },
    {
      "epoch": 0.8213003285201315,
      "grad_norm": 2.686730146408081,
      "learning_rate": 5.4151575968131916e-05,
      "loss": 3.1642,
      "mean_token_accuracy": 0.42733070581257343,
      "num_tokens": 42291200000.0,
      "step": 1720000
    },
    {
      "epoch": 0.830850332340133,
      "grad_norm": 2.967567205429077,
      "learning_rate": 5.125763457912686e-05,
      "loss": 3.1618,
      "mean_token_accuracy": 0.42769208399355413,
      "num_tokens": 43008000000.0,
      "step": 1740000
    },
    {
      "epoch": 0.8404003361601344,
      "grad_norm": 2.7259535789489746,
      "learning_rate": 4.836369319012181e-05,
      "loss": 3.1596,
      "mean_token_accuracy": 0.42790325200855733,
      "num_tokens": 43724800000.0,
      "step": 1760000
    },
    {
      "epoch": 0.849950339980136,
      "grad_norm": 2.709784746170044,
      "learning_rate": 4.546975180111677e-05,
      "loss": 3.1567,
      "mean_token_accuracy": 0.42829815539866684,
      "num_tokens": 44441600000.0,
      "step": 1780000
    },
    {
      "epoch": 0.8595003438001375,
      "grad_norm": 3.054241418838501,
      "learning_rate": 4.2575810412111724e-05,
      "loss": 3.1551,
      "mean_token_accuracy": 0.42849865667670967,
      "num_tokens": 45158400000.0,
      "step": 1800000
    },
    {
      "epoch": 0.869050347620139,
      "grad_norm": 2.976240873336792,
      "learning_rate": 3.968186902310667e-05,
      "loss": 3.1519,
      "mean_token_accuracy": 0.42887963571995497,
      "num_tokens": 45875200000.0,
      "step": 1820000
    },
    {
      "epoch": 0.8786003514401406,
      "grad_norm": 2.723175525665283,
      "learning_rate": 3.678792763410162e-05,
      "loss": 3.1498,
      "mean_token_accuracy": 0.42917205161750316,
      "num_tokens": 46592000000.0,
      "step": 1840000
    },
    {
      "epoch": 0.8881503552601421,
      "grad_norm": 2.917259693145752,
      "learning_rate": 3.389398624509658e-05,
      "loss": 3.146,
      "mean_token_accuracy": 0.42966016797572376,
      "num_tokens": 47308800000.0,
      "step": 1860000
    },
    {
      "epoch": 0.8977003590801437,
      "grad_norm": 3.254523515701294,
      "learning_rate": 3.1000044856091526e-05,
      "loss": 3.1442,
      "mean_token_accuracy": 0.42988122535943984,
      "num_tokens": 48025600000.0,
      "step": 1880000
    },
    {
      "epoch": 0.9072503629001452,
      "grad_norm": 2.9410157203674316,
      "learning_rate": 2.810610346708648e-05,
      "loss": 3.1408,
      "mean_token_accuracy": 0.43033207250982525,
      "num_tokens": 48742400000.0,
      "step": 1900000
    },
    {
      "epoch": 0.9168003667201466,
      "grad_norm": 3.0534310340881348,
      "learning_rate": 2.521216207808143e-05,
      "loss": 3.1383,
      "mean_token_accuracy": 0.43062722102552653,
      "num_tokens": 49459200000.0,
      "step": 1920000
    },
    {
      "epoch": 0.9263503705401482,
      "grad_norm": 3.0121352672576904,
      "learning_rate": 2.2318220689076384e-05,
      "loss": 3.1352,
      "mean_token_accuracy": 0.43105802639722823,
      "num_tokens": 50176000000.0,
      "step": 1940000
    },
    {
      "epoch": 0.9359003743601497,
      "grad_norm": 3.4966187477111816,
      "learning_rate": 1.9424279300071334e-05,
      "loss": 3.1317,
      "mean_token_accuracy": 0.43149784843176603,
      "num_tokens": 50892800000.0,
      "step": 1960000
    },
    {
      "epoch": 0.9454503781801513,
      "grad_norm": 3.3833205699920654,
      "learning_rate": 1.6530337911066284e-05,
      "loss": 3.1285,
      "mean_token_accuracy": 0.4319396187588572,
      "num_tokens": 51609600000.0,
      "step": 1980000
    },
    {
      "epoch": 0.9550003820001528,
      "grad_norm": 3.30391001701355,
      "learning_rate": 1.3636396522061238e-05,
      "loss": 3.1248,
      "mean_token_accuracy": 0.43242690176963805,
      "num_tokens": 52326400000.0,
      "step": 2000000
    },
    {
      "epoch": 0.9645503858201543,
      "grad_norm": 3.326011896133423,
      "learning_rate": 1.074245513305619e-05,
      "loss": 3.1213,
      "mean_token_accuracy": 0.4328598498493433,
      "num_tokens": 53043200000.0,
      "step": 2020000
    },
    {
      "epoch": 0.9741003896401559,
      "grad_norm": 3.2196035385131836,
      "learning_rate": 7.84851374405114e-06,
      "loss": 3.1176,
      "mean_token_accuracy": 0.4334187435388565,
      "num_tokens": 53760000000.0,
      "step": 2040000
    },
    {
      "epoch": 0.9836503934601574,
      "grad_norm": 3.748178720474243,
      "learning_rate": 4.9545723550460936e-06,
      "loss": 3.1139,
      "mean_token_accuracy": 0.43390719720572235,
      "num_tokens": 54476800000.0,
      "step": 2060000
    },
    {
      "epoch": 0.993200397280159,
      "grad_norm": 3.283731460571289,
      "learning_rate": 2.0606309660410444e-06,
      "loss": 3.1103,
      "mean_token_accuracy": 0.4344122279688716,
      "num_tokens": 55193600000.0,
      "step": 2080000
    }
  ],
  "logging_steps": 20000,
  "max_steps": 2094240,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1815907720024556e+18,
  "train_batch_size": 140,
  "trial_name": null,
  "trial_params": null
}