| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.963671128107075, |
| "eval_steps": 500, |
| "global_step": 650, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0076481835564053535, |
| "grad_norm": 0.9770538524861805, |
| "learning_rate": 1e-05, |
| "loss": 0.5557, |
| "mean_token_accuracy": 0.8261559456586838, |
| "num_tokens": 576000.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.015296367112810707, |
| "grad_norm": 1.0016201329962942, |
| "learning_rate": 9.999996405511328e-06, |
| "loss": 0.5724, |
| "mean_token_accuracy": 0.8204647451639175, |
| "num_tokens": 1152000.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.022944550669216062, |
| "grad_norm": 0.8796352845621198, |
| "learning_rate": 9.999985622050476e-06, |
| "loss": 0.5616, |
| "mean_token_accuracy": 0.8209040015935898, |
| "num_tokens": 1728000.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.030592734225621414, |
| "grad_norm": 0.7858551577048387, |
| "learning_rate": 9.999967649632953e-06, |
| "loss": 0.583, |
| "mean_token_accuracy": 0.8155530691146851, |
| "num_tokens": 2304000.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.03824091778202677, |
| "grad_norm": 0.3692234525413837, |
| "learning_rate": 9.999942488284598e-06, |
| "loss": 0.5371, |
| "mean_token_accuracy": 0.8288730829954147, |
| "num_tokens": 2880000.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.045889101338432124, |
| "grad_norm": 0.337938001083141, |
| "learning_rate": 9.999910138041584e-06, |
| "loss": 0.5252, |
| "mean_token_accuracy": 0.8318870887160301, |
| "num_tokens": 3456000.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.05353728489483748, |
| "grad_norm": 0.3249029142805452, |
| "learning_rate": 9.99987059895043e-06, |
| "loss": 0.5661, |
| "mean_token_accuracy": 0.8197997808456421, |
| "num_tokens": 4032000.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.06118546845124283, |
| "grad_norm": 0.6141658703983041, |
| "learning_rate": 9.999823871067981e-06, |
| "loss": 0.5406, |
| "mean_token_accuracy": 0.8242708295583725, |
| "num_tokens": 4585945.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.06883365200764818, |
| "grad_norm": 0.7987265960698617, |
| "learning_rate": 9.999769954461425e-06, |
| "loss": 0.5426, |
| "mean_token_accuracy": 0.8265986815094948, |
| "num_tokens": 5161945.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.07648183556405354, |
| "grad_norm": 0.7395483390721038, |
| "learning_rate": 9.999708849208279e-06, |
| "loss": 0.5517, |
| "mean_token_accuracy": 0.8210133761167526, |
| "num_tokens": 5737945.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0841300191204589, |
| "grad_norm": 0.6041875583696124, |
| "learning_rate": 9.999640555396404e-06, |
| "loss": 0.5331, |
| "mean_token_accuracy": 0.8267983421683311, |
| "num_tokens": 6313945.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.09177820267686425, |
| "grad_norm": 0.45467723125722653, |
| "learning_rate": 9.999565073123991e-06, |
| "loss": 0.5164, |
| "mean_token_accuracy": 0.8316023647785187, |
| "num_tokens": 6889945.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0994263862332696, |
| "grad_norm": 0.32791299547939345, |
| "learning_rate": 9.999482402499569e-06, |
| "loss": 0.5475, |
| "mean_token_accuracy": 0.8233416005969048, |
| "num_tokens": 7465945.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.10707456978967496, |
| "grad_norm": 0.23684367528291977, |
| "learning_rate": 9.999392543642e-06, |
| "loss": 0.5361, |
| "mean_token_accuracy": 0.8290345445275307, |
| "num_tokens": 8041945.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.1147227533460803, |
| "grad_norm": 0.2528394745096081, |
| "learning_rate": 9.999295496680482e-06, |
| "loss": 0.5142, |
| "mean_token_accuracy": 0.8332222104072571, |
| "num_tokens": 8617945.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.12237093690248566, |
| "grad_norm": 0.3264470002400492, |
| "learning_rate": 9.99919126175455e-06, |
| "loss": 0.5682, |
| "mean_token_accuracy": 0.8158846944570541, |
| "num_tokens": 9193945.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.13001912045889102, |
| "grad_norm": 0.37843802091227036, |
| "learning_rate": 9.999079839014074e-06, |
| "loss": 0.4766, |
| "mean_token_accuracy": 0.8428528308868408, |
| "num_tokens": 9769945.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.13766730401529637, |
| "grad_norm": 0.3930284951759443, |
| "learning_rate": 9.998961228619255e-06, |
| "loss": 0.5433, |
| "mean_token_accuracy": 0.8239961341023445, |
| "num_tokens": 10345945.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.14531548757170173, |
| "grad_norm": 0.38145764149019024, |
| "learning_rate": 9.99883543074063e-06, |
| "loss": 0.5147, |
| "mean_token_accuracy": 0.8341441303491592, |
| "num_tokens": 10921945.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.15296367112810708, |
| "grad_norm": 0.37363877853538846, |
| "learning_rate": 9.998702445559071e-06, |
| "loss": 0.5283, |
| "mean_token_accuracy": 0.8283886983990669, |
| "num_tokens": 11497945.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.16061185468451242, |
| "grad_norm": 0.34011521406149564, |
| "learning_rate": 9.998562273265786e-06, |
| "loss": 0.518, |
| "mean_token_accuracy": 0.831451490521431, |
| "num_tokens": 12060592.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.1682600382409178, |
| "grad_norm": 0.2676333142200147, |
| "learning_rate": 9.99841491406231e-06, |
| "loss": 0.5285, |
| "mean_token_accuracy": 0.8288713470101357, |
| "num_tokens": 12636592.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.17590822179732313, |
| "grad_norm": 0.24262999668848875, |
| "learning_rate": 9.99826036816052e-06, |
| "loss": 0.5199, |
| "mean_token_accuracy": 0.8309321850538254, |
| "num_tokens": 13212592.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.1835564053537285, |
| "grad_norm": 0.21796704803826206, |
| "learning_rate": 9.998098635782616e-06, |
| "loss": 0.5281, |
| "mean_token_accuracy": 0.8288609310984612, |
| "num_tokens": 13788592.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.19120458891013384, |
| "grad_norm": 0.20760778914940356, |
| "learning_rate": 9.997929717161142e-06, |
| "loss": 0.5368, |
| "mean_token_accuracy": 0.8261212259531021, |
| "num_tokens": 14364592.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.1988527724665392, |
| "grad_norm": 0.25167231093180664, |
| "learning_rate": 9.997753612538963e-06, |
| "loss": 0.4973, |
| "mean_token_accuracy": 0.8385418727993965, |
| "num_tokens": 14940592.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.20650095602294455, |
| "grad_norm": 0.2755218609160029, |
| "learning_rate": 9.997570322169285e-06, |
| "loss": 0.5221, |
| "mean_token_accuracy": 0.8294894322752953, |
| "num_tokens": 15516592.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.21414913957934992, |
| "grad_norm": 0.2808823032638946, |
| "learning_rate": 9.99737984631564e-06, |
| "loss": 0.5264, |
| "mean_token_accuracy": 0.8296040147542953, |
| "num_tokens": 16092592.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.22179732313575526, |
| "grad_norm": 0.2583300789913383, |
| "learning_rate": 9.997182185251896e-06, |
| "loss": 0.5098, |
| "mean_token_accuracy": 0.8332170099020004, |
| "num_tokens": 16668592.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.2294455066921606, |
| "grad_norm": 0.25870516586563114, |
| "learning_rate": 9.996977339262247e-06, |
| "loss": 0.4964, |
| "mean_token_accuracy": 0.8371130004525185, |
| "num_tokens": 17244592.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.23709369024856597, |
| "grad_norm": 0.2296798543288584, |
| "learning_rate": 9.996765308641218e-06, |
| "loss": 0.5203, |
| "mean_token_accuracy": 0.8292376697063446, |
| "num_tokens": 17820592.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.2447418738049713, |
| "grad_norm": 0.20940808224809382, |
| "learning_rate": 9.996546093693671e-06, |
| "loss": 0.5297, |
| "mean_token_accuracy": 0.828244574368, |
| "num_tokens": 18396592.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.25239005736137665, |
| "grad_norm": 0.15777243777113475, |
| "learning_rate": 9.996319694734787e-06, |
| "loss": 0.4841, |
| "mean_token_accuracy": 0.8402329161763191, |
| "num_tokens": 18972592.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.26003824091778205, |
| "grad_norm": 0.17599866369775718, |
| "learning_rate": 9.996086112090085e-06, |
| "loss": 0.5302, |
| "mean_token_accuracy": 0.8264007493853569, |
| "num_tokens": 19548592.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.2676864244741874, |
| "grad_norm": 0.197575035953607, |
| "learning_rate": 9.995845346095406e-06, |
| "loss": 0.5239, |
| "mean_token_accuracy": 0.8283487483859062, |
| "num_tokens": 20124592.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.27533460803059273, |
| "grad_norm": 0.18551509540101133, |
| "learning_rate": 9.995597397096923e-06, |
| "loss": 0.5166, |
| "mean_token_accuracy": 0.8316527083516121, |
| "num_tokens": 20700592.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.2829827915869981, |
| "grad_norm": 0.18027752934372468, |
| "learning_rate": 9.995342265451138e-06, |
| "loss": 0.4946, |
| "mean_token_accuracy": 0.8379411548376083, |
| "num_tokens": 21276592.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.29063097514340347, |
| "grad_norm": 0.1783882139249774, |
| "learning_rate": 9.995079951524876e-06, |
| "loss": 0.5298, |
| "mean_token_accuracy": 0.8277133032679558, |
| "num_tokens": 21852592.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.2982791586998088, |
| "grad_norm": 0.1617380846519598, |
| "learning_rate": 9.994810455695291e-06, |
| "loss": 0.481, |
| "mean_token_accuracy": 0.8421861305832863, |
| "num_tokens": 22428592.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.30592734225621415, |
| "grad_norm": 0.17083879189298595, |
| "learning_rate": 9.994533778349867e-06, |
| "loss": 0.5269, |
| "mean_token_accuracy": 0.828789733350277, |
| "num_tokens": 23004592.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.3135755258126195, |
| "grad_norm": 0.1509185021498891, |
| "learning_rate": 9.994249919886402e-06, |
| "loss": 0.5004, |
| "mean_token_accuracy": 0.8369046598672867, |
| "num_tokens": 23580592.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.32122370936902483, |
| "grad_norm": 0.1511275570122673, |
| "learning_rate": 9.993958880713033e-06, |
| "loss": 0.5094, |
| "mean_token_accuracy": 0.8330798596143723, |
| "num_tokens": 24156592.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.32887189292543023, |
| "grad_norm": 0.16474397925550863, |
| "learning_rate": 9.99366066124821e-06, |
| "loss": 0.5083, |
| "mean_token_accuracy": 0.8331527784466743, |
| "num_tokens": 24732592.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.3365200764818356, |
| "grad_norm": 0.1510059303541963, |
| "learning_rate": 9.993355261920714e-06, |
| "loss": 0.4811, |
| "mean_token_accuracy": 0.8414968326687813, |
| "num_tokens": 25306492.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.3441682600382409, |
| "grad_norm": 0.2206211577243794, |
| "learning_rate": 9.993042683169647e-06, |
| "loss": 0.5145, |
| "mean_token_accuracy": 0.8324548229575157, |
| "num_tokens": 25882492.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.35181644359464626, |
| "grad_norm": 0.19826261020124922, |
| "learning_rate": 9.992722925444434e-06, |
| "loss": 0.525, |
| "mean_token_accuracy": 0.8288452923297882, |
| "num_tokens": 26458492.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.35946462715105165, |
| "grad_norm": 0.1539436477198697, |
| "learning_rate": 9.992395989204818e-06, |
| "loss": 0.531, |
| "mean_token_accuracy": 0.8279685229063034, |
| "num_tokens": 27034492.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.367112810707457, |
| "grad_norm": 0.14749502264744746, |
| "learning_rate": 9.992061874920869e-06, |
| "loss": 0.5213, |
| "mean_token_accuracy": 0.8304547518491745, |
| "num_tokens": 27610492.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.37476099426386233, |
| "grad_norm": 0.15532363519268472, |
| "learning_rate": 9.991720583072975e-06, |
| "loss": 0.4906, |
| "mean_token_accuracy": 0.8390665575861931, |
| "num_tokens": 28178987.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.3824091778202677, |
| "grad_norm": 0.23556356163528056, |
| "learning_rate": 9.991372114151843e-06, |
| "loss": 0.5353, |
| "mean_token_accuracy": 0.826218456029892, |
| "num_tokens": 28754987.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.390057361376673, |
| "grad_norm": 0.15385965560459058, |
| "learning_rate": 9.9910164686585e-06, |
| "loss": 0.5187, |
| "mean_token_accuracy": 0.8308228179812431, |
| "num_tokens": 29330987.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.3977055449330784, |
| "grad_norm": 0.1436263981933282, |
| "learning_rate": 9.990653647104292e-06, |
| "loss": 0.5153, |
| "mean_token_accuracy": 0.8325798138976097, |
| "num_tokens": 29906987.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.40535372848948376, |
| "grad_norm": 0.14302536490724446, |
| "learning_rate": 9.990283650010883e-06, |
| "loss": 0.4872, |
| "mean_token_accuracy": 0.8397606834769249, |
| "num_tokens": 30482987.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.4130019120458891, |
| "grad_norm": 0.17140284653127127, |
| "learning_rate": 9.98990647791025e-06, |
| "loss": 0.5318, |
| "mean_token_accuracy": 0.8275379464030266, |
| "num_tokens": 31058987.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.42065009560229444, |
| "grad_norm": 0.15486894690765854, |
| "learning_rate": 9.989522131344693e-06, |
| "loss": 0.462, |
| "mean_token_accuracy": 0.8481464609503746, |
| "num_tokens": 31634987.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.42829827915869984, |
| "grad_norm": 0.14715377366317914, |
| "learning_rate": 9.989130610866822e-06, |
| "loss": 0.4978, |
| "mean_token_accuracy": 0.8358698934316635, |
| "num_tokens": 32210987.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.4359464627151052, |
| "grad_norm": 0.133253236290824, |
| "learning_rate": 9.988731917039564e-06, |
| "loss": 0.5115, |
| "mean_token_accuracy": 0.8322514817118645, |
| "num_tokens": 32784624.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.4435946462715105, |
| "grad_norm": 0.1424854004952171, |
| "learning_rate": 9.988326050436158e-06, |
| "loss": 0.5227, |
| "mean_token_accuracy": 0.8293939307332039, |
| "num_tokens": 33360624.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.45124282982791586, |
| "grad_norm": 0.14122226382716493, |
| "learning_rate": 9.987913011640157e-06, |
| "loss": 0.5248, |
| "mean_token_accuracy": 0.8290501683950424, |
| "num_tokens": 33936624.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.4588910133843212, |
| "grad_norm": 0.13993101979440462, |
| "learning_rate": 9.987492801245431e-06, |
| "loss": 0.4793, |
| "mean_token_accuracy": 0.8423371836543083, |
| "num_tokens": 34512624.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.4665391969407266, |
| "grad_norm": 0.14632054172475426, |
| "learning_rate": 9.98706541985615e-06, |
| "loss": 0.5469, |
| "mean_token_accuracy": 0.821442224085331, |
| "num_tokens": 35074731.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.47418738049713194, |
| "grad_norm": 0.1400105720290886, |
| "learning_rate": 9.986630868086804e-06, |
| "loss": 0.5063, |
| "mean_token_accuracy": 0.8351511061191559, |
| "num_tokens": 35650731.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.4818355640535373, |
| "grad_norm": 0.14863573475989134, |
| "learning_rate": 9.986189146562191e-06, |
| "loss": 0.5222, |
| "mean_token_accuracy": 0.8316596522927284, |
| "num_tokens": 36226731.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.4894837476099426, |
| "grad_norm": 0.14605144207294682, |
| "learning_rate": 9.985740255917413e-06, |
| "loss": 0.4925, |
| "mean_token_accuracy": 0.8410836607217789, |
| "num_tokens": 36802731.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.497131931166348, |
| "grad_norm": 0.1357927018597777, |
| "learning_rate": 9.985284196797884e-06, |
| "loss": 0.5007, |
| "mean_token_accuracy": 0.835041731595993, |
| "num_tokens": 37378731.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.5047801147227533, |
| "grad_norm": 0.16344701399255454, |
| "learning_rate": 9.984820969859326e-06, |
| "loss": 0.5519, |
| "mean_token_accuracy": 0.8199577778577805, |
| "num_tokens": 37954731.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.5124282982791587, |
| "grad_norm": 0.13295009461309124, |
| "learning_rate": 9.984350575767763e-06, |
| "loss": 0.4968, |
| "mean_token_accuracy": 0.8373057246208191, |
| "num_tokens": 38530731.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.5200764818355641, |
| "grad_norm": 0.1316301231700835, |
| "learning_rate": 9.983873015199524e-06, |
| "loss": 0.4541, |
| "mean_token_accuracy": 0.8496673479676247, |
| "num_tokens": 39106731.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.5277246653919694, |
| "grad_norm": 0.1381202418270699, |
| "learning_rate": 9.983388288841246e-06, |
| "loss": 0.5195, |
| "mean_token_accuracy": 0.8305971175432205, |
| "num_tokens": 39682731.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.5353728489483748, |
| "grad_norm": 0.13415757088597044, |
| "learning_rate": 9.982896397389866e-06, |
| "loss": 0.4641, |
| "mean_token_accuracy": 0.8476342782378197, |
| "num_tokens": 40258731.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.5430210325047801, |
| "grad_norm": 0.13977158090215638, |
| "learning_rate": 9.98239734155262e-06, |
| "loss": 0.5204, |
| "mean_token_accuracy": 0.8300554230809212, |
| "num_tokens": 40834731.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.5506692160611855, |
| "grad_norm": 0.1414811559776876, |
| "learning_rate": 9.981891122047052e-06, |
| "loss": 0.5039, |
| "mean_token_accuracy": 0.8352587595582008, |
| "num_tokens": 41410731.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.5583173996175909, |
| "grad_norm": 0.14675842034623368, |
| "learning_rate": 9.981377739601002e-06, |
| "loss": 0.5249, |
| "mean_token_accuracy": 0.8289755135774612, |
| "num_tokens": 41986731.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.5659655831739961, |
| "grad_norm": 0.1422646453200256, |
| "learning_rate": 9.980857194952605e-06, |
| "loss": 0.528, |
| "mean_token_accuracy": 0.8280744329094887, |
| "num_tokens": 42562731.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.5736137667304015, |
| "grad_norm": 0.1428993249729862, |
| "learning_rate": 9.980329488850303e-06, |
| "loss": 0.5147, |
| "mean_token_accuracy": 0.8318679928779602, |
| "num_tokens": 43138731.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.5812619502868069, |
| "grad_norm": 0.13904745870216947, |
| "learning_rate": 9.979794622052825e-06, |
| "loss": 0.5101, |
| "mean_token_accuracy": 0.8323385044932365, |
| "num_tokens": 43714731.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.5889101338432122, |
| "grad_norm": 0.1408937048587796, |
| "learning_rate": 9.979252595329204e-06, |
| "loss": 0.4874, |
| "mean_token_accuracy": 0.8402296751737595, |
| "num_tokens": 44267402.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.5965583173996176, |
| "grad_norm": 0.14639439502500298, |
| "learning_rate": 9.97870340945876e-06, |
| "loss": 0.5191, |
| "mean_token_accuracy": 0.8298158273100853, |
| "num_tokens": 44843402.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.6042065009560229, |
| "grad_norm": 0.14225383180098447, |
| "learning_rate": 9.978147065231114e-06, |
| "loss": 0.501, |
| "mean_token_accuracy": 0.8345382362604141, |
| "num_tokens": 45419402.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.6118546845124283, |
| "grad_norm": 0.15165811102237464, |
| "learning_rate": 9.97758356344617e-06, |
| "loss": 0.5053, |
| "mean_token_accuracy": 0.8332482650876045, |
| "num_tokens": 45995402.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.6195028680688337, |
| "grad_norm": 0.13839520774785488, |
| "learning_rate": 9.977012904914133e-06, |
| "loss": 0.5085, |
| "mean_token_accuracy": 0.8349983245134354, |
| "num_tokens": 46571402.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.627151051625239, |
| "grad_norm": 0.14712644887933124, |
| "learning_rate": 9.97643509045549e-06, |
| "loss": 0.5471, |
| "mean_token_accuracy": 0.8224283754825592, |
| "num_tokens": 47147402.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.6347992351816444, |
| "grad_norm": 0.13934749804071309, |
| "learning_rate": 9.975850120901023e-06, |
| "loss": 0.5107, |
| "mean_token_accuracy": 0.832944430410862, |
| "num_tokens": 47723402.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.6424474187380497, |
| "grad_norm": 0.14184064832659418, |
| "learning_rate": 9.975257997091795e-06, |
| "loss": 0.5019, |
| "mean_token_accuracy": 0.8352049514651299, |
| "num_tokens": 48299402.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.6500956022944551, |
| "grad_norm": 0.1592732332566689, |
| "learning_rate": 9.974658719879163e-06, |
| "loss": 0.5118, |
| "mean_token_accuracy": 0.8321266919374466, |
| "num_tokens": 48875402.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.6577437858508605, |
| "grad_norm": 0.13974194258972175, |
| "learning_rate": 9.97405229012476e-06, |
| "loss": 0.5183, |
| "mean_token_accuracy": 0.8313193619251251, |
| "num_tokens": 49451402.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.6653919694072657, |
| "grad_norm": 0.13246562428620298, |
| "learning_rate": 9.973438708700513e-06, |
| "loss": 0.4767, |
| "mean_token_accuracy": 0.843241736292839, |
| "num_tokens": 50027402.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.6730401529636711, |
| "grad_norm": 0.16684991357812562, |
| "learning_rate": 9.972817976488623e-06, |
| "loss": 0.4971, |
| "mean_token_accuracy": 0.8372588530182838, |
| "num_tokens": 50603402.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.6806883365200764, |
| "grad_norm": 0.13340574719306686, |
| "learning_rate": 9.972190094381578e-06, |
| "loss": 0.4949, |
| "mean_token_accuracy": 0.8363786041736603, |
| "num_tokens": 51179402.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.6883365200764818, |
| "grad_norm": 0.14359639069142585, |
| "learning_rate": 9.971555063282145e-06, |
| "loss": 0.4855, |
| "mean_token_accuracy": 0.8407451063394547, |
| "num_tokens": 51755402.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.6959847036328872, |
| "grad_norm": 0.1465843545803842, |
| "learning_rate": 9.970912884103365e-06, |
| "loss": 0.5498, |
| "mean_token_accuracy": 0.8212963715195656, |
| "num_tokens": 52331402.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.7036328871892925, |
| "grad_norm": 0.13373715318944093, |
| "learning_rate": 9.970263557768565e-06, |
| "loss": 0.5128, |
| "mean_token_accuracy": 0.8318002820014954, |
| "num_tokens": 52907402.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.7112810707456979, |
| "grad_norm": 0.1425977955340504, |
| "learning_rate": 9.96960708521134e-06, |
| "loss": 0.4922, |
| "mean_token_accuracy": 0.8385525420308113, |
| "num_tokens": 53473097.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.7189292543021033, |
| "grad_norm": 0.1414493480461768, |
| "learning_rate": 9.968943467375563e-06, |
| "loss": 0.5015, |
| "mean_token_accuracy": 0.8355400264263153, |
| "num_tokens": 54049097.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.7265774378585086, |
| "grad_norm": 0.13127380227428548, |
| "learning_rate": 9.968272705215382e-06, |
| "loss": 0.4915, |
| "mean_token_accuracy": 0.8390123769640923, |
| "num_tokens": 54609292.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.734225621414914, |
| "grad_norm": 0.13467129663035832, |
| "learning_rate": 9.967594799695218e-06, |
| "loss": 0.4794, |
| "mean_token_accuracy": 0.8427561372518539, |
| "num_tokens": 55170798.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.7418738049713193, |
| "grad_norm": 0.13621387326515388, |
| "learning_rate": 9.966909751789758e-06, |
| "loss": 0.487, |
| "mean_token_accuracy": 0.8411270678043365, |
| "num_tokens": 55746798.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.7495219885277247, |
| "grad_norm": 0.14264410268094876, |
| "learning_rate": 9.96621756248396e-06, |
| "loss": 0.4951, |
| "mean_token_accuracy": 0.8365452662110329, |
| "num_tokens": 56322798.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.7571701720841301, |
| "grad_norm": 0.13879533507650615, |
| "learning_rate": 9.965518232773052e-06, |
| "loss": 0.5045, |
| "mean_token_accuracy": 0.8338455036282539, |
| "num_tokens": 56898798.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.7648183556405354, |
| "grad_norm": 0.13281867848059528, |
| "learning_rate": 9.964811763662528e-06, |
| "loss": 0.4848, |
| "mean_token_accuracy": 0.8396026864647865, |
| "num_tokens": 57474798.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.7724665391969407, |
| "grad_norm": 0.14634204976216025, |
| "learning_rate": 9.964098156168143e-06, |
| "loss": 0.5432, |
| "mean_token_accuracy": 0.822074182331562, |
| "num_tokens": 58050798.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.780114722753346, |
| "grad_norm": 0.14047489451950268, |
| "learning_rate": 9.963377411315922e-06, |
| "loss": 0.5234, |
| "mean_token_accuracy": 0.829129159450531, |
| "num_tokens": 58625739.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.7877629063097514, |
| "grad_norm": 0.14274210766707315, |
| "learning_rate": 9.962649530142147e-06, |
| "loss": 0.5335, |
| "mean_token_accuracy": 0.8258555829524994, |
| "num_tokens": 59201739.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.7954110898661568, |
| "grad_norm": 0.14094212771766002, |
| "learning_rate": 9.961914513693362e-06, |
| "loss": 0.5211, |
| "mean_token_accuracy": 0.8285171613097191, |
| "num_tokens": 59777739.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.8030592734225621, |
| "grad_norm": 0.13762839208797772, |
| "learning_rate": 9.96117236302637e-06, |
| "loss": 0.5069, |
| "mean_token_accuracy": 0.8346649780869484, |
| "num_tokens": 60353739.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.8107074569789675, |
| "grad_norm": 0.1465369564636583, |
| "learning_rate": 9.960423079208235e-06, |
| "loss": 0.4612, |
| "mean_token_accuracy": 0.8477652445435524, |
| "num_tokens": 60907009.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.8183556405353728, |
| "grad_norm": 0.14637852098727955, |
| "learning_rate": 9.959666663316269e-06, |
| "loss": 0.5497, |
| "mean_token_accuracy": 0.8230603337287903, |
| "num_tokens": 61483009.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.8260038240917782, |
| "grad_norm": 0.1347219196728508, |
| "learning_rate": 9.958903116438049e-06, |
| "loss": 0.4858, |
| "mean_token_accuracy": 0.841705210506916, |
| "num_tokens": 62059009.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.8336520076481836, |
| "grad_norm": 0.13871053974279857, |
| "learning_rate": 9.958132439671392e-06, |
| "loss": 0.5269, |
| "mean_token_accuracy": 0.8272046074271202, |
| "num_tokens": 62635009.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.8413001912045889, |
| "grad_norm": 0.13866989193780266, |
| "learning_rate": 9.95735463412438e-06, |
| "loss": 0.4786, |
| "mean_token_accuracy": 0.8430299237370491, |
| "num_tokens": 63211009.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.8489483747609943, |
| "grad_norm": 0.1423988117487116, |
| "learning_rate": 9.956569700915338e-06, |
| "loss": 0.5059, |
| "mean_token_accuracy": 0.8349323570728302, |
| "num_tokens": 63787009.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.8565965583173997, |
| "grad_norm": 0.16163953173723308, |
| "learning_rate": 9.955777641172836e-06, |
| "loss": 0.4767, |
| "mean_token_accuracy": 0.8433250710368156, |
| "num_tokens": 64363009.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.864244741873805, |
| "grad_norm": 0.13530156968054668, |
| "learning_rate": 9.954978456035695e-06, |
| "loss": 0.4691, |
| "mean_token_accuracy": 0.8455977290868759, |
| "num_tokens": 64939009.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.8718929254302104, |
| "grad_norm": 0.13247546863564885, |
| "learning_rate": 9.95417214665298e-06, |
| "loss": 0.5163, |
| "mean_token_accuracy": 0.8312828913331032, |
| "num_tokens": 65515009.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.8795411089866156, |
| "grad_norm": 0.16495562650776383, |
| "learning_rate": 9.953358714183999e-06, |
| "loss": 0.5142, |
| "mean_token_accuracy": 0.8312735706567764, |
| "num_tokens": 66073393.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.887189292543021, |
| "grad_norm": 0.136972715382717, |
| "learning_rate": 9.9525381597983e-06, |
| "loss": 0.5155, |
| "mean_token_accuracy": 0.8300295248627663, |
| "num_tokens": 66643616.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.8948374760994264, |
| "grad_norm": 0.24477498311580997, |
| "learning_rate": 9.951710484675677e-06, |
| "loss": 0.4987, |
| "mean_token_accuracy": 0.8363108783960342, |
| "num_tokens": 67219616.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.9024856596558317, |
| "grad_norm": 0.15547014864530215, |
| "learning_rate": 9.950875690006152e-06, |
| "loss": 0.525, |
| "mean_token_accuracy": 0.8279963135719299, |
| "num_tokens": 67795616.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.9101338432122371, |
| "grad_norm": 0.13565975154579993, |
| "learning_rate": 9.950033776989994e-06, |
| "loss": 0.4875, |
| "mean_token_accuracy": 0.8386981412768364, |
| "num_tokens": 68371616.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.9177820267686424, |
| "grad_norm": 0.14096319378387628, |
| "learning_rate": 9.949184746837697e-06, |
| "loss": 0.5275, |
| "mean_token_accuracy": 0.8280240818858147, |
| "num_tokens": 68947616.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.9254302103250478, |
| "grad_norm": 0.14179059237775263, |
| "learning_rate": 9.948328600769996e-06, |
| "loss": 0.5041, |
| "mean_token_accuracy": 0.834081619977951, |
| "num_tokens": 69523616.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.9330783938814532, |
| "grad_norm": 0.13988551396496146, |
| "learning_rate": 9.947465340017853e-06, |
| "loss": 0.5219, |
| "mean_token_accuracy": 0.8292012140154839, |
| "num_tokens": 70099616.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.9407265774378585, |
| "grad_norm": 0.13758598960469923, |
| "learning_rate": 9.94659496582246e-06, |
| "loss": 0.4947, |
| "mean_token_accuracy": 0.8360296338796616, |
| "num_tokens": 70675616.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.9483747609942639, |
| "grad_norm": 0.1332527555285676, |
| "learning_rate": 9.945717479435236e-06, |
| "loss": 0.4987, |
| "mean_token_accuracy": 0.8360331058502197, |
| "num_tokens": 71251616.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.9560229445506692, |
| "grad_norm": 0.1346620611912445, |
| "learning_rate": 9.94483288211783e-06, |
| "loss": 0.4766, |
| "mean_token_accuracy": 0.8422486484050751, |
| "num_tokens": 71827616.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.9636711281070746, |
| "grad_norm": 0.1412101809738773, |
| "learning_rate": 9.943941175142109e-06, |
| "loss": 0.5009, |
| "mean_token_accuracy": 0.8348843902349472, |
| "num_tokens": 72389881.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.97131931166348, |
| "grad_norm": 0.1468591145222982, |
| "learning_rate": 9.943042359790168e-06, |
| "loss": 0.5318, |
| "mean_token_accuracy": 0.8273720592260361, |
| "num_tokens": 72963213.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.9789674952198852, |
| "grad_norm": 0.1920244752165593, |
| "learning_rate": 9.942136437354316e-06, |
| "loss": 0.5495, |
| "mean_token_accuracy": 0.8214943110942841, |
| "num_tokens": 73539213.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.9866156787762906, |
| "grad_norm": 0.15090295706963755, |
| "learning_rate": 9.941223409137088e-06, |
| "loss": 0.5079, |
| "mean_token_accuracy": 0.8330902680754662, |
| "num_tokens": 74115213.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.994263862332696, |
| "grad_norm": 0.13592806085375692, |
| "learning_rate": 9.94030327645123e-06, |
| "loss": 0.5012, |
| "mean_token_accuracy": 0.8337656334042549, |
| "num_tokens": 74691213.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.15427033521640424, |
| "learning_rate": 9.939376040619707e-06, |
| "loss": 0.4586, |
| "mean_token_accuracy": 0.8492923180262247, |
| "num_tokens": 75123213.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.0076481835564053, |
| "grad_norm": 0.1365403277523185, |
| "learning_rate": 9.938441702975689e-06, |
| "loss": 0.4728, |
| "mean_token_accuracy": 0.844464011490345, |
| "num_tokens": 75699213.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.0152963671128108, |
| "grad_norm": 0.13190000895931886, |
| "learning_rate": 9.937500264862567e-06, |
| "loss": 0.4705, |
| "mean_token_accuracy": 0.8449570834636688, |
| "num_tokens": 76275213.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.022944550669216, |
| "grad_norm": 0.1389937992497058, |
| "learning_rate": 9.936551727633934e-06, |
| "loss": 0.4889, |
| "mean_token_accuracy": 0.8385332003235817, |
| "num_tokens": 76851213.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.0305927342256214, |
| "grad_norm": 0.12744266634531476, |
| "learning_rate": 9.935596092653596e-06, |
| "loss": 0.4817, |
| "mean_token_accuracy": 0.8396877720952034, |
| "num_tokens": 77427213.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.0382409177820269, |
| "grad_norm": 0.13524852499452464, |
| "learning_rate": 9.934633361295558e-06, |
| "loss": 0.4857, |
| "mean_token_accuracy": 0.8397051244974136, |
| "num_tokens": 78003213.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.0458891013384322, |
| "grad_norm": 0.1399382135972632, |
| "learning_rate": 9.933663534944029e-06, |
| "loss": 0.4856, |
| "mean_token_accuracy": 0.8398839607834816, |
| "num_tokens": 78579213.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.0535372848948374, |
| "grad_norm": 0.13322210319690403, |
| "learning_rate": 9.932686614993425e-06, |
| "loss": 0.4648, |
| "mean_token_accuracy": 0.845810703933239, |
| "num_tokens": 79141860.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.0611854684512427, |
| "grad_norm": 0.13213772381798283, |
| "learning_rate": 9.931702602848354e-06, |
| "loss": 0.4571, |
| "mean_token_accuracy": 0.8474953845143318, |
| "num_tokens": 79717860.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.0688336520076482, |
| "grad_norm": 0.1331479705432894, |
| "learning_rate": 9.930711499923626e-06, |
| "loss": 0.4984, |
| "mean_token_accuracy": 0.835312582552433, |
| "num_tokens": 80293860.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.0764818355640535, |
| "grad_norm": 0.14215781774983868, |
| "learning_rate": 9.929713307644245e-06, |
| "loss": 0.4788, |
| "mean_token_accuracy": 0.8420683667063713, |
| "num_tokens": 80846531.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.0841300191204588, |
| "grad_norm": 0.12936783165938912, |
| "learning_rate": 9.928708027445403e-06, |
| "loss": 0.5017, |
| "mean_token_accuracy": 0.8367559015750885, |
| "num_tokens": 81416754.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.0917782026768643, |
| "grad_norm": 0.13361158262185466, |
| "learning_rate": 9.927695660772492e-06, |
| "loss": 0.488, |
| "mean_token_accuracy": 0.8376425430178642, |
| "num_tokens": 81992754.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.0994263862332696, |
| "grad_norm": 0.16213706049923526, |
| "learning_rate": 9.926676209081085e-06, |
| "loss": 0.5066, |
| "mean_token_accuracy": 0.8328819274902344, |
| "num_tokens": 82568754.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.107074569789675, |
| "grad_norm": 0.1359440567719058, |
| "learning_rate": 9.925649673836949e-06, |
| "loss": 0.5083, |
| "mean_token_accuracy": 0.8328124806284904, |
| "num_tokens": 83144754.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.1147227533460804, |
| "grad_norm": 0.1295608761839531, |
| "learning_rate": 9.924616056516027e-06, |
| "loss": 0.4879, |
| "mean_token_accuracy": 0.8381772711873055, |
| "num_tokens": 83720754.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.1223709369024857, |
| "grad_norm": 0.13635991726116314, |
| "learning_rate": 9.923575358604454e-06, |
| "loss": 0.5083, |
| "mean_token_accuracy": 0.8336632177233696, |
| "num_tokens": 84296754.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.130019120458891, |
| "grad_norm": 0.1371786254385925, |
| "learning_rate": 9.922527581598535e-06, |
| "loss": 0.5097, |
| "mean_token_accuracy": 0.8322065472602844, |
| "num_tokens": 84872754.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.1376673040152965, |
| "grad_norm": 0.1280042137264246, |
| "learning_rate": 9.921472727004765e-06, |
| "loss": 0.4447, |
| "mean_token_accuracy": 0.8517681285738945, |
| "num_tokens": 85448754.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.1453154875717018, |
| "grad_norm": 0.14709174504186834, |
| "learning_rate": 9.920410796339806e-06, |
| "loss": 0.4984, |
| "mean_token_accuracy": 0.8356476724147797, |
| "num_tokens": 86024754.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.152963671128107, |
| "grad_norm": 0.13247230064106771, |
| "learning_rate": 9.919341791130496e-06, |
| "loss": 0.4493, |
| "mean_token_accuracy": 0.8508201763033867, |
| "num_tokens": 86600754.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.1606118546845123, |
| "grad_norm": 0.1396533122936014, |
| "learning_rate": 9.91826571291385e-06, |
| "loss": 0.5009, |
| "mean_token_accuracy": 0.8353525176644325, |
| "num_tokens": 87176754.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.1682600382409178, |
| "grad_norm": 0.12824246274478124, |
| "learning_rate": 9.917182563237045e-06, |
| "loss": 0.4778, |
| "mean_token_accuracy": 0.8412972092628479, |
| "num_tokens": 87752754.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.1759082217973231, |
| "grad_norm": 0.13963117320032115, |
| "learning_rate": 9.91609234365743e-06, |
| "loss": 0.494, |
| "mean_token_accuracy": 0.8358664214611053, |
| "num_tokens": 88328754.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.1835564053537284, |
| "grad_norm": 0.12893289909169098, |
| "learning_rate": 9.914995055742515e-06, |
| "loss": 0.4764, |
| "mean_token_accuracy": 0.841441310942173, |
| "num_tokens": 88904754.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.191204588910134, |
| "grad_norm": 0.13091574452160962, |
| "learning_rate": 9.91389070106998e-06, |
| "loss": 0.4487, |
| "mean_token_accuracy": 0.8508792147040367, |
| "num_tokens": 89480754.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.1988527724665392, |
| "grad_norm": 0.13244029890958137, |
| "learning_rate": 9.912779281227656e-06, |
| "loss": 0.473, |
| "mean_token_accuracy": 0.8444153964519501, |
| "num_tokens": 90056754.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.2065009560229445, |
| "grad_norm": 0.13918199257945654, |
| "learning_rate": 9.911660797813542e-06, |
| "loss": 0.5175, |
| "mean_token_accuracy": 0.8303498476743698, |
| "num_tokens": 90607633.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.21414913957935, |
| "grad_norm": 0.13614928489897143, |
| "learning_rate": 9.91053525243579e-06, |
| "loss": 0.4874, |
| "mean_token_accuracy": 0.8389325365424156, |
| "num_tokens": 91183633.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.2217973231357553, |
| "grad_norm": 0.13814896905701568, |
| "learning_rate": 9.909402646712697e-06, |
| "loss": 0.4886, |
| "mean_token_accuracy": 0.8386929333209991, |
| "num_tokens": 91759633.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.2294455066921606, |
| "grad_norm": 0.13390235159092828, |
| "learning_rate": 9.908262982272724e-06, |
| "loss": 0.4981, |
| "mean_token_accuracy": 0.8359844833612442, |
| "num_tokens": 92335633.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.237093690248566, |
| "grad_norm": 0.1424290492085377, |
| "learning_rate": 9.907116260754472e-06, |
| "loss": 0.4998, |
| "mean_token_accuracy": 0.8363195657730103, |
| "num_tokens": 92911633.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.2447418738049714, |
| "grad_norm": 0.13101785380987388, |
| "learning_rate": 9.905962483806696e-06, |
| "loss": 0.4826, |
| "mean_token_accuracy": 0.8393439948558807, |
| "num_tokens": 93487633.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.2523900573613767, |
| "grad_norm": 0.13349754239199546, |
| "learning_rate": 9.904801653088287e-06, |
| "loss": 0.4868, |
| "mean_token_accuracy": 0.8397346436977386, |
| "num_tokens": 94063633.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.260038240917782, |
| "grad_norm": 0.13506310588052065, |
| "learning_rate": 9.903633770268286e-06, |
| "loss": 0.4935, |
| "mean_token_accuracy": 0.8379099145531654, |
| "num_tokens": 94639633.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.2676864244741874, |
| "grad_norm": 0.1460942025739368, |
| "learning_rate": 9.902458837025865e-06, |
| "loss": 0.533, |
| "mean_token_accuracy": 0.825570859014988, |
| "num_tokens": 95215633.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.2753346080305927, |
| "grad_norm": 0.14230237493097705, |
| "learning_rate": 9.901276855050342e-06, |
| "loss": 0.509, |
| "mean_token_accuracy": 0.8315832614898682, |
| "num_tokens": 95791633.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.282982791586998, |
| "grad_norm": 0.13383575612568485, |
| "learning_rate": 9.900087826041163e-06, |
| "loss": 0.4828, |
| "mean_token_accuracy": 0.8402016833424568, |
| "num_tokens": 96367633.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.2906309751434035, |
| "grad_norm": 0.13987895499055866, |
| "learning_rate": 9.89889175170791e-06, |
| "loss": 0.5255, |
| "mean_token_accuracy": 0.8278417810797691, |
| "num_tokens": 96943633.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.2982791586998088, |
| "grad_norm": 0.1313503038847592, |
| "learning_rate": 9.89768863377029e-06, |
| "loss": 0.4956, |
| "mean_token_accuracy": 0.8354028537869453, |
| "num_tokens": 97519633.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.305927342256214, |
| "grad_norm": 0.13457635948414468, |
| "learning_rate": 9.896478473958147e-06, |
| "loss": 0.4697, |
| "mean_token_accuracy": 0.8448807075619698, |
| "num_tokens": 98095633.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.3135755258126194, |
| "grad_norm": 0.13241587496517163, |
| "learning_rate": 9.895261274011436e-06, |
| "loss": 0.4954, |
| "mean_token_accuracy": 0.8361199200153351, |
| "num_tokens": 98671633.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.321223709369025, |
| "grad_norm": 0.133287438364359, |
| "learning_rate": 9.894037035680246e-06, |
| "loss": 0.4827, |
| "mean_token_accuracy": 0.842342384159565, |
| "num_tokens": 99247633.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.3288718929254302, |
| "grad_norm": 0.13647661096592764, |
| "learning_rate": 9.892805760724782e-06, |
| "loss": 0.4902, |
| "mean_token_accuracy": 0.8376477435231209, |
| "num_tokens": 99823633.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.3365200764818357, |
| "grad_norm": 0.13426483562570884, |
| "learning_rate": 9.89156745091536e-06, |
| "loss": 0.4851, |
| "mean_token_accuracy": 0.838831827044487, |
| "num_tokens": 100399633.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.344168260038241, |
| "grad_norm": 0.21654547351312775, |
| "learning_rate": 9.890322108032423e-06, |
| "loss": 0.4898, |
| "mean_token_accuracy": 0.8377328217029572, |
| "num_tokens": 100975633.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.3518164435946463, |
| "grad_norm": 0.14468682987371165, |
| "learning_rate": 9.889069733866515e-06, |
| "loss": 0.4822, |
| "mean_token_accuracy": 0.8400419503450394, |
| "num_tokens": 101551633.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.3594646271510515, |
| "grad_norm": 0.1410032902289748, |
| "learning_rate": 9.887810330218294e-06, |
| "loss": 0.5063, |
| "mean_token_accuracy": 0.8329392224550247, |
| "num_tokens": 102127633.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.367112810707457, |
| "grad_norm": 0.1300256891404132, |
| "learning_rate": 9.886543898898528e-06, |
| "loss": 0.4759, |
| "mean_token_accuracy": 0.8423354402184486, |
| "num_tokens": 102703633.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.3747609942638623, |
| "grad_norm": 0.14166005366415435, |
| "learning_rate": 9.885270441728085e-06, |
| "loss": 0.5011, |
| "mean_token_accuracy": 0.8352118879556656, |
| "num_tokens": 103279633.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.3824091778202676, |
| "grad_norm": 0.1386111808825074, |
| "learning_rate": 9.883989960537934e-06, |
| "loss": 0.5052, |
| "mean_token_accuracy": 0.8329878374934196, |
| "num_tokens": 103855633.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.3900573613766731, |
| "grad_norm": 0.13817887508453544, |
| "learning_rate": 9.882702457169147e-06, |
| "loss": 0.5012, |
| "mean_token_accuracy": 0.8339826613664627, |
| "num_tokens": 104431633.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.3977055449330784, |
| "grad_norm": 0.14012765345513187, |
| "learning_rate": 9.881407933472889e-06, |
| "loss": 0.4994, |
| "mean_token_accuracy": 0.834894172847271, |
| "num_tokens": 105007633.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.4053537284894837, |
| "grad_norm": 0.12930750196514224, |
| "learning_rate": 9.88010639131042e-06, |
| "loss": 0.4611, |
| "mean_token_accuracy": 0.845082089304924, |
| "num_tokens": 105583633.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.413001912045889, |
| "grad_norm": 0.12331109134382856, |
| "learning_rate": 9.878797832553093e-06, |
| "loss": 0.4714, |
| "mean_token_accuracy": 0.8437591195106506, |
| "num_tokens": 106159633.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.4206500956022945, |
| "grad_norm": 0.1293813455298218, |
| "learning_rate": 9.87748225908235e-06, |
| "loss": 0.4691, |
| "mean_token_accuracy": 0.8440056517720222, |
| "num_tokens": 106735633.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.4282982791586998, |
| "grad_norm": 0.14507012516088896, |
| "learning_rate": 9.876159672789711e-06, |
| "loss": 0.5212, |
| "mean_token_accuracy": 0.8286769017577171, |
| "num_tokens": 107311633.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.4359464627151053, |
| "grad_norm": 0.15111476207924585, |
| "learning_rate": 9.874830075576789e-06, |
| "loss": 0.5101, |
| "mean_token_accuracy": 0.8307446986436844, |
| "num_tokens": 107887633.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.4435946462715106, |
| "grad_norm": 0.14709035714161833, |
| "learning_rate": 9.873493469355271e-06, |
| "loss": 0.4971, |
| "mean_token_accuracy": 0.8350886180996895, |
| "num_tokens": 108463633.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.4512428298279159, |
| "grad_norm": 0.1344401231018188, |
| "learning_rate": 9.872149856046922e-06, |
| "loss": 0.4757, |
| "mean_token_accuracy": 0.8424031659960747, |
| "num_tokens": 109039633.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.4588910133843211, |
| "grad_norm": 0.13403966584878735, |
| "learning_rate": 9.870799237583586e-06, |
| "loss": 0.4622, |
| "mean_token_accuracy": 0.8463167697191238, |
| "num_tokens": 109613270.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.4665391969407267, |
| "grad_norm": 0.13413337229810696, |
| "learning_rate": 9.869441615907176e-06, |
| "loss": 0.4905, |
| "mean_token_accuracy": 0.8371755108237267, |
| "num_tokens": 110189270.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.474187380497132, |
| "grad_norm": 0.13213885141206153, |
| "learning_rate": 9.868076992969672e-06, |
| "loss": 0.4629, |
| "mean_token_accuracy": 0.8472193330526352, |
| "num_tokens": 110765270.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.4818355640535372, |
| "grad_norm": 0.14284631334355893, |
| "learning_rate": 9.866705370733126e-06, |
| "loss": 0.4759, |
| "mean_token_accuracy": 0.8413978591561317, |
| "num_tokens": 111330965.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.4894837476099427, |
| "grad_norm": 0.14434856966382234, |
| "learning_rate": 9.865326751169648e-06, |
| "loss": 0.4682, |
| "mean_token_accuracy": 0.8441445603966713, |
| "num_tokens": 111906965.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.497131931166348, |
| "grad_norm": 0.13331488674498373, |
| "learning_rate": 9.863941136261409e-06, |
| "loss": 0.4743, |
| "mean_token_accuracy": 0.842158354818821, |
| "num_tokens": 112482965.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.5047801147227533, |
| "grad_norm": 0.1333685996060613, |
| "learning_rate": 9.862548528000644e-06, |
| "loss": 0.4833, |
| "mean_token_accuracy": 0.8400801345705986, |
| "num_tokens": 113058965.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.5124282982791586, |
| "grad_norm": 0.13278247248038624, |
| "learning_rate": 9.861148928389634e-06, |
| "loss": 0.4886, |
| "mean_token_accuracy": 0.8382901325821877, |
| "num_tokens": 113634965.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.520076481835564, |
| "grad_norm": 0.13693399891139466, |
| "learning_rate": 9.859742339440723e-06, |
| "loss": 0.4822, |
| "mean_token_accuracy": 0.8395071998238564, |
| "num_tokens": 114210965.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.5277246653919694, |
| "grad_norm": 0.14014126230020535, |
| "learning_rate": 9.858328763176294e-06, |
| "loss": 0.509, |
| "mean_token_accuracy": 0.8321307003498077, |
| "num_tokens": 114785906.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.535372848948375, |
| "grad_norm": 0.13812738362519864, |
| "learning_rate": 9.85690820162878e-06, |
| "loss": 0.495, |
| "mean_token_accuracy": 0.8362865820527077, |
| "num_tokens": 115361906.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.5430210325047802, |
| "grad_norm": 0.12908599718928895, |
| "learning_rate": 9.855480656840662e-06, |
| "loss": 0.471, |
| "mean_token_accuracy": 0.843786895275116, |
| "num_tokens": 115937906.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.5506692160611855, |
| "grad_norm": 0.13168108610671495, |
| "learning_rate": 9.854046130864454e-06, |
| "loss": 0.4765, |
| "mean_token_accuracy": 0.8418128415942192, |
| "num_tokens": 116513906.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.5583173996175907, |
| "grad_norm": 0.1374975523241788, |
| "learning_rate": 9.852604625762712e-06, |
| "loss": 0.4502, |
| "mean_token_accuracy": 0.8503288328647614, |
| "num_tokens": 117089906.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.565965583173996, |
| "grad_norm": 0.13716664205655626, |
| "learning_rate": 9.851156143608025e-06, |
| "loss": 0.4848, |
| "mean_token_accuracy": 0.8381772711873055, |
| "num_tokens": 117665906.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.5736137667304015, |
| "grad_norm": 0.19260127106490352, |
| "learning_rate": 9.849700686483016e-06, |
| "loss": 0.4606, |
| "mean_token_accuracy": 0.8471116870641708, |
| "num_tokens": 118241906.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.581261950286807, |
| "grad_norm": 0.1285761532416256, |
| "learning_rate": 9.848238256480329e-06, |
| "loss": 0.4689, |
| "mean_token_accuracy": 0.8435264676809311, |
| "num_tokens": 118817906.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.5889101338432123, |
| "grad_norm": 0.13245065174094972, |
| "learning_rate": 9.846768855702646e-06, |
| "loss": 0.4931, |
| "mean_token_accuracy": 0.8375678807497025, |
| "num_tokens": 119393906.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.5965583173996176, |
| "grad_norm": 0.13171290320659565, |
| "learning_rate": 9.845292486262664e-06, |
| "loss": 0.4801, |
| "mean_token_accuracy": 0.8411461487412453, |
| "num_tokens": 119969906.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.604206500956023, |
| "grad_norm": 0.1382945206321001, |
| "learning_rate": 9.843809150283096e-06, |
| "loss": 0.5201, |
| "mean_token_accuracy": 0.8290223926305771, |
| "num_tokens": 120545906.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.6118546845124282, |
| "grad_norm": 0.1308311967900458, |
| "learning_rate": 9.842318849896679e-06, |
| "loss": 0.4811, |
| "mean_token_accuracy": 0.8402815386652946, |
| "num_tokens": 121121906.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.6195028680688337, |
| "grad_norm": 0.13179762296615236, |
| "learning_rate": 9.840821587246158e-06, |
| "loss": 0.4841, |
| "mean_token_accuracy": 0.8397172838449478, |
| "num_tokens": 121697906.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.627151051625239, |
| "grad_norm": 0.13092322919750846, |
| "learning_rate": 9.839317364484295e-06, |
| "loss": 0.4857, |
| "mean_token_accuracy": 0.8396096378564835, |
| "num_tokens": 122273906.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.6347992351816445, |
| "grad_norm": 0.1406807481437863, |
| "learning_rate": 9.837806183773851e-06, |
| "loss": 0.5009, |
| "mean_token_accuracy": 0.8343264311552048, |
| "num_tokens": 122849906.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.6424474187380498, |
| "grad_norm": 0.15127230928084542, |
| "learning_rate": 9.836288047287593e-06, |
| "loss": 0.5012, |
| "mean_token_accuracy": 0.8336441144347191, |
| "num_tokens": 123425906.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.650095602294455, |
| "grad_norm": 0.13764770828391903, |
| "learning_rate": 9.834762957208293e-06, |
| "loss": 0.4933, |
| "mean_token_accuracy": 0.8366129845380783, |
| "num_tokens": 124001906.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.6577437858508604, |
| "grad_norm": 0.1309674988371225, |
| "learning_rate": 9.83323091572872e-06, |
| "loss": 0.5042, |
| "mean_token_accuracy": 0.8334027752280235, |
| "num_tokens": 124577906.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.6653919694072656, |
| "grad_norm": 0.1416672912642957, |
| "learning_rate": 9.831691925051634e-06, |
| "loss": 0.469, |
| "mean_token_accuracy": 0.8436514809727669, |
| "num_tokens": 125153906.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.6730401529636711, |
| "grad_norm": 0.13213277792263045, |
| "learning_rate": 9.83014598738979e-06, |
| "loss": 0.4744, |
| "mean_token_accuracy": 0.8429621979594231, |
| "num_tokens": 125729906.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.6806883365200764, |
| "grad_norm": 0.14828248446823036, |
| "learning_rate": 9.82859310496593e-06, |
| "loss": 0.476, |
| "mean_token_accuracy": 0.841729499399662, |
| "num_tokens": 126305906.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.688336520076482, |
| "grad_norm": 0.13273638622183012, |
| "learning_rate": 9.827033280012783e-06, |
| "loss": 0.4601, |
| "mean_token_accuracy": 0.8464536741375923, |
| "num_tokens": 126881906.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.6959847036328872, |
| "grad_norm": 0.1371281964569988, |
| "learning_rate": 9.825466514773054e-06, |
| "loss": 0.5065, |
| "mean_token_accuracy": 0.8331058844923973, |
| "num_tokens": 127457906.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.7036328871892925, |
| "grad_norm": 0.13113344781507827, |
| "learning_rate": 9.823892811499435e-06, |
| "loss": 0.4751, |
| "mean_token_accuracy": 0.8415020853281021, |
| "num_tokens": 128033906.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.7112810707456978, |
| "grad_norm": 0.1377436100503237, |
| "learning_rate": 9.822312172454587e-06, |
| "loss": 0.52, |
| "mean_token_accuracy": 0.8276021778583527, |
| "num_tokens": 128609906.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.7189292543021033, |
| "grad_norm": 0.12699707417451003, |
| "learning_rate": 9.820724599911147e-06, |
| "loss": 0.446, |
| "mean_token_accuracy": 0.8517074510455132, |
| "num_tokens": 129169503.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.7265774378585086, |
| "grad_norm": 0.12634559250621336, |
| "learning_rate": 9.819130096151718e-06, |
| "loss": 0.4544, |
| "mean_token_accuracy": 0.8480943590402603, |
| "num_tokens": 129745503.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.734225621414914, |
| "grad_norm": 0.12958989836341028, |
| "learning_rate": 9.817528663468873e-06, |
| "loss": 0.4783, |
| "mean_token_accuracy": 0.8412247076630592, |
| "num_tokens": 130319403.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.7418738049713194, |
| "grad_norm": 0.1372257707417811, |
| "learning_rate": 9.815920304165144e-06, |
| "loss": 0.4881, |
| "mean_token_accuracy": 0.8391176462173462, |
| "num_tokens": 130881510.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.7495219885277247, |
| "grad_norm": 0.13053898036659867, |
| "learning_rate": 9.81430502055302e-06, |
| "loss": 0.4723, |
| "mean_token_accuracy": 0.8423580154776573, |
| "num_tokens": 131457510.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.75717017208413, |
| "grad_norm": 0.13772367894783863, |
| "learning_rate": 9.81268281495495e-06, |
| "loss": 0.4873, |
| "mean_token_accuracy": 0.8386755734682083, |
| "num_tokens": 132033510.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.7648183556405352, |
| "grad_norm": 0.1357109696346542, |
| "learning_rate": 9.811053689703333e-06, |
| "loss": 0.5173, |
| "mean_token_accuracy": 0.8325728848576546, |
| "num_tokens": 132609510.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.7724665391969407, |
| "grad_norm": 0.1279722497480012, |
| "learning_rate": 9.809417647140522e-06, |
| "loss": 0.4733, |
| "mean_token_accuracy": 0.8432070016860962, |
| "num_tokens": 133185510.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.780114722753346, |
| "grad_norm": 0.14056689318218055, |
| "learning_rate": 9.807774689618806e-06, |
| "loss": 0.5136, |
| "mean_token_accuracy": 0.8318975046277046, |
| "num_tokens": 133761510.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.7877629063097515, |
| "grad_norm": 0.13180288049266284, |
| "learning_rate": 9.806124819500427e-06, |
| "loss": 0.4863, |
| "mean_token_accuracy": 0.8381616622209549, |
| "num_tokens": 134337510.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.7954110898661568, |
| "grad_norm": 0.14261327766170212, |
| "learning_rate": 9.804468039157557e-06, |
| "loss": 0.4865, |
| "mean_token_accuracy": 0.8391356617212296, |
| "num_tokens": 134913510.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.8030592734225621, |
| "grad_norm": 0.1415890220842668, |
| "learning_rate": 9.802804350972308e-06, |
| "loss": 0.5261, |
| "mean_token_accuracy": 0.8269285559654236, |
| "num_tokens": 135489510.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.8107074569789674, |
| "grad_norm": 0.13577919369547242, |
| "learning_rate": 9.801133757336726e-06, |
| "loss": 0.475, |
| "mean_token_accuracy": 0.8420576602220535, |
| "num_tokens": 136065510.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.8183556405353727, |
| "grad_norm": 0.13887863894469682, |
| "learning_rate": 9.799456260652778e-06, |
| "loss": 0.4676, |
| "mean_token_accuracy": 0.8456170037388802, |
| "num_tokens": 136625705.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.8260038240917782, |
| "grad_norm": 0.1329581032736448, |
| "learning_rate": 9.797771863332365e-06, |
| "loss": 0.468, |
| "mean_token_accuracy": 0.8451133370399475, |
| "num_tokens": 137201705.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.8336520076481837, |
| "grad_norm": 0.14266764923168374, |
| "learning_rate": 9.796080567797304e-06, |
| "loss": 0.4875, |
| "mean_token_accuracy": 0.8391078859567642, |
| "num_tokens": 137777705.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.841300191204589, |
| "grad_norm": 0.13123578796297256, |
| "learning_rate": 9.794382376479334e-06, |
| "loss": 0.4801, |
| "mean_token_accuracy": 0.8414048627018929, |
| "num_tokens": 138353705.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.8489483747609943, |
| "grad_norm": 0.13763073917570376, |
| "learning_rate": 9.792677291820107e-06, |
| "loss": 0.4785, |
| "mean_token_accuracy": 0.8403075709939003, |
| "num_tokens": 138929705.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.8565965583173996, |
| "grad_norm": 0.1339684449373738, |
| "learning_rate": 9.790965316271183e-06, |
| "loss": 0.4728, |
| "mean_token_accuracy": 0.8434257805347443, |
| "num_tokens": 139505705.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.8642447418738048, |
| "grad_norm": 0.13563074491467572, |
| "learning_rate": 9.789246452294034e-06, |
| "loss": 0.4885, |
| "mean_token_accuracy": 0.8397867307066917, |
| "num_tokens": 140081705.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.8718929254302104, |
| "grad_norm": 0.14368095826727606, |
| "learning_rate": 9.787520702360035e-06, |
| "loss": 0.4808, |
| "mean_token_accuracy": 0.840397872030735, |
| "num_tokens": 140657705.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.8795411089866156, |
| "grad_norm": 0.14397182347295984, |
| "learning_rate": 9.785788068950463e-06, |
| "loss": 0.5054, |
| "mean_token_accuracy": 0.8325815722346306, |
| "num_tokens": 141233705.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.8871892925430211, |
| "grad_norm": 0.1406587307230205, |
| "learning_rate": 9.784048554556488e-06, |
| "loss": 0.5024, |
| "mean_token_accuracy": 0.8344358131289482, |
| "num_tokens": 141809705.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.8948374760994264, |
| "grad_norm": 0.1346925448003243, |
| "learning_rate": 9.782302161679177e-06, |
| "loss": 0.4846, |
| "mean_token_accuracy": 0.8397086039185524, |
| "num_tokens": 142385705.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.9024856596558317, |
| "grad_norm": 0.13981403945386878, |
| "learning_rate": 9.780548892829486e-06, |
| "loss": 0.4886, |
| "mean_token_accuracy": 0.8380331769585609, |
| "num_tokens": 142961705.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.910133843212237, |
| "grad_norm": 0.13251833042206107, |
| "learning_rate": 9.778788750528257e-06, |
| "loss": 0.4667, |
| "mean_token_accuracy": 0.8444865867495537, |
| "num_tokens": 143537705.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.9177820267686423, |
| "grad_norm": 0.14376049074556502, |
| "learning_rate": 9.777021737306214e-06, |
| "loss": 0.4857, |
| "mean_token_accuracy": 0.8387880772352219, |
| "num_tokens": 144099211.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.9254302103250478, |
| "grad_norm": 0.14486384551210302, |
| "learning_rate": 9.775247855703962e-06, |
| "loss": 0.4826, |
| "mean_token_accuracy": 0.8404586315155029, |
| "num_tokens": 144675211.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.9330783938814533, |
| "grad_norm": 0.1388194492997388, |
| "learning_rate": 9.773467108271978e-06, |
| "loss": 0.51, |
| "mean_token_accuracy": 0.8311353400349617, |
| "num_tokens": 145251211.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.9407265774378586, |
| "grad_norm": 0.13026081138638712, |
| "learning_rate": 9.771679497570614e-06, |
| "loss": 0.4799, |
| "mean_token_accuracy": 0.8409708067774773, |
| "num_tokens": 145827211.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.9483747609942639, |
| "grad_norm": 0.1430745806772645, |
| "learning_rate": 9.769885026170088e-06, |
| "loss": 0.4629, |
| "mean_token_accuracy": 0.8461342081427574, |
| "num_tokens": 146403211.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.9560229445506692, |
| "grad_norm": 0.1423170947817084, |
| "learning_rate": 9.768083696650481e-06, |
| "loss": 0.4964, |
| "mean_token_accuracy": 0.8357796147465706, |
| "num_tokens": 146979211.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.9636711281070744, |
| "grad_norm": 0.1412781732428823, |
| "learning_rate": 9.766275511601742e-06, |
| "loss": 0.4685, |
| "mean_token_accuracy": 0.84524355083704, |
| "num_tokens": 147555211.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.97131931166348, |
| "grad_norm": 0.1347390798520367, |
| "learning_rate": 9.764460473623665e-06, |
| "loss": 0.4964, |
| "mean_token_accuracy": 0.8352379277348518, |
| "num_tokens": 148131211.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.9789674952198852, |
| "grad_norm": 0.140319211357454, |
| "learning_rate": 9.762638585325907e-06, |
| "loss": 0.4651, |
| "mean_token_accuracy": 0.8467453494668007, |
| "num_tokens": 148707211.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.9866156787762907, |
| "grad_norm": 0.14587479148422647, |
| "learning_rate": 9.760809849327967e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.8388071581721306, |
| "num_tokens": 149261156.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.994263862332696, |
| "grad_norm": 0.15571274133757082, |
| "learning_rate": 9.7589742682592e-06, |
| "loss": 0.4865, |
| "mean_token_accuracy": 0.8390511944890022, |
| "num_tokens": 149814426.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.1656455865676849, |
| "learning_rate": 9.75713184475879e-06, |
| "loss": 0.5189, |
| "mean_token_accuracy": 0.8292289972305298, |
| "num_tokens": 150246426.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.0076481835564053, |
| "grad_norm": 0.13050271447099013, |
| "learning_rate": 9.755282581475769e-06, |
| "loss": 0.4564, |
| "mean_token_accuracy": 0.8467505499720573, |
| "num_tokens": 150822426.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 2.0152963671128106, |
| "grad_norm": 0.1336370185440778, |
| "learning_rate": 9.753426481068998e-06, |
| "loss": 0.4913, |
| "mean_token_accuracy": 0.8365452736616135, |
| "num_tokens": 151398426.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.022944550669216, |
| "grad_norm": 0.1382072756245929, |
| "learning_rate": 9.751563546207167e-06, |
| "loss": 0.4878, |
| "mean_token_accuracy": 0.8377542123198509, |
| "num_tokens": 151952371.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.0305927342256216, |
| "grad_norm": 0.14387760662210322, |
| "learning_rate": 9.749693779568799e-06, |
| "loss": 0.5203, |
| "mean_token_accuracy": 0.827904649078846, |
| "num_tokens": 152520866.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.038240917782027, |
| "grad_norm": 0.14163598725064547, |
| "learning_rate": 9.747817183842235e-06, |
| "loss": 0.486, |
| "mean_token_accuracy": 0.8394655287265778, |
| "num_tokens": 153096866.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.045889101338432, |
| "grad_norm": 0.1357429538351474, |
| "learning_rate": 9.745933761725635e-06, |
| "loss": 0.4392, |
| "mean_token_accuracy": 0.8543984368443489, |
| "num_tokens": 153672866.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.0535372848948374, |
| "grad_norm": 0.14159049917519523, |
| "learning_rate": 9.744043515926975e-06, |
| "loss": 0.4704, |
| "mean_token_accuracy": 0.8423840552568436, |
| "num_tokens": 154248866.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.0611854684512427, |
| "grad_norm": 0.140767759584201, |
| "learning_rate": 9.742146449164045e-06, |
| "loss": 0.4657, |
| "mean_token_accuracy": 0.8441150411963463, |
| "num_tokens": 154824866.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.068833652007648, |
| "grad_norm": 0.1376003238674713, |
| "learning_rate": 9.740242564164433e-06, |
| "loss": 0.4829, |
| "mean_token_accuracy": 0.8394047617912292, |
| "num_tokens": 155400866.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.0764818355640537, |
| "grad_norm": 0.14234028861993708, |
| "learning_rate": 9.738331863665541e-06, |
| "loss": 0.4541, |
| "mean_token_accuracy": 0.8469480872154236, |
| "num_tokens": 155954136.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.084130019120459, |
| "grad_norm": 0.12785940871582666, |
| "learning_rate": 9.736414350414564e-06, |
| "loss": 0.4328, |
| "mean_token_accuracy": 0.8540546894073486, |
| "num_tokens": 156530136.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.0917782026768643, |
| "grad_norm": 0.13077439345298142, |
| "learning_rate": 9.734490027168494e-06, |
| "loss": 0.4536, |
| "mean_token_accuracy": 0.8471047431230545, |
| "num_tokens": 157106136.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.0994263862332696, |
| "grad_norm": 0.13230573836928355, |
| "learning_rate": 9.732558896694114e-06, |
| "loss": 0.4722, |
| "mean_token_accuracy": 0.842342384159565, |
| "num_tokens": 157682136.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.107074569789675, |
| "grad_norm": 0.13384258031186463, |
| "learning_rate": 9.730620961767996e-06, |
| "loss": 0.4693, |
| "mean_token_accuracy": 0.8435490280389786, |
| "num_tokens": 158258136.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.11472275334608, |
| "grad_norm": 0.13295124171892045, |
| "learning_rate": 9.72867622517649e-06, |
| "loss": 0.4848, |
| "mean_token_accuracy": 0.8387293890118599, |
| "num_tokens": 158834136.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 2.1223709369024855, |
| "grad_norm": 0.13698639518555758, |
| "learning_rate": 9.726724689715734e-06, |
| "loss": 0.4709, |
| "mean_token_accuracy": 0.8430733233690262, |
| "num_tokens": 159410136.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.130019120458891, |
| "grad_norm": 0.131887831277675, |
| "learning_rate": 9.724766358191635e-06, |
| "loss": 0.482, |
| "mean_token_accuracy": 0.8394516482949257, |
| "num_tokens": 159986136.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 2.1376673040152965, |
| "grad_norm": 0.14891285174122362, |
| "learning_rate": 9.722801233419873e-06, |
| "loss": 0.4966, |
| "mean_token_accuracy": 0.8344601169228554, |
| "num_tokens": 160562136.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.1453154875717018, |
| "grad_norm": 0.13274530265535694, |
| "learning_rate": 9.720829318225897e-06, |
| "loss": 0.4412, |
| "mean_token_accuracy": 0.8516188189387321, |
| "num_tokens": 161138136.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 2.152963671128107, |
| "grad_norm": 0.12558989675749826, |
| "learning_rate": 9.718850615444915e-06, |
| "loss": 0.4617, |
| "mean_token_accuracy": 0.8458894118666649, |
| "num_tokens": 161714136.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.1606118546845123, |
| "grad_norm": 0.13805526111149538, |
| "learning_rate": 9.7168651279219e-06, |
| "loss": 0.476, |
| "mean_token_accuracy": 0.84109927713871, |
| "num_tokens": 162290136.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 2.1682600382409176, |
| "grad_norm": 0.1365628904112165, |
| "learning_rate": 9.714872858511574e-06, |
| "loss": 0.4813, |
| "mean_token_accuracy": 0.8389794006943703, |
| "num_tokens": 162866136.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.1759082217973233, |
| "grad_norm": 0.14202588056377932, |
| "learning_rate": 9.712873810078415e-06, |
| "loss": 0.4731, |
| "mean_token_accuracy": 0.8429031893610954, |
| "num_tokens": 163442136.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.1835564053537286, |
| "grad_norm": 0.14026758376596035, |
| "learning_rate": 9.710867985496644e-06, |
| "loss": 0.4828, |
| "mean_token_accuracy": 0.8403249308466911, |
| "num_tokens": 164018136.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.191204588910134, |
| "grad_norm": 0.1268634171589506, |
| "learning_rate": 9.708855387650229e-06, |
| "loss": 0.4566, |
| "mean_token_accuracy": 0.8471759259700775, |
| "num_tokens": 164594136.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 2.198852772466539, |
| "grad_norm": 0.1361940615902871, |
| "learning_rate": 9.706836019432872e-06, |
| "loss": 0.4843, |
| "mean_token_accuracy": 0.8384498655796051, |
| "num_tokens": 165170136.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 2.2065009560229445, |
| "grad_norm": 0.13071252562608368, |
| "learning_rate": 9.704809883748012e-06, |
| "loss": 0.4619, |
| "mean_token_accuracy": 0.8458164930343628, |
| "num_tokens": 165746136.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 2.21414913957935, |
| "grad_norm": 0.13749244273488634, |
| "learning_rate": 9.70277698350882e-06, |
| "loss": 0.4763, |
| "mean_token_accuracy": 0.8413892164826393, |
| "num_tokens": 166322136.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.221797323135755, |
| "grad_norm": 0.13653291312402388, |
| "learning_rate": 9.700737321638185e-06, |
| "loss": 0.494, |
| "mean_token_accuracy": 0.8379845768213272, |
| "num_tokens": 166898136.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 2.229445506692161, |
| "grad_norm": 0.1314091235725935, |
| "learning_rate": 9.69869090106873e-06, |
| "loss": 0.4553, |
| "mean_token_accuracy": 0.8476238548755646, |
| "num_tokens": 167474136.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.237093690248566, |
| "grad_norm": 0.1306318774358581, |
| "learning_rate": 9.696637724742785e-06, |
| "loss": 0.4588, |
| "mean_token_accuracy": 0.8466134071350098, |
| "num_tokens": 168050136.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 2.2447418738049714, |
| "grad_norm": 0.1320429519519253, |
| "learning_rate": 9.6945777956124e-06, |
| "loss": 0.4893, |
| "mean_token_accuracy": 0.8379810974001884, |
| "num_tokens": 168626136.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.2523900573613767, |
| "grad_norm": 0.12581567893697732, |
| "learning_rate": 9.69251111663933e-06, |
| "loss": 0.4421, |
| "mean_token_accuracy": 0.8516535460948944, |
| "num_tokens": 169202136.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.260038240917782, |
| "grad_norm": 0.15120633186562035, |
| "learning_rate": 9.690437690795038e-06, |
| "loss": 0.4828, |
| "mean_token_accuracy": 0.8403878286480904, |
| "num_tokens": 169764783.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.2676864244741872, |
| "grad_norm": 0.14083593416082985, |
| "learning_rate": 9.688357521060685e-06, |
| "loss": 0.4997, |
| "mean_token_accuracy": 0.8348125591874123, |
| "num_tokens": 170340783.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 2.275334608030593, |
| "grad_norm": 0.13552190258745817, |
| "learning_rate": 9.686270610427131e-06, |
| "loss": 0.4831, |
| "mean_token_accuracy": 0.8392866998910904, |
| "num_tokens": 170916783.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.2829827915869982, |
| "grad_norm": 0.1301480903850601, |
| "learning_rate": 9.684176961894927e-06, |
| "loss": 0.4668, |
| "mean_token_accuracy": 0.8437822833657265, |
| "num_tokens": 171490115.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 2.2906309751434035, |
| "grad_norm": 0.14821266280449158, |
| "learning_rate": 9.682076578474308e-06, |
| "loss": 0.4986, |
| "mean_token_accuracy": 0.8342083841562271, |
| "num_tokens": 172066115.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.298279158699809, |
| "grad_norm": 0.13381251110623754, |
| "learning_rate": 9.6799694631852e-06, |
| "loss": 0.4603, |
| "mean_token_accuracy": 0.8468859866261482, |
| "num_tokens": 172642115.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 2.305927342256214, |
| "grad_norm": 0.13169362581883381, |
| "learning_rate": 9.677855619057202e-06, |
| "loss": 0.4604, |
| "mean_token_accuracy": 0.8466967418789864, |
| "num_tokens": 173218115.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.3135755258126194, |
| "grad_norm": 0.13195788424828656, |
| "learning_rate": 9.675735049129588e-06, |
| "loss": 0.4621, |
| "mean_token_accuracy": 0.8446150496602058, |
| "num_tokens": 173794115.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 2.3212237093690247, |
| "grad_norm": 0.14122025118479573, |
| "learning_rate": 9.673607756451306e-06, |
| "loss": 0.4349, |
| "mean_token_accuracy": 0.853457435965538, |
| "num_tokens": 174370115.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.3288718929254304, |
| "grad_norm": 0.12878410078634828, |
| "learning_rate": 9.67147374408097e-06, |
| "loss": 0.4447, |
| "mean_token_accuracy": 0.8515077084302902, |
| "num_tokens": 174946115.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.3365200764818357, |
| "grad_norm": 0.125714053577621, |
| "learning_rate": 9.669333015086847e-06, |
| "loss": 0.4384, |
| "mean_token_accuracy": 0.8537577912211418, |
| "num_tokens": 175522115.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.344168260038241, |
| "grad_norm": 0.1364732476611976, |
| "learning_rate": 9.667185572546871e-06, |
| "loss": 0.4656, |
| "mean_token_accuracy": 0.8443147018551826, |
| "num_tokens": 176098115.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 2.3518164435946463, |
| "grad_norm": 0.13118221069966887, |
| "learning_rate": 9.665031419548625e-06, |
| "loss": 0.4533, |
| "mean_token_accuracy": 0.8475735187530518, |
| "num_tokens": 176674115.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.3594646271510515, |
| "grad_norm": 0.13021585818887338, |
| "learning_rate": 9.662870559189344e-06, |
| "loss": 0.4772, |
| "mean_token_accuracy": 0.8415628522634506, |
| "num_tokens": 177250115.0, |
| "step": 309 |
| }, |
| { |
| "epoch": 2.367112810707457, |
| "grad_norm": 0.1296491093025139, |
| "learning_rate": 9.660702994575896e-06, |
| "loss": 0.4504, |
| "mean_token_accuracy": 0.8494465947151184, |
| "num_tokens": 177815810.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.3747609942638626, |
| "grad_norm": 0.1269070770369225, |
| "learning_rate": 9.658528728824799e-06, |
| "loss": 0.4454, |
| "mean_token_accuracy": 0.851382702589035, |
| "num_tokens": 178391810.0, |
| "step": 311 |
| }, |
| { |
| "epoch": 2.382409177820268, |
| "grad_norm": 0.13236942393674378, |
| "learning_rate": 9.656347765062206e-06, |
| "loss": 0.4959, |
| "mean_token_accuracy": 0.8356025218963623, |
| "num_tokens": 178967810.0, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.390057361376673, |
| "grad_norm": 0.13431040882625667, |
| "learning_rate": 9.654160106423891e-06, |
| "loss": 0.4647, |
| "mean_token_accuracy": 0.8447903171181679, |
| "num_tokens": 179541447.0, |
| "step": 313 |
| }, |
| { |
| "epoch": 2.3977055449330784, |
| "grad_norm": 0.1304990964909841, |
| "learning_rate": 9.651965756055262e-06, |
| "loss": 0.4733, |
| "mean_token_accuracy": 0.8432052731513977, |
| "num_tokens": 180117447.0, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.4053537284894837, |
| "grad_norm": 0.13562049534152407, |
| "learning_rate": 9.649764717111348e-06, |
| "loss": 0.4827, |
| "mean_token_accuracy": 0.8400332629680634, |
| "num_tokens": 180693447.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.413001912045889, |
| "grad_norm": 0.1322708589540597, |
| "learning_rate": 9.647556992756789e-06, |
| "loss": 0.4685, |
| "mean_token_accuracy": 0.8437608480453491, |
| "num_tokens": 181269447.0, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.4206500956022943, |
| "grad_norm": 0.13106792460307698, |
| "learning_rate": 9.645342586165845e-06, |
| "loss": 0.4676, |
| "mean_token_accuracy": 0.8447365760803223, |
| "num_tokens": 181845447.0, |
| "step": 317 |
| }, |
| { |
| "epoch": 2.4282982791587, |
| "grad_norm": 0.12774223293558123, |
| "learning_rate": 9.643121500522377e-06, |
| "loss": 0.4783, |
| "mean_token_accuracy": 0.841241642832756, |
| "num_tokens": 182421447.0, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.4359464627151053, |
| "grad_norm": 0.13329940881644012, |
| "learning_rate": 9.640893739019852e-06, |
| "loss": 0.4558, |
| "mean_token_accuracy": 0.8470873907208443, |
| "num_tokens": 182997447.0, |
| "step": 319 |
| }, |
| { |
| "epoch": 2.4435946462715106, |
| "grad_norm": 0.1389284366392599, |
| "learning_rate": 9.638659304861336e-06, |
| "loss": 0.4873, |
| "mean_token_accuracy": 0.8378456756472588, |
| "num_tokens": 183573447.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.451242829827916, |
| "grad_norm": 0.13022523574202438, |
| "learning_rate": 9.63641820125949e-06, |
| "loss": 0.4532, |
| "mean_token_accuracy": 0.8482332676649094, |
| "num_tokens": 184149447.0, |
| "step": 321 |
| }, |
| { |
| "epoch": 2.458891013384321, |
| "grad_norm": 0.12869518334269098, |
| "learning_rate": 9.63417043143656e-06, |
| "loss": 0.4364, |
| "mean_token_accuracy": 0.8536501526832581, |
| "num_tokens": 184725447.0, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.4665391969407264, |
| "grad_norm": 0.1685448514608509, |
| "learning_rate": 9.631915998624382e-06, |
| "loss": 0.4801, |
| "mean_token_accuracy": 0.8401600047945976, |
| "num_tokens": 185301447.0, |
| "step": 323 |
| }, |
| { |
| "epoch": 2.474187380497132, |
| "grad_norm": 0.13160582073089025, |
| "learning_rate": 9.629654906064365e-06, |
| "loss": 0.4649, |
| "mean_token_accuracy": 0.8448563814163208, |
| "num_tokens": 185877447.0, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.4818355640535374, |
| "grad_norm": 0.1367794173463771, |
| "learning_rate": 9.627387157007502e-06, |
| "loss": 0.4883, |
| "mean_token_accuracy": 0.8361355364322662, |
| "num_tokens": 186453447.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.4894837476099427, |
| "grad_norm": 0.13222850798423158, |
| "learning_rate": 9.62511275471435e-06, |
| "loss": 0.4677, |
| "mean_token_accuracy": 0.8434003219008446, |
| "num_tokens": 187010772.0, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.497131931166348, |
| "grad_norm": 0.15953227190496977, |
| "learning_rate": 9.622831702455035e-06, |
| "loss": 0.4831, |
| "mean_token_accuracy": 0.8401443809270859, |
| "num_tokens": 187586772.0, |
| "step": 327 |
| }, |
| { |
| "epoch": 2.5047801147227533, |
| "grad_norm": 0.1298624067386934, |
| "learning_rate": 9.620544003509243e-06, |
| "loss": 0.4741, |
| "mean_token_accuracy": 0.8434900045394897, |
| "num_tokens": 188162772.0, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.5124282982791586, |
| "grad_norm": 0.13840666580879918, |
| "learning_rate": 9.618249661166218e-06, |
| "loss": 0.4923, |
| "mean_token_accuracy": 0.8363108858466148, |
| "num_tokens": 188738772.0, |
| "step": 329 |
| }, |
| { |
| "epoch": 2.520076481835564, |
| "grad_norm": 0.12560912611469238, |
| "learning_rate": 9.615948678724756e-06, |
| "loss": 0.4396, |
| "mean_token_accuracy": 0.8522473201155663, |
| "num_tokens": 189314772.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.527724665391969, |
| "grad_norm": 0.1296554841366996, |
| "learning_rate": 9.613641059493197e-06, |
| "loss": 0.4615, |
| "mean_token_accuracy": 0.8463234454393387, |
| "num_tokens": 189890772.0, |
| "step": 331 |
| }, |
| { |
| "epoch": 2.535372848948375, |
| "grad_norm": 0.14613127575466625, |
| "learning_rate": 9.611326806789424e-06, |
| "loss": 0.4799, |
| "mean_token_accuracy": 0.8401964753866196, |
| "num_tokens": 190466772.0, |
| "step": 332 |
| }, |
| { |
| "epoch": 2.54302103250478, |
| "grad_norm": 0.13394861492299956, |
| "learning_rate": 9.609005923940865e-06, |
| "loss": 0.455, |
| "mean_token_accuracy": 0.8472384214401245, |
| "num_tokens": 191042772.0, |
| "step": 333 |
| }, |
| { |
| "epoch": 2.5506692160611855, |
| "grad_norm": 0.12785020969707311, |
| "learning_rate": 9.606678414284469e-06, |
| "loss": 0.45, |
| "mean_token_accuracy": 0.8496569246053696, |
| "num_tokens": 191618772.0, |
| "step": 334 |
| }, |
| { |
| "epoch": 2.5583173996175907, |
| "grad_norm": 0.13667091866209735, |
| "learning_rate": 9.604344281166721e-06, |
| "loss": 0.4569, |
| "mean_token_accuracy": 0.8474294021725655, |
| "num_tokens": 192194772.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.565965583173996, |
| "grad_norm": 0.14170531764608627, |
| "learning_rate": 9.602003527943629e-06, |
| "loss": 0.4841, |
| "mean_token_accuracy": 0.838857851922512, |
| "num_tokens": 192770772.0, |
| "step": 336 |
| }, |
| { |
| "epoch": 2.5736137667304018, |
| "grad_norm": 0.14203506631993854, |
| "learning_rate": 9.599656157980715e-06, |
| "loss": 0.4711, |
| "mean_token_accuracy": 0.8428667038679123, |
| "num_tokens": 193346772.0, |
| "step": 337 |
| }, |
| { |
| "epoch": 2.581261950286807, |
| "grad_norm": 0.13109007712840628, |
| "learning_rate": 9.597302174653016e-06, |
| "loss": 0.4452, |
| "mean_token_accuracy": 0.8508444800972939, |
| "num_tokens": 193922772.0, |
| "step": 338 |
| }, |
| { |
| "epoch": 2.5889101338432123, |
| "grad_norm": 0.1376764806733569, |
| "learning_rate": 9.594941581345079e-06, |
| "loss": 0.4743, |
| "mean_token_accuracy": 0.8415871560573578, |
| "num_tokens": 194498772.0, |
| "step": 339 |
| }, |
| { |
| "epoch": 2.5965583173996176, |
| "grad_norm": 0.13134460713857604, |
| "learning_rate": 9.592574381450957e-06, |
| "loss": 0.4536, |
| "mean_token_accuracy": 0.8493340015411377, |
| "num_tokens": 195074772.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.604206500956023, |
| "grad_norm": 0.12998973657236298, |
| "learning_rate": 9.590200578374198e-06, |
| "loss": 0.4735, |
| "mean_token_accuracy": 0.8423146158456802, |
| "num_tokens": 195650772.0, |
| "step": 341 |
| }, |
| { |
| "epoch": 2.611854684512428, |
| "grad_norm": 0.13974086743961503, |
| "learning_rate": 9.587820175527844e-06, |
| "loss": 0.4633, |
| "mean_token_accuracy": 0.8453876450657845, |
| "num_tokens": 196226772.0, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.6195028680688335, |
| "grad_norm": 0.13737097736695414, |
| "learning_rate": 9.585433176334426e-06, |
| "loss": 0.4881, |
| "mean_token_accuracy": 0.8374150991439819, |
| "num_tokens": 196802772.0, |
| "step": 343 |
| }, |
| { |
| "epoch": 2.6271510516252388, |
| "grad_norm": 0.1328384415039295, |
| "learning_rate": 9.583039584225966e-06, |
| "loss": 0.452, |
| "mean_token_accuracy": 0.8489537835121155, |
| "num_tokens": 197378772.0, |
| "step": 344 |
| }, |
| { |
| "epoch": 2.6347992351816445, |
| "grad_norm": 0.13333625376712616, |
| "learning_rate": 9.580639402643957e-06, |
| "loss": 0.4749, |
| "mean_token_accuracy": 0.8421444669365883, |
| "num_tokens": 197954772.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.64244741873805, |
| "grad_norm": 0.13099239341333282, |
| "learning_rate": 9.578232635039368e-06, |
| "loss": 0.468, |
| "mean_token_accuracy": 0.8446983769536018, |
| "num_tokens": 198530772.0, |
| "step": 346 |
| }, |
| { |
| "epoch": 2.650095602294455, |
| "grad_norm": 0.1319772619241733, |
| "learning_rate": 9.575819284872638e-06, |
| "loss": 0.4753, |
| "mean_token_accuracy": 0.8412885293364525, |
| "num_tokens": 199106772.0, |
| "step": 347 |
| }, |
| { |
| "epoch": 2.6577437858508604, |
| "grad_norm": 0.1315801229711581, |
| "learning_rate": 9.573399355613675e-06, |
| "loss": 0.4538, |
| "mean_token_accuracy": 0.8480388075113297, |
| "num_tokens": 199682772.0, |
| "step": 348 |
| }, |
| { |
| "epoch": 2.6653919694072656, |
| "grad_norm": 0.14029244455419185, |
| "learning_rate": 9.570972850741839e-06, |
| "loss": 0.4642, |
| "mean_token_accuracy": 0.8463078290224075, |
| "num_tokens": 200258772.0, |
| "step": 349 |
| }, |
| { |
| "epoch": 2.6730401529636714, |
| "grad_norm": 0.1382382609127072, |
| "learning_rate": 9.568539773745947e-06, |
| "loss": 0.4792, |
| "mean_token_accuracy": 0.8414309024810791, |
| "num_tokens": 200834772.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.6806883365200767, |
| "grad_norm": 0.13641202302875485, |
| "learning_rate": 9.56610012812427e-06, |
| "loss": 0.4948, |
| "mean_token_accuracy": 0.8358525335788727, |
| "num_tokens": 201410772.0, |
| "step": 351 |
| }, |
| { |
| "epoch": 2.688336520076482, |
| "grad_norm": 0.13648644317956463, |
| "learning_rate": 9.563653917384515e-06, |
| "loss": 0.4791, |
| "mean_token_accuracy": 0.8403978571295738, |
| "num_tokens": 201986772.0, |
| "step": 352 |
| }, |
| { |
| "epoch": 2.6959847036328872, |
| "grad_norm": 0.1265267382875485, |
| "learning_rate": 9.561201145043835e-06, |
| "loss": 0.4565, |
| "mean_token_accuracy": 0.8470665439963341, |
| "num_tokens": 202562772.0, |
| "step": 353 |
| }, |
| { |
| "epoch": 2.7036328871892925, |
| "grad_norm": 0.14834388827490924, |
| "learning_rate": 9.558741814628815e-06, |
| "loss": 0.4927, |
| "mean_token_accuracy": 0.8357188552618027, |
| "num_tokens": 203138772.0, |
| "step": 354 |
| }, |
| { |
| "epoch": 2.711281070745698, |
| "grad_norm": 0.1339156205744843, |
| "learning_rate": 9.55627592967547e-06, |
| "loss": 0.4552, |
| "mean_token_accuracy": 0.8483408987522125, |
| "num_tokens": 203714772.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.718929254302103, |
| "grad_norm": 0.13745433509215765, |
| "learning_rate": 9.553803493729237e-06, |
| "loss": 0.4797, |
| "mean_token_accuracy": 0.8393444642424583, |
| "num_tokens": 204277037.0, |
| "step": 356 |
| }, |
| { |
| "epoch": 2.7265774378585084, |
| "grad_norm": 0.12921999079532936, |
| "learning_rate": 9.551324510344972e-06, |
| "loss": 0.4649, |
| "mean_token_accuracy": 0.8451011776924133, |
| "num_tokens": 204853037.0, |
| "step": 357 |
| }, |
| { |
| "epoch": 2.734225621414914, |
| "grad_norm": 0.13846536123122738, |
| "learning_rate": 9.548838983086948e-06, |
| "loss": 0.4771, |
| "mean_token_accuracy": 0.8409933820366859, |
| "num_tokens": 205429037.0, |
| "step": 358 |
| }, |
| { |
| "epoch": 2.7418738049713194, |
| "grad_norm": 0.13803335699731223, |
| "learning_rate": 9.546346915528844e-06, |
| "loss": 0.4704, |
| "mean_token_accuracy": 0.8430611714720726, |
| "num_tokens": 206005037.0, |
| "step": 359 |
| }, |
| { |
| "epoch": 2.7495219885277247, |
| "grad_norm": 0.1425603356301425, |
| "learning_rate": 9.543848311253744e-06, |
| "loss": 0.4506, |
| "mean_token_accuracy": 0.8472823351621628, |
| "num_tokens": 206551931.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.75717017208413, |
| "grad_norm": 0.12337957343310935, |
| "learning_rate": 9.541343173854128e-06, |
| "loss": 0.4024, |
| "mean_token_accuracy": 0.8640047535300255, |
| "num_tokens": 207127931.0, |
| "step": 361 |
| }, |
| { |
| "epoch": 2.7648183556405352, |
| "grad_norm": 0.1440988584019457, |
| "learning_rate": 9.538831506931874e-06, |
| "loss": 0.483, |
| "mean_token_accuracy": 0.8385800793766975, |
| "num_tokens": 207703931.0, |
| "step": 362 |
| }, |
| { |
| "epoch": 2.772466539196941, |
| "grad_norm": 0.1336436468088995, |
| "learning_rate": 9.536313314098243e-06, |
| "loss": 0.4869, |
| "mean_token_accuracy": 0.8362970650196075, |
| "num_tokens": 208277831.0, |
| "step": 363 |
| }, |
| { |
| "epoch": 2.7801147227533463, |
| "grad_norm": 0.13491196154454094, |
| "learning_rate": 9.533788598973883e-06, |
| "loss": 0.465, |
| "mean_token_accuracy": 0.8448338061571121, |
| "num_tokens": 208853831.0, |
| "step": 364 |
| }, |
| { |
| "epoch": 2.7877629063097515, |
| "grad_norm": 0.13638411467630132, |
| "learning_rate": 9.531257365188818e-06, |
| "loss": 0.4909, |
| "mean_token_accuracy": 0.8366459757089615, |
| "num_tokens": 209429831.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.795411089866157, |
| "grad_norm": 0.1368094560097927, |
| "learning_rate": 9.528719616382443e-06, |
| "loss": 0.4842, |
| "mean_token_accuracy": 0.8384652063250542, |
| "num_tokens": 209990026.0, |
| "step": 366 |
| }, |
| { |
| "epoch": 2.803059273422562, |
| "grad_norm": 0.13780641532143703, |
| "learning_rate": 9.526175356203521e-06, |
| "loss": 0.4659, |
| "mean_token_accuracy": 0.8452418148517609, |
| "num_tokens": 210566026.0, |
| "step": 367 |
| }, |
| { |
| "epoch": 2.8107074569789674, |
| "grad_norm": 0.14787034743484256, |
| "learning_rate": 9.523624588310181e-06, |
| "loss": 0.4681, |
| "mean_token_accuracy": 0.8442660793662071, |
| "num_tokens": 211142026.0, |
| "step": 368 |
| }, |
| { |
| "epoch": 2.8183556405353727, |
| "grad_norm": 0.13862557544263718, |
| "learning_rate": 9.521067316369903e-06, |
| "loss": 0.4571, |
| "mean_token_accuracy": 0.8479363769292831, |
| "num_tokens": 211718026.0, |
| "step": 369 |
| }, |
| { |
| "epoch": 2.826003824091778, |
| "grad_norm": 0.13772334123321825, |
| "learning_rate": 9.518503544059523e-06, |
| "loss": 0.4799, |
| "mean_token_accuracy": 0.8397033959627151, |
| "num_tokens": 212294026.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.8336520076481837, |
| "grad_norm": 0.13948890447531806, |
| "learning_rate": 9.515933275065218e-06, |
| "loss": 0.4896, |
| "mean_token_accuracy": 0.837375171482563, |
| "num_tokens": 212870026.0, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.841300191204589, |
| "grad_norm": 0.13858419410017597, |
| "learning_rate": 9.513356513082512e-06, |
| "loss": 0.4671, |
| "mean_token_accuracy": 0.8438424468040466, |
| "num_tokens": 213446026.0, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.8489483747609943, |
| "grad_norm": 0.13242895194664173, |
| "learning_rate": 9.510773261816261e-06, |
| "loss": 0.4517, |
| "mean_token_accuracy": 0.8478269800543785, |
| "num_tokens": 214022026.0, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.8565965583173996, |
| "grad_norm": 0.1361208802973601, |
| "learning_rate": 9.508183524980651e-06, |
| "loss": 0.4773, |
| "mean_token_accuracy": 0.8403232023119926, |
| "num_tokens": 214598026.0, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.864244741873805, |
| "grad_norm": 0.14229532646455947, |
| "learning_rate": 9.505587306299196e-06, |
| "loss": 0.4601, |
| "mean_token_accuracy": 0.8460283130407333, |
| "num_tokens": 215174026.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.8718929254302106, |
| "grad_norm": 0.14481534193858964, |
| "learning_rate": 9.502984609504724e-06, |
| "loss": 0.4816, |
| "mean_token_accuracy": 0.8387467563152313, |
| "num_tokens": 215750026.0, |
| "step": 376 |
| }, |
| { |
| "epoch": 2.879541108986616, |
| "grad_norm": 0.12944860389639234, |
| "learning_rate": 9.500375438339384e-06, |
| "loss": 0.4539, |
| "mean_token_accuracy": 0.8480362817645073, |
| "num_tokens": 216311532.0, |
| "step": 377 |
| }, |
| { |
| "epoch": 2.887189292543021, |
| "grad_norm": 0.14010965456936167, |
| "learning_rate": 9.497759796554629e-06, |
| "loss": 0.4715, |
| "mean_token_accuracy": 0.8425246849656105, |
| "num_tokens": 216887532.0, |
| "step": 378 |
| }, |
| { |
| "epoch": 2.8948374760994264, |
| "grad_norm": 0.14339976848438407, |
| "learning_rate": 9.495137687911218e-06, |
| "loss": 0.4645, |
| "mean_token_accuracy": 0.8451827839016914, |
| "num_tokens": 217463532.0, |
| "step": 379 |
| }, |
| { |
| "epoch": 2.9024856596558317, |
| "grad_norm": 0.1425721008440435, |
| "learning_rate": 9.492509116179206e-06, |
| "loss": 0.4758, |
| "mean_token_accuracy": 0.8416374921798706, |
| "num_tokens": 218039532.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.910133843212237, |
| "grad_norm": 0.12904281233195608, |
| "learning_rate": 9.48987408513794e-06, |
| "loss": 0.4513, |
| "mean_token_accuracy": 0.848038800060749, |
| "num_tokens": 218615532.0, |
| "step": 381 |
| }, |
| { |
| "epoch": 2.9177820267686423, |
| "grad_norm": 0.13124547175151383, |
| "learning_rate": 9.487232598576056e-06, |
| "loss": 0.4435, |
| "mean_token_accuracy": 0.8508514240384102, |
| "num_tokens": 219191532.0, |
| "step": 382 |
| }, |
| { |
| "epoch": 2.9254302103250476, |
| "grad_norm": 0.14160726517557595, |
| "learning_rate": 9.484584660291476e-06, |
| "loss": 0.4815, |
| "mean_token_accuracy": 0.8395801186561584, |
| "num_tokens": 219767532.0, |
| "step": 383 |
| }, |
| { |
| "epoch": 2.9330783938814533, |
| "grad_norm": 0.13800419935826905, |
| "learning_rate": 9.481930274091388e-06, |
| "loss": 0.4562, |
| "mean_token_accuracy": 0.8468825072050095, |
| "num_tokens": 220343532.0, |
| "step": 384 |
| }, |
| { |
| "epoch": 2.9407265774378586, |
| "grad_norm": 0.13561246479535266, |
| "learning_rate": 9.47926944379226e-06, |
| "loss": 0.4505, |
| "mean_token_accuracy": 0.8491343483328819, |
| "num_tokens": 220919532.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.948374760994264, |
| "grad_norm": 0.13430277105466684, |
| "learning_rate": 9.476602173219822e-06, |
| "loss": 0.4531, |
| "mean_token_accuracy": 0.8477696999907494, |
| "num_tokens": 221495532.0, |
| "step": 386 |
| }, |
| { |
| "epoch": 2.956022944550669, |
| "grad_norm": 0.1471890474737779, |
| "learning_rate": 9.47392846620906e-06, |
| "loss": 0.4869, |
| "mean_token_accuracy": 0.8385036885738373, |
| "num_tokens": 222071532.0, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.9636711281070744, |
| "grad_norm": 0.15113073329267498, |
| "learning_rate": 9.47124832660422e-06, |
| "loss": 0.4813, |
| "mean_token_accuracy": 0.8399360328912735, |
| "num_tokens": 222647532.0, |
| "step": 388 |
| }, |
| { |
| "epoch": 2.97131931166348, |
| "grad_norm": 0.13475027561090738, |
| "learning_rate": 9.468561758258795e-06, |
| "loss": 0.5047, |
| "mean_token_accuracy": 0.8322638496756554, |
| "num_tokens": 223223532.0, |
| "step": 389 |
| }, |
| { |
| "epoch": 2.9789674952198855, |
| "grad_norm": 0.13848998512836547, |
| "learning_rate": 9.465868765035519e-06, |
| "loss": 0.4982, |
| "mean_token_accuracy": 0.8339010700583458, |
| "num_tokens": 223799532.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.9866156787762907, |
| "grad_norm": 0.1319671191248173, |
| "learning_rate": 9.463169350806369e-06, |
| "loss": 0.4761, |
| "mean_token_accuracy": 0.8407210931181908, |
| "num_tokens": 224361639.0, |
| "step": 391 |
| }, |
| { |
| "epoch": 2.994263862332696, |
| "grad_norm": 0.14429086224338838, |
| "learning_rate": 9.460463519452547e-06, |
| "loss": 0.4894, |
| "mean_token_accuracy": 0.8367779329419136, |
| "num_tokens": 224937639.0, |
| "step": 392 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.14450290671664134, |
| "learning_rate": 9.457751274864486e-06, |
| "loss": 0.4328, |
| "mean_token_accuracy": 0.8539082705974579, |
| "num_tokens": 225369639.0, |
| "step": 393 |
| }, |
| { |
| "epoch": 3.0076481835564053, |
| "grad_norm": 0.14273013240376312, |
| "learning_rate": 9.45503262094184e-06, |
| "loss": 0.4488, |
| "mean_token_accuracy": 0.8485075756907463, |
| "num_tokens": 225945639.0, |
| "step": 394 |
| }, |
| { |
| "epoch": 3.0152963671128106, |
| "grad_norm": 0.1318792500917861, |
| "learning_rate": 9.452307561593476e-06, |
| "loss": 0.4355, |
| "mean_token_accuracy": 0.8525528833270073, |
| "num_tokens": 226521639.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.022944550669216, |
| "grad_norm": 0.13529933934932678, |
| "learning_rate": 9.449576100737474e-06, |
| "loss": 0.4405, |
| "mean_token_accuracy": 0.8521223217248917, |
| "num_tokens": 227097639.0, |
| "step": 396 |
| }, |
| { |
| "epoch": 3.0305927342256216, |
| "grad_norm": 0.1239800938473107, |
| "learning_rate": 9.446838242301113e-06, |
| "loss": 0.4272, |
| "mean_token_accuracy": 0.8561867102980614, |
| "num_tokens": 227673639.0, |
| "step": 397 |
| }, |
| { |
| "epoch": 3.038240917782027, |
| "grad_norm": 0.1346937584568678, |
| "learning_rate": 9.444093990220876e-06, |
| "loss": 0.4578, |
| "mean_token_accuracy": 0.8471168950200081, |
| "num_tokens": 228249639.0, |
| "step": 398 |
| }, |
| { |
| "epoch": 3.045889101338432, |
| "grad_norm": 0.1342389352088606, |
| "learning_rate": 9.441343348442436e-06, |
| "loss": 0.44, |
| "mean_token_accuracy": 0.8513809442520142, |
| "num_tokens": 228825639.0, |
| "step": 399 |
| }, |
| { |
| "epoch": 3.0535372848948374, |
| "grad_norm": 0.12739390345432006, |
| "learning_rate": 9.438586320920651e-06, |
| "loss": 0.421, |
| "mean_token_accuracy": 0.8576086536049843, |
| "num_tokens": 229401639.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.0611854684512427, |
| "grad_norm": 0.13440046577139128, |
| "learning_rate": 9.435822911619564e-06, |
| "loss": 0.4785, |
| "mean_token_accuracy": 0.8404169529676437, |
| "num_tokens": 229977639.0, |
| "step": 401 |
| }, |
| { |
| "epoch": 3.068833652007648, |
| "grad_norm": 0.14325897840167365, |
| "learning_rate": 9.433053124512394e-06, |
| "loss": 0.4928, |
| "mean_token_accuracy": 0.8352709040045738, |
| "num_tokens": 230553639.0, |
| "step": 402 |
| }, |
| { |
| "epoch": 3.0764818355640537, |
| "grad_norm": 0.1420958733294688, |
| "learning_rate": 9.430276963581526e-06, |
| "loss": 0.4624, |
| "mean_token_accuracy": 0.8442712873220444, |
| "num_tokens": 231129639.0, |
| "step": 403 |
| }, |
| { |
| "epoch": 3.084130019120459, |
| "grad_norm": 0.13205149824341564, |
| "learning_rate": 9.427494432818514e-06, |
| "loss": 0.4558, |
| "mean_token_accuracy": 0.8468581959605217, |
| "num_tokens": 231705639.0, |
| "step": 404 |
| }, |
| { |
| "epoch": 3.0917782026768643, |
| "grad_norm": 0.1316126104720129, |
| "learning_rate": 9.424705536224065e-06, |
| "loss": 0.4634, |
| "mean_token_accuracy": 0.8433320075273514, |
| "num_tokens": 232281639.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.0994263862332696, |
| "grad_norm": 0.1407535847573216, |
| "learning_rate": 9.421910277808044e-06, |
| "loss": 0.4896, |
| "mean_token_accuracy": 0.8361390084028244, |
| "num_tokens": 232857639.0, |
| "step": 406 |
| }, |
| { |
| "epoch": 3.107074569789675, |
| "grad_norm": 0.13472622990137864, |
| "learning_rate": 9.419108661589462e-06, |
| "loss": 0.4536, |
| "mean_token_accuracy": 0.8465508967638016, |
| "num_tokens": 233433639.0, |
| "step": 407 |
| }, |
| { |
| "epoch": 3.11472275334608, |
| "grad_norm": 0.1451607665647199, |
| "learning_rate": 9.416300691596469e-06, |
| "loss": 0.4722, |
| "mean_token_accuracy": 0.8422694802284241, |
| "num_tokens": 234009639.0, |
| "step": 408 |
| }, |
| { |
| "epoch": 3.1223709369024855, |
| "grad_norm": 0.1467639357286111, |
| "learning_rate": 9.41348637186635e-06, |
| "loss": 0.4591, |
| "mean_token_accuracy": 0.8459449708461761, |
| "num_tokens": 234585639.0, |
| "step": 409 |
| }, |
| { |
| "epoch": 3.130019120458891, |
| "grad_norm": 0.14178633805551513, |
| "learning_rate": 9.410665706445521e-06, |
| "loss": 0.4388, |
| "mean_token_accuracy": 0.8521133288741112, |
| "num_tokens": 235138310.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.1376673040152965, |
| "grad_norm": 0.13734459053843015, |
| "learning_rate": 9.407838699389525e-06, |
| "loss": 0.4542, |
| "mean_token_accuracy": 0.846922442317009, |
| "num_tokens": 235714310.0, |
| "step": 411 |
| }, |
| { |
| "epoch": 3.1453154875717018, |
| "grad_norm": 0.1332800709874699, |
| "learning_rate": 9.405005354763017e-06, |
| "loss": 0.4542, |
| "mean_token_accuracy": 0.8467904925346375, |
| "num_tokens": 236290310.0, |
| "step": 412 |
| }, |
| { |
| "epoch": 3.152963671128107, |
| "grad_norm": 0.13291737890683744, |
| "learning_rate": 9.40216567663977e-06, |
| "loss": 0.4474, |
| "mean_token_accuracy": 0.8494607359170914, |
| "num_tokens": 236866310.0, |
| "step": 413 |
| }, |
| { |
| "epoch": 3.1606118546845123, |
| "grad_norm": 0.13823265816280292, |
| "learning_rate": 9.399319669102655e-06, |
| "loss": 0.4419, |
| "mean_token_accuracy": 0.8513462394475937, |
| "num_tokens": 237442310.0, |
| "step": 414 |
| }, |
| { |
| "epoch": 3.1682600382409176, |
| "grad_norm": 0.13709902619662914, |
| "learning_rate": 9.396467336243656e-06, |
| "loss": 0.4385, |
| "mean_token_accuracy": 0.8522455841302872, |
| "num_tokens": 238018310.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.1759082217973233, |
| "grad_norm": 0.13499552935521622, |
| "learning_rate": 9.39360868216384e-06, |
| "loss": 0.4576, |
| "mean_token_accuracy": 0.8459102585911751, |
| "num_tokens": 238594310.0, |
| "step": 416 |
| }, |
| { |
| "epoch": 3.1835564053537286, |
| "grad_norm": 0.1433007756568991, |
| "learning_rate": 9.390743710973366e-06, |
| "loss": 0.4664, |
| "mean_token_accuracy": 0.8439188376069069, |
| "num_tokens": 239170310.0, |
| "step": 417 |
| }, |
| { |
| "epoch": 3.191204588910134, |
| "grad_norm": 0.129781541311147, |
| "learning_rate": 9.38787242679148e-06, |
| "loss": 0.4508, |
| "mean_token_accuracy": 0.8471464067697525, |
| "num_tokens": 239746310.0, |
| "step": 418 |
| }, |
| { |
| "epoch": 3.198852772466539, |
| "grad_norm": 0.13681530680608084, |
| "learning_rate": 9.384994833746496e-06, |
| "loss": 0.4529, |
| "mean_token_accuracy": 0.8484097048640251, |
| "num_tokens": 240314805.0, |
| "step": 419 |
| }, |
| { |
| "epoch": 3.2065009560229445, |
| "grad_norm": 0.1352192841534592, |
| "learning_rate": 9.382110935975811e-06, |
| "loss": 0.4646, |
| "mean_token_accuracy": 0.8462948203086853, |
| "num_tokens": 240868750.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.21414913957935, |
| "grad_norm": 0.14157928396699657, |
| "learning_rate": 9.379220737625877e-06, |
| "loss": 0.4686, |
| "mean_token_accuracy": 0.8430594280362129, |
| "num_tokens": 241444750.0, |
| "step": 421 |
| }, |
| { |
| "epoch": 3.221797323135755, |
| "grad_norm": 0.13680279148546654, |
| "learning_rate": 9.376324242852206e-06, |
| "loss": 0.4461, |
| "mean_token_accuracy": 0.8499329835176468, |
| "num_tokens": 242020750.0, |
| "step": 422 |
| }, |
| { |
| "epoch": 3.229445506692161, |
| "grad_norm": 0.13441364291759264, |
| "learning_rate": 9.37342145581937e-06, |
| "loss": 0.4534, |
| "mean_token_accuracy": 0.8477957472205162, |
| "num_tokens": 242596750.0, |
| "step": 423 |
| }, |
| { |
| "epoch": 3.237093690248566, |
| "grad_norm": 0.14996464113428698, |
| "learning_rate": 9.370512380700976e-06, |
| "loss": 0.4538, |
| "mean_token_accuracy": 0.8473148196935654, |
| "num_tokens": 243172750.0, |
| "step": 424 |
| }, |
| { |
| "epoch": 3.2447418738049714, |
| "grad_norm": 0.1291772530250717, |
| "learning_rate": 9.367597021679686e-06, |
| "loss": 0.4576, |
| "mean_token_accuracy": 0.8455560505390167, |
| "num_tokens": 243748750.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.2523900573613767, |
| "grad_norm": 0.14578020775893444, |
| "learning_rate": 9.364675382947185e-06, |
| "loss": 0.4318, |
| "mean_token_accuracy": 0.8554418906569481, |
| "num_tokens": 244324750.0, |
| "step": 426 |
| }, |
| { |
| "epoch": 3.260038240917782, |
| "grad_norm": 0.13750460715362675, |
| "learning_rate": 9.361747468704196e-06, |
| "loss": 0.4418, |
| "mean_token_accuracy": 0.8520320132374763, |
| "num_tokens": 244900750.0, |
| "step": 427 |
| }, |
| { |
| "epoch": 3.2676864244741872, |
| "grad_norm": 0.1372424458911705, |
| "learning_rate": 9.35881328316046e-06, |
| "loss": 0.4765, |
| "mean_token_accuracy": 0.8400749266147614, |
| "num_tokens": 245476750.0, |
| "step": 428 |
| }, |
| { |
| "epoch": 3.275334608030593, |
| "grad_norm": 0.1395311312663304, |
| "learning_rate": 9.35587283053473e-06, |
| "loss": 0.5033, |
| "mean_token_accuracy": 0.8329617977142334, |
| "num_tokens": 246052750.0, |
| "step": 429 |
| }, |
| { |
| "epoch": 3.2829827915869982, |
| "grad_norm": 0.13615900761434377, |
| "learning_rate": 9.352926115054783e-06, |
| "loss": 0.4659, |
| "mean_token_accuracy": 0.8443350717425346, |
| "num_tokens": 246622973.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.2906309751434035, |
| "grad_norm": 0.13476399243406678, |
| "learning_rate": 9.349973140957392e-06, |
| "loss": 0.4588, |
| "mean_token_accuracy": 0.8465144336223602, |
| "num_tokens": 247198973.0, |
| "step": 431 |
| }, |
| { |
| "epoch": 3.298279158699809, |
| "grad_norm": 0.138319841165643, |
| "learning_rate": 9.347013912488324e-06, |
| "loss": 0.4456, |
| "mean_token_accuracy": 0.8498670160770416, |
| "num_tokens": 247774973.0, |
| "step": 432 |
| }, |
| { |
| "epoch": 3.305927342256214, |
| "grad_norm": 0.1295935255491883, |
| "learning_rate": 9.344048433902351e-06, |
| "loss": 0.444, |
| "mean_token_accuracy": 0.8501743152737617, |
| "num_tokens": 248350973.0, |
| "step": 433 |
| }, |
| { |
| "epoch": 3.3135755258126194, |
| "grad_norm": 0.1352566895751494, |
| "learning_rate": 9.34107670946322e-06, |
| "loss": 0.4381, |
| "mean_token_accuracy": 0.8526205867528915, |
| "num_tokens": 248926973.0, |
| "step": 434 |
| }, |
| { |
| "epoch": 3.3212237093690247, |
| "grad_norm": 0.13526914802404547, |
| "learning_rate": 9.338098743443666e-06, |
| "loss": 0.4552, |
| "mean_token_accuracy": 0.8469623699784279, |
| "num_tokens": 249502973.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.3288718929254304, |
| "grad_norm": 0.1378327942247355, |
| "learning_rate": 9.335114540125393e-06, |
| "loss": 0.4983, |
| "mean_token_accuracy": 0.8344531655311584, |
| "num_tokens": 250078973.0, |
| "step": 436 |
| }, |
| { |
| "epoch": 3.3365200764818357, |
| "grad_norm": 0.13678024458347515, |
| "learning_rate": 9.332124103799075e-06, |
| "loss": 0.4648, |
| "mean_token_accuracy": 0.8443667814135551, |
| "num_tokens": 250654973.0, |
| "step": 437 |
| }, |
| { |
| "epoch": 3.344168260038241, |
| "grad_norm": 0.13664942328914748, |
| "learning_rate": 9.329127438764351e-06, |
| "loss": 0.4892, |
| "mean_token_accuracy": 0.8364584743976593, |
| "num_tokens": 251230973.0, |
| "step": 438 |
| }, |
| { |
| "epoch": 3.3518164435946463, |
| "grad_norm": 0.13783383905141766, |
| "learning_rate": 9.32612454932981e-06, |
| "loss": 0.4633, |
| "mean_token_accuracy": 0.8456706553697586, |
| "num_tokens": 251806973.0, |
| "step": 439 |
| }, |
| { |
| "epoch": 3.3594646271510515, |
| "grad_norm": 0.13339399076355143, |
| "learning_rate": 9.323115439812993e-06, |
| "loss": 0.4437, |
| "mean_token_accuracy": 0.8508689105510712, |
| "num_tokens": 252367168.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.367112810707457, |
| "grad_norm": 0.1338168045570386, |
| "learning_rate": 9.320100114540382e-06, |
| "loss": 0.4634, |
| "mean_token_accuracy": 0.8454900979995728, |
| "num_tokens": 252943168.0, |
| "step": 441 |
| }, |
| { |
| "epoch": 3.3747609942638626, |
| "grad_norm": 0.13234254602598014, |
| "learning_rate": 9.317078577847402e-06, |
| "loss": 0.4669, |
| "mean_token_accuracy": 0.843821607530117, |
| "num_tokens": 253519168.0, |
| "step": 442 |
| }, |
| { |
| "epoch": 3.382409177820268, |
| "grad_norm": 0.13860690572904696, |
| "learning_rate": 9.314050834078401e-06, |
| "loss": 0.453, |
| "mean_token_accuracy": 0.8475092798471451, |
| "num_tokens": 254095168.0, |
| "step": 443 |
| }, |
| { |
| "epoch": 3.390057361376673, |
| "grad_norm": 0.14010574280265636, |
| "learning_rate": 9.311016887586659e-06, |
| "loss": 0.4529, |
| "mean_token_accuracy": 0.8471189886331558, |
| "num_tokens": 254669068.0, |
| "step": 444 |
| }, |
| { |
| "epoch": 3.3977055449330784, |
| "grad_norm": 0.13286366341534026, |
| "learning_rate": 9.307976742734366e-06, |
| "loss": 0.4579, |
| "mean_token_accuracy": 0.8456550240516663, |
| "num_tokens": 255245068.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.4053537284894837, |
| "grad_norm": 0.12287227122879738, |
| "learning_rate": 9.304930403892633e-06, |
| "loss": 0.4284, |
| "mean_token_accuracy": 0.855134591460228, |
| "num_tokens": 255821068.0, |
| "step": 446 |
| }, |
| { |
| "epoch": 3.413001912045889, |
| "grad_norm": 0.13496131187274832, |
| "learning_rate": 9.30187787544147e-06, |
| "loss": 0.4519, |
| "mean_token_accuracy": 0.8480301275849342, |
| "num_tokens": 256397068.0, |
| "step": 447 |
| }, |
| { |
| "epoch": 3.4206500956022943, |
| "grad_norm": 0.12304815197408708, |
| "learning_rate": 9.298819161769788e-06, |
| "loss": 0.4055, |
| "mean_token_accuracy": 0.863836444914341, |
| "num_tokens": 256970705.0, |
| "step": 448 |
| }, |
| { |
| "epoch": 3.4282982791587, |
| "grad_norm": 0.1312084281080521, |
| "learning_rate": 9.295754267275393e-06, |
| "loss": 0.4304, |
| "mean_token_accuracy": 0.8547022864222527, |
| "num_tokens": 257546705.0, |
| "step": 449 |
| }, |
| { |
| "epoch": 3.4359464627151053, |
| "grad_norm": 0.1424625968795541, |
| "learning_rate": 9.292683196364975e-06, |
| "loss": 0.5003, |
| "mean_token_accuracy": 0.8330485969781876, |
| "num_tokens": 258122705.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.4435946462715106, |
| "grad_norm": 0.13782377842414753, |
| "learning_rate": 9.289605953454108e-06, |
| "loss": 0.4567, |
| "mean_token_accuracy": 0.8465578481554985, |
| "num_tokens": 258698705.0, |
| "step": 451 |
| }, |
| { |
| "epoch": 3.451242829827916, |
| "grad_norm": 0.13609928849701888, |
| "learning_rate": 9.286522542967235e-06, |
| "loss": 0.4584, |
| "mean_token_accuracy": 0.8467818200588226, |
| "num_tokens": 259274705.0, |
| "step": 452 |
| }, |
| { |
| "epoch": 3.458891013384321, |
| "grad_norm": 0.1261203105793479, |
| "learning_rate": 9.283432969337672e-06, |
| "loss": 0.4616, |
| "mean_token_accuracy": 0.8468790426850319, |
| "num_tokens": 259850705.0, |
| "step": 453 |
| }, |
| { |
| "epoch": 3.4665391969407264, |
| "grad_norm": 0.13614975492427833, |
| "learning_rate": 9.280337237007592e-06, |
| "loss": 0.464, |
| "mean_token_accuracy": 0.8447556719183922, |
| "num_tokens": 260426705.0, |
| "step": 454 |
| }, |
| { |
| "epoch": 3.474187380497132, |
| "grad_norm": 0.1327458325746583, |
| "learning_rate": 9.277235350428029e-06, |
| "loss": 0.4454, |
| "mean_token_accuracy": 0.8494572639465332, |
| "num_tokens": 261002705.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.4818355640535374, |
| "grad_norm": 0.13085159750760433, |
| "learning_rate": 9.274127314058857e-06, |
| "loss": 0.4371, |
| "mean_token_accuracy": 0.852618858218193, |
| "num_tokens": 261578705.0, |
| "step": 456 |
| }, |
| { |
| "epoch": 3.4894837476099427, |
| "grad_norm": 0.13578463225475795, |
| "learning_rate": 9.271013132368799e-06, |
| "loss": 0.4642, |
| "mean_token_accuracy": 0.8443883061408997, |
| "num_tokens": 262140970.0, |
| "step": 457 |
| }, |
| { |
| "epoch": 3.497131931166348, |
| "grad_norm": 0.14006598972599096, |
| "learning_rate": 9.267892809835409e-06, |
| "loss": 0.4637, |
| "mean_token_accuracy": 0.8449813947081566, |
| "num_tokens": 262716970.0, |
| "step": 458 |
| }, |
| { |
| "epoch": 3.5047801147227533, |
| "grad_norm": 0.13185777866523177, |
| "learning_rate": 9.264766350945076e-06, |
| "loss": 0.4302, |
| "mean_token_accuracy": 0.8541102334856987, |
| "num_tokens": 263292970.0, |
| "step": 459 |
| }, |
| { |
| "epoch": 3.5124282982791586, |
| "grad_norm": 0.13342931833994348, |
| "learning_rate": 9.261633760193005e-06, |
| "loss": 0.4686, |
| "mean_token_accuracy": 0.8435091152787209, |
| "num_tokens": 263868970.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.520076481835564, |
| "grad_norm": 0.13168965271918778, |
| "learning_rate": 9.258495042083222e-06, |
| "loss": 0.4434, |
| "mean_token_accuracy": 0.8501882031559944, |
| "num_tokens": 264444970.0, |
| "step": 461 |
| }, |
| { |
| "epoch": 3.527724665391969, |
| "grad_norm": 0.13243605685606727, |
| "learning_rate": 9.255350201128564e-06, |
| "loss": 0.4465, |
| "mean_token_accuracy": 0.8496812433004379, |
| "num_tokens": 265020970.0, |
| "step": 462 |
| }, |
| { |
| "epoch": 3.535372848948375, |
| "grad_norm": 0.1552286657686917, |
| "learning_rate": 9.252199241850666e-06, |
| "loss": 0.446, |
| "mean_token_accuracy": 0.8502941057085991, |
| "num_tokens": 265596970.0, |
| "step": 463 |
| }, |
| { |
| "epoch": 3.54302103250478, |
| "grad_norm": 0.13185696300414065, |
| "learning_rate": 9.249042168779962e-06, |
| "loss": 0.4391, |
| "mean_token_accuracy": 0.8525671735405922, |
| "num_tokens": 266155354.0, |
| "step": 464 |
| }, |
| { |
| "epoch": 3.5506692160611855, |
| "grad_norm": 0.12925331557566425, |
| "learning_rate": 9.245878986455684e-06, |
| "loss": 0.4556, |
| "mean_token_accuracy": 0.8468790277838707, |
| "num_tokens": 266731354.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.5583173996175907, |
| "grad_norm": 0.14112072128802675, |
| "learning_rate": 9.242709699425833e-06, |
| "loss": 0.4499, |
| "mean_token_accuracy": 0.8501986265182495, |
| "num_tokens": 267307354.0, |
| "step": 466 |
| }, |
| { |
| "epoch": 3.565965583173996, |
| "grad_norm": 0.12961482591836798, |
| "learning_rate": 9.2395343122472e-06, |
| "loss": 0.4794, |
| "mean_token_accuracy": 0.8403509855270386, |
| "num_tokens": 267883354.0, |
| "step": 467 |
| }, |
| { |
| "epoch": 3.5736137667304018, |
| "grad_norm": 0.15553386932308658, |
| "learning_rate": 9.236352829485342e-06, |
| "loss": 0.447, |
| "mean_token_accuracy": 0.8502125144004822, |
| "num_tokens": 268459354.0, |
| "step": 468 |
| }, |
| { |
| "epoch": 3.581261950286807, |
| "grad_norm": 0.13773631625374297, |
| "learning_rate": 9.23316525571458e-06, |
| "loss": 0.472, |
| "mean_token_accuracy": 0.8414899259805679, |
| "num_tokens": 269035354.0, |
| "step": 469 |
| }, |
| { |
| "epoch": 3.5889101338432123, |
| "grad_norm": 0.12994566689301978, |
| "learning_rate": 9.229971595517993e-06, |
| "loss": 0.4278, |
| "mean_token_accuracy": 0.8565200641751289, |
| "num_tokens": 269611354.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.5965583173996176, |
| "grad_norm": 0.147867087508267, |
| "learning_rate": 9.226771853487411e-06, |
| "loss": 0.4862, |
| "mean_token_accuracy": 0.8371338397264481, |
| "num_tokens": 270187354.0, |
| "step": 471 |
| }, |
| { |
| "epoch": 3.604206500956023, |
| "grad_norm": 0.1333395328504752, |
| "learning_rate": 9.223566034223409e-06, |
| "loss": 0.4849, |
| "mean_token_accuracy": 0.838104359805584, |
| "num_tokens": 270763354.0, |
| "step": 472 |
| }, |
| { |
| "epoch": 3.611854684512428, |
| "grad_norm": 0.14126948407840498, |
| "learning_rate": 9.2203541423353e-06, |
| "loss": 0.4591, |
| "mean_token_accuracy": 0.8453320935368538, |
| "num_tokens": 271339354.0, |
| "step": 473 |
| }, |
| { |
| "epoch": 3.6195028680688335, |
| "grad_norm": 0.1364081032107475, |
| "learning_rate": 9.217136182441124e-06, |
| "loss": 0.4311, |
| "mean_token_accuracy": 0.8542838543653488, |
| "num_tokens": 271915354.0, |
| "step": 474 |
| }, |
| { |
| "epoch": 3.6271510516252388, |
| "grad_norm": 0.13312098293506802, |
| "learning_rate": 9.213912159167655e-06, |
| "loss": 0.4652, |
| "mean_token_accuracy": 0.8445072919130325, |
| "num_tokens": 272488686.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.6347992351816445, |
| "grad_norm": 0.17152486390579674, |
| "learning_rate": 9.210682077150375e-06, |
| "loss": 0.4693, |
| "mean_token_accuracy": 0.8427104577422142, |
| "num_tokens": 273064686.0, |
| "step": 476 |
| }, |
| { |
| "epoch": 3.64244741873805, |
| "grad_norm": 0.1463912816972637, |
| "learning_rate": 9.207445941033483e-06, |
| "loss": 0.4397, |
| "mean_token_accuracy": 0.8523378744721413, |
| "num_tokens": 273617956.0, |
| "step": 477 |
| }, |
| { |
| "epoch": 3.650095602294455, |
| "grad_norm": 0.1389469013265535, |
| "learning_rate": 9.204203755469879e-06, |
| "loss": 0.4722, |
| "mean_token_accuracy": 0.8406079337000847, |
| "num_tokens": 274193956.0, |
| "step": 478 |
| }, |
| { |
| "epoch": 3.6577437858508604, |
| "grad_norm": 0.1304788404105764, |
| "learning_rate": 9.200955525121165e-06, |
| "loss": 0.4286, |
| "mean_token_accuracy": 0.8560905903577805, |
| "num_tokens": 274756063.0, |
| "step": 479 |
| }, |
| { |
| "epoch": 3.6653919694072656, |
| "grad_norm": 0.13302202550891926, |
| "learning_rate": 9.197701254657631e-06, |
| "loss": 0.4328, |
| "mean_token_accuracy": 0.8527768403291702, |
| "num_tokens": 275332063.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.6730401529636714, |
| "grad_norm": 0.13640185735673466, |
| "learning_rate": 9.19444094875825e-06, |
| "loss": 0.4494, |
| "mean_token_accuracy": 0.848458968102932, |
| "num_tokens": 275908063.0, |
| "step": 481 |
| }, |
| { |
| "epoch": 3.6806883365200767, |
| "grad_norm": 0.1356639995335594, |
| "learning_rate": 9.19117461211068e-06, |
| "loss": 0.4407, |
| "mean_token_accuracy": 0.8518905341625214, |
| "num_tokens": 276469569.0, |
| "step": 482 |
| }, |
| { |
| "epoch": 3.688336520076482, |
| "grad_norm": 0.13619130952960193, |
| "learning_rate": 9.187902249411241e-06, |
| "loss": 0.4502, |
| "mean_token_accuracy": 0.8488947451114655, |
| "num_tokens": 277045569.0, |
| "step": 483 |
| }, |
| { |
| "epoch": 3.6959847036328872, |
| "grad_norm": 0.13708965399379136, |
| "learning_rate": 9.184623865364924e-06, |
| "loss": 0.4769, |
| "mean_token_accuracy": 0.8410697728395462, |
| "num_tokens": 277621569.0, |
| "step": 484 |
| }, |
| { |
| "epoch": 3.7036328871892925, |
| "grad_norm": 0.13720203429880218, |
| "learning_rate": 9.18133946468537e-06, |
| "loss": 0.4651, |
| "mean_token_accuracy": 0.8440004512667656, |
| "num_tokens": 278197569.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.711281070745698, |
| "grad_norm": 0.1299350454538957, |
| "learning_rate": 9.178049052094881e-06, |
| "loss": 0.437, |
| "mean_token_accuracy": 0.8537300229072571, |
| "num_tokens": 278773569.0, |
| "step": 486 |
| }, |
| { |
| "epoch": 3.718929254302103, |
| "grad_norm": 0.13110362377063814, |
| "learning_rate": 9.174752632324394e-06, |
| "loss": 0.4408, |
| "mean_token_accuracy": 0.8510458767414093, |
| "num_tokens": 279349569.0, |
| "step": 487 |
| }, |
| { |
| "epoch": 3.7265774378585084, |
| "grad_norm": 0.13551732054175086, |
| "learning_rate": 9.171450210113487e-06, |
| "loss": 0.4622, |
| "mean_token_accuracy": 0.844747006893158, |
| "num_tokens": 279925569.0, |
| "step": 488 |
| }, |
| { |
| "epoch": 3.734225621414914, |
| "grad_norm": 0.13730675054002467, |
| "learning_rate": 9.16814179021037e-06, |
| "loss": 0.4284, |
| "mean_token_accuracy": 0.8542994931340218, |
| "num_tokens": 280501569.0, |
| "step": 489 |
| }, |
| { |
| "epoch": 3.7418738049713194, |
| "grad_norm": 0.1292955427034706, |
| "learning_rate": 9.16482737737187e-06, |
| "loss": 0.463, |
| "mean_token_accuracy": 0.8444327488541603, |
| "num_tokens": 281077569.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.7495219885277247, |
| "grad_norm": 0.13912430248307478, |
| "learning_rate": 9.161506976363438e-06, |
| "loss": 0.4417, |
| "mean_token_accuracy": 0.8520372435450554, |
| "num_tokens": 281653569.0, |
| "step": 491 |
| }, |
| { |
| "epoch": 3.75717017208413, |
| "grad_norm": 0.1366214099806215, |
| "learning_rate": 9.158180591959131e-06, |
| "loss": 0.4589, |
| "mean_token_accuracy": 0.8459536507725716, |
| "num_tokens": 282229569.0, |
| "step": 492 |
| }, |
| { |
| "epoch": 3.7648183556405352, |
| "grad_norm": 0.1369364998873903, |
| "learning_rate": 9.154848228941607e-06, |
| "loss": 0.4385, |
| "mean_token_accuracy": 0.8520493879914284, |
| "num_tokens": 282805569.0, |
| "step": 493 |
| }, |
| { |
| "epoch": 3.772466539196941, |
| "grad_norm": 0.12498764879815455, |
| "learning_rate": 9.151509892102125e-06, |
| "loss": 0.4247, |
| "mean_token_accuracy": 0.8574089854955673, |
| "num_tokens": 283381569.0, |
| "step": 494 |
| }, |
| { |
| "epoch": 3.7801147227533463, |
| "grad_norm": 0.1322000135860979, |
| "learning_rate": 9.148165586240531e-06, |
| "loss": 0.4491, |
| "mean_token_accuracy": 0.8488652408123016, |
| "num_tokens": 283957569.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.7877629063097515, |
| "grad_norm": 0.13844783624470497, |
| "learning_rate": 9.144815316165251e-06, |
| "loss": 0.4533, |
| "mean_token_accuracy": 0.8480179756879807, |
| "num_tokens": 284533569.0, |
| "step": 496 |
| }, |
| { |
| "epoch": 3.795411089866157, |
| "grad_norm": 0.14356041185467444, |
| "learning_rate": 9.14145908669329e-06, |
| "loss": 0.4415, |
| "mean_token_accuracy": 0.8507316261529922, |
| "num_tokens": 285109569.0, |
| "step": 497 |
| }, |
| { |
| "epoch": 3.803059273422562, |
| "grad_norm": 0.13253514930903268, |
| "learning_rate": 9.138096902650217e-06, |
| "loss": 0.4577, |
| "mean_token_accuracy": 0.845842532813549, |
| "num_tokens": 285685569.0, |
| "step": 498 |
| }, |
| { |
| "epoch": 3.8107074569789674, |
| "grad_norm": 0.1317692376942017, |
| "learning_rate": 9.134728768870167e-06, |
| "loss": 0.4439, |
| "mean_token_accuracy": 0.8503479510545731, |
| "num_tokens": 286261569.0, |
| "step": 499 |
| }, |
| { |
| "epoch": 3.8183556405353727, |
| "grad_norm": 0.12686756349523995, |
| "learning_rate": 9.131354690195827e-06, |
| "loss": 0.4351, |
| "mean_token_accuracy": 0.8522073924541473, |
| "num_tokens": 286837569.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.826003824091778, |
| "grad_norm": 0.12900551278598996, |
| "learning_rate": 9.127974671478432e-06, |
| "loss": 0.4314, |
| "mean_token_accuracy": 0.8539366275072098, |
| "num_tokens": 287413569.0, |
| "step": 501 |
| }, |
| { |
| "epoch": 3.8336520076481837, |
| "grad_norm": 0.13292231226530699, |
| "learning_rate": 9.124588717577759e-06, |
| "loss": 0.4686, |
| "mean_token_accuracy": 0.8422486335039139, |
| "num_tokens": 287989569.0, |
| "step": 502 |
| }, |
| { |
| "epoch": 3.841300191204589, |
| "grad_norm": 0.136683585551859, |
| "learning_rate": 9.121196833362112e-06, |
| "loss": 0.4483, |
| "mean_token_accuracy": 0.8491551652550697, |
| "num_tokens": 288565569.0, |
| "step": 503 |
| }, |
| { |
| "epoch": 3.8489483747609943, |
| "grad_norm": 0.1444111504728688, |
| "learning_rate": 9.117799023708334e-06, |
| "loss": 0.4773, |
| "mean_token_accuracy": 0.8403370901942253, |
| "num_tokens": 289141569.0, |
| "step": 504 |
| }, |
| { |
| "epoch": 3.8565965583173996, |
| "grad_norm": 0.12171464057354885, |
| "learning_rate": 9.114395293501775e-06, |
| "loss": 0.4109, |
| "mean_token_accuracy": 0.8606747463345528, |
| "num_tokens": 289717569.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 3.864244741873805, |
| "grad_norm": 0.13308127354938107, |
| "learning_rate": 9.110985647636303e-06, |
| "loss": 0.4475, |
| "mean_token_accuracy": 0.8503652960062027, |
| "num_tokens": 290293569.0, |
| "step": 506 |
| }, |
| { |
| "epoch": 3.8718929254302106, |
| "grad_norm": 0.16593234316109462, |
| "learning_rate": 9.107570091014295e-06, |
| "loss": 0.479, |
| "mean_token_accuracy": 0.8400173112750053, |
| "num_tokens": 290868510.0, |
| "step": 507 |
| }, |
| { |
| "epoch": 3.879541108986616, |
| "grad_norm": 0.13689658405775249, |
| "learning_rate": 9.10414862854662e-06, |
| "loss": 0.4492, |
| "mean_token_accuracy": 0.8498774170875549, |
| "num_tokens": 291444510.0, |
| "step": 508 |
| }, |
| { |
| "epoch": 3.887189292543021, |
| "grad_norm": 0.12795882142422862, |
| "learning_rate": 9.100721265152644e-06, |
| "loss": 0.4376, |
| "mean_token_accuracy": 0.8533775582909584, |
| "num_tokens": 292020510.0, |
| "step": 509 |
| }, |
| { |
| "epoch": 3.8948374760994264, |
| "grad_norm": 0.13169792505551053, |
| "learning_rate": 9.097288005760213e-06, |
| "loss": 0.4498, |
| "mean_token_accuracy": 0.8481290861964226, |
| "num_tokens": 292596510.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 3.9024856596558317, |
| "grad_norm": 0.1297287757258664, |
| "learning_rate": 9.09384885530565e-06, |
| "loss": 0.4605, |
| "mean_token_accuracy": 0.8447956144809723, |
| "num_tokens": 293172510.0, |
| "step": 511 |
| }, |
| { |
| "epoch": 3.910133843212237, |
| "grad_norm": 0.1376874016028007, |
| "learning_rate": 9.09040381873375e-06, |
| "loss": 0.476, |
| "mean_token_accuracy": 0.8403527215123177, |
| "num_tokens": 293748510.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 3.9177820267686423, |
| "grad_norm": 0.1297958223852381, |
| "learning_rate": 9.086952900997774e-06, |
| "loss": 0.4518, |
| "mean_token_accuracy": 0.8482714593410492, |
| "num_tokens": 294324510.0, |
| "step": 513 |
| }, |
| { |
| "epoch": 3.9254302103250476, |
| "grad_norm": 0.13708161020019655, |
| "learning_rate": 9.083496107059433e-06, |
| "loss": 0.4636, |
| "mean_token_accuracy": 0.8451602086424828, |
| "num_tokens": 294900510.0, |
| "step": 514 |
| }, |
| { |
| "epoch": 3.9330783938814533, |
| "grad_norm": 0.12977023510450897, |
| "learning_rate": 9.08003344188889e-06, |
| "loss": 0.4627, |
| "mean_token_accuracy": 0.8444761633872986, |
| "num_tokens": 295476510.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 3.9407265774378586, |
| "grad_norm": 0.13266699207916796, |
| "learning_rate": 9.076564910464753e-06, |
| "loss": 0.4432, |
| "mean_token_accuracy": 0.8500024378299713, |
| "num_tokens": 296052510.0, |
| "step": 516 |
| }, |
| { |
| "epoch": 3.948374760994264, |
| "grad_norm": 0.13852665867070613, |
| "learning_rate": 9.073090517774057e-06, |
| "loss": 0.4505, |
| "mean_token_accuracy": 0.8480318710207939, |
| "num_tokens": 296628510.0, |
| "step": 517 |
| }, |
| { |
| "epoch": 3.956022944550669, |
| "grad_norm": 0.13867926058656033, |
| "learning_rate": 9.06961026881227e-06, |
| "loss": 0.4736, |
| "mean_token_accuracy": 0.8420107811689377, |
| "num_tokens": 297204510.0, |
| "step": 518 |
| }, |
| { |
| "epoch": 3.9636711281070744, |
| "grad_norm": 0.1472014718774958, |
| "learning_rate": 9.066124168583277e-06, |
| "loss": 0.4339, |
| "mean_token_accuracy": 0.8534956350922585, |
| "num_tokens": 297780510.0, |
| "step": 519 |
| }, |
| { |
| "epoch": 3.97131931166348, |
| "grad_norm": 0.14861258236560726, |
| "learning_rate": 9.062632222099375e-06, |
| "loss": 0.4604, |
| "mean_token_accuracy": 0.8463720604777336, |
| "num_tokens": 298356510.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 3.9789674952198855, |
| "grad_norm": 0.13635801589712673, |
| "learning_rate": 9.059134434381274e-06, |
| "loss": 0.4673, |
| "mean_token_accuracy": 0.8437070325016975, |
| "num_tokens": 298932510.0, |
| "step": 521 |
| }, |
| { |
| "epoch": 3.9866156787762907, |
| "grad_norm": 0.13517971159266337, |
| "learning_rate": 9.055630810458072e-06, |
| "loss": 0.4703, |
| "mean_token_accuracy": 0.8419812694191933, |
| "num_tokens": 299508510.0, |
| "step": 522 |
| }, |
| { |
| "epoch": 3.994263862332696, |
| "grad_norm": 0.13502762281684197, |
| "learning_rate": 9.052121355367267e-06, |
| "loss": 0.447, |
| "mean_token_accuracy": 0.8502528890967369, |
| "num_tokens": 300071157.0, |
| "step": 523 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.16299524977157676, |
| "learning_rate": 9.048606074154738e-06, |
| "loss": 0.4471, |
| "mean_token_accuracy": 0.8504007657368978, |
| "num_tokens": 300492852.0, |
| "step": 524 |
| }, |
| { |
| "epoch": 4.007648183556405, |
| "grad_norm": 0.1367796055868044, |
| "learning_rate": 9.045084971874738e-06, |
| "loss": 0.4092, |
| "mean_token_accuracy": 0.8609004467725754, |
| "num_tokens": 301068852.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 4.015296367112811, |
| "grad_norm": 0.13716134351284365, |
| "learning_rate": 9.041558053589894e-06, |
| "loss": 0.43, |
| "mean_token_accuracy": 0.853941835463047, |
| "num_tokens": 301644852.0, |
| "step": 526 |
| }, |
| { |
| "epoch": 4.022944550669216, |
| "grad_norm": 0.13676487285704125, |
| "learning_rate": 9.038025324371192e-06, |
| "loss": 0.4638, |
| "mean_token_accuracy": 0.8445838019251823, |
| "num_tokens": 302220852.0, |
| "step": 527 |
| }, |
| { |
| "epoch": 4.030592734225621, |
| "grad_norm": 0.13821684713351107, |
| "learning_rate": 9.034486789297973e-06, |
| "loss": 0.4609, |
| "mean_token_accuracy": 0.8445247709751129, |
| "num_tokens": 302796852.0, |
| "step": 528 |
| }, |
| { |
| "epoch": 4.038240917782026, |
| "grad_norm": 0.14599457165525512, |
| "learning_rate": 9.030942453457928e-06, |
| "loss": 0.4614, |
| "mean_token_accuracy": 0.8445004597306252, |
| "num_tokens": 303372852.0, |
| "step": 529 |
| }, |
| { |
| "epoch": 4.045889101338432, |
| "grad_norm": 0.16625511185143685, |
| "learning_rate": 9.027392321947088e-06, |
| "loss": 0.4793, |
| "mean_token_accuracy": 0.838548831641674, |
| "num_tokens": 303948852.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.053537284894838, |
| "grad_norm": 0.138119624588249, |
| "learning_rate": 9.023836399869814e-06, |
| "loss": 0.4242, |
| "mean_token_accuracy": 0.8558134436607361, |
| "num_tokens": 304524852.0, |
| "step": 531 |
| }, |
| { |
| "epoch": 4.061185468451243, |
| "grad_norm": 0.14231125279660453, |
| "learning_rate": 9.020274692338796e-06, |
| "loss": 0.4149, |
| "mean_token_accuracy": 0.858605220913887, |
| "num_tokens": 305100852.0, |
| "step": 532 |
| }, |
| { |
| "epoch": 4.0688336520076485, |
| "grad_norm": 0.13579393688100058, |
| "learning_rate": 9.01670720447504e-06, |
| "loss": 0.4289, |
| "mean_token_accuracy": 0.8546015843749046, |
| "num_tokens": 305676852.0, |
| "step": 533 |
| }, |
| { |
| "epoch": 4.076481835564054, |
| "grad_norm": 0.14167790695591406, |
| "learning_rate": 9.013133941407866e-06, |
| "loss": 0.4585, |
| "mean_token_accuracy": 0.8454310745000839, |
| "num_tokens": 306252852.0, |
| "step": 534 |
| }, |
| { |
| "epoch": 4.084130019120459, |
| "grad_norm": 0.1352511945058088, |
| "learning_rate": 9.009554908274893e-06, |
| "loss": 0.444, |
| "mean_token_accuracy": 0.8504850938916206, |
| "num_tokens": 306828852.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.091778202676864, |
| "grad_norm": 0.1432264112617214, |
| "learning_rate": 9.00597011022204e-06, |
| "loss": 0.4587, |
| "mean_token_accuracy": 0.8464554026722908, |
| "num_tokens": 307404852.0, |
| "step": 536 |
| }, |
| { |
| "epoch": 4.09942638623327, |
| "grad_norm": 0.13151056491832114, |
| "learning_rate": 9.00237955240351e-06, |
| "loss": 0.4168, |
| "mean_token_accuracy": 0.8592042028903961, |
| "num_tokens": 307980852.0, |
| "step": 537 |
| }, |
| { |
| "epoch": 4.107074569789675, |
| "grad_norm": 0.133618272264501, |
| "learning_rate": 8.998783239981796e-06, |
| "loss": 0.4271, |
| "mean_token_accuracy": 0.8552821651101112, |
| "num_tokens": 308556852.0, |
| "step": 538 |
| }, |
| { |
| "epoch": 4.11472275334608, |
| "grad_norm": 0.13915971751181871, |
| "learning_rate": 8.995181178127659e-06, |
| "loss": 0.4518, |
| "mean_token_accuracy": 0.8473391234874725, |
| "num_tokens": 309132852.0, |
| "step": 539 |
| }, |
| { |
| "epoch": 4.1223709369024855, |
| "grad_norm": 0.13454371391547879, |
| "learning_rate": 8.991573372020123e-06, |
| "loss": 0.4324, |
| "mean_token_accuracy": 0.8538081347942352, |
| "num_tokens": 309708852.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.130019120458891, |
| "grad_norm": 0.13213992505186528, |
| "learning_rate": 8.987959826846479e-06, |
| "loss": 0.4461, |
| "mean_token_accuracy": 0.8492263630032539, |
| "num_tokens": 310284852.0, |
| "step": 541 |
| }, |
| { |
| "epoch": 4.137667304015296, |
| "grad_norm": 0.13593594697201444, |
| "learning_rate": 8.984340547802264e-06, |
| "loss": 0.4461, |
| "mean_token_accuracy": 0.8491100370883942, |
| "num_tokens": 310860852.0, |
| "step": 542 |
| }, |
| { |
| "epoch": 4.145315487571701, |
| "grad_norm": 0.12981333151349195, |
| "learning_rate": 8.980715540091263e-06, |
| "loss": 0.4266, |
| "mean_token_accuracy": 0.8562596216797829, |
| "num_tokens": 311436852.0, |
| "step": 543 |
| }, |
| { |
| "epoch": 4.1529636711281075, |
| "grad_norm": 0.13614955901925288, |
| "learning_rate": 8.977084808925494e-06, |
| "loss": 0.4121, |
| "mean_token_accuracy": 0.859827496111393, |
| "num_tokens": 312012852.0, |
| "step": 544 |
| }, |
| { |
| "epoch": 4.160611854684513, |
| "grad_norm": 0.13682770116117984, |
| "learning_rate": 8.973448359525207e-06, |
| "loss": 0.4318, |
| "mean_token_accuracy": 0.8550026342272758, |
| "num_tokens": 312588852.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.168260038240918, |
| "grad_norm": 0.137849063910186, |
| "learning_rate": 8.96980619711887e-06, |
| "loss": 0.4395, |
| "mean_token_accuracy": 0.8518254309892654, |
| "num_tokens": 313164852.0, |
| "step": 546 |
| }, |
| { |
| "epoch": 4.175908221797323, |
| "grad_norm": 0.13710001561248336, |
| "learning_rate": 8.96615832694317e-06, |
| "loss": 0.4189, |
| "mean_token_accuracy": 0.8579558879137039, |
| "num_tokens": 313740852.0, |
| "step": 547 |
| }, |
| { |
| "epoch": 4.183556405353729, |
| "grad_norm": 0.13484817825542728, |
| "learning_rate": 8.962504754242997e-06, |
| "loss": 0.4178, |
| "mean_token_accuracy": 0.8575565740466118, |
| "num_tokens": 314316852.0, |
| "step": 548 |
| }, |
| { |
| "epoch": 4.191204588910134, |
| "grad_norm": 0.1355879929584855, |
| "learning_rate": 8.958845484271443e-06, |
| "loss": 0.4326, |
| "mean_token_accuracy": 0.8534713238477707, |
| "num_tokens": 314892852.0, |
| "step": 549 |
| }, |
| { |
| "epoch": 4.198852772466539, |
| "grad_norm": 0.13093335486708962, |
| "learning_rate": 8.955180522289787e-06, |
| "loss": 0.3991, |
| "mean_token_accuracy": 0.8641314953565598, |
| "num_tokens": 315468852.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.2065009560229445, |
| "grad_norm": 0.1356895962011802, |
| "learning_rate": 8.951509873567498e-06, |
| "loss": 0.4595, |
| "mean_token_accuracy": 0.845748782157898, |
| "num_tokens": 316044852.0, |
| "step": 551 |
| }, |
| { |
| "epoch": 4.21414913957935, |
| "grad_norm": 0.1359263330797516, |
| "learning_rate": 8.947833543382216e-06, |
| "loss": 0.4512, |
| "mean_token_accuracy": 0.8480579107999802, |
| "num_tokens": 316620852.0, |
| "step": 552 |
| }, |
| { |
| "epoch": 4.221797323135755, |
| "grad_norm": 0.13568308044384295, |
| "learning_rate": 8.944151537019752e-06, |
| "loss": 0.4331, |
| "mean_token_accuracy": 0.8542803898453712, |
| "num_tokens": 317196852.0, |
| "step": 553 |
| }, |
| { |
| "epoch": 4.22944550669216, |
| "grad_norm": 0.14107612547899365, |
| "learning_rate": 8.940463859774078e-06, |
| "loss": 0.4391, |
| "mean_token_accuracy": 0.8508462160825729, |
| "num_tokens": 317772852.0, |
| "step": 554 |
| }, |
| { |
| "epoch": 4.237093690248566, |
| "grad_norm": 0.14768495035601778, |
| "learning_rate": 8.93677051694732e-06, |
| "loss": 0.4169, |
| "mean_token_accuracy": 0.8585600778460503, |
| "num_tokens": 318348852.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 4.244741873804971, |
| "grad_norm": 0.1340278519676253, |
| "learning_rate": 8.93307151384975e-06, |
| "loss": 0.4234, |
| "mean_token_accuracy": 0.8561016395688057, |
| "num_tokens": 318924852.0, |
| "step": 556 |
| }, |
| { |
| "epoch": 4.252390057361376, |
| "grad_norm": 0.13680193270577393, |
| "learning_rate": 8.929366855799777e-06, |
| "loss": 0.4493, |
| "mean_token_accuracy": 0.8488930016756058, |
| "num_tokens": 319500852.0, |
| "step": 557 |
| }, |
| { |
| "epoch": 4.260038240917782, |
| "grad_norm": 0.13852513522634227, |
| "learning_rate": 8.925656548123942e-06, |
| "loss": 0.4648, |
| "mean_token_accuracy": 0.8442719057202339, |
| "num_tokens": 320063117.0, |
| "step": 558 |
| }, |
| { |
| "epoch": 4.267686424474188, |
| "grad_norm": 0.13624189114266874, |
| "learning_rate": 8.92194059615691e-06, |
| "loss": 0.437, |
| "mean_token_accuracy": 0.8514139503240585, |
| "num_tokens": 320639117.0, |
| "step": 559 |
| }, |
| { |
| "epoch": 4.275334608030593, |
| "grad_norm": 0.13350408206652886, |
| "learning_rate": 8.918219005241458e-06, |
| "loss": 0.4589, |
| "mean_token_accuracy": 0.844880685210228, |
| "num_tokens": 321215117.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 4.282982791586998, |
| "grad_norm": 0.13649099774497725, |
| "learning_rate": 8.914491780728471e-06, |
| "loss": 0.4671, |
| "mean_token_accuracy": 0.8434830605983734, |
| "num_tokens": 321791117.0, |
| "step": 561 |
| }, |
| { |
| "epoch": 4.2906309751434035, |
| "grad_norm": 0.14170690277634168, |
| "learning_rate": 8.91075892797694e-06, |
| "loss": 0.4458, |
| "mean_token_accuracy": 0.8494138568639755, |
| "num_tokens": 322367117.0, |
| "step": 562 |
| }, |
| { |
| "epoch": 4.298279158699809, |
| "grad_norm": 0.1428239008677417, |
| "learning_rate": 8.90702045235394e-06, |
| "loss": 0.4537, |
| "mean_token_accuracy": 0.8470439687371254, |
| "num_tokens": 322943117.0, |
| "step": 563 |
| }, |
| { |
| "epoch": 4.305927342256214, |
| "grad_norm": 0.13750099119258852, |
| "learning_rate": 8.903276359234638e-06, |
| "loss": 0.434, |
| "mean_token_accuracy": 0.8533461019396782, |
| "num_tokens": 323518058.0, |
| "step": 564 |
| }, |
| { |
| "epoch": 4.313575525812619, |
| "grad_norm": 0.13798858771811107, |
| "learning_rate": 8.899526654002268e-06, |
| "loss": 0.4482, |
| "mean_token_accuracy": 0.8492783904075623, |
| "num_tokens": 324071328.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 4.321223709369025, |
| "grad_norm": 0.13569499666557838, |
| "learning_rate": 8.895771342048145e-06, |
| "loss": 0.4322, |
| "mean_token_accuracy": 0.8544071167707443, |
| "num_tokens": 324647328.0, |
| "step": 566 |
| }, |
| { |
| "epoch": 4.32887189292543, |
| "grad_norm": 0.14183511720361952, |
| "learning_rate": 8.892010428771638e-06, |
| "loss": 0.4325, |
| "mean_token_accuracy": 0.8530407473444939, |
| "num_tokens": 325223328.0, |
| "step": 567 |
| }, |
| { |
| "epoch": 4.336520076481835, |
| "grad_norm": 0.14107156072568003, |
| "learning_rate": 8.88824391958017e-06, |
| "loss": 0.4307, |
| "mean_token_accuracy": 0.8547335267066956, |
| "num_tokens": 325799328.0, |
| "step": 568 |
| }, |
| { |
| "epoch": 4.3441682600382405, |
| "grad_norm": 0.13599509659002554, |
| "learning_rate": 8.88447181988921e-06, |
| "loss": 0.4258, |
| "mean_token_accuracy": 0.8559349700808525, |
| "num_tokens": 326375328.0, |
| "step": 569 |
| }, |
| { |
| "epoch": 4.351816443594647, |
| "grad_norm": 0.14168570065592437, |
| "learning_rate": 8.88069413512227e-06, |
| "loss": 0.447, |
| "mean_token_accuracy": 0.8499486148357391, |
| "num_tokens": 326951328.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 4.359464627151052, |
| "grad_norm": 0.14287905535251402, |
| "learning_rate": 8.876910870710885e-06, |
| "loss": 0.4468, |
| "mean_token_accuracy": 0.8484693765640259, |
| "num_tokens": 327527328.0, |
| "step": 571 |
| }, |
| { |
| "epoch": 4.367112810707457, |
| "grad_norm": 0.13966911539057147, |
| "learning_rate": 8.873122032094614e-06, |
| "loss": 0.4578, |
| "mean_token_accuracy": 0.8461237996816635, |
| "num_tokens": 328103328.0, |
| "step": 572 |
| }, |
| { |
| "epoch": 4.374760994263863, |
| "grad_norm": 0.14261415507458267, |
| "learning_rate": 8.869327624721033e-06, |
| "loss": 0.4484, |
| "mean_token_accuracy": 0.8485492318868637, |
| "num_tokens": 328679328.0, |
| "step": 573 |
| }, |
| { |
| "epoch": 4.382409177820268, |
| "grad_norm": 0.133369532678928, |
| "learning_rate": 8.865527654045727e-06, |
| "loss": 0.4298, |
| "mean_token_accuracy": 0.8543098941445351, |
| "num_tokens": 329255328.0, |
| "step": 574 |
| }, |
| { |
| "epoch": 4.390057361376673, |
| "grad_norm": 0.14255996347846578, |
| "learning_rate": 8.861722125532272e-06, |
| "loss": 0.4581, |
| "mean_token_accuracy": 0.84616519510746, |
| "num_tokens": 329817975.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 4.397705544933078, |
| "grad_norm": 0.14265360616209308, |
| "learning_rate": 8.857911044652244e-06, |
| "loss": 0.4457, |
| "mean_token_accuracy": 0.8503062576055527, |
| "num_tokens": 330393975.0, |
| "step": 576 |
| }, |
| { |
| "epoch": 4.405353728489484, |
| "grad_norm": 0.14108492866232827, |
| "learning_rate": 8.854094416885192e-06, |
| "loss": 0.4797, |
| "mean_token_accuracy": 0.8391808122396469, |
| "num_tokens": 330969975.0, |
| "step": 577 |
| }, |
| { |
| "epoch": 4.413001912045889, |
| "grad_norm": 0.1397903736370164, |
| "learning_rate": 8.850272247718654e-06, |
| "loss": 0.4497, |
| "mean_token_accuracy": 0.8479728251695633, |
| "num_tokens": 331545975.0, |
| "step": 578 |
| }, |
| { |
| "epoch": 4.420650095602294, |
| "grad_norm": 0.1421238151544833, |
| "learning_rate": 8.84644454264812e-06, |
| "loss": 0.4181, |
| "mean_token_accuracy": 0.8584993183612823, |
| "num_tokens": 332121975.0, |
| "step": 579 |
| }, |
| { |
| "epoch": 4.4282982791587, |
| "grad_norm": 0.13112204100458139, |
| "learning_rate": 8.842611307177051e-06, |
| "loss": 0.4182, |
| "mean_token_accuracy": 0.857766643166542, |
| "num_tokens": 332697975.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 4.435946462715105, |
| "grad_norm": 0.1421745075127969, |
| "learning_rate": 8.838772546816857e-06, |
| "loss": 0.43, |
| "mean_token_accuracy": 0.8541948571801186, |
| "num_tokens": 333251920.0, |
| "step": 581 |
| }, |
| { |
| "epoch": 4.44359464627151, |
| "grad_norm": 0.1444747863216718, |
| "learning_rate": 8.834928267086884e-06, |
| "loss": 0.4605, |
| "mean_token_accuracy": 0.8450855612754822, |
| "num_tokens": 333827920.0, |
| "step": 582 |
| }, |
| { |
| "epoch": 4.451242829827915, |
| "grad_norm": 0.13443943817675832, |
| "learning_rate": 8.831078473514427e-06, |
| "loss": 0.4485, |
| "mean_token_accuracy": 0.8491065725684166, |
| "num_tokens": 334403920.0, |
| "step": 583 |
| }, |
| { |
| "epoch": 4.458891013384322, |
| "grad_norm": 0.12806454601850675, |
| "learning_rate": 8.827223171634698e-06, |
| "loss": 0.4518, |
| "mean_token_accuracy": 0.8473460748791695, |
| "num_tokens": 334979920.0, |
| "step": 584 |
| }, |
| { |
| "epoch": 4.466539196940727, |
| "grad_norm": 0.14571665310132872, |
| "learning_rate": 8.823362366990833e-06, |
| "loss": 0.4519, |
| "mean_token_accuracy": 0.847073495388031, |
| "num_tokens": 335555920.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 4.474187380497132, |
| "grad_norm": 0.12883941943627572, |
| "learning_rate": 8.819496065133879e-06, |
| "loss": 0.4226, |
| "mean_token_accuracy": 0.8574985191226006, |
| "num_tokens": 336114304.0, |
| "step": 586 |
| }, |
| { |
| "epoch": 4.4818355640535374, |
| "grad_norm": 0.1335387356959434, |
| "learning_rate": 8.81562427162279e-06, |
| "loss": 0.4597, |
| "mean_token_accuracy": 0.8447174802422523, |
| "num_tokens": 336690304.0, |
| "step": 587 |
| }, |
| { |
| "epoch": 4.489483747609943, |
| "grad_norm": 0.12889357337389204, |
| "learning_rate": 8.81174699202441e-06, |
| "loss": 0.4405, |
| "mean_token_accuracy": 0.8510493487119675, |
| "num_tokens": 337266304.0, |
| "step": 588 |
| }, |
| { |
| "epoch": 4.497131931166348, |
| "grad_norm": 0.1330406876609962, |
| "learning_rate": 8.807864231913475e-06, |
| "loss": 0.4332, |
| "mean_token_accuracy": 0.8526778817176819, |
| "num_tokens": 337842304.0, |
| "step": 589 |
| }, |
| { |
| "epoch": 4.504780114722753, |
| "grad_norm": 0.12979232613135025, |
| "learning_rate": 8.8039759968726e-06, |
| "loss": 0.4236, |
| "mean_token_accuracy": 0.856186717748642, |
| "num_tokens": 338418304.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 4.512428298279159, |
| "grad_norm": 0.13730293405800015, |
| "learning_rate": 8.800082292492274e-06, |
| "loss": 0.4486, |
| "mean_token_accuracy": 0.8482888266444206, |
| "num_tokens": 338994304.0, |
| "step": 591 |
| }, |
| { |
| "epoch": 4.520076481835564, |
| "grad_norm": 0.13489614197624644, |
| "learning_rate": 8.796183124370843e-06, |
| "loss": 0.4306, |
| "mean_token_accuracy": 0.8548116609454155, |
| "num_tokens": 339570304.0, |
| "step": 592 |
| }, |
| { |
| "epoch": 4.527724665391969, |
| "grad_norm": 0.1347926609469706, |
| "learning_rate": 8.792278498114517e-06, |
| "loss": 0.4544, |
| "mean_token_accuracy": 0.8463078364729881, |
| "num_tokens": 340146304.0, |
| "step": 593 |
| }, |
| { |
| "epoch": 4.5353728489483744, |
| "grad_norm": 0.1349897764307115, |
| "learning_rate": 8.788368419337348e-06, |
| "loss": 0.4396, |
| "mean_token_accuracy": 0.8521014824509621, |
| "num_tokens": 340722304.0, |
| "step": 594 |
| }, |
| { |
| "epoch": 4.54302103250478, |
| "grad_norm": 0.1334117681436598, |
| "learning_rate": 8.784452893661229e-06, |
| "loss": 0.4388, |
| "mean_token_accuracy": 0.8514469414949417, |
| "num_tokens": 341298304.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 4.550669216061186, |
| "grad_norm": 0.13453993757890548, |
| "learning_rate": 8.780531926715888e-06, |
| "loss": 0.4408, |
| "mean_token_accuracy": 0.8514851331710815, |
| "num_tokens": 341874304.0, |
| "step": 596 |
| }, |
| { |
| "epoch": 4.558317399617591, |
| "grad_norm": 0.14039109869111963, |
| "learning_rate": 8.77660552413887e-06, |
| "loss": 0.4373, |
| "mean_token_accuracy": 0.8525997698307037, |
| "num_tokens": 342450304.0, |
| "step": 597 |
| }, |
| { |
| "epoch": 4.5659655831739965, |
| "grad_norm": 0.13204418688173547, |
| "learning_rate": 8.772673691575541e-06, |
| "loss": 0.4369, |
| "mean_token_accuracy": 0.8529435247182846, |
| "num_tokens": 343026304.0, |
| "step": 598 |
| }, |
| { |
| "epoch": 4.573613766730402, |
| "grad_norm": 0.13217896451409836, |
| "learning_rate": 8.768736434679073e-06, |
| "loss": 0.4442, |
| "mean_token_accuracy": 0.850242018699646, |
| "num_tokens": 343602304.0, |
| "step": 599 |
| }, |
| { |
| "epoch": 4.581261950286807, |
| "grad_norm": 0.13127212529235552, |
| "learning_rate": 8.764793759110435e-06, |
| "loss": 0.4293, |
| "mean_token_accuracy": 0.8552266061306, |
| "num_tokens": 344178304.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.588910133843212, |
| "grad_norm": 0.1343418903278343, |
| "learning_rate": 8.760845670538387e-06, |
| "loss": 0.4211, |
| "mean_token_accuracy": 0.8569020330905914, |
| "num_tokens": 344754304.0, |
| "step": 601 |
| }, |
| { |
| "epoch": 4.596558317399618, |
| "grad_norm": 0.1271661715666118, |
| "learning_rate": 8.756892174639473e-06, |
| "loss": 0.408, |
| "mean_token_accuracy": 0.8616817370057106, |
| "num_tokens": 345330304.0, |
| "step": 602 |
| }, |
| { |
| "epoch": 4.604206500956023, |
| "grad_norm": 0.13563629821014458, |
| "learning_rate": 8.752933277098012e-06, |
| "loss": 0.437, |
| "mean_token_accuracy": 0.8516730964183807, |
| "num_tokens": 345892411.0, |
| "step": 603 |
| }, |
| { |
| "epoch": 4.611854684512428, |
| "grad_norm": 0.13068071235260656, |
| "learning_rate": 8.74896898360609e-06, |
| "loss": 0.4299, |
| "mean_token_accuracy": 0.8544713631272316, |
| "num_tokens": 346468411.0, |
| "step": 604 |
| }, |
| { |
| "epoch": 4.6195028680688335, |
| "grad_norm": 0.13846369217848975, |
| "learning_rate": 8.744999299863549e-06, |
| "loss": 0.4211, |
| "mean_token_accuracy": 0.857238844037056, |
| "num_tokens": 347044411.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 4.627151051625239, |
| "grad_norm": 0.1367695822273923, |
| "learning_rate": 8.741024231577983e-06, |
| "loss": 0.4491, |
| "mean_token_accuracy": 0.8486238941550255, |
| "num_tokens": 347620411.0, |
| "step": 606 |
| }, |
| { |
| "epoch": 4.634799235181644, |
| "grad_norm": 0.13305300129278447, |
| "learning_rate": 8.737043784464726e-06, |
| "loss": 0.3945, |
| "mean_token_accuracy": 0.8662357404828072, |
| "num_tokens": 348196411.0, |
| "step": 607 |
| }, |
| { |
| "epoch": 4.642447418738049, |
| "grad_norm": 0.14038270415564713, |
| "learning_rate": 8.733057964246849e-06, |
| "loss": 0.4551, |
| "mean_token_accuracy": 0.8459553942084312, |
| "num_tokens": 348772411.0, |
| "step": 608 |
| }, |
| { |
| "epoch": 4.650095602294455, |
| "grad_norm": 0.14422126563292137, |
| "learning_rate": 8.729066776655144e-06, |
| "loss": 0.4335, |
| "mean_token_accuracy": 0.8531865999102592, |
| "num_tokens": 349348411.0, |
| "step": 609 |
| }, |
| { |
| "epoch": 4.657743785850861, |
| "grad_norm": 0.1413112708302926, |
| "learning_rate": 8.725070227428123e-06, |
| "loss": 0.452, |
| "mean_token_accuracy": 0.8475092723965645, |
| "num_tokens": 349924411.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 4.665391969407266, |
| "grad_norm": 0.13342911934210575, |
| "learning_rate": 8.721068322312007e-06, |
| "loss": 0.4195, |
| "mean_token_accuracy": 0.8577718585729599, |
| "num_tokens": 350500411.0, |
| "step": 611 |
| }, |
| { |
| "epoch": 4.673040152963671, |
| "grad_norm": 0.1430879510207355, |
| "learning_rate": 8.717061067060716e-06, |
| "loss": 0.4465, |
| "mean_token_accuracy": 0.8497645780444145, |
| "num_tokens": 351076411.0, |
| "step": 612 |
| }, |
| { |
| "epoch": 4.680688336520077, |
| "grad_norm": 0.14166248034965864, |
| "learning_rate": 8.713048467435865e-06, |
| "loss": 0.4604, |
| "mean_token_accuracy": 0.8444344848394394, |
| "num_tokens": 351652411.0, |
| "step": 613 |
| }, |
| { |
| "epoch": 4.688336520076482, |
| "grad_norm": 0.1430056087650855, |
| "learning_rate": 8.70903052920675e-06, |
| "loss": 0.4295, |
| "mean_token_accuracy": 0.8543862923979759, |
| "num_tokens": 352228411.0, |
| "step": 614 |
| }, |
| { |
| "epoch": 4.695984703632887, |
| "grad_norm": 0.13993804535154394, |
| "learning_rate": 8.705007258150346e-06, |
| "loss": 0.4736, |
| "mean_token_accuracy": 0.8403875529766083, |
| "num_tokens": 352796906.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 4.7036328871892925, |
| "grad_norm": 0.13853295092272744, |
| "learning_rate": 8.700978660051293e-06, |
| "loss": 0.4343, |
| "mean_token_accuracy": 0.8529174774885178, |
| "num_tokens": 353372906.0, |
| "step": 616 |
| }, |
| { |
| "epoch": 4.711281070745698, |
| "grad_norm": 0.13648098405467515, |
| "learning_rate": 8.696944740701891e-06, |
| "loss": 0.4624, |
| "mean_token_accuracy": 0.8432382643222809, |
| "num_tokens": 353948906.0, |
| "step": 617 |
| }, |
| { |
| "epoch": 4.718929254302103, |
| "grad_norm": 0.13063447639184614, |
| "learning_rate": 8.692905505902091e-06, |
| "loss": 0.4432, |
| "mean_token_accuracy": 0.8508288562297821, |
| "num_tokens": 354524906.0, |
| "step": 618 |
| }, |
| { |
| "epoch": 4.726577437858508, |
| "grad_norm": 0.13137608605071113, |
| "learning_rate": 8.688860961459487e-06, |
| "loss": 0.412, |
| "mean_token_accuracy": 0.8600583970546722, |
| "num_tokens": 355100906.0, |
| "step": 619 |
| }, |
| { |
| "epoch": 4.734225621414914, |
| "grad_norm": 0.15267208050688227, |
| "learning_rate": 8.684811113189306e-06, |
| "loss": 0.4541, |
| "mean_token_accuracy": 0.8480162993073463, |
| "num_tokens": 355674543.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 4.741873804971319, |
| "grad_norm": 0.13165612975561047, |
| "learning_rate": 8.6807559669144e-06, |
| "loss": 0.4107, |
| "mean_token_accuracy": 0.8604946732521057, |
| "num_tokens": 356247875.0, |
| "step": 621 |
| }, |
| { |
| "epoch": 4.749521988527725, |
| "grad_norm": 0.12772216877759487, |
| "learning_rate": 8.676695528465244e-06, |
| "loss": 0.4011, |
| "mean_token_accuracy": 0.8628982827067375, |
| "num_tokens": 356809381.0, |
| "step": 622 |
| }, |
| { |
| "epoch": 4.75717017208413, |
| "grad_norm": 0.13488174230220404, |
| "learning_rate": 8.672629803679914e-06, |
| "loss": 0.4498, |
| "mean_token_accuracy": 0.8484294563531876, |
| "num_tokens": 357385381.0, |
| "step": 623 |
| }, |
| { |
| "epoch": 4.764818355640536, |
| "grad_norm": 0.1401189577724157, |
| "learning_rate": 8.668558798404093e-06, |
| "loss": 0.429, |
| "mean_token_accuracy": 0.8550686091184616, |
| "num_tokens": 357961381.0, |
| "step": 624 |
| }, |
| { |
| "epoch": 4.772466539196941, |
| "grad_norm": 0.1364797558773597, |
| "learning_rate": 8.664482518491053e-06, |
| "loss": 0.4539, |
| "mean_token_accuracy": 0.8475179374217987, |
| "num_tokens": 358537381.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 4.780114722753346, |
| "grad_norm": 0.13289176813716883, |
| "learning_rate": 8.660400969801653e-06, |
| "loss": 0.4213, |
| "mean_token_accuracy": 0.8573412746191025, |
| "num_tokens": 359113381.0, |
| "step": 626 |
| }, |
| { |
| "epoch": 4.7877629063097515, |
| "grad_norm": 0.13614408467258662, |
| "learning_rate": 8.65631415820432e-06, |
| "loss": 0.4437, |
| "mean_token_accuracy": 0.8494902551174164, |
| "num_tokens": 359689381.0, |
| "step": 627 |
| }, |
| { |
| "epoch": 4.795411089866157, |
| "grad_norm": 0.1361828076841561, |
| "learning_rate": 8.652222089575059e-06, |
| "loss": 0.4528, |
| "mean_token_accuracy": 0.8468199968338013, |
| "num_tokens": 360265381.0, |
| "step": 628 |
| }, |
| { |
| "epoch": 4.803059273422562, |
| "grad_norm": 0.13328710422461576, |
| "learning_rate": 8.648124769797424e-06, |
| "loss": 0.4365, |
| "mean_token_accuracy": 0.8522612005472183, |
| "num_tokens": 360841381.0, |
| "step": 629 |
| }, |
| { |
| "epoch": 4.810707456978967, |
| "grad_norm": 0.14533854629935952, |
| "learning_rate": 8.644022204762525e-06, |
| "loss": 0.462, |
| "mean_token_accuracy": 0.8440126106142998, |
| "num_tokens": 361401576.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 4.818355640535373, |
| "grad_norm": 0.1308097892624671, |
| "learning_rate": 8.63991440036901e-06, |
| "loss": 0.4136, |
| "mean_token_accuracy": 0.8578916490077972, |
| "num_tokens": 361977576.0, |
| "step": 631 |
| }, |
| { |
| "epoch": 4.826003824091778, |
| "grad_norm": 0.13348566755409633, |
| "learning_rate": 8.63580136252306e-06, |
| "loss": 0.4469, |
| "mean_token_accuracy": 0.8494173437356949, |
| "num_tokens": 362553576.0, |
| "step": 632 |
| }, |
| { |
| "epoch": 4.833652007648183, |
| "grad_norm": 0.13082548673790548, |
| "learning_rate": 8.631683097138386e-06, |
| "loss": 0.4306, |
| "mean_token_accuracy": 0.8541657999157906, |
| "num_tokens": 363129576.0, |
| "step": 633 |
| }, |
| { |
| "epoch": 4.8413001912045885, |
| "grad_norm": 0.13534537707509742, |
| "learning_rate": 8.627559610136209e-06, |
| "loss": 0.4378, |
| "mean_token_accuracy": 0.8518879190087318, |
| "num_tokens": 363705576.0, |
| "step": 634 |
| }, |
| { |
| "epoch": 4.848948374760994, |
| "grad_norm": 0.1322877886744737, |
| "learning_rate": 8.623430907445263e-06, |
| "loss": 0.4501, |
| "mean_token_accuracy": 0.848146453499794, |
| "num_tokens": 364281576.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 4.8565965583174, |
| "grad_norm": 0.1305531222811248, |
| "learning_rate": 8.619296995001773e-06, |
| "loss": 0.4583, |
| "mean_token_accuracy": 0.8479276895523071, |
| "num_tokens": 364857576.0, |
| "step": 636 |
| }, |
| { |
| "epoch": 4.864244741873805, |
| "grad_norm": 0.13613414953238218, |
| "learning_rate": 8.615157878749462e-06, |
| "loss": 0.4291, |
| "mean_token_accuracy": 0.8538307175040245, |
| "num_tokens": 365433576.0, |
| "step": 637 |
| }, |
| { |
| "epoch": 4.871892925430211, |
| "grad_norm": 0.13535260994204976, |
| "learning_rate": 8.611013564639532e-06, |
| "loss": 0.452, |
| "mean_token_accuracy": 0.8471116721630096, |
| "num_tokens": 366009576.0, |
| "step": 638 |
| }, |
| { |
| "epoch": 4.879541108986616, |
| "grad_norm": 0.13913988946297418, |
| "learning_rate": 8.60686405863066e-06, |
| "loss": 0.481, |
| "mean_token_accuracy": 0.8386234864592552, |
| "num_tokens": 366585576.0, |
| "step": 639 |
| }, |
| { |
| "epoch": 4.887189292543021, |
| "grad_norm": 0.13926823889358061, |
| "learning_rate": 8.602709366688989e-06, |
| "loss": 0.4611, |
| "mean_token_accuracy": 0.8453963324427605, |
| "num_tokens": 367161576.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 4.894837476099426, |
| "grad_norm": 0.13451549889430478, |
| "learning_rate": 8.598549494788111e-06, |
| "loss": 0.4458, |
| "mean_token_accuracy": 0.8480683267116547, |
| "num_tokens": 367737576.0, |
| "step": 641 |
| }, |
| { |
| "epoch": 4.902485659655832, |
| "grad_norm": 0.13290646145187515, |
| "learning_rate": 8.594384448909074e-06, |
| "loss": 0.4381, |
| "mean_token_accuracy": 0.8519139811396599, |
| "num_tokens": 368313576.0, |
| "step": 642 |
| }, |
| { |
| "epoch": 4.910133843212237, |
| "grad_norm": 0.14817863417364635, |
| "learning_rate": 8.590214235040363e-06, |
| "loss": 0.4328, |
| "mean_token_accuracy": 0.854470893740654, |
| "num_tokens": 368887476.0, |
| "step": 643 |
| }, |
| { |
| "epoch": 4.917782026768642, |
| "grad_norm": 0.12880758333637257, |
| "learning_rate": 8.586038859177891e-06, |
| "loss": 0.4261, |
| "mean_token_accuracy": 0.8556710705161095, |
| "num_tokens": 369463476.0, |
| "step": 644 |
| }, |
| { |
| "epoch": 4.925430210325048, |
| "grad_norm": 0.13524444500578836, |
| "learning_rate": 8.581858327324996e-06, |
| "loss": 0.4523, |
| "mean_token_accuracy": 0.8463720753788948, |
| "num_tokens": 370039476.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 4.933078393881453, |
| "grad_norm": 0.13170708374690382, |
| "learning_rate": 8.577672645492426e-06, |
| "loss": 0.4198, |
| "mean_token_accuracy": 0.8571259826421738, |
| "num_tokens": 370615476.0, |
| "step": 646 |
| }, |
| { |
| "epoch": 4.940726577437858, |
| "grad_norm": 0.13590194960038368, |
| "learning_rate": 8.573481819698337e-06, |
| "loss": 0.4288, |
| "mean_token_accuracy": 0.8549418747425079, |
| "num_tokens": 371191476.0, |
| "step": 647 |
| }, |
| { |
| "epoch": 4.948374760994264, |
| "grad_norm": 0.13608499203187072, |
| "learning_rate": 8.569285855968278e-06, |
| "loss": 0.4287, |
| "mean_token_accuracy": 0.8553623184561729, |
| "num_tokens": 371761699.0, |
| "step": 648 |
| }, |
| { |
| "epoch": 4.95602294455067, |
| "grad_norm": 0.1581401895374939, |
| "learning_rate": 8.565084760335188e-06, |
| "loss": 0.4357, |
| "mean_token_accuracy": 0.8529161512851715, |
| "num_tokens": 372304065.0, |
| "step": 649 |
| }, |
| { |
| "epoch": 4.963671128107075, |
| "grad_norm": 0.13384003030736613, |
| "learning_rate": 8.560878538839379e-06, |
| "loss": 0.4169, |
| "mean_token_accuracy": 0.858254499733448, |
| "num_tokens": 372880065.0, |
| "step": 650 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 2620, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.564795702483878e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|