{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 513, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005847953216374269, "grad_norm": 3.346622166062452, "learning_rate": 0.0, "loss": 1.1879, "num_tokens": 309834.0, "step": 1 }, { "epoch": 0.011695906432748537, "grad_norm": 3.2150187522491747, "learning_rate": 6.25e-07, "loss": 1.1528, "num_tokens": 626323.0, "step": 2 }, { "epoch": 0.017543859649122806, "grad_norm": 3.1095611576006044, "learning_rate": 1.25e-06, "loss": 1.13, "num_tokens": 962858.0, "step": 3 }, { "epoch": 0.023391812865497075, "grad_norm": 3.0927940568383274, "learning_rate": 1.8750000000000003e-06, "loss": 1.1257, "num_tokens": 1307919.0, "step": 4 }, { "epoch": 0.029239766081871343, "grad_norm": 3.0505204823401963, "learning_rate": 2.5e-06, "loss": 1.1159, "num_tokens": 1643402.0, "step": 5 }, { "epoch": 0.03508771929824561, "grad_norm": 2.8525229189090564, "learning_rate": 3.125e-06, "loss": 1.1119, "num_tokens": 1963547.0, "step": 6 }, { "epoch": 0.04093567251461988, "grad_norm": 2.136607198658089, "learning_rate": 3.7500000000000005e-06, "loss": 1.0281, "num_tokens": 2283318.0, "step": 7 }, { "epoch": 0.04678362573099415, "grad_norm": 1.9705509244699, "learning_rate": 4.3750000000000005e-06, "loss": 1.0088, "num_tokens": 2603880.0, "step": 8 }, { "epoch": 0.05263157894736842, "grad_norm": 1.2382281811593294, "learning_rate": 5e-06, "loss": 0.8683, "num_tokens": 2929732.0, "step": 9 }, { "epoch": 0.05847953216374269, "grad_norm": 1.2462399978155196, "learning_rate": 5.625e-06, "loss": 0.868, "num_tokens": 3252895.0, "step": 10 }, { "epoch": 0.06432748538011696, "grad_norm": 1.1972270533702403, "learning_rate": 6.25e-06, "loss": 0.8151, "num_tokens": 3578517.0, "step": 11 }, { "epoch": 0.07017543859649122, "grad_norm": 1.8012881936401126, "learning_rate": 6.875e-06, "loss": 0.6379, "num_tokens": 3911914.0, "step": 12 }, { "epoch": 0.07602339181286549, "grad_norm": 1.5004715524395629, "learning_rate": 7.500000000000001e-06, "loss": 0.6352, "num_tokens": 4228515.0, "step": 13 }, { "epoch": 0.08187134502923976, "grad_norm": 1.320062812526294, "learning_rate": 8.125000000000001e-06, "loss": 0.6228, "num_tokens": 4536476.0, "step": 14 }, { "epoch": 0.08771929824561403, "grad_norm": 0.9906906777846411, "learning_rate": 8.750000000000001e-06, "loss": 0.5299, "num_tokens": 4868361.0, "step": 15 }, { "epoch": 0.0935672514619883, "grad_norm": 0.8386484072060002, "learning_rate": 9.375000000000001e-06, "loss": 0.4729, "num_tokens": 5216197.0, "step": 16 }, { "epoch": 0.09941520467836257, "grad_norm": 0.4900339517100113, "learning_rate": 1e-05, "loss": 0.4352, "num_tokens": 5557003.0, "step": 17 }, { "epoch": 0.10526315789473684, "grad_norm": 0.40004540169105984, "learning_rate": 9.999910098271881e-06, "loss": 0.3938, "num_tokens": 5892764.0, "step": 18 }, { "epoch": 0.1111111111111111, "grad_norm": 0.3513180037096265, "learning_rate": 9.999640396679666e-06, "loss": 0.4064, "num_tokens": 6207362.0, "step": 19 }, { "epoch": 0.11695906432748537, "grad_norm": 0.6732732756411357, "learning_rate": 9.999190905999637e-06, "loss": 0.3789, "num_tokens": 6496012.0, "step": 20 }, { "epoch": 0.12280701754385964, "grad_norm": 0.31994176912879413, "learning_rate": 9.99856164419179e-06, "loss": 0.3778, "num_tokens": 6804315.0, "step": 21 }, { "epoch": 0.1286549707602339, "grad_norm": 0.548016923041077, "learning_rate": 9.997752636399114e-06, "loss": 0.3673, "num_tokens": 7143380.0, "step": 22 }, { "epoch": 0.13450292397660818, "grad_norm": 0.29902181791620935, "learning_rate": 9.996763914946586e-06, "loss": 0.3593, "num_tokens": 7463502.0, "step": 23 }, { "epoch": 0.14035087719298245, "grad_norm": 0.31707406326522014, "learning_rate": 9.995595519339882e-06, "loss": 0.3577, "num_tokens": 7774770.0, "step": 24 }, { "epoch": 0.14619883040935672, "grad_norm": 0.27176157042044713, "learning_rate": 9.994247496263792e-06, "loss": 0.3395, "num_tokens": 8087750.0, "step": 25 }, { "epoch": 0.15204678362573099, "grad_norm": 0.3065271953474858, "learning_rate": 9.992719899580364e-06, "loss": 0.3474, "num_tokens": 8434239.0, "step": 26 }, { "epoch": 0.15789473684210525, "grad_norm": 0.30277827390998685, "learning_rate": 9.991012790326745e-06, "loss": 0.3121, "num_tokens": 8750905.0, "step": 27 }, { "epoch": 0.16374269005847952, "grad_norm": 0.25470554589584754, "learning_rate": 9.989126236712746e-06, "loss": 0.3197, "num_tokens": 9075220.0, "step": 28 }, { "epoch": 0.1695906432748538, "grad_norm": 0.24585407625854147, "learning_rate": 9.987060314118111e-06, "loss": 0.3217, "num_tokens": 9397453.0, "step": 29 }, { "epoch": 0.17543859649122806, "grad_norm": 0.4536476490136459, "learning_rate": 9.984815105089515e-06, "loss": 0.3369, "num_tokens": 9705728.0, "step": 30 }, { "epoch": 0.18128654970760233, "grad_norm": 0.23980035473967873, "learning_rate": 9.982390699337253e-06, "loss": 0.2813, "num_tokens": 10025621.0, "step": 31 }, { "epoch": 0.1871345029239766, "grad_norm": 0.23564595909740174, "learning_rate": 9.979787193731666e-06, "loss": 0.3259, "num_tokens": 10337176.0, "step": 32 }, { "epoch": 0.19298245614035087, "grad_norm": 0.20709709888139777, "learning_rate": 9.977004692299273e-06, "loss": 0.2945, "num_tokens": 10651259.0, "step": 33 }, { "epoch": 0.19883040935672514, "grad_norm": 0.1973226462113262, "learning_rate": 9.974043306218595e-06, "loss": 0.2922, "num_tokens": 10992918.0, "step": 34 }, { "epoch": 0.2046783625730994, "grad_norm": 0.18869956638769483, "learning_rate": 9.970903153815731e-06, "loss": 0.2835, "num_tokens": 11310394.0, "step": 35 }, { "epoch": 0.21052631578947367, "grad_norm": 0.18686305886535334, "learning_rate": 9.967584360559632e-06, "loss": 0.2925, "num_tokens": 11636774.0, "step": 36 }, { "epoch": 0.21637426900584794, "grad_norm": 0.1725945504953492, "learning_rate": 9.964087059057075e-06, "loss": 0.2592, "num_tokens": 11956378.0, "step": 37 }, { "epoch": 0.2222222222222222, "grad_norm": 0.19881803465402237, "learning_rate": 9.960411389047366e-06, "loss": 0.2955, "num_tokens": 12248057.0, "step": 38 }, { "epoch": 0.22807017543859648, "grad_norm": 0.19691612763541272, "learning_rate": 9.95655749739677e-06, "loss": 0.2835, "num_tokens": 12554456.0, "step": 39 }, { "epoch": 0.23391812865497075, "grad_norm": 0.17622929852503963, "learning_rate": 9.952525538092627e-06, "loss": 0.2726, "num_tokens": 12880847.0, "step": 40 }, { "epoch": 0.23976608187134502, "grad_norm": 0.1622820101848469, "learning_rate": 9.948315672237208e-06, "loss": 0.2692, "num_tokens": 13237415.0, "step": 41 }, { "epoch": 0.24561403508771928, "grad_norm": 0.16492519002263994, "learning_rate": 9.943928068041274e-06, "loss": 0.2791, "num_tokens": 13600570.0, "step": 42 }, { "epoch": 0.25146198830409355, "grad_norm": 0.1659517586811088, "learning_rate": 9.939362900817362e-06, "loss": 0.2499, "num_tokens": 13934393.0, "step": 43 }, { "epoch": 0.2573099415204678, "grad_norm": 0.1675942094434052, "learning_rate": 9.934620352972766e-06, "loss": 0.2709, "num_tokens": 14283319.0, "step": 44 }, { "epoch": 0.2631578947368421, "grad_norm": 0.17033294793266862, "learning_rate": 9.929700614002265e-06, "loss": 0.2712, "num_tokens": 14602358.0, "step": 45 }, { "epoch": 0.26900584795321636, "grad_norm": 0.17218950775910574, "learning_rate": 9.924603880480543e-06, "loss": 0.2768, "num_tokens": 14914703.0, "step": 46 }, { "epoch": 0.27485380116959063, "grad_norm": 0.17142338750289324, "learning_rate": 9.919330356054332e-06, "loss": 0.2677, "num_tokens": 15226891.0, "step": 47 }, { "epoch": 0.2807017543859649, "grad_norm": 0.15935514552682936, "learning_rate": 9.913880251434279e-06, "loss": 0.2513, "num_tokens": 15566110.0, "step": 48 }, { "epoch": 0.28654970760233917, "grad_norm": 0.17590713121193502, "learning_rate": 9.90825378438653e-06, "loss": 0.2413, "num_tokens": 15864678.0, "step": 49 }, { "epoch": 0.29239766081871343, "grad_norm": 0.17105743981722707, "learning_rate": 9.902451179724025e-06, "loss": 0.2566, "num_tokens": 16166802.0, "step": 50 }, { "epoch": 0.2982456140350877, "grad_norm": 0.17592812081480816, "learning_rate": 9.896472669297508e-06, "loss": 0.2432, "num_tokens": 16465873.0, "step": 51 }, { "epoch": 0.30409356725146197, "grad_norm": 0.16854050409478574, "learning_rate": 9.890318491986282e-06, "loss": 0.2434, "num_tokens": 16764387.0, "step": 52 }, { "epoch": 0.30994152046783624, "grad_norm": 0.1722203346036174, "learning_rate": 9.883988893688645e-06, "loss": 0.2533, "num_tokens": 17059312.0, "step": 53 }, { "epoch": 0.3157894736842105, "grad_norm": 0.16603305455782896, "learning_rate": 9.877484127312072e-06, "loss": 0.2492, "num_tokens": 17382890.0, "step": 54 }, { "epoch": 0.3216374269005848, "grad_norm": 0.16334953855577547, "learning_rate": 9.870804452763118e-06, "loss": 0.2563, "num_tokens": 17716146.0, "step": 55 }, { "epoch": 0.32748538011695905, "grad_norm": 0.18120405762456362, "learning_rate": 9.863950136937019e-06, "loss": 0.2532, "num_tokens": 18044901.0, "step": 56 }, { "epoch": 0.3333333333333333, "grad_norm": 0.16955478627673745, "learning_rate": 9.856921453707036e-06, "loss": 0.256, "num_tokens": 18360773.0, "step": 57 }, { "epoch": 0.3391812865497076, "grad_norm": 0.15096634608102888, "learning_rate": 9.849718683913511e-06, "loss": 0.2259, "num_tokens": 18694718.0, "step": 58 }, { "epoch": 0.34502923976608185, "grad_norm": 0.1757789479931499, "learning_rate": 9.842342115352647e-06, "loss": 0.2595, "num_tokens": 19014702.0, "step": 59 }, { "epoch": 0.3508771929824561, "grad_norm": 0.16490033035648094, "learning_rate": 9.834792042764999e-06, "loss": 0.2404, "num_tokens": 19339612.0, "step": 60 }, { "epoch": 0.3567251461988304, "grad_norm": 0.16019689209153504, "learning_rate": 9.827068767823713e-06, "loss": 0.248, "num_tokens": 19681676.0, "step": 61 }, { "epoch": 0.36257309941520466, "grad_norm": 0.1691448945153913, "learning_rate": 9.819172599122466e-06, "loss": 0.2346, "num_tokens": 20000100.0, "step": 62 }, { "epoch": 0.3684210526315789, "grad_norm": 0.16082639569066132, "learning_rate": 9.811103852163126e-06, "loss": 0.2262, "num_tokens": 20319423.0, "step": 63 }, { "epoch": 0.3742690058479532, "grad_norm": 0.15278910823194214, "learning_rate": 9.802862849343155e-06, "loss": 0.2281, "num_tokens": 20664041.0, "step": 64 }, { "epoch": 0.38011695906432746, "grad_norm": 0.16155951042022568, "learning_rate": 9.794449919942736e-06, "loss": 0.241, "num_tokens": 20983214.0, "step": 65 }, { "epoch": 0.38596491228070173, "grad_norm": 0.16422330995295928, "learning_rate": 9.785865400111593e-06, "loss": 0.2489, "num_tokens": 21321454.0, "step": 66 }, { "epoch": 0.391812865497076, "grad_norm": 0.16006949845014626, "learning_rate": 9.777109632855579e-06, "loss": 0.2471, "num_tokens": 21641982.0, "step": 67 }, { "epoch": 0.39766081871345027, "grad_norm": 0.16727932073445337, "learning_rate": 9.768182968022964e-06, "loss": 0.2417, "num_tokens": 21978836.0, "step": 68 }, { "epoch": 0.40350877192982454, "grad_norm": 0.17239803759423833, "learning_rate": 9.759085762290457e-06, "loss": 0.2377, "num_tokens": 22268188.0, "step": 69 }, { "epoch": 0.4093567251461988, "grad_norm": 0.16257813227817636, "learning_rate": 9.749818379148958e-06, "loss": 0.2265, "num_tokens": 22581727.0, "step": 70 }, { "epoch": 0.4152046783625731, "grad_norm": 0.16056196258322708, "learning_rate": 9.74038118888902e-06, "loss": 0.2261, "num_tokens": 22899881.0, "step": 71 }, { "epoch": 0.42105263157894735, "grad_norm": 0.15733806901697214, "learning_rate": 9.730774568586076e-06, "loss": 0.2263, "num_tokens": 23240539.0, "step": 72 }, { "epoch": 0.4269005847953216, "grad_norm": 0.1679225698442003, "learning_rate": 9.720998902085354e-06, "loss": 0.2362, "num_tokens": 23546933.0, "step": 73 }, { "epoch": 0.4327485380116959, "grad_norm": 0.16689340356885685, "learning_rate": 9.71105457998655e-06, "loss": 0.2309, "num_tokens": 23867940.0, "step": 74 }, { "epoch": 0.43859649122807015, "grad_norm": 0.17313794861512294, "learning_rate": 9.70094199962821e-06, "loss": 0.2311, "num_tokens": 24191283.0, "step": 75 }, { "epoch": 0.4444444444444444, "grad_norm": 0.16976811460329427, "learning_rate": 9.690661565071875e-06, "loss": 0.2341, "num_tokens": 24504739.0, "step": 76 }, { "epoch": 0.4502923976608187, "grad_norm": 0.16719875175704804, "learning_rate": 9.68021368708591e-06, "loss": 0.2425, "num_tokens": 24846242.0, "step": 77 }, { "epoch": 0.45614035087719296, "grad_norm": 0.16237180095271134, "learning_rate": 9.66959878312911e-06, "loss": 0.219, "num_tokens": 25163117.0, "step": 78 }, { "epoch": 0.4619883040935672, "grad_norm": 0.16510212997973622, "learning_rate": 9.658817277334013e-06, "loss": 0.2304, "num_tokens": 25498031.0, "step": 79 }, { "epoch": 0.4678362573099415, "grad_norm": 0.16563608461998558, "learning_rate": 9.647869600489954e-06, "loss": 0.231, "num_tokens": 25812720.0, "step": 80 }, { "epoch": 0.47368421052631576, "grad_norm": 0.17555857386256016, "learning_rate": 9.63675619002585e-06, "loss": 0.2323, "num_tokens": 26123680.0, "step": 81 }, { "epoch": 0.47953216374269003, "grad_norm": 0.164333445701762, "learning_rate": 9.625477489992727e-06, "loss": 0.2138, "num_tokens": 26410771.0, "step": 82 }, { "epoch": 0.4853801169590643, "grad_norm": 0.17696917107757262, "learning_rate": 9.614033951045974e-06, "loss": 0.2286, "num_tokens": 26716396.0, "step": 83 }, { "epoch": 0.49122807017543857, "grad_norm": 0.15710163009015682, "learning_rate": 9.602426030427335e-06, "loss": 0.22, "num_tokens": 27038109.0, "step": 84 }, { "epoch": 0.49707602339181284, "grad_norm": 0.1677938058002079, "learning_rate": 9.590654191946645e-06, "loss": 0.2327, "num_tokens": 27372562.0, "step": 85 }, { "epoch": 0.5029239766081871, "grad_norm": 0.15756946980734074, "learning_rate": 9.578718905963289e-06, "loss": 0.2274, "num_tokens": 27719366.0, "step": 86 }, { "epoch": 0.5087719298245614, "grad_norm": 0.16483666302272912, "learning_rate": 9.566620649367418e-06, "loss": 0.23, "num_tokens": 28062728.0, "step": 87 }, { "epoch": 0.5146198830409356, "grad_norm": 0.16380030907396115, "learning_rate": 9.554359905560887e-06, "loss": 0.2139, "num_tokens": 28392885.0, "step": 88 }, { "epoch": 0.52046783625731, "grad_norm": 0.15375223243216413, "learning_rate": 9.541937164437942e-06, "loss": 0.2249, "num_tokens": 28727888.0, "step": 89 }, { "epoch": 0.5263157894736842, "grad_norm": 0.16323721706648206, "learning_rate": 9.52935292236565e-06, "loss": 0.2206, "num_tokens": 29040411.0, "step": 90 }, { "epoch": 0.5321637426900585, "grad_norm": 0.16395159555865288, "learning_rate": 9.516607682164058e-06, "loss": 0.2077, "num_tokens": 29363581.0, "step": 91 }, { "epoch": 0.5380116959064327, "grad_norm": 0.173187036283939, "learning_rate": 9.503701953086107e-06, "loss": 0.2325, "num_tokens": 29691373.0, "step": 92 }, { "epoch": 0.543859649122807, "grad_norm": 0.15592117720334775, "learning_rate": 9.490636250797288e-06, "loss": 0.2215, "num_tokens": 30026282.0, "step": 93 }, { "epoch": 0.5497076023391813, "grad_norm": 0.16780226146886296, "learning_rate": 9.477411097355025e-06, "loss": 0.2266, "num_tokens": 30357776.0, "step": 94 }, { "epoch": 0.5555555555555556, "grad_norm": 0.16080371447382497, "learning_rate": 9.464027021187833e-06, "loss": 0.2261, "num_tokens": 30675188.0, "step": 95 }, { "epoch": 0.5614035087719298, "grad_norm": 0.15669972327863108, "learning_rate": 9.450484557074188e-06, "loss": 0.2247, "num_tokens": 31041728.0, "step": 96 }, { "epoch": 0.5672514619883041, "grad_norm": 0.15864033792743365, "learning_rate": 9.43678424612117e-06, "loss": 0.2219, "num_tokens": 31373582.0, "step": 97 }, { "epoch": 0.5730994152046783, "grad_norm": 0.15897622365337738, "learning_rate": 9.422926635742834e-06, "loss": 0.2124, "num_tokens": 31693920.0, "step": 98 }, { "epoch": 0.5789473684210527, "grad_norm": 0.1541883977143332, "learning_rate": 9.40891227963835e-06, "loss": 0.2089, "num_tokens": 32016538.0, "step": 99 }, { "epoch": 0.5847953216374269, "grad_norm": 0.16113178998178, "learning_rate": 9.39474173776986e-06, "loss": 0.2131, "num_tokens": 32342868.0, "step": 100 }, { "epoch": 0.5906432748538012, "grad_norm": 0.1639032352257192, "learning_rate": 9.380415576340127e-06, "loss": 0.2126, "num_tokens": 32649722.0, "step": 101 }, { "epoch": 0.5964912280701754, "grad_norm": 0.15812907943305207, "learning_rate": 9.365934367769885e-06, "loss": 0.2046, "num_tokens": 32961156.0, "step": 102 }, { "epoch": 0.6023391812865497, "grad_norm": 0.16040845291570488, "learning_rate": 9.351298690674996e-06, "loss": 0.212, "num_tokens": 33286164.0, "step": 103 }, { "epoch": 0.6081871345029239, "grad_norm": 0.15564337683728058, "learning_rate": 9.33650912984331e-06, "loss": 0.2112, "num_tokens": 33634944.0, "step": 104 }, { "epoch": 0.6140350877192983, "grad_norm": 0.16026166188911017, "learning_rate": 9.321566276211304e-06, "loss": 0.2167, "num_tokens": 33940455.0, "step": 105 }, { "epoch": 0.6198830409356725, "grad_norm": 0.15084775438103953, "learning_rate": 9.306470726840472e-06, "loss": 0.212, "num_tokens": 34269432.0, "step": 106 }, { "epoch": 0.6257309941520468, "grad_norm": 0.16379797154749554, "learning_rate": 9.291223084893472e-06, "loss": 0.2259, "num_tokens": 34564983.0, "step": 107 }, { "epoch": 0.631578947368421, "grad_norm": 0.1626726779429298, "learning_rate": 9.275823959610019e-06, "loss": 0.2068, "num_tokens": 34869398.0, "step": 108 }, { "epoch": 0.6374269005847953, "grad_norm": 0.16973276732555354, "learning_rate": 9.260273966282546e-06, "loss": 0.2103, "num_tokens": 35179769.0, "step": 109 }, { "epoch": 0.6432748538011696, "grad_norm": 0.16573716072448422, "learning_rate": 9.244573726231621e-06, "loss": 0.209, "num_tokens": 35489608.0, "step": 110 }, { "epoch": 0.6491228070175439, "grad_norm": 0.16034467135549915, "learning_rate": 9.22872386678111e-06, "loss": 0.2056, "num_tokens": 35795317.0, "step": 111 }, { "epoch": 0.6549707602339181, "grad_norm": 0.16859253078446698, "learning_rate": 9.212725021233135e-06, "loss": 0.2105, "num_tokens": 36108365.0, "step": 112 }, { "epoch": 0.6608187134502924, "grad_norm": 0.17271477988986808, "learning_rate": 9.196577828842738e-06, "loss": 0.2075, "num_tokens": 36435675.0, "step": 113 }, { "epoch": 0.6666666666666666, "grad_norm": 0.17273881432682334, "learning_rate": 9.180282934792369e-06, "loss": 0.2054, "num_tokens": 36737269.0, "step": 114 }, { "epoch": 0.672514619883041, "grad_norm": 0.15988223863731596, "learning_rate": 9.163840990166085e-06, "loss": 0.2011, "num_tokens": 37059436.0, "step": 115 }, { "epoch": 0.6783625730994152, "grad_norm": 0.1693196726503627, "learning_rate": 9.147252651923546e-06, "loss": 0.2202, "num_tokens": 37382958.0, "step": 116 }, { "epoch": 0.6842105263157895, "grad_norm": 0.1605754092944871, "learning_rate": 9.130518582873765e-06, "loss": 0.2169, "num_tokens": 37711301.0, "step": 117 }, { "epoch": 0.6900584795321637, "grad_norm": 0.16182274147996495, "learning_rate": 9.11363945164862e-06, "loss": 0.2022, "num_tokens": 38034357.0, "step": 118 }, { "epoch": 0.695906432748538, "grad_norm": 0.17280602584782606, "learning_rate": 9.096615932676138e-06, "loss": 0.2011, "num_tokens": 38349527.0, "step": 119 }, { "epoch": 0.7017543859649122, "grad_norm": 0.16010466565814827, "learning_rate": 9.079448706153554e-06, "loss": 0.2016, "num_tokens": 38654356.0, "step": 120 }, { "epoch": 0.7076023391812866, "grad_norm": 0.15767250824653006, "learning_rate": 9.062138458020128e-06, "loss": 0.1917, "num_tokens": 38962205.0, "step": 121 }, { "epoch": 0.7134502923976608, "grad_norm": 0.17133096215596827, "learning_rate": 9.044685879929734e-06, "loss": 0.2087, "num_tokens": 39267552.0, "step": 122 }, { "epoch": 0.7192982456140351, "grad_norm": 0.15684580395831532, "learning_rate": 9.027091669223228e-06, "loss": 0.203, "num_tokens": 39570208.0, "step": 123 }, { "epoch": 0.7251461988304093, "grad_norm": 0.16789387397403432, "learning_rate": 9.00935652890059e-06, "loss": 0.2099, "num_tokens": 39895778.0, "step": 124 }, { "epoch": 0.7309941520467836, "grad_norm": 0.15658978810688212, "learning_rate": 8.991481167592826e-06, "loss": 0.2061, "num_tokens": 40225470.0, "step": 125 }, { "epoch": 0.7368421052631579, "grad_norm": 0.15588015347201137, "learning_rate": 8.973466299533656e-06, "loss": 0.2047, "num_tokens": 40559255.0, "step": 126 }, { "epoch": 0.7426900584795322, "grad_norm": 0.1750982604863698, "learning_rate": 8.955312644530976e-06, "loss": 0.1996, "num_tokens": 40860587.0, "step": 127 }, { "epoch": 0.7485380116959064, "grad_norm": 0.15381841682641284, "learning_rate": 8.937020927938103e-06, "loss": 0.2001, "num_tokens": 41189624.0, "step": 128 }, { "epoch": 0.7543859649122807, "grad_norm": 0.1577155995424487, "learning_rate": 8.918591880624783e-06, "loss": 0.2005, "num_tokens": 41490687.0, "step": 129 }, { "epoch": 0.7602339181286549, "grad_norm": 0.15962057074829455, "learning_rate": 8.900026238947995e-06, "loss": 0.2115, "num_tokens": 41818157.0, "step": 130 }, { "epoch": 0.7660818713450293, "grad_norm": 0.15033669547133874, "learning_rate": 8.881324744722524e-06, "loss": 0.1945, "num_tokens": 42149764.0, "step": 131 }, { "epoch": 0.7719298245614035, "grad_norm": 0.15680523481040093, "learning_rate": 8.86248814519133e-06, "loss": 0.204, "num_tokens": 42452660.0, "step": 132 }, { "epoch": 0.7777777777777778, "grad_norm": 0.19538772884868966, "learning_rate": 8.843517192995673e-06, "loss": 0.2094, "num_tokens": 42762176.0, "step": 133 }, { "epoch": 0.783625730994152, "grad_norm": 0.1497802316166281, "learning_rate": 8.824412646145065e-06, "loss": 0.206, "num_tokens": 43122509.0, "step": 134 }, { "epoch": 0.7894736842105263, "grad_norm": 0.17792632602872682, "learning_rate": 8.805175267986955e-06, "loss": 0.2021, "num_tokens": 43438515.0, "step": 135 }, { "epoch": 0.7953216374269005, "grad_norm": 0.16345383837628785, "learning_rate": 8.785805827176256e-06, "loss": 0.213, "num_tokens": 43750567.0, "step": 136 }, { "epoch": 0.8011695906432749, "grad_norm": 0.15988232783718637, "learning_rate": 8.766305097644608e-06, "loss": 0.2076, "num_tokens": 44061251.0, "step": 137 }, { "epoch": 0.8070175438596491, "grad_norm": 0.148560033989183, "learning_rate": 8.746673858569478e-06, "loss": 0.2056, "num_tokens": 44402399.0, "step": 138 }, { "epoch": 0.8128654970760234, "grad_norm": 0.15498081570702754, "learning_rate": 8.726912894343e-06, "loss": 0.2011, "num_tokens": 44713760.0, "step": 139 }, { "epoch": 0.8187134502923976, "grad_norm": 0.16409619767618208, "learning_rate": 8.707022994540659e-06, "loss": 0.1985, "num_tokens": 45006733.0, "step": 140 }, { "epoch": 0.8245614035087719, "grad_norm": 0.16387952215184107, "learning_rate": 8.687004953889729e-06, "loss": 0.2117, "num_tokens": 45319558.0, "step": 141 }, { "epoch": 0.8304093567251462, "grad_norm": 0.14972932521892918, "learning_rate": 8.666859572237517e-06, "loss": 0.2017, "num_tokens": 45648581.0, "step": 142 }, { "epoch": 0.8362573099415205, "grad_norm": 0.14894137892754095, "learning_rate": 8.646587654519413e-06, "loss": 0.2011, "num_tokens": 45961313.0, "step": 143 }, { "epoch": 0.8421052631578947, "grad_norm": 0.15579100817213587, "learning_rate": 8.626190010726723e-06, "loss": 0.1851, "num_tokens": 46290840.0, "step": 144 }, { "epoch": 0.847953216374269, "grad_norm": 0.154512527802127, "learning_rate": 8.605667455874302e-06, "loss": 0.1903, "num_tokens": 46585564.0, "step": 145 }, { "epoch": 0.8538011695906432, "grad_norm": 0.16017530332563623, "learning_rate": 8.585020809967995e-06, "loss": 0.2066, "num_tokens": 46893844.0, "step": 146 }, { "epoch": 0.8596491228070176, "grad_norm": 0.16010504287695315, "learning_rate": 8.564250897971862e-06, "loss": 0.2151, "num_tokens": 47228507.0, "step": 147 }, { "epoch": 0.8654970760233918, "grad_norm": 0.15911701846573467, "learning_rate": 8.543358549775232e-06, "loss": 0.2029, "num_tokens": 47537550.0, "step": 148 }, { "epoch": 0.8713450292397661, "grad_norm": 0.1385770650249908, "learning_rate": 8.522344600159532e-06, "loss": 0.1892, "num_tokens": 47871896.0, "step": 149 }, { "epoch": 0.8771929824561403, "grad_norm": 0.14241522573672255, "learning_rate": 8.501209888764928e-06, "loss": 0.2016, "num_tokens": 48224890.0, "step": 150 }, { "epoch": 0.8830409356725146, "grad_norm": 0.16042004560579917, "learning_rate": 8.479955260056793e-06, "loss": 0.2293, "num_tokens": 48551394.0, "step": 151 }, { "epoch": 0.8888888888888888, "grad_norm": 0.1606415549333606, "learning_rate": 8.458581563291948e-06, "loss": 0.1993, "num_tokens": 48869584.0, "step": 152 }, { "epoch": 0.8947368421052632, "grad_norm": 0.13692085860159872, "learning_rate": 8.437089652484735e-06, "loss": 0.188, "num_tokens": 49220358.0, "step": 153 }, { "epoch": 0.9005847953216374, "grad_norm": 0.15029963686711117, "learning_rate": 8.415480386372901e-06, "loss": 0.2176, "num_tokens": 49555633.0, "step": 154 }, { "epoch": 0.9064327485380117, "grad_norm": 0.15377050265299294, "learning_rate": 8.393754628383274e-06, "loss": 0.2078, "num_tokens": 49857399.0, "step": 155 }, { "epoch": 0.9122807017543859, "grad_norm": 0.1600390896381573, "learning_rate": 8.371913246597272e-06, "loss": 0.1987, "num_tokens": 50208717.0, "step": 156 }, { "epoch": 0.9181286549707602, "grad_norm": 0.1709173684661271, "learning_rate": 8.349957113716213e-06, "loss": 0.212, "num_tokens": 50502126.0, "step": 157 }, { "epoch": 0.9239766081871345, "grad_norm": 0.16675835187365423, "learning_rate": 8.327887107026445e-06, "loss": 0.2237, "num_tokens": 50820497.0, "step": 158 }, { "epoch": 0.9298245614035088, "grad_norm": 0.14395552142792745, "learning_rate": 8.305704108364301e-06, "loss": 0.2076, "num_tokens": 51154766.0, "step": 159 }, { "epoch": 0.935672514619883, "grad_norm": 0.15573171280863216, "learning_rate": 8.283409004080853e-06, "loss": 0.2114, "num_tokens": 51491802.0, "step": 160 }, { "epoch": 0.9415204678362573, "grad_norm": 0.158518456781856, "learning_rate": 8.261002685006503e-06, "loss": 0.2224, "num_tokens": 51818025.0, "step": 161 }, { "epoch": 0.9473684210526315, "grad_norm": 0.15531715834460813, "learning_rate": 8.238486046415385e-06, "loss": 0.1937, "num_tokens": 52118378.0, "step": 162 }, { "epoch": 0.9532163742690059, "grad_norm": 0.1515317024708392, "learning_rate": 8.2158599879896e-06, "loss": 0.1968, "num_tokens": 52428129.0, "step": 163 }, { "epoch": 0.9590643274853801, "grad_norm": 0.14424883914854034, "learning_rate": 8.19312541378326e-06, "loss": 0.193, "num_tokens": 52735470.0, "step": 164 }, { "epoch": 0.9649122807017544, "grad_norm": 0.15498488683654527, "learning_rate": 8.170283232186365e-06, "loss": 0.1943, "num_tokens": 53051592.0, "step": 165 }, { "epoch": 0.9707602339181286, "grad_norm": 0.1754541926428641, "learning_rate": 8.14733435588852e-06, "loss": 0.2214, "num_tokens": 53343217.0, "step": 166 }, { "epoch": 0.9766081871345029, "grad_norm": 0.14868906024546139, "learning_rate": 8.12427970184245e-06, "loss": 0.1935, "num_tokens": 53658830.0, "step": 167 }, { "epoch": 0.9824561403508771, "grad_norm": 0.1500168809044866, "learning_rate": 8.101120191227374e-06, "loss": 0.1981, "num_tokens": 53999419.0, "step": 168 }, { "epoch": 0.9883040935672515, "grad_norm": 0.14699416276573674, "learning_rate": 8.07785674941219e-06, "loss": 0.194, "num_tokens": 54309901.0, "step": 169 }, { "epoch": 0.9941520467836257, "grad_norm": 0.15232473722185103, "learning_rate": 8.054490305918512e-06, "loss": 0.1955, "num_tokens": 54612844.0, "step": 170 }, { "epoch": 1.0, "grad_norm": 0.1607048554407368, "learning_rate": 8.031021794383513e-06, "loss": 0.2092, "num_tokens": 54926459.0, "step": 171 }, { "epoch": 1.0058479532163742, "grad_norm": 0.1625217088750646, "learning_rate": 8.007452152522639e-06, "loss": 0.1875, "num_tokens": 55222609.0, "step": 172 }, { "epoch": 1.0116959064327484, "grad_norm": 0.15670723402140246, "learning_rate": 7.983782322092126e-06, "loss": 0.1938, "num_tokens": 55537898.0, "step": 173 }, { "epoch": 1.0175438596491229, "grad_norm": 0.14242775964832494, "learning_rate": 7.960013248851375e-06, "loss": 0.1882, "num_tokens": 55862238.0, "step": 174 }, { "epoch": 1.023391812865497, "grad_norm": 0.1606453358565539, "learning_rate": 7.936145882525174e-06, "loss": 0.1877, "num_tokens": 56180559.0, "step": 175 }, { "epoch": 1.0292397660818713, "grad_norm": 0.15292057131184103, "learning_rate": 7.91218117676573e-06, "loss": 0.1783, "num_tokens": 56530315.0, "step": 176 }, { "epoch": 1.0350877192982457, "grad_norm": 0.14722221394043097, "learning_rate": 7.888120089114586e-06, "loss": 0.1758, "num_tokens": 56837967.0, "step": 177 }, { "epoch": 1.04093567251462, "grad_norm": 0.15795692601491945, "learning_rate": 7.863963580964344e-06, "loss": 0.1772, "num_tokens": 57149693.0, "step": 178 }, { "epoch": 1.0467836257309941, "grad_norm": 0.1700985282278579, "learning_rate": 7.839712617520263e-06, "loss": 0.1897, "num_tokens": 57481658.0, "step": 179 }, { "epoch": 1.0526315789473684, "grad_norm": 0.14557972709377917, "learning_rate": 7.815368167761686e-06, "loss": 0.1797, "num_tokens": 57795613.0, "step": 180 }, { "epoch": 1.0584795321637426, "grad_norm": 0.15501104048829578, "learning_rate": 7.790931204403323e-06, "loss": 0.177, "num_tokens": 58094562.0, "step": 181 }, { "epoch": 1.064327485380117, "grad_norm": 0.1521089989961407, "learning_rate": 7.766402703856391e-06, "loss": 0.1748, "num_tokens": 58418586.0, "step": 182 }, { "epoch": 1.0701754385964912, "grad_norm": 0.16486785923579997, "learning_rate": 7.741783646189597e-06, "loss": 0.1927, "num_tokens": 58745927.0, "step": 183 }, { "epoch": 1.0760233918128654, "grad_norm": 0.15410381183903402, "learning_rate": 7.717075015089976e-06, "loss": 0.1884, "num_tokens": 59070496.0, "step": 184 }, { "epoch": 1.0818713450292399, "grad_norm": 0.1444493695200652, "learning_rate": 7.692277797823585e-06, "loss": 0.1755, "num_tokens": 59388680.0, "step": 185 }, { "epoch": 1.087719298245614, "grad_norm": 0.15330407620774641, "learning_rate": 7.667392985196064e-06, "loss": 0.1866, "num_tokens": 59707236.0, "step": 186 }, { "epoch": 1.0935672514619883, "grad_norm": 0.16477562582655433, "learning_rate": 7.64242157151304e-06, "loss": 0.1999, "num_tokens": 60042655.0, "step": 187 }, { "epoch": 1.0994152046783625, "grad_norm": 0.140060196586728, "learning_rate": 7.6173645545404e-06, "loss": 0.1834, "num_tokens": 60397091.0, "step": 188 }, { "epoch": 1.1052631578947367, "grad_norm": 0.1525186599047059, "learning_rate": 7.5922229354644195e-06, "loss": 0.1811, "num_tokens": 60707243.0, "step": 189 }, { "epoch": 1.1111111111111112, "grad_norm": 0.14180526703114305, "learning_rate": 7.56699771885177e-06, "loss": 0.1789, "num_tokens": 61056021.0, "step": 190 }, { "epoch": 1.1169590643274854, "grad_norm": 0.14606069061084653, "learning_rate": 7.541689912609365e-06, "loss": 0.1833, "num_tokens": 61381476.0, "step": 191 }, { "epoch": 1.1228070175438596, "grad_norm": 0.1452299835582357, "learning_rate": 7.516300527944104e-06, "loss": 0.1889, "num_tokens": 61710931.0, "step": 192 }, { "epoch": 1.128654970760234, "grad_norm": 0.1607339684687444, "learning_rate": 7.4908305793224565e-06, "loss": 0.1891, "num_tokens": 62048426.0, "step": 193 }, { "epoch": 1.1345029239766082, "grad_norm": 0.15970871017649693, "learning_rate": 7.465281084429931e-06, "loss": 0.1841, "num_tokens": 62347583.0, "step": 194 }, { "epoch": 1.1403508771929824, "grad_norm": 0.15135926518093104, "learning_rate": 7.4396530641304135e-06, "loss": 0.1817, "num_tokens": 62662619.0, "step": 195 }, { "epoch": 1.1461988304093567, "grad_norm": 0.1463984323420409, "learning_rate": 7.413947542425377e-06, "loss": 0.1795, "num_tokens": 62990146.0, "step": 196 }, { "epoch": 1.1520467836257309, "grad_norm": 0.15028061491082353, "learning_rate": 7.388165546412967e-06, "loss": 0.1809, "num_tokens": 63314531.0, "step": 197 }, { "epoch": 1.1578947368421053, "grad_norm": 0.15759301916806728, "learning_rate": 7.362308106246956e-06, "loss": 0.1842, "num_tokens": 63647247.0, "step": 198 }, { "epoch": 1.1637426900584795, "grad_norm": 0.14433827296829588, "learning_rate": 7.336376255095592e-06, "loss": 0.1758, "num_tokens": 63974328.0, "step": 199 }, { "epoch": 1.1695906432748537, "grad_norm": 0.1489036598644256, "learning_rate": 7.3103710291003134e-06, "loss": 0.1832, "num_tokens": 64295392.0, "step": 200 }, { "epoch": 1.1754385964912282, "grad_norm": 0.1621435575204086, "learning_rate": 7.284293467334344e-06, "loss": 0.1829, "num_tokens": 64601120.0, "step": 201 }, { "epoch": 1.1812865497076024, "grad_norm": 0.15685350805242304, "learning_rate": 7.258144611761181e-06, "loss": 0.1828, "num_tokens": 64910553.0, "step": 202 }, { "epoch": 1.1871345029239766, "grad_norm": 0.1537822114754735, "learning_rate": 7.23192550719296e-06, "loss": 0.1786, "num_tokens": 65230586.0, "step": 203 }, { "epoch": 1.1929824561403508, "grad_norm": 0.1522958629898793, "learning_rate": 7.2056372012487065e-06, "loss": 0.1858, "num_tokens": 65576822.0, "step": 204 }, { "epoch": 1.198830409356725, "grad_norm": 0.15072282593856123, "learning_rate": 7.179280744312481e-06, "loss": 0.1717, "num_tokens": 65892198.0, "step": 205 }, { "epoch": 1.2046783625730995, "grad_norm": 0.14679430331251794, "learning_rate": 7.152857189491406e-06, "loss": 0.1709, "num_tokens": 66218113.0, "step": 206 }, { "epoch": 1.2105263157894737, "grad_norm": 0.14528524091480893, "learning_rate": 7.126367592573589e-06, "loss": 0.172, "num_tokens": 66560316.0, "step": 207 }, { "epoch": 1.2163742690058479, "grad_norm": 0.1513531832369065, "learning_rate": 7.099813011985936e-06, "loss": 0.1867, "num_tokens": 66886426.0, "step": 208 }, { "epoch": 1.2222222222222223, "grad_norm": 0.14889506905441677, "learning_rate": 7.073194508751863e-06, "loss": 0.184, "num_tokens": 67205115.0, "step": 209 }, { "epoch": 1.2280701754385965, "grad_norm": 0.1654317181387804, "learning_rate": 7.046513146448899e-06, "loss": 0.1892, "num_tokens": 67509071.0, "step": 210 }, { "epoch": 1.2339181286549707, "grad_norm": 0.1510266755197108, "learning_rate": 7.019769991166189e-06, "loss": 0.1788, "num_tokens": 67841682.0, "step": 211 }, { "epoch": 1.239766081871345, "grad_norm": 0.1419685060740966, "learning_rate": 6.992966111461903e-06, "loss": 0.1685, "num_tokens": 68147715.0, "step": 212 }, { "epoch": 1.2456140350877192, "grad_norm": 0.14993568993277867, "learning_rate": 6.966102578320531e-06, "loss": 0.1764, "num_tokens": 68463460.0, "step": 213 }, { "epoch": 1.2514619883040936, "grad_norm": 0.14659321843622847, "learning_rate": 6.9391804651100924e-06, "loss": 0.1897, "num_tokens": 68799959.0, "step": 214 }, { "epoch": 1.2573099415204678, "grad_norm": 0.15073229254770368, "learning_rate": 6.912200847539261e-06, "loss": 0.172, "num_tokens": 69111397.0, "step": 215 }, { "epoch": 1.263157894736842, "grad_norm": 0.15759494389625772, "learning_rate": 6.885164803614366e-06, "loss": 0.1838, "num_tokens": 69392630.0, "step": 216 }, { "epoch": 1.2690058479532165, "grad_norm": 0.14630639885005334, "learning_rate": 6.858073413596324e-06, "loss": 0.1807, "num_tokens": 69701641.0, "step": 217 }, { "epoch": 1.2748538011695907, "grad_norm": 0.14730200293827667, "learning_rate": 6.830927759957487e-06, "loss": 0.1813, "num_tokens": 70030091.0, "step": 218 }, { "epoch": 1.280701754385965, "grad_norm": 0.14753719595946904, "learning_rate": 6.80372892733837e-06, "loss": 0.1804, "num_tokens": 70348775.0, "step": 219 }, { "epoch": 1.286549707602339, "grad_norm": 0.16483571370877223, "learning_rate": 6.776478002504335e-06, "loss": 0.1859, "num_tokens": 70651692.0, "step": 220 }, { "epoch": 1.2923976608187133, "grad_norm": 0.16648842833487107, "learning_rate": 6.7491760743021535e-06, "loss": 0.1903, "num_tokens": 70956682.0, "step": 221 }, { "epoch": 1.2982456140350878, "grad_norm": 0.14671697438686584, "learning_rate": 6.721824233616503e-06, "loss": 0.1758, "num_tokens": 71265203.0, "step": 222 }, { "epoch": 1.304093567251462, "grad_norm": 0.15188899959419136, "learning_rate": 6.694423573326382e-06, "loss": 0.1782, "num_tokens": 71583993.0, "step": 223 }, { "epoch": 1.3099415204678362, "grad_norm": 0.14838740052418056, "learning_rate": 6.666975188261437e-06, "loss": 0.182, "num_tokens": 71899719.0, "step": 224 }, { "epoch": 1.3157894736842106, "grad_norm": 0.1576944297988978, "learning_rate": 6.639480175158227e-06, "loss": 0.1784, "num_tokens": 72202094.0, "step": 225 }, { "epoch": 1.3216374269005848, "grad_norm": 0.15597436870104375, "learning_rate": 6.611939632616394e-06, "loss": 0.1784, "num_tokens": 72516371.0, "step": 226 }, { "epoch": 1.327485380116959, "grad_norm": 0.15325246999833303, "learning_rate": 6.584354661054765e-06, "loss": 0.1842, "num_tokens": 72828007.0, "step": 227 }, { "epoch": 1.3333333333333333, "grad_norm": 0.14935801239745722, "learning_rate": 6.556726362667394e-06, "loss": 0.1833, "num_tokens": 73134243.0, "step": 228 }, { "epoch": 1.3391812865497075, "grad_norm": 0.16481075845453566, "learning_rate": 6.529055841379509e-06, "loss": 0.176, "num_tokens": 73436138.0, "step": 229 }, { "epoch": 1.345029239766082, "grad_norm": 0.14125268538033928, "learning_rate": 6.501344202803415e-06, "loss": 0.1708, "num_tokens": 73760046.0, "step": 230 }, { "epoch": 1.3508771929824561, "grad_norm": 0.1501570731496053, "learning_rate": 6.473592554194311e-06, "loss": 0.1826, "num_tokens": 74077480.0, "step": 231 }, { "epoch": 1.3567251461988303, "grad_norm": 0.15771910225549807, "learning_rate": 6.445802004406047e-06, "loss": 0.1922, "num_tokens": 74423874.0, "step": 232 }, { "epoch": 1.3625730994152048, "grad_norm": 0.15600805460262265, "learning_rate": 6.417973663846826e-06, "loss": 0.1749, "num_tokens": 74745816.0, "step": 233 }, { "epoch": 1.368421052631579, "grad_norm": 0.14516377176873183, "learning_rate": 6.390108644434828e-06, "loss": 0.18, "num_tokens": 75092262.0, "step": 234 }, { "epoch": 1.3742690058479532, "grad_norm": 0.14392359169053118, "learning_rate": 6.362208059553786e-06, "loss": 0.1799, "num_tokens": 75442533.0, "step": 235 }, { "epoch": 1.3801169590643274, "grad_norm": 0.1548508531809334, "learning_rate": 6.334273024008499e-06, "loss": 0.1705, "num_tokens": 75775480.0, "step": 236 }, { "epoch": 1.3859649122807016, "grad_norm": 0.15386739061806035, "learning_rate": 6.306304653980286e-06, "loss": 0.1722, "num_tokens": 76066461.0, "step": 237 }, { "epoch": 1.391812865497076, "grad_norm": 0.14831080775519306, "learning_rate": 6.278304066982391e-06, "loss": 0.1836, "num_tokens": 76401700.0, "step": 238 }, { "epoch": 1.3976608187134503, "grad_norm": 0.14755841590724592, "learning_rate": 6.250272381815331e-06, "loss": 0.1802, "num_tokens": 76714274.0, "step": 239 }, { "epoch": 1.4035087719298245, "grad_norm": 0.15910670160937837, "learning_rate": 6.222210718522187e-06, "loss": 0.2031, "num_tokens": 77028246.0, "step": 240 }, { "epoch": 1.409356725146199, "grad_norm": 0.15280436173000247, "learning_rate": 6.19412019834386e-06, "loss": 0.1742, "num_tokens": 77364346.0, "step": 241 }, { "epoch": 1.4152046783625731, "grad_norm": 0.13887335477707105, "learning_rate": 6.166001943674266e-06, "loss": 0.1785, "num_tokens": 77748583.0, "step": 242 }, { "epoch": 1.4210526315789473, "grad_norm": 0.1577201352102885, "learning_rate": 6.137857078015487e-06, "loss": 0.1863, "num_tokens": 78064140.0, "step": 243 }, { "epoch": 1.4269005847953216, "grad_norm": 0.14303385830957374, "learning_rate": 6.109686725932882e-06, "loss": 0.1813, "num_tokens": 78411157.0, "step": 244 }, { "epoch": 1.4327485380116958, "grad_norm": 0.14790866721582488, "learning_rate": 6.081492013010154e-06, "loss": 0.1778, "num_tokens": 78723584.0, "step": 245 }, { "epoch": 1.4385964912280702, "grad_norm": 0.14901104148022398, "learning_rate": 6.0532740658043785e-06, "loss": 0.1807, "num_tokens": 79054107.0, "step": 246 }, { "epoch": 1.4444444444444444, "grad_norm": 0.1485395120018961, "learning_rate": 6.025034011800989e-06, "loss": 0.187, "num_tokens": 79386694.0, "step": 247 }, { "epoch": 1.4502923976608186, "grad_norm": 0.14647752236417627, "learning_rate": 5.996772979368715e-06, "loss": 0.1849, "num_tokens": 79718178.0, "step": 248 }, { "epoch": 1.456140350877193, "grad_norm": 0.15032391233353223, "learning_rate": 5.968492097714519e-06, "loss": 0.1744, "num_tokens": 80013286.0, "step": 249 }, { "epoch": 1.4619883040935673, "grad_norm": 0.13111163977489035, "learning_rate": 5.940192496838456e-06, "loss": 0.1683, "num_tokens": 80358177.0, "step": 250 }, { "epoch": 1.4678362573099415, "grad_norm": 0.14507217241507256, "learning_rate": 5.911875307488543e-06, "loss": 0.1697, "num_tokens": 80664979.0, "step": 251 }, { "epoch": 1.4736842105263157, "grad_norm": 0.14371510606580692, "learning_rate": 5.883541661115555e-06, "loss": 0.183, "num_tokens": 81008531.0, "step": 252 }, { "epoch": 1.47953216374269, "grad_norm": 0.14575959771926755, "learning_rate": 5.855192689827838e-06, "loss": 0.1784, "num_tokens": 81327068.0, "step": 253 }, { "epoch": 1.4853801169590644, "grad_norm": 0.14386473495114957, "learning_rate": 5.8268295263460625e-06, "loss": 0.186, "num_tokens": 81660168.0, "step": 254 }, { "epoch": 1.4912280701754386, "grad_norm": 0.15665862540575096, "learning_rate": 5.798453303957968e-06, "loss": 0.1852, "num_tokens": 81988344.0, "step": 255 }, { "epoch": 1.4970760233918128, "grad_norm": 0.1486254433584565, "learning_rate": 5.77006515647308e-06, "loss": 0.1815, "num_tokens": 82292457.0, "step": 256 }, { "epoch": 1.5029239766081872, "grad_norm": 0.15218761718926124, "learning_rate": 5.741666218177402e-06, "loss": 0.1754, "num_tokens": 82577890.0, "step": 257 }, { "epoch": 1.5087719298245614, "grad_norm": 0.1538621360954969, "learning_rate": 5.7132576237881075e-06, "loss": 0.1855, "num_tokens": 82874407.0, "step": 258 }, { "epoch": 1.5146198830409356, "grad_norm": 0.16534859264165339, "learning_rate": 5.684840508408183e-06, "loss": 0.187, "num_tokens": 83181722.0, "step": 259 }, { "epoch": 1.52046783625731, "grad_norm": 0.15058626722493648, "learning_rate": 5.656416007481089e-06, "loss": 0.1793, "num_tokens": 83483808.0, "step": 260 }, { "epoch": 1.526315789473684, "grad_norm": 0.14392786696974194, "learning_rate": 5.627985256745384e-06, "loss": 0.1773, "num_tokens": 83826918.0, "step": 261 }, { "epoch": 1.5321637426900585, "grad_norm": 0.1547539344312234, "learning_rate": 5.5995493921893415e-06, "loss": 0.1747, "num_tokens": 84129500.0, "step": 262 }, { "epoch": 1.5380116959064327, "grad_norm": 0.15476418511066206, "learning_rate": 5.571109550005571e-06, "loss": 0.1831, "num_tokens": 84454231.0, "step": 263 }, { "epoch": 1.543859649122807, "grad_norm": 0.15292624983775452, "learning_rate": 5.542666866545609e-06, "loss": 0.1772, "num_tokens": 84758320.0, "step": 264 }, { "epoch": 1.5497076023391814, "grad_norm": 0.14323940203700627, "learning_rate": 5.5142224782745175e-06, "loss": 0.1742, "num_tokens": 85064629.0, "step": 265 }, { "epoch": 1.5555555555555556, "grad_norm": 0.1569122030468075, "learning_rate": 5.485777521725485e-06, "loss": 0.1823, "num_tokens": 85358431.0, "step": 266 }, { "epoch": 1.5614035087719298, "grad_norm": 0.1571283435279191, "learning_rate": 5.457333133454394e-06, "loss": 0.194, "num_tokens": 85681659.0, "step": 267 }, { "epoch": 1.5672514619883042, "grad_norm": 0.15050727661326063, "learning_rate": 5.4288904499944304e-06, "loss": 0.1843, "num_tokens": 86001433.0, "step": 268 }, { "epoch": 1.5730994152046782, "grad_norm": 0.13840027248721531, "learning_rate": 5.40045060781066e-06, "loss": 0.1828, "num_tokens": 86349235.0, "step": 269 }, { "epoch": 1.5789473684210527, "grad_norm": 0.154830903482515, "learning_rate": 5.3720147432546175e-06, "loss": 0.1891, "num_tokens": 86670991.0, "step": 270 }, { "epoch": 1.5847953216374269, "grad_norm": 0.14875604221537664, "learning_rate": 5.343583992518911e-06, "loss": 0.1838, "num_tokens": 87005937.0, "step": 271 }, { "epoch": 1.590643274853801, "grad_norm": 0.1550420196686663, "learning_rate": 5.315159491591818e-06, "loss": 0.1932, "num_tokens": 87355743.0, "step": 272 }, { "epoch": 1.5964912280701755, "grad_norm": 0.16190605189165996, "learning_rate": 5.286742376211894e-06, "loss": 0.1873, "num_tokens": 87666062.0, "step": 273 }, { "epoch": 1.6023391812865497, "grad_norm": 0.1499999734628863, "learning_rate": 5.2583337818226e-06, "loss": 0.179, "num_tokens": 88010557.0, "step": 274 }, { "epoch": 1.608187134502924, "grad_norm": 0.14174519866822582, "learning_rate": 5.229934843526922e-06, "loss": 0.1839, "num_tokens": 88348530.0, "step": 275 }, { "epoch": 1.6140350877192984, "grad_norm": 0.14514733616156453, "learning_rate": 5.201546696042033e-06, "loss": 0.1732, "num_tokens": 88660232.0, "step": 276 }, { "epoch": 1.6198830409356724, "grad_norm": 0.15247902901919175, "learning_rate": 5.173170473653939e-06, "loss": 0.1838, "num_tokens": 88986178.0, "step": 277 }, { "epoch": 1.6257309941520468, "grad_norm": 0.15161619030379697, "learning_rate": 5.1448073101721644e-06, "loss": 0.184, "num_tokens": 89306790.0, "step": 278 }, { "epoch": 1.631578947368421, "grad_norm": 0.15076713301794256, "learning_rate": 5.1164583388844476e-06, "loss": 0.1764, "num_tokens": 89608787.0, "step": 279 }, { "epoch": 1.6374269005847952, "grad_norm": 0.15081569071358755, "learning_rate": 5.0881246925114595e-06, "loss": 0.1841, "num_tokens": 89925196.0, "step": 280 }, { "epoch": 1.6432748538011697, "grad_norm": 0.14667932336134215, "learning_rate": 5.0598075031615445e-06, "loss": 0.1714, "num_tokens": 90246158.0, "step": 281 }, { "epoch": 1.6491228070175439, "grad_norm": 0.14238827156504316, "learning_rate": 5.031507902285483e-06, "loss": 0.1675, "num_tokens": 90530735.0, "step": 282 }, { "epoch": 1.654970760233918, "grad_norm": 0.15453119377842958, "learning_rate": 5.003227020631287e-06, "loss": 0.1822, "num_tokens": 90867029.0, "step": 283 }, { "epoch": 1.6608187134502925, "grad_norm": 0.14188421149596725, "learning_rate": 4.974965988199015e-06, "loss": 0.1782, "num_tokens": 91197724.0, "step": 284 }, { "epoch": 1.6666666666666665, "grad_norm": 0.13924871637746586, "learning_rate": 4.946725934195622e-06, "loss": 0.1687, "num_tokens": 91501722.0, "step": 285 }, { "epoch": 1.672514619883041, "grad_norm": 0.13972442531890047, "learning_rate": 4.918507986989848e-06, "loss": 0.1721, "num_tokens": 91828252.0, "step": 286 }, { "epoch": 1.6783625730994152, "grad_norm": 0.14136406326583859, "learning_rate": 4.890313274067121e-06, "loss": 0.1787, "num_tokens": 92160609.0, "step": 287 }, { "epoch": 1.6842105263157894, "grad_norm": 0.14317191939225465, "learning_rate": 4.862142921984514e-06, "loss": 0.1816, "num_tokens": 92492656.0, "step": 288 }, { "epoch": 1.6900584795321638, "grad_norm": 0.15273877472629271, "learning_rate": 4.8339980563257345e-06, "loss": 0.1778, "num_tokens": 92783619.0, "step": 289 }, { "epoch": 1.695906432748538, "grad_norm": 0.1488984953802957, "learning_rate": 4.80587980165614e-06, "loss": 0.1605, "num_tokens": 93088774.0, "step": 290 }, { "epoch": 1.7017543859649122, "grad_norm": 0.14456947085083469, "learning_rate": 4.7777892814778145e-06, "loss": 0.185, "num_tokens": 93417888.0, "step": 291 }, { "epoch": 1.7076023391812867, "grad_norm": 0.13947315988135284, "learning_rate": 4.749727618184672e-06, "loss": 0.1673, "num_tokens": 93733741.0, "step": 292 }, { "epoch": 1.7134502923976607, "grad_norm": 0.14851734535481514, "learning_rate": 4.72169593301761e-06, "loss": 0.1819, "num_tokens": 94047466.0, "step": 293 }, { "epoch": 1.719298245614035, "grad_norm": 0.14831214767698989, "learning_rate": 4.693695346019715e-06, "loss": 0.1771, "num_tokens": 94353138.0, "step": 294 }, { "epoch": 1.7251461988304093, "grad_norm": 0.14962487368007496, "learning_rate": 4.665726975991502e-06, "loss": 0.1796, "num_tokens": 94660877.0, "step": 295 }, { "epoch": 1.7309941520467835, "grad_norm": 0.15275250425533088, "learning_rate": 4.637791940446216e-06, "loss": 0.1819, "num_tokens": 94977362.0, "step": 296 }, { "epoch": 1.736842105263158, "grad_norm": 0.1444275240489397, "learning_rate": 4.609891355565172e-06, "loss": 0.1746, "num_tokens": 95287766.0, "step": 297 }, { "epoch": 1.7426900584795322, "grad_norm": 0.14585881193254274, "learning_rate": 4.582026336153175e-06, "loss": 0.1751, "num_tokens": 95580594.0, "step": 298 }, { "epoch": 1.7485380116959064, "grad_norm": 0.14505549895976488, "learning_rate": 4.554197995593953e-06, "loss": 0.1811, "num_tokens": 95914105.0, "step": 299 }, { "epoch": 1.7543859649122808, "grad_norm": 0.14480466196132438, "learning_rate": 4.526407445805692e-06, "loss": 0.1734, "num_tokens": 96221354.0, "step": 300 }, { "epoch": 1.7602339181286548, "grad_norm": 0.14878097366426038, "learning_rate": 4.4986557971965865e-06, "loss": 0.1717, "num_tokens": 96503140.0, "step": 301 }, { "epoch": 1.7660818713450293, "grad_norm": 0.13546319756214165, "learning_rate": 4.4709441586204924e-06, "loss": 0.1787, "num_tokens": 96862132.0, "step": 302 }, { "epoch": 1.7719298245614035, "grad_norm": 0.1545758606312762, "learning_rate": 4.443273637332607e-06, "loss": 0.1817, "num_tokens": 97181205.0, "step": 303 }, { "epoch": 1.7777777777777777, "grad_norm": 0.14857553617463332, "learning_rate": 4.415645338945236e-06, "loss": 0.1835, "num_tokens": 97496038.0, "step": 304 }, { "epoch": 1.7836257309941521, "grad_norm": 0.1433152779712938, "learning_rate": 4.388060367383607e-06, "loss": 0.1807, "num_tokens": 97811641.0, "step": 305 }, { "epoch": 1.7894736842105263, "grad_norm": 0.15284420636801233, "learning_rate": 4.3605198248417745e-06, "loss": 0.1904, "num_tokens": 98139634.0, "step": 306 }, { "epoch": 1.7953216374269005, "grad_norm": 0.15285620589846677, "learning_rate": 4.333024811738565e-06, "loss": 0.1794, "num_tokens": 98454687.0, "step": 307 }, { "epoch": 1.801169590643275, "grad_norm": 0.14480146717017858, "learning_rate": 4.305576426673621e-06, "loss": 0.1733, "num_tokens": 98768020.0, "step": 308 }, { "epoch": 1.807017543859649, "grad_norm": 0.14587759969006922, "learning_rate": 4.278175766383499e-06, "loss": 0.1788, "num_tokens": 99089607.0, "step": 309 }, { "epoch": 1.8128654970760234, "grad_norm": 0.14456314742064028, "learning_rate": 4.250823925697848e-06, "loss": 0.1728, "num_tokens": 99396998.0, "step": 310 }, { "epoch": 1.8187134502923976, "grad_norm": 0.14323326579292064, "learning_rate": 4.223521997495665e-06, "loss": 0.1774, "num_tokens": 99716575.0, "step": 311 }, { "epoch": 1.8245614035087718, "grad_norm": 0.1325445041745875, "learning_rate": 4.196271072661631e-06, "loss": 0.168, "num_tokens": 100064416.0, "step": 312 }, { "epoch": 1.8304093567251463, "grad_norm": 0.13327008176494723, "learning_rate": 4.169072240042514e-06, "loss": 0.1699, "num_tokens": 100409121.0, "step": 313 }, { "epoch": 1.8362573099415205, "grad_norm": 0.13889911640255415, "learning_rate": 4.141926586403677e-06, "loss": 0.1805, "num_tokens": 100765220.0, "step": 314 }, { "epoch": 1.8421052631578947, "grad_norm": 0.1454927979121013, "learning_rate": 4.114835196385636e-06, "loss": 0.1842, "num_tokens": 101081641.0, "step": 315 }, { "epoch": 1.8479532163742691, "grad_norm": 0.1404945457994314, "learning_rate": 4.08779915246074e-06, "loss": 0.1822, "num_tokens": 101412150.0, "step": 316 }, { "epoch": 1.8538011695906431, "grad_norm": 0.13980685227323844, "learning_rate": 4.060819534889909e-06, "loss": 0.1799, "num_tokens": 101744218.0, "step": 317 }, { "epoch": 1.8596491228070176, "grad_norm": 0.14229034728182074, "learning_rate": 4.033897421679472e-06, "loss": 0.1694, "num_tokens": 102086143.0, "step": 318 }, { "epoch": 1.8654970760233918, "grad_norm": 0.1409421468030685, "learning_rate": 4.0070338885381e-06, "loss": 0.183, "num_tokens": 102386095.0, "step": 319 }, { "epoch": 1.871345029239766, "grad_norm": 0.14131996708764916, "learning_rate": 3.980230008833812e-06, "loss": 0.1784, "num_tokens": 102724647.0, "step": 320 }, { "epoch": 1.8771929824561404, "grad_norm": 0.14060583568522164, "learning_rate": 3.953486853551104e-06, "loss": 0.1611, "num_tokens": 103019942.0, "step": 321 }, { "epoch": 1.8830409356725146, "grad_norm": 0.13762893362884943, "learning_rate": 3.926805491248138e-06, "loss": 0.1681, "num_tokens": 103358784.0, "step": 322 }, { "epoch": 1.8888888888888888, "grad_norm": 0.15361551995597972, "learning_rate": 3.900186988014065e-06, "loss": 0.1825, "num_tokens": 103660262.0, "step": 323 }, { "epoch": 1.8947368421052633, "grad_norm": 0.13699916161747416, "learning_rate": 3.873632407426412e-06, "loss": 0.179, "num_tokens": 103981503.0, "step": 324 }, { "epoch": 1.9005847953216373, "grad_norm": 0.14959870826890115, "learning_rate": 3.847142810508596e-06, "loss": 0.1826, "num_tokens": 104304618.0, "step": 325 }, { "epoch": 1.9064327485380117, "grad_norm": 0.13265912139830643, "learning_rate": 3.82071925568752e-06, "loss": 0.1721, "num_tokens": 104676283.0, "step": 326 }, { "epoch": 1.912280701754386, "grad_norm": 0.1421124205261071, "learning_rate": 3.7943627987512953e-06, "loss": 0.1805, "num_tokens": 105009047.0, "step": 327 }, { "epoch": 1.9181286549707601, "grad_norm": 0.1437008669427492, "learning_rate": 3.7680744928070413e-06, "loss": 0.1754, "num_tokens": 105335760.0, "step": 328 }, { "epoch": 1.9239766081871346, "grad_norm": 0.13360094795705102, "learning_rate": 3.741855388238821e-06, "loss": 0.1728, "num_tokens": 105669692.0, "step": 329 }, { "epoch": 1.9298245614035088, "grad_norm": 0.14123898637906518, "learning_rate": 3.715706532665657e-06, "loss": 0.1771, "num_tokens": 105989627.0, "step": 330 }, { "epoch": 1.935672514619883, "grad_norm": 0.15005239176117802, "learning_rate": 3.6896289708996867e-06, "loss": 0.1875, "num_tokens": 106296775.0, "step": 331 }, { "epoch": 1.9415204678362574, "grad_norm": 0.13514021795158399, "learning_rate": 3.6636237449044077e-06, "loss": 0.164, "num_tokens": 106614583.0, "step": 332 }, { "epoch": 1.9473684210526314, "grad_norm": 0.14198706510532838, "learning_rate": 3.637691893753047e-06, "loss": 0.1923, "num_tokens": 106960662.0, "step": 333 }, { "epoch": 1.9532163742690059, "grad_norm": 0.13549287034356422, "learning_rate": 3.611834453587035e-06, "loss": 0.1713, "num_tokens": 107295583.0, "step": 334 }, { "epoch": 1.95906432748538, "grad_norm": 0.14046063970510755, "learning_rate": 3.5860524575746247e-06, "loss": 0.1856, "num_tokens": 107643687.0, "step": 335 }, { "epoch": 1.9649122807017543, "grad_norm": 0.14339861962392317, "learning_rate": 3.5603469358695887e-06, "loss": 0.1763, "num_tokens": 107990974.0, "step": 336 }, { "epoch": 1.9707602339181287, "grad_norm": 0.14801719558718762, "learning_rate": 3.53471891557007e-06, "loss": 0.193, "num_tokens": 108320866.0, "step": 337 }, { "epoch": 1.976608187134503, "grad_norm": 0.13759593336874304, "learning_rate": 3.509169420677545e-06, "loss": 0.1646, "num_tokens": 108635348.0, "step": 338 }, { "epoch": 1.9824561403508771, "grad_norm": 0.14182117983464468, "learning_rate": 3.483699472055897e-06, "loss": 0.175, "num_tokens": 108945985.0, "step": 339 }, { "epoch": 1.9883040935672516, "grad_norm": 0.14601860778521314, "learning_rate": 3.458310087390637e-06, "loss": 0.1746, "num_tokens": 109253189.0, "step": 340 }, { "epoch": 1.9941520467836256, "grad_norm": 0.13963753894588415, "learning_rate": 3.4330022811482317e-06, "loss": 0.1758, "num_tokens": 109573613.0, "step": 341 }, { "epoch": 2.0, "grad_norm": 0.14923037322286145, "learning_rate": 3.4077770645355824e-06, "loss": 0.1806, "num_tokens": 109863441.0, "step": 342 }, { "epoch": 2.0058479532163744, "grad_norm": 0.14979935754858112, "learning_rate": 3.3826354454596024e-06, "loss": 0.1648, "num_tokens": 110172994.0, "step": 343 }, { "epoch": 2.0116959064327484, "grad_norm": 0.14730373048534828, "learning_rate": 3.35757842848696e-06, "loss": 0.1658, "num_tokens": 110485924.0, "step": 344 }, { "epoch": 2.017543859649123, "grad_norm": 0.13744380787225857, "learning_rate": 3.332607014803937e-06, "loss": 0.1672, "num_tokens": 110803906.0, "step": 345 }, { "epoch": 2.023391812865497, "grad_norm": 0.14343267181086955, "learning_rate": 3.307722202176417e-06, "loss": 0.1685, "num_tokens": 111130534.0, "step": 346 }, { "epoch": 2.0292397660818713, "grad_norm": 0.14951907378888113, "learning_rate": 3.2829249849100255e-06, "loss": 0.1656, "num_tokens": 111443229.0, "step": 347 }, { "epoch": 2.0350877192982457, "grad_norm": 0.15277865040934263, "learning_rate": 3.2582163538104038e-06, "loss": 0.172, "num_tokens": 111757363.0, "step": 348 }, { "epoch": 2.0409356725146197, "grad_norm": 0.14756047461787347, "learning_rate": 3.2335972961436095e-06, "loss": 0.1568, "num_tokens": 112093731.0, "step": 349 }, { "epoch": 2.046783625730994, "grad_norm": 0.16032862389663596, "learning_rate": 3.209068795596679e-06, "loss": 0.1658, "num_tokens": 112388610.0, "step": 350 }, { "epoch": 2.0526315789473686, "grad_norm": 0.16521303938360793, "learning_rate": 3.1846318322383164e-06, "loss": 0.1668, "num_tokens": 112693505.0, "step": 351 }, { "epoch": 2.0584795321637426, "grad_norm": 0.15381932734961204, "learning_rate": 3.160287382479738e-06, "loss": 0.1659, "num_tokens": 112979822.0, "step": 352 }, { "epoch": 2.064327485380117, "grad_norm": 0.1375064634023704, "learning_rate": 3.136036419035656e-06, "loss": 0.1666, "num_tokens": 113301917.0, "step": 353 }, { "epoch": 2.0701754385964914, "grad_norm": 0.15582327011120772, "learning_rate": 3.111879910885414e-06, "loss": 0.1743, "num_tokens": 113618502.0, "step": 354 }, { "epoch": 2.0760233918128654, "grad_norm": 0.1447294439687268, "learning_rate": 3.0878188232342708e-06, "loss": 0.1675, "num_tokens": 113947517.0, "step": 355 }, { "epoch": 2.08187134502924, "grad_norm": 0.15087644416114507, "learning_rate": 3.0638541174748284e-06, "loss": 0.1693, "num_tokens": 114275423.0, "step": 356 }, { "epoch": 2.087719298245614, "grad_norm": 0.14016164600859282, "learning_rate": 3.0399867511486247e-06, "loss": 0.1592, "num_tokens": 114588977.0, "step": 357 }, { "epoch": 2.0935672514619883, "grad_norm": 0.13990826210042928, "learning_rate": 3.0162176779078763e-06, "loss": 0.1639, "num_tokens": 114914836.0, "step": 358 }, { "epoch": 2.0994152046783627, "grad_norm": 0.14220573852232116, "learning_rate": 2.9925478474773634e-06, "loss": 0.1533, "num_tokens": 115251681.0, "step": 359 }, { "epoch": 2.1052631578947367, "grad_norm": 0.1462318844171621, "learning_rate": 2.9689782056164874e-06, "loss": 0.1634, "num_tokens": 115583416.0, "step": 360 }, { "epoch": 2.111111111111111, "grad_norm": 0.15963290757197937, "learning_rate": 2.94550969408149e-06, "loss": 0.1799, "num_tokens": 115902865.0, "step": 361 }, { "epoch": 2.116959064327485, "grad_norm": 0.14871595452125072, "learning_rate": 2.9221432505878116e-06, "loss": 0.1624, "num_tokens": 116225855.0, "step": 362 }, { "epoch": 2.1228070175438596, "grad_norm": 0.15147360930740472, "learning_rate": 2.8988798087726295e-06, "loss": 0.1608, "num_tokens": 116539725.0, "step": 363 }, { "epoch": 2.128654970760234, "grad_norm": 0.14416045180339065, "learning_rate": 2.875720298157551e-06, "loss": 0.1576, "num_tokens": 116893688.0, "step": 364 }, { "epoch": 2.134502923976608, "grad_norm": 0.15123893108832548, "learning_rate": 2.8526656441114815e-06, "loss": 0.1685, "num_tokens": 117195485.0, "step": 365 }, { "epoch": 2.1403508771929824, "grad_norm": 0.14700656972796167, "learning_rate": 2.8297167678136363e-06, "loss": 0.1512, "num_tokens": 117517383.0, "step": 366 }, { "epoch": 2.146198830409357, "grad_norm": 0.14798834895868007, "learning_rate": 2.8068745862167423e-06, "loss": 0.165, "num_tokens": 117837819.0, "step": 367 }, { "epoch": 2.152046783625731, "grad_norm": 0.14373047748755674, "learning_rate": 2.784140012010401e-06, "loss": 0.157, "num_tokens": 118172299.0, "step": 368 }, { "epoch": 2.1578947368421053, "grad_norm": 0.14591495023462686, "learning_rate": 2.7615139535846156e-06, "loss": 0.1569, "num_tokens": 118493460.0, "step": 369 }, { "epoch": 2.1637426900584797, "grad_norm": 0.14800266668408754, "learning_rate": 2.7389973149934974e-06, "loss": 0.1657, "num_tokens": 118788247.0, "step": 370 }, { "epoch": 2.1695906432748537, "grad_norm": 0.14815056056984904, "learning_rate": 2.7165909959191472e-06, "loss": 0.1653, "num_tokens": 119112526.0, "step": 371 }, { "epoch": 2.175438596491228, "grad_norm": 0.14800590386326276, "learning_rate": 2.6942958916356997e-06, "loss": 0.155, "num_tokens": 119425646.0, "step": 372 }, { "epoch": 2.181286549707602, "grad_norm": 0.155304755457848, "learning_rate": 2.6721128929735563e-06, "loss": 0.1682, "num_tokens": 119730833.0, "step": 373 }, { "epoch": 2.1871345029239766, "grad_norm": 0.14504483487086625, "learning_rate": 2.6500428862837878e-06, "loss": 0.1591, "num_tokens": 120056226.0, "step": 374 }, { "epoch": 2.192982456140351, "grad_norm": 0.1381416074535997, "learning_rate": 2.6280867534027286e-06, "loss": 0.1628, "num_tokens": 120386366.0, "step": 375 }, { "epoch": 2.198830409356725, "grad_norm": 0.14481228971827553, "learning_rate": 2.6062453716167273e-06, "loss": 0.1681, "num_tokens": 120734433.0, "step": 376 }, { "epoch": 2.2046783625730995, "grad_norm": 0.15691625415835195, "learning_rate": 2.5845196136270994e-06, "loss": 0.1713, "num_tokens": 121015717.0, "step": 377 }, { "epoch": 2.2105263157894735, "grad_norm": 0.14498825747218128, "learning_rate": 2.5629103475152654e-06, "loss": 0.157, "num_tokens": 121314872.0, "step": 378 }, { "epoch": 2.216374269005848, "grad_norm": 0.15217065921697623, "learning_rate": 2.541418436708054e-06, "loss": 0.1641, "num_tokens": 121609720.0, "step": 379 }, { "epoch": 2.2222222222222223, "grad_norm": 0.149827230466305, "learning_rate": 2.520044739943207e-06, "loss": 0.1598, "num_tokens": 121926793.0, "step": 380 }, { "epoch": 2.2280701754385963, "grad_norm": 0.14401874517686966, "learning_rate": 2.498790111235072e-06, "loss": 0.1709, "num_tokens": 122245778.0, "step": 381 }, { "epoch": 2.2339181286549707, "grad_norm": 0.13403812014389224, "learning_rate": 2.47765539984047e-06, "loss": 0.1608, "num_tokens": 122609822.0, "step": 382 }, { "epoch": 2.239766081871345, "grad_norm": 0.14253922733943364, "learning_rate": 2.4566414502247684e-06, "loss": 0.1652, "num_tokens": 122948206.0, "step": 383 }, { "epoch": 2.245614035087719, "grad_norm": 0.1442664664053727, "learning_rate": 2.435749102028139e-06, "loss": 0.1695, "num_tokens": 123281183.0, "step": 384 }, { "epoch": 2.2514619883040936, "grad_norm": 0.14995127804324304, "learning_rate": 2.414979190032008e-06, "loss": 0.1667, "num_tokens": 123606597.0, "step": 385 }, { "epoch": 2.257309941520468, "grad_norm": 0.14167491108605929, "learning_rate": 2.3943325441256993e-06, "loss": 0.162, "num_tokens": 123924915.0, "step": 386 }, { "epoch": 2.263157894736842, "grad_norm": 0.1462621509052835, "learning_rate": 2.373809989273277e-06, "loss": 0.1668, "num_tokens": 124223390.0, "step": 387 }, { "epoch": 2.2690058479532165, "grad_norm": 0.14702160243574425, "learning_rate": 2.353412345480587e-06, "loss": 0.1629, "num_tokens": 124541698.0, "step": 388 }, { "epoch": 2.2748538011695905, "grad_norm": 0.15471872498523825, "learning_rate": 2.3331404277624846e-06, "loss": 0.1582, "num_tokens": 124832848.0, "step": 389 }, { "epoch": 2.280701754385965, "grad_norm": 0.14362745275711297, "learning_rate": 2.312995046110272e-06, "loss": 0.1698, "num_tokens": 125154886.0, "step": 390 }, { "epoch": 2.2865497076023393, "grad_norm": 0.1468561944816842, "learning_rate": 2.292977005459341e-06, "loss": 0.1756, "num_tokens": 125484162.0, "step": 391 }, { "epoch": 2.2923976608187133, "grad_norm": 0.15143848359912496, "learning_rate": 2.2730871056570024e-06, "loss": 0.1605, "num_tokens": 125783060.0, "step": 392 }, { "epoch": 2.2982456140350878, "grad_norm": 0.14651421165143075, "learning_rate": 2.2533261414305243e-06, "loss": 0.165, "num_tokens": 126110333.0, "step": 393 }, { "epoch": 2.3040935672514617, "grad_norm": 0.1475116907221095, "learning_rate": 2.2336949023553924e-06, "loss": 0.1616, "num_tokens": 126442564.0, "step": 394 }, { "epoch": 2.309941520467836, "grad_norm": 0.14460134481385417, "learning_rate": 2.2141941728237467e-06, "loss": 0.1654, "num_tokens": 126752395.0, "step": 395 }, { "epoch": 2.3157894736842106, "grad_norm": 0.14351634697316507, "learning_rate": 2.194824732013047e-06, "loss": 0.1626, "num_tokens": 127089855.0, "step": 396 }, { "epoch": 2.3216374269005846, "grad_norm": 0.13828986539988358, "learning_rate": 2.1755873538549376e-06, "loss": 0.159, "num_tokens": 127411156.0, "step": 397 }, { "epoch": 2.327485380116959, "grad_norm": 0.13645396729907014, "learning_rate": 2.1564828070043275e-06, "loss": 0.1533, "num_tokens": 127736233.0, "step": 398 }, { "epoch": 2.3333333333333335, "grad_norm": 0.14422895920728793, "learning_rate": 2.137511854808672e-06, "loss": 0.1575, "num_tokens": 128060062.0, "step": 399 }, { "epoch": 2.3391812865497075, "grad_norm": 0.1448501743068164, "learning_rate": 2.1186752552774764e-06, "loss": 0.1626, "num_tokens": 128406839.0, "step": 400 }, { "epoch": 2.345029239766082, "grad_norm": 0.13774903808615716, "learning_rate": 2.099973761052007e-06, "loss": 0.1662, "num_tokens": 128766764.0, "step": 401 }, { "epoch": 2.3508771929824563, "grad_norm": 0.1508233167288601, "learning_rate": 2.081408119375219e-06, "loss": 0.1653, "num_tokens": 129070969.0, "step": 402 }, { "epoch": 2.3567251461988303, "grad_norm": 0.14607227185650823, "learning_rate": 2.0629790720618977e-06, "loss": 0.1551, "num_tokens": 129361599.0, "step": 403 }, { "epoch": 2.3625730994152048, "grad_norm": 0.14540307746510248, "learning_rate": 2.044687355469025e-06, "loss": 0.168, "num_tokens": 129689845.0, "step": 404 }, { "epoch": 2.3684210526315788, "grad_norm": 0.14269113809456305, "learning_rate": 2.0265337004663465e-06, "loss": 0.1632, "num_tokens": 130007281.0, "step": 405 }, { "epoch": 2.374269005847953, "grad_norm": 0.14764837206982723, "learning_rate": 2.008518832407176e-06, "loss": 0.16, "num_tokens": 130320317.0, "step": 406 }, { "epoch": 2.3801169590643276, "grad_norm": 0.13886373538463548, "learning_rate": 1.9906434710994098e-06, "loss": 0.1659, "num_tokens": 130668421.0, "step": 407 }, { "epoch": 2.3859649122807016, "grad_norm": 0.14123121708441125, "learning_rate": 1.9729083307767725e-06, "loss": 0.1571, "num_tokens": 130996417.0, "step": 408 }, { "epoch": 2.391812865497076, "grad_norm": 0.14241772556155666, "learning_rate": 1.955314120070269e-06, "loss": 0.1586, "num_tokens": 131324210.0, "step": 409 }, { "epoch": 2.39766081871345, "grad_norm": 0.1536376204686897, "learning_rate": 1.937861541979873e-06, "loss": 0.1866, "num_tokens": 131671552.0, "step": 410 }, { "epoch": 2.4035087719298245, "grad_norm": 0.13606181694914196, "learning_rate": 1.9205512938464465e-06, "loss": 0.163, "num_tokens": 132008743.0, "step": 411 }, { "epoch": 2.409356725146199, "grad_norm": 0.14994033256966707, "learning_rate": 1.903384067323863e-06, "loss": 0.1644, "num_tokens": 132306397.0, "step": 412 }, { "epoch": 2.415204678362573, "grad_norm": 0.1437874855637201, "learning_rate": 1.886360548351381e-06, "loss": 0.1589, "num_tokens": 132627946.0, "step": 413 }, { "epoch": 2.4210526315789473, "grad_norm": 0.14231060929324155, "learning_rate": 1.8694814171262355e-06, "loss": 0.1616, "num_tokens": 132981440.0, "step": 414 }, { "epoch": 2.426900584795322, "grad_norm": 0.14278247493654592, "learning_rate": 1.8527473480764545e-06, "loss": 0.1638, "num_tokens": 133326233.0, "step": 415 }, { "epoch": 2.4327485380116958, "grad_norm": 0.14574512278213558, "learning_rate": 1.8361590098339168e-06, "loss": 0.1701, "num_tokens": 133635483.0, "step": 416 }, { "epoch": 2.43859649122807, "grad_norm": 0.14520072415658936, "learning_rate": 1.8197170652076316e-06, "loss": 0.1729, "num_tokens": 133965064.0, "step": 417 }, { "epoch": 2.4444444444444446, "grad_norm": 0.13880382782292727, "learning_rate": 1.8034221711572633e-06, "loss": 0.1642, "num_tokens": 134297515.0, "step": 418 }, { "epoch": 2.4502923976608186, "grad_norm": 0.14965359883764726, "learning_rate": 1.7872749787668673e-06, "loss": 0.1573, "num_tokens": 134596743.0, "step": 419 }, { "epoch": 2.456140350877193, "grad_norm": 0.14970450216253625, "learning_rate": 1.7712761332188894e-06, "loss": 0.1654, "num_tokens": 134914562.0, "step": 420 }, { "epoch": 2.461988304093567, "grad_norm": 0.15816166023028, "learning_rate": 1.7554262737683803e-06, "loss": 0.1487, "num_tokens": 135230462.0, "step": 421 }, { "epoch": 2.4678362573099415, "grad_norm": 0.1504826241844761, "learning_rate": 1.7397260337174542e-06, "loss": 0.157, "num_tokens": 135546957.0, "step": 422 }, { "epoch": 2.473684210526316, "grad_norm": 0.1394498720471719, "learning_rate": 1.724176040389982e-06, "loss": 0.1591, "num_tokens": 135868298.0, "step": 423 }, { "epoch": 2.47953216374269, "grad_norm": 0.14271119923974052, "learning_rate": 1.708776915106528e-06, "loss": 0.1647, "num_tokens": 136208833.0, "step": 424 }, { "epoch": 2.4853801169590644, "grad_norm": 0.1486501311116008, "learning_rate": 1.6935292731595284e-06, "loss": 0.1674, "num_tokens": 136529945.0, "step": 425 }, { "epoch": 2.4912280701754383, "grad_norm": 0.1568931867106138, "learning_rate": 1.678433723788697e-06, "loss": 0.1612, "num_tokens": 136819684.0, "step": 426 }, { "epoch": 2.497076023391813, "grad_norm": 0.14113107486411444, "learning_rate": 1.6634908701566909e-06, "loss": 0.1576, "num_tokens": 137150211.0, "step": 427 }, { "epoch": 2.502923976608187, "grad_norm": 0.14430747759375342, "learning_rate": 1.6487013093250042e-06, "loss": 0.1577, "num_tokens": 137460607.0, "step": 428 }, { "epoch": 2.5087719298245617, "grad_norm": 0.14188380800567507, "learning_rate": 1.6340656322301158e-06, "loss": 0.1681, "num_tokens": 137808723.0, "step": 429 }, { "epoch": 2.5146198830409356, "grad_norm": 0.13864076262612016, "learning_rate": 1.619584423659875e-06, "loss": 0.1508, "num_tokens": 138122659.0, "step": 430 }, { "epoch": 2.52046783625731, "grad_norm": 0.14329617113357643, "learning_rate": 1.6052582622301398e-06, "loss": 0.1604, "num_tokens": 138456252.0, "step": 431 }, { "epoch": 2.526315789473684, "grad_norm": 0.14485980625620193, "learning_rate": 1.5910877203616515e-06, "loss": 0.1689, "num_tokens": 138794979.0, "step": 432 }, { "epoch": 2.5321637426900585, "grad_norm": 0.14001094426697513, "learning_rate": 1.5770733642571662e-06, "loss": 0.1613, "num_tokens": 139114044.0, "step": 433 }, { "epoch": 2.538011695906433, "grad_norm": 0.14542258651096243, "learning_rate": 1.5632157538788322e-06, "loss": 0.1626, "num_tokens": 139425238.0, "step": 434 }, { "epoch": 2.543859649122807, "grad_norm": 0.1489744939093393, "learning_rate": 1.5495154429258136e-06, "loss": 0.1722, "num_tokens": 139757117.0, "step": 435 }, { "epoch": 2.5497076023391814, "grad_norm": 0.14248652727375938, "learning_rate": 1.5359729788121678e-06, "loss": 0.1633, "num_tokens": 140085804.0, "step": 436 }, { "epoch": 2.5555555555555554, "grad_norm": 0.14025435225815486, "learning_rate": 1.5225889026449754e-06, "loss": 0.1609, "num_tokens": 140409219.0, "step": 437 }, { "epoch": 2.56140350877193, "grad_norm": 0.14823929068819988, "learning_rate": 1.5093637492027136e-06, "loss": 0.165, "num_tokens": 140713444.0, "step": 438 }, { "epoch": 2.5672514619883042, "grad_norm": 0.1370826700643183, "learning_rate": 1.4962980469138932e-06, "loss": 0.1462, "num_tokens": 141032047.0, "step": 439 }, { "epoch": 2.573099415204678, "grad_norm": 0.1522127782198521, "learning_rate": 1.4833923178359428e-06, "loss": 0.1621, "num_tokens": 141338487.0, "step": 440 }, { "epoch": 2.5789473684210527, "grad_norm": 0.14502198127336535, "learning_rate": 1.4706470776343507e-06, "loss": 0.164, "num_tokens": 141669740.0, "step": 441 }, { "epoch": 2.5847953216374266, "grad_norm": 0.15205245564816208, "learning_rate": 1.458062835562058e-06, "loss": 0.164, "num_tokens": 141971862.0, "step": 442 }, { "epoch": 2.590643274853801, "grad_norm": 0.15020510176845542, "learning_rate": 1.4456400944391147e-06, "loss": 0.1677, "num_tokens": 142266889.0, "step": 443 }, { "epoch": 2.5964912280701755, "grad_norm": 0.14391800055857798, "learning_rate": 1.4333793506325832e-06, "loss": 0.1606, "num_tokens": 142584991.0, "step": 444 }, { "epoch": 2.60233918128655, "grad_norm": 0.15111238280234057, "learning_rate": 1.421281094036712e-06, "loss": 0.1604, "num_tokens": 142879877.0, "step": 445 }, { "epoch": 2.608187134502924, "grad_norm": 0.1443837082025472, "learning_rate": 1.4093458080533562e-06, "loss": 0.1681, "num_tokens": 143205197.0, "step": 446 }, { "epoch": 2.6140350877192984, "grad_norm": 0.14463771075478601, "learning_rate": 1.3975739695726649e-06, "loss": 0.1671, "num_tokens": 143531142.0, "step": 447 }, { "epoch": 2.6198830409356724, "grad_norm": 0.14454878880987132, "learning_rate": 1.385966048954027e-06, "loss": 0.1633, "num_tokens": 143858081.0, "step": 448 }, { "epoch": 2.625730994152047, "grad_norm": 0.13984594893802477, "learning_rate": 1.3745225100072737e-06, "loss": 0.1617, "num_tokens": 144185431.0, "step": 449 }, { "epoch": 2.6315789473684212, "grad_norm": 0.13969336960280732, "learning_rate": 1.3632438099741505e-06, "loss": 0.159, "num_tokens": 144497647.0, "step": 450 }, { "epoch": 2.6374269005847952, "grad_norm": 0.14619663223314017, "learning_rate": 1.3521303995100479e-06, "loss": 0.1663, "num_tokens": 144800894.0, "step": 451 }, { "epoch": 2.6432748538011697, "grad_norm": 0.14130801987934924, "learning_rate": 1.3411827226659887e-06, "loss": 0.1592, "num_tokens": 145115720.0, "step": 452 }, { "epoch": 2.6491228070175437, "grad_norm": 0.13126230977093004, "learning_rate": 1.330401216870891e-06, "loss": 0.1479, "num_tokens": 145451398.0, "step": 453 }, { "epoch": 2.654970760233918, "grad_norm": 0.13673717620054995, "learning_rate": 1.3197863129140916e-06, "loss": 0.1564, "num_tokens": 145791079.0, "step": 454 }, { "epoch": 2.6608187134502925, "grad_norm": 0.1468555944041424, "learning_rate": 1.3093384349281268e-06, "loss": 0.1575, "num_tokens": 146094234.0, "step": 455 }, { "epoch": 2.6666666666666665, "grad_norm": 0.1511002120974835, "learning_rate": 1.2990580003717904e-06, "loss": 0.1828, "num_tokens": 146403548.0, "step": 456 }, { "epoch": 2.672514619883041, "grad_norm": 0.139909324379523, "learning_rate": 1.2889454200134522e-06, "loss": 0.1594, "num_tokens": 146732523.0, "step": 457 }, { "epoch": 2.678362573099415, "grad_norm": 0.14396563097050272, "learning_rate": 1.2790010979146467e-06, "loss": 0.1524, "num_tokens": 147040850.0, "step": 458 }, { "epoch": 2.6842105263157894, "grad_norm": 0.15000149938121365, "learning_rate": 1.2692254314139243e-06, "loss": 0.1697, "num_tokens": 147343323.0, "step": 459 }, { "epoch": 2.690058479532164, "grad_norm": 0.15745120782859975, "learning_rate": 1.2596188111109805e-06, "loss": 0.1681, "num_tokens": 147635253.0, "step": 460 }, { "epoch": 2.6959064327485383, "grad_norm": 0.1476631580057043, "learning_rate": 1.2501816208510442e-06, "loss": 0.1638, "num_tokens": 147956892.0, "step": 461 }, { "epoch": 2.7017543859649122, "grad_norm": 0.1423826560287115, "learning_rate": 1.2409142377095435e-06, "loss": 0.1571, "num_tokens": 148260684.0, "step": 462 }, { "epoch": 2.7076023391812867, "grad_norm": 0.14219644101756945, "learning_rate": 1.231817031977037e-06, "loss": 0.1585, "num_tokens": 148571351.0, "step": 463 }, { "epoch": 2.7134502923976607, "grad_norm": 0.15083997934575208, "learning_rate": 1.2228903671444228e-06, "loss": 0.1683, "num_tokens": 148888226.0, "step": 464 }, { "epoch": 2.719298245614035, "grad_norm": 0.13732672374616764, "learning_rate": 1.2141345998884092e-06, "loss": 0.1606, "num_tokens": 149214770.0, "step": 465 }, { "epoch": 2.7251461988304095, "grad_norm": 0.1441249561667527, "learning_rate": 1.2055500800572657e-06, "loss": 0.1626, "num_tokens": 149545405.0, "step": 466 }, { "epoch": 2.7309941520467835, "grad_norm": 0.14325592299925863, "learning_rate": 1.1971371506568442e-06, "loss": 0.1577, "num_tokens": 149846805.0, "step": 467 }, { "epoch": 2.736842105263158, "grad_norm": 0.13745597306904, "learning_rate": 1.1888961478368762e-06, "loss": 0.1605, "num_tokens": 150179635.0, "step": 468 }, { "epoch": 2.742690058479532, "grad_norm": 0.13951839806880817, "learning_rate": 1.1808274008775355e-06, "loss": 0.158, "num_tokens": 150503170.0, "step": 469 }, { "epoch": 2.7485380116959064, "grad_norm": 0.1443209297278742, "learning_rate": 1.1729312321762864e-06, "loss": 0.1575, "num_tokens": 150809725.0, "step": 470 }, { "epoch": 2.754385964912281, "grad_norm": 0.13342746902950017, "learning_rate": 1.1652079572350026e-06, "loss": 0.154, "num_tokens": 151157947.0, "step": 471 }, { "epoch": 2.760233918128655, "grad_norm": 0.14326184156810018, "learning_rate": 1.1576578846473558e-06, "loss": 0.1584, "num_tokens": 151479230.0, "step": 472 }, { "epoch": 2.7660818713450293, "grad_norm": 0.1433950274377657, "learning_rate": 1.1502813160864893e-06, "loss": 0.1628, "num_tokens": 151803444.0, "step": 473 }, { "epoch": 2.7719298245614032, "grad_norm": 0.14139558810493624, "learning_rate": 1.1430785462929644e-06, "loss": 0.169, "num_tokens": 152139382.0, "step": 474 }, { "epoch": 2.7777777777777777, "grad_norm": 0.14115560637915964, "learning_rate": 1.136049863062982e-06, "loss": 0.162, "num_tokens": 152452934.0, "step": 475 }, { "epoch": 2.783625730994152, "grad_norm": 0.15211690242588544, "learning_rate": 1.1291955472368825e-06, "loss": 0.1601, "num_tokens": 152748759.0, "step": 476 }, { "epoch": 2.7894736842105265, "grad_norm": 0.13666222185265295, "learning_rate": 1.1225158726879288e-06, "loss": 0.1665, "num_tokens": 153093029.0, "step": 477 }, { "epoch": 2.7953216374269005, "grad_norm": 0.14398939041397035, "learning_rate": 1.116011106311358e-06, "loss": 0.1568, "num_tokens": 153384854.0, "step": 478 }, { "epoch": 2.801169590643275, "grad_norm": 0.15402905213013776, "learning_rate": 1.1096815080137196e-06, "loss": 0.1874, "num_tokens": 153720419.0, "step": 479 }, { "epoch": 2.807017543859649, "grad_norm": 0.14201446868127077, "learning_rate": 1.103527330702493e-06, "loss": 0.1624, "num_tokens": 154054230.0, "step": 480 }, { "epoch": 2.8128654970760234, "grad_norm": 0.14521001714072115, "learning_rate": 1.0975488202759772e-06, "loss": 0.1625, "num_tokens": 154373913.0, "step": 481 }, { "epoch": 2.818713450292398, "grad_norm": 0.1390122065414189, "learning_rate": 1.0917462156134707e-06, "loss": 0.1599, "num_tokens": 154689857.0, "step": 482 }, { "epoch": 2.824561403508772, "grad_norm": 0.14888462659883256, "learning_rate": 1.0861197485657218e-06, "loss": 0.1643, "num_tokens": 154995696.0, "step": 483 }, { "epoch": 2.8304093567251463, "grad_norm": 0.1420249071569745, "learning_rate": 1.0806696439456695e-06, "loss": 0.1629, "num_tokens": 155312841.0, "step": 484 }, { "epoch": 2.8362573099415203, "grad_norm": 0.14037073859681298, "learning_rate": 1.0753961195194581e-06, "loss": 0.1592, "num_tokens": 155627374.0, "step": 485 }, { "epoch": 2.8421052631578947, "grad_norm": 0.14506758333723185, "learning_rate": 1.070299385997735e-06, "loss": 0.1586, "num_tokens": 155933888.0, "step": 486 }, { "epoch": 2.847953216374269, "grad_norm": 0.14137857222010855, "learning_rate": 1.0653796470272348e-06, "loss": 0.1564, "num_tokens": 156263120.0, "step": 487 }, { "epoch": 2.853801169590643, "grad_norm": 0.14542756104140053, "learning_rate": 1.0606370991826398e-06, "loss": 0.165, "num_tokens": 156585736.0, "step": 488 }, { "epoch": 2.8596491228070176, "grad_norm": 0.13897674449897804, "learning_rate": 1.0560719319587262e-06, "loss": 0.1642, "num_tokens": 156919577.0, "step": 489 }, { "epoch": 2.8654970760233915, "grad_norm": 0.1411765074092308, "learning_rate": 1.051684327762793e-06, "loss": 0.1613, "num_tokens": 157244512.0, "step": 490 }, { "epoch": 2.871345029239766, "grad_norm": 0.14023194573370917, "learning_rate": 1.047474461907374e-06, "loss": 0.1593, "num_tokens": 157555581.0, "step": 491 }, { "epoch": 2.8771929824561404, "grad_norm": 0.13348582864925426, "learning_rate": 1.043442502603231e-06, "loss": 0.1587, "num_tokens": 157890387.0, "step": 492 }, { "epoch": 2.883040935672515, "grad_norm": 0.1458553034999855, "learning_rate": 1.0395886109526346e-06, "loss": 0.1672, "num_tokens": 158227831.0, "step": 493 }, { "epoch": 2.888888888888889, "grad_norm": 0.13433755063333214, "learning_rate": 1.0359129409429269e-06, "loss": 0.1672, "num_tokens": 158593011.0, "step": 494 }, { "epoch": 2.8947368421052633, "grad_norm": 0.13367864457349435, "learning_rate": 1.0324156394403683e-06, "loss": 0.1553, "num_tokens": 158933449.0, "step": 495 }, { "epoch": 2.9005847953216373, "grad_norm": 0.14118001312714748, "learning_rate": 1.0290968461842693e-06, "loss": 0.1577, "num_tokens": 159246578.0, "step": 496 }, { "epoch": 2.9064327485380117, "grad_norm": 0.1463297027696001, "learning_rate": 1.025956693781408e-06, "loss": 0.1678, "num_tokens": 159560178.0, "step": 497 }, { "epoch": 2.912280701754386, "grad_norm": 0.14563817342813995, "learning_rate": 1.0229953077007288e-06, "loss": 0.1659, "num_tokens": 159884093.0, "step": 498 }, { "epoch": 2.91812865497076, "grad_norm": 0.14540407437292474, "learning_rate": 1.0202128062683333e-06, "loss": 0.1722, "num_tokens": 160205475.0, "step": 499 }, { "epoch": 2.9239766081871346, "grad_norm": 0.1405121201590573, "learning_rate": 1.0176093006627485e-06, "loss": 0.1562, "num_tokens": 160545275.0, "step": 500 }, { "epoch": 2.9298245614035086, "grad_norm": 0.14217083046189563, "learning_rate": 1.0151848949104872e-06, "loss": 0.1665, "num_tokens": 160880973.0, "step": 501 }, { "epoch": 2.935672514619883, "grad_norm": 0.1464306529530731, "learning_rate": 1.01293968588189e-06, "loss": 0.1707, "num_tokens": 161205233.0, "step": 502 }, { "epoch": 2.9415204678362574, "grad_norm": 0.1408296921593092, "learning_rate": 1.0108737632872553e-06, "loss": 0.16, "num_tokens": 161521616.0, "step": 503 }, { "epoch": 2.9473684210526314, "grad_norm": 0.14238166027330365, "learning_rate": 1.0089872096732555e-06, "loss": 0.1635, "num_tokens": 161834814.0, "step": 504 }, { "epoch": 2.953216374269006, "grad_norm": 0.1426737290624598, "learning_rate": 1.0072801004196363e-06, "loss": 0.1615, "num_tokens": 162172431.0, "step": 505 }, { "epoch": 2.95906432748538, "grad_norm": 0.14507835204373007, "learning_rate": 1.0057525037362082e-06, "loss": 0.162, "num_tokens": 162500876.0, "step": 506 }, { "epoch": 2.9649122807017543, "grad_norm": 0.14256066597160452, "learning_rate": 1.0044044806601188e-06, "loss": 0.161, "num_tokens": 162830769.0, "step": 507 }, { "epoch": 2.9707602339181287, "grad_norm": 0.13917168255862636, "learning_rate": 1.003236085053414e-06, "loss": 0.1598, "num_tokens": 163185192.0, "step": 508 }, { "epoch": 2.976608187134503, "grad_norm": 0.1402722871419169, "learning_rate": 1.0022473636008867e-06, "loss": 0.164, "num_tokens": 163513051.0, "step": 509 }, { "epoch": 2.982456140350877, "grad_norm": 0.15282843543700755, "learning_rate": 1.0014383558082113e-06, "loss": 0.169, "num_tokens": 163816593.0, "step": 510 }, { "epoch": 2.9883040935672516, "grad_norm": 0.14600064275991873, "learning_rate": 1.000809094000365e-06, "loss": 0.1582, "num_tokens": 164132256.0, "step": 511 }, { "epoch": 2.9941520467836256, "grad_norm": 0.142164254928358, "learning_rate": 1.0003596033203359e-06, "loss": 0.1675, "num_tokens": 164460013.0, "step": 512 }, { "epoch": 3.0, "grad_norm": 0.14017066974116044, "learning_rate": 1.0000899017281195e-06, "loss": 0.1563, "num_tokens": 164791026.0, "step": 513 }, { "epoch": 3.0, "step": 513, "total_flos": 5.228622451394478e+17, "train_loss": 0.21386383229877517, "train_runtime": 6716.6337, "train_samples_per_second": 9.771, "train_steps_per_second": 0.076 } ], "logging_steps": 1, "max_steps": 513, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.228622451394478e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }