{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 513,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005847953216374269,
      "grad_norm": 3.346622166062452,
      "learning_rate": 0.0,
      "loss": 1.1879,
      "num_tokens": 309834.0,
      "step": 1
    },
    {
      "epoch": 0.011695906432748537,
      "grad_norm": 3.2150187522491747,
      "learning_rate": 6.25e-07,
      "loss": 1.1528,
      "num_tokens": 626323.0,
      "step": 2
    },
    {
      "epoch": 0.017543859649122806,
      "grad_norm": 3.1095611576006044,
      "learning_rate": 1.25e-06,
      "loss": 1.13,
      "num_tokens": 962858.0,
      "step": 3
    },
    {
      "epoch": 0.023391812865497075,
      "grad_norm": 3.0927940568383274,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.1257,
      "num_tokens": 1307919.0,
      "step": 4
    },
    {
      "epoch": 0.029239766081871343,
      "grad_norm": 3.0505204823401963,
      "learning_rate": 2.5e-06,
      "loss": 1.1159,
      "num_tokens": 1643402.0,
      "step": 5
    },
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 2.8525229189090564,
      "learning_rate": 3.125e-06,
      "loss": 1.1119,
      "num_tokens": 1963547.0,
      "step": 6
    },
    {
      "epoch": 0.04093567251461988,
      "grad_norm": 2.136607198658089,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.0281,
      "num_tokens": 2283318.0,
      "step": 7
    },
    {
      "epoch": 0.04678362573099415,
      "grad_norm": 1.9705509244699,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.0088,
      "num_tokens": 2603880.0,
      "step": 8
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 1.2382281811593294,
      "learning_rate": 5e-06,
      "loss": 0.8683,
      "num_tokens": 2929732.0,
      "step": 9
    },
    {
      "epoch": 0.05847953216374269,
      "grad_norm": 1.2462399978155196,
      "learning_rate": 5.625e-06,
      "loss": 0.868,
      "num_tokens": 3252895.0,
      "step": 10
    },
    {
      "epoch": 0.06432748538011696,
      "grad_norm": 1.1972270533702403,
      "learning_rate": 6.25e-06,
      "loss": 0.8151,
      "num_tokens": 3578517.0,
      "step": 11
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 1.8012881936401126,
      "learning_rate": 6.875e-06,
      "loss": 0.6379,
      "num_tokens": 3911914.0,
      "step": 12
    },
    {
      "epoch": 0.07602339181286549,
      "grad_norm": 1.5004715524395629,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.6352,
      "num_tokens": 4228515.0,
      "step": 13
    },
    {
      "epoch": 0.08187134502923976,
      "grad_norm": 1.320062812526294,
      "learning_rate": 8.125000000000001e-06,
      "loss": 0.6228,
      "num_tokens": 4536476.0,
      "step": 14
    },
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 0.9906906777846411,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.5299,
      "num_tokens": 4868361.0,
      "step": 15
    },
    {
      "epoch": 0.0935672514619883,
      "grad_norm": 0.8386484072060002,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.4729,
      "num_tokens": 5216197.0,
      "step": 16
    },
    {
      "epoch": 0.09941520467836257,
      "grad_norm": 0.4900339517100113,
      "learning_rate": 1e-05,
      "loss": 0.4352,
      "num_tokens": 5557003.0,
      "step": 17
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.40004540169105984,
      "learning_rate": 9.999910098271881e-06,
      "loss": 0.3938,
      "num_tokens": 5892764.0,
      "step": 18
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 0.3513180037096265,
      "learning_rate": 9.999640396679666e-06,
      "loss": 0.4064,
      "num_tokens": 6207362.0,
      "step": 19
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 0.6732732756411357,
      "learning_rate": 9.999190905999637e-06,
      "loss": 0.3789,
      "num_tokens": 6496012.0,
      "step": 20
    },
    {
      "epoch": 0.12280701754385964,
      "grad_norm": 0.31994176912879413,
      "learning_rate": 9.99856164419179e-06,
      "loss": 0.3778,
      "num_tokens": 6804315.0,
      "step": 21
    },
    {
      "epoch": 0.1286549707602339,
      "grad_norm": 0.548016923041077,
      "learning_rate": 9.997752636399114e-06,
      "loss": 0.3673,
      "num_tokens": 7143380.0,
      "step": 22
    },
    {
      "epoch": 0.13450292397660818,
      "grad_norm": 0.29902181791620935,
      "learning_rate": 9.996763914946586e-06,
      "loss": 0.3593,
      "num_tokens": 7463502.0,
      "step": 23
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 0.31707406326522014,
      "learning_rate": 9.995595519339882e-06,
      "loss": 0.3577,
      "num_tokens": 7774770.0,
      "step": 24
    },
    {
      "epoch": 0.14619883040935672,
      "grad_norm": 0.27176157042044713,
      "learning_rate": 9.994247496263792e-06,
      "loss": 0.3395,
      "num_tokens": 8087750.0,
      "step": 25
    },
    {
      "epoch": 0.15204678362573099,
      "grad_norm": 0.3065271953474858,
      "learning_rate": 9.992719899580364e-06,
      "loss": 0.3474,
      "num_tokens": 8434239.0,
      "step": 26
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 0.30277827390998685,
      "learning_rate": 9.991012790326745e-06,
      "loss": 0.3121,
      "num_tokens": 8750905.0,
      "step": 27
    },
    {
      "epoch": 0.16374269005847952,
      "grad_norm": 0.25470554589584754,
      "learning_rate": 9.989126236712746e-06,
      "loss": 0.3197,
      "num_tokens": 9075220.0,
      "step": 28
    },
    {
      "epoch": 0.1695906432748538,
      "grad_norm": 0.24585407625854147,
      "learning_rate": 9.987060314118111e-06,
      "loss": 0.3217,
      "num_tokens": 9397453.0,
      "step": 29
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 0.4536476490136459,
      "learning_rate": 9.984815105089515e-06,
      "loss": 0.3369,
      "num_tokens": 9705728.0,
      "step": 30
    },
    {
      "epoch": 0.18128654970760233,
      "grad_norm": 0.23980035473967873,
      "learning_rate": 9.982390699337253e-06,
      "loss": 0.2813,
      "num_tokens": 10025621.0,
      "step": 31
    },
    {
      "epoch": 0.1871345029239766,
      "grad_norm": 0.23564595909740174,
      "learning_rate": 9.979787193731666e-06,
      "loss": 0.3259,
      "num_tokens": 10337176.0,
      "step": 32
    },
    {
      "epoch": 0.19298245614035087,
      "grad_norm": 0.20709709888139777,
      "learning_rate": 9.977004692299273e-06,
      "loss": 0.2945,
      "num_tokens": 10651259.0,
      "step": 33
    },
    {
      "epoch": 0.19883040935672514,
      "grad_norm": 0.1973226462113262,
      "learning_rate": 9.974043306218595e-06,
      "loss": 0.2922,
      "num_tokens": 10992918.0,
      "step": 34
    },
    {
      "epoch": 0.2046783625730994,
      "grad_norm": 0.18869956638769483,
      "learning_rate": 9.970903153815731e-06,
      "loss": 0.2835,
      "num_tokens": 11310394.0,
      "step": 35
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.18686305886535334,
      "learning_rate": 9.967584360559632e-06,
      "loss": 0.2925,
      "num_tokens": 11636774.0,
      "step": 36
    },
    {
      "epoch": 0.21637426900584794,
      "grad_norm": 0.1725945504953492,
      "learning_rate": 9.964087059057075e-06,
      "loss": 0.2592,
      "num_tokens": 11956378.0,
      "step": 37
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 0.19881803465402237,
      "learning_rate": 9.960411389047366e-06,
      "loss": 0.2955,
      "num_tokens": 12248057.0,
      "step": 38
    },
    {
      "epoch": 0.22807017543859648,
      "grad_norm": 0.19691612763541272,
      "learning_rate": 9.95655749739677e-06,
      "loss": 0.2835,
      "num_tokens": 12554456.0,
      "step": 39
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 0.17622929852503963,
      "learning_rate": 9.952525538092627e-06,
      "loss": 0.2726,
      "num_tokens": 12880847.0,
      "step": 40
    },
    {
      "epoch": 0.23976608187134502,
      "grad_norm": 0.1622820101848469,
      "learning_rate": 9.948315672237208e-06,
      "loss": 0.2692,
      "num_tokens": 13237415.0,
      "step": 41
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 0.16492519002263994,
      "learning_rate": 9.943928068041274e-06,
      "loss": 0.2791,
      "num_tokens": 13600570.0,
      "step": 42
    },
    {
      "epoch": 0.25146198830409355,
      "grad_norm": 0.1659517586811088,
      "learning_rate": 9.939362900817362e-06,
      "loss": 0.2499,
      "num_tokens": 13934393.0,
      "step": 43
    },
    {
      "epoch": 0.2573099415204678,
      "grad_norm": 0.1675942094434052,
      "learning_rate": 9.934620352972766e-06,
      "loss": 0.2709,
      "num_tokens": 14283319.0,
      "step": 44
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 0.17033294793266862,
      "learning_rate": 9.929700614002265e-06,
      "loss": 0.2712,
      "num_tokens": 14602358.0,
      "step": 45
    },
    {
      "epoch": 0.26900584795321636,
      "grad_norm": 0.17218950775910574,
      "learning_rate": 9.924603880480543e-06,
      "loss": 0.2768,
      "num_tokens": 14914703.0,
      "step": 46
    },
    {
      "epoch": 0.27485380116959063,
      "grad_norm": 0.17142338750289324,
      "learning_rate": 9.919330356054332e-06,
      "loss": 0.2677,
      "num_tokens": 15226891.0,
      "step": 47
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.15935514552682936,
      "learning_rate": 9.913880251434279e-06,
      "loss": 0.2513,
      "num_tokens": 15566110.0,
      "step": 48
    },
    {
      "epoch": 0.28654970760233917,
      "grad_norm": 0.17590713121193502,
      "learning_rate": 9.90825378438653e-06,
      "loss": 0.2413,
      "num_tokens": 15864678.0,
      "step": 49
    },
    {
      "epoch": 0.29239766081871343,
      "grad_norm": 0.17105743981722707,
      "learning_rate": 9.902451179724025e-06,
      "loss": 0.2566,
      "num_tokens": 16166802.0,
      "step": 50
    },
    {
      "epoch": 0.2982456140350877,
      "grad_norm": 0.17592812081480816,
      "learning_rate": 9.896472669297508e-06,
      "loss": 0.2432,
      "num_tokens": 16465873.0,
      "step": 51
    },
    {
      "epoch": 0.30409356725146197,
      "grad_norm": 0.16854050409478574,
      "learning_rate": 9.890318491986282e-06,
      "loss": 0.2434,
      "num_tokens": 16764387.0,
      "step": 52
    },
    {
      "epoch": 0.30994152046783624,
      "grad_norm": 0.1722203346036174,
      "learning_rate": 9.883988893688645e-06,
      "loss": 0.2533,
      "num_tokens": 17059312.0,
      "step": 53
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.16603305455782896,
      "learning_rate": 9.877484127312072e-06,
      "loss": 0.2492,
      "num_tokens": 17382890.0,
      "step": 54
    },
    {
      "epoch": 0.3216374269005848,
      "grad_norm": 0.16334953855577547,
      "learning_rate": 9.870804452763118e-06,
      "loss": 0.2563,
      "num_tokens": 17716146.0,
      "step": 55
    },
    {
      "epoch": 0.32748538011695905,
      "grad_norm": 0.18120405762456362,
      "learning_rate": 9.863950136937019e-06,
      "loss": 0.2532,
      "num_tokens": 18044901.0,
      "step": 56
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.16955478627673745,
      "learning_rate": 9.856921453707036e-06,
      "loss": 0.256,
      "num_tokens": 18360773.0,
      "step": 57
    },
    {
      "epoch": 0.3391812865497076,
      "grad_norm": 0.15096634608102888,
      "learning_rate": 9.849718683913511e-06,
      "loss": 0.2259,
      "num_tokens": 18694718.0,
      "step": 58
    },
    {
      "epoch": 0.34502923976608185,
      "grad_norm": 0.1757789479931499,
      "learning_rate": 9.842342115352647e-06,
      "loss": 0.2595,
      "num_tokens": 19014702.0,
      "step": 59
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.16490033035648094,
      "learning_rate": 9.834792042764999e-06,
      "loss": 0.2404,
      "num_tokens": 19339612.0,
      "step": 60
    },
    {
      "epoch": 0.3567251461988304,
      "grad_norm": 0.16019689209153504,
      "learning_rate": 9.827068767823713e-06,
      "loss": 0.248,
      "num_tokens": 19681676.0,
      "step": 61
    },
    {
      "epoch": 0.36257309941520466,
      "grad_norm": 0.1691448945153913,
      "learning_rate": 9.819172599122466e-06,
      "loss": 0.2346,
      "num_tokens": 20000100.0,
      "step": 62
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 0.16082639569066132,
      "learning_rate": 9.811103852163126e-06,
      "loss": 0.2262,
      "num_tokens": 20319423.0,
      "step": 63
    },
    {
      "epoch": 0.3742690058479532,
      "grad_norm": 0.15278910823194214,
      "learning_rate": 9.802862849343155e-06,
      "loss": 0.2281,
      "num_tokens": 20664041.0,
      "step": 64
    },
    {
      "epoch": 0.38011695906432746,
      "grad_norm": 0.16155951042022568,
      "learning_rate": 9.794449919942736e-06,
      "loss": 0.241,
      "num_tokens": 20983214.0,
      "step": 65
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 0.16422330995295928,
      "learning_rate": 9.785865400111593e-06,
      "loss": 0.2489,
      "num_tokens": 21321454.0,
      "step": 66
    },
    {
      "epoch": 0.391812865497076,
      "grad_norm": 0.16006949845014626,
      "learning_rate": 9.777109632855579e-06,
      "loss": 0.2471,
      "num_tokens": 21641982.0,
      "step": 67
    },
    {
      "epoch": 0.39766081871345027,
      "grad_norm": 0.16727932073445337,
      "learning_rate": 9.768182968022964e-06,
      "loss": 0.2417,
      "num_tokens": 21978836.0,
      "step": 68
    },
    {
      "epoch": 0.40350877192982454,
      "grad_norm": 0.17239803759423833,
      "learning_rate": 9.759085762290457e-06,
      "loss": 0.2377,
      "num_tokens": 22268188.0,
      "step": 69
    },
    {
      "epoch": 0.4093567251461988,
      "grad_norm": 0.16257813227817636,
      "learning_rate": 9.749818379148958e-06,
      "loss": 0.2265,
      "num_tokens": 22581727.0,
      "step": 70
    },
    {
      "epoch": 0.4152046783625731,
      "grad_norm": 0.16056196258322708,
      "learning_rate": 9.74038118888902e-06,
      "loss": 0.2261,
      "num_tokens": 22899881.0,
      "step": 71
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.15733806901697214,
      "learning_rate": 9.730774568586076e-06,
      "loss": 0.2263,
      "num_tokens": 23240539.0,
      "step": 72
    },
    {
      "epoch": 0.4269005847953216,
      "grad_norm": 0.1679225698442003,
      "learning_rate": 9.720998902085354e-06,
      "loss": 0.2362,
      "num_tokens": 23546933.0,
      "step": 73
    },
    {
      "epoch": 0.4327485380116959,
      "grad_norm": 0.16689340356885685,
      "learning_rate": 9.71105457998655e-06,
      "loss": 0.2309,
      "num_tokens": 23867940.0,
      "step": 74
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 0.17313794861512294,
      "learning_rate": 9.70094199962821e-06,
      "loss": 0.2311,
      "num_tokens": 24191283.0,
      "step": 75
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.16976811460329427,
      "learning_rate": 9.690661565071875e-06,
      "loss": 0.2341,
      "num_tokens": 24504739.0,
      "step": 76
    },
    {
      "epoch": 0.4502923976608187,
      "grad_norm": 0.16719875175704804,
      "learning_rate": 9.68021368708591e-06,
      "loss": 0.2425,
      "num_tokens": 24846242.0,
      "step": 77
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 0.16237180095271134,
      "learning_rate": 9.66959878312911e-06,
      "loss": 0.219,
      "num_tokens": 25163117.0,
      "step": 78
    },
    {
      "epoch": 0.4619883040935672,
      "grad_norm": 0.16510212997973622,
      "learning_rate": 9.658817277334013e-06,
      "loss": 0.2304,
      "num_tokens": 25498031.0,
      "step": 79
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 0.16563608461998558,
      "learning_rate": 9.647869600489954e-06,
      "loss": 0.231,
      "num_tokens": 25812720.0,
      "step": 80
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 0.17555857386256016,
      "learning_rate": 9.63675619002585e-06,
      "loss": 0.2323,
      "num_tokens": 26123680.0,
      "step": 81
    },
    {
      "epoch": 0.47953216374269003,
      "grad_norm": 0.164333445701762,
      "learning_rate": 9.625477489992727e-06,
      "loss": 0.2138,
      "num_tokens": 26410771.0,
      "step": 82
    },
    {
      "epoch": 0.4853801169590643,
      "grad_norm": 0.17696917107757262,
      "learning_rate": 9.614033951045974e-06,
      "loss": 0.2286,
      "num_tokens": 26716396.0,
      "step": 83
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 0.15710163009015682,
      "learning_rate": 9.602426030427335e-06,
      "loss": 0.22,
      "num_tokens": 27038109.0,
      "step": 84
    },
    {
      "epoch": 0.49707602339181284,
      "grad_norm": 0.1677938058002079,
      "learning_rate": 9.590654191946645e-06,
      "loss": 0.2327,
      "num_tokens": 27372562.0,
      "step": 85
    },
    {
      "epoch": 0.5029239766081871,
      "grad_norm": 0.15756946980734074,
      "learning_rate": 9.578718905963289e-06,
      "loss": 0.2274,
      "num_tokens": 27719366.0,
      "step": 86
    },
    {
      "epoch": 0.5087719298245614,
      "grad_norm": 0.16483666302272912,
      "learning_rate": 9.566620649367418e-06,
      "loss": 0.23,
      "num_tokens": 28062728.0,
      "step": 87
    },
    {
      "epoch": 0.5146198830409356,
      "grad_norm": 0.16380030907396115,
      "learning_rate": 9.554359905560887e-06,
      "loss": 0.2139,
      "num_tokens": 28392885.0,
      "step": 88
    },
    {
      "epoch": 0.52046783625731,
      "grad_norm": 0.15375223243216413,
      "learning_rate": 9.541937164437942e-06,
      "loss": 0.2249,
      "num_tokens": 28727888.0,
      "step": 89
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.16323721706648206,
      "learning_rate": 9.52935292236565e-06,
      "loss": 0.2206,
      "num_tokens": 29040411.0,
      "step": 90
    },
    {
      "epoch": 0.5321637426900585,
      "grad_norm": 0.16395159555865288,
      "learning_rate": 9.516607682164058e-06,
      "loss": 0.2077,
      "num_tokens": 29363581.0,
      "step": 91
    },
    {
      "epoch": 0.5380116959064327,
      "grad_norm": 0.173187036283939,
      "learning_rate": 9.503701953086107e-06,
      "loss": 0.2325,
      "num_tokens": 29691373.0,
      "step": 92
    },
    {
      "epoch": 0.543859649122807,
      "grad_norm": 0.15592117720334775,
      "learning_rate": 9.490636250797288e-06,
      "loss": 0.2215,
      "num_tokens": 30026282.0,
      "step": 93
    },
    {
      "epoch": 0.5497076023391813,
      "grad_norm": 0.16780226146886296,
      "learning_rate": 9.477411097355025e-06,
      "loss": 0.2266,
      "num_tokens": 30357776.0,
      "step": 94
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.16080371447382497,
      "learning_rate": 9.464027021187833e-06,
      "loss": 0.2261,
      "num_tokens": 30675188.0,
      "step": 95
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.15669972327863108,
      "learning_rate": 9.450484557074188e-06,
      "loss": 0.2247,
      "num_tokens": 31041728.0,
      "step": 96
    },
    {
      "epoch": 0.5672514619883041,
      "grad_norm": 0.15864033792743365,
      "learning_rate": 9.43678424612117e-06,
      "loss": 0.2219,
      "num_tokens": 31373582.0,
      "step": 97
    },
    {
      "epoch": 0.5730994152046783,
      "grad_norm": 0.15897622365337738,
      "learning_rate": 9.422926635742834e-06,
      "loss": 0.2124,
      "num_tokens": 31693920.0,
      "step": 98
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 0.1541883977143332,
      "learning_rate": 9.40891227963835e-06,
      "loss": 0.2089,
      "num_tokens": 32016538.0,
      "step": 99
    },
    {
      "epoch": 0.5847953216374269,
      "grad_norm": 0.16113178998178,
      "learning_rate": 9.39474173776986e-06,
      "loss": 0.2131,
      "num_tokens": 32342868.0,
      "step": 100
    },
    {
      "epoch": 0.5906432748538012,
      "grad_norm": 0.1639032352257192,
      "learning_rate": 9.380415576340127e-06,
      "loss": 0.2126,
      "num_tokens": 32649722.0,
      "step": 101
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 0.15812907943305207,
      "learning_rate": 9.365934367769885e-06,
      "loss": 0.2046,
      "num_tokens": 32961156.0,
      "step": 102
    },
    {
      "epoch": 0.6023391812865497,
      "grad_norm": 0.16040845291570488,
      "learning_rate": 9.351298690674996e-06,
      "loss": 0.212,
      "num_tokens": 33286164.0,
      "step": 103
    },
    {
      "epoch": 0.6081871345029239,
      "grad_norm": 0.15564337683728058,
      "learning_rate": 9.33650912984331e-06,
      "loss": 0.2112,
      "num_tokens": 33634944.0,
      "step": 104
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 0.16026166188911017,
      "learning_rate": 9.321566276211304e-06,
      "loss": 0.2167,
      "num_tokens": 33940455.0,
      "step": 105
    },
    {
      "epoch": 0.6198830409356725,
      "grad_norm": 0.15084775438103953,
      "learning_rate": 9.306470726840472e-06,
      "loss": 0.212,
      "num_tokens": 34269432.0,
      "step": 106
    },
    {
      "epoch": 0.6257309941520468,
      "grad_norm": 0.16379797154749554,
      "learning_rate": 9.291223084893472e-06,
      "loss": 0.2259,
      "num_tokens": 34564983.0,
      "step": 107
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.1626726779429298,
      "learning_rate": 9.275823959610019e-06,
      "loss": 0.2068,
      "num_tokens": 34869398.0,
      "step": 108
    },
    {
      "epoch": 0.6374269005847953,
      "grad_norm": 0.16973276732555354,
      "learning_rate": 9.260273966282546e-06,
      "loss": 0.2103,
      "num_tokens": 35179769.0,
      "step": 109
    },
    {
      "epoch": 0.6432748538011696,
      "grad_norm": 0.16573716072448422,
      "learning_rate": 9.244573726231621e-06,
      "loss": 0.209,
      "num_tokens": 35489608.0,
      "step": 110
    },
    {
      "epoch": 0.6491228070175439,
      "grad_norm": 0.16034467135549915,
      "learning_rate": 9.22872386678111e-06,
      "loss": 0.2056,
      "num_tokens": 35795317.0,
      "step": 111
    },
    {
      "epoch": 0.6549707602339181,
      "grad_norm": 0.16859253078446698,
      "learning_rate": 9.212725021233135e-06,
      "loss": 0.2105,
      "num_tokens": 36108365.0,
      "step": 112
    },
    {
      "epoch": 0.6608187134502924,
      "grad_norm": 0.17271477988986808,
      "learning_rate": 9.196577828842738e-06,
      "loss": 0.2075,
      "num_tokens": 36435675.0,
      "step": 113
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.17273881432682334,
      "learning_rate": 9.180282934792369e-06,
      "loss": 0.2054,
      "num_tokens": 36737269.0,
      "step": 114
    },
    {
      "epoch": 0.672514619883041,
      "grad_norm": 0.15988223863731596,
      "learning_rate": 9.163840990166085e-06,
      "loss": 0.2011,
      "num_tokens": 37059436.0,
      "step": 115
    },
    {
      "epoch": 0.6783625730994152,
      "grad_norm": 0.1693196726503627,
      "learning_rate": 9.147252651923546e-06,
      "loss": 0.2202,
      "num_tokens": 37382958.0,
      "step": 116
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 0.1605754092944871,
      "learning_rate": 9.130518582873765e-06,
      "loss": 0.2169,
      "num_tokens": 37711301.0,
      "step": 117
    },
    {
      "epoch": 0.6900584795321637,
      "grad_norm": 0.16182274147996495,
      "learning_rate": 9.11363945164862e-06,
      "loss": 0.2022,
      "num_tokens": 38034357.0,
      "step": 118
    },
    {
      "epoch": 0.695906432748538,
      "grad_norm": 0.17280602584782606,
      "learning_rate": 9.096615932676138e-06,
      "loss": 0.2011,
      "num_tokens": 38349527.0,
      "step": 119
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.16010466565814827,
      "learning_rate": 9.079448706153554e-06,
      "loss": 0.2016,
      "num_tokens": 38654356.0,
      "step": 120
    },
    {
      "epoch": 0.7076023391812866,
      "grad_norm": 0.15767250824653006,
      "learning_rate": 9.062138458020128e-06,
      "loss": 0.1917,
      "num_tokens": 38962205.0,
      "step": 121
    },
    {
      "epoch": 0.7134502923976608,
      "grad_norm": 0.17133096215596827,
      "learning_rate": 9.044685879929734e-06,
      "loss": 0.2087,
      "num_tokens": 39267552.0,
      "step": 122
    },
    {
      "epoch": 0.7192982456140351,
      "grad_norm": 0.15684580395831532,
      "learning_rate": 9.027091669223228e-06,
      "loss": 0.203,
      "num_tokens": 39570208.0,
      "step": 123
    },
    {
      "epoch": 0.7251461988304093,
      "grad_norm": 0.16789387397403432,
      "learning_rate": 9.00935652890059e-06,
      "loss": 0.2099,
      "num_tokens": 39895778.0,
      "step": 124
    },
    {
      "epoch": 0.7309941520467836,
      "grad_norm": 0.15658978810688212,
      "learning_rate": 8.991481167592826e-06,
      "loss": 0.2061,
      "num_tokens": 40225470.0,
      "step": 125
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 0.15588015347201137,
      "learning_rate": 8.973466299533656e-06,
      "loss": 0.2047,
      "num_tokens": 40559255.0,
      "step": 126
    },
    {
      "epoch": 0.7426900584795322,
      "grad_norm": 0.1750982604863698,
      "learning_rate": 8.955312644530976e-06,
      "loss": 0.1996,
      "num_tokens": 40860587.0,
      "step": 127
    },
    {
      "epoch": 0.7485380116959064,
      "grad_norm": 0.15381841682641284,
      "learning_rate": 8.937020927938103e-06,
      "loss": 0.2001,
      "num_tokens": 41189624.0,
      "step": 128
    },
    {
      "epoch": 0.7543859649122807,
      "grad_norm": 0.1577155995424487,
      "learning_rate": 8.918591880624783e-06,
      "loss": 0.2005,
      "num_tokens": 41490687.0,
      "step": 129
    },
    {
      "epoch": 0.7602339181286549,
      "grad_norm": 0.15962057074829455,
      "learning_rate": 8.900026238947995e-06,
      "loss": 0.2115,
      "num_tokens": 41818157.0,
      "step": 130
    },
    {
      "epoch": 0.7660818713450293,
      "grad_norm": 0.15033669547133874,
      "learning_rate": 8.881324744722524e-06,
      "loss": 0.1945,
      "num_tokens": 42149764.0,
      "step": 131
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 0.15680523481040093,
      "learning_rate": 8.86248814519133e-06,
      "loss": 0.204,
      "num_tokens": 42452660.0,
      "step": 132
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.19538772884868966,
      "learning_rate": 8.843517192995673e-06,
      "loss": 0.2094,
      "num_tokens": 42762176.0,
      "step": 133
    },
    {
      "epoch": 0.783625730994152,
      "grad_norm": 0.1497802316166281,
      "learning_rate": 8.824412646145065e-06,
      "loss": 0.206,
      "num_tokens": 43122509.0,
      "step": 134
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 0.17792632602872682,
      "learning_rate": 8.805175267986955e-06,
      "loss": 0.2021,
      "num_tokens": 43438515.0,
      "step": 135
    },
    {
      "epoch": 0.7953216374269005,
      "grad_norm": 0.16345383837628785,
      "learning_rate": 8.785805827176256e-06,
      "loss": 0.213,
      "num_tokens": 43750567.0,
      "step": 136
    },
    {
      "epoch": 0.8011695906432749,
      "grad_norm": 0.15988232783718637,
      "learning_rate": 8.766305097644608e-06,
      "loss": 0.2076,
      "num_tokens": 44061251.0,
      "step": 137
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 0.148560033989183,
      "learning_rate": 8.746673858569478e-06,
      "loss": 0.2056,
      "num_tokens": 44402399.0,
      "step": 138
    },
    {
      "epoch": 0.8128654970760234,
      "grad_norm": 0.15498081570702754,
      "learning_rate": 8.726912894343e-06,
      "loss": 0.2011,
      "num_tokens": 44713760.0,
      "step": 139
    },
    {
      "epoch": 0.8187134502923976,
      "grad_norm": 0.16409619767618208,
      "learning_rate": 8.707022994540659e-06,
      "loss": 0.1985,
      "num_tokens": 45006733.0,
      "step": 140
    },
    {
      "epoch": 0.8245614035087719,
      "grad_norm": 0.16387952215184107,
      "learning_rate": 8.687004953889729e-06,
      "loss": 0.2117,
      "num_tokens": 45319558.0,
      "step": 141
    },
    {
      "epoch": 0.8304093567251462,
      "grad_norm": 0.14972932521892918,
      "learning_rate": 8.666859572237517e-06,
      "loss": 0.2017,
      "num_tokens": 45648581.0,
      "step": 142
    },
    {
      "epoch": 0.8362573099415205,
      "grad_norm": 0.14894137892754095,
      "learning_rate": 8.646587654519413e-06,
      "loss": 0.2011,
      "num_tokens": 45961313.0,
      "step": 143
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.15579100817213587,
      "learning_rate": 8.626190010726723e-06,
      "loss": 0.1851,
      "num_tokens": 46290840.0,
      "step": 144
    },
    {
      "epoch": 0.847953216374269,
      "grad_norm": 0.154512527802127,
      "learning_rate": 8.605667455874302e-06,
      "loss": 0.1903,
      "num_tokens": 46585564.0,
      "step": 145
    },
    {
      "epoch": 0.8538011695906432,
      "grad_norm": 0.16017530332563623,
      "learning_rate": 8.585020809967995e-06,
      "loss": 0.2066,
      "num_tokens": 46893844.0,
      "step": 146
    },
    {
      "epoch": 0.8596491228070176,
      "grad_norm": 0.16010504287695315,
      "learning_rate": 8.564250897971862e-06,
      "loss": 0.2151,
      "num_tokens": 47228507.0,
      "step": 147
    },
    {
      "epoch": 0.8654970760233918,
      "grad_norm": 0.15911701846573467,
      "learning_rate": 8.543358549775232e-06,
      "loss": 0.2029,
      "num_tokens": 47537550.0,
      "step": 148
    },
    {
      "epoch": 0.8713450292397661,
      "grad_norm": 0.1385770650249908,
      "learning_rate": 8.522344600159532e-06,
      "loss": 0.1892,
      "num_tokens": 47871896.0,
      "step": 149
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.14241522573672255,
      "learning_rate": 8.501209888764928e-06,
      "loss": 0.2016,
      "num_tokens": 48224890.0,
      "step": 150
    },
    {
      "epoch": 0.8830409356725146,
      "grad_norm": 0.16042004560579917,
      "learning_rate": 8.479955260056793e-06,
      "loss": 0.2293,
      "num_tokens": 48551394.0,
      "step": 151
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.1606415549333606,
      "learning_rate": 8.458581563291948e-06,
      "loss": 0.1993,
      "num_tokens": 48869584.0,
      "step": 152
    },
    {
      "epoch": 0.8947368421052632,
      "grad_norm": 0.13692085860159872,
      "learning_rate": 8.437089652484735e-06,
      "loss": 0.188,
      "num_tokens": 49220358.0,
      "step": 153
    },
    {
      "epoch": 0.9005847953216374,
      "grad_norm": 0.15029963686711117,
      "learning_rate": 8.415480386372901e-06,
      "loss": 0.2176,
      "num_tokens": 49555633.0,
      "step": 154
    },
    {
      "epoch": 0.9064327485380117,
      "grad_norm": 0.15377050265299294,
      "learning_rate": 8.393754628383274e-06,
      "loss": 0.2078,
      "num_tokens": 49857399.0,
      "step": 155
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 0.1600390896381573,
      "learning_rate": 8.371913246597272e-06,
      "loss": 0.1987,
      "num_tokens": 50208717.0,
      "step": 156
    },
    {
      "epoch": 0.9181286549707602,
      "grad_norm": 0.1709173684661271,
      "learning_rate": 8.349957113716213e-06,
      "loss": 0.212,
      "num_tokens": 50502126.0,
      "step": 157
    },
    {
      "epoch": 0.9239766081871345,
      "grad_norm": 0.16675835187365423,
      "learning_rate": 8.327887107026445e-06,
      "loss": 0.2237,
      "num_tokens": 50820497.0,
      "step": 158
    },
    {
      "epoch": 0.9298245614035088,
      "grad_norm": 0.14395552142792745,
      "learning_rate": 8.305704108364301e-06,
      "loss": 0.2076,
      "num_tokens": 51154766.0,
      "step": 159
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 0.15573171280863216,
      "learning_rate": 8.283409004080853e-06,
      "loss": 0.2114,
      "num_tokens": 51491802.0,
      "step": 160
    },
    {
      "epoch": 0.9415204678362573,
      "grad_norm": 0.158518456781856,
      "learning_rate": 8.261002685006503e-06,
      "loss": 0.2224,
      "num_tokens": 51818025.0,
      "step": 161
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.15531715834460813,
      "learning_rate": 8.238486046415385e-06,
      "loss": 0.1937,
      "num_tokens": 52118378.0,
      "step": 162
    },
    {
      "epoch": 0.9532163742690059,
      "grad_norm": 0.1515317024708392,
      "learning_rate": 8.2158599879896e-06,
      "loss": 0.1968,
      "num_tokens": 52428129.0,
      "step": 163
    },
    {
      "epoch": 0.9590643274853801,
      "grad_norm": 0.14424883914854034,
      "learning_rate": 8.19312541378326e-06,
      "loss": 0.193,
      "num_tokens": 52735470.0,
      "step": 164
    },
    {
      "epoch": 0.9649122807017544,
      "grad_norm": 0.15498488683654527,
      "learning_rate": 8.170283232186365e-06,
      "loss": 0.1943,
      "num_tokens": 53051592.0,
      "step": 165
    },
    {
      "epoch": 0.9707602339181286,
      "grad_norm": 0.1754541926428641,
      "learning_rate": 8.14733435588852e-06,
      "loss": 0.2214,
      "num_tokens": 53343217.0,
      "step": 166
    },
    {
      "epoch": 0.9766081871345029,
      "grad_norm": 0.14868906024546139,
      "learning_rate": 8.12427970184245e-06,
      "loss": 0.1935,
      "num_tokens": 53658830.0,
      "step": 167
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 0.1500168809044866,
      "learning_rate": 8.101120191227374e-06,
      "loss": 0.1981,
      "num_tokens": 53999419.0,
      "step": 168
    },
    {
      "epoch": 0.9883040935672515,
      "grad_norm": 0.14699416276573674,
      "learning_rate": 8.07785674941219e-06,
      "loss": 0.194,
      "num_tokens": 54309901.0,
      "step": 169
    },
    {
      "epoch": 0.9941520467836257,
      "grad_norm": 0.15232473722185103,
      "learning_rate": 8.054490305918512e-06,
      "loss": 0.1955,
      "num_tokens": 54612844.0,
      "step": 170
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.1607048554407368,
      "learning_rate": 8.031021794383513e-06,
      "loss": 0.2092,
      "num_tokens": 54926459.0,
      "step": 171
    },
    {
      "epoch": 1.0058479532163742,
      "grad_norm": 0.1625217088750646,
      "learning_rate": 8.007452152522639e-06,
      "loss": 0.1875,
      "num_tokens": 55222609.0,
      "step": 172
    },
    {
      "epoch": 1.0116959064327484,
      "grad_norm": 0.15670723402140246,
      "learning_rate": 7.983782322092126e-06,
      "loss": 0.1938,
      "num_tokens": 55537898.0,
      "step": 173
    },
    {
      "epoch": 1.0175438596491229,
      "grad_norm": 0.14242775964832494,
      "learning_rate": 7.960013248851375e-06,
      "loss": 0.1882,
      "num_tokens": 55862238.0,
      "step": 174
    },
    {
      "epoch": 1.023391812865497,
      "grad_norm": 0.1606453358565539,
      "learning_rate": 7.936145882525174e-06,
      "loss": 0.1877,
      "num_tokens": 56180559.0,
      "step": 175
    },
    {
      "epoch": 1.0292397660818713,
      "grad_norm": 0.15292057131184103,
      "learning_rate": 7.91218117676573e-06,
      "loss": 0.1783,
      "num_tokens": 56530315.0,
      "step": 176
    },
    {
      "epoch": 1.0350877192982457,
      "grad_norm": 0.14722221394043097,
      "learning_rate": 7.888120089114586e-06,
      "loss": 0.1758,
      "num_tokens": 56837967.0,
      "step": 177
    },
    {
      "epoch": 1.04093567251462,
      "grad_norm": 0.15795692601491945,
      "learning_rate": 7.863963580964344e-06,
      "loss": 0.1772,
      "num_tokens": 57149693.0,
      "step": 178
    },
    {
      "epoch": 1.0467836257309941,
      "grad_norm": 0.1700985282278579,
      "learning_rate": 7.839712617520263e-06,
      "loss": 0.1897,
      "num_tokens": 57481658.0,
      "step": 179
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 0.14557972709377917,
      "learning_rate": 7.815368167761686e-06,
      "loss": 0.1797,
      "num_tokens": 57795613.0,
      "step": 180
    },
    {
      "epoch": 1.0584795321637426,
      "grad_norm": 0.15501104048829578,
      "learning_rate": 7.790931204403323e-06,
      "loss": 0.177,
      "num_tokens": 58094562.0,
      "step": 181
    },
    {
      "epoch": 1.064327485380117,
      "grad_norm": 0.1521089989961407,
      "learning_rate": 7.766402703856391e-06,
      "loss": 0.1748,
      "num_tokens": 58418586.0,
      "step": 182
    },
    {
      "epoch": 1.0701754385964912,
      "grad_norm": 0.16486785923579997,
      "learning_rate": 7.741783646189597e-06,
      "loss": 0.1927,
      "num_tokens": 58745927.0,
      "step": 183
    },
    {
      "epoch": 1.0760233918128654,
      "grad_norm": 0.15410381183903402,
      "learning_rate": 7.717075015089976e-06,
      "loss": 0.1884,
      "num_tokens": 59070496.0,
      "step": 184
    },
    {
      "epoch": 1.0818713450292399,
      "grad_norm": 0.1444493695200652,
      "learning_rate": 7.692277797823585e-06,
      "loss": 0.1755,
      "num_tokens": 59388680.0,
      "step": 185
    },
    {
      "epoch": 1.087719298245614,
      "grad_norm": 0.15330407620774641,
      "learning_rate": 7.667392985196064e-06,
      "loss": 0.1866,
      "num_tokens": 59707236.0,
      "step": 186
    },
    {
      "epoch": 1.0935672514619883,
      "grad_norm": 0.16477562582655433,
      "learning_rate": 7.64242157151304e-06,
      "loss": 0.1999,
      "num_tokens": 60042655.0,
      "step": 187
    },
    {
      "epoch": 1.0994152046783625,
      "grad_norm": 0.140060196586728,
      "learning_rate": 7.6173645545404e-06,
      "loss": 0.1834,
      "num_tokens": 60397091.0,
      "step": 188
    },
    {
      "epoch": 1.1052631578947367,
      "grad_norm": 0.1525186599047059,
      "learning_rate": 7.5922229354644195e-06,
      "loss": 0.1811,
      "num_tokens": 60707243.0,
      "step": 189
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.14180526703114305,
      "learning_rate": 7.56699771885177e-06,
      "loss": 0.1789,
      "num_tokens": 61056021.0,
      "step": 190
    },
    {
      "epoch": 1.1169590643274854,
      "grad_norm": 0.14606069061084653,
      "learning_rate": 7.541689912609365e-06,
      "loss": 0.1833,
      "num_tokens": 61381476.0,
      "step": 191
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 0.1452299835582357,
      "learning_rate": 7.516300527944104e-06,
      "loss": 0.1889,
      "num_tokens": 61710931.0,
      "step": 192
    },
    {
      "epoch": 1.128654970760234,
      "grad_norm": 0.1607339684687444,
      "learning_rate": 7.4908305793224565e-06,
      "loss": 0.1891,
      "num_tokens": 62048426.0,
      "step": 193
    },
    {
      "epoch": 1.1345029239766082,
      "grad_norm": 0.15970871017649693,
      "learning_rate": 7.465281084429931e-06,
      "loss": 0.1841,
      "num_tokens": 62347583.0,
      "step": 194
    },
    {
      "epoch": 1.1403508771929824,
      "grad_norm": 0.15135926518093104,
      "learning_rate": 7.4396530641304135e-06,
      "loss": 0.1817,
      "num_tokens": 62662619.0,
      "step": 195
    },
    {
      "epoch": 1.1461988304093567,
      "grad_norm": 0.1463984323420409,
      "learning_rate": 7.413947542425377e-06,
      "loss": 0.1795,
      "num_tokens": 62990146.0,
      "step": 196
    },
    {
      "epoch": 1.1520467836257309,
      "grad_norm": 0.15028061491082353,
      "learning_rate": 7.388165546412967e-06,
      "loss": 0.1809,
      "num_tokens": 63314531.0,
      "step": 197
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 0.15759301916806728,
      "learning_rate": 7.362308106246956e-06,
      "loss": 0.1842,
      "num_tokens": 63647247.0,
      "step": 198
    },
    {
      "epoch": 1.1637426900584795,
      "grad_norm": 0.14433827296829588,
      "learning_rate": 7.336376255095592e-06,
      "loss": 0.1758,
      "num_tokens": 63974328.0,
      "step": 199
    },
    {
      "epoch": 1.1695906432748537,
      "grad_norm": 0.1489036598644256,
      "learning_rate": 7.3103710291003134e-06,
      "loss": 0.1832,
      "num_tokens": 64295392.0,
      "step": 200
    },
    {
      "epoch": 1.1754385964912282,
      "grad_norm": 0.1621435575204086,
      "learning_rate": 7.284293467334344e-06,
      "loss": 0.1829,
      "num_tokens": 64601120.0,
      "step": 201
    },
    {
      "epoch": 1.1812865497076024,
      "grad_norm": 0.15685350805242304,
      "learning_rate": 7.258144611761181e-06,
      "loss": 0.1828,
      "num_tokens": 64910553.0,
      "step": 202
    },
    {
      "epoch": 1.1871345029239766,
      "grad_norm": 0.1537822114754735,
      "learning_rate": 7.23192550719296e-06,
      "loss": 0.1786,
      "num_tokens": 65230586.0,
      "step": 203
    },
    {
      "epoch": 1.1929824561403508,
      "grad_norm": 0.1522958629898793,
      "learning_rate": 7.2056372012487065e-06,
      "loss": 0.1858,
      "num_tokens": 65576822.0,
      "step": 204
    },
    {
      "epoch": 1.198830409356725,
      "grad_norm": 0.15072282593856123,
      "learning_rate": 7.179280744312481e-06,
      "loss": 0.1717,
      "num_tokens": 65892198.0,
      "step": 205
    },
    {
      "epoch": 1.2046783625730995,
      "grad_norm": 0.14679430331251794,
      "learning_rate": 7.152857189491406e-06,
      "loss": 0.1709,
      "num_tokens": 66218113.0,
      "step": 206
    },
    {
      "epoch": 1.2105263157894737,
      "grad_norm": 0.14528524091480893,
      "learning_rate": 7.126367592573589e-06,
      "loss": 0.172,
      "num_tokens": 66560316.0,
      "step": 207
    },
    {
      "epoch": 1.2163742690058479,
      "grad_norm": 0.1513531832369065,
      "learning_rate": 7.099813011985936e-06,
      "loss": 0.1867,
      "num_tokens": 66886426.0,
      "step": 208
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 0.14889506905441677,
      "learning_rate": 7.073194508751863e-06,
      "loss": 0.184,
      "num_tokens": 67205115.0,
      "step": 209
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 0.1654317181387804,
      "learning_rate": 7.046513146448899e-06,
      "loss": 0.1892,
      "num_tokens": 67509071.0,
      "step": 210
    },
    {
      "epoch": 1.2339181286549707,
      "grad_norm": 0.1510266755197108,
      "learning_rate": 7.019769991166189e-06,
      "loss": 0.1788,
      "num_tokens": 67841682.0,
      "step": 211
    },
    {
      "epoch": 1.239766081871345,
      "grad_norm": 0.1419685060740966,
      "learning_rate": 6.992966111461903e-06,
      "loss": 0.1685,
      "num_tokens": 68147715.0,
      "step": 212
    },
    {
      "epoch": 1.2456140350877192,
      "grad_norm": 0.14993568993277867,
      "learning_rate": 6.966102578320531e-06,
      "loss": 0.1764,
      "num_tokens": 68463460.0,
      "step": 213
    },
    {
      "epoch": 1.2514619883040936,
      "grad_norm": 0.14659321843622847,
      "learning_rate": 6.9391804651100924e-06,
      "loss": 0.1897,
      "num_tokens": 68799959.0,
      "step": 214
    },
    {
      "epoch": 1.2573099415204678,
      "grad_norm": 0.15073229254770368,
      "learning_rate": 6.912200847539261e-06,
      "loss": 0.172,
      "num_tokens": 69111397.0,
      "step": 215
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 0.15759494389625772,
      "learning_rate": 6.885164803614366e-06,
      "loss": 0.1838,
      "num_tokens": 69392630.0,
      "step": 216
    },
    {
      "epoch": 1.2690058479532165,
      "grad_norm": 0.14630639885005334,
      "learning_rate": 6.858073413596324e-06,
      "loss": 0.1807,
      "num_tokens": 69701641.0,
      "step": 217
    },
    {
      "epoch": 1.2748538011695907,
      "grad_norm": 0.14730200293827667,
      "learning_rate": 6.830927759957487e-06,
      "loss": 0.1813,
      "num_tokens": 70030091.0,
      "step": 218
    },
    {
      "epoch": 1.280701754385965,
      "grad_norm": 0.14753719595946904,
      "learning_rate": 6.80372892733837e-06,
      "loss": 0.1804,
      "num_tokens": 70348775.0,
      "step": 219
    },
    {
      "epoch": 1.286549707602339,
      "grad_norm": 0.16483571370877223,
      "learning_rate": 6.776478002504335e-06,
      "loss": 0.1859,
      "num_tokens": 70651692.0,
      "step": 220
    },
    {
      "epoch": 1.2923976608187133,
      "grad_norm": 0.16648842833487107,
      "learning_rate": 6.7491760743021535e-06,
      "loss": 0.1903,
      "num_tokens": 70956682.0,
      "step": 221
    },
    {
      "epoch": 1.2982456140350878,
      "grad_norm": 0.14671697438686584,
      "learning_rate": 6.721824233616503e-06,
      "loss": 0.1758,
      "num_tokens": 71265203.0,
      "step": 222
    },
    {
      "epoch": 1.304093567251462,
      "grad_norm": 0.15188899959419136,
      "learning_rate": 6.694423573326382e-06,
      "loss": 0.1782,
      "num_tokens": 71583993.0,
      "step": 223
    },
    {
      "epoch": 1.3099415204678362,
      "grad_norm": 0.14838740052418056,
      "learning_rate": 6.666975188261437e-06,
      "loss": 0.182,
      "num_tokens": 71899719.0,
      "step": 224
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.1576944297988978,
      "learning_rate": 6.639480175158227e-06,
      "loss": 0.1784,
      "num_tokens": 72202094.0,
      "step": 225
    },
    {
      "epoch": 1.3216374269005848,
      "grad_norm": 0.15597436870104375,
      "learning_rate": 6.611939632616394e-06,
      "loss": 0.1784,
      "num_tokens": 72516371.0,
      "step": 226
    },
    {
      "epoch": 1.327485380116959,
      "grad_norm": 0.15325246999833303,
      "learning_rate": 6.584354661054765e-06,
      "loss": 0.1842,
      "num_tokens": 72828007.0,
      "step": 227
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.14935801239745722,
      "learning_rate": 6.556726362667394e-06,
      "loss": 0.1833,
      "num_tokens": 73134243.0,
      "step": 228
    },
    {
      "epoch": 1.3391812865497075,
      "grad_norm": 0.16481075845453566,
      "learning_rate": 6.529055841379509e-06,
      "loss": 0.176,
      "num_tokens": 73436138.0,
      "step": 229
    },
    {
      "epoch": 1.345029239766082,
      "grad_norm": 0.14125268538033928,
      "learning_rate": 6.501344202803415e-06,
      "loss": 0.1708,
      "num_tokens": 73760046.0,
      "step": 230
    },
    {
      "epoch": 1.3508771929824561,
      "grad_norm": 0.1501570731496053,
      "learning_rate": 6.473592554194311e-06,
      "loss": 0.1826,
      "num_tokens": 74077480.0,
      "step": 231
    },
    {
      "epoch": 1.3567251461988303,
      "grad_norm": 0.15771910225549807,
      "learning_rate": 6.445802004406047e-06,
      "loss": 0.1922,
      "num_tokens": 74423874.0,
      "step": 232
    },
    {
      "epoch": 1.3625730994152048,
      "grad_norm": 0.15600805460262265,
      "learning_rate": 6.417973663846826e-06,
      "loss": 0.1749,
      "num_tokens": 74745816.0,
      "step": 233
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 0.14516377176873183,
      "learning_rate": 6.390108644434828e-06,
      "loss": 0.18,
      "num_tokens": 75092262.0,
      "step": 234
    },
    {
      "epoch": 1.3742690058479532,
      "grad_norm": 0.14392359169053118,
      "learning_rate": 6.362208059553786e-06,
      "loss": 0.1799,
      "num_tokens": 75442533.0,
      "step": 235
    },
    {
      "epoch": 1.3801169590643274,
      "grad_norm": 0.1548508531809334,
      "learning_rate": 6.334273024008499e-06,
      "loss": 0.1705,
      "num_tokens": 75775480.0,
      "step": 236
    },
    {
      "epoch": 1.3859649122807016,
      "grad_norm": 0.15386739061806035,
      "learning_rate": 6.306304653980286e-06,
      "loss": 0.1722,
      "num_tokens": 76066461.0,
      "step": 237
    },
    {
      "epoch": 1.391812865497076,
      "grad_norm": 0.14831080775519306,
      "learning_rate": 6.278304066982391e-06,
      "loss": 0.1836,
      "num_tokens": 76401700.0,
      "step": 238
    },
    {
      "epoch": 1.3976608187134503,
      "grad_norm": 0.14755841590724592,
      "learning_rate": 6.250272381815331e-06,
      "loss": 0.1802,
      "num_tokens": 76714274.0,
      "step": 239
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.15910670160937837,
      "learning_rate": 6.222210718522187e-06,
      "loss": 0.2031,
      "num_tokens": 77028246.0,
      "step": 240
    },
    {
      "epoch": 1.409356725146199,
      "grad_norm": 0.15280436173000247,
      "learning_rate": 6.19412019834386e-06,
      "loss": 0.1742,
      "num_tokens": 77364346.0,
      "step": 241
    },
    {
      "epoch": 1.4152046783625731,
      "grad_norm": 0.13887335477707105,
      "learning_rate": 6.166001943674266e-06,
      "loss": 0.1785,
      "num_tokens": 77748583.0,
      "step": 242
    },
    {
      "epoch": 1.4210526315789473,
      "grad_norm": 0.1577201352102885,
      "learning_rate": 6.137857078015487e-06,
      "loss": 0.1863,
      "num_tokens": 78064140.0,
      "step": 243
    },
    {
      "epoch": 1.4269005847953216,
      "grad_norm": 0.14303385830957374,
      "learning_rate": 6.109686725932882e-06,
      "loss": 0.1813,
      "num_tokens": 78411157.0,
      "step": 244
    },
    {
      "epoch": 1.4327485380116958,
      "grad_norm": 0.14790866721582488,
      "learning_rate": 6.081492013010154e-06,
      "loss": 0.1778,
      "num_tokens": 78723584.0,
      "step": 245
    },
    {
      "epoch": 1.4385964912280702,
      "grad_norm": 0.14901104148022398,
      "learning_rate": 6.0532740658043785e-06,
      "loss": 0.1807,
      "num_tokens": 79054107.0,
      "step": 246
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.1485395120018961,
      "learning_rate": 6.025034011800989e-06,
      "loss": 0.187,
      "num_tokens": 79386694.0,
      "step": 247
    },
    {
      "epoch": 1.4502923976608186,
      "grad_norm": 0.14647752236417627,
      "learning_rate": 5.996772979368715e-06,
      "loss": 0.1849,
      "num_tokens": 79718178.0,
      "step": 248
    },
    {
      "epoch": 1.456140350877193,
      "grad_norm": 0.15032391233353223,
      "learning_rate": 5.968492097714519e-06,
      "loss": 0.1744,
      "num_tokens": 80013286.0,
      "step": 249
    },
    {
      "epoch": 1.4619883040935673,
      "grad_norm": 0.13111163977489035,
      "learning_rate": 5.940192496838456e-06,
      "loss": 0.1683,
      "num_tokens": 80358177.0,
      "step": 250
    },
    {
      "epoch": 1.4678362573099415,
      "grad_norm": 0.14507217241507256,
      "learning_rate": 5.911875307488543e-06,
      "loss": 0.1697,
      "num_tokens": 80664979.0,
      "step": 251
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 0.14371510606580692,
      "learning_rate": 5.883541661115555e-06,
      "loss": 0.183,
      "num_tokens": 81008531.0,
      "step": 252
    },
    {
      "epoch": 1.47953216374269,
      "grad_norm": 0.14575959771926755,
      "learning_rate": 5.855192689827838e-06,
      "loss": 0.1784,
      "num_tokens": 81327068.0,
      "step": 253
    },
    {
      "epoch": 1.4853801169590644,
      "grad_norm": 0.14386473495114957,
      "learning_rate": 5.8268295263460625e-06,
      "loss": 0.186,
      "num_tokens": 81660168.0,
      "step": 254
    },
    {
      "epoch": 1.4912280701754386,
      "grad_norm": 0.15665862540575096,
      "learning_rate": 5.798453303957968e-06,
      "loss": 0.1852,
      "num_tokens": 81988344.0,
      "step": 255
    },
    {
      "epoch": 1.4970760233918128,
      "grad_norm": 0.1486254433584565,
      "learning_rate": 5.77006515647308e-06,
      "loss": 0.1815,
      "num_tokens": 82292457.0,
      "step": 256
    },
    {
      "epoch": 1.5029239766081872,
      "grad_norm": 0.15218761718926124,
      "learning_rate": 5.741666218177402e-06,
      "loss": 0.1754,
      "num_tokens": 82577890.0,
      "step": 257
    },
    {
      "epoch": 1.5087719298245614,
      "grad_norm": 0.1538621360954969,
      "learning_rate": 5.7132576237881075e-06,
      "loss": 0.1855,
      "num_tokens": 82874407.0,
      "step": 258
    },
    {
      "epoch": 1.5146198830409356,
      "grad_norm": 0.16534859264165339,
      "learning_rate": 5.684840508408183e-06,
      "loss": 0.187,
      "num_tokens": 83181722.0,
      "step": 259
    },
    {
      "epoch": 1.52046783625731,
      "grad_norm": 0.15058626722493648,
      "learning_rate": 5.656416007481089e-06,
      "loss": 0.1793,
      "num_tokens": 83483808.0,
      "step": 260
    },
    {
      "epoch": 1.526315789473684,
      "grad_norm": 0.14392786696974194,
      "learning_rate": 5.627985256745384e-06,
      "loss": 0.1773,
      "num_tokens": 83826918.0,
      "step": 261
    },
    {
      "epoch": 1.5321637426900585,
      "grad_norm": 0.1547539344312234,
      "learning_rate": 5.5995493921893415e-06,
      "loss": 0.1747,
      "num_tokens": 84129500.0,
      "step": 262
    },
    {
      "epoch": 1.5380116959064327,
      "grad_norm": 0.15476418511066206,
      "learning_rate": 5.571109550005571e-06,
      "loss": 0.1831,
      "num_tokens": 84454231.0,
      "step": 263
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 0.15292624983775452,
      "learning_rate": 5.542666866545609e-06,
      "loss": 0.1772,
      "num_tokens": 84758320.0,
      "step": 264
    },
    {
      "epoch": 1.5497076023391814,
      "grad_norm": 0.14323940203700627,
      "learning_rate": 5.5142224782745175e-06,
      "loss": 0.1742,
      "num_tokens": 85064629.0,
      "step": 265
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 0.1569122030468075,
      "learning_rate": 5.485777521725485e-06,
      "loss": 0.1823,
      "num_tokens": 85358431.0,
      "step": 266
    },
    {
      "epoch": 1.5614035087719298,
      "grad_norm": 0.1571283435279191,
      "learning_rate": 5.457333133454394e-06,
      "loss": 0.194,
      "num_tokens": 85681659.0,
      "step": 267
    },
    {
      "epoch": 1.5672514619883042,
      "grad_norm": 0.15050727661326063,
      "learning_rate": 5.4288904499944304e-06,
      "loss": 0.1843,
      "num_tokens": 86001433.0,
      "step": 268
    },
    {
      "epoch": 1.5730994152046782,
      "grad_norm": 0.13840027248721531,
      "learning_rate": 5.40045060781066e-06,
      "loss": 0.1828,
      "num_tokens": 86349235.0,
      "step": 269
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.154830903482515,
      "learning_rate": 5.3720147432546175e-06,
      "loss": 0.1891,
      "num_tokens": 86670991.0,
      "step": 270
    },
    {
      "epoch": 1.5847953216374269,
      "grad_norm": 0.14875604221537664,
      "learning_rate": 5.343583992518911e-06,
      "loss": 0.1838,
      "num_tokens": 87005937.0,
      "step": 271
    },
    {
      "epoch": 1.590643274853801,
      "grad_norm": 0.1550420196686663,
      "learning_rate": 5.315159491591818e-06,
      "loss": 0.1932,
      "num_tokens": 87355743.0,
      "step": 272
    },
    {
      "epoch": 1.5964912280701755,
      "grad_norm": 0.16190605189165996,
      "learning_rate": 5.286742376211894e-06,
      "loss": 0.1873,
      "num_tokens": 87666062.0,
      "step": 273
    },
    {
      "epoch": 1.6023391812865497,
      "grad_norm": 0.1499999734628863,
      "learning_rate": 5.2583337818226e-06,
      "loss": 0.179,
      "num_tokens": 88010557.0,
      "step": 274
    },
    {
      "epoch": 1.608187134502924,
      "grad_norm": 0.14174519866822582,
      "learning_rate": 5.229934843526922e-06,
      "loss": 0.1839,
      "num_tokens": 88348530.0,
      "step": 275
    },
    {
      "epoch": 1.6140350877192984,
      "grad_norm": 0.14514733616156453,
      "learning_rate": 5.201546696042033e-06,
      "loss": 0.1732,
      "num_tokens": 88660232.0,
      "step": 276
    },
    {
      "epoch": 1.6198830409356724,
      "grad_norm": 0.15247902901919175,
      "learning_rate": 5.173170473653939e-06,
      "loss": 0.1838,
      "num_tokens": 88986178.0,
      "step": 277
    },
    {
      "epoch": 1.6257309941520468,
      "grad_norm": 0.15161619030379697,
      "learning_rate": 5.1448073101721644e-06,
      "loss": 0.184,
      "num_tokens": 89306790.0,
      "step": 278
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.15076713301794256,
      "learning_rate": 5.1164583388844476e-06,
      "loss": 0.1764,
      "num_tokens": 89608787.0,
      "step": 279
    },
    {
      "epoch": 1.6374269005847952,
      "grad_norm": 0.15081569071358755,
      "learning_rate": 5.0881246925114595e-06,
      "loss": 0.1841,
      "num_tokens": 89925196.0,
      "step": 280
    },
    {
      "epoch": 1.6432748538011697,
      "grad_norm": 0.14667932336134215,
      "learning_rate": 5.0598075031615445e-06,
      "loss": 0.1714,
      "num_tokens": 90246158.0,
      "step": 281
    },
    {
      "epoch": 1.6491228070175439,
      "grad_norm": 0.14238827156504316,
      "learning_rate": 5.031507902285483e-06,
      "loss": 0.1675,
      "num_tokens": 90530735.0,
      "step": 282
    },
    {
      "epoch": 1.654970760233918,
      "grad_norm": 0.15453119377842958,
      "learning_rate": 5.003227020631287e-06,
      "loss": 0.1822,
      "num_tokens": 90867029.0,
      "step": 283
    },
    {
      "epoch": 1.6608187134502925,
      "grad_norm": 0.14188421149596725,
      "learning_rate": 4.974965988199015e-06,
      "loss": 0.1782,
      "num_tokens": 91197724.0,
      "step": 284
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.13924871637746586,
      "learning_rate": 4.946725934195622e-06,
      "loss": 0.1687,
      "num_tokens": 91501722.0,
      "step": 285
    },
    {
      "epoch": 1.672514619883041,
      "grad_norm": 0.13972442531890047,
      "learning_rate": 4.918507986989848e-06,
      "loss": 0.1721,
      "num_tokens": 91828252.0,
      "step": 286
    },
    {
      "epoch": 1.6783625730994152,
      "grad_norm": 0.14136406326583859,
      "learning_rate": 4.890313274067121e-06,
      "loss": 0.1787,
      "num_tokens": 92160609.0,
      "step": 287
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.14317191939225465,
      "learning_rate": 4.862142921984514e-06,
      "loss": 0.1816,
      "num_tokens": 92492656.0,
      "step": 288
    },
    {
      "epoch": 1.6900584795321638,
      "grad_norm": 0.15273877472629271,
| "learning_rate": 4.8339980563257345e-06, | |
| "loss": 0.1778, | |
| "num_tokens": 92783619.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.695906432748538, | |
| "grad_norm": 0.1488984953802957, | |
| "learning_rate": 4.80587980165614e-06, | |
| "loss": 0.1605, | |
| "num_tokens": 93088774.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.7017543859649122, | |
| "grad_norm": 0.14456947085083469, | |
| "learning_rate": 4.7777892814778145e-06, | |
| "loss": 0.185, | |
| "num_tokens": 93417888.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.7076023391812867, | |
| "grad_norm": 0.13947315988135284, | |
| "learning_rate": 4.749727618184672e-06, | |
| "loss": 0.1673, | |
| "num_tokens": 93733741.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.7134502923976607, | |
| "grad_norm": 0.14851734535481514, | |
| "learning_rate": 4.72169593301761e-06, | |
| "loss": 0.1819, | |
| "num_tokens": 94047466.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.719298245614035, | |
| "grad_norm": 0.14831214767698989, | |
| "learning_rate": 4.693695346019715e-06, | |
| "loss": 0.1771, | |
| "num_tokens": 94353138.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.7251461988304093, | |
| "grad_norm": 0.14962487368007496, | |
| "learning_rate": 4.665726975991502e-06, | |
| "loss": 0.1796, | |
| "num_tokens": 94660877.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.7309941520467835, | |
| "grad_norm": 0.15275250425533088, | |
| "learning_rate": 4.637791940446216e-06, | |
| "loss": 0.1819, | |
| "num_tokens": 94977362.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 0.1444275240489397, | |
| "learning_rate": 4.609891355565172e-06, | |
| "loss": 0.1746, | |
| "num_tokens": 95287766.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.7426900584795322, | |
| "grad_norm": 0.14585881193254274, | |
| "learning_rate": 4.582026336153175e-06, | |
| "loss": 0.1751, | |
| "num_tokens": 95580594.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.7485380116959064, | |
| "grad_norm": 0.14505549895976488, | |
| "learning_rate": 4.554197995593953e-06, | |
| "loss": 0.1811, | |
| "num_tokens": 95914105.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "grad_norm": 0.14480466196132438, | |
| "learning_rate": 4.526407445805692e-06, | |
| "loss": 0.1734, | |
| "num_tokens": 96221354.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.7602339181286548, | |
| "grad_norm": 0.14878097366426038, | |
| "learning_rate": 4.4986557971965865e-06, | |
| "loss": 0.1717, | |
| "num_tokens": 96503140.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.7660818713450293, | |
| "grad_norm": 0.13546319756214165, | |
| "learning_rate": 4.4709441586204924e-06, | |
| "loss": 0.1787, | |
| "num_tokens": 96862132.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.7719298245614035, | |
| "grad_norm": 0.1545758606312762, | |
| "learning_rate": 4.443273637332607e-06, | |
| "loss": 0.1817, | |
| "num_tokens": 97181205.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 0.14857553617463332, | |
| "learning_rate": 4.415645338945236e-06, | |
| "loss": 0.1835, | |
| "num_tokens": 97496038.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.7836257309941521, | |
| "grad_norm": 0.1433152779712938, | |
| "learning_rate": 4.388060367383607e-06, | |
| "loss": 0.1807, | |
| "num_tokens": 97811641.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.7894736842105263, | |
| "grad_norm": 0.15284420636801233, | |
| "learning_rate": 4.3605198248417745e-06, | |
| "loss": 0.1904, | |
| "num_tokens": 98139634.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.7953216374269005, | |
| "grad_norm": 0.15285620589846677, | |
| "learning_rate": 4.333024811738565e-06, | |
| "loss": 0.1794, | |
| "num_tokens": 98454687.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.801169590643275, | |
| "grad_norm": 0.14480146717017858, | |
| "learning_rate": 4.305576426673621e-06, | |
| "loss": 0.1733, | |
| "num_tokens": 98768020.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.807017543859649, | |
| "grad_norm": 0.14587759969006922, | |
| "learning_rate": 4.278175766383499e-06, | |
| "loss": 0.1788, | |
| "num_tokens": 99089607.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.8128654970760234, | |
| "grad_norm": 0.14456314742064028, | |
| "learning_rate": 4.250823925697848e-06, | |
| "loss": 0.1728, | |
| "num_tokens": 99396998.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.8187134502923976, | |
| "grad_norm": 0.14323326579292064, | |
| "learning_rate": 4.223521997495665e-06, | |
| "loss": 0.1774, | |
| "num_tokens": 99716575.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.8245614035087718, | |
| "grad_norm": 0.1325445041745875, | |
| "learning_rate": 4.196271072661631e-06, | |
| "loss": 0.168, | |
| "num_tokens": 100064416.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.8304093567251463, | |
| "grad_norm": 0.13327008176494723, | |
| "learning_rate": 4.169072240042514e-06, | |
| "loss": 0.1699, | |
| "num_tokens": 100409121.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.8362573099415205, | |
| "grad_norm": 0.13889911640255415, | |
| "learning_rate": 4.141926586403677e-06, | |
| "loss": 0.1805, | |
| "num_tokens": 100765220.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 0.1454927979121013, | |
| "learning_rate": 4.114835196385636e-06, | |
| "loss": 0.1842, | |
| "num_tokens": 101081641.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.8479532163742691, | |
| "grad_norm": 0.1404945457994314, | |
| "learning_rate": 4.08779915246074e-06, | |
| "loss": 0.1822, | |
| "num_tokens": 101412150.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.8538011695906431, | |
| "grad_norm": 0.13980685227323844, | |
| "learning_rate": 4.060819534889909e-06, | |
| "loss": 0.1799, | |
| "num_tokens": 101744218.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.8596491228070176, | |
| "grad_norm": 0.14229034728182074, | |
| "learning_rate": 4.033897421679472e-06, | |
| "loss": 0.1694, | |
| "num_tokens": 102086143.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.8654970760233918, | |
| "grad_norm": 0.1409421468030685, | |
| "learning_rate": 4.0070338885381e-06, | |
| "loss": 0.183, | |
| "num_tokens": 102386095.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.871345029239766, | |
| "grad_norm": 0.14131996708764916, | |
| "learning_rate": 3.980230008833812e-06, | |
| "loss": 0.1784, | |
| "num_tokens": 102724647.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.8771929824561404, | |
| "grad_norm": 0.14060583568522164, | |
| "learning_rate": 3.953486853551104e-06, | |
| "loss": 0.1611, | |
| "num_tokens": 103019942.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.8830409356725146, | |
| "grad_norm": 0.13762893362884943, | |
| "learning_rate": 3.926805491248138e-06, | |
| "loss": 0.1681, | |
| "num_tokens": 103358784.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 0.15361551995597972, | |
| "learning_rate": 3.900186988014065e-06, | |
| "loss": 0.1825, | |
| "num_tokens": 103660262.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.8947368421052633, | |
| "grad_norm": 0.13699916161747416, | |
| "learning_rate": 3.873632407426412e-06, | |
| "loss": 0.179, | |
| "num_tokens": 103981503.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.9005847953216373, | |
| "grad_norm": 0.14959870826890115, | |
| "learning_rate": 3.847142810508596e-06, | |
| "loss": 0.1826, | |
| "num_tokens": 104304618.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.9064327485380117, | |
| "grad_norm": 0.13265912139830643, | |
| "learning_rate": 3.82071925568752e-06, | |
| "loss": 0.1721, | |
| "num_tokens": 104676283.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.912280701754386, | |
| "grad_norm": 0.1421124205261071, | |
| "learning_rate": 3.7943627987512953e-06, | |
| "loss": 0.1805, | |
| "num_tokens": 105009047.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.9181286549707601, | |
| "grad_norm": 0.1437008669427492, | |
| "learning_rate": 3.7680744928070413e-06, | |
| "loss": 0.1754, | |
| "num_tokens": 105335760.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.9239766081871346, | |
| "grad_norm": 0.13360094795705102, | |
| "learning_rate": 3.741855388238821e-06, | |
| "loss": 0.1728, | |
| "num_tokens": 105669692.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.9298245614035088, | |
| "grad_norm": 0.14123898637906518, | |
| "learning_rate": 3.715706532665657e-06, | |
| "loss": 0.1771, | |
| "num_tokens": 105989627.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.935672514619883, | |
| "grad_norm": 0.15005239176117802, | |
| "learning_rate": 3.6896289708996867e-06, | |
| "loss": 0.1875, | |
| "num_tokens": 106296775.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.9415204678362574, | |
| "grad_norm": 0.13514021795158399, | |
| "learning_rate": 3.6636237449044077e-06, | |
| "loss": 0.164, | |
| "num_tokens": 106614583.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.9473684210526314, | |
| "grad_norm": 0.14198706510532838, | |
| "learning_rate": 3.637691893753047e-06, | |
| "loss": 0.1923, | |
| "num_tokens": 106960662.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.9532163742690059, | |
| "grad_norm": 0.13549287034356422, | |
| "learning_rate": 3.611834453587035e-06, | |
| "loss": 0.1713, | |
| "num_tokens": 107295583.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.95906432748538, | |
| "grad_norm": 0.14046063970510755, | |
| "learning_rate": 3.5860524575746247e-06, | |
| "loss": 0.1856, | |
| "num_tokens": 107643687.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.9649122807017543, | |
| "grad_norm": 0.14339861962392317, | |
| "learning_rate": 3.5603469358695887e-06, | |
| "loss": 0.1763, | |
| "num_tokens": 107990974.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.9707602339181287, | |
| "grad_norm": 0.14801719558718762, | |
| "learning_rate": 3.53471891557007e-06, | |
| "loss": 0.193, | |
| "num_tokens": 108320866.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.976608187134503, | |
| "grad_norm": 0.13759593336874304, | |
| "learning_rate": 3.509169420677545e-06, | |
| "loss": 0.1646, | |
| "num_tokens": 108635348.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.9824561403508771, | |
| "grad_norm": 0.14182117983464468, | |
| "learning_rate": 3.483699472055897e-06, | |
| "loss": 0.175, | |
| "num_tokens": 108945985.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.9883040935672516, | |
| "grad_norm": 0.14601860778521314, | |
| "learning_rate": 3.458310087390637e-06, | |
| "loss": 0.1746, | |
| "num_tokens": 109253189.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.9941520467836256, | |
| "grad_norm": 0.13963753894588415, | |
| "learning_rate": 3.4330022811482317e-06, | |
| "loss": 0.1758, | |
| "num_tokens": 109573613.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.14923037322286145, | |
| "learning_rate": 3.4077770645355824e-06, | |
| "loss": 0.1806, | |
| "num_tokens": 109863441.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.0058479532163744, | |
| "grad_norm": 0.14979935754858112, | |
| "learning_rate": 3.3826354454596024e-06, | |
| "loss": 0.1648, | |
| "num_tokens": 110172994.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.0116959064327484, | |
| "grad_norm": 0.14730373048534828, | |
| "learning_rate": 3.35757842848696e-06, | |
| "loss": 0.1658, | |
| "num_tokens": 110485924.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.017543859649123, | |
| "grad_norm": 0.13744380787225857, | |
| "learning_rate": 3.332607014803937e-06, | |
| "loss": 0.1672, | |
| "num_tokens": 110803906.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.023391812865497, | |
| "grad_norm": 0.14343267181086955, | |
| "learning_rate": 3.307722202176417e-06, | |
| "loss": 0.1685, | |
| "num_tokens": 111130534.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.0292397660818713, | |
| "grad_norm": 0.14951907378888113, | |
| "learning_rate": 3.2829249849100255e-06, | |
| "loss": 0.1656, | |
| "num_tokens": 111443229.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.0350877192982457, | |
| "grad_norm": 0.15277865040934263, | |
| "learning_rate": 3.2582163538104038e-06, | |
| "loss": 0.172, | |
| "num_tokens": 111757363.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.0409356725146197, | |
| "grad_norm": 0.14756047461787347, | |
| "learning_rate": 3.2335972961436095e-06, | |
| "loss": 0.1568, | |
| "num_tokens": 112093731.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.046783625730994, | |
| "grad_norm": 0.16032862389663596, | |
| "learning_rate": 3.209068795596679e-06, | |
| "loss": 0.1658, | |
| "num_tokens": 112388610.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.0526315789473686, | |
| "grad_norm": 0.16521303938360793, | |
| "learning_rate": 3.1846318322383164e-06, | |
| "loss": 0.1668, | |
| "num_tokens": 112693505.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.0584795321637426, | |
| "grad_norm": 0.15381932734961204, | |
| "learning_rate": 3.160287382479738e-06, | |
| "loss": 0.1659, | |
| "num_tokens": 112979822.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.064327485380117, | |
| "grad_norm": 0.1375064634023704, | |
| "learning_rate": 3.136036419035656e-06, | |
| "loss": 0.1666, | |
| "num_tokens": 113301917.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.0701754385964914, | |
| "grad_norm": 0.15582327011120772, | |
| "learning_rate": 3.111879910885414e-06, | |
| "loss": 0.1743, | |
| "num_tokens": 113618502.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.0760233918128654, | |
| "grad_norm": 0.1447294439687268, | |
| "learning_rate": 3.0878188232342708e-06, | |
| "loss": 0.1675, | |
| "num_tokens": 113947517.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.08187134502924, | |
| "grad_norm": 0.15087644416114507, | |
| "learning_rate": 3.0638541174748284e-06, | |
| "loss": 0.1693, | |
| "num_tokens": 114275423.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.087719298245614, | |
| "grad_norm": 0.14016164600859282, | |
| "learning_rate": 3.0399867511486247e-06, | |
| "loss": 0.1592, | |
| "num_tokens": 114588977.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.0935672514619883, | |
| "grad_norm": 0.13990826210042928, | |
| "learning_rate": 3.0162176779078763e-06, | |
| "loss": 0.1639, | |
| "num_tokens": 114914836.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.0994152046783627, | |
| "grad_norm": 0.14220573852232116, | |
| "learning_rate": 2.9925478474773634e-06, | |
| "loss": 0.1533, | |
| "num_tokens": 115251681.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 0.1462318844171621, | |
| "learning_rate": 2.9689782056164874e-06, | |
| "loss": 0.1634, | |
| "num_tokens": 115583416.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.111111111111111, | |
| "grad_norm": 0.15963290757197937, | |
| "learning_rate": 2.94550969408149e-06, | |
| "loss": 0.1799, | |
| "num_tokens": 115902865.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.116959064327485, | |
| "grad_norm": 0.14871595452125072, | |
| "learning_rate": 2.9221432505878116e-06, | |
| "loss": 0.1624, | |
| "num_tokens": 116225855.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.1228070175438596, | |
| "grad_norm": 0.15147360930740472, | |
| "learning_rate": 2.8988798087726295e-06, | |
| "loss": 0.1608, | |
| "num_tokens": 116539725.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.128654970760234, | |
| "grad_norm": 0.14416045180339065, | |
| "learning_rate": 2.875720298157551e-06, | |
| "loss": 0.1576, | |
| "num_tokens": 116893688.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.134502923976608, | |
| "grad_norm": 0.15123893108832548, | |
| "learning_rate": 2.8526656441114815e-06, | |
| "loss": 0.1685, | |
| "num_tokens": 117195485.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.1403508771929824, | |
| "grad_norm": 0.14700656972796167, | |
| "learning_rate": 2.8297167678136363e-06, | |
| "loss": 0.1512, | |
| "num_tokens": 117517383.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.146198830409357, | |
| "grad_norm": 0.14798834895868007, | |
| "learning_rate": 2.8068745862167423e-06, | |
| "loss": 0.165, | |
| "num_tokens": 117837819.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.152046783625731, | |
| "grad_norm": 0.14373047748755674, | |
| "learning_rate": 2.784140012010401e-06, | |
| "loss": 0.157, | |
| "num_tokens": 118172299.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.1578947368421053, | |
| "grad_norm": 0.14591495023462686, | |
| "learning_rate": 2.7615139535846156e-06, | |
| "loss": 0.1569, | |
| "num_tokens": 118493460.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.1637426900584797, | |
| "grad_norm": 0.14800266668408754, | |
| "learning_rate": 2.7389973149934974e-06, | |
| "loss": 0.1657, | |
| "num_tokens": 118788247.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.1695906432748537, | |
| "grad_norm": 0.14815056056984904, | |
| "learning_rate": 2.7165909959191472e-06, | |
| "loss": 0.1653, | |
| "num_tokens": 119112526.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.175438596491228, | |
| "grad_norm": 0.14800590386326276, | |
| "learning_rate": 2.6942958916356997e-06, | |
| "loss": 0.155, | |
| "num_tokens": 119425646.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.181286549707602, | |
| "grad_norm": 0.155304755457848, | |
| "learning_rate": 2.6721128929735563e-06, | |
| "loss": 0.1682, | |
| "num_tokens": 119730833.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.1871345029239766, | |
| "grad_norm": 0.14504483487086625, | |
| "learning_rate": 2.6500428862837878e-06, | |
| "loss": 0.1591, | |
| "num_tokens": 120056226.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.192982456140351, | |
| "grad_norm": 0.1381416074535997, | |
| "learning_rate": 2.6280867534027286e-06, | |
| "loss": 0.1628, | |
| "num_tokens": 120386366.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.198830409356725, | |
| "grad_norm": 0.14481228971827553, | |
| "learning_rate": 2.6062453716167273e-06, | |
| "loss": 0.1681, | |
| "num_tokens": 120734433.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.2046783625730995, | |
| "grad_norm": 0.15691625415835195, | |
| "learning_rate": 2.5845196136270994e-06, | |
| "loss": 0.1713, | |
| "num_tokens": 121015717.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 2.2105263157894735, | |
| "grad_norm": 0.14498825747218128, | |
| "learning_rate": 2.5629103475152654e-06, | |
| "loss": 0.157, | |
| "num_tokens": 121314872.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.216374269005848, | |
| "grad_norm": 0.15217065921697623, | |
| "learning_rate": 2.541418436708054e-06, | |
| "loss": 0.1641, | |
| "num_tokens": 121609720.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 0.149827230466305, | |
| "learning_rate": 2.520044739943207e-06, | |
| "loss": 0.1598, | |
| "num_tokens": 121926793.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.2280701754385963, | |
| "grad_norm": 0.14401874517686966, | |
| "learning_rate": 2.498790111235072e-06, | |
| "loss": 0.1709, | |
| "num_tokens": 122245778.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 2.2339181286549707, | |
| "grad_norm": 0.13403812014389224, | |
| "learning_rate": 2.47765539984047e-06, | |
| "loss": 0.1608, | |
| "num_tokens": 122609822.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 2.239766081871345, | |
| "grad_norm": 0.14253922733943364, | |
| "learning_rate": 2.4566414502247684e-06, | |
| "loss": 0.1652, | |
| "num_tokens": 122948206.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 2.245614035087719, | |
| "grad_norm": 0.1442664664053727, | |
| "learning_rate": 2.435749102028139e-06, | |
| "loss": 0.1695, | |
| "num_tokens": 123281183.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 2.2514619883040936, | |
| "grad_norm": 0.14995127804324304, | |
| "learning_rate": 2.414979190032008e-06, | |
| "loss": 0.1667, | |
| "num_tokens": 123606597.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.257309941520468, | |
| "grad_norm": 0.14167491108605929, | |
| "learning_rate": 2.3943325441256993e-06, | |
| "loss": 0.162, | |
| "num_tokens": 123924915.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 2.263157894736842, | |
| "grad_norm": 0.1462621509052835, | |
| "learning_rate": 2.373809989273277e-06, | |
| "loss": 0.1668, | |
| "num_tokens": 124223390.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 2.2690058479532165, | |
| "grad_norm": 0.14702160243574425, | |
| "learning_rate": 2.353412345480587e-06, | |
| "loss": 0.1629, | |
| "num_tokens": 124541698.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 2.2748538011695905, | |
| "grad_norm": 0.15471872498523825, | |
| "learning_rate": 2.3331404277624846e-06, | |
| "loss": 0.1582, | |
| "num_tokens": 124832848.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 2.280701754385965, | |
| "grad_norm": 0.14362745275711297, | |
| "learning_rate": 2.312995046110272e-06, | |
| "loss": 0.1698, | |
| "num_tokens": 125154886.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.2865497076023393, | |
| "grad_norm": 0.1468561944816842, | |
| "learning_rate": 2.292977005459341e-06, | |
| "loss": 0.1756, | |
| "num_tokens": 125484162.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 2.2923976608187133, | |
| "grad_norm": 0.15143848359912496, | |
| "learning_rate": 2.2730871056570024e-06, | |
| "loss": 0.1605, | |
| "num_tokens": 125783060.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 2.2982456140350878, | |
| "grad_norm": 0.14651421165143075, | |
| "learning_rate": 2.2533261414305243e-06, | |
| "loss": 0.165, | |
| "num_tokens": 126110333.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 2.3040935672514617, | |
| "grad_norm": 0.1475116907221095, | |
| "learning_rate": 2.2336949023553924e-06, | |
| "loss": 0.1616, | |
| "num_tokens": 126442564.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.309941520467836, | |
| "grad_norm": 0.14460134481385417, | |
| "learning_rate": 2.2141941728237467e-06, | |
| "loss": 0.1654, | |
| "num_tokens": 126752395.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.3157894736842106, | |
| "grad_norm": 0.14351634697316507, | |
| "learning_rate": 2.194824732013047e-06, | |
| "loss": 0.1626, | |
| "num_tokens": 127089855.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 2.3216374269005846, | |
| "grad_norm": 0.13828986539988358, | |
| "learning_rate": 2.1755873538549376e-06, | |
| "loss": 0.159, | |
| "num_tokens": 127411156.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 2.327485380116959, | |
| "grad_norm": 0.13645396729907014, | |
| "learning_rate": 2.1564828070043275e-06, | |
| "loss": 0.1533, | |
| "num_tokens": 127736233.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 0.14422895920728793, | |
| "learning_rate": 2.137511854808672e-06, | |
| "loss": 0.1575, | |
| "num_tokens": 128060062.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 2.3391812865497075, | |
| "grad_norm": 0.1448501743068164, | |
| "learning_rate": 2.1186752552774764e-06, | |
| "loss": 0.1626, | |
| "num_tokens": 128406839.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.345029239766082, | |
| "grad_norm": 0.13774903808615716, | |
| "learning_rate": 2.099973761052007e-06, | |
| "loss": 0.1662, | |
| "num_tokens": 128766764.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 2.3508771929824563, | |
| "grad_norm": 0.1508233167288601, | |
| "learning_rate": 2.081408119375219e-06, | |
| "loss": 0.1653, | |
| "num_tokens": 129070969.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 2.3567251461988303, | |
| "grad_norm": 0.14607227185650823, | |
| "learning_rate": 2.0629790720618977e-06, | |
| "loss": 0.1551, | |
| "num_tokens": 129361599.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 2.3625730994152048, | |
| "grad_norm": 0.14540307746510248, | |
| "learning_rate": 2.044687355469025e-06, | |
| "loss": 0.168, | |
| "num_tokens": 129689845.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 2.3684210526315788, | |
| "grad_norm": 0.14269113809456305, | |
| "learning_rate": 2.0265337004663465e-06, | |
| "loss": 0.1632, | |
| "num_tokens": 130007281.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 2.374269005847953, | |
| "grad_norm": 0.14764837206982723, | |
| "learning_rate": 2.008518832407176e-06, | |
| "loss": 0.16, | |
| "num_tokens": 130320317.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 2.3801169590643276, | |
| "grad_norm": 0.13886373538463548, | |
| "learning_rate": 1.9906434710994098e-06, | |
| "loss": 0.1659, | |
| "num_tokens": 130668421.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 2.3859649122807016, | |
| "grad_norm": 0.14123121708441125, | |
| "learning_rate": 1.9729083307767725e-06, | |
| "loss": 0.1571, | |
| "num_tokens": 130996417.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 2.391812865497076, | |
| "grad_norm": 0.14241772556155666, | |
| "learning_rate": 1.955314120070269e-06, | |
| "loss": 0.1586, | |
| "num_tokens": 131324210.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 2.39766081871345, | |
| "grad_norm": 0.1536376204686897, | |
| "learning_rate": 1.937861541979873e-06, | |
| "loss": 0.1866, | |
| "num_tokens": 131671552.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.4035087719298245, | |
| "grad_norm": 0.13606181694914196, | |
| "learning_rate": 1.9205512938464465e-06, | |
| "loss": 0.163, | |
| "num_tokens": 132008743.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 2.409356725146199, | |
| "grad_norm": 0.14994033256966707, | |
| "learning_rate": 1.903384067323863e-06, | |
| "loss": 0.1644, | |
| "num_tokens": 132306397.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 2.415204678362573, | |
| "grad_norm": 0.1437874855637201, | |
| "learning_rate": 1.886360548351381e-06, | |
| "loss": 0.1589, | |
| "num_tokens": 132627946.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 2.4210526315789473, | |
| "grad_norm": 0.14231060929324155, | |
| "learning_rate": 1.8694814171262355e-06, | |
| "loss": 0.1616, | |
| "num_tokens": 132981440.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 2.426900584795322, | |
| "grad_norm": 0.14278247493654592, | |
| "learning_rate": 1.8527473480764545e-06, | |
| "loss": 0.1638, | |
| "num_tokens": 133326233.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 2.4327485380116958, | |
| "grad_norm": 0.14574512278213558, | |
| "learning_rate": 1.8361590098339168e-06, | |
| "loss": 0.1701, | |
| "num_tokens": 133635483.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 2.43859649122807, | |
| "grad_norm": 0.14520072415658936, | |
| "learning_rate": 1.8197170652076316e-06, | |
| "loss": 0.1729, | |
| "num_tokens": 133965064.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 0.13880382782292727, | |
| "learning_rate": 1.8034221711572633e-06, | |
| "loss": 0.1642, | |
| "num_tokens": 134297515.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 2.4502923976608186, | |
| "grad_norm": 0.14965359883764726, | |
| "learning_rate": 1.7872749787668673e-06, | |
| "loss": 0.1573, | |
| "num_tokens": 134596743.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 2.456140350877193, | |
| "grad_norm": 0.14970450216253625, | |
| "learning_rate": 1.7712761332188894e-06, | |
| "loss": 0.1654, | |
| "num_tokens": 134914562.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.461988304093567, | |
| "grad_norm": 0.15816166023028, | |
| "learning_rate": 1.7554262737683803e-06, | |
| "loss": 0.1487, | |
| "num_tokens": 135230462.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 2.4678362573099415, | |
| "grad_norm": 0.1504826241844761, | |
| "learning_rate": 1.7397260337174542e-06, | |
| "loss": 0.157, | |
| "num_tokens": 135546957.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 2.473684210526316, | |
| "grad_norm": 0.1394498720471719, | |
| "learning_rate": 1.724176040389982e-06, | |
| "loss": 0.1591, | |
| "num_tokens": 135868298.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 2.47953216374269, | |
| "grad_norm": 0.14271119923974052, | |
| "learning_rate": 1.708776915106528e-06, | |
| "loss": 0.1647, | |
| "num_tokens": 136208833.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 2.4853801169590644, | |
| "grad_norm": 0.1486501311116008, | |
| "learning_rate": 1.6935292731595284e-06, | |
| "loss": 0.1674, | |
| "num_tokens": 136529945.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.4912280701754383, | |
| "grad_norm": 0.1568931867106138, | |
| "learning_rate": 1.678433723788697e-06, | |
| "loss": 0.1612, | |
| "num_tokens": 136819684.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 2.497076023391813, | |
| "grad_norm": 0.14113107486411444, | |
| "learning_rate": 1.6634908701566909e-06, | |
| "loss": 0.1576, | |
| "num_tokens": 137150211.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 2.502923976608187, | |
| "grad_norm": 0.14430747759375342, | |
| "learning_rate": 1.6487013093250042e-06, | |
| "loss": 0.1577, | |
| "num_tokens": 137460607.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 2.5087719298245617, | |
| "grad_norm": 0.14188380800567507, | |
| "learning_rate": 1.6340656322301158e-06, | |
| "loss": 0.1681, | |
| "num_tokens": 137808723.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 2.5146198830409356, | |
| "grad_norm": 0.13864076262612016, | |
| "learning_rate": 1.619584423659875e-06, | |
| "loss": 0.1508, | |
| "num_tokens": 138122659.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.52046783625731, | |
| "grad_norm": 0.14329617113357643, | |
| "learning_rate": 1.6052582622301398e-06, | |
| "loss": 0.1604, | |
| "num_tokens": 138456252.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 2.526315789473684, | |
| "grad_norm": 0.14485980625620193, | |
| "learning_rate": 1.5910877203616515e-06, | |
| "loss": 0.1689, | |
| "num_tokens": 138794979.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 2.5321637426900585, | |
| "grad_norm": 0.14001094426697513, | |
| "learning_rate": 1.5770733642571662e-06, | |
| "loss": 0.1613, | |
| "num_tokens": 139114044.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 2.538011695906433, | |
| "grad_norm": 0.14542258651096243, | |
| "learning_rate": 1.5632157538788322e-06, | |
| "loss": 0.1626, | |
| "num_tokens": 139425238.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.543859649122807, | |
| "grad_norm": 0.1489744939093393, | |
| "learning_rate": 1.5495154429258136e-06, | |
| "loss": 0.1722, | |
| "num_tokens": 139757117.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.5497076023391814, | |
| "grad_norm": 0.14248652727375938, | |
| "learning_rate": 1.5359729788121678e-06, | |
| "loss": 0.1633, | |
| "num_tokens": 140085804.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 2.5555555555555554, | |
| "grad_norm": 0.14025435225815486, | |
| "learning_rate": 1.5225889026449754e-06, | |
| "loss": 0.1609, | |
| "num_tokens": 140409219.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 2.56140350877193, | |
| "grad_norm": 0.14823929068819988, | |
| "learning_rate": 1.5093637492027136e-06, | |
| "loss": 0.165, | |
| "num_tokens": 140713444.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 2.5672514619883042, | |
| "grad_norm": 0.1370826700643183, | |
| "learning_rate": 1.4962980469138932e-06, | |
| "loss": 0.1462, | |
| "num_tokens": 141032047.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 2.573099415204678, | |
| "grad_norm": 0.1522127782198521, | |
| "learning_rate": 1.4833923178359428e-06, | |
| "loss": 0.1621, | |
| "num_tokens": 141338487.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.5789473684210527, | |
| "grad_norm": 0.14502198127336535, | |
| "learning_rate": 1.4706470776343507e-06, | |
| "loss": 0.164, | |
| "num_tokens": 141669740.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 2.5847953216374266, | |
| "grad_norm": 0.15205245564816208, | |
| "learning_rate": 1.458062835562058e-06, | |
| "loss": 0.164, | |
| "num_tokens": 141971862.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 2.590643274853801, | |
| "grad_norm": 0.15020510176845542, | |
| "learning_rate": 1.4456400944391147e-06, | |
| "loss": 0.1677, | |
| "num_tokens": 142266889.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 2.5964912280701755, | |
| "grad_norm": 0.14391800055857798, | |
| "learning_rate": 1.4333793506325832e-06, | |
| "loss": 0.1606, | |
| "num_tokens": 142584991.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.60233918128655, | |
| "grad_norm": 0.15111238280234057, | |
| "learning_rate": 1.421281094036712e-06, | |
| "loss": 0.1604, | |
| "num_tokens": 142879877.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.608187134502924, | |
| "grad_norm": 0.1443837082025472, | |
| "learning_rate": 1.4093458080533562e-06, | |
| "loss": 0.1681, | |
| "num_tokens": 143205197.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.6140350877192984, | |
| "grad_norm": 0.14463771075478601, | |
| "learning_rate": 1.3975739695726649e-06, | |
| "loss": 0.1671, | |
| "num_tokens": 143531142.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.6198830409356724, | |
| "grad_norm": 0.14454878880987132, | |
| "learning_rate": 1.385966048954027e-06, | |
| "loss": 0.1633, | |
| "num_tokens": 143858081.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.625730994152047, | |
| "grad_norm": 0.13984594893802477, | |
| "learning_rate": 1.3745225100072737e-06, | |
| "loss": 0.1617, | |
| "num_tokens": 144185431.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.13969336960280732, | |
| "learning_rate": 1.3632438099741505e-06, | |
| "loss": 0.159, | |
| "num_tokens": 144497647.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.6374269005847952, | |
| "grad_norm": 0.14619663223314017, | |
| "learning_rate": 1.3521303995100479e-06, | |
| "loss": 0.1663, | |
| "num_tokens": 144800894.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.6432748538011697, | |
| "grad_norm": 0.14130801987934924, | |
| "learning_rate": 1.3411827226659887e-06, | |
| "loss": 0.1592, | |
| "num_tokens": 145115720.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.6491228070175437, | |
| "grad_norm": 0.13126230977093004, | |
| "learning_rate": 1.330401216870891e-06, | |
| "loss": 0.1479, | |
| "num_tokens": 145451398.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.654970760233918, | |
| "grad_norm": 0.13673717620054995, | |
| "learning_rate": 1.3197863129140916e-06, | |
| "loss": 0.1564, | |
| "num_tokens": 145791079.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.6608187134502925, | |
| "grad_norm": 0.1468555944041424, | |
| "learning_rate": 1.3093384349281268e-06, | |
| "loss": 0.1575, | |
| "num_tokens": 146094234.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 0.1511002120974835, | |
| "learning_rate": 1.2990580003717904e-06, | |
| "loss": 0.1828, | |
| "num_tokens": 146403548.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 2.672514619883041, | |
| "grad_norm": 0.139909324379523, | |
| "learning_rate": 1.2889454200134522e-06, | |
| "loss": 0.1594, | |
| "num_tokens": 146732523.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 2.678362573099415, | |
| "grad_norm": 0.14396563097050272, | |
| "learning_rate": 1.2790010979146467e-06, | |
| "loss": 0.1524, | |
| "num_tokens": 147040850.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 2.6842105263157894, | |
| "grad_norm": 0.15000149938121365, | |
| "learning_rate": 1.2692254314139243e-06, | |
| "loss": 0.1697, | |
| "num_tokens": 147343323.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.690058479532164, | |
| "grad_norm": 0.15745120782859975, | |
| "learning_rate": 1.2596188111109805e-06, | |
| "loss": 0.1681, | |
| "num_tokens": 147635253.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.6959064327485383, | |
| "grad_norm": 0.1476631580057043, | |
| "learning_rate": 1.2501816208510442e-06, | |
| "loss": 0.1638, | |
| "num_tokens": 147956892.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 2.7017543859649122, | |
| "grad_norm": 0.1423826560287115, | |
| "learning_rate": 1.2409142377095435e-06, | |
| "loss": 0.1571, | |
| "num_tokens": 148260684.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 2.7076023391812867, | |
| "grad_norm": 0.14219644101756945, | |
| "learning_rate": 1.231817031977037e-06, | |
| "loss": 0.1585, | |
| "num_tokens": 148571351.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 2.7134502923976607, | |
| "grad_norm": 0.15083997934575208, | |
| "learning_rate": 1.2228903671444228e-06, | |
| "loss": 0.1683, | |
| "num_tokens": 148888226.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 2.719298245614035, | |
| "grad_norm": 0.13732672374616764, | |
| "learning_rate": 1.2141345998884092e-06, | |
| "loss": 0.1606, | |
| "num_tokens": 149214770.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 2.7251461988304095, | |
| "grad_norm": 0.1441249561667527, | |
| "learning_rate": 1.2055500800572657e-06, | |
| "loss": 0.1626, | |
| "num_tokens": 149545405.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 2.7309941520467835, | |
| "grad_norm": 0.14325592299925863, | |
| "learning_rate": 1.1971371506568442e-06, | |
| "loss": 0.1577, | |
| "num_tokens": 149846805.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 2.736842105263158, | |
| "grad_norm": 0.13745597306904, | |
| "learning_rate": 1.1888961478368762e-06, | |
| "loss": 0.1605, | |
| "num_tokens": 150179635.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 2.742690058479532, | |
| "grad_norm": 0.13951839806880817, | |
| "learning_rate": 1.1808274008775355e-06, | |
| "loss": 0.158, | |
| "num_tokens": 150503170.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 2.7485380116959064, | |
| "grad_norm": 0.1443209297278742, | |
| "learning_rate": 1.1729312321762864e-06, | |
| "loss": 0.1575, | |
| "num_tokens": 150809725.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.754385964912281, | |
| "grad_norm": 0.13342746902950017, | |
| "learning_rate": 1.1652079572350026e-06, | |
| "loss": 0.154, | |
| "num_tokens": 151157947.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 2.760233918128655, | |
| "grad_norm": 0.14326184156810018, | |
| "learning_rate": 1.1576578846473558e-06, | |
| "loss": 0.1584, | |
| "num_tokens": 151479230.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 2.7660818713450293, | |
| "grad_norm": 0.1433950274377657, | |
| "learning_rate": 1.1502813160864893e-06, | |
| "loss": 0.1628, | |
| "num_tokens": 151803444.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 2.7719298245614032, | |
| "grad_norm": 0.14139558810493624, | |
| "learning_rate": 1.1430785462929644e-06, | |
| "loss": 0.169, | |
| "num_tokens": 152139382.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 2.7777777777777777, | |
| "grad_norm": 0.14115560637915964, | |
| "learning_rate": 1.136049863062982e-06, | |
| "loss": 0.162, | |
| "num_tokens": 152452934.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.783625730994152, | |
| "grad_norm": 0.15211690242588544, | |
| "learning_rate": 1.1291955472368825e-06, | |
| "loss": 0.1601, | |
| "num_tokens": 152748759.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 2.7894736842105265, | |
| "grad_norm": 0.13666222185265295, | |
| "learning_rate": 1.1225158726879288e-06, | |
| "loss": 0.1665, | |
| "num_tokens": 153093029.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 2.7953216374269005, | |
| "grad_norm": 0.14398939041397035, | |
| "learning_rate": 1.116011106311358e-06, | |
| "loss": 0.1568, | |
| "num_tokens": 153384854.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 2.801169590643275, | |
| "grad_norm": 0.15402905213013776, | |
| "learning_rate": 1.1096815080137196e-06, | |
| "loss": 0.1874, | |
| "num_tokens": 153720419.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 2.807017543859649, | |
| "grad_norm": 0.14201446868127077, | |
| "learning_rate": 1.103527330702493e-06, | |
| "loss": 0.1624, | |
| "num_tokens": 154054230.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.8128654970760234, | |
| "grad_norm": 0.14521001714072115, | |
| "learning_rate": 1.0975488202759772e-06, | |
| "loss": 0.1625, | |
| "num_tokens": 154373913.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 2.818713450292398, | |
| "grad_norm": 0.1390122065414189, | |
| "learning_rate": 1.0917462156134707e-06, | |
| "loss": 0.1599, | |
| "num_tokens": 154689857.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 2.824561403508772, | |
| "grad_norm": 0.14888462659883256, | |
| "learning_rate": 1.0861197485657218e-06, | |
| "loss": 0.1643, | |
| "num_tokens": 154995696.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 2.8304093567251463, | |
| "grad_norm": 0.1420249071569745, | |
| "learning_rate": 1.0806696439456695e-06, | |
| "loss": 0.1629, | |
| "num_tokens": 155312841.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 2.8362573099415203, | |
| "grad_norm": 0.14037073859681298, | |
| "learning_rate": 1.0753961195194581e-06, | |
| "loss": 0.1592, | |
| "num_tokens": 155627374.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 2.8421052631578947, | |
| "grad_norm": 0.14506758333723185, | |
| "learning_rate": 1.070299385997735e-06, | |
| "loss": 0.1586, | |
| "num_tokens": 155933888.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 2.847953216374269, | |
| "grad_norm": 0.14137857222010855, | |
| "learning_rate": 1.0653796470272348e-06, | |
| "loss": 0.1564, | |
| "num_tokens": 156263120.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 2.853801169590643, | |
| "grad_norm": 0.14542756104140053, | |
| "learning_rate": 1.0606370991826398e-06, | |
| "loss": 0.165, | |
| "num_tokens": 156585736.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 2.8596491228070176, | |
| "grad_norm": 0.13897674449897804, | |
| "learning_rate": 1.0560719319587262e-06, | |
| "loss": 0.1642, | |
| "num_tokens": 156919577.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 2.8654970760233915, | |
| "grad_norm": 0.1411765074092308, | |
| "learning_rate": 1.051684327762793e-06, | |
| "loss": 0.1613, | |
| "num_tokens": 157244512.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.871345029239766, | |
| "grad_norm": 0.14023194573370917, | |
| "learning_rate": 1.047474461907374e-06, | |
| "loss": 0.1593, | |
| "num_tokens": 157555581.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 2.8771929824561404, | |
| "grad_norm": 0.13348582864925426, | |
| "learning_rate": 1.043442502603231e-06, | |
| "loss": 0.1587, | |
| "num_tokens": 157890387.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 2.883040935672515, | |
| "grad_norm": 0.1458553034999855, | |
| "learning_rate": 1.0395886109526346e-06, | |
| "loss": 0.1672, | |
| "num_tokens": 158227831.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 0.13433755063333214, | |
| "learning_rate": 1.0359129409429269e-06, | |
| "loss": 0.1672, | |
| "num_tokens": 158593011.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 2.8947368421052633, | |
| "grad_norm": 0.13367864457349435, | |
| "learning_rate": 1.0324156394403683e-06, | |
| "loss": 0.1553, | |
| "num_tokens": 158933449.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 2.9005847953216373, | |
| "grad_norm": 0.14118001312714748, | |
| "learning_rate": 1.0290968461842693e-06, | |
| "loss": 0.1577, | |
| "num_tokens": 159246578.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 2.9064327485380117, | |
| "grad_norm": 0.1463297027696001, | |
| "learning_rate": 1.025956693781408e-06, | |
| "loss": 0.1678, | |
| "num_tokens": 159560178.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 2.912280701754386, | |
| "grad_norm": 0.14563817342813995, | |
| "learning_rate": 1.0229953077007288e-06, | |
| "loss": 0.1659, | |
| "num_tokens": 159884093.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 2.91812865497076, | |
| "grad_norm": 0.14540407437292474, | |
| "learning_rate": 1.0202128062683333e-06, | |
| "loss": 0.1722, | |
| "num_tokens": 160205475.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 2.9239766081871346, | |
| "grad_norm": 0.1405121201590573, | |
| "learning_rate": 1.0176093006627485e-06, | |
| "loss": 0.1562, | |
| "num_tokens": 160545275.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.9298245614035086, | |
| "grad_norm": 0.14217083046189563, | |
| "learning_rate": 1.0151848949104872e-06, | |
| "loss": 0.1665, | |
| "num_tokens": 160880973.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 2.935672514619883, | |
| "grad_norm": 0.1464306529530731, | |
| "learning_rate": 1.01293968588189e-06, | |
| "loss": 0.1707, | |
| "num_tokens": 161205233.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 2.9415204678362574, | |
| "grad_norm": 0.1408296921593092, | |
| "learning_rate": 1.0108737632872553e-06, | |
| "loss": 0.16, | |
| "num_tokens": 161521616.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 2.9473684210526314, | |
| "grad_norm": 0.14238166027330365, | |
| "learning_rate": 1.0089872096732555e-06, | |
| "loss": 0.1635, | |
| "num_tokens": 161834814.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 2.953216374269006, | |
| "grad_norm": 0.1426737290624598, | |
| "learning_rate": 1.0072801004196363e-06, | |
| "loss": 0.1615, | |
| "num_tokens": 162172431.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 2.95906432748538, | |
| "grad_norm": 0.14507835204373007, | |
| "learning_rate": 1.0057525037362082e-06, | |
| "loss": 0.162, | |
| "num_tokens": 162500876.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 2.9649122807017543, | |
| "grad_norm": 0.14256066597160452, | |
| "learning_rate": 1.0044044806601188e-06, | |
| "loss": 0.161, | |
| "num_tokens": 162830769.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 2.9707602339181287, | |
| "grad_norm": 0.13917168255862636, | |
| "learning_rate": 1.003236085053414e-06, | |
| "loss": 0.1598, | |
| "num_tokens": 163185192.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 2.976608187134503, | |
| "grad_norm": 0.1402722871419169, | |
| "learning_rate": 1.0022473636008867e-06, | |
| "loss": 0.164, | |
| "num_tokens": 163513051.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 2.982456140350877, | |
| "grad_norm": 0.15282843543700755, | |
| "learning_rate": 1.0014383558082113e-06, | |
| "loss": 0.169, | |
| "num_tokens": 163816593.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.9883040935672516, | |
| "grad_norm": 0.14600064275991873, | |
| "learning_rate": 1.000809094000365e-06, | |
| "loss": 0.1582, | |
| "num_tokens": 164132256.0, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 2.9941520467836256, | |
| "grad_norm": 0.142164254928358, | |
| "learning_rate": 1.0003596033203359e-06, | |
| "loss": 0.1675, | |
| "num_tokens": 164460013.0, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.14017066974116044, | |
| "learning_rate": 1.0000899017281195e-06, | |
| "loss": 0.1563, | |
| "num_tokens": 164791026.0, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 513, | |
| "total_flos": 5.228622451394478e+17, | |
| "train_loss": 0.21386383229877517, | |
| "train_runtime": 6716.6337, | |
| "train_samples_per_second": 9.771, | |
| "train_steps_per_second": 0.076 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 513, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.228622451394478e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |