text2sql-sft-v7 / trainer_state.json
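A minimal sketch, assuming this file follows the standard Hugging Face Trainer `trainer_state.json` layout visible below (a top-level `log_history` list whose entries carry `step`, `loss`, and `learning_rate`): it loads the file and pulls out the logged loss curve. The file path and the presence of a `loss` key in every entry are assumptions taken from the records shown here.

```python
import json

# Load the Trainer state (assumes trainer_state.json is in the working directory).
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry records one logging step, as in the JSON below.
logged = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logged]
losses = [e["loss"] for e in logged]
lrs = [e.get("learning_rate") for e in logged]

print(f"{len(steps)} logged steps; final loss {losses[-1]:.4f} at step {steps[-1]}")
```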
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 513,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005847953216374269,
"grad_norm": 3.346622166062452,
"learning_rate": 0.0,
"loss": 1.1879,
"num_tokens": 309834.0,
"step": 1
},
{
"epoch": 0.011695906432748537,
"grad_norm": 3.2150187522491747,
"learning_rate": 6.25e-07,
"loss": 1.1528,
"num_tokens": 626323.0,
"step": 2
},
{
"epoch": 0.017543859649122806,
"grad_norm": 3.1095611576006044,
"learning_rate": 1.25e-06,
"loss": 1.13,
"num_tokens": 962858.0,
"step": 3
},
{
"epoch": 0.023391812865497075,
"grad_norm": 3.0927940568383274,
"learning_rate": 1.8750000000000003e-06,
"loss": 1.1257,
"num_tokens": 1307919.0,
"step": 4
},
{
"epoch": 0.029239766081871343,
"grad_norm": 3.0505204823401963,
"learning_rate": 2.5e-06,
"loss": 1.1159,
"num_tokens": 1643402.0,
"step": 5
},
{
"epoch": 0.03508771929824561,
"grad_norm": 2.8525229189090564,
"learning_rate": 3.125e-06,
"loss": 1.1119,
"num_tokens": 1963547.0,
"step": 6
},
{
"epoch": 0.04093567251461988,
"grad_norm": 2.136607198658089,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.0281,
"num_tokens": 2283318.0,
"step": 7
},
{
"epoch": 0.04678362573099415,
"grad_norm": 1.9705509244699,
"learning_rate": 4.3750000000000005e-06,
"loss": 1.0088,
"num_tokens": 2603880.0,
"step": 8
},
{
"epoch": 0.05263157894736842,
"grad_norm": 1.2382281811593294,
"learning_rate": 5e-06,
"loss": 0.8683,
"num_tokens": 2929732.0,
"step": 9
},
{
"epoch": 0.05847953216374269,
"grad_norm": 1.2462399978155196,
"learning_rate": 5.625e-06,
"loss": 0.868,
"num_tokens": 3252895.0,
"step": 10
},
{
"epoch": 0.06432748538011696,
"grad_norm": 1.1972270533702403,
"learning_rate": 6.25e-06,
"loss": 0.8151,
"num_tokens": 3578517.0,
"step": 11
},
{
"epoch": 0.07017543859649122,
"grad_norm": 1.8012881936401126,
"learning_rate": 6.875e-06,
"loss": 0.6379,
"num_tokens": 3911914.0,
"step": 12
},
{
"epoch": 0.07602339181286549,
"grad_norm": 1.5004715524395629,
"learning_rate": 7.500000000000001e-06,
"loss": 0.6352,
"num_tokens": 4228515.0,
"step": 13
},
{
"epoch": 0.08187134502923976,
"grad_norm": 1.320062812526294,
"learning_rate": 8.125000000000001e-06,
"loss": 0.6228,
"num_tokens": 4536476.0,
"step": 14
},
{
"epoch": 0.08771929824561403,
"grad_norm": 0.9906906777846411,
"learning_rate": 8.750000000000001e-06,
"loss": 0.5299,
"num_tokens": 4868361.0,
"step": 15
},
{
"epoch": 0.0935672514619883,
"grad_norm": 0.8386484072060002,
"learning_rate": 9.375000000000001e-06,
"loss": 0.4729,
"num_tokens": 5216197.0,
"step": 16
},
{
"epoch": 0.09941520467836257,
"grad_norm": 0.4900339517100113,
"learning_rate": 1e-05,
"loss": 0.4352,
"num_tokens": 5557003.0,
"step": 17
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.40004540169105984,
"learning_rate": 9.999910098271881e-06,
"loss": 0.3938,
"num_tokens": 5892764.0,
"step": 18
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.3513180037096265,
"learning_rate": 9.999640396679666e-06,
"loss": 0.4064,
"num_tokens": 6207362.0,
"step": 19
},
{
"epoch": 0.11695906432748537,
"grad_norm": 0.6732732756411357,
"learning_rate": 9.999190905999637e-06,
"loss": 0.3789,
"num_tokens": 6496012.0,
"step": 20
},
{
"epoch": 0.12280701754385964,
"grad_norm": 0.31994176912879413,
"learning_rate": 9.99856164419179e-06,
"loss": 0.3778,
"num_tokens": 6804315.0,
"step": 21
},
{
"epoch": 0.1286549707602339,
"grad_norm": 0.548016923041077,
"learning_rate": 9.997752636399114e-06,
"loss": 0.3673,
"num_tokens": 7143380.0,
"step": 22
},
{
"epoch": 0.13450292397660818,
"grad_norm": 0.29902181791620935,
"learning_rate": 9.996763914946586e-06,
"loss": 0.3593,
"num_tokens": 7463502.0,
"step": 23
},
{
"epoch": 0.14035087719298245,
"grad_norm": 0.31707406326522014,
"learning_rate": 9.995595519339882e-06,
"loss": 0.3577,
"num_tokens": 7774770.0,
"step": 24
},
{
"epoch": 0.14619883040935672,
"grad_norm": 0.27176157042044713,
"learning_rate": 9.994247496263792e-06,
"loss": 0.3395,
"num_tokens": 8087750.0,
"step": 25
},
{
"epoch": 0.15204678362573099,
"grad_norm": 0.3065271953474858,
"learning_rate": 9.992719899580364e-06,
"loss": 0.3474,
"num_tokens": 8434239.0,
"step": 26
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.30277827390998685,
"learning_rate": 9.991012790326745e-06,
"loss": 0.3121,
"num_tokens": 8750905.0,
"step": 27
},
{
"epoch": 0.16374269005847952,
"grad_norm": 0.25470554589584754,
"learning_rate": 9.989126236712746e-06,
"loss": 0.3197,
"num_tokens": 9075220.0,
"step": 28
},
{
"epoch": 0.1695906432748538,
"grad_norm": 0.24585407625854147,
"learning_rate": 9.987060314118111e-06,
"loss": 0.3217,
"num_tokens": 9397453.0,
"step": 29
},
{
"epoch": 0.17543859649122806,
"grad_norm": 0.4536476490136459,
"learning_rate": 9.984815105089515e-06,
"loss": 0.3369,
"num_tokens": 9705728.0,
"step": 30
},
{
"epoch": 0.18128654970760233,
"grad_norm": 0.23980035473967873,
"learning_rate": 9.982390699337253e-06,
"loss": 0.2813,
"num_tokens": 10025621.0,
"step": 31
},
{
"epoch": 0.1871345029239766,
"grad_norm": 0.23564595909740174,
"learning_rate": 9.979787193731666e-06,
"loss": 0.3259,
"num_tokens": 10337176.0,
"step": 32
},
{
"epoch": 0.19298245614035087,
"grad_norm": 0.20709709888139777,
"learning_rate": 9.977004692299273e-06,
"loss": 0.2945,
"num_tokens": 10651259.0,
"step": 33
},
{
"epoch": 0.19883040935672514,
"grad_norm": 0.1973226462113262,
"learning_rate": 9.974043306218595e-06,
"loss": 0.2922,
"num_tokens": 10992918.0,
"step": 34
},
{
"epoch": 0.2046783625730994,
"grad_norm": 0.18869956638769483,
"learning_rate": 9.970903153815731e-06,
"loss": 0.2835,
"num_tokens": 11310394.0,
"step": 35
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.18686305886535334,
"learning_rate": 9.967584360559632e-06,
"loss": 0.2925,
"num_tokens": 11636774.0,
"step": 36
},
{
"epoch": 0.21637426900584794,
"grad_norm": 0.1725945504953492,
"learning_rate": 9.964087059057075e-06,
"loss": 0.2592,
"num_tokens": 11956378.0,
"step": 37
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.19881803465402237,
"learning_rate": 9.960411389047366e-06,
"loss": 0.2955,
"num_tokens": 12248057.0,
"step": 38
},
{
"epoch": 0.22807017543859648,
"grad_norm": 0.19691612763541272,
"learning_rate": 9.95655749739677e-06,
"loss": 0.2835,
"num_tokens": 12554456.0,
"step": 39
},
{
"epoch": 0.23391812865497075,
"grad_norm": 0.17622929852503963,
"learning_rate": 9.952525538092627e-06,
"loss": 0.2726,
"num_tokens": 12880847.0,
"step": 40
},
{
"epoch": 0.23976608187134502,
"grad_norm": 0.1622820101848469,
"learning_rate": 9.948315672237208e-06,
"loss": 0.2692,
"num_tokens": 13237415.0,
"step": 41
},
{
"epoch": 0.24561403508771928,
"grad_norm": 0.16492519002263994,
"learning_rate": 9.943928068041274e-06,
"loss": 0.2791,
"num_tokens": 13600570.0,
"step": 42
},
{
"epoch": 0.25146198830409355,
"grad_norm": 0.1659517586811088,
"learning_rate": 9.939362900817362e-06,
"loss": 0.2499,
"num_tokens": 13934393.0,
"step": 43
},
{
"epoch": 0.2573099415204678,
"grad_norm": 0.1675942094434052,
"learning_rate": 9.934620352972766e-06,
"loss": 0.2709,
"num_tokens": 14283319.0,
"step": 44
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.17033294793266862,
"learning_rate": 9.929700614002265e-06,
"loss": 0.2712,
"num_tokens": 14602358.0,
"step": 45
},
{
"epoch": 0.26900584795321636,
"grad_norm": 0.17218950775910574,
"learning_rate": 9.924603880480543e-06,
"loss": 0.2768,
"num_tokens": 14914703.0,
"step": 46
},
{
"epoch": 0.27485380116959063,
"grad_norm": 0.17142338750289324,
"learning_rate": 9.919330356054332e-06,
"loss": 0.2677,
"num_tokens": 15226891.0,
"step": 47
},
{
"epoch": 0.2807017543859649,
"grad_norm": 0.15935514552682936,
"learning_rate": 9.913880251434279e-06,
"loss": 0.2513,
"num_tokens": 15566110.0,
"step": 48
},
{
"epoch": 0.28654970760233917,
"grad_norm": 0.17590713121193502,
"learning_rate": 9.90825378438653e-06,
"loss": 0.2413,
"num_tokens": 15864678.0,
"step": 49
},
{
"epoch": 0.29239766081871343,
"grad_norm": 0.17105743981722707,
"learning_rate": 9.902451179724025e-06,
"loss": 0.2566,
"num_tokens": 16166802.0,
"step": 50
},
{
"epoch": 0.2982456140350877,
"grad_norm": 0.17592812081480816,
"learning_rate": 9.896472669297508e-06,
"loss": 0.2432,
"num_tokens": 16465873.0,
"step": 51
},
{
"epoch": 0.30409356725146197,
"grad_norm": 0.16854050409478574,
"learning_rate": 9.890318491986282e-06,
"loss": 0.2434,
"num_tokens": 16764387.0,
"step": 52
},
{
"epoch": 0.30994152046783624,
"grad_norm": 0.1722203346036174,
"learning_rate": 9.883988893688645e-06,
"loss": 0.2533,
"num_tokens": 17059312.0,
"step": 53
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.16603305455782896,
"learning_rate": 9.877484127312072e-06,
"loss": 0.2492,
"num_tokens": 17382890.0,
"step": 54
},
{
"epoch": 0.3216374269005848,
"grad_norm": 0.16334953855577547,
"learning_rate": 9.870804452763118e-06,
"loss": 0.2563,
"num_tokens": 17716146.0,
"step": 55
},
{
"epoch": 0.32748538011695905,
"grad_norm": 0.18120405762456362,
"learning_rate": 9.863950136937019e-06,
"loss": 0.2532,
"num_tokens": 18044901.0,
"step": 56
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.16955478627673745,
"learning_rate": 9.856921453707036e-06,
"loss": 0.256,
"num_tokens": 18360773.0,
"step": 57
},
{
"epoch": 0.3391812865497076,
"grad_norm": 0.15096634608102888,
"learning_rate": 9.849718683913511e-06,
"loss": 0.2259,
"num_tokens": 18694718.0,
"step": 58
},
{
"epoch": 0.34502923976608185,
"grad_norm": 0.1757789479931499,
"learning_rate": 9.842342115352647e-06,
"loss": 0.2595,
"num_tokens": 19014702.0,
"step": 59
},
{
"epoch": 0.3508771929824561,
"grad_norm": 0.16490033035648094,
"learning_rate": 9.834792042764999e-06,
"loss": 0.2404,
"num_tokens": 19339612.0,
"step": 60
},
{
"epoch": 0.3567251461988304,
"grad_norm": 0.16019689209153504,
"learning_rate": 9.827068767823713e-06,
"loss": 0.248,
"num_tokens": 19681676.0,
"step": 61
},
{
"epoch": 0.36257309941520466,
"grad_norm": 0.1691448945153913,
"learning_rate": 9.819172599122466e-06,
"loss": 0.2346,
"num_tokens": 20000100.0,
"step": 62
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.16082639569066132,
"learning_rate": 9.811103852163126e-06,
"loss": 0.2262,
"num_tokens": 20319423.0,
"step": 63
},
{
"epoch": 0.3742690058479532,
"grad_norm": 0.15278910823194214,
"learning_rate": 9.802862849343155e-06,
"loss": 0.2281,
"num_tokens": 20664041.0,
"step": 64
},
{
"epoch": 0.38011695906432746,
"grad_norm": 0.16155951042022568,
"learning_rate": 9.794449919942736e-06,
"loss": 0.241,
"num_tokens": 20983214.0,
"step": 65
},
{
"epoch": 0.38596491228070173,
"grad_norm": 0.16422330995295928,
"learning_rate": 9.785865400111593e-06,
"loss": 0.2489,
"num_tokens": 21321454.0,
"step": 66
},
{
"epoch": 0.391812865497076,
"grad_norm": 0.16006949845014626,
"learning_rate": 9.777109632855579e-06,
"loss": 0.2471,
"num_tokens": 21641982.0,
"step": 67
},
{
"epoch": 0.39766081871345027,
"grad_norm": 0.16727932073445337,
"learning_rate": 9.768182968022964e-06,
"loss": 0.2417,
"num_tokens": 21978836.0,
"step": 68
},
{
"epoch": 0.40350877192982454,
"grad_norm": 0.17239803759423833,
"learning_rate": 9.759085762290457e-06,
"loss": 0.2377,
"num_tokens": 22268188.0,
"step": 69
},
{
"epoch": 0.4093567251461988,
"grad_norm": 0.16257813227817636,
"learning_rate": 9.749818379148958e-06,
"loss": 0.2265,
"num_tokens": 22581727.0,
"step": 70
},
{
"epoch": 0.4152046783625731,
"grad_norm": 0.16056196258322708,
"learning_rate": 9.74038118888902e-06,
"loss": 0.2261,
"num_tokens": 22899881.0,
"step": 71
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.15733806901697214,
"learning_rate": 9.730774568586076e-06,
"loss": 0.2263,
"num_tokens": 23240539.0,
"step": 72
},
{
"epoch": 0.4269005847953216,
"grad_norm": 0.1679225698442003,
"learning_rate": 9.720998902085354e-06,
"loss": 0.2362,
"num_tokens": 23546933.0,
"step": 73
},
{
"epoch": 0.4327485380116959,
"grad_norm": 0.16689340356885685,
"learning_rate": 9.71105457998655e-06,
"loss": 0.2309,
"num_tokens": 23867940.0,
"step": 74
},
{
"epoch": 0.43859649122807015,
"grad_norm": 0.17313794861512294,
"learning_rate": 9.70094199962821e-06,
"loss": 0.2311,
"num_tokens": 24191283.0,
"step": 75
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.16976811460329427,
"learning_rate": 9.690661565071875e-06,
"loss": 0.2341,
"num_tokens": 24504739.0,
"step": 76
},
{
"epoch": 0.4502923976608187,
"grad_norm": 0.16719875175704804,
"learning_rate": 9.68021368708591e-06,
"loss": 0.2425,
"num_tokens": 24846242.0,
"step": 77
},
{
"epoch": 0.45614035087719296,
"grad_norm": 0.16237180095271134,
"learning_rate": 9.66959878312911e-06,
"loss": 0.219,
"num_tokens": 25163117.0,
"step": 78
},
{
"epoch": 0.4619883040935672,
"grad_norm": 0.16510212997973622,
"learning_rate": 9.658817277334013e-06,
"loss": 0.2304,
"num_tokens": 25498031.0,
"step": 79
},
{
"epoch": 0.4678362573099415,
"grad_norm": 0.16563608461998558,
"learning_rate": 9.647869600489954e-06,
"loss": 0.231,
"num_tokens": 25812720.0,
"step": 80
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.17555857386256016,
"learning_rate": 9.63675619002585e-06,
"loss": 0.2323,
"num_tokens": 26123680.0,
"step": 81
},
{
"epoch": 0.47953216374269003,
"grad_norm": 0.164333445701762,
"learning_rate": 9.625477489992727e-06,
"loss": 0.2138,
"num_tokens": 26410771.0,
"step": 82
},
{
"epoch": 0.4853801169590643,
"grad_norm": 0.17696917107757262,
"learning_rate": 9.614033951045974e-06,
"loss": 0.2286,
"num_tokens": 26716396.0,
"step": 83
},
{
"epoch": 0.49122807017543857,
"grad_norm": 0.15710163009015682,
"learning_rate": 9.602426030427335e-06,
"loss": 0.22,
"num_tokens": 27038109.0,
"step": 84
},
{
"epoch": 0.49707602339181284,
"grad_norm": 0.1677938058002079,
"learning_rate": 9.590654191946645e-06,
"loss": 0.2327,
"num_tokens": 27372562.0,
"step": 85
},
{
"epoch": 0.5029239766081871,
"grad_norm": 0.15756946980734074,
"learning_rate": 9.578718905963289e-06,
"loss": 0.2274,
"num_tokens": 27719366.0,
"step": 86
},
{
"epoch": 0.5087719298245614,
"grad_norm": 0.16483666302272912,
"learning_rate": 9.566620649367418e-06,
"loss": 0.23,
"num_tokens": 28062728.0,
"step": 87
},
{
"epoch": 0.5146198830409356,
"grad_norm": 0.16380030907396115,
"learning_rate": 9.554359905560887e-06,
"loss": 0.2139,
"num_tokens": 28392885.0,
"step": 88
},
{
"epoch": 0.52046783625731,
"grad_norm": 0.15375223243216413,
"learning_rate": 9.541937164437942e-06,
"loss": 0.2249,
"num_tokens": 28727888.0,
"step": 89
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.16323721706648206,
"learning_rate": 9.52935292236565e-06,
"loss": 0.2206,
"num_tokens": 29040411.0,
"step": 90
},
{
"epoch": 0.5321637426900585,
"grad_norm": 0.16395159555865288,
"learning_rate": 9.516607682164058e-06,
"loss": 0.2077,
"num_tokens": 29363581.0,
"step": 91
},
{
"epoch": 0.5380116959064327,
"grad_norm": 0.173187036283939,
"learning_rate": 9.503701953086107e-06,
"loss": 0.2325,
"num_tokens": 29691373.0,
"step": 92
},
{
"epoch": 0.543859649122807,
"grad_norm": 0.15592117720334775,
"learning_rate": 9.490636250797288e-06,
"loss": 0.2215,
"num_tokens": 30026282.0,
"step": 93
},
{
"epoch": 0.5497076023391813,
"grad_norm": 0.16780226146886296,
"learning_rate": 9.477411097355025e-06,
"loss": 0.2266,
"num_tokens": 30357776.0,
"step": 94
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.16080371447382497,
"learning_rate": 9.464027021187833e-06,
"loss": 0.2261,
"num_tokens": 30675188.0,
"step": 95
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.15669972327863108,
"learning_rate": 9.450484557074188e-06,
"loss": 0.2247,
"num_tokens": 31041728.0,
"step": 96
},
{
"epoch": 0.5672514619883041,
"grad_norm": 0.15864033792743365,
"learning_rate": 9.43678424612117e-06,
"loss": 0.2219,
"num_tokens": 31373582.0,
"step": 97
},
{
"epoch": 0.5730994152046783,
"grad_norm": 0.15897622365337738,
"learning_rate": 9.422926635742834e-06,
"loss": 0.2124,
"num_tokens": 31693920.0,
"step": 98
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.1541883977143332,
"learning_rate": 9.40891227963835e-06,
"loss": 0.2089,
"num_tokens": 32016538.0,
"step": 99
},
{
"epoch": 0.5847953216374269,
"grad_norm": 0.16113178998178,
"learning_rate": 9.39474173776986e-06,
"loss": 0.2131,
"num_tokens": 32342868.0,
"step": 100
},
{
"epoch": 0.5906432748538012,
"grad_norm": 0.1639032352257192,
"learning_rate": 9.380415576340127e-06,
"loss": 0.2126,
"num_tokens": 32649722.0,
"step": 101
},
{
"epoch": 0.5964912280701754,
"grad_norm": 0.15812907943305207,
"learning_rate": 9.365934367769885e-06,
"loss": 0.2046,
"num_tokens": 32961156.0,
"step": 102
},
{
"epoch": 0.6023391812865497,
"grad_norm": 0.16040845291570488,
"learning_rate": 9.351298690674996e-06,
"loss": 0.212,
"num_tokens": 33286164.0,
"step": 103
},
{
"epoch": 0.6081871345029239,
"grad_norm": 0.15564337683728058,
"learning_rate": 9.33650912984331e-06,
"loss": 0.2112,
"num_tokens": 33634944.0,
"step": 104
},
{
"epoch": 0.6140350877192983,
"grad_norm": 0.16026166188911017,
"learning_rate": 9.321566276211304e-06,
"loss": 0.2167,
"num_tokens": 33940455.0,
"step": 105
},
{
"epoch": 0.6198830409356725,
"grad_norm": 0.15084775438103953,
"learning_rate": 9.306470726840472e-06,
"loss": 0.212,
"num_tokens": 34269432.0,
"step": 106
},
{
"epoch": 0.6257309941520468,
"grad_norm": 0.16379797154749554,
"learning_rate": 9.291223084893472e-06,
"loss": 0.2259,
"num_tokens": 34564983.0,
"step": 107
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.1626726779429298,
"learning_rate": 9.275823959610019e-06,
"loss": 0.2068,
"num_tokens": 34869398.0,
"step": 108
},
{
"epoch": 0.6374269005847953,
"grad_norm": 0.16973276732555354,
"learning_rate": 9.260273966282546e-06,
"loss": 0.2103,
"num_tokens": 35179769.0,
"step": 109
},
{
"epoch": 0.6432748538011696,
"grad_norm": 0.16573716072448422,
"learning_rate": 9.244573726231621e-06,
"loss": 0.209,
"num_tokens": 35489608.0,
"step": 110
},
{
"epoch": 0.6491228070175439,
"grad_norm": 0.16034467135549915,
"learning_rate": 9.22872386678111e-06,
"loss": 0.2056,
"num_tokens": 35795317.0,
"step": 111
},
{
"epoch": 0.6549707602339181,
"grad_norm": 0.16859253078446698,
"learning_rate": 9.212725021233135e-06,
"loss": 0.2105,
"num_tokens": 36108365.0,
"step": 112
},
{
"epoch": 0.6608187134502924,
"grad_norm": 0.17271477988986808,
"learning_rate": 9.196577828842738e-06,
"loss": 0.2075,
"num_tokens": 36435675.0,
"step": 113
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.17273881432682334,
"learning_rate": 9.180282934792369e-06,
"loss": 0.2054,
"num_tokens": 36737269.0,
"step": 114
},
{
"epoch": 0.672514619883041,
"grad_norm": 0.15988223863731596,
"learning_rate": 9.163840990166085e-06,
"loss": 0.2011,
"num_tokens": 37059436.0,
"step": 115
},
{
"epoch": 0.6783625730994152,
"grad_norm": 0.1693196726503627,
"learning_rate": 9.147252651923546e-06,
"loss": 0.2202,
"num_tokens": 37382958.0,
"step": 116
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.1605754092944871,
"learning_rate": 9.130518582873765e-06,
"loss": 0.2169,
"num_tokens": 37711301.0,
"step": 117
},
{
"epoch": 0.6900584795321637,
"grad_norm": 0.16182274147996495,
"learning_rate": 9.11363945164862e-06,
"loss": 0.2022,
"num_tokens": 38034357.0,
"step": 118
},
{
"epoch": 0.695906432748538,
"grad_norm": 0.17280602584782606,
"learning_rate": 9.096615932676138e-06,
"loss": 0.2011,
"num_tokens": 38349527.0,
"step": 119
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.16010466565814827,
"learning_rate": 9.079448706153554e-06,
"loss": 0.2016,
"num_tokens": 38654356.0,
"step": 120
},
{
"epoch": 0.7076023391812866,
"grad_norm": 0.15767250824653006,
"learning_rate": 9.062138458020128e-06,
"loss": 0.1917,
"num_tokens": 38962205.0,
"step": 121
},
{
"epoch": 0.7134502923976608,
"grad_norm": 0.17133096215596827,
"learning_rate": 9.044685879929734e-06,
"loss": 0.2087,
"num_tokens": 39267552.0,
"step": 122
},
{
"epoch": 0.7192982456140351,
"grad_norm": 0.15684580395831532,
"learning_rate": 9.027091669223228e-06,
"loss": 0.203,
"num_tokens": 39570208.0,
"step": 123
},
{
"epoch": 0.7251461988304093,
"grad_norm": 0.16789387397403432,
"learning_rate": 9.00935652890059e-06,
"loss": 0.2099,
"num_tokens": 39895778.0,
"step": 124
},
{
"epoch": 0.7309941520467836,
"grad_norm": 0.15658978810688212,
"learning_rate": 8.991481167592826e-06,
"loss": 0.2061,
"num_tokens": 40225470.0,
"step": 125
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.15588015347201137,
"learning_rate": 8.973466299533656e-06,
"loss": 0.2047,
"num_tokens": 40559255.0,
"step": 126
},
{
"epoch": 0.7426900584795322,
"grad_norm": 0.1750982604863698,
"learning_rate": 8.955312644530976e-06,
"loss": 0.1996,
"num_tokens": 40860587.0,
"step": 127
},
{
"epoch": 0.7485380116959064,
"grad_norm": 0.15381841682641284,
"learning_rate": 8.937020927938103e-06,
"loss": 0.2001,
"num_tokens": 41189624.0,
"step": 128
},
{
"epoch": 0.7543859649122807,
"grad_norm": 0.1577155995424487,
"learning_rate": 8.918591880624783e-06,
"loss": 0.2005,
"num_tokens": 41490687.0,
"step": 129
},
{
"epoch": 0.7602339181286549,
"grad_norm": 0.15962057074829455,
"learning_rate": 8.900026238947995e-06,
"loss": 0.2115,
"num_tokens": 41818157.0,
"step": 130
},
{
"epoch": 0.7660818713450293,
"grad_norm": 0.15033669547133874,
"learning_rate": 8.881324744722524e-06,
"loss": 0.1945,
"num_tokens": 42149764.0,
"step": 131
},
{
"epoch": 0.7719298245614035,
"grad_norm": 0.15680523481040093,
"learning_rate": 8.86248814519133e-06,
"loss": 0.204,
"num_tokens": 42452660.0,
"step": 132
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.19538772884868966,
"learning_rate": 8.843517192995673e-06,
"loss": 0.2094,
"num_tokens": 42762176.0,
"step": 133
},
{
"epoch": 0.783625730994152,
"grad_norm": 0.1497802316166281,
"learning_rate": 8.824412646145065e-06,
"loss": 0.206,
"num_tokens": 43122509.0,
"step": 134
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.17792632602872682,
"learning_rate": 8.805175267986955e-06,
"loss": 0.2021,
"num_tokens": 43438515.0,
"step": 135
},
{
"epoch": 0.7953216374269005,
"grad_norm": 0.16345383837628785,
"learning_rate": 8.785805827176256e-06,
"loss": 0.213,
"num_tokens": 43750567.0,
"step": 136
},
{
"epoch": 0.8011695906432749,
"grad_norm": 0.15988232783718637,
"learning_rate": 8.766305097644608e-06,
"loss": 0.2076,
"num_tokens": 44061251.0,
"step": 137
},
{
"epoch": 0.8070175438596491,
"grad_norm": 0.148560033989183,
"learning_rate": 8.746673858569478e-06,
"loss": 0.2056,
"num_tokens": 44402399.0,
"step": 138
},
{
"epoch": 0.8128654970760234,
"grad_norm": 0.15498081570702754,
"learning_rate": 8.726912894343e-06,
"loss": 0.2011,
"num_tokens": 44713760.0,
"step": 139
},
{
"epoch": 0.8187134502923976,
"grad_norm": 0.16409619767618208,
"learning_rate": 8.707022994540659e-06,
"loss": 0.1985,
"num_tokens": 45006733.0,
"step": 140
},
{
"epoch": 0.8245614035087719,
"grad_norm": 0.16387952215184107,
"learning_rate": 8.687004953889729e-06,
"loss": 0.2117,
"num_tokens": 45319558.0,
"step": 141
},
{
"epoch": 0.8304093567251462,
"grad_norm": 0.14972932521892918,
"learning_rate": 8.666859572237517e-06,
"loss": 0.2017,
"num_tokens": 45648581.0,
"step": 142
},
{
"epoch": 0.8362573099415205,
"grad_norm": 0.14894137892754095,
"learning_rate": 8.646587654519413e-06,
"loss": 0.2011,
"num_tokens": 45961313.0,
"step": 143
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.15579100817213587,
"learning_rate": 8.626190010726723e-06,
"loss": 0.1851,
"num_tokens": 46290840.0,
"step": 144
},
{
"epoch": 0.847953216374269,
"grad_norm": 0.154512527802127,
"learning_rate": 8.605667455874302e-06,
"loss": 0.1903,
"num_tokens": 46585564.0,
"step": 145
},
{
"epoch": 0.8538011695906432,
"grad_norm": 0.16017530332563623,
"learning_rate": 8.585020809967995e-06,
"loss": 0.2066,
"num_tokens": 46893844.0,
"step": 146
},
{
"epoch": 0.8596491228070176,
"grad_norm": 0.16010504287695315,
"learning_rate": 8.564250897971862e-06,
"loss": 0.2151,
"num_tokens": 47228507.0,
"step": 147
},
{
"epoch": 0.8654970760233918,
"grad_norm": 0.15911701846573467,
"learning_rate": 8.543358549775232e-06,
"loss": 0.2029,
"num_tokens": 47537550.0,
"step": 148
},
{
"epoch": 0.8713450292397661,
"grad_norm": 0.1385770650249908,
"learning_rate": 8.522344600159532e-06,
"loss": 0.1892,
"num_tokens": 47871896.0,
"step": 149
},
{
"epoch": 0.8771929824561403,
"grad_norm": 0.14241522573672255,
"learning_rate": 8.501209888764928e-06,
"loss": 0.2016,
"num_tokens": 48224890.0,
"step": 150
},
{
"epoch": 0.8830409356725146,
"grad_norm": 0.16042004560579917,
"learning_rate": 8.479955260056793e-06,
"loss": 0.2293,
"num_tokens": 48551394.0,
"step": 151
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.1606415549333606,
"learning_rate": 8.458581563291948e-06,
"loss": 0.1993,
"num_tokens": 48869584.0,
"step": 152
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.13692085860159872,
"learning_rate": 8.437089652484735e-06,
"loss": 0.188,
"num_tokens": 49220358.0,
"step": 153
},
{
"epoch": 0.9005847953216374,
"grad_norm": 0.15029963686711117,
"learning_rate": 8.415480386372901e-06,
"loss": 0.2176,
"num_tokens": 49555633.0,
"step": 154
},
{
"epoch": 0.9064327485380117,
"grad_norm": 0.15377050265299294,
"learning_rate": 8.393754628383274e-06,
"loss": 0.2078,
"num_tokens": 49857399.0,
"step": 155
},
{
"epoch": 0.9122807017543859,
"grad_norm": 0.1600390896381573,
"learning_rate": 8.371913246597272e-06,
"loss": 0.1987,
"num_tokens": 50208717.0,
"step": 156
},
{
"epoch": 0.9181286549707602,
"grad_norm": 0.1709173684661271,
"learning_rate": 8.349957113716213e-06,
"loss": 0.212,
"num_tokens": 50502126.0,
"step": 157
},
{
"epoch": 0.9239766081871345,
"grad_norm": 0.16675835187365423,
"learning_rate": 8.327887107026445e-06,
"loss": 0.2237,
"num_tokens": 50820497.0,
"step": 158
},
{
"epoch": 0.9298245614035088,
"grad_norm": 0.14395552142792745,
"learning_rate": 8.305704108364301e-06,
"loss": 0.2076,
"num_tokens": 51154766.0,
"step": 159
},
{
"epoch": 0.935672514619883,
"grad_norm": 0.15573171280863216,
"learning_rate": 8.283409004080853e-06,
"loss": 0.2114,
"num_tokens": 51491802.0,
"step": 160
},
{
"epoch": 0.9415204678362573,
"grad_norm": 0.158518456781856,
"learning_rate": 8.261002685006503e-06,
"loss": 0.2224,
"num_tokens": 51818025.0,
"step": 161
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.15531715834460813,
"learning_rate": 8.238486046415385e-06,
"loss": 0.1937,
"num_tokens": 52118378.0,
"step": 162
},
{
"epoch": 0.9532163742690059,
"grad_norm": 0.1515317024708392,
"learning_rate": 8.2158599879896e-06,
"loss": 0.1968,
"num_tokens": 52428129.0,
"step": 163
},
{
"epoch": 0.9590643274853801,
"grad_norm": 0.14424883914854034,
"learning_rate": 8.19312541378326e-06,
"loss": 0.193,
"num_tokens": 52735470.0,
"step": 164
},
{
"epoch": 0.9649122807017544,
"grad_norm": 0.15498488683654527,
"learning_rate": 8.170283232186365e-06,
"loss": 0.1943,
"num_tokens": 53051592.0,
"step": 165
},
{
"epoch": 0.9707602339181286,
"grad_norm": 0.1754541926428641,
"learning_rate": 8.14733435588852e-06,
"loss": 0.2214,
"num_tokens": 53343217.0,
"step": 166
},
{
"epoch": 0.9766081871345029,
"grad_norm": 0.14868906024546139,
"learning_rate": 8.12427970184245e-06,
"loss": 0.1935,
"num_tokens": 53658830.0,
"step": 167
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.1500168809044866,
"learning_rate": 8.101120191227374e-06,
"loss": 0.1981,
"num_tokens": 53999419.0,
"step": 168
},
{
"epoch": 0.9883040935672515,
"grad_norm": 0.14699416276573674,
"learning_rate": 8.07785674941219e-06,
"loss": 0.194,
"num_tokens": 54309901.0,
"step": 169
},
{
"epoch": 0.9941520467836257,
"grad_norm": 0.15232473722185103,
"learning_rate": 8.054490305918512e-06,
"loss": 0.1955,
"num_tokens": 54612844.0,
"step": 170
},
{
"epoch": 1.0,
"grad_norm": 0.1607048554407368,
"learning_rate": 8.031021794383513e-06,
"loss": 0.2092,
"num_tokens": 54926459.0,
"step": 171
},
{
"epoch": 1.0058479532163742,
"grad_norm": 0.1625217088750646,
"learning_rate": 8.007452152522639e-06,
"loss": 0.1875,
"num_tokens": 55222609.0,
"step": 172
},
{
"epoch": 1.0116959064327484,
"grad_norm": 0.15670723402140246,
"learning_rate": 7.983782322092126e-06,
"loss": 0.1938,
"num_tokens": 55537898.0,
"step": 173
},
{
"epoch": 1.0175438596491229,
"grad_norm": 0.14242775964832494,
"learning_rate": 7.960013248851375e-06,
"loss": 0.1882,
"num_tokens": 55862238.0,
"step": 174
},
{
"epoch": 1.023391812865497,
"grad_norm": 0.1606453358565539,
"learning_rate": 7.936145882525174e-06,
"loss": 0.1877,
"num_tokens": 56180559.0,
"step": 175
},
{
"epoch": 1.0292397660818713,
"grad_norm": 0.15292057131184103,
"learning_rate": 7.91218117676573e-06,
"loss": 0.1783,
"num_tokens": 56530315.0,
"step": 176
},
{
"epoch": 1.0350877192982457,
"grad_norm": 0.14722221394043097,
"learning_rate": 7.888120089114586e-06,
"loss": 0.1758,
"num_tokens": 56837967.0,
"step": 177
},
{
"epoch": 1.04093567251462,
"grad_norm": 0.15795692601491945,
"learning_rate": 7.863963580964344e-06,
"loss": 0.1772,
"num_tokens": 57149693.0,
"step": 178
},
{
"epoch": 1.0467836257309941,
"grad_norm": 0.1700985282278579,
"learning_rate": 7.839712617520263e-06,
"loss": 0.1897,
"num_tokens": 57481658.0,
"step": 179
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.14557972709377917,
"learning_rate": 7.815368167761686e-06,
"loss": 0.1797,
"num_tokens": 57795613.0,
"step": 180
},
{
"epoch": 1.0584795321637426,
"grad_norm": 0.15501104048829578,
"learning_rate": 7.790931204403323e-06,
"loss": 0.177,
"num_tokens": 58094562.0,
"step": 181
},
{
"epoch": 1.064327485380117,
"grad_norm": 0.1521089989961407,
"learning_rate": 7.766402703856391e-06,
"loss": 0.1748,
"num_tokens": 58418586.0,
"step": 182
},
{
"epoch": 1.0701754385964912,
"grad_norm": 0.16486785923579997,
"learning_rate": 7.741783646189597e-06,
"loss": 0.1927,
"num_tokens": 58745927.0,
"step": 183
},
{
"epoch": 1.0760233918128654,
"grad_norm": 0.15410381183903402,
"learning_rate": 7.717075015089976e-06,
"loss": 0.1884,
"num_tokens": 59070496.0,
"step": 184
},
{
"epoch": 1.0818713450292399,
"grad_norm": 0.1444493695200652,
"learning_rate": 7.692277797823585e-06,
"loss": 0.1755,
"num_tokens": 59388680.0,
"step": 185
},
{
"epoch": 1.087719298245614,
"grad_norm": 0.15330407620774641,
"learning_rate": 7.667392985196064e-06,
"loss": 0.1866,
"num_tokens": 59707236.0,
"step": 186
},
{
"epoch": 1.0935672514619883,
"grad_norm": 0.16477562582655433,
"learning_rate": 7.64242157151304e-06,
"loss": 0.1999,
"num_tokens": 60042655.0,
"step": 187
},
{
"epoch": 1.0994152046783625,
"grad_norm": 0.140060196586728,
"learning_rate": 7.6173645545404e-06,
"loss": 0.1834,
"num_tokens": 60397091.0,
"step": 188
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.1525186599047059,
"learning_rate": 7.5922229354644195e-06,
"loss": 0.1811,
"num_tokens": 60707243.0,
"step": 189
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.14180526703114305,
"learning_rate": 7.56699771885177e-06,
"loss": 0.1789,
"num_tokens": 61056021.0,
"step": 190
},
{
"epoch": 1.1169590643274854,
"grad_norm": 0.14606069061084653,
"learning_rate": 7.541689912609365e-06,
"loss": 0.1833,
"num_tokens": 61381476.0,
"step": 191
},
{
"epoch": 1.1228070175438596,
"grad_norm": 0.1452299835582357,
"learning_rate": 7.516300527944104e-06,
"loss": 0.1889,
"num_tokens": 61710931.0,
"step": 192
},
{
"epoch": 1.128654970760234,
"grad_norm": 0.1607339684687444,
"learning_rate": 7.4908305793224565e-06,
"loss": 0.1891,
"num_tokens": 62048426.0,
"step": 193
},
{
"epoch": 1.1345029239766082,
"grad_norm": 0.15970871017649693,
"learning_rate": 7.465281084429931e-06,
"loss": 0.1841,
"num_tokens": 62347583.0,
"step": 194
},
{
"epoch": 1.1403508771929824,
"grad_norm": 0.15135926518093104,
"learning_rate": 7.4396530641304135e-06,
"loss": 0.1817,
"num_tokens": 62662619.0,
"step": 195
},
{
"epoch": 1.1461988304093567,
"grad_norm": 0.1463984323420409,
"learning_rate": 7.413947542425377e-06,
"loss": 0.1795,
"num_tokens": 62990146.0,
"step": 196
},
{
"epoch": 1.1520467836257309,
"grad_norm": 0.15028061491082353,
"learning_rate": 7.388165546412967e-06,
"loss": 0.1809,
"num_tokens": 63314531.0,
"step": 197
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.15759301916806728,
"learning_rate": 7.362308106246956e-06,
"loss": 0.1842,
"num_tokens": 63647247.0,
"step": 198
},
{
"epoch": 1.1637426900584795,
"grad_norm": 0.14433827296829588,
"learning_rate": 7.336376255095592e-06,
"loss": 0.1758,
"num_tokens": 63974328.0,
"step": 199
},
{
"epoch": 1.1695906432748537,
"grad_norm": 0.1489036598644256,
"learning_rate": 7.3103710291003134e-06,
"loss": 0.1832,
"num_tokens": 64295392.0,
"step": 200
},
{
"epoch": 1.1754385964912282,
"grad_norm": 0.1621435575204086,
"learning_rate": 7.284293467334344e-06,
"loss": 0.1829,
"num_tokens": 64601120.0,
"step": 201
},
{
"epoch": 1.1812865497076024,
"grad_norm": 0.15685350805242304,
"learning_rate": 7.258144611761181e-06,
"loss": 0.1828,
"num_tokens": 64910553.0,
"step": 202
},
{
"epoch": 1.1871345029239766,
"grad_norm": 0.1537822114754735,
"learning_rate": 7.23192550719296e-06,
"loss": 0.1786,
"num_tokens": 65230586.0,
"step": 203
},
{
"epoch": 1.1929824561403508,
"grad_norm": 0.1522958629898793,
"learning_rate": 7.2056372012487065e-06,
"loss": 0.1858,
"num_tokens": 65576822.0,
"step": 204
},
{
"epoch": 1.198830409356725,
"grad_norm": 0.15072282593856123,
"learning_rate": 7.179280744312481e-06,
"loss": 0.1717,
"num_tokens": 65892198.0,
"step": 205
},
{
"epoch": 1.2046783625730995,
"grad_norm": 0.14679430331251794,
"learning_rate": 7.152857189491406e-06,
"loss": 0.1709,
"num_tokens": 66218113.0,
"step": 206
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.14528524091480893,
"learning_rate": 7.126367592573589e-06,
"loss": 0.172,
"num_tokens": 66560316.0,
"step": 207
},
{
"epoch": 1.2163742690058479,
"grad_norm": 0.1513531832369065,
"learning_rate": 7.099813011985936e-06,
"loss": 0.1867,
"num_tokens": 66886426.0,
"step": 208
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.14889506905441677,
"learning_rate": 7.073194508751863e-06,
"loss": 0.184,
"num_tokens": 67205115.0,
"step": 209
},
{
"epoch": 1.2280701754385965,
"grad_norm": 0.1654317181387804,
"learning_rate": 7.046513146448899e-06,
"loss": 0.1892,
"num_tokens": 67509071.0,
"step": 210
},
{
"epoch": 1.2339181286549707,
"grad_norm": 0.1510266755197108,
"learning_rate": 7.019769991166189e-06,
"loss": 0.1788,
"num_tokens": 67841682.0,
"step": 211
},
{
"epoch": 1.239766081871345,
"grad_norm": 0.1419685060740966,
"learning_rate": 6.992966111461903e-06,
"loss": 0.1685,
"num_tokens": 68147715.0,
"step": 212
},
{
"epoch": 1.2456140350877192,
"grad_norm": 0.14993568993277867,
"learning_rate": 6.966102578320531e-06,
"loss": 0.1764,
"num_tokens": 68463460.0,
"step": 213
},
{
"epoch": 1.2514619883040936,
"grad_norm": 0.14659321843622847,
"learning_rate": 6.9391804651100924e-06,
"loss": 0.1897,
"num_tokens": 68799959.0,
"step": 214
},
{
"epoch": 1.2573099415204678,
"grad_norm": 0.15073229254770368,
"learning_rate": 6.912200847539261e-06,
"loss": 0.172,
"num_tokens": 69111397.0,
"step": 215
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.15759494389625772,
"learning_rate": 6.885164803614366e-06,
"loss": 0.1838,
"num_tokens": 69392630.0,
"step": 216
},
{
"epoch": 1.2690058479532165,
"grad_norm": 0.14630639885005334,
"learning_rate": 6.858073413596324e-06,
"loss": 0.1807,
"num_tokens": 69701641.0,
"step": 217
},
{
"epoch": 1.2748538011695907,
"grad_norm": 0.14730200293827667,
"learning_rate": 6.830927759957487e-06,
"loss": 0.1813,
"num_tokens": 70030091.0,
"step": 218
},
{
"epoch": 1.280701754385965,
"grad_norm": 0.14753719595946904,
"learning_rate": 6.80372892733837e-06,
"loss": 0.1804,
"num_tokens": 70348775.0,
"step": 219
},
{
"epoch": 1.286549707602339,
"grad_norm": 0.16483571370877223,
"learning_rate": 6.776478002504335e-06,
"loss": 0.1859,
"num_tokens": 70651692.0,
"step": 220
},
{
"epoch": 1.2923976608187133,
"grad_norm": 0.16648842833487107,
"learning_rate": 6.7491760743021535e-06,
"loss": 0.1903,
"num_tokens": 70956682.0,
"step": 221
},
{
"epoch": 1.2982456140350878,
"grad_norm": 0.14671697438686584,
"learning_rate": 6.721824233616503e-06,
"loss": 0.1758,
"num_tokens": 71265203.0,
"step": 222
},
{
"epoch": 1.304093567251462,
"grad_norm": 0.15188899959419136,
"learning_rate": 6.694423573326382e-06,
"loss": 0.1782,
"num_tokens": 71583993.0,
"step": 223
},
{
"epoch": 1.3099415204678362,
"grad_norm": 0.14838740052418056,
"learning_rate": 6.666975188261437e-06,
"loss": 0.182,
"num_tokens": 71899719.0,
"step": 224
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.1576944297988978,
"learning_rate": 6.639480175158227e-06,
"loss": 0.1784,
"num_tokens": 72202094.0,
"step": 225
},
{
"epoch": 1.3216374269005848,
"grad_norm": 0.15597436870104375,
"learning_rate": 6.611939632616394e-06,
"loss": 0.1784,
"num_tokens": 72516371.0,
"step": 226
},
{
"epoch": 1.327485380116959,
"grad_norm": 0.15325246999833303,
"learning_rate": 6.584354661054765e-06,
"loss": 0.1842,
"num_tokens": 72828007.0,
"step": 227
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.14935801239745722,
"learning_rate": 6.556726362667394e-06,
"loss": 0.1833,
"num_tokens": 73134243.0,
"step": 228
},
{
"epoch": 1.3391812865497075,
"grad_norm": 0.16481075845453566,
"learning_rate": 6.529055841379509e-06,
"loss": 0.176,
"num_tokens": 73436138.0,
"step": 229
},
{
"epoch": 1.345029239766082,
"grad_norm": 0.14125268538033928,
"learning_rate": 6.501344202803415e-06,
"loss": 0.1708,
"num_tokens": 73760046.0,
"step": 230
},
{
"epoch": 1.3508771929824561,
"grad_norm": 0.1501570731496053,
"learning_rate": 6.473592554194311e-06,
"loss": 0.1826,
"num_tokens": 74077480.0,
"step": 231
},
{
"epoch": 1.3567251461988303,
"grad_norm": 0.15771910225549807,
"learning_rate": 6.445802004406047e-06,
"loss": 0.1922,
"num_tokens": 74423874.0,
"step": 232
},
{
"epoch": 1.3625730994152048,
"grad_norm": 0.15600805460262265,
"learning_rate": 6.417973663846826e-06,
"loss": 0.1749,
"num_tokens": 74745816.0,
"step": 233
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.14516377176873183,
"learning_rate": 6.390108644434828e-06,
"loss": 0.18,
"num_tokens": 75092262.0,
"step": 234
},
{
"epoch": 1.3742690058479532,
"grad_norm": 0.14392359169053118,
"learning_rate": 6.362208059553786e-06,
"loss": 0.1799,
"num_tokens": 75442533.0,
"step": 235
},
{
"epoch": 1.3801169590643274,
"grad_norm": 0.1548508531809334,
"learning_rate": 6.334273024008499e-06,
"loss": 0.1705,
"num_tokens": 75775480.0,
"step": 236
},
{
"epoch": 1.3859649122807016,
"grad_norm": 0.15386739061806035,
"learning_rate": 6.306304653980286e-06,
"loss": 0.1722,
"num_tokens": 76066461.0,
"step": 237
},
{
"epoch": 1.391812865497076,
"grad_norm": 0.14831080775519306,
"learning_rate": 6.278304066982391e-06,
"loss": 0.1836,
"num_tokens": 76401700.0,
"step": 238
},
{
"epoch": 1.3976608187134503,
"grad_norm": 0.14755841590724592,
"learning_rate": 6.250272381815331e-06,
"loss": 0.1802,
"num_tokens": 76714274.0,
"step": 239
},
{
"epoch": 1.4035087719298245,
"grad_norm": 0.15910670160937837,
"learning_rate": 6.222210718522187e-06,
"loss": 0.2031,
"num_tokens": 77028246.0,
"step": 240
},
{
"epoch": 1.409356725146199,
"grad_norm": 0.15280436173000247,
"learning_rate": 6.19412019834386e-06,
"loss": 0.1742,
"num_tokens": 77364346.0,
"step": 241
},
{
"epoch": 1.4152046783625731,
"grad_norm": 0.13887335477707105,
"learning_rate": 6.166001943674266e-06,
"loss": 0.1785,
"num_tokens": 77748583.0,
"step": 242
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.1577201352102885,
"learning_rate": 6.137857078015487e-06,
"loss": 0.1863,
"num_tokens": 78064140.0,
"step": 243
},
{
"epoch": 1.4269005847953216,
"grad_norm": 0.14303385830957374,
"learning_rate": 6.109686725932882e-06,
"loss": 0.1813,
"num_tokens": 78411157.0,
"step": 244
},
{
"epoch": 1.4327485380116958,
"grad_norm": 0.14790866721582488,
"learning_rate": 6.081492013010154e-06,
"loss": 0.1778,
"num_tokens": 78723584.0,
"step": 245
},
{
"epoch": 1.4385964912280702,
"grad_norm": 0.14901104148022398,
"learning_rate": 6.0532740658043785e-06,
"loss": 0.1807,
"num_tokens": 79054107.0,
"step": 246
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.1485395120018961,
"learning_rate": 6.025034011800989e-06,
"loss": 0.187,
"num_tokens": 79386694.0,
"step": 247
},
{
"epoch": 1.4502923976608186,
"grad_norm": 0.14647752236417627,
"learning_rate": 5.996772979368715e-06,
"loss": 0.1849,
"num_tokens": 79718178.0,
"step": 248
},
{
"epoch": 1.456140350877193,
"grad_norm": 0.15032391233353223,
"learning_rate": 5.968492097714519e-06,
"loss": 0.1744,
"num_tokens": 80013286.0,
"step": 249
},
{
"epoch": 1.4619883040935673,
"grad_norm": 0.13111163977489035,
"learning_rate": 5.940192496838456e-06,
"loss": 0.1683,
"num_tokens": 80358177.0,
"step": 250
},
{
"epoch": 1.4678362573099415,
"grad_norm": 0.14507217241507256,
"learning_rate": 5.911875307488543e-06,
"loss": 0.1697,
"num_tokens": 80664979.0,
"step": 251
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.14371510606580692,
"learning_rate": 5.883541661115555e-06,
"loss": 0.183,
"num_tokens": 81008531.0,
"step": 252
},
{
"epoch": 1.47953216374269,
"grad_norm": 0.14575959771926755,
"learning_rate": 5.855192689827838e-06,
"loss": 0.1784,
"num_tokens": 81327068.0,
"step": 253
},
{
"epoch": 1.4853801169590644,
"grad_norm": 0.14386473495114957,
"learning_rate": 5.8268295263460625e-06,
"loss": 0.186,
"num_tokens": 81660168.0,
"step": 254
},
{
"epoch": 1.4912280701754386,
"grad_norm": 0.15665862540575096,
"learning_rate": 5.798453303957968e-06,
"loss": 0.1852,
"num_tokens": 81988344.0,
"step": 255
},
{
"epoch": 1.4970760233918128,
"grad_norm": 0.1486254433584565,
"learning_rate": 5.77006515647308e-06,
"loss": 0.1815,
"num_tokens": 82292457.0,
"step": 256
},
{
"epoch": 1.5029239766081872,
"grad_norm": 0.15218761718926124,
"learning_rate": 5.741666218177402e-06,
"loss": 0.1754,
"num_tokens": 82577890.0,
"step": 257
},
{
"epoch": 1.5087719298245614,
"grad_norm": 0.1538621360954969,
"learning_rate": 5.7132576237881075e-06,
"loss": 0.1855,
"num_tokens": 82874407.0,
"step": 258
},
{
"epoch": 1.5146198830409356,
"grad_norm": 0.16534859264165339,
"learning_rate": 5.684840508408183e-06,
"loss": 0.187,
"num_tokens": 83181722.0,
"step": 259
},
{
"epoch": 1.52046783625731,
"grad_norm": 0.15058626722493648,
"learning_rate": 5.656416007481089e-06,
"loss": 0.1793,
"num_tokens": 83483808.0,
"step": 260
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.14392786696974194,
"learning_rate": 5.627985256745384e-06,
"loss": 0.1773,
"num_tokens": 83826918.0,
"step": 261
},
{
"epoch": 1.5321637426900585,
"grad_norm": 0.1547539344312234,
"learning_rate": 5.5995493921893415e-06,
"loss": 0.1747,
"num_tokens": 84129500.0,
"step": 262
},
{
"epoch": 1.5380116959064327,
"grad_norm": 0.15476418511066206,
"learning_rate": 5.571109550005571e-06,
"loss": 0.1831,
"num_tokens": 84454231.0,
"step": 263
},
{
"epoch": 1.543859649122807,
"grad_norm": 0.15292624983775452,
"learning_rate": 5.542666866545609e-06,
"loss": 0.1772,
"num_tokens": 84758320.0,
"step": 264
},
{
"epoch": 1.5497076023391814,
"grad_norm": 0.14323940203700627,
"learning_rate": 5.5142224782745175e-06,
"loss": 0.1742,
"num_tokens": 85064629.0,
"step": 265
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.1569122030468075,
"learning_rate": 5.485777521725485e-06,
"loss": 0.1823,
"num_tokens": 85358431.0,
"step": 266
},
{
"epoch": 1.5614035087719298,
"grad_norm": 0.1571283435279191,
"learning_rate": 5.457333133454394e-06,
"loss": 0.194,
"num_tokens": 85681659.0,
"step": 267
},
{
"epoch": 1.5672514619883042,
"grad_norm": 0.15050727661326063,
"learning_rate": 5.4288904499944304e-06,
"loss": 0.1843,
"num_tokens": 86001433.0,
"step": 268
},
{
"epoch": 1.5730994152046782,
"grad_norm": 0.13840027248721531,
"learning_rate": 5.40045060781066e-06,
"loss": 0.1828,
"num_tokens": 86349235.0,
"step": 269
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.154830903482515,
"learning_rate": 5.3720147432546175e-06,
"loss": 0.1891,
"num_tokens": 86670991.0,
"step": 270
},
{
"epoch": 1.5847953216374269,
"grad_norm": 0.14875604221537664,
"learning_rate": 5.343583992518911e-06,
"loss": 0.1838,
"num_tokens": 87005937.0,
"step": 271
},
{
"epoch": 1.590643274853801,
"grad_norm": 0.1550420196686663,
"learning_rate": 5.315159491591818e-06,
"loss": 0.1932,
"num_tokens": 87355743.0,
"step": 272
},
{
"epoch": 1.5964912280701755,
"grad_norm": 0.16190605189165996,
"learning_rate": 5.286742376211894e-06,
"loss": 0.1873,
"num_tokens": 87666062.0,
"step": 273
},
{
"epoch": 1.6023391812865497,
"grad_norm": 0.1499999734628863,
"learning_rate": 5.2583337818226e-06,
"loss": 0.179,
"num_tokens": 88010557.0,
"step": 274
},
{
"epoch": 1.608187134502924,
"grad_norm": 0.14174519866822582,
"learning_rate": 5.229934843526922e-06,
"loss": 0.1839,
"num_tokens": 88348530.0,
"step": 275
},
{
"epoch": 1.6140350877192984,
"grad_norm": 0.14514733616156453,
"learning_rate": 5.201546696042033e-06,
"loss": 0.1732,
"num_tokens": 88660232.0,
"step": 276
},
{
"epoch": 1.6198830409356724,
"grad_norm": 0.15247902901919175,
"learning_rate": 5.173170473653939e-06,
"loss": 0.1838,
"num_tokens": 88986178.0,
"step": 277
},
{
"epoch": 1.6257309941520468,
"grad_norm": 0.15161619030379697,
"learning_rate": 5.1448073101721644e-06,
"loss": 0.184,
"num_tokens": 89306790.0,
"step": 278
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.15076713301794256,
"learning_rate": 5.1164583388844476e-06,
"loss": 0.1764,
"num_tokens": 89608787.0,
"step": 279
},
{
"epoch": 1.6374269005847952,
"grad_norm": 0.15081569071358755,
"learning_rate": 5.0881246925114595e-06,
"loss": 0.1841,
"num_tokens": 89925196.0,
"step": 280
},
{
"epoch": 1.6432748538011697,
"grad_norm": 0.14667932336134215,
"learning_rate": 5.0598075031615445e-06,
"loss": 0.1714,
"num_tokens": 90246158.0,
"step": 281
},
{
"epoch": 1.6491228070175439,
"grad_norm": 0.14238827156504316,
"learning_rate": 5.031507902285483e-06,
"loss": 0.1675,
"num_tokens": 90530735.0,
"step": 282
},
{
"epoch": 1.654970760233918,
"grad_norm": 0.15453119377842958,
"learning_rate": 5.003227020631287e-06,
"loss": 0.1822,
"num_tokens": 90867029.0,
"step": 283
},
{
"epoch": 1.6608187134502925,
"grad_norm": 0.14188421149596725,
"learning_rate": 4.974965988199015e-06,
"loss": 0.1782,
"num_tokens": 91197724.0,
"step": 284
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.13924871637746586,
"learning_rate": 4.946725934195622e-06,
"loss": 0.1687,
"num_tokens": 91501722.0,
"step": 285
},
{
"epoch": 1.672514619883041,
"grad_norm": 0.13972442531890047,
"learning_rate": 4.918507986989848e-06,
"loss": 0.1721,
"num_tokens": 91828252.0,
"step": 286
},
{
"epoch": 1.6783625730994152,
"grad_norm": 0.14136406326583859,
"learning_rate": 4.890313274067121e-06,
"loss": 0.1787,
"num_tokens": 92160609.0,
"step": 287
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.14317191939225465,
"learning_rate": 4.862142921984514e-06,
"loss": 0.1816,
"num_tokens": 92492656.0,
"step": 288
},
{
"epoch": 1.6900584795321638,
"grad_norm": 0.15273877472629271,
"learning_rate": 4.8339980563257345e-06,
"loss": 0.1778,
"num_tokens": 92783619.0,
"step": 289
},
{
"epoch": 1.695906432748538,
"grad_norm": 0.1488984953802957,
"learning_rate": 4.80587980165614e-06,
"loss": 0.1605,
"num_tokens": 93088774.0,
"step": 290
},
{
"epoch": 1.7017543859649122,
"grad_norm": 0.14456947085083469,
"learning_rate": 4.7777892814778145e-06,
"loss": 0.185,
"num_tokens": 93417888.0,
"step": 291
},
{
"epoch": 1.7076023391812867,
"grad_norm": 0.13947315988135284,
"learning_rate": 4.749727618184672e-06,
"loss": 0.1673,
"num_tokens": 93733741.0,
"step": 292
},
{
"epoch": 1.7134502923976607,
"grad_norm": 0.14851734535481514,
"learning_rate": 4.72169593301761e-06,
"loss": 0.1819,
"num_tokens": 94047466.0,
"step": 293
},
{
"epoch": 1.719298245614035,
"grad_norm": 0.14831214767698989,
"learning_rate": 4.693695346019715e-06,
"loss": 0.1771,
"num_tokens": 94353138.0,
"step": 294
},
{
"epoch": 1.7251461988304093,
"grad_norm": 0.14962487368007496,
"learning_rate": 4.665726975991502e-06,
"loss": 0.1796,
"num_tokens": 94660877.0,
"step": 295
},
{
"epoch": 1.7309941520467835,
"grad_norm": 0.15275250425533088,
"learning_rate": 4.637791940446216e-06,
"loss": 0.1819,
"num_tokens": 94977362.0,
"step": 296
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.1444275240489397,
"learning_rate": 4.609891355565172e-06,
"loss": 0.1746,
"num_tokens": 95287766.0,
"step": 297
},
{
"epoch": 1.7426900584795322,
"grad_norm": 0.14585881193254274,
"learning_rate": 4.582026336153175e-06,
"loss": 0.1751,
"num_tokens": 95580594.0,
"step": 298
},
{
"epoch": 1.7485380116959064,
"grad_norm": 0.14505549895976488,
"learning_rate": 4.554197995593953e-06,
"loss": 0.1811,
"num_tokens": 95914105.0,
"step": 299
},
{
"epoch": 1.7543859649122808,
"grad_norm": 0.14480466196132438,
"learning_rate": 4.526407445805692e-06,
"loss": 0.1734,
"num_tokens": 96221354.0,
"step": 300
},
{
"epoch": 1.7602339181286548,
"grad_norm": 0.14878097366426038,
"learning_rate": 4.4986557971965865e-06,
"loss": 0.1717,
"num_tokens": 96503140.0,
"step": 301
},
{
"epoch": 1.7660818713450293,
"grad_norm": 0.13546319756214165,
"learning_rate": 4.4709441586204924e-06,
"loss": 0.1787,
"num_tokens": 96862132.0,
"step": 302
},
{
"epoch": 1.7719298245614035,
"grad_norm": 0.1545758606312762,
"learning_rate": 4.443273637332607e-06,
"loss": 0.1817,
"num_tokens": 97181205.0,
"step": 303
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.14857553617463332,
"learning_rate": 4.415645338945236e-06,
"loss": 0.1835,
"num_tokens": 97496038.0,
"step": 304
},
{
"epoch": 1.7836257309941521,
"grad_norm": 0.1433152779712938,
"learning_rate": 4.388060367383607e-06,
"loss": 0.1807,
"num_tokens": 97811641.0,
"step": 305
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.15284420636801233,
"learning_rate": 4.3605198248417745e-06,
"loss": 0.1904,
"num_tokens": 98139634.0,
"step": 306
},
{
"epoch": 1.7953216374269005,
"grad_norm": 0.15285620589846677,
"learning_rate": 4.333024811738565e-06,
"loss": 0.1794,
"num_tokens": 98454687.0,
"step": 307
},
{
"epoch": 1.801169590643275,
"grad_norm": 0.14480146717017858,
"learning_rate": 4.305576426673621e-06,
"loss": 0.1733,
"num_tokens": 98768020.0,
"step": 308
},
{
"epoch": 1.807017543859649,
"grad_norm": 0.14587759969006922,
"learning_rate": 4.278175766383499e-06,
"loss": 0.1788,
"num_tokens": 99089607.0,
"step": 309
},
{
"epoch": 1.8128654970760234,
"grad_norm": 0.14456314742064028,
"learning_rate": 4.250823925697848e-06,
"loss": 0.1728,
"num_tokens": 99396998.0,
"step": 310
},
{
"epoch": 1.8187134502923976,
"grad_norm": 0.14323326579292064,
"learning_rate": 4.223521997495665e-06,
"loss": 0.1774,
"num_tokens": 99716575.0,
"step": 311
},
{
"epoch": 1.8245614035087718,
"grad_norm": 0.1325445041745875,
"learning_rate": 4.196271072661631e-06,
"loss": 0.168,
"num_tokens": 100064416.0,
"step": 312
},
{
"epoch": 1.8304093567251463,
"grad_norm": 0.13327008176494723,
"learning_rate": 4.169072240042514e-06,
"loss": 0.1699,
"num_tokens": 100409121.0,
"step": 313
},
{
"epoch": 1.8362573099415205,
"grad_norm": 0.13889911640255415,
"learning_rate": 4.141926586403677e-06,
"loss": 0.1805,
"num_tokens": 100765220.0,
"step": 314
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.1454927979121013,
"learning_rate": 4.114835196385636e-06,
"loss": 0.1842,
"num_tokens": 101081641.0,
"step": 315
},
{
"epoch": 1.8479532163742691,
"grad_norm": 0.1404945457994314,
"learning_rate": 4.08779915246074e-06,
"loss": 0.1822,
"num_tokens": 101412150.0,
"step": 316
},
{
"epoch": 1.8538011695906431,
"grad_norm": 0.13980685227323844,
"learning_rate": 4.060819534889909e-06,
"loss": 0.1799,
"num_tokens": 101744218.0,
"step": 317
},
{
"epoch": 1.8596491228070176,
"grad_norm": 0.14229034728182074,
"learning_rate": 4.033897421679472e-06,
"loss": 0.1694,
"num_tokens": 102086143.0,
"step": 318
},
{
"epoch": 1.8654970760233918,
"grad_norm": 0.1409421468030685,
"learning_rate": 4.0070338885381e-06,
"loss": 0.183,
"num_tokens": 102386095.0,
"step": 319
},
{
"epoch": 1.871345029239766,
"grad_norm": 0.14131996708764916,
"learning_rate": 3.980230008833812e-06,
"loss": 0.1784,
"num_tokens": 102724647.0,
"step": 320
},
{
"epoch": 1.8771929824561404,
"grad_norm": 0.14060583568522164,
"learning_rate": 3.953486853551104e-06,
"loss": 0.1611,
"num_tokens": 103019942.0,
"step": 321
},
{
"epoch": 1.8830409356725146,
"grad_norm": 0.13762893362884943,
"learning_rate": 3.926805491248138e-06,
"loss": 0.1681,
"num_tokens": 103358784.0,
"step": 322
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.15361551995597972,
"learning_rate": 3.900186988014065e-06,
"loss": 0.1825,
"num_tokens": 103660262.0,
"step": 323
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.13699916161747416,
"learning_rate": 3.873632407426412e-06,
"loss": 0.179,
"num_tokens": 103981503.0,
"step": 324
},
{
"epoch": 1.9005847953216373,
"grad_norm": 0.14959870826890115,
"learning_rate": 3.847142810508596e-06,
"loss": 0.1826,
"num_tokens": 104304618.0,
"step": 325
},
{
"epoch": 1.9064327485380117,
"grad_norm": 0.13265912139830643,
"learning_rate": 3.82071925568752e-06,
"loss": 0.1721,
"num_tokens": 104676283.0,
"step": 326
},
{
"epoch": 1.912280701754386,
"grad_norm": 0.1421124205261071,
"learning_rate": 3.7943627987512953e-06,
"loss": 0.1805,
"num_tokens": 105009047.0,
"step": 327
},
{
"epoch": 1.9181286549707601,
"grad_norm": 0.1437008669427492,
"learning_rate": 3.7680744928070413e-06,
"loss": 0.1754,
"num_tokens": 105335760.0,
"step": 328
},
{
"epoch": 1.9239766081871346,
"grad_norm": 0.13360094795705102,
"learning_rate": 3.741855388238821e-06,
"loss": 0.1728,
"num_tokens": 105669692.0,
"step": 329
},
{
"epoch": 1.9298245614035088,
"grad_norm": 0.14123898637906518,
"learning_rate": 3.715706532665657e-06,
"loss": 0.1771,
"num_tokens": 105989627.0,
"step": 330
},
{
"epoch": 1.935672514619883,
"grad_norm": 0.15005239176117802,
"learning_rate": 3.6896289708996867e-06,
"loss": 0.1875,
"num_tokens": 106296775.0,
"step": 331
},
{
"epoch": 1.9415204678362574,
"grad_norm": 0.13514021795158399,
"learning_rate": 3.6636237449044077e-06,
"loss": 0.164,
"num_tokens": 106614583.0,
"step": 332
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.14198706510532838,
"learning_rate": 3.637691893753047e-06,
"loss": 0.1923,
"num_tokens": 106960662.0,
"step": 333
},
{
"epoch": 1.9532163742690059,
"grad_norm": 0.13549287034356422,
"learning_rate": 3.611834453587035e-06,
"loss": 0.1713,
"num_tokens": 107295583.0,
"step": 334
},
{
"epoch": 1.95906432748538,
"grad_norm": 0.14046063970510755,
"learning_rate": 3.5860524575746247e-06,
"loss": 0.1856,
"num_tokens": 107643687.0,
"step": 335
},
{
"epoch": 1.9649122807017543,
"grad_norm": 0.14339861962392317,
"learning_rate": 3.5603469358695887e-06,
"loss": 0.1763,
"num_tokens": 107990974.0,
"step": 336
},
{
"epoch": 1.9707602339181287,
"grad_norm": 0.14801719558718762,
"learning_rate": 3.53471891557007e-06,
"loss": 0.193,
"num_tokens": 108320866.0,
"step": 337
},
{
"epoch": 1.976608187134503,
"grad_norm": 0.13759593336874304,
"learning_rate": 3.509169420677545e-06,
"loss": 0.1646,
"num_tokens": 108635348.0,
"step": 338
},
{
"epoch": 1.9824561403508771,
"grad_norm": 0.14182117983464468,
"learning_rate": 3.483699472055897e-06,
"loss": 0.175,
"num_tokens": 108945985.0,
"step": 339
},
{
"epoch": 1.9883040935672516,
"grad_norm": 0.14601860778521314,
"learning_rate": 3.458310087390637e-06,
"loss": 0.1746,
"num_tokens": 109253189.0,
"step": 340
},
{
"epoch": 1.9941520467836256,
"grad_norm": 0.13963753894588415,
"learning_rate": 3.4330022811482317e-06,
"loss": 0.1758,
"num_tokens": 109573613.0,
"step": 341
},
{
"epoch": 2.0,
"grad_norm": 0.14923037322286145,
"learning_rate": 3.4077770645355824e-06,
"loss": 0.1806,
"num_tokens": 109863441.0,
"step": 342
},
{
"epoch": 2.0058479532163744,
"grad_norm": 0.14979935754858112,
"learning_rate": 3.3826354454596024e-06,
"loss": 0.1648,
"num_tokens": 110172994.0,
"step": 343
},
{
"epoch": 2.0116959064327484,
"grad_norm": 0.14730373048534828,
"learning_rate": 3.35757842848696e-06,
"loss": 0.1658,
"num_tokens": 110485924.0,
"step": 344
},
{
"epoch": 2.017543859649123,
"grad_norm": 0.13744380787225857,
"learning_rate": 3.332607014803937e-06,
"loss": 0.1672,
"num_tokens": 110803906.0,
"step": 345
},
{
"epoch": 2.023391812865497,
"grad_norm": 0.14343267181086955,
"learning_rate": 3.307722202176417e-06,
"loss": 0.1685,
"num_tokens": 111130534.0,
"step": 346
},
{
"epoch": 2.0292397660818713,
"grad_norm": 0.14951907378888113,
"learning_rate": 3.2829249849100255e-06,
"loss": 0.1656,
"num_tokens": 111443229.0,
"step": 347
},
{
"epoch": 2.0350877192982457,
"grad_norm": 0.15277865040934263,
"learning_rate": 3.2582163538104038e-06,
"loss": 0.172,
"num_tokens": 111757363.0,
"step": 348
},
{
"epoch": 2.0409356725146197,
"grad_norm": 0.14756047461787347,
"learning_rate": 3.2335972961436095e-06,
"loss": 0.1568,
"num_tokens": 112093731.0,
"step": 349
},
{
"epoch": 2.046783625730994,
"grad_norm": 0.16032862389663596,
"learning_rate": 3.209068795596679e-06,
"loss": 0.1658,
"num_tokens": 112388610.0,
"step": 350
},
{
"epoch": 2.0526315789473686,
"grad_norm": 0.16521303938360793,
"learning_rate": 3.1846318322383164e-06,
"loss": 0.1668,
"num_tokens": 112693505.0,
"step": 351
},
{
"epoch": 2.0584795321637426,
"grad_norm": 0.15381932734961204,
"learning_rate": 3.160287382479738e-06,
"loss": 0.1659,
"num_tokens": 112979822.0,
"step": 352
},
{
"epoch": 2.064327485380117,
"grad_norm": 0.1375064634023704,
"learning_rate": 3.136036419035656e-06,
"loss": 0.1666,
"num_tokens": 113301917.0,
"step": 353
},
{
"epoch": 2.0701754385964914,
"grad_norm": 0.15582327011120772,
"learning_rate": 3.111879910885414e-06,
"loss": 0.1743,
"num_tokens": 113618502.0,
"step": 354
},
{
"epoch": 2.0760233918128654,
"grad_norm": 0.1447294439687268,
"learning_rate": 3.0878188232342708e-06,
"loss": 0.1675,
"num_tokens": 113947517.0,
"step": 355
},
{
"epoch": 2.08187134502924,
"grad_norm": 0.15087644416114507,
"learning_rate": 3.0638541174748284e-06,
"loss": 0.1693,
"num_tokens": 114275423.0,
"step": 356
},
{
"epoch": 2.087719298245614,
"grad_norm": 0.14016164600859282,
"learning_rate": 3.0399867511486247e-06,
"loss": 0.1592,
"num_tokens": 114588977.0,
"step": 357
},
{
"epoch": 2.0935672514619883,
"grad_norm": 0.13990826210042928,
"learning_rate": 3.0162176779078763e-06,
"loss": 0.1639,
"num_tokens": 114914836.0,
"step": 358
},
{
"epoch": 2.0994152046783627,
"grad_norm": 0.14220573852232116,
"learning_rate": 2.9925478474773634e-06,
"loss": 0.1533,
"num_tokens": 115251681.0,
"step": 359
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.1462318844171621,
"learning_rate": 2.9689782056164874e-06,
"loss": 0.1634,
"num_tokens": 115583416.0,
"step": 360
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.15963290757197937,
"learning_rate": 2.94550969408149e-06,
"loss": 0.1799,
"num_tokens": 115902865.0,
"step": 361
},
{
"epoch": 2.116959064327485,
"grad_norm": 0.14871595452125072,
"learning_rate": 2.9221432505878116e-06,
"loss": 0.1624,
"num_tokens": 116225855.0,
"step": 362
},
{
"epoch": 2.1228070175438596,
"grad_norm": 0.15147360930740472,
"learning_rate": 2.8988798087726295e-06,
"loss": 0.1608,
"num_tokens": 116539725.0,
"step": 363
},
{
"epoch": 2.128654970760234,
"grad_norm": 0.14416045180339065,
"learning_rate": 2.875720298157551e-06,
"loss": 0.1576,
"num_tokens": 116893688.0,
"step": 364
},
{
"epoch": 2.134502923976608,
"grad_norm": 0.15123893108832548,
"learning_rate": 2.8526656441114815e-06,
"loss": 0.1685,
"num_tokens": 117195485.0,
"step": 365
},
{
"epoch": 2.1403508771929824,
"grad_norm": 0.14700656972796167,
"learning_rate": 2.8297167678136363e-06,
"loss": 0.1512,
"num_tokens": 117517383.0,
"step": 366
},
{
"epoch": 2.146198830409357,
"grad_norm": 0.14798834895868007,
"learning_rate": 2.8068745862167423e-06,
"loss": 0.165,
"num_tokens": 117837819.0,
"step": 367
},
{
"epoch": 2.152046783625731,
"grad_norm": 0.14373047748755674,
"learning_rate": 2.784140012010401e-06,
"loss": 0.157,
"num_tokens": 118172299.0,
"step": 368
},
{
"epoch": 2.1578947368421053,
"grad_norm": 0.14591495023462686,
"learning_rate": 2.7615139535846156e-06,
"loss": 0.1569,
"num_tokens": 118493460.0,
"step": 369
},
{
"epoch": 2.1637426900584797,
"grad_norm": 0.14800266668408754,
"learning_rate": 2.7389973149934974e-06,
"loss": 0.1657,
"num_tokens": 118788247.0,
"step": 370
},
{
"epoch": 2.1695906432748537,
"grad_norm": 0.14815056056984904,
"learning_rate": 2.7165909959191472e-06,
"loss": 0.1653,
"num_tokens": 119112526.0,
"step": 371
},
{
"epoch": 2.175438596491228,
"grad_norm": 0.14800590386326276,
"learning_rate": 2.6942958916356997e-06,
"loss": 0.155,
"num_tokens": 119425646.0,
"step": 372
},
{
"epoch": 2.181286549707602,
"grad_norm": 0.155304755457848,
"learning_rate": 2.6721128929735563e-06,
"loss": 0.1682,
"num_tokens": 119730833.0,
"step": 373
},
{
"epoch": 2.1871345029239766,
"grad_norm": 0.14504483487086625,
"learning_rate": 2.6500428862837878e-06,
"loss": 0.1591,
"num_tokens": 120056226.0,
"step": 374
},
{
"epoch": 2.192982456140351,
"grad_norm": 0.1381416074535997,
"learning_rate": 2.6280867534027286e-06,
"loss": 0.1628,
"num_tokens": 120386366.0,
"step": 375
},
{
"epoch": 2.198830409356725,
"grad_norm": 0.14481228971827553,
"learning_rate": 2.6062453716167273e-06,
"loss": 0.1681,
"num_tokens": 120734433.0,
"step": 376
},
{
"epoch": 2.2046783625730995,
"grad_norm": 0.15691625415835195,
"learning_rate": 2.5845196136270994e-06,
"loss": 0.1713,
"num_tokens": 121015717.0,
"step": 377
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.14498825747218128,
"learning_rate": 2.5629103475152654e-06,
"loss": 0.157,
"num_tokens": 121314872.0,
"step": 378
},
{
"epoch": 2.216374269005848,
"grad_norm": 0.15217065921697623,
"learning_rate": 2.541418436708054e-06,
"loss": 0.1641,
"num_tokens": 121609720.0,
"step": 379
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.149827230466305,
"learning_rate": 2.520044739943207e-06,
"loss": 0.1598,
"num_tokens": 121926793.0,
"step": 380
},
{
"epoch": 2.2280701754385963,
"grad_norm": 0.14401874517686966,
"learning_rate": 2.498790111235072e-06,
"loss": 0.1709,
"num_tokens": 122245778.0,
"step": 381
},
{
"epoch": 2.2339181286549707,
"grad_norm": 0.13403812014389224,
"learning_rate": 2.47765539984047e-06,
"loss": 0.1608,
"num_tokens": 122609822.0,
"step": 382
},
{
"epoch": 2.239766081871345,
"grad_norm": 0.14253922733943364,
"learning_rate": 2.4566414502247684e-06,
"loss": 0.1652,
"num_tokens": 122948206.0,
"step": 383
},
{
"epoch": 2.245614035087719,
"grad_norm": 0.1442664664053727,
"learning_rate": 2.435749102028139e-06,
"loss": 0.1695,
"num_tokens": 123281183.0,
"step": 384
},
{
"epoch": 2.2514619883040936,
"grad_norm": 0.14995127804324304,
"learning_rate": 2.414979190032008e-06,
"loss": 0.1667,
"num_tokens": 123606597.0,
"step": 385
},
{
"epoch": 2.257309941520468,
"grad_norm": 0.14167491108605929,
"learning_rate": 2.3943325441256993e-06,
"loss": 0.162,
"num_tokens": 123924915.0,
"step": 386
},
{
"epoch": 2.263157894736842,
"grad_norm": 0.1462621509052835,
"learning_rate": 2.373809989273277e-06,
"loss": 0.1668,
"num_tokens": 124223390.0,
"step": 387
},
{
"epoch": 2.2690058479532165,
"grad_norm": 0.14702160243574425,
"learning_rate": 2.353412345480587e-06,
"loss": 0.1629,
"num_tokens": 124541698.0,
"step": 388
},
{
"epoch": 2.2748538011695905,
"grad_norm": 0.15471872498523825,
"learning_rate": 2.3331404277624846e-06,
"loss": 0.1582,
"num_tokens": 124832848.0,
"step": 389
},
{
"epoch": 2.280701754385965,
"grad_norm": 0.14362745275711297,
"learning_rate": 2.312995046110272e-06,
"loss": 0.1698,
"num_tokens": 125154886.0,
"step": 390
},
{
"epoch": 2.2865497076023393,
"grad_norm": 0.1468561944816842,
"learning_rate": 2.292977005459341e-06,
"loss": 0.1756,
"num_tokens": 125484162.0,
"step": 391
},
{
"epoch": 2.2923976608187133,
"grad_norm": 0.15143848359912496,
"learning_rate": 2.2730871056570024e-06,
"loss": 0.1605,
"num_tokens": 125783060.0,
"step": 392
},
{
"epoch": 2.2982456140350878,
"grad_norm": 0.14651421165143075,
"learning_rate": 2.2533261414305243e-06,
"loss": 0.165,
"num_tokens": 126110333.0,
"step": 393
},
{
"epoch": 2.3040935672514617,
"grad_norm": 0.1475116907221095,
"learning_rate": 2.2336949023553924e-06,
"loss": 0.1616,
"num_tokens": 126442564.0,
"step": 394
},
{
"epoch": 2.309941520467836,
"grad_norm": 0.14460134481385417,
"learning_rate": 2.2141941728237467e-06,
"loss": 0.1654,
"num_tokens": 126752395.0,
"step": 395
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.14351634697316507,
"learning_rate": 2.194824732013047e-06,
"loss": 0.1626,
"num_tokens": 127089855.0,
"step": 396
},
{
"epoch": 2.3216374269005846,
"grad_norm": 0.13828986539988358,
"learning_rate": 2.1755873538549376e-06,
"loss": 0.159,
"num_tokens": 127411156.0,
"step": 397
},
{
"epoch": 2.327485380116959,
"grad_norm": 0.13645396729907014,
"learning_rate": 2.1564828070043275e-06,
"loss": 0.1533,
"num_tokens": 127736233.0,
"step": 398
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.14422895920728793,
"learning_rate": 2.137511854808672e-06,
"loss": 0.1575,
"num_tokens": 128060062.0,
"step": 399
},
{
"epoch": 2.3391812865497075,
"grad_norm": 0.1448501743068164,
"learning_rate": 2.1186752552774764e-06,
"loss": 0.1626,
"num_tokens": 128406839.0,
"step": 400
},
{
"epoch": 2.345029239766082,
"grad_norm": 0.13774903808615716,
"learning_rate": 2.099973761052007e-06,
"loss": 0.1662,
"num_tokens": 128766764.0,
"step": 401
},
{
"epoch": 2.3508771929824563,
"grad_norm": 0.1508233167288601,
"learning_rate": 2.081408119375219e-06,
"loss": 0.1653,
"num_tokens": 129070969.0,
"step": 402
},
{
"epoch": 2.3567251461988303,
"grad_norm": 0.14607227185650823,
"learning_rate": 2.0629790720618977e-06,
"loss": 0.1551,
"num_tokens": 129361599.0,
"step": 403
},
{
"epoch": 2.3625730994152048,
"grad_norm": 0.14540307746510248,
"learning_rate": 2.044687355469025e-06,
"loss": 0.168,
"num_tokens": 129689845.0,
"step": 404
},
{
"epoch": 2.3684210526315788,
"grad_norm": 0.14269113809456305,
"learning_rate": 2.0265337004663465e-06,
"loss": 0.1632,
"num_tokens": 130007281.0,
"step": 405
},
{
"epoch": 2.374269005847953,
"grad_norm": 0.14764837206982723,
"learning_rate": 2.008518832407176e-06,
"loss": 0.16,
"num_tokens": 130320317.0,
"step": 406
},
{
"epoch": 2.3801169590643276,
"grad_norm": 0.13886373538463548,
"learning_rate": 1.9906434710994098e-06,
"loss": 0.1659,
"num_tokens": 130668421.0,
"step": 407
},
{
"epoch": 2.3859649122807016,
"grad_norm": 0.14123121708441125,
"learning_rate": 1.9729083307767725e-06,
"loss": 0.1571,
"num_tokens": 130996417.0,
"step": 408
},
{
"epoch": 2.391812865497076,
"grad_norm": 0.14241772556155666,
"learning_rate": 1.955314120070269e-06,
"loss": 0.1586,
"num_tokens": 131324210.0,
"step": 409
},
{
"epoch": 2.39766081871345,
"grad_norm": 0.1536376204686897,
"learning_rate": 1.937861541979873e-06,
"loss": 0.1866,
"num_tokens": 131671552.0,
"step": 410
},
{
"epoch": 2.4035087719298245,
"grad_norm": 0.13606181694914196,
"learning_rate": 1.9205512938464465e-06,
"loss": 0.163,
"num_tokens": 132008743.0,
"step": 411
},
{
"epoch": 2.409356725146199,
"grad_norm": 0.14994033256966707,
"learning_rate": 1.903384067323863e-06,
"loss": 0.1644,
"num_tokens": 132306397.0,
"step": 412
},
{
"epoch": 2.415204678362573,
"grad_norm": 0.1437874855637201,
"learning_rate": 1.886360548351381e-06,
"loss": 0.1589,
"num_tokens": 132627946.0,
"step": 413
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.14231060929324155,
"learning_rate": 1.8694814171262355e-06,
"loss": 0.1616,
"num_tokens": 132981440.0,
"step": 414
},
{
"epoch": 2.426900584795322,
"grad_norm": 0.14278247493654592,
"learning_rate": 1.8527473480764545e-06,
"loss": 0.1638,
"num_tokens": 133326233.0,
"step": 415
},
{
"epoch": 2.4327485380116958,
"grad_norm": 0.14574512278213558,
"learning_rate": 1.8361590098339168e-06,
"loss": 0.1701,
"num_tokens": 133635483.0,
"step": 416
},
{
"epoch": 2.43859649122807,
"grad_norm": 0.14520072415658936,
"learning_rate": 1.8197170652076316e-06,
"loss": 0.1729,
"num_tokens": 133965064.0,
"step": 417
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.13880382782292727,
"learning_rate": 1.8034221711572633e-06,
"loss": 0.1642,
"num_tokens": 134297515.0,
"step": 418
},
{
"epoch": 2.4502923976608186,
"grad_norm": 0.14965359883764726,
"learning_rate": 1.7872749787668673e-06,
"loss": 0.1573,
"num_tokens": 134596743.0,
"step": 419
},
{
"epoch": 2.456140350877193,
"grad_norm": 0.14970450216253625,
"learning_rate": 1.7712761332188894e-06,
"loss": 0.1654,
"num_tokens": 134914562.0,
"step": 420
},
{
"epoch": 2.461988304093567,
"grad_norm": 0.15816166023028,
"learning_rate": 1.7554262737683803e-06,
"loss": 0.1487,
"num_tokens": 135230462.0,
"step": 421
},
{
"epoch": 2.4678362573099415,
"grad_norm": 0.1504826241844761,
"learning_rate": 1.7397260337174542e-06,
"loss": 0.157,
"num_tokens": 135546957.0,
"step": 422
},
{
"epoch": 2.473684210526316,
"grad_norm": 0.1394498720471719,
"learning_rate": 1.724176040389982e-06,
"loss": 0.1591,
"num_tokens": 135868298.0,
"step": 423
},
{
"epoch": 2.47953216374269,
"grad_norm": 0.14271119923974052,
"learning_rate": 1.708776915106528e-06,
"loss": 0.1647,
"num_tokens": 136208833.0,
"step": 424
},
{
"epoch": 2.4853801169590644,
"grad_norm": 0.1486501311116008,
"learning_rate": 1.6935292731595284e-06,
"loss": 0.1674,
"num_tokens": 136529945.0,
"step": 425
},
{
"epoch": 2.4912280701754383,
"grad_norm": 0.1568931867106138,
"learning_rate": 1.678433723788697e-06,
"loss": 0.1612,
"num_tokens": 136819684.0,
"step": 426
},
{
"epoch": 2.497076023391813,
"grad_norm": 0.14113107486411444,
"learning_rate": 1.6634908701566909e-06,
"loss": 0.1576,
"num_tokens": 137150211.0,
"step": 427
},
{
"epoch": 2.502923976608187,
"grad_norm": 0.14430747759375342,
"learning_rate": 1.6487013093250042e-06,
"loss": 0.1577,
"num_tokens": 137460607.0,
"step": 428
},
{
"epoch": 2.5087719298245617,
"grad_norm": 0.14188380800567507,
"learning_rate": 1.6340656322301158e-06,
"loss": 0.1681,
"num_tokens": 137808723.0,
"step": 429
},
{
"epoch": 2.5146198830409356,
"grad_norm": 0.13864076262612016,
"learning_rate": 1.619584423659875e-06,
"loss": 0.1508,
"num_tokens": 138122659.0,
"step": 430
},
{
"epoch": 2.52046783625731,
"grad_norm": 0.14329617113357643,
"learning_rate": 1.6052582622301398e-06,
"loss": 0.1604,
"num_tokens": 138456252.0,
"step": 431
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.14485980625620193,
"learning_rate": 1.5910877203616515e-06,
"loss": 0.1689,
"num_tokens": 138794979.0,
"step": 432
},
{
"epoch": 2.5321637426900585,
"grad_norm": 0.14001094426697513,
"learning_rate": 1.5770733642571662e-06,
"loss": 0.1613,
"num_tokens": 139114044.0,
"step": 433
},
{
"epoch": 2.538011695906433,
"grad_norm": 0.14542258651096243,
"learning_rate": 1.5632157538788322e-06,
"loss": 0.1626,
"num_tokens": 139425238.0,
"step": 434
},
{
"epoch": 2.543859649122807,
"grad_norm": 0.1489744939093393,
"learning_rate": 1.5495154429258136e-06,
"loss": 0.1722,
"num_tokens": 139757117.0,
"step": 435
},
{
"epoch": 2.5497076023391814,
"grad_norm": 0.14248652727375938,
"learning_rate": 1.5359729788121678e-06,
"loss": 0.1633,
"num_tokens": 140085804.0,
"step": 436
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.14025435225815486,
"learning_rate": 1.5225889026449754e-06,
"loss": 0.1609,
"num_tokens": 140409219.0,
"step": 437
},
{
"epoch": 2.56140350877193,
"grad_norm": 0.14823929068819988,
"learning_rate": 1.5093637492027136e-06,
"loss": 0.165,
"num_tokens": 140713444.0,
"step": 438
},
{
"epoch": 2.5672514619883042,
"grad_norm": 0.1370826700643183,
"learning_rate": 1.4962980469138932e-06,
"loss": 0.1462,
"num_tokens": 141032047.0,
"step": 439
},
{
"epoch": 2.573099415204678,
"grad_norm": 0.1522127782198521,
"learning_rate": 1.4833923178359428e-06,
"loss": 0.1621,
"num_tokens": 141338487.0,
"step": 440
},
{
"epoch": 2.5789473684210527,
"grad_norm": 0.14502198127336535,
"learning_rate": 1.4706470776343507e-06,
"loss": 0.164,
"num_tokens": 141669740.0,
"step": 441
},
{
"epoch": 2.5847953216374266,
"grad_norm": 0.15205245564816208,
"learning_rate": 1.458062835562058e-06,
"loss": 0.164,
"num_tokens": 141971862.0,
"step": 442
},
{
"epoch": 2.590643274853801,
"grad_norm": 0.15020510176845542,
"learning_rate": 1.4456400944391147e-06,
"loss": 0.1677,
"num_tokens": 142266889.0,
"step": 443
},
{
"epoch": 2.5964912280701755,
"grad_norm": 0.14391800055857798,
"learning_rate": 1.4333793506325832e-06,
"loss": 0.1606,
"num_tokens": 142584991.0,
"step": 444
},
{
"epoch": 2.60233918128655,
"grad_norm": 0.15111238280234057,
"learning_rate": 1.421281094036712e-06,
"loss": 0.1604,
"num_tokens": 142879877.0,
"step": 445
},
{
"epoch": 2.608187134502924,
"grad_norm": 0.1443837082025472,
"learning_rate": 1.4093458080533562e-06,
"loss": 0.1681,
"num_tokens": 143205197.0,
"step": 446
},
{
"epoch": 2.6140350877192984,
"grad_norm": 0.14463771075478601,
"learning_rate": 1.3975739695726649e-06,
"loss": 0.1671,
"num_tokens": 143531142.0,
"step": 447
},
{
"epoch": 2.6198830409356724,
"grad_norm": 0.14454878880987132,
"learning_rate": 1.385966048954027e-06,
"loss": 0.1633,
"num_tokens": 143858081.0,
"step": 448
},
{
"epoch": 2.625730994152047,
"grad_norm": 0.13984594893802477,
"learning_rate": 1.3745225100072737e-06,
"loss": 0.1617,
"num_tokens": 144185431.0,
"step": 449
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.13969336960280732,
"learning_rate": 1.3632438099741505e-06,
"loss": 0.159,
"num_tokens": 144497647.0,
"step": 450
},
{
"epoch": 2.6374269005847952,
"grad_norm": 0.14619663223314017,
"learning_rate": 1.3521303995100479e-06,
"loss": 0.1663,
"num_tokens": 144800894.0,
"step": 451
},
{
"epoch": 2.6432748538011697,
"grad_norm": 0.14130801987934924,
"learning_rate": 1.3411827226659887e-06,
"loss": 0.1592,
"num_tokens": 145115720.0,
"step": 452
},
{
"epoch": 2.6491228070175437,
"grad_norm": 0.13126230977093004,
"learning_rate": 1.330401216870891e-06,
"loss": 0.1479,
"num_tokens": 145451398.0,
"step": 453
},
{
"epoch": 2.654970760233918,
"grad_norm": 0.13673717620054995,
"learning_rate": 1.3197863129140916e-06,
"loss": 0.1564,
"num_tokens": 145791079.0,
"step": 454
},
{
"epoch": 2.6608187134502925,
"grad_norm": 0.1468555944041424,
"learning_rate": 1.3093384349281268e-06,
"loss": 0.1575,
"num_tokens": 146094234.0,
"step": 455
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.1511002120974835,
"learning_rate": 1.2990580003717904e-06,
"loss": 0.1828,
"num_tokens": 146403548.0,
"step": 456
},
{
"epoch": 2.672514619883041,
"grad_norm": 0.139909324379523,
"learning_rate": 1.2889454200134522e-06,
"loss": 0.1594,
"num_tokens": 146732523.0,
"step": 457
},
{
"epoch": 2.678362573099415,
"grad_norm": 0.14396563097050272,
"learning_rate": 1.2790010979146467e-06,
"loss": 0.1524,
"num_tokens": 147040850.0,
"step": 458
},
{
"epoch": 2.6842105263157894,
"grad_norm": 0.15000149938121365,
"learning_rate": 1.2692254314139243e-06,
"loss": 0.1697,
"num_tokens": 147343323.0,
"step": 459
},
{
"epoch": 2.690058479532164,
"grad_norm": 0.15745120782859975,
"learning_rate": 1.2596188111109805e-06,
"loss": 0.1681,
"num_tokens": 147635253.0,
"step": 460
},
{
"epoch": 2.6959064327485383,
"grad_norm": 0.1476631580057043,
"learning_rate": 1.2501816208510442e-06,
"loss": 0.1638,
"num_tokens": 147956892.0,
"step": 461
},
{
"epoch": 2.7017543859649122,
"grad_norm": 0.1423826560287115,
"learning_rate": 1.2409142377095435e-06,
"loss": 0.1571,
"num_tokens": 148260684.0,
"step": 462
},
{
"epoch": 2.7076023391812867,
"grad_norm": 0.14219644101756945,
"learning_rate": 1.231817031977037e-06,
"loss": 0.1585,
"num_tokens": 148571351.0,
"step": 463
},
{
"epoch": 2.7134502923976607,
"grad_norm": 0.15083997934575208,
"learning_rate": 1.2228903671444228e-06,
"loss": 0.1683,
"num_tokens": 148888226.0,
"step": 464
},
{
"epoch": 2.719298245614035,
"grad_norm": 0.13732672374616764,
"learning_rate": 1.2141345998884092e-06,
"loss": 0.1606,
"num_tokens": 149214770.0,
"step": 465
},
{
"epoch": 2.7251461988304095,
"grad_norm": 0.1441249561667527,
"learning_rate": 1.2055500800572657e-06,
"loss": 0.1626,
"num_tokens": 149545405.0,
"step": 466
},
{
"epoch": 2.7309941520467835,
"grad_norm": 0.14325592299925863,
"learning_rate": 1.1971371506568442e-06,
"loss": 0.1577,
"num_tokens": 149846805.0,
"step": 467
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.13745597306904,
"learning_rate": 1.1888961478368762e-06,
"loss": 0.1605,
"num_tokens": 150179635.0,
"step": 468
},
{
"epoch": 2.742690058479532,
"grad_norm": 0.13951839806880817,
"learning_rate": 1.1808274008775355e-06,
"loss": 0.158,
"num_tokens": 150503170.0,
"step": 469
},
{
"epoch": 2.7485380116959064,
"grad_norm": 0.1443209297278742,
"learning_rate": 1.1729312321762864e-06,
"loss": 0.1575,
"num_tokens": 150809725.0,
"step": 470
},
{
"epoch": 2.754385964912281,
"grad_norm": 0.13342746902950017,
"learning_rate": 1.1652079572350026e-06,
"loss": 0.154,
"num_tokens": 151157947.0,
"step": 471
},
{
"epoch": 2.760233918128655,
"grad_norm": 0.14326184156810018,
"learning_rate": 1.1576578846473558e-06,
"loss": 0.1584,
"num_tokens": 151479230.0,
"step": 472
},
{
"epoch": 2.7660818713450293,
"grad_norm": 0.1433950274377657,
"learning_rate": 1.1502813160864893e-06,
"loss": 0.1628,
"num_tokens": 151803444.0,
"step": 473
},
{
"epoch": 2.7719298245614032,
"grad_norm": 0.14139558810493624,
"learning_rate": 1.1430785462929644e-06,
"loss": 0.169,
"num_tokens": 152139382.0,
"step": 474
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.14115560637915964,
"learning_rate": 1.136049863062982e-06,
"loss": 0.162,
"num_tokens": 152452934.0,
"step": 475
},
{
"epoch": 2.783625730994152,
"grad_norm": 0.15211690242588544,
"learning_rate": 1.1291955472368825e-06,
"loss": 0.1601,
"num_tokens": 152748759.0,
"step": 476
},
{
"epoch": 2.7894736842105265,
"grad_norm": 0.13666222185265295,
"learning_rate": 1.1225158726879288e-06,
"loss": 0.1665,
"num_tokens": 153093029.0,
"step": 477
},
{
"epoch": 2.7953216374269005,
"grad_norm": 0.14398939041397035,
"learning_rate": 1.116011106311358e-06,
"loss": 0.1568,
"num_tokens": 153384854.0,
"step": 478
},
{
"epoch": 2.801169590643275,
"grad_norm": 0.15402905213013776,
"learning_rate": 1.1096815080137196e-06,
"loss": 0.1874,
"num_tokens": 153720419.0,
"step": 479
},
{
"epoch": 2.807017543859649,
"grad_norm": 0.14201446868127077,
"learning_rate": 1.103527330702493e-06,
"loss": 0.1624,
"num_tokens": 154054230.0,
"step": 480
},
{
"epoch": 2.8128654970760234,
"grad_norm": 0.14521001714072115,
"learning_rate": 1.0975488202759772e-06,
"loss": 0.1625,
"num_tokens": 154373913.0,
"step": 481
},
{
"epoch": 2.818713450292398,
"grad_norm": 0.1390122065414189,
"learning_rate": 1.0917462156134707e-06,
"loss": 0.1599,
"num_tokens": 154689857.0,
"step": 482
},
{
"epoch": 2.824561403508772,
"grad_norm": 0.14888462659883256,
"learning_rate": 1.0861197485657218e-06,
"loss": 0.1643,
"num_tokens": 154995696.0,
"step": 483
},
{
"epoch": 2.8304093567251463,
"grad_norm": 0.1420249071569745,
"learning_rate": 1.0806696439456695e-06,
"loss": 0.1629,
"num_tokens": 155312841.0,
"step": 484
},
{
"epoch": 2.8362573099415203,
"grad_norm": 0.14037073859681298,
"learning_rate": 1.0753961195194581e-06,
"loss": 0.1592,
"num_tokens": 155627374.0,
"step": 485
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.14506758333723185,
"learning_rate": 1.070299385997735e-06,
"loss": 0.1586,
"num_tokens": 155933888.0,
"step": 486
},
{
"epoch": 2.847953216374269,
"grad_norm": 0.14137857222010855,
"learning_rate": 1.0653796470272348e-06,
"loss": 0.1564,
"num_tokens": 156263120.0,
"step": 487
},
{
"epoch": 2.853801169590643,
"grad_norm": 0.14542756104140053,
"learning_rate": 1.0606370991826398e-06,
"loss": 0.165,
"num_tokens": 156585736.0,
"step": 488
},
{
"epoch": 2.8596491228070176,
"grad_norm": 0.13897674449897804,
"learning_rate": 1.0560719319587262e-06,
"loss": 0.1642,
"num_tokens": 156919577.0,
"step": 489
},
{
"epoch": 2.8654970760233915,
"grad_norm": 0.1411765074092308,
"learning_rate": 1.051684327762793e-06,
"loss": 0.1613,
"num_tokens": 157244512.0,
"step": 490
},
{
"epoch": 2.871345029239766,
"grad_norm": 0.14023194573370917,
"learning_rate": 1.047474461907374e-06,
"loss": 0.1593,
"num_tokens": 157555581.0,
"step": 491
},
{
"epoch": 2.8771929824561404,
"grad_norm": 0.13348582864925426,
"learning_rate": 1.043442502603231e-06,
"loss": 0.1587,
"num_tokens": 157890387.0,
"step": 492
},
{
"epoch": 2.883040935672515,
"grad_norm": 0.1458553034999855,
"learning_rate": 1.0395886109526346e-06,
"loss": 0.1672,
"num_tokens": 158227831.0,
"step": 493
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.13433755063333214,
"learning_rate": 1.0359129409429269e-06,
"loss": 0.1672,
"num_tokens": 158593011.0,
"step": 494
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.13367864457349435,
"learning_rate": 1.0324156394403683e-06,
"loss": 0.1553,
"num_tokens": 158933449.0,
"step": 495
},
{
"epoch": 2.9005847953216373,
"grad_norm": 0.14118001312714748,
"learning_rate": 1.0290968461842693e-06,
"loss": 0.1577,
"num_tokens": 159246578.0,
"step": 496
},
{
"epoch": 2.9064327485380117,
"grad_norm": 0.1463297027696001,
"learning_rate": 1.025956693781408e-06,
"loss": 0.1678,
"num_tokens": 159560178.0,
"step": 497
},
{
"epoch": 2.912280701754386,
"grad_norm": 0.14563817342813995,
"learning_rate": 1.0229953077007288e-06,
"loss": 0.1659,
"num_tokens": 159884093.0,
"step": 498
},
{
"epoch": 2.91812865497076,
"grad_norm": 0.14540407437292474,
"learning_rate": 1.0202128062683333e-06,
"loss": 0.1722,
"num_tokens": 160205475.0,
"step": 499
},
{
"epoch": 2.9239766081871346,
"grad_norm": 0.1405121201590573,
"learning_rate": 1.0176093006627485e-06,
"loss": 0.1562,
"num_tokens": 160545275.0,
"step": 500
},
{
"epoch": 2.9298245614035086,
"grad_norm": 0.14217083046189563,
"learning_rate": 1.0151848949104872e-06,
"loss": 0.1665,
"num_tokens": 160880973.0,
"step": 501
},
{
"epoch": 2.935672514619883,
"grad_norm": 0.1464306529530731,
"learning_rate": 1.01293968588189e-06,
"loss": 0.1707,
"num_tokens": 161205233.0,
"step": 502
},
{
"epoch": 2.9415204678362574,
"grad_norm": 0.1408296921593092,
"learning_rate": 1.0108737632872553e-06,
"loss": 0.16,
"num_tokens": 161521616.0,
"step": 503
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.14238166027330365,
"learning_rate": 1.0089872096732555e-06,
"loss": 0.1635,
"num_tokens": 161834814.0,
"step": 504
},
{
"epoch": 2.953216374269006,
"grad_norm": 0.1426737290624598,
"learning_rate": 1.0072801004196363e-06,
"loss": 0.1615,
"num_tokens": 162172431.0,
"step": 505
},
{
"epoch": 2.95906432748538,
"grad_norm": 0.14507835204373007,
"learning_rate": 1.0057525037362082e-06,
"loss": 0.162,
"num_tokens": 162500876.0,
"step": 506
},
{
"epoch": 2.9649122807017543,
"grad_norm": 0.14256066597160452,
"learning_rate": 1.0044044806601188e-06,
"loss": 0.161,
"num_tokens": 162830769.0,
"step": 507
},
{
"epoch": 2.9707602339181287,
"grad_norm": 0.13917168255862636,
"learning_rate": 1.003236085053414e-06,
"loss": 0.1598,
"num_tokens": 163185192.0,
"step": 508
},
{
"epoch": 2.976608187134503,
"grad_norm": 0.1402722871419169,
"learning_rate": 1.0022473636008867e-06,
"loss": 0.164,
"num_tokens": 163513051.0,
"step": 509
},
{
"epoch": 2.982456140350877,
"grad_norm": 0.15282843543700755,
"learning_rate": 1.0014383558082113e-06,
"loss": 0.169,
"num_tokens": 163816593.0,
"step": 510
},
{
"epoch": 2.9883040935672516,
"grad_norm": 0.14600064275991873,
"learning_rate": 1.000809094000365e-06,
"loss": 0.1582,
"num_tokens": 164132256.0,
"step": 511
},
{
"epoch": 2.9941520467836256,
"grad_norm": 0.142164254928358,
"learning_rate": 1.0003596033203359e-06,
"loss": 0.1675,
"num_tokens": 164460013.0,
"step": 512
},
{
"epoch": 3.0,
"grad_norm": 0.14017066974116044,
"learning_rate": 1.0000899017281195e-06,
"loss": 0.1563,
"num_tokens": 164791026.0,
"step": 513
},
{
"epoch": 3.0,
"step": 513,
"total_flos": 5.228622451394478e+17,
"train_loss": 0.21386383229877517,
"train_runtime": 6716.6337,
"train_samples_per_second": 9.771,
"train_steps_per_second": 0.076
}
],
"logging_steps": 1,
"max_steps": 513,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.228622451394478e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}