{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9902912621359223,
"eval_steps": 500,
"global_step": 308,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006472491909385114,
"grad_norm": 5.306275844573975,
"learning_rate": 0.0,
"loss": 1.1501,
"num_tokens": 963559.0,
"step": 1
},
{
"epoch": 0.012944983818770227,
"grad_norm": 5.308248996734619,
"learning_rate": 3.2258064516129035e-07,
"loss": 1.1455,
"num_tokens": 1927760.0,
"step": 2
},
{
"epoch": 0.019417475728155338,
"grad_norm": 5.251079559326172,
"learning_rate": 6.451612903225807e-07,
"loss": 1.1391,
"num_tokens": 2896369.0,
"step": 3
},
{
"epoch": 0.025889967637540454,
"grad_norm": 5.1489105224609375,
"learning_rate": 9.67741935483871e-07,
"loss": 1.1296,
"num_tokens": 3886583.0,
"step": 4
},
{
"epoch": 0.032362459546925564,
"grad_norm": 4.995655059814453,
"learning_rate": 1.2903225806451614e-06,
"loss": 1.1222,
"num_tokens": 4864476.0,
"step": 5
},
{
"epoch": 0.038834951456310676,
"grad_norm": 4.873946189880371,
"learning_rate": 1.6129032258064516e-06,
"loss": 1.1242,
"num_tokens": 5873295.0,
"step": 6
},
{
"epoch": 0.045307443365695796,
"grad_norm": 4.26630973815918,
"learning_rate": 1.935483870967742e-06,
"loss": 1.0614,
"num_tokens": 6876484.0,
"step": 7
},
{
"epoch": 0.05177993527508091,
"grad_norm": 4.14864444732666,
"learning_rate": 2.2580645161290324e-06,
"loss": 1.0421,
"num_tokens": 7871664.0,
"step": 8
},
{
"epoch": 0.05825242718446602,
"grad_norm": 3.062345504760742,
"learning_rate": 2.580645161290323e-06,
"loss": 0.9609,
"num_tokens": 8869931.0,
"step": 9
},
{
"epoch": 0.06472491909385113,
"grad_norm": 2.806725263595581,
"learning_rate": 2.903225806451613e-06,
"loss": 0.9313,
"num_tokens": 9874687.0,
"step": 10
},
{
"epoch": 0.07119741100323625,
"grad_norm": 2.670482873916626,
"learning_rate": 3.225806451612903e-06,
"loss": 0.9088,
"num_tokens": 10842628.0,
"step": 11
},
{
"epoch": 0.07766990291262135,
"grad_norm": 1.8532583713531494,
"learning_rate": 3.548387096774194e-06,
"loss": 0.8088,
"num_tokens": 11815438.0,
"step": 12
},
{
"epoch": 0.08414239482200647,
"grad_norm": 1.6065739393234253,
"learning_rate": 3.870967741935484e-06,
"loss": 0.7868,
"num_tokens": 12788222.0,
"step": 13
},
{
"epoch": 0.09061488673139159,
"grad_norm": 1.3665945529937744,
"learning_rate": 4.193548387096774e-06,
"loss": 0.7594,
"num_tokens": 13756270.0,
"step": 14
},
{
"epoch": 0.0970873786407767,
"grad_norm": 1.265770673751831,
"learning_rate": 4.516129032258065e-06,
"loss": 0.7406,
"num_tokens": 14689813.0,
"step": 15
},
{
"epoch": 0.10355987055016182,
"grad_norm": 1.1766362190246582,
"learning_rate": 4.838709677419355e-06,
"loss": 0.6938,
"num_tokens": 15670657.0,
"step": 16
},
{
"epoch": 0.11003236245954692,
"grad_norm": 1.0507408380508423,
"learning_rate": 5.161290322580646e-06,
"loss": 0.6834,
"num_tokens": 16640491.0,
"step": 17
},
{
"epoch": 0.11650485436893204,
"grad_norm": 0.8553951382637024,
"learning_rate": 5.483870967741935e-06,
"loss": 0.6743,
"num_tokens": 17611388.0,
"step": 18
},
{
"epoch": 0.12297734627831715,
"grad_norm": 0.6638854146003723,
"learning_rate": 5.806451612903226e-06,
"loss": 0.6604,
"num_tokens": 18582653.0,
"step": 19
},
{
"epoch": 0.12944983818770225,
"grad_norm": 0.5444458723068237,
"learning_rate": 6.129032258064517e-06,
"loss": 0.6427,
"num_tokens": 19557459.0,
"step": 20
},
{
"epoch": 0.13592233009708737,
"grad_norm": 0.4539978504180908,
"learning_rate": 6.451612903225806e-06,
"loss": 0.6348,
"num_tokens": 20524158.0,
"step": 21
},
{
"epoch": 0.1423948220064725,
"grad_norm": 0.491314172744751,
"learning_rate": 6.774193548387097e-06,
"loss": 0.6084,
"num_tokens": 21502674.0,
"step": 22
},
{
"epoch": 0.1488673139158576,
"grad_norm": 0.47383806109428406,
"learning_rate": 7.096774193548388e-06,
"loss": 0.603,
"num_tokens": 22446294.0,
"step": 23
},
{
"epoch": 0.1553398058252427,
"grad_norm": 0.3826221525669098,
"learning_rate": 7.4193548387096784e-06,
"loss": 0.5929,
"num_tokens": 23430225.0,
"step": 24
},
{
"epoch": 0.16181229773462782,
"grad_norm": 0.33175480365753174,
"learning_rate": 7.741935483870968e-06,
"loss": 0.5837,
"num_tokens": 24380419.0,
"step": 25
},
{
"epoch": 0.16828478964401294,
"grad_norm": 0.3459703326225281,
"learning_rate": 8.064516129032258e-06,
"loss": 0.5689,
"num_tokens": 25343499.0,
"step": 26
},
{
"epoch": 0.17475728155339806,
"grad_norm": 0.3513648509979248,
"learning_rate": 8.387096774193549e-06,
"loss": 0.5649,
"num_tokens": 26315921.0,
"step": 27
},
{
"epoch": 0.18122977346278318,
"grad_norm": 0.28894567489624023,
"learning_rate": 8.70967741935484e-06,
"loss": 0.5654,
"num_tokens": 27255686.0,
"step": 28
},
{
"epoch": 0.18770226537216828,
"grad_norm": 0.2634493112564087,
"learning_rate": 9.03225806451613e-06,
"loss": 0.555,
"num_tokens": 28232449.0,
"step": 29
},
{
"epoch": 0.1941747572815534,
"grad_norm": 0.25820741057395935,
"learning_rate": 9.35483870967742e-06,
"loss": 0.5464,
"num_tokens": 29213099.0,
"step": 30
},
{
"epoch": 0.20064724919093851,
"grad_norm": 0.22817398607730865,
"learning_rate": 9.67741935483871e-06,
"loss": 0.5437,
"num_tokens": 30210546.0,
"step": 31
},
{
"epoch": 0.20711974110032363,
"grad_norm": 0.2116086781024933,
"learning_rate": 1e-05,
"loss": 0.5284,
"num_tokens": 31190215.0,
"step": 32
},
{
"epoch": 0.21359223300970873,
"grad_norm": 0.22091032564640045,
"learning_rate": 1e-05,
"loss": 0.5235,
"num_tokens": 32152368.0,
"step": 33
},
{
"epoch": 0.22006472491909385,
"grad_norm": 0.21065133810043335,
"learning_rate": 1e-05,
"loss": 0.5313,
"num_tokens": 33138410.0,
"step": 34
},
{
"epoch": 0.22653721682847897,
"grad_norm": 0.1995198130607605,
"learning_rate": 1e-05,
"loss": 0.5205,
"num_tokens": 34103559.0,
"step": 35
},
{
"epoch": 0.23300970873786409,
"grad_norm": 0.193314790725708,
"learning_rate": 1e-05,
"loss": 0.5202,
"num_tokens": 35099702.0,
"step": 36
},
{
"epoch": 0.23948220064724918,
"grad_norm": 0.1861106902360916,
"learning_rate": 1e-05,
"loss": 0.5186,
"num_tokens": 36067246.0,
"step": 37
},
{
"epoch": 0.2459546925566343,
"grad_norm": 0.1703341007232666,
"learning_rate": 1e-05,
"loss": 0.5074,
"num_tokens": 37074157.0,
"step": 38
},
{
"epoch": 0.2524271844660194,
"grad_norm": 0.19160355627536774,
"learning_rate": 1e-05,
"loss": 0.5029,
"num_tokens": 38026864.0,
"step": 39
},
{
"epoch": 0.2588996763754045,
"grad_norm": 0.17060473561286926,
"learning_rate": 1e-05,
"loss": 0.4967,
"num_tokens": 39007950.0,
"step": 40
},
{
"epoch": 0.26537216828478966,
"grad_norm": 0.17483553290367126,
"learning_rate": 1e-05,
"loss": 0.5045,
"num_tokens": 40004404.0,
"step": 41
},
{
"epoch": 0.27184466019417475,
"grad_norm": 0.16585178673267365,
"learning_rate": 1e-05,
"loss": 0.4985,
"num_tokens": 40979367.0,
"step": 42
},
{
"epoch": 0.2783171521035599,
"grad_norm": 0.16377580165863037,
"learning_rate": 1e-05,
"loss": 0.4941,
"num_tokens": 41954716.0,
"step": 43
},
{
"epoch": 0.284789644012945,
"grad_norm": 0.16064569354057312,
"learning_rate": 1e-05,
"loss": 0.5018,
"num_tokens": 42927994.0,
"step": 44
},
{
"epoch": 0.2912621359223301,
"grad_norm": 0.15038461983203888,
"learning_rate": 1e-05,
"loss": 0.4892,
"num_tokens": 43870458.0,
"step": 45
},
{
"epoch": 0.2977346278317152,
"grad_norm": 0.15048164129257202,
"learning_rate": 1e-05,
"loss": 0.4884,
"num_tokens": 44845205.0,
"step": 46
},
{
"epoch": 0.3042071197411003,
"grad_norm": 0.15347424149513245,
"learning_rate": 1e-05,
"loss": 0.4861,
"num_tokens": 45817785.0,
"step": 47
},
{
"epoch": 0.3106796116504854,
"grad_norm": 0.14698895812034607,
"learning_rate": 1e-05,
"loss": 0.4881,
"num_tokens": 46768552.0,
"step": 48
},
{
"epoch": 0.31715210355987056,
"grad_norm": 0.14178597927093506,
"learning_rate": 1e-05,
"loss": 0.4721,
"num_tokens": 47720390.0,
"step": 49
},
{
"epoch": 0.32362459546925565,
"grad_norm": 0.15780387818813324,
"learning_rate": 1e-05,
"loss": 0.4818,
"num_tokens": 48659119.0,
"step": 50
},
{
"epoch": 0.3300970873786408,
"grad_norm": 0.15007564425468445,
"learning_rate": 1e-05,
"loss": 0.4812,
"num_tokens": 49635896.0,
"step": 51
},
{
"epoch": 0.3365695792880259,
"grad_norm": 0.14719824492931366,
"learning_rate": 1e-05,
"loss": 0.4795,
"num_tokens": 50617481.0,
"step": 52
},
{
"epoch": 0.343042071197411,
"grad_norm": 0.14689336717128754,
"learning_rate": 1e-05,
"loss": 0.4749,
"num_tokens": 51580322.0,
"step": 53
},
{
"epoch": 0.34951456310679613,
"grad_norm": 0.14928674697875977,
"learning_rate": 1e-05,
"loss": 0.4772,
"num_tokens": 52544983.0,
"step": 54
},
{
"epoch": 0.3559870550161812,
"grad_norm": 0.14493519067764282,
"learning_rate": 1e-05,
"loss": 0.4696,
"num_tokens": 53538431.0,
"step": 55
},
{
"epoch": 0.36245954692556637,
"grad_norm": 0.1502736210823059,
"learning_rate": 1e-05,
"loss": 0.4701,
"num_tokens": 54497097.0,
"step": 56
},
{
"epoch": 0.36893203883495146,
"grad_norm": 0.13822625577449799,
"learning_rate": 1e-05,
"loss": 0.46,
"num_tokens": 55515695.0,
"step": 57
},
{
"epoch": 0.37540453074433655,
"grad_norm": 0.15165378153324127,
"learning_rate": 1e-05,
"loss": 0.463,
"num_tokens": 56471100.0,
"step": 58
},
{
"epoch": 0.3818770226537217,
"grad_norm": 0.1460576206445694,
"learning_rate": 1e-05,
"loss": 0.4681,
"num_tokens": 57460948.0,
"step": 59
},
{
"epoch": 0.3883495145631068,
"grad_norm": 0.1480175405740738,
"learning_rate": 1e-05,
"loss": 0.4673,
"num_tokens": 58450663.0,
"step": 60
},
{
"epoch": 0.3948220064724919,
"grad_norm": 0.15429073572158813,
"learning_rate": 1e-05,
"loss": 0.4617,
"num_tokens": 59407638.0,
"step": 61
},
{
"epoch": 0.40129449838187703,
"grad_norm": 0.14732158184051514,
"learning_rate": 1e-05,
"loss": 0.4577,
"num_tokens": 60358388.0,
"step": 62
},
{
"epoch": 0.4077669902912621,
"grad_norm": 0.1532926857471466,
"learning_rate": 1e-05,
"loss": 0.4534,
"num_tokens": 61306137.0,
"step": 63
},
{
"epoch": 0.41423948220064727,
"grad_norm": 0.15105584263801575,
"learning_rate": 1e-05,
"loss": 0.4643,
"num_tokens": 62298616.0,
"step": 64
},
{
"epoch": 0.42071197411003236,
"grad_norm": 0.15281681716442108,
"learning_rate": 1e-05,
"loss": 0.4544,
"num_tokens": 63242620.0,
"step": 65
},
{
"epoch": 0.42718446601941745,
"grad_norm": 0.14645300805568695,
"learning_rate": 1e-05,
"loss": 0.4711,
"num_tokens": 64231555.0,
"step": 66
},
{
"epoch": 0.4336569579288026,
"grad_norm": 0.1446738988161087,
"learning_rate": 1e-05,
"loss": 0.4586,
"num_tokens": 65190302.0,
"step": 67
},
{
"epoch": 0.4401294498381877,
"grad_norm": 0.15015272796154022,
"learning_rate": 1e-05,
"loss": 0.4581,
"num_tokens": 66175194.0,
"step": 68
},
{
"epoch": 0.44660194174757284,
"grad_norm": 0.14811600744724274,
"learning_rate": 1e-05,
"loss": 0.4547,
"num_tokens": 67146659.0,
"step": 69
},
{
"epoch": 0.45307443365695793,
"grad_norm": 0.15160052478313446,
"learning_rate": 1e-05,
"loss": 0.4389,
"num_tokens": 68115421.0,
"step": 70
},
{
"epoch": 0.459546925566343,
"grad_norm": 0.15062326192855835,
"learning_rate": 1e-05,
"loss": 0.4372,
"num_tokens": 69078962.0,
"step": 71
},
{
"epoch": 0.46601941747572817,
"grad_norm": 0.15071865916252136,
"learning_rate": 1e-05,
"loss": 0.4611,
"num_tokens": 70056552.0,
"step": 72
},
{
"epoch": 0.47249190938511326,
"grad_norm": 0.1435338407754898,
"learning_rate": 1e-05,
"loss": 0.4549,
"num_tokens": 71056405.0,
"step": 73
},
{
"epoch": 0.47896440129449835,
"grad_norm": 0.1523878276348114,
"learning_rate": 1e-05,
"loss": 0.4652,
"num_tokens": 72011963.0,
"step": 74
},
{
"epoch": 0.4854368932038835,
"grad_norm": 0.1481529027223587,
"learning_rate": 1e-05,
"loss": 0.4524,
"num_tokens": 72974088.0,
"step": 75
},
{
"epoch": 0.4919093851132686,
"grad_norm": 0.14620022475719452,
"learning_rate": 1e-05,
"loss": 0.4472,
"num_tokens": 73963066.0,
"step": 76
},
{
"epoch": 0.49838187702265374,
"grad_norm": 0.14328692853450775,
"learning_rate": 1e-05,
"loss": 0.4397,
"num_tokens": 74927507.0,
"step": 77
},
{
"epoch": 0.5048543689320388,
"grad_norm": 0.14814729988574982,
"learning_rate": 1e-05,
"loss": 0.4445,
"num_tokens": 75913302.0,
"step": 78
},
{
"epoch": 0.511326860841424,
"grad_norm": 0.14375941455364227,
"learning_rate": 1e-05,
"loss": 0.4532,
"num_tokens": 76871637.0,
"step": 79
},
{
"epoch": 0.517799352750809,
"grad_norm": 0.15578249096870422,
"learning_rate": 1e-05,
"loss": 0.4472,
"num_tokens": 77844738.0,
"step": 80
},
{
"epoch": 0.5242718446601942,
"grad_norm": 0.14302082359790802,
"learning_rate": 1e-05,
"loss": 0.4407,
"num_tokens": 78789735.0,
"step": 81
},
{
"epoch": 0.5307443365695793,
"grad_norm": 0.16008679568767548,
"learning_rate": 1e-05,
"loss": 0.4382,
"num_tokens": 79740057.0,
"step": 82
},
{
"epoch": 0.5372168284789643,
"grad_norm": 0.14800778031349182,
"learning_rate": 1e-05,
"loss": 0.4451,
"num_tokens": 80691344.0,
"step": 83
},
{
"epoch": 0.5436893203883495,
"grad_norm": 0.14400288462638855,
"learning_rate": 1e-05,
"loss": 0.4441,
"num_tokens": 81695065.0,
"step": 84
},
{
"epoch": 0.5501618122977346,
"grad_norm": 0.13986420631408691,
"learning_rate": 1e-05,
"loss": 0.4414,
"num_tokens": 82670591.0,
"step": 85
},
{
"epoch": 0.5566343042071198,
"grad_norm": 0.15451516211032867,
"learning_rate": 1e-05,
"loss": 0.4415,
"num_tokens": 83619525.0,
"step": 86
},
{
"epoch": 0.5631067961165048,
"grad_norm": 0.14956173300743103,
"learning_rate": 1e-05,
"loss": 0.441,
"num_tokens": 84583829.0,
"step": 87
},
{
"epoch": 0.56957928802589,
"grad_norm": 0.15179790556430817,
"learning_rate": 1e-05,
"loss": 0.4407,
"num_tokens": 85554557.0,
"step": 88
},
{
"epoch": 0.5760517799352751,
"grad_norm": 0.15142634510993958,
"learning_rate": 1e-05,
"loss": 0.442,
"num_tokens": 86534800.0,
"step": 89
},
{
"epoch": 0.5825242718446602,
"grad_norm": 0.16368825733661652,
"learning_rate": 1e-05,
"loss": 0.4456,
"num_tokens": 87479322.0,
"step": 90
},
{
"epoch": 0.5889967637540453,
"grad_norm": 0.16267365217208862,
"learning_rate": 1e-05,
"loss": 0.4436,
"num_tokens": 88440604.0,
"step": 91
},
{
"epoch": 0.5954692556634305,
"grad_norm": 0.1441015601158142,
"learning_rate": 1e-05,
"loss": 0.4407,
"num_tokens": 89440902.0,
"step": 92
},
{
"epoch": 0.6019417475728155,
"grad_norm": 0.15438257157802582,
"learning_rate": 1e-05,
"loss": 0.4335,
"num_tokens": 90420292.0,
"step": 93
},
{
"epoch": 0.6084142394822006,
"grad_norm": 0.15095576643943787,
"learning_rate": 1e-05,
"loss": 0.4427,
"num_tokens": 91365927.0,
"step": 94
},
{
"epoch": 0.6148867313915858,
"grad_norm": 0.15472491085529327,
"learning_rate": 1e-05,
"loss": 0.4369,
"num_tokens": 92339481.0,
"step": 95
},
{
"epoch": 0.6213592233009708,
"grad_norm": 0.15544648468494415,
"learning_rate": 1e-05,
"loss": 0.4323,
"num_tokens": 93277920.0,
"step": 96
},
{
"epoch": 0.627831715210356,
"grad_norm": 0.1473296582698822,
"learning_rate": 1e-05,
"loss": 0.4314,
"num_tokens": 94233058.0,
"step": 97
},
{
"epoch": 0.6343042071197411,
"grad_norm": 0.15555965900421143,
"learning_rate": 1e-05,
"loss": 0.4443,
"num_tokens": 95195865.0,
"step": 98
},
{
"epoch": 0.6407766990291263,
"grad_norm": 0.15503650903701782,
"learning_rate": 1e-05,
"loss": 0.4269,
"num_tokens": 96165533.0,
"step": 99
},
{
"epoch": 0.6472491909385113,
"grad_norm": 0.1431349366903305,
"learning_rate": 1e-05,
"loss": 0.4291,
"num_tokens": 97128908.0,
"step": 100
},
{
"epoch": 0.6537216828478964,
"grad_norm": 0.1569562703371048,
"learning_rate": 1e-05,
"loss": 0.4301,
"num_tokens": 98137812.0,
"step": 101
},
{
"epoch": 0.6601941747572816,
"grad_norm": 0.14877967536449432,
"learning_rate": 1e-05,
"loss": 0.4264,
"num_tokens": 99133445.0,
"step": 102
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.16731275618076324,
"learning_rate": 1e-05,
"loss": 0.4193,
"num_tokens": 100106034.0,
"step": 103
},
{
"epoch": 0.6731391585760518,
"grad_norm": 0.15022587776184082,
"learning_rate": 1e-05,
"loss": 0.4466,
"num_tokens": 101060830.0,
"step": 104
},
{
"epoch": 0.6796116504854369,
"grad_norm": 0.15176573395729065,
"learning_rate": 1e-05,
"loss": 0.431,
"num_tokens": 102032708.0,
"step": 105
},
{
"epoch": 0.686084142394822,
"grad_norm": 0.1639350950717926,
"learning_rate": 1e-05,
"loss": 0.4145,
"num_tokens": 103020888.0,
"step": 106
},
{
"epoch": 0.6925566343042071,
"grad_norm": 0.1504807472229004,
"learning_rate": 1e-05,
"loss": 0.4377,
"num_tokens": 103986883.0,
"step": 107
},
{
"epoch": 0.6990291262135923,
"grad_norm": 0.16267862915992737,
"learning_rate": 1e-05,
"loss": 0.4348,
"num_tokens": 104978514.0,
"step": 108
},
{
"epoch": 0.7055016181229773,
"grad_norm": 0.14575445652008057,
"learning_rate": 1e-05,
"loss": 0.423,
"num_tokens": 105938542.0,
"step": 109
},
{
"epoch": 0.7119741100323624,
"grad_norm": 0.17577598989009857,
"learning_rate": 1e-05,
"loss": 0.425,
"num_tokens": 106909993.0,
"step": 110
},
{
"epoch": 0.7184466019417476,
"grad_norm": 0.14932656288146973,
"learning_rate": 1e-05,
"loss": 0.4292,
"num_tokens": 107872313.0,
"step": 111
},
{
"epoch": 0.7249190938511327,
"grad_norm": 0.15973471105098724,
"learning_rate": 1e-05,
"loss": 0.423,
"num_tokens": 108876000.0,
"step": 112
},
{
"epoch": 0.7313915857605178,
"grad_norm": 0.16627554595470428,
"learning_rate": 1e-05,
"loss": 0.4234,
"num_tokens": 109857790.0,
"step": 113
},
{
"epoch": 0.7378640776699029,
"grad_norm": 0.1428242325782776,
"learning_rate": 1e-05,
"loss": 0.4242,
"num_tokens": 110826497.0,
"step": 114
},
{
"epoch": 0.7443365695792881,
"grad_norm": 0.15781018137931824,
"learning_rate": 1e-05,
"loss": 0.4228,
"num_tokens": 111775780.0,
"step": 115
},
{
"epoch": 0.7508090614886731,
"grad_norm": 0.15125828981399536,
"learning_rate": 1e-05,
"loss": 0.4248,
"num_tokens": 112755203.0,
"step": 116
},
{
"epoch": 0.7572815533980582,
"grad_norm": 0.16092541813850403,
"learning_rate": 1e-05,
"loss": 0.4246,
"num_tokens": 113720983.0,
"step": 117
},
{
"epoch": 0.7637540453074434,
"grad_norm": 0.14613084495067596,
"learning_rate": 1e-05,
"loss": 0.4156,
"num_tokens": 114690500.0,
"step": 118
},
{
"epoch": 0.7702265372168284,
"grad_norm": 0.16003067791461945,
"learning_rate": 1e-05,
"loss": 0.4162,
"num_tokens": 115663752.0,
"step": 119
},
{
"epoch": 0.7766990291262136,
"grad_norm": 0.15294891595840454,
"learning_rate": 1e-05,
"loss": 0.4136,
"num_tokens": 116656118.0,
"step": 120
},
{
"epoch": 0.7831715210355987,
"grad_norm": 0.14649586379528046,
"learning_rate": 1e-05,
"loss": 0.4226,
"num_tokens": 117647006.0,
"step": 121
},
{
"epoch": 0.7896440129449838,
"grad_norm": 0.15786471962928772,
"learning_rate": 1e-05,
"loss": 0.4139,
"num_tokens": 118637898.0,
"step": 122
},
{
"epoch": 0.7961165048543689,
"grad_norm": 0.15297958254814148,
"learning_rate": 1e-05,
"loss": 0.4234,
"num_tokens": 119611539.0,
"step": 123
},
{
"epoch": 0.8025889967637541,
"grad_norm": 0.15327708423137665,
"learning_rate": 1e-05,
"loss": 0.4252,
"num_tokens": 120576485.0,
"step": 124
},
{
"epoch": 0.8090614886731392,
"grad_norm": 0.14421789348125458,
"learning_rate": 1e-05,
"loss": 0.4253,
"num_tokens": 121567862.0,
"step": 125
},
{
"epoch": 0.8155339805825242,
"grad_norm": 0.15074017643928528,
"learning_rate": 1e-05,
"loss": 0.4156,
"num_tokens": 122530893.0,
"step": 126
},
{
"epoch": 0.8220064724919094,
"grad_norm": 0.15563052892684937,
"learning_rate": 1e-05,
"loss": 0.4176,
"num_tokens": 123480227.0,
"step": 127
},
{
"epoch": 0.8284789644012945,
"grad_norm": 0.13963429629802704,
"learning_rate": 1e-05,
"loss": 0.4312,
"num_tokens": 124462070.0,
"step": 128
},
{
"epoch": 0.8349514563106796,
"grad_norm": 0.14507392048835754,
"learning_rate": 1e-05,
"loss": 0.4246,
"num_tokens": 125440428.0,
"step": 129
},
{
"epoch": 0.8414239482200647,
"grad_norm": 0.14936408400535583,
"learning_rate": 1e-05,
"loss": 0.4214,
"num_tokens": 126416310.0,
"step": 130
},
{
"epoch": 0.8478964401294499,
"grad_norm": 0.14725163578987122,
"learning_rate": 1e-05,
"loss": 0.4221,
"num_tokens": 127406436.0,
"step": 131
},
{
"epoch": 0.8543689320388349,
"grad_norm": 0.1589491367340088,
"learning_rate": 1e-05,
"loss": 0.4191,
"num_tokens": 128372197.0,
"step": 132
},
{
"epoch": 0.86084142394822,
"grad_norm": 0.14656752347946167,
"learning_rate": 1e-05,
"loss": 0.4174,
"num_tokens": 129371264.0,
"step": 133
},
{
"epoch": 0.8673139158576052,
"grad_norm": 0.14391183853149414,
"learning_rate": 1e-05,
"loss": 0.4026,
"num_tokens": 130306080.0,
"step": 134
},
{
"epoch": 0.8737864077669902,
"grad_norm": 0.14788095653057098,
"learning_rate": 1e-05,
"loss": 0.4227,
"num_tokens": 131261937.0,
"step": 135
},
{
"epoch": 0.8802588996763754,
"grad_norm": 0.13957837224006653,
"learning_rate": 1e-05,
"loss": 0.416,
"num_tokens": 132223182.0,
"step": 136
},
{
"epoch": 0.8867313915857605,
"grad_norm": 0.14059896767139435,
"learning_rate": 1e-05,
"loss": 0.4194,
"num_tokens": 133211562.0,
"step": 137
},
{
"epoch": 0.8932038834951457,
"grad_norm": 0.14847581088542938,
"learning_rate": 1e-05,
"loss": 0.4171,
"num_tokens": 134208133.0,
"step": 138
},
{
"epoch": 0.8996763754045307,
"grad_norm": 0.14683924615383148,
"learning_rate": 1e-05,
"loss": 0.4191,
"num_tokens": 135194807.0,
"step": 139
},
{
"epoch": 0.9061488673139159,
"grad_norm": 0.14641155302524567,
"learning_rate": 1e-05,
"loss": 0.4178,
"num_tokens": 136175941.0,
"step": 140
},
{
"epoch": 0.912621359223301,
"grad_norm": 0.1390344202518463,
"learning_rate": 1e-05,
"loss": 0.4291,
"num_tokens": 137117638.0,
"step": 141
},
{
"epoch": 0.919093851132686,
"grad_norm": 0.1498357057571411,
"learning_rate": 1e-05,
"loss": 0.4077,
"num_tokens": 138102412.0,
"step": 142
},
{
"epoch": 0.9255663430420712,
"grad_norm": 0.14186346530914307,
"learning_rate": 1e-05,
"loss": 0.4099,
"num_tokens": 139056304.0,
"step": 143
},
{
"epoch": 0.9320388349514563,
"grad_norm": 0.14950646460056305,
"learning_rate": 1e-05,
"loss": 0.4154,
"num_tokens": 140002988.0,
"step": 144
},
{
"epoch": 0.9385113268608414,
"grad_norm": 0.14963679015636444,
"learning_rate": 1e-05,
"loss": 0.4184,
"num_tokens": 140939323.0,
"step": 145
},
{
"epoch": 0.9449838187702265,
"grad_norm": 0.15120644867420197,
"learning_rate": 1e-05,
"loss": 0.4126,
"num_tokens": 141898246.0,
"step": 146
},
{
"epoch": 0.9514563106796117,
"grad_norm": 0.162687748670578,
"learning_rate": 1e-05,
"loss": 0.4204,
"num_tokens": 142861118.0,
"step": 147
},
{
"epoch": 0.9579288025889967,
"grad_norm": 0.15032649040222168,
"learning_rate": 1e-05,
"loss": 0.4136,
"num_tokens": 143845628.0,
"step": 148
},
{
"epoch": 0.9644012944983819,
"grad_norm": 0.14711233973503113,
"learning_rate": 1e-05,
"loss": 0.4253,
"num_tokens": 144816454.0,
"step": 149
},
{
"epoch": 0.970873786407767,
"grad_norm": 0.1571267545223236,
"learning_rate": 1e-05,
"loss": 0.4142,
"num_tokens": 145791528.0,
"step": 150
},
{
"epoch": 0.9773462783171522,
"grad_norm": 0.15269917249679565,
"learning_rate": 1e-05,
"loss": 0.4266,
"num_tokens": 146776225.0,
"step": 151
},
{
"epoch": 0.9838187702265372,
"grad_norm": 0.14772002398967743,
"learning_rate": 1e-05,
"loss": 0.4096,
"num_tokens": 147716038.0,
"step": 152
},
{
"epoch": 0.9902912621359223,
"grad_norm": 0.16296416521072388,
"learning_rate": 1e-05,
"loss": 0.407,
"num_tokens": 148641036.0,
"step": 153
},
{
"epoch": 0.9967637540453075,
"grad_norm": 0.15487153828144073,
"learning_rate": 1e-05,
"loss": 0.4103,
"num_tokens": 149611850.0,
"step": 154
},
{
"epoch": 1.0,
"grad_norm": 0.15487153828144073,
"learning_rate": 1e-05,
"loss": 0.4121,
"num_tokens": 150082857.0,
"step": 155
},
{
"epoch": 1.006472491909385,
"grad_norm": 0.23388831317424774,
"learning_rate": 1e-05,
"loss": 0.3944,
"num_tokens": 151065207.0,
"step": 156
},
{
"epoch": 1.0129449838187703,
"grad_norm": 0.17887376248836517,
"learning_rate": 1e-05,
"loss": 0.3865,
"num_tokens": 152046682.0,
"step": 157
},
{
"epoch": 1.0194174757281553,
"grad_norm": 0.1608133316040039,
"learning_rate": 1e-05,
"loss": 0.3896,
"num_tokens": 153014543.0,
"step": 158
},
{
"epoch": 1.0258899676375404,
"grad_norm": 0.17408691346645355,
"learning_rate": 1e-05,
"loss": 0.3962,
"num_tokens": 153988434.0,
"step": 159
},
{
"epoch": 1.0323624595469256,
"grad_norm": 0.1740640550851822,
"learning_rate": 1e-05,
"loss": 0.3925,
"num_tokens": 154938356.0,
"step": 160
},
{
"epoch": 1.0388349514563107,
"grad_norm": 0.1563650667667389,
"learning_rate": 1e-05,
"loss": 0.3924,
"num_tokens": 155920665.0,
"step": 161
},
{
"epoch": 1.0453074433656957,
"grad_norm": 0.16022861003875732,
"learning_rate": 1e-05,
"loss": 0.3946,
"num_tokens": 156878844.0,
"step": 162
},
{
"epoch": 1.051779935275081,
"grad_norm": 0.1814146637916565,
"learning_rate": 1e-05,
"loss": 0.3855,
"num_tokens": 157852056.0,
"step": 163
},
{
"epoch": 1.058252427184466,
"grad_norm": 0.1558738648891449,
"learning_rate": 1e-05,
"loss": 0.3948,
"num_tokens": 158847308.0,
"step": 164
},
{
"epoch": 1.064724919093851,
"grad_norm": 0.1744164228439331,
"learning_rate": 1e-05,
"loss": 0.3968,
"num_tokens": 159815676.0,
"step": 165
},
{
"epoch": 1.0711974110032363,
"grad_norm": 0.15834972262382507,
"learning_rate": 1e-05,
"loss": 0.3949,
"num_tokens": 160783247.0,
"step": 166
},
{
"epoch": 1.0776699029126213,
"grad_norm": 0.16717448830604553,
"learning_rate": 1e-05,
"loss": 0.3905,
"num_tokens": 161757978.0,
"step": 167
},
{
"epoch": 1.0841423948220066,
"grad_norm": 0.1640552282333374,
"learning_rate": 1e-05,
"loss": 0.3853,
"num_tokens": 162697927.0,
"step": 168
},
{
"epoch": 1.0906148867313916,
"grad_norm": 0.1878693401813507,
"learning_rate": 1e-05,
"loss": 0.3935,
"num_tokens": 163662092.0,
"step": 169
},
{
"epoch": 1.0970873786407767,
"grad_norm": 0.15845640003681183,
"learning_rate": 1e-05,
"loss": 0.3836,
"num_tokens": 164636005.0,
"step": 170
},
{
"epoch": 1.103559870550162,
"grad_norm": 0.17992043495178223,
"learning_rate": 1e-05,
"loss": 0.3908,
"num_tokens": 165575709.0,
"step": 171
},
{
"epoch": 1.110032362459547,
"grad_norm": 0.15234288573265076,
"learning_rate": 1e-05,
"loss": 0.3789,
"num_tokens": 166581172.0,
"step": 172
},
{
"epoch": 1.116504854368932,
"grad_norm": 0.15983612835407257,
"learning_rate": 1e-05,
"loss": 0.389,
"num_tokens": 167529454.0,
"step": 173
},
{
"epoch": 1.1229773462783172,
"grad_norm": 0.1675143986940384,
"learning_rate": 1e-05,
"loss": 0.3799,
"num_tokens": 168484414.0,
"step": 174
},
{
"epoch": 1.1294498381877023,
"grad_norm": 0.15397170186042786,
"learning_rate": 1e-05,
"loss": 0.3921,
"num_tokens": 169452401.0,
"step": 175
},
{
"epoch": 1.1359223300970873,
"grad_norm": 0.17627382278442383,
"learning_rate": 1e-05,
"loss": 0.383,
"num_tokens": 170407712.0,
"step": 176
},
{
"epoch": 1.1423948220064726,
"grad_norm": 0.18198609352111816,
"learning_rate": 1e-05,
"loss": 0.3901,
"num_tokens": 171418075.0,
"step": 177
},
{
"epoch": 1.1488673139158576,
"grad_norm": 0.1528196781873703,
"learning_rate": 1e-05,
"loss": 0.3893,
"num_tokens": 172409907.0,
"step": 178
},
{
"epoch": 1.1553398058252426,
"grad_norm": 0.17974089086055756,
"learning_rate": 1e-05,
"loss": 0.3845,
"num_tokens": 173368352.0,
"step": 179
},
{
"epoch": 1.161812297734628,
"grad_norm": 0.16560745239257812,
"learning_rate": 1e-05,
"loss": 0.389,
"num_tokens": 174328021.0,
"step": 180
},
{
"epoch": 1.168284789644013,
"grad_norm": 0.16693539917469025,
"learning_rate": 1e-05,
"loss": 0.3859,
"num_tokens": 175328614.0,
"step": 181
},
{
"epoch": 1.174757281553398,
"grad_norm": 0.20071224868297577,
"learning_rate": 1e-05,
"loss": 0.3881,
"num_tokens": 176287442.0,
"step": 182
},
{
"epoch": 1.1812297734627832,
"grad_norm": 0.1772185117006302,
"learning_rate": 1e-05,
"loss": 0.3924,
"num_tokens": 177272590.0,
"step": 183
},
{
"epoch": 1.1877022653721683,
"grad_norm": 0.17591412365436554,
"learning_rate": 1e-05,
"loss": 0.3937,
"num_tokens": 178243495.0,
"step": 184
},
{
"epoch": 1.1941747572815533,
"grad_norm": 0.17770753800868988,
"learning_rate": 1e-05,
"loss": 0.3895,
"num_tokens": 179230835.0,
"step": 185
},
{
"epoch": 1.2006472491909386,
"grad_norm": 0.16321398317813873,
"learning_rate": 1e-05,
"loss": 0.3842,
"num_tokens": 180197716.0,
"step": 186
},
{
"epoch": 1.2071197411003236,
"grad_norm": 0.19365891814231873,
"learning_rate": 1e-05,
"loss": 0.397,
"num_tokens": 181168136.0,
"step": 187
},
{
"epoch": 1.2135922330097086,
"grad_norm": 0.15928582847118378,
"learning_rate": 1e-05,
"loss": 0.3871,
"num_tokens": 182127126.0,
"step": 188
},
{
"epoch": 1.220064724919094,
"grad_norm": 0.1753508299589157,
"learning_rate": 1e-05,
"loss": 0.3892,
"num_tokens": 183085892.0,
"step": 189
},
{
"epoch": 1.226537216828479,
"grad_norm": 0.16824939846992493,
"learning_rate": 1e-05,
"loss": 0.3868,
"num_tokens": 184057299.0,
"step": 190
},
{
"epoch": 1.233009708737864,
"grad_norm": 0.1595918834209442,
"learning_rate": 1e-05,
"loss": 0.3828,
"num_tokens": 184998791.0,
"step": 191
},
{
"epoch": 1.2394822006472492,
"grad_norm": 0.1542261838912964,
"learning_rate": 1e-05,
"loss": 0.3795,
"num_tokens": 185986785.0,
"step": 192
},
{
"epoch": 1.2459546925566343,
"grad_norm": 0.15647530555725098,
"learning_rate": 1e-05,
"loss": 0.3742,
"num_tokens": 186929881.0,
"step": 193
},
{
"epoch": 1.2524271844660193,
"grad_norm": 0.15532921254634857,
"learning_rate": 1e-05,
"loss": 0.39,
"num_tokens": 187902972.0,
"step": 194
},
{
"epoch": 1.2588996763754046,
"grad_norm": 0.15017175674438477,
"learning_rate": 1e-05,
"loss": 0.375,
"num_tokens": 188884007.0,
"step": 195
},
{
"epoch": 1.2653721682847896,
"grad_norm": 0.16391442716121674,
"learning_rate": 1e-05,
"loss": 0.3774,
"num_tokens": 189812757.0,
"step": 196
},
{
"epoch": 1.2718446601941746,
"grad_norm": 0.14583992958068848,
"learning_rate": 1e-05,
"loss": 0.3799,
"num_tokens": 190775296.0,
"step": 197
},
{
"epoch": 1.27831715210356,
"grad_norm": 0.15327832102775574,
"learning_rate": 1e-05,
"loss": 0.3819,
"num_tokens": 191750113.0,
"step": 198
},
{
"epoch": 1.284789644012945,
"grad_norm": 0.152728870511055,
"learning_rate": 1e-05,
"loss": 0.383,
"num_tokens": 192691419.0,
"step": 199
},
{
"epoch": 1.29126213592233,
"grad_norm": 0.1549627035856247,
"learning_rate": 1e-05,
"loss": 0.3882,
"num_tokens": 193680379.0,
"step": 200
},
{
"epoch": 1.2977346278317152,
"grad_norm": 0.15266035497188568,
"learning_rate": 1e-05,
"loss": 0.3816,
"num_tokens": 194655500.0,
"step": 201
},
{
"epoch": 1.3042071197411003,
"grad_norm": 0.1430487483739853,
"learning_rate": 1e-05,
"loss": 0.3829,
"num_tokens": 195598008.0,
"step": 202
},
{
"epoch": 1.3106796116504853,
"grad_norm": 0.1570296436548233,
"learning_rate": 1e-05,
"loss": 0.3862,
"num_tokens": 196583549.0,
"step": 203
},
{
"epoch": 1.3171521035598706,
"grad_norm": 0.14150913059711456,
"learning_rate": 1e-05,
"loss": 0.3831,
"num_tokens": 197555491.0,
"step": 204
},
{
"epoch": 1.3236245954692556,
"grad_norm": 0.15058490633964539,
"learning_rate": 1e-05,
"loss": 0.3766,
"num_tokens": 198552040.0,
"step": 205
},
{
"epoch": 1.3300970873786409,
"grad_norm": 0.14992493391036987,
"learning_rate": 1e-05,
"loss": 0.3775,
"num_tokens": 199517007.0,
"step": 206
},
{
"epoch": 1.3365695792880259,
"grad_norm": 0.14830483496189117,
"learning_rate": 1e-05,
"loss": 0.3844,
"num_tokens": 200490485.0,
"step": 207
},
{
"epoch": 1.343042071197411,
"grad_norm": 0.1546541452407837,
"learning_rate": 1e-05,
"loss": 0.3899,
"num_tokens": 201459121.0,
"step": 208
},
{
"epoch": 1.3495145631067962,
"grad_norm": 0.15092389285564423,
"learning_rate": 1e-05,
"loss": 0.3794,
"num_tokens": 202378391.0,
"step": 209
},
{
"epoch": 1.3559870550161812,
"grad_norm": 0.15256242454051971,
"learning_rate": 1e-05,
"loss": 0.3868,
"num_tokens": 203389991.0,
"step": 210
},
{
"epoch": 1.3624595469255665,
"grad_norm": 0.15065321326255798,
"learning_rate": 1e-05,
"loss": 0.3854,
"num_tokens": 204350028.0,
"step": 211
},
{
"epoch": 1.3689320388349515,
"grad_norm": 0.1523621529340744,
"learning_rate": 1e-05,
"loss": 0.3859,
"num_tokens": 205321725.0,
"step": 212
},
{
"epoch": 1.3754045307443366,
"grad_norm": 0.1598656326532364,
"learning_rate": 1e-05,
"loss": 0.3858,
"num_tokens": 206300400.0,
"step": 213
},
{
"epoch": 1.3818770226537218,
"grad_norm": 0.1452968567609787,
"learning_rate": 1e-05,
"loss": 0.3812,
"num_tokens": 207269349.0,
"step": 214
},
{
"epoch": 1.3883495145631068,
"grad_norm": 0.15008953213691711,
"learning_rate": 1e-05,
"loss": 0.3783,
"num_tokens": 208239995.0,
"step": 215
},
{
"epoch": 1.3948220064724919,
"grad_norm": 0.1555267572402954,
"learning_rate": 1e-05,
"loss": 0.3866,
"num_tokens": 209197405.0,
"step": 216
},
{
"epoch": 1.4012944983818771,
"grad_norm": 0.1534145027399063,
"learning_rate": 1e-05,
"loss": 0.3788,
"num_tokens": 210173069.0,
"step": 217
},
{
"epoch": 1.4077669902912622,
"grad_norm": 0.14885641634464264,
"learning_rate": 1e-05,
"loss": 0.3764,
"num_tokens": 211140476.0,
"step": 218
},
{
"epoch": 1.4142394822006472,
"grad_norm": 0.1480827033519745,
"learning_rate": 1e-05,
"loss": 0.3773,
"num_tokens": 212101096.0,
"step": 219
},
{
"epoch": 1.4207119741100325,
"grad_norm": 0.16137006878852844,
"learning_rate": 1e-05,
"loss": 0.3844,
"num_tokens": 213068720.0,
"step": 220
},
{
"epoch": 1.4271844660194175,
"grad_norm": 0.1478767842054367,
"learning_rate": 1e-05,
"loss": 0.3827,
"num_tokens": 214020099.0,
"step": 221
},
{
"epoch": 1.4336569579288025,
"grad_norm": 0.16222462058067322,
"learning_rate": 1e-05,
"loss": 0.3761,
"num_tokens": 214947806.0,
"step": 222
},
{
"epoch": 1.4401294498381878,
"grad_norm": 0.1649448126554489,
"learning_rate": 1e-05,
"loss": 0.3794,
"num_tokens": 215925598.0,
"step": 223
},
{
"epoch": 1.4466019417475728,
"grad_norm": 0.14599865674972534,
"learning_rate": 1e-05,
"loss": 0.374,
"num_tokens": 216915650.0,
"step": 224
},
{
"epoch": 1.4530744336569579,
"grad_norm": 0.16679567098617554,
"learning_rate": 1e-05,
"loss": 0.3823,
"num_tokens": 217865373.0,
"step": 225
},
{
"epoch": 1.4595469255663431,
"grad_norm": 0.15487605333328247,
"learning_rate": 1e-05,
"loss": 0.373,
"num_tokens": 218823612.0,
"step": 226
},
{
"epoch": 1.4660194174757282,
"grad_norm": 0.1633458137512207,
"learning_rate": 1e-05,
"loss": 0.3722,
"num_tokens": 219799962.0,
"step": 227
},
{
"epoch": 1.4724919093851132,
"grad_norm": 0.15614818036556244,
"learning_rate": 1e-05,
"loss": 0.3828,
"num_tokens": 220771199.0,
"step": 228
},
{
"epoch": 1.4789644012944985,
"grad_norm": 0.17319650948047638,
"learning_rate": 1e-05,
"loss": 0.3777,
"num_tokens": 221736326.0,
"step": 229
},
{
"epoch": 1.4854368932038835,
"grad_norm": 0.16175642609596252,
"learning_rate": 1e-05,
"loss": 0.377,
"num_tokens": 222710609.0,
"step": 230
},
{
"epoch": 1.4919093851132685,
"grad_norm": 0.16791830956935883,
"learning_rate": 1e-05,
"loss": 0.3739,
"num_tokens": 223688658.0,
"step": 231
},
{
"epoch": 1.4983818770226538,
"grad_norm": 0.15006020665168762,
"learning_rate": 1e-05,
"loss": 0.3789,
"num_tokens": 224689265.0,
"step": 232
},
{
"epoch": 1.5048543689320388,
"grad_norm": 0.16425134241580963,
"learning_rate": 1e-05,
"loss": 0.3769,
"num_tokens": 225643475.0,
"step": 233
},
{
"epoch": 1.5113268608414239,
"grad_norm": 0.14565372467041016,
"learning_rate": 1e-05,
"loss": 0.3807,
"num_tokens": 226617652.0,
"step": 234
},
{
"epoch": 1.5177993527508091,
"grad_norm": 0.16489112377166748,
"learning_rate": 1e-05,
"loss": 0.3755,
"num_tokens": 227553267.0,
"step": 235
},
{
"epoch": 1.5242718446601942,
"grad_norm": 0.15542085468769073,
"learning_rate": 1e-05,
"loss": 0.3778,
"num_tokens": 228538404.0,
"step": 236
},
{
"epoch": 1.5307443365695792,
"grad_norm": 0.16758009791374207,
"learning_rate": 1e-05,
"loss": 0.3741,
"num_tokens": 229526832.0,
"step": 237
},
{
"epoch": 1.5372168284789645,
"grad_norm": 0.15440639853477478,
"learning_rate": 1e-05,
"loss": 0.3791,
"num_tokens": 230514566.0,
"step": 238
},
{
"epoch": 1.5436893203883495,
"grad_norm": 0.16300874948501587,
"learning_rate": 1e-05,
"loss": 0.3759,
"num_tokens": 231499488.0,
"step": 239
},
{
"epoch": 1.5501618122977345,
"grad_norm": 0.1678025871515274,
"learning_rate": 1e-05,
"loss": 0.3868,
"num_tokens": 232467260.0,
"step": 240
},
{
"epoch": 1.5566343042071198,
"grad_norm": 0.15477962791919708,
"learning_rate": 1e-05,
"loss": 0.3786,
"num_tokens": 233453448.0,
"step": 241
},
{
"epoch": 1.5631067961165048,
"grad_norm": 0.16532817482948303,
"learning_rate": 1e-05,
"loss": 0.3729,
"num_tokens": 234468336.0,
"step": 242
},
{
"epoch": 1.5695792880258899,
"grad_norm": 0.15214623510837555,
"learning_rate": 1e-05,
"loss": 0.3826,
"num_tokens": 235437157.0,
"step": 243
},
{
"epoch": 1.5760517799352751,
"grad_norm": 0.16525112092494965,
"learning_rate": 1e-05,
"loss": 0.3836,
"num_tokens": 236420720.0,
"step": 244
},
{
"epoch": 1.5825242718446602,
"grad_norm": 0.164701908826828,
"learning_rate": 1e-05,
"loss": 0.3801,
"num_tokens": 237408717.0,
"step": 245
},
{
"epoch": 1.5889967637540452,
"grad_norm": 0.1614416241645813,
"learning_rate": 1e-05,
"loss": 0.3835,
"num_tokens": 238399497.0,
"step": 246
},
{
"epoch": 1.5954692556634305,
"grad_norm": 0.17205291986465454,
"learning_rate": 1e-05,
"loss": 0.385,
"num_tokens": 239341614.0,
"step": 247
},
{
"epoch": 1.6019417475728155,
"grad_norm": 0.172869473695755,
"learning_rate": 1e-05,
"loss": 0.378,
"num_tokens": 240303758.0,
"step": 248
},
{
"epoch": 1.6084142394822005,
"grad_norm": 0.170328289270401,
"learning_rate": 1e-05,
"loss": 0.378,
"num_tokens": 241254594.0,
"step": 249
},
{
"epoch": 1.6148867313915858,
"grad_norm": 0.15210796892642975,
"learning_rate": 1e-05,
"loss": 0.3825,
"num_tokens": 242199383.0,
"step": 250
},
{
"epoch": 1.6213592233009708,
"grad_norm": 0.17345553636550903,
"learning_rate": 1e-05,
"loss": 0.376,
"num_tokens": 243205373.0,
"step": 251
},
{
"epoch": 1.6278317152103559,
"grad_norm": 0.15487349033355713,
"learning_rate": 1e-05,
"loss": 0.371,
"num_tokens": 244177909.0,
"step": 252
},
{
"epoch": 1.6343042071197411,
"grad_norm": 0.1735333651304245,
"learning_rate": 1e-05,
"loss": 0.3826,
"num_tokens": 245165218.0,
"step": 253
},
{
"epoch": 1.6407766990291264,
"grad_norm": 0.1656838059425354,
"learning_rate": 1e-05,
"loss": 0.3714,
"num_tokens": 246106607.0,
"step": 254
},
{
"epoch": 1.6472491909385112,
"grad_norm": 0.16804338991641998,
"learning_rate": 1e-05,
"loss": 0.3777,
"num_tokens": 247086207.0,
"step": 255
},
{
"epoch": 1.6537216828478964,
"grad_norm": 0.15802405774593353,
"learning_rate": 1e-05,
"loss": 0.3811,
"num_tokens": 248068946.0,
"step": 256
},
{
"epoch": 1.6601941747572817,
"grad_norm": 0.16986878216266632,
"learning_rate": 1e-05,
"loss": 0.372,
"num_tokens": 249047854.0,
"step": 257
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.1500914990901947,
"learning_rate": 1e-05,
"loss": 0.3844,
"num_tokens": 250033293.0,
"step": 258
},
{
"epoch": 1.6731391585760518,
"grad_norm": 0.15746842324733734,
"learning_rate": 1e-05,
"loss": 0.3799,
"num_tokens": 251008614.0,
"step": 259
},
{
"epoch": 1.679611650485437,
"grad_norm": 0.15219233930110931,
"learning_rate": 1e-05,
"loss": 0.3728,
"num_tokens": 251959677.0,
"step": 260
},
{
"epoch": 1.6860841423948218,
"grad_norm": 0.1517256200313568,
"learning_rate": 1e-05,
"loss": 0.3789,
"num_tokens": 252935571.0,
"step": 261
},
{
"epoch": 1.692556634304207,
"grad_norm": 0.15222369134426117,
"learning_rate": 1e-05,
"loss": 0.3806,
"num_tokens": 253879032.0,
"step": 262
},
{
"epoch": 1.6990291262135924,
"grad_norm": 0.15999265015125275,
"learning_rate": 1e-05,
"loss": 0.3646,
"num_tokens": 254826418.0,
"step": 263
},
{
"epoch": 1.7055016181229772,
"grad_norm": 0.160135418176651,
"learning_rate": 1e-05,
"loss": 0.3817,
"num_tokens": 255785864.0,
"step": 264
},
{
"epoch": 1.7119741100323624,
"grad_norm": 0.1449073851108551,
"learning_rate": 1e-05,
"loss": 0.3722,
"num_tokens": 256751813.0,
"step": 265
},
{
"epoch": 1.7184466019417477,
"grad_norm": 0.15163810551166534,
"learning_rate": 1e-05,
"loss": 0.3802,
"num_tokens": 257721834.0,
"step": 266
},
{
"epoch": 1.7249190938511327,
"grad_norm": 0.14865443110466003,
"learning_rate": 1e-05,
"loss": 0.3796,
"num_tokens": 258673121.0,
"step": 267
},
{
"epoch": 1.7313915857605178,
"grad_norm": 0.15070468187332153,
"learning_rate": 1e-05,
"loss": 0.3842,
"num_tokens": 259660088.0,
"step": 268
},
{
"epoch": 1.737864077669903,
"grad_norm": 0.1530027985572815,
"learning_rate": 1e-05,
"loss": 0.3846,
"num_tokens": 260659515.0,
"step": 269
},
{
"epoch": 1.744336569579288,
"grad_norm": 0.17373213171958923,
"learning_rate": 1e-05,
"loss": 0.3787,
"num_tokens": 261673283.0,
"step": 270
},
{
"epoch": 1.750809061488673,
"grad_norm": 0.17764431238174438,
"learning_rate": 1e-05,
"loss": 0.3714,
"num_tokens": 262663793.0,
"step": 271
},
{
"epoch": 1.7572815533980584,
"grad_norm": 0.1590733826160431,
"learning_rate": 1e-05,
"loss": 0.3766,
"num_tokens": 263631573.0,
"step": 272
},
{
"epoch": 1.7637540453074434,
"grad_norm": 0.18542887270450592,
"learning_rate": 1e-05,
"loss": 0.3827,
"num_tokens": 264625690.0,
"step": 273
},
{
"epoch": 1.7702265372168284,
"grad_norm": 0.17420196533203125,
"learning_rate": 1e-05,
"loss": 0.3663,
"num_tokens": 265585386.0,
"step": 274
},
{
"epoch": 1.7766990291262137,
"grad_norm": 0.14788936078548431,
"learning_rate": 1e-05,
"loss": 0.3868,
"num_tokens": 266565331.0,
"step": 275
},
{
"epoch": 1.7831715210355987,
"grad_norm": 0.17412249743938446,
"learning_rate": 1e-05,
"loss": 0.3754,
"num_tokens": 267570388.0,
"step": 276
},
{
"epoch": 1.7896440129449838,
"grad_norm": 0.1655820608139038,
"learning_rate": 1e-05,
"loss": 0.3748,
"num_tokens": 268536852.0,
"step": 277
},
{
"epoch": 1.796116504854369,
"grad_norm": 0.15587899088859558,
"learning_rate": 1e-05,
"loss": 0.3827,
"num_tokens": 269499895.0,
"step": 278
},
{
"epoch": 1.802588996763754,
"grad_norm": 0.1700071096420288,
"learning_rate": 1e-05,
"loss": 0.3708,
"num_tokens": 270449454.0,
"step": 279
},
{
"epoch": 1.809061488673139,
"grad_norm": 0.1482965350151062,
"learning_rate": 1e-05,
"loss": 0.3701,
"num_tokens": 271445748.0,
"step": 280
},
{
"epoch": 1.8155339805825244,
"grad_norm": 0.16300396621227264,
"learning_rate": 1e-05,
"loss": 0.3764,
"num_tokens": 272437678.0,
"step": 281
},
{
"epoch": 1.8220064724919094,
"grad_norm": 0.14801423251628876,
"learning_rate": 1e-05,
"loss": 0.3806,
"num_tokens": 273421632.0,
"step": 282
},
{
"epoch": 1.8284789644012944,
"grad_norm": 0.1659514456987381,
"learning_rate": 1e-05,
"loss": 0.369,
"num_tokens": 274382121.0,
"step": 283
},
{
"epoch": 1.8349514563106797,
"grad_norm": 0.14842261373996735,
"learning_rate": 1e-05,
"loss": 0.3658,
"num_tokens": 275355409.0,
"step": 284
},
{
"epoch": 1.8414239482200647,
"grad_norm": 0.14394618570804596,
"learning_rate": 1e-05,
"loss": 0.3714,
"num_tokens": 276363366.0,
"step": 285
},
{
"epoch": 1.8478964401294498,
"grad_norm": 0.14794841408729553,
"learning_rate": 1e-05,
"loss": 0.3768,
"num_tokens": 277316291.0,
"step": 286
},
{
"epoch": 1.854368932038835,
"grad_norm": 0.15567384660243988,
"learning_rate": 1e-05,
"loss": 0.3742,
"num_tokens": 278277485.0,
"step": 287
},
{
"epoch": 1.86084142394822,
"grad_norm": 0.13822609186172485,
"learning_rate": 1e-05,
"loss": 0.3689,
"num_tokens": 279263638.0,
"step": 288
},
{
"epoch": 1.867313915857605,
"grad_norm": 0.1645592600107193,
"learning_rate": 1e-05,
"loss": 0.3685,
"num_tokens": 280229543.0,
"step": 289
},
{
"epoch": 1.8737864077669903,
"grad_norm": 0.1565285474061966,
"learning_rate": 1e-05,
"loss": 0.3681,
"num_tokens": 281207364.0,
"step": 290
},
{
"epoch": 1.8802588996763754,
"grad_norm": 0.15756022930145264,
"learning_rate": 1e-05,
"loss": 0.3668,
"num_tokens": 282156358.0,
"step": 291
},
{
"epoch": 1.8867313915857604,
"grad_norm": 0.15506701171398163,
"learning_rate": 1e-05,
"loss": 0.3697,
"num_tokens": 283119435.0,
"step": 292
},
{
"epoch": 1.8932038834951457,
"grad_norm": 0.15256839990615845,
"learning_rate": 1e-05,
"loss": 0.3793,
"num_tokens": 284084676.0,
"step": 293
},
{
"epoch": 1.8996763754045307,
"grad_norm": 0.16919973492622375,
"learning_rate": 1e-05,
"loss": 0.3895,
"num_tokens": 285051682.0,
"step": 294
},
{
"epoch": 1.9061488673139158,
"grad_norm": 0.1467684656381607,
"learning_rate": 1e-05,
"loss": 0.3741,
"num_tokens": 286037634.0,
"step": 295
},
{
"epoch": 1.912621359223301,
"grad_norm": 0.1547224074602127,
"learning_rate": 1e-05,
"loss": 0.3677,
"num_tokens": 287028258.0,
"step": 296
},
{
"epoch": 1.919093851132686,
"grad_norm": 0.14955021440982819,
"learning_rate": 1e-05,
"loss": 0.371,
"num_tokens": 287996979.0,
"step": 297
},
{
"epoch": 1.925566343042071,
"grad_norm": 0.14600345492362976,
"learning_rate": 1e-05,
"loss": 0.3704,
"num_tokens": 288984893.0,
"step": 298
},
{
"epoch": 1.9320388349514563,
"grad_norm": 0.15038008987903595,
"learning_rate": 1e-05,
"loss": 0.3728,
"num_tokens": 289948988.0,
"step": 299
},
{
"epoch": 1.9385113268608414,
"grad_norm": 0.1539427489042282,
"learning_rate": 1e-05,
"loss": 0.3786,
"num_tokens": 290947258.0,
"step": 300
},
{
"epoch": 1.9449838187702264,
"grad_norm": 0.1640380322933197,
"learning_rate": 1e-05,
"loss": 0.3735,
"num_tokens": 291896825.0,
"step": 301
},
{
"epoch": 1.9514563106796117,
"grad_norm": 0.13670173287391663,
"learning_rate": 1e-05,
"loss": 0.3639,
"num_tokens": 292874918.0,
"step": 302
},
{
"epoch": 1.9579288025889967,
"grad_norm": 0.1619482934474945,
"learning_rate": 1e-05,
"loss": 0.3761,
"num_tokens": 293858095.0,
"step": 303
},
{
"epoch": 1.9644012944983817,
"grad_norm": 0.14659424126148224,
"learning_rate": 1e-05,
"loss": 0.3793,
"num_tokens": 294831363.0,
"step": 304
},
{
"epoch": 1.970873786407767,
"grad_norm": 0.1463099718093872,
"learning_rate": 1e-05,
"loss": 0.3667,
"num_tokens": 295815029.0,
"step": 305
},
{
"epoch": 1.9773462783171523,
"grad_norm": 0.15758393704891205,
"learning_rate": 1e-05,
"loss": 0.381,
"num_tokens": 296783225.0,
"step": 306
},
{
"epoch": 1.983818770226537,
"grad_norm": 0.14858050644397736,
"learning_rate": 1e-05,
"loss": 0.37,
"num_tokens": 297740638.0,
"step": 307
},
{
"epoch": 1.9902912621359223,
"grad_norm": 0.1502712517976761,
"learning_rate": 1e-05,
"loss": 0.382,
"num_tokens": 298713508.0,
"step": 308
},
{
"epoch": 1.9902912621359223,
"step": 308,
"total_flos": 1.2344471118126514e+19,
"train_loss": 0.44806588683035464,
"train_runtime": 9308.2443,
"train_samples_per_second": 14.854,
"train_steps_per_second": 0.033
}
],
"logging_steps": 1,
"max_steps": 308,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 16,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2344471118126514e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}