{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004326663061114116,
"grad_norm": 12.063708720251004,
"learning_rate": 0.0,
"loss": 1.3101,
"num_tokens": 8052727.0,
"step": 1
},
{
"epoch": 0.008653326122228232,
"grad_norm": 11.727718712189134,
"learning_rate": 5.714285714285715e-07,
"loss": 1.3318,
"num_tokens": 16316249.0,
"step": 2
},
{
"epoch": 0.012979989183342347,
"grad_norm": 11.537944660412041,
"learning_rate": 1.142857142857143e-06,
"loss": 1.3489,
"num_tokens": 24414956.0,
"step": 3
},
{
"epoch": 0.017306652244456464,
"grad_norm": 11.513834135754882,
"learning_rate": 1.7142857142857145e-06,
"loss": 1.3106,
"num_tokens": 32432061.0,
"step": 4
},
{
"epoch": 0.02163331530557058,
"grad_norm": 10.709616265045359,
"learning_rate": 2.285714285714286e-06,
"loss": 1.3397,
"num_tokens": 40560264.0,
"step": 5
},
{
"epoch": 0.025959978366684695,
"grad_norm": 6.3776460128645125,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.3048,
"num_tokens": 48746761.0,
"step": 6
},
{
"epoch": 0.03028664142779881,
"grad_norm": 4.803209403173688,
"learning_rate": 3.428571428571429e-06,
"loss": 1.2789,
"num_tokens": 56923208.0,
"step": 7
},
{
"epoch": 0.03461330448891293,
"grad_norm": 3.3389242242644204,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2859,
"num_tokens": 65037147.0,
"step": 8
},
{
"epoch": 0.038939967550027044,
"grad_norm": 2.67959921100429,
"learning_rate": 4.571428571428572e-06,
"loss": 1.2151,
"num_tokens": 73157157.0,
"step": 9
},
{
"epoch": 0.04326663061114116,
"grad_norm": 2.672716941514162,
"learning_rate": 5.142857142857142e-06,
"loss": 1.27,
"num_tokens": 81330425.0,
"step": 10
},
{
"epoch": 0.047593293672255274,
"grad_norm": 2.3564616974865276,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.2467,
"num_tokens": 89515900.0,
"step": 11
},
{
"epoch": 0.05191995673336939,
"grad_norm": 1.4041532628718385,
"learning_rate": 6.285714285714286e-06,
"loss": 1.2437,
"num_tokens": 97822905.0,
"step": 12
},
{
"epoch": 0.056246619794483504,
"grad_norm": 3.4995096434595614,
"learning_rate": 6.857142857142858e-06,
"loss": 1.228,
"num_tokens": 105891211.0,
"step": 13
},
{
"epoch": 0.06057328285559762,
"grad_norm": 1.7414005281229072,
"learning_rate": 7.428571428571429e-06,
"loss": 1.2271,
"num_tokens": 114159655.0,
"step": 14
},
{
"epoch": 0.06489994591671173,
"grad_norm": 3.301008162368563,
"learning_rate": 8.000000000000001e-06,
"loss": 1.2085,
"num_tokens": 122302268.0,
"step": 15
},
{
"epoch": 0.06922660897782586,
"grad_norm": 1.515296843459891,
"learning_rate": 8.571428571428571e-06,
"loss": 1.2182,
"num_tokens": 130363765.0,
"step": 16
},
{
"epoch": 0.07355327203893997,
"grad_norm": 0.8966168784111427,
"learning_rate": 9.142857142857144e-06,
"loss": 1.1771,
"num_tokens": 138559833.0,
"step": 17
},
{
"epoch": 0.07787993510005409,
"grad_norm": 1.0197011395640607,
"learning_rate": 9.714285714285715e-06,
"loss": 1.1908,
"num_tokens": 146796904.0,
"step": 18
},
{
"epoch": 0.0822065981611682,
"grad_norm": 0.8539259040543176,
"learning_rate": 1.0285714285714285e-05,
"loss": 1.1752,
"num_tokens": 155019734.0,
"step": 19
},
{
"epoch": 0.08653326122228232,
"grad_norm": 0.9490693214323738,
"learning_rate": 1.0857142857142858e-05,
"loss": 1.1664,
"num_tokens": 163012614.0,
"step": 20
},
{
"epoch": 0.09085992428339643,
"grad_norm": 0.7637306204373832,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.1527,
"num_tokens": 171242469.0,
"step": 21
},
{
"epoch": 0.09518658734451055,
"grad_norm": 0.684327460495295,
"learning_rate": 1.2e-05,
"loss": 1.169,
"num_tokens": 179418820.0,
"step": 22
},
{
"epoch": 0.09951325040562466,
"grad_norm": 0.8747628249391602,
"learning_rate": 1.2571428571428572e-05,
"loss": 1.1426,
"num_tokens": 187617438.0,
"step": 23
},
{
"epoch": 0.10383991346673878,
"grad_norm": 0.7383283690375211,
"learning_rate": 1.3142857142857145e-05,
"loss": 1.166,
"num_tokens": 195684109.0,
"step": 24
},
{
"epoch": 0.10816657652785289,
"grad_norm": 0.7774312103590332,
"learning_rate": 1.3714285714285716e-05,
"loss": 1.1729,
"num_tokens": 8143851.0,
"step": 25
},
{
"epoch": 0.11249323958896701,
"grad_norm": 0.742987701591983,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.1418,
"num_tokens": 16231014.0,
"step": 26
},
{
"epoch": 0.11681990265008113,
"grad_norm": 0.7597974072212311,
"learning_rate": 1.4857142857142858e-05,
"loss": 1.1519,
"num_tokens": 24491951.0,
"step": 27
},
{
"epoch": 0.12114656571119524,
"grad_norm": 0.5863008064226061,
"learning_rate": 1.542857142857143e-05,
"loss": 1.1519,
"num_tokens": 32878252.0,
"step": 28
},
{
"epoch": 0.12547322877230935,
"grad_norm": 0.8462710417949215,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.1511,
"num_tokens": 41054015.0,
"step": 29
},
{
"epoch": 0.12979989183342347,
"grad_norm": 0.7552796906152683,
"learning_rate": 1.6571428571428574e-05,
"loss": 1.1372,
"num_tokens": 49241122.0,
"step": 30
},
{
"epoch": 0.1341265548945376,
"grad_norm": 0.7260275701543588,
"learning_rate": 1.7142857142857142e-05,
"loss": 1.1449,
"num_tokens": 57442255.0,
"step": 31
},
{
"epoch": 0.13845321795565171,
"grad_norm": 0.956217372168425,
"learning_rate": 1.7714285714285717e-05,
"loss": 1.139,
"num_tokens": 65747927.0,
"step": 32
},
{
"epoch": 0.1427798810167658,
"grad_norm": 0.8902649541620489,
"learning_rate": 1.8285714285714288e-05,
"loss": 1.123,
"num_tokens": 73728881.0,
"step": 33
},
{
"epoch": 0.14710654407787993,
"grad_norm": 0.940638327725262,
"learning_rate": 1.885714285714286e-05,
"loss": 1.1332,
"num_tokens": 81823147.0,
"step": 34
},
{
"epoch": 0.15143320713899405,
"grad_norm": 1.05820771639681,
"learning_rate": 1.942857142857143e-05,
"loss": 1.1233,
"num_tokens": 90140961.0,
"step": 35
},
{
"epoch": 0.15575987020010817,
"grad_norm": 0.6786564535689352,
"learning_rate": 2e-05,
"loss": 1.1134,
"num_tokens": 98314830.0,
"step": 36
},
{
"epoch": 0.16008653326122227,
"grad_norm": 1.3940550517981376,
"learning_rate": 1.9999964908096047e-05,
"loss": 1.1349,
"num_tokens": 106342502.0,
"step": 37
},
{
"epoch": 0.1644131963223364,
"grad_norm": 0.6715963909078057,
"learning_rate": 1.9999859632657835e-05,
"loss": 1.0989,
"num_tokens": 114627503.0,
"step": 38
},
{
"epoch": 0.1687398593834505,
"grad_norm": 0.9985206811473272,
"learning_rate": 1.9999684174506328e-05,
"loss": 1.1383,
"num_tokens": 122857334.0,
"step": 39
},
{
"epoch": 0.17306652244456464,
"grad_norm": 0.9483815912954356,
"learning_rate": 1.999943853500978e-05,
"loss": 1.1224,
"num_tokens": 131055706.0,
"step": 40
},
{
"epoch": 0.17739318550567876,
"grad_norm": 0.8683918297064362,
"learning_rate": 1.9999122716083737e-05,
"loss": 1.1573,
"num_tokens": 139253842.0,
"step": 41
},
{
"epoch": 0.18171984856679285,
"grad_norm": 0.6429259475965016,
"learning_rate": 1.9998736720191024e-05,
"loss": 1.1007,
"num_tokens": 147222819.0,
"step": 42
},
{
"epoch": 0.18604651162790697,
"grad_norm": 0.9456343421520338,
"learning_rate": 1.999828055034171e-05,
"loss": 1.1222,
"num_tokens": 155421830.0,
"step": 43
},
{
"epoch": 0.1903731746890211,
"grad_norm": 0.8535995280206158,
"learning_rate": 1.99977542100931e-05,
"loss": 1.0983,
"num_tokens": 163580725.0,
"step": 44
},
{
"epoch": 0.19469983775013522,
"grad_norm": 0.7453534127868048,
"learning_rate": 1.99971577035497e-05,
"loss": 1.1045,
"num_tokens": 171682606.0,
"step": 45
},
{
"epoch": 0.1990265008112493,
"grad_norm": 0.7952563274763261,
"learning_rate": 1.999649103536319e-05,
"loss": 1.091,
"num_tokens": 179937509.0,
"step": 46
},
{
"epoch": 0.20335316387236343,
"grad_norm": 0.6958121333057308,
"learning_rate": 1.9995754210732382e-05,
"loss": 1.0797,
"num_tokens": 188304288.0,
"step": 47
},
{
"epoch": 0.20767982693347756,
"grad_norm": 0.8009419384487555,
"learning_rate": 1.999494723540318e-05,
"loss": 1.1117,
"num_tokens": 196584086.0,
"step": 48
},
{
"epoch": 0.21200648999459168,
"grad_norm": 1.1770686099470742,
"learning_rate": 1.9994070115668543e-05,
"loss": 1.1127,
"num_tokens": 204821676.0,
"step": 49
},
{
"epoch": 0.21633315305570577,
"grad_norm": 0.5760382647360603,
"learning_rate": 1.9993122858368424e-05,
"loss": 1.0736,
"num_tokens": 213042491.0,
"step": 50
},
{
"epoch": 0.2206598161168199,
"grad_norm": 0.9211760426362613,
"learning_rate": 1.9992105470889727e-05,
"loss": 1.1007,
"num_tokens": 221162073.0,
"step": 51
},
{
"epoch": 0.22498647917793402,
"grad_norm": 0.728065400596324,
"learning_rate": 1.9991017961166245e-05,
"loss": 1.0852,
"num_tokens": 229361548.0,
"step": 52
},
{
"epoch": 0.22931314223904814,
"grad_norm": 0.9499983156478771,
"learning_rate": 1.9989860337678596e-05,
"loss": 1.0711,
"num_tokens": 237483363.0,
"step": 53
},
{
"epoch": 0.23363980530016226,
"grad_norm": 0.6425952198321203,
"learning_rate": 1.998863260945416e-05,
"loss": 1.0803,
"num_tokens": 245616670.0,
"step": 54
},
{
"epoch": 0.23796646836127636,
"grad_norm": 0.6756995511647822,
"learning_rate": 1.998733478606701e-05,
"loss": 1.118,
"num_tokens": 253758578.0,
"step": 55
},
{
"epoch": 0.24229313142239048,
"grad_norm": 0.7048601099063915,
"learning_rate": 1.998596687763783e-05,
"loss": 1.0927,
"num_tokens": 261949951.0,
"step": 56
},
{
"epoch": 0.2466197944835046,
"grad_norm": 0.7411902717404214,
"learning_rate": 1.998452889483385e-05,
"loss": 1.0838,
"num_tokens": 270112359.0,
"step": 57
},
{
"epoch": 0.2509464575446187,
"grad_norm": 0.7583633449162208,
"learning_rate": 1.9983020848868745e-05,
"loss": 1.0751,
"num_tokens": 278249438.0,
"step": 58
},
{
"epoch": 0.2552731206057328,
"grad_norm": 0.5895229075183283,
"learning_rate": 1.9981442751502562e-05,
"loss": 1.1097,
"num_tokens": 286515364.0,
"step": 59
},
{
"epoch": 0.25959978366684694,
"grad_norm": 0.8689168674816765,
"learning_rate": 1.9979794615041623e-05,
"loss": 1.0633,
"num_tokens": 294730511.0,
"step": 60
},
{
"epoch": 0.26392644672796106,
"grad_norm": 0.8437342550894297,
"learning_rate": 1.997807645233842e-05,
"loss": 1.0772,
"num_tokens": 302885915.0,
"step": 61
},
{
"epoch": 0.2682531097890752,
"grad_norm": 0.7886516822119517,
"learning_rate": 1.9976288276791537e-05,
"loss": 1.0848,
"num_tokens": 310939993.0,
"step": 62
},
{
"epoch": 0.2725797728501893,
"grad_norm": 0.89230967002305,
"learning_rate": 1.9974430102345526e-05,
"loss": 1.1142,
"num_tokens": 319044771.0,
"step": 63
},
{
"epoch": 0.27690643591130343,
"grad_norm": 0.8141528756159552,
"learning_rate": 1.9972501943490805e-05,
"loss": 1.116,
"num_tokens": 327163024.0,
"step": 64
},
{
"epoch": 0.28123309897241755,
"grad_norm": 0.8792199270720278,
"learning_rate": 1.9970503815263543e-05,
"loss": 1.0768,
"num_tokens": 335493759.0,
"step": 65
},
{
"epoch": 0.2855597620335316,
"grad_norm": 0.5782878344170364,
"learning_rate": 1.9968435733245542e-05,
"loss": 1.0713,
"num_tokens": 343741024.0,
"step": 66
},
{
"epoch": 0.28988642509464574,
"grad_norm": 1.0783400413776696,
"learning_rate": 1.9966297713564123e-05,
"loss": 1.0777,
"num_tokens": 351897173.0,
"step": 67
},
{
"epoch": 0.29421308815575986,
"grad_norm": 0.5912367198455412,
"learning_rate": 1.9964089772891998e-05,
"loss": 1.0587,
"num_tokens": 360108643.0,
"step": 68
},
{
"epoch": 0.298539751216874,
"grad_norm": 0.9560230781906641,
"learning_rate": 1.9961811928447124e-05,
"loss": 1.0513,
"num_tokens": 368404831.0,
"step": 69
},
{
"epoch": 0.3028664142779881,
"grad_norm": 0.6694334113583972,
"learning_rate": 1.9959464197992592e-05,
"loss": 1.0941,
"num_tokens": 376701186.0,
"step": 70
},
{
"epoch": 0.3071930773391022,
"grad_norm": 0.9953837845607135,
"learning_rate": 1.995704659983648e-05,
"loss": 1.1126,
"num_tokens": 384948998.0,
"step": 71
},
{
"epoch": 0.31151974040021635,
"grad_norm": 0.801064734619278,
"learning_rate": 1.9954559152831705e-05,
"loss": 1.0698,
"num_tokens": 393114257.0,
"step": 72
},
{
"epoch": 0.31584640346133047,
"grad_norm": 0.7206220961948427,
"learning_rate": 1.995200187637587e-05,
"loss": 1.043,
"num_tokens": 401208167.0,
"step": 73
},
{
"epoch": 0.32017306652244454,
"grad_norm": 0.6357689365088348,
"learning_rate": 1.9949374790411134e-05,
"loss": 1.0434,
"num_tokens": 409465133.0,
"step": 74
},
{
"epoch": 0.32449972958355866,
"grad_norm": 0.8975335464913397,
"learning_rate": 1.9946677915424045e-05,
"loss": 1.0802,
"num_tokens": 417678210.0,
"step": 75
},
{
"epoch": 0.3288263926446728,
"grad_norm": 0.6684128266308609,
"learning_rate": 1.994391127244537e-05,
"loss": 1.0638,
"num_tokens": 425836991.0,
"step": 76
},
{
"epoch": 0.3331530557057869,
"grad_norm": 0.8335803466238701,
"learning_rate": 1.994107488304995e-05,
"loss": 1.0404,
"num_tokens": 434061883.0,
"step": 77
},
{
"epoch": 0.337479718766901,
"grad_norm": 0.701262475559687,
"learning_rate": 1.993816876935652e-05,
"loss": 1.072,
"num_tokens": 442148727.0,
"step": 78
},
{
"epoch": 0.34180638182801515,
"grad_norm": 5.389426853375867,
"learning_rate": 1.9935192954027537e-05,
"loss": 1.0877,
"num_tokens": 450223300.0,
"step": 79
},
{
"epoch": 0.34613304488912927,
"grad_norm": 1.2247442284969425,
"learning_rate": 1.9932147460269007e-05,
"loss": 1.0742,
"num_tokens": 458483353.0,
"step": 80
},
{
"epoch": 0.3504597079502434,
"grad_norm": 0.6785079644135494,
"learning_rate": 1.9929032311830303e-05,
"loss": 1.0609,
"num_tokens": 466519955.0,
"step": 81
},
{
"epoch": 0.3547863710113575,
"grad_norm": 0.9818785578102666,
"learning_rate": 1.9925847533003976e-05,
"loss": 1.0626,
"num_tokens": 474749256.0,
"step": 82
},
{
"epoch": 0.3591130340724716,
"grad_norm": 0.9226344028092789,
"learning_rate": 1.9922593148625573e-05,
"loss": 1.0946,
"num_tokens": 483036885.0,
"step": 83
},
{
"epoch": 0.3634396971335857,
"grad_norm": 0.6996572072248013,
"learning_rate": 1.9919269184073435e-05,
"loss": 1.0366,
"num_tokens": 491030270.0,
"step": 84
},
{
"epoch": 0.3677663601946998,
"grad_norm": 0.817594387577693,
"learning_rate": 1.9915875665268508e-05,
"loss": 1.0879,
"num_tokens": 499011152.0,
"step": 85
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.7201730564502099,
"learning_rate": 1.9912412618674134e-05,
"loss": 1.0486,
"num_tokens": 507139514.0,
"step": 86
},
{
"epoch": 0.37641968631692807,
"grad_norm": 0.7992318950396967,
"learning_rate": 1.9908880071295844e-05,
"loss": 1.0639,
"num_tokens": 515327001.0,
"step": 87
},
{
"epoch": 0.3807463493780422,
"grad_norm": 0.6151247030430017,
"learning_rate": 1.990527805068115e-05,
"loss": 1.0425,
"num_tokens": 523524018.0,
"step": 88
},
{
"epoch": 0.3850730124391563,
"grad_norm": 0.6686898707632741,
"learning_rate": 1.9901606584919336e-05,
"loss": 1.0562,
"num_tokens": 531727355.0,
"step": 89
},
{
"epoch": 0.38939967550027044,
"grad_norm": 0.6423157646116131,
"learning_rate": 1.989786570264123e-05,
"loss": 1.0581,
"num_tokens": 539809499.0,
"step": 90
},
{
"epoch": 0.39372633856138456,
"grad_norm": 0.7387999657802858,
"learning_rate": 1.9894055433018977e-05,
"loss": 1.0578,
"num_tokens": 548048650.0,
"step": 91
},
{
"epoch": 0.3980530016224986,
"grad_norm": 0.8411899425159689,
"learning_rate": 1.9890175805765834e-05,
"loss": 1.0808,
"num_tokens": 556181435.0,
"step": 92
},
{
"epoch": 0.40237966468361275,
"grad_norm": 0.775911076614371,
"learning_rate": 1.9886226851135904e-05,
"loss": 1.0798,
"num_tokens": 564455785.0,
"step": 93
},
{
"epoch": 0.40670632774472687,
"grad_norm": 0.700418884054443,
"learning_rate": 1.988220859992394e-05,
"loss": 1.0864,
"num_tokens": 572734893.0,
"step": 94
},
{
"epoch": 0.411032990805841,
"grad_norm": 0.6926462904609825,
"learning_rate": 1.987812108346506e-05,
"loss": 1.0779,
"num_tokens": 580907237.0,
"step": 95
},
{
"epoch": 0.4153596538669551,
"grad_norm": 0.5780746275245598,
"learning_rate": 1.9873964333634546e-05,
"loss": 1.0606,
"num_tokens": 588950468.0,
"step": 96
},
{
"epoch": 0.41968631692806924,
"grad_norm": 0.8269363889294378,
"learning_rate": 1.9869738382847567e-05,
"loss": 1.0515,
"num_tokens": 597058764.0,
"step": 97
},
{
"epoch": 0.42401297998918336,
"grad_norm": 0.7858627732714016,
"learning_rate": 1.9865443264058936e-05,
"loss": 1.0289,
"num_tokens": 605330511.0,
"step": 98
},
{
"epoch": 0.4283396430502975,
"grad_norm": 0.5098978815659675,
"learning_rate": 1.9861079010762852e-05,
"loss": 1.0614,
"num_tokens": 613468042.0,
"step": 99
},
{
"epoch": 0.43266630611141155,
"grad_norm": 0.9837835335786151,
"learning_rate": 1.9856645656992637e-05,
"loss": 1.0528,
"num_tokens": 621617034.0,
"step": 100
},
{
"epoch": 0.43699296917252567,
"grad_norm": 0.6960932513964322,
"learning_rate": 1.9852143237320475e-05,
"loss": 1.0773,
"num_tokens": 629963486.0,
"step": 101
},
{
"epoch": 0.4413196322336398,
"grad_norm": 0.8282041321720025,
"learning_rate": 1.9847571786857142e-05,
"loss": 1.055,
"num_tokens": 638074685.0,
"step": 102
},
{
"epoch": 0.4456462952947539,
"grad_norm": 0.8123048467055004,
"learning_rate": 1.984293134125172e-05,
"loss": 1.0691,
"num_tokens": 646360851.0,
"step": 103
},
{
"epoch": 0.44997295835586804,
"grad_norm": 0.6160311721079573,
"learning_rate": 1.9838221936691347e-05,
"loss": 1.0863,
"num_tokens": 654661863.0,
"step": 104
},
{
"epoch": 0.45429962141698216,
"grad_norm": 0.8232116092132997,
"learning_rate": 1.9833443609900896e-05,
"loss": 1.0177,
"num_tokens": 662981026.0,
"step": 105
},
{
"epoch": 0.4586262844780963,
"grad_norm": 0.543372168006543,
"learning_rate": 1.9828596398142725e-05,
"loss": 1.044,
"num_tokens": 671216096.0,
"step": 106
},
{
"epoch": 0.4629529475392104,
"grad_norm": 0.9423564400007317,
"learning_rate": 1.9823680339216363e-05,
"loss": 1.0448,
"num_tokens": 679484445.0,
"step": 107
},
{
"epoch": 0.4672796106003245,
"grad_norm": 0.6916636602583178,
"learning_rate": 1.9818695471458224e-05,
"loss": 1.0487,
"num_tokens": 687812578.0,
"step": 108
},
{
"epoch": 0.4716062736614386,
"grad_norm": 1.0709477226072333,
"learning_rate": 1.9813641833741308e-05,
"loss": 1.0705,
"num_tokens": 696049253.0,
"step": 109
},
{
"epoch": 0.4759329367225527,
"grad_norm": 0.7763980799420581,
"learning_rate": 1.9808519465474898e-05,
"loss": 1.0923,
"num_tokens": 704261185.0,
"step": 110
},
{
"epoch": 0.48025959978366684,
"grad_norm": 1.05928288241113,
"learning_rate": 1.9803328406604252e-05,
"loss": 1.0701,
"num_tokens": 712339843.0,
"step": 111
},
{
"epoch": 0.48458626284478096,
"grad_norm": 0.8164014894407516,
"learning_rate": 1.979806869761029e-05,
"loss": 1.0841,
"num_tokens": 720578736.0,
"step": 112
},
{
"epoch": 0.4889129259058951,
"grad_norm": 0.7814622580999907,
"learning_rate": 1.9792740379509274e-05,
"loss": 1.038,
"num_tokens": 728873337.0,
"step": 113
},
{
"epoch": 0.4932395889670092,
"grad_norm": 0.9950430098048184,
"learning_rate": 1.9787343493852508e-05,
"loss": 1.0695,
"num_tokens": 737002170.0,
"step": 114
},
{
"epoch": 0.4975662520281233,
"grad_norm": 0.7337952839442135,
"learning_rate": 1.9781878082725982e-05,
"loss": 1.0662,
"num_tokens": 745278394.0,
"step": 115
},
{
"epoch": 0.5018929150892374,
"grad_norm": 0.811331088998138,
"learning_rate": 1.977634418875007e-05,
"loss": 1.0453,
"num_tokens": 753501022.0,
"step": 116
},
{
"epoch": 0.5062195781503516,
"grad_norm": 2.392844149917475,
"learning_rate": 1.9770741855079197e-05,
"loss": 1.0585,
"num_tokens": 761674657.0,
"step": 117
},
{
"epoch": 0.5105462412114656,
"grad_norm": 0.9750359184413522,
"learning_rate": 1.976507112540148e-05,
"loss": 1.0204,
"num_tokens": 769925505.0,
"step": 118
},
{
"epoch": 0.5148729042725798,
"grad_norm": 0.5808964572584242,
"learning_rate": 1.9759332043938408e-05,
"loss": 1.0629,
"num_tokens": 778048637.0,
"step": 119
},
{
"epoch": 0.5191995673336939,
"grad_norm": 2.0190518414090257,
"learning_rate": 1.9753524655444495e-05,
"loss": 1.055,
"num_tokens": 786210971.0,
"step": 120
},
{
"epoch": 0.5235262303948081,
"grad_norm": 1.300368784946342,
"learning_rate": 1.974764900520692e-05,
"loss": 1.0442,
"num_tokens": 794450890.0,
"step": 121
},
{
"epoch": 0.5278528934559221,
"grad_norm": 0.7478247329402357,
"learning_rate": 1.9741705139045183e-05,
"loss": 1.0273,
"num_tokens": 802668849.0,
"step": 122
},
{
"epoch": 0.5321795565170362,
"grad_norm": 0.7925626643246553,
"learning_rate": 1.9735693103310747e-05,
"loss": 1.0436,
"num_tokens": 810826072.0,
"step": 123
},
{
"epoch": 0.5365062195781504,
"grad_norm": 0.8408791086220021,
"learning_rate": 1.9729612944886677e-05,
"loss": 1.0386,
"num_tokens": 819012511.0,
"step": 124
},
{
"epoch": 0.5408328826392644,
"grad_norm": 0.705243046024607,
"learning_rate": 1.9723464711187267e-05,
"loss": 1.0584,
"num_tokens": 827153219.0,
"step": 125
},
{
"epoch": 0.5451595457003786,
"grad_norm": 0.6815712426490166,
"learning_rate": 1.9717248450157682e-05,
"loss": 1.0573,
"num_tokens": 835379532.0,
"step": 126
},
{
"epoch": 0.5494862087614927,
"grad_norm": 0.73642201821198,
"learning_rate": 1.9710964210273567e-05,
"loss": 1.043,
"num_tokens": 843463437.0,
"step": 127
},
{
"epoch": 0.5538128718226069,
"grad_norm": 0.7038070626897667,
"learning_rate": 1.9704612040540698e-05,
"loss": 1.0404,
"num_tokens": 851604009.0,
"step": 128
},
{
"epoch": 0.5581395348837209,
"grad_norm": 0.6972978549881204,
"learning_rate": 1.969819199049456e-05,
"loss": 1.0488,
"num_tokens": 859466113.0,
"step": 129
},
{
"epoch": 0.5624661979448351,
"grad_norm": 0.7164998231426472,
"learning_rate": 1.9691704110199997e-05,
"loss": 1.0768,
"num_tokens": 867843649.0,
"step": 130
},
{
"epoch": 0.5667928610059492,
"grad_norm": 0.6453947425095201,
"learning_rate": 1.9685148450250802e-05,
"loss": 1.0588,
"num_tokens": 876181521.0,
"step": 131
},
{
"epoch": 0.5711195240670632,
"grad_norm": 0.6809744904176902,
"learning_rate": 1.9678525061769332e-05,
"loss": 1.0546,
"num_tokens": 884409671.0,
"step": 132
},
{
"epoch": 0.5754461871281774,
"grad_norm": 0.6874574416882646,
"learning_rate": 1.96718339964061e-05,
"loss": 1.0038,
"num_tokens": 892626806.0,
"step": 133
},
{
"epoch": 0.5797728501892915,
"grad_norm": 0.7460633332766357,
"learning_rate": 1.9665075306339373e-05,
"loss": 1.0521,
"num_tokens": 900828840.0,
"step": 134
},
{
"epoch": 0.5840995132504057,
"grad_norm": 0.725769240642372,
"learning_rate": 1.9658249044274773e-05,
"loss": 1.0552,
"num_tokens": 908969468.0,
"step": 135
},
{
"epoch": 0.5884261763115197,
"grad_norm": 1.3895008041252126,
"learning_rate": 1.965135526344487e-05,
"loss": 1.0346,
"num_tokens": 917020568.0,
"step": 136
},
{
"epoch": 0.5927528393726339,
"grad_norm": 0.7981436115786992,
"learning_rate": 1.964439401760875e-05,
"loss": 1.0473,
"num_tokens": 925187380.0,
"step": 137
},
{
"epoch": 0.597079502433748,
"grad_norm": 1.3753330128722794,
"learning_rate": 1.9637365361051602e-05,
"loss": 1.0606,
"num_tokens": 933387237.0,
"step": 138
},
{
"epoch": 0.601406165494862,
"grad_norm": 0.8330566649762117,
"learning_rate": 1.9630269348584303e-05,
"loss": 1.0731,
"num_tokens": 941428016.0,
"step": 139
},
{
"epoch": 0.6057328285559762,
"grad_norm": 1.436449192271606,
"learning_rate": 1.9623106035542988e-05,
"loss": 1.0497,
"num_tokens": 949470688.0,
"step": 140
},
{
"epoch": 0.6100594916170903,
"grad_norm": 1.246511400682441,
"learning_rate": 1.9615875477788607e-05,
"loss": 1.039,
"num_tokens": 957603328.0,
"step": 141
},
{
"epoch": 0.6143861546782045,
"grad_norm": 1.009120574140708,
"learning_rate": 1.9608577731706502e-05,
"loss": 1.0123,
"num_tokens": 965709162.0,
"step": 142
},
{
"epoch": 0.6187128177393185,
"grad_norm": 0.9947417068391968,
"learning_rate": 1.9601212854205965e-05,
"loss": 1.0227,
"num_tokens": 973985569.0,
"step": 143
},
{
"epoch": 0.6230394808004327,
"grad_norm": 0.6903270192877234,
"learning_rate": 1.959378090271979e-05,
"loss": 1.0408,
"num_tokens": 982165724.0,
"step": 144
},
{
"epoch": 0.6273661438615468,
"grad_norm": 1.051406929345143,
"learning_rate": 1.9586281935203823e-05,
"loss": 1.0328,
"num_tokens": 990346736.0,
"step": 145
},
{
"epoch": 0.6316928069226609,
"grad_norm": 0.70540346117896,
"learning_rate": 1.9578716010136524e-05,
"loss": 1.0354,
"num_tokens": 998496426.0,
"step": 146
},
{
"epoch": 0.636019469983775,
"grad_norm": 0.9469873445550671,
"learning_rate": 1.9571083186518495e-05,
"loss": 1.06,
"num_tokens": 1006665416.0,
"step": 147
},
{
"epoch": 0.6403461330448891,
"grad_norm": 0.8052442525964558,
"learning_rate": 1.956338352387203e-05,
"loss": 1.0473,
"num_tokens": 1014768764.0,
"step": 148
},
{
"epoch": 0.6446727961060033,
"grad_norm": 0.6789355824383974,
"learning_rate": 1.955561708224064e-05,
"loss": 1.0253,
"num_tokens": 1023020143.0,
"step": 149
},
{
"epoch": 0.6489994591671173,
"grad_norm": 0.8715361890721862,
"learning_rate": 1.9547783922188605e-05,
"loss": 1.0315,
"num_tokens": 1031209660.0,
"step": 150
},
{
"epoch": 0.6533261222282315,
"grad_norm": 0.645674014648836,
"learning_rate": 1.953988410480047e-05,
"loss": 1.0419,
"num_tokens": 1039525232.0,
"step": 151
},
{
"epoch": 0.6576527852893456,
"grad_norm": 5.190388510459407,
"learning_rate": 1.9531917691680605e-05,
"loss": 1.0205,
"num_tokens": 1047879166.0,
"step": 152
},
{
"epoch": 0.6619794483504597,
"grad_norm": 1.423350204561511,
"learning_rate": 1.95238847449527e-05,
"loss": 1.0683,
"num_tokens": 1056096106.0,
"step": 153
},
{
"epoch": 0.6663061114115738,
"grad_norm": 0.7873289191766745,
"learning_rate": 1.9515785327259283e-05,
"loss": 1.0276,
"num_tokens": 1064133140.0,
"step": 154
},
{
"epoch": 0.670632774472688,
"grad_norm": 1.2079248544903594,
"learning_rate": 1.950761950176125e-05,
"loss": 1.0292,
"num_tokens": 1072439522.0,
"step": 155
},
{
"epoch": 0.674959437533802,
"grad_norm": 1.2768662541750604,
"learning_rate": 1.949938733213733e-05,
"loss": 1.0655,
"num_tokens": 1080614495.0,
"step": 156
},
{
"epoch": 0.6792861005949161,
"grad_norm": 1.0319413112299183,
"learning_rate": 1.9491088882583653e-05,
"loss": 1.0346,
"num_tokens": 1088737339.0,
"step": 157
},
{
"epoch": 0.6836127636560303,
"grad_norm": 0.9551004654341626,
"learning_rate": 1.948272421781319e-05,
"loss": 1.0464,
"num_tokens": 1096942894.0,
"step": 158
},
{
"epoch": 0.6879394267171444,
"grad_norm": 1.0191544176749112,
"learning_rate": 1.9474293403055273e-05,
"loss": 1.0614,
"num_tokens": 1105139333.0,
"step": 159
},
{
"epoch": 0.6922660897782585,
"grad_norm": 0.9291601915995058,
"learning_rate": 1.9465796504055095e-05,
"loss": 1.0583,
"num_tokens": 1113209692.0,
"step": 160
},
{
"epoch": 0.6965927528393726,
"grad_norm": 0.7716222635405472,
"learning_rate": 1.9457233587073177e-05,
"loss": 1.0223,
"num_tokens": 1121410540.0,
"step": 161
},
{
"epoch": 0.7009194159004868,
"grad_norm": 5.407882430974452,
"learning_rate": 1.9448604718884868e-05,
"loss": 1.0364,
"num_tokens": 1129595028.0,
"step": 162
},
{
"epoch": 0.7052460789616009,
"grad_norm": 1.6307777217274049,
"learning_rate": 1.9439909966779816e-05,
"loss": 1.0252,
"num_tokens": 1137932356.0,
"step": 163
},
{
"epoch": 0.709572742022715,
"grad_norm": 0.7568287231544429,
"learning_rate": 1.943114939856144e-05,
"loss": 1.0071,
"num_tokens": 1146191607.0,
"step": 164
},
{
"epoch": 0.7138994050838291,
"grad_norm": 1.2144114452361634,
"learning_rate": 1.942232308254642e-05,
"loss": 1.0426,
"num_tokens": 1154479388.0,
"step": 165
},
{
"epoch": 0.7182260681449432,
"grad_norm": 1.0485800704463941,
"learning_rate": 1.941343108756413e-05,
"loss": 1.023,
"num_tokens": 1162787341.0,
"step": 166
},
{
"epoch": 0.7225527312060573,
"grad_norm": 1.1656800361671036,
"learning_rate": 1.9404473482956143e-05,
"loss": 1.0349,
"num_tokens": 1171083191.0,
"step": 167
},
{
"epoch": 0.7268793942671714,
"grad_norm": 0.7559925278495352,
"learning_rate": 1.9395450338575655e-05,
"loss": 1.0143,
"num_tokens": 1179175970.0,
"step": 168
},
{
"epoch": 0.7312060573282856,
"grad_norm": 1.0467326732986577,
"learning_rate": 1.938636172478695e-05,
"loss": 1.016,
"num_tokens": 1187347926.0,
"step": 169
},
{
"epoch": 0.7355327203893997,
"grad_norm": 0.8195812665726188,
"learning_rate": 1.937720771246488e-05,
"loss": 1.0222,
"num_tokens": 1195697998.0,
"step": 170
},
{
"epoch": 0.7398593834505138,
"grad_norm": 0.8533808464409776,
"learning_rate": 1.9367988372994264e-05,
"loss": 1.0552,
"num_tokens": 1203836889.0,
"step": 171
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.6988527076226115,
"learning_rate": 1.9358703778269362e-05,
"loss": 1.0207,
"num_tokens": 1212045311.0,
"step": 172
},
{
"epoch": 0.7485127095727421,
"grad_norm": 0.7157144584664211,
"learning_rate": 1.934935400069331e-05,
"loss": 1.0419,
"num_tokens": 1220252456.0,
"step": 173
},
{
"epoch": 0.7528393726338561,
"grad_norm": 0.6380442216825093,
"learning_rate": 1.933993911317755e-05,
"loss": 1.0043,
"num_tokens": 1228367124.0,
"step": 174
},
{
"epoch": 0.7571660356949702,
"grad_norm": 0.5945370228493201,
"learning_rate": 1.933045918914127e-05,
"loss": 1.0519,
"num_tokens": 1236533919.0,
"step": 175
},
{
"epoch": 0.7614926987560844,
"grad_norm": 0.8284989924164381,
"learning_rate": 1.932091430251082e-05,
"loss": 1.0669,
"num_tokens": 1244845033.0,
"step": 176
},
{
"epoch": 0.7658193618171985,
"grad_norm": 4.649372515055804,
"learning_rate": 1.931130452771914e-05,
"loss": 1.0512,
"num_tokens": 1252967541.0,
"step": 177
},
{
"epoch": 0.7701460248783126,
"grad_norm": 0.9360299522999799,
"learning_rate": 1.930162993970519e-05,
"loss": 1.022,
"num_tokens": 1261225085.0,
"step": 178
},
{
"epoch": 0.7744726879394267,
"grad_norm": 0.5167446323720225,
"learning_rate": 1.9291890613913353e-05,
"loss": 1.0204,
"num_tokens": 1269356299.0,
"step": 179
},
{
"epoch": 0.7787993510005409,
"grad_norm": 0.6071868007581671,
"learning_rate": 1.9282086626292835e-05,
"loss": 1.0114,
"num_tokens": 1277586117.0,
"step": 180
},
{
"epoch": 0.7831260140616549,
"grad_norm": 0.6571171117401933,
"learning_rate": 1.9272218053297113e-05,
"loss": 1.0318,
"num_tokens": 1285775878.0,
"step": 181
},
{
"epoch": 0.7874526771227691,
"grad_norm": 0.5942174702410566,
"learning_rate": 1.9262284971883293e-05,
"loss": 1.0147,
"num_tokens": 1293811971.0,
"step": 182
},
{
"epoch": 0.7917793401838832,
"grad_norm": 0.5624732411675234,
"learning_rate": 1.925228745951155e-05,
"loss": 1.0539,
"num_tokens": 1302092313.0,
"step": 183
},
{
"epoch": 0.7961060032449973,
"grad_norm": 0.8397395351542903,
"learning_rate": 1.9242225594144487e-05,
"loss": 1.0255,
"num_tokens": 1310198939.0,
"step": 184
},
{
"epoch": 0.8004326663061114,
"grad_norm": 0.5175398862515561,
"learning_rate": 1.9232099454246547e-05,
"loss": 1.0268,
"num_tokens": 1318331118.0,
"step": 185
},
{
"epoch": 0.8047593293672255,
"grad_norm": 1.0546926210587257,
"learning_rate": 1.922190911878341e-05,
"loss": 1.0241,
"num_tokens": 1326291524.0,
"step": 186
},
{
"epoch": 0.8090859924283397,
"grad_norm": 0.7638236700049821,
"learning_rate": 1.9211654667221356e-05,
"loss": 1.0222,
"num_tokens": 1334457499.0,
"step": 187
},
{
"epoch": 0.8134126554894537,
"grad_norm": 0.8595408443222351,
"learning_rate": 1.9201336179526662e-05,
"loss": 1.0428,
"num_tokens": 1342599951.0,
"step": 188
},
{
"epoch": 0.8177393185505679,
"grad_norm": 0.7876334403831634,
"learning_rate": 1.9190953736164962e-05,
"loss": 1.0451,
"num_tokens": 1350886069.0,
"step": 189
},
{
"epoch": 0.822065981611682,
"grad_norm": 0.6045137327621173,
"learning_rate": 1.918050741810064e-05,
"loss": 1.007,
"num_tokens": 1359135384.0,
"step": 190
},
{
"epoch": 0.826392644672796,
"grad_norm": 0.7005328713623081,
"learning_rate": 1.916999730679618e-05,
"loss": 1.0071,
"num_tokens": 1367261648.0,
"step": 191
},
{
"epoch": 0.8307193077339102,
"grad_norm": 0.5292196634413274,
"learning_rate": 1.9159423484211542e-05,
"loss": 1.0382,
"num_tokens": 1375344605.0,
"step": 192
},
{
"epoch": 0.8350459707950243,
"grad_norm": 0.7266484905494246,
"learning_rate": 1.9148786032803516e-05,
"loss": 1.0424,
"num_tokens": 1383221004.0,
"step": 193
},
{
"epoch": 0.8393726338561385,
"grad_norm": 0.6760097186229455,
"learning_rate": 1.9138085035525088e-05,
"loss": 1.0329,
"num_tokens": 1391421045.0,
"step": 194
},
{
"epoch": 0.8436992969172525,
"grad_norm": 0.5857244218105304,
"learning_rate": 1.912732057582479e-05,
"loss": 1.0063,
"num_tokens": 1399440266.0,
"step": 195
},
{
"epoch": 0.8480259599783667,
"grad_norm": 0.8290327240335202,
"learning_rate": 1.9116492737646025e-05,
"loss": 1.0044,
"num_tokens": 1407440904.0,
"step": 196
},
{
"epoch": 0.8523526230394808,
"grad_norm": 0.5055803198102621,
"learning_rate": 1.9105601605426464e-05,
"loss": 1.0017,
"num_tokens": 1415547487.0,
"step": 197
},
{
"epoch": 0.856679286100595,
"grad_norm": 0.842860045337866,
"learning_rate": 1.909464726409734e-05,
"loss": 1.012,
"num_tokens": 1423784486.0,
"step": 198
},
{
"epoch": 0.861005949161709,
"grad_norm": 0.654168133223008,
"learning_rate": 1.9083629799082806e-05,
"loss": 1.0282,
"num_tokens": 1432002900.0,
"step": 199
},
{
"epoch": 0.8653326122228231,
"grad_norm": 0.6784775913175939,
"learning_rate": 1.9072549296299272e-05,
"loss": 1.0185,
"num_tokens": 1440057714.0,
"step": 200
},
{
"epoch": 0.8696592752839373,
"grad_norm": 0.7106471735455494,
"learning_rate": 1.9061405842154716e-05,
"loss": 1.0492,
"num_tokens": 1448138846.0,
"step": 201
},
{
"epoch": 0.8739859383450513,
"grad_norm": 0.6800123277515188,
"learning_rate": 1.9050199523548042e-05,
"loss": 1.0141,
"num_tokens": 1456485502.0,
"step": 202
},
{
"epoch": 0.8783126014061655,
"grad_norm": 0.617362481570814,
"learning_rate": 1.9038930427868367e-05,
"loss": 1.0682,
"num_tokens": 1464589584.0,
"step": 203
},
{
"epoch": 0.8826392644672796,
"grad_norm": 0.7859831645553701,
"learning_rate": 1.9027598642994357e-05,
"loss": 1.0467,
"num_tokens": 1472644084.0,
"step": 204
},
{
"epoch": 0.8869659275283938,
"grad_norm": 1.247567866410136,
"learning_rate": 1.901620425729356e-05,
"loss": 1.0194,
"num_tokens": 1480866465.0,
"step": 205
},
{
"epoch": 0.8912925905895078,
"grad_norm": 0.7038700676463598,
"learning_rate": 1.900474735962168e-05,
"loss": 1.0148,
"num_tokens": 1489135207.0,
"step": 206
},
{
"epoch": 0.895619253650622,
"grad_norm": 1.0886191351908234,
"learning_rate": 1.89932280393219e-05,
"loss": 1.0368,
"num_tokens": 1497458419.0,
"step": 207
},
{
"epoch": 0.8999459167117361,
"grad_norm": 0.7162950117168408,
"learning_rate": 1.8981646386224205e-05,
"loss": 1.014,
"num_tokens": 1505698225.0,
"step": 208
},
{
"epoch": 0.9042725797728501,
"grad_norm": 1.343065307722291,
"learning_rate": 1.8970002490644643e-05,
"loss": 1.0351,
"num_tokens": 1513767920.0,
"step": 209
},
{
"epoch": 0.9085992428339643,
"grad_norm": 1.0074287474494303,
"learning_rate": 1.8958296443384655e-05,
"loss": 1.0342,
"num_tokens": 1521949857.0,
"step": 210
},
{
"epoch": 0.9129259058950784,
"grad_norm": 1.1963801705182988,
"learning_rate": 1.8946528335730344e-05,
"loss": 1.0122,
"num_tokens": 1530125689.0,
"step": 211
},
{
"epoch": 0.9172525689561926,
"grad_norm": 0.9762540750082646,
"learning_rate": 1.8934698259451784e-05,
"loss": 0.9763,
"num_tokens": 1538076331.0,
"step": 212
},
{
"epoch": 0.9215792320173066,
"grad_norm": 1.2050370011602503,
"learning_rate": 1.8922806306802283e-05,
"loss": 1.0114,
"num_tokens": 1546085841.0,
"step": 213
},
{
"epoch": 0.9259058950784208,
"grad_norm": 1.019244393618009,
"learning_rate": 1.891085257051768e-05,
"loss": 1.0102,
"num_tokens": 1554250179.0,
"step": 214
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.2220847331235862,
"learning_rate": 1.8898837143815604e-05,
"loss": 1.0606,
"num_tokens": 1562575332.0,
"step": 215
},
{
"epoch": 0.934559221200649,
"grad_norm": 1.0804822558122213,
"learning_rate": 1.8886760120394774e-05,
"loss": 1.0086,
"num_tokens": 1570784477.0,
"step": 216
},
{
"epoch": 0.9388858842617631,
"grad_norm": 1.0198301768654703,
"learning_rate": 1.8874621594434242e-05,
"loss": 1.0374,
"num_tokens": 1578962689.0,
"step": 217
},
{
"epoch": 0.9432125473228772,
"grad_norm": 0.9362158917261221,
"learning_rate": 1.8862421660592673e-05,
"loss": 1.0284,
"num_tokens": 1587242415.0,
"step": 218
},
{
"epoch": 0.9475392103839914,
"grad_norm": 0.8668622386683569,
"learning_rate": 1.8850160414007595e-05,
"loss": 1.0378,
"num_tokens": 1595320714.0,
"step": 219
},
{
"epoch": 0.9518658734451054,
"grad_norm": 0.822217400307939,
"learning_rate": 1.883783795029468e-05,
"loss": 1.0299,
"num_tokens": 1603469822.0,
"step": 220
},
{
"epoch": 0.9561925365062196,
"grad_norm": 0.753849423474381,
"learning_rate": 1.8825454365546974e-05,
"loss": 1.0207,
"num_tokens": 1611630522.0,
"step": 221
},
{
"epoch": 0.9605191995673337,
"grad_norm": 0.8186514232729086,
"learning_rate": 1.8813009756334156e-05,
"loss": 1.0247,
"num_tokens": 1619824501.0,
"step": 222
},
{
"epoch": 0.9648458626284478,
"grad_norm": 0.6595218775089154,
"learning_rate": 1.8800504219701788e-05,
"loss": 0.9922,
"num_tokens": 1628163466.0,
"step": 223
},
{
"epoch": 0.9691725256895619,
"grad_norm": 0.7728060286552597,
"learning_rate": 1.8787937853170563e-05,
"loss": 1.0026,
"num_tokens": 1636389426.0,
"step": 224
},
{
"epoch": 0.9734991887506761,
"grad_norm": 0.595600898984859,
"learning_rate": 1.8775310754735518e-05,
"loss": 1.0253,
"num_tokens": 1644683544.0,
"step": 225
},
{
"epoch": 0.9778258518117902,
"grad_norm": 0.7038506695713876,
"learning_rate": 1.8762623022865317e-05,
"loss": 1.0104,
"num_tokens": 1652751148.0,
"step": 226
},
{
"epoch": 0.9821525148729042,
"grad_norm": 0.5773805207875454,
"learning_rate": 1.874987475650144e-05,
"loss": 1.008,
"num_tokens": 1660831749.0,
"step": 227
},
{
"epoch": 0.9864791779340184,
"grad_norm": 0.7144988810186003,
"learning_rate": 1.873706605505742e-05,
"loss": 1.0089,
"num_tokens": 1669068203.0,
"step": 228
},
{
"epoch": 0.9908058409951325,
"grad_norm": 0.47273218493032554,
"learning_rate": 1.8724197018418092e-05,
"loss": 1.0212,
"num_tokens": 1677321958.0,
"step": 229
},
{
"epoch": 0.9951325040562466,
"grad_norm": 0.8706008306058246,
"learning_rate": 1.8711267746938787e-05,
"loss": 1.0347,
"num_tokens": 1685530964.0,
"step": 230
},
{
"epoch": 0.9994591671173607,
"grad_norm": 0.5393694224243536,
"learning_rate": 1.869827834144456e-05,
"loss": 1.0167,
"num_tokens": 1693877164.0,
"step": 231
},
{
"epoch": 1.0,
"grad_norm": 1.0116059536968622,
"learning_rate": 1.8685228903229408e-05,
"loss": 0.9419,
"num_tokens": 1694884683.0,
"step": 232
},
{
"epoch": 1.004326663061114,
"grad_norm": 0.8523845557819635,
"learning_rate": 1.8672119534055465e-05,
"loss": 1.0281,
"num_tokens": 1703177073.0,
"step": 233
},
{
"epoch": 1.0086533261222281,
"grad_norm": 0.7971322678050575,
"learning_rate": 1.8658950336152227e-05,
"loss": 1.0409,
"num_tokens": 1711430466.0,
"step": 234
},
{
"epoch": 1.0129799891833424,
"grad_norm": 0.7054801324762776,
"learning_rate": 1.864572141221575e-05,
"loss": 1.0053,
"num_tokens": 1719439190.0,
"step": 235
},
{
"epoch": 1.0173066522444565,
"grad_norm": 0.6283763715716751,
"learning_rate": 1.8632432865407835e-05,
"loss": 0.9948,
"num_tokens": 1727609354.0,
"step": 236
},
{
"epoch": 1.0216333153055706,
"grad_norm": 0.8687854722145413,
"learning_rate": 1.861908479935524e-05,
"loss": 1.019,
"num_tokens": 1735661093.0,
"step": 237
},
{
"epoch": 1.0259599783666846,
"grad_norm": 0.5500318403146999,
"learning_rate": 1.8605677318148872e-05,
"loss": 1.0003,
"num_tokens": 1743819625.0,
"step": 238
},
{
"epoch": 1.030286641427799,
"grad_norm": 0.9311777237720954,
"learning_rate": 1.859221052634295e-05,
"loss": 1.0116,
"num_tokens": 1751877899.0,
"step": 239
},
{
"epoch": 1.034613304488913,
"grad_norm": 0.7175578463961261,
"learning_rate": 1.8578684528954232e-05,
"loss": 1.0081,
"num_tokens": 1760146337.0,
"step": 240
},
{
"epoch": 1.038939967550027,
"grad_norm": 3.3107808035192896,
"learning_rate": 1.8565099431461158e-05,
"loss": 1.0055,
"num_tokens": 1768338684.0,
"step": 241
},
{
"epoch": 1.043266630611141,
"grad_norm": 1.5202129013625794,
"learning_rate": 1.8551455339803053e-05,
"loss": 1.0269,
"num_tokens": 1776491986.0,
"step": 242
},
{
"epoch": 1.0475932936722552,
"grad_norm": 3.0266554495559577,
"learning_rate": 1.8537752360379277e-05,
"loss": 1.0205,
"num_tokens": 1784795461.0,
"step": 243
},
{
"epoch": 1.0519199567333695,
"grad_norm": 1.0080337963121409,
"learning_rate": 1.852399060004842e-05,
"loss": 1.0164,
"num_tokens": 1792906059.0,
"step": 244
},
{
"epoch": 1.0562466197944835,
"grad_norm": 0.6874961714524145,
"learning_rate": 1.8510170166127453e-05,
"loss": 0.9905,
"num_tokens": 1801216289.0,
"step": 245
},
{
"epoch": 1.0605732828555976,
"grad_norm": 0.9732598709336382,
"learning_rate": 1.8496291166390898e-05,
"loss": 1.0108,
"num_tokens": 1809486577.0,
"step": 246
},
{
"epoch": 1.0648999459167117,
"grad_norm": 0.5494563319646898,
"learning_rate": 1.848235370906998e-05,
"loss": 0.9772,
"num_tokens": 1817790020.0,
"step": 247
},
{
"epoch": 1.069226608977826,
"grad_norm": 1.006676590805476,
"learning_rate": 1.8468357902851788e-05,
"loss": 1.0049,
"num_tokens": 1825857770.0,
"step": 248
},
{
"epoch": 1.07355327203894,
"grad_norm": 0.8087330733140408,
"learning_rate": 1.845430385687844e-05,
"loss": 0.9978,
"num_tokens": 1834035227.0,
"step": 249
},
{
"epoch": 1.077879935100054,
"grad_norm": 0.9239546334768601,
"learning_rate": 1.84401916807462e-05,
"loss": 1.0173,
"num_tokens": 1842218920.0,
"step": 250
},
{
"epoch": 1.0822065981611682,
"grad_norm": 0.7441844314059644,
"learning_rate": 1.8426021484504655e-05,
"loss": 1.0218,
"num_tokens": 1850389037.0,
"step": 251
},
{
"epoch": 1.0865332612222822,
"grad_norm": 0.8917787308415308,
"learning_rate": 1.8411793378655847e-05,
"loss": 1.0076,
"num_tokens": 1858664333.0,
"step": 252
},
{
"epoch": 1.0908599242833965,
"grad_norm": 0.6628470852468289,
"learning_rate": 1.83975074741534e-05,
"loss": 1.0061,
"num_tokens": 1866687616.0,
"step": 253
},
{
"epoch": 1.0951865873445106,
"grad_norm": 0.9425059819695242,
"learning_rate": 1.8383163882401664e-05,
"loss": 1.0126,
"num_tokens": 1874927392.0,
"step": 254
},
{
"epoch": 1.0995132504056246,
"grad_norm": 0.8014508302359454,
"learning_rate": 1.836876271525485e-05,
"loss": 1.0231,
"num_tokens": 1883000858.0,
"step": 255
},
{
"epoch": 1.1038399134667387,
"grad_norm": 0.7398590141389113,
"learning_rate": 1.8354304085016157e-05,
"loss": 0.9832,
"num_tokens": 1891276371.0,
"step": 256
},
{
"epoch": 1.1081665765278528,
"grad_norm": 0.7402302271445659,
"learning_rate": 1.8339788104436886e-05,
"loss": 1.0052,
"num_tokens": 1899358130.0,
"step": 257
},
{
"epoch": 1.112493239588967,
"grad_norm": 0.658496295512364,
"learning_rate": 1.8325214886715567e-05,
"loss": 1.0036,
"num_tokens": 1907341426.0,
"step": 258
},
{
"epoch": 1.1168199026500811,
"grad_norm": 0.5716390513296978,
"learning_rate": 1.8310584545497075e-05,
"loss": 1.0275,
"num_tokens": 1915436031.0,
"step": 259
},
{
"epoch": 1.1211465657111952,
"grad_norm": 0.7032730355198555,
"learning_rate": 1.829589719487176e-05,
"loss": 0.9917,
"num_tokens": 1923730470.0,
"step": 260
},
{
"epoch": 1.1254732287723093,
"grad_norm": 0.5511876551401437,
"learning_rate": 1.8281152949374527e-05,
"loss": 0.9666,
"num_tokens": 1931795232.0,
"step": 261
},
{
"epoch": 1.1297998918334236,
"grad_norm": 0.7898567891349134,
"learning_rate": 1.8266351923983967e-05,
"loss": 0.9967,
"num_tokens": 1939773367.0,
"step": 262
},
{
"epoch": 1.1341265548945376,
"grad_norm": 0.599615186489735,
"learning_rate": 1.8251494234121445e-05,
"loss": 0.9802,
"num_tokens": 1948032241.0,
"step": 263
},
{
"epoch": 1.1384532179556517,
"grad_norm": 0.8668923604915937,
"learning_rate": 1.823657999565021e-05,
"loss": 1.027,
"num_tokens": 1956343781.0,
"step": 264
},
{
"epoch": 1.1427798810167658,
"grad_norm": 0.6738644701502109,
"learning_rate": 1.8221609324874503e-05,
"loss": 0.9755,
"num_tokens": 1964501930.0,
"step": 265
},
{
"epoch": 1.14710654407788,
"grad_norm": 0.8489600222509309,
"learning_rate": 1.82065823385386e-05,
"loss": 1.0475,
"num_tokens": 1972762164.0,
"step": 266
},
{
"epoch": 1.151433207138994,
"grad_norm": 0.6976538570690681,
"learning_rate": 1.819149915382598e-05,
"loss": 1.0346,
"num_tokens": 1980766973.0,
"step": 267
},
{
"epoch": 1.1557598702001082,
"grad_norm": 0.6334291327020631,
"learning_rate": 1.8176359888358332e-05,
"loss": 0.9949,
"num_tokens": 1988984376.0,
"step": 268
},
{
"epoch": 1.1600865332612222,
"grad_norm": 0.7758411292456114,
"learning_rate": 1.8161164660194697e-05,
"loss": 1.0063,
"num_tokens": 1997243756.0,
"step": 269
},
{
"epoch": 1.1644131963223363,
"grad_norm": 0.53766380879933,
"learning_rate": 1.814591358783052e-05,
"loss": 0.9687,
"num_tokens": 2005281688.0,
"step": 270
},
{
"epoch": 1.1687398593834506,
"grad_norm": 0.7298357532804184,
"learning_rate": 1.813060679019672e-05,
"loss": 1.0034,
"num_tokens": 2013514643.0,
"step": 271
},
{
"epoch": 1.1730665224445647,
"grad_norm": 0.5869951640518326,
"learning_rate": 1.811524438665878e-05,
"loss": 0.9717,
"num_tokens": 2021706426.0,
"step": 272
},
{
"epoch": 1.1773931855056787,
"grad_norm": 0.7409836437587911,
"learning_rate": 1.809982649701581e-05,
"loss": 1.0074,
"num_tokens": 2029841931.0,
"step": 273
},
{
"epoch": 1.1817198485667928,
"grad_norm": 0.5601651834188004,
"learning_rate": 1.808435324149961e-05,
"loss": 1.0175,
"num_tokens": 2038051382.0,
"step": 274
},
{
"epoch": 1.1860465116279069,
"grad_norm": 0.7059057420034878,
"learning_rate": 1.806882474077374e-05,
"loss": 0.9781,
"num_tokens": 2046194678.0,
"step": 275
},
{
"epoch": 1.1903731746890212,
"grad_norm": 0.5285941165253452,
"learning_rate": 1.805324111593256e-05,
"loss": 0.9934,
"num_tokens": 2054410246.0,
"step": 276
},
{
"epoch": 1.1946998377501352,
"grad_norm": 0.8160359988210635,
"learning_rate": 1.8037602488500313e-05,
"loss": 1.0225,
"num_tokens": 2062614982.0,
"step": 277
},
{
"epoch": 1.1990265008112493,
"grad_norm": 0.7592201740439128,
"learning_rate": 1.8021908980430153e-05,
"loss": 1.0009,
"num_tokens": 2070765170.0,
"step": 278
},
{
"epoch": 1.2033531638723634,
"grad_norm": 0.6963096899729393,
"learning_rate": 1.8006160714103213e-05,
"loss": 1.0061,
"num_tokens": 2078798709.0,
"step": 279
},
{
"epoch": 1.2076798269334776,
"grad_norm": 0.7659996635522403,
"learning_rate": 1.7990357812327634e-05,
"loss": 1.0477,
"num_tokens": 2087010586.0,
"step": 280
},
{
"epoch": 1.2120064899945917,
"grad_norm": 0.6004295360408359,
"learning_rate": 1.797450039833762e-05,
"loss": 1.0005,
"num_tokens": 2095235843.0,
"step": 281
},
{
"epoch": 1.2163331530557058,
"grad_norm": 0.7399161134131866,
"learning_rate": 1.7958588595792467e-05,
"loss": 1.0077,
"num_tokens": 2103386607.0,
"step": 282
},
{
"epoch": 1.2206598161168198,
"grad_norm": 0.6508563400956765,
"learning_rate": 1.794262252877561e-05,
"loss": 0.9971,
"num_tokens": 2111487429.0,
"step": 283
},
{
"epoch": 1.2249864791779341,
"grad_norm": 0.6883886284438999,
"learning_rate": 1.7926602321793652e-05,
"loss": 0.9877,
"num_tokens": 2119602589.0,
"step": 284
},
{
"epoch": 1.2293131422390482,
"grad_norm": 0.7341240468720364,
"learning_rate": 1.791052809977538e-05,
"loss": 1.0029,
"num_tokens": 2127670452.0,
"step": 285
},
{
"epoch": 1.2336398053001623,
"grad_norm": 0.6043417685905373,
"learning_rate": 1.7894399988070804e-05,
"loss": 0.9953,
"num_tokens": 2135883917.0,
"step": 286
},
{
"epoch": 1.2379664683612763,
"grad_norm": 0.9246150340955609,
"learning_rate": 1.787821811245018e-05,
"loss": 0.9928,
"num_tokens": 2144047925.0,
"step": 287
},
{
"epoch": 1.2422931314223904,
"grad_norm": 0.5523779600689154,
"learning_rate": 1.7861982599103033e-05,
"loss": 1.0071,
"num_tokens": 2152384051.0,
"step": 288
},
{
"epoch": 1.2466197944835047,
"grad_norm": 0.8571329459319135,
"learning_rate": 1.7845693574637145e-05,
"loss": 1.0071,
"num_tokens": 2160699270.0,
"step": 289
},
{
"epoch": 1.2509464575446188,
"grad_norm": 0.7056191532296572,
"learning_rate": 1.7829351166077613e-05,
"loss": 1.0144,
"num_tokens": 2168902172.0,
"step": 290
},
{
"epoch": 1.2552731206057328,
"grad_norm": 0.6503760866034937,
"learning_rate": 1.781295550086581e-05,
"loss": 1.0066,
"num_tokens": 2177239381.0,
"step": 291
},
{
"epoch": 1.2595997836668469,
"grad_norm": 0.7990458653208856,
"learning_rate": 1.779650670685843e-05,
"loss": 0.997,
"num_tokens": 2185441306.0,
"step": 292
},
{
"epoch": 1.263926446727961,
"grad_norm": 0.6149308243930688,
"learning_rate": 1.7780004912326482e-05,
"loss": 0.9778,
"num_tokens": 2193573128.0,
"step": 293
},
{
"epoch": 1.2682531097890752,
"grad_norm": 0.6225445687843602,
"learning_rate": 1.7763450245954265e-05,
"loss": 0.9604,
"num_tokens": 2201777204.0,
"step": 294
},
{
"epoch": 1.2725797728501893,
"grad_norm": 0.6292492547633257,
"learning_rate": 1.7746842836838397e-05,
"loss": 0.9954,
"num_tokens": 2210018927.0,
"step": 295
},
{
"epoch": 1.2769064359113034,
"grad_norm": 0.6516142728390708,
"learning_rate": 1.773018281448679e-05,
"loss": 0.9806,
"num_tokens": 2218100500.0,
"step": 296
},
{
"epoch": 1.2812330989724177,
"grad_norm": 0.5320881419249361,
"learning_rate": 1.7713470308817642e-05,
"loss": 1.0039,
"num_tokens": 2226355010.0,
"step": 297
},
{
"epoch": 1.2855597620335315,
"grad_norm": 0.5584190002591476,
"learning_rate": 1.769670545015843e-05,
"loss": 1.0101,
"num_tokens": 2234658122.0,
"step": 298
},
{
"epoch": 1.2898864250946458,
"grad_norm": 0.6678876958868184,
"learning_rate": 1.7679888369244895e-05,
"loss": 0.9922,
"num_tokens": 2242975637.0,
"step": 299
},
{
"epoch": 1.2942130881557599,
"grad_norm": 0.4558349860883781,
"learning_rate": 1.7663019197220003e-05,
"loss": 0.9782,
"num_tokens": 2251107863.0,
"step": 300
},
{
"epoch": 1.298539751216874,
"grad_norm": 0.5253621106010484,
"learning_rate": 1.7646098065632956e-05,
"loss": 0.9693,
"num_tokens": 2259286425.0,
"step": 301
},
{
"epoch": 1.3028664142779882,
"grad_norm": 0.5270006972835457,
"learning_rate": 1.7629125106438132e-05,
"loss": 0.9963,
"num_tokens": 2267601260.0,
"step": 302
},
{
"epoch": 1.3071930773391023,
"grad_norm": 0.5164120144566925,
"learning_rate": 1.7612100451994077e-05,
"loss": 0.9976,
"num_tokens": 2275904161.0,
"step": 303
},
{
"epoch": 1.3115197404002163,
"grad_norm": 0.6543412137671887,
"learning_rate": 1.759502423506246e-05,
"loss": 0.9804,
"num_tokens": 2284244187.0,
"step": 304
},
{
"epoch": 1.3158464034613304,
"grad_norm": 0.5832832348747982,
"learning_rate": 1.7577896588807065e-05,
"loss": 1.0123,
"num_tokens": 2292398951.0,
"step": 305
},
{
"epoch": 1.3201730665224445,
"grad_norm": 0.5621179876214708,
"learning_rate": 1.7560717646792704e-05,
"loss": 1.003,
"num_tokens": 2300523004.0,
"step": 306
},
{
"epoch": 1.3244997295835588,
"grad_norm": 0.4864049233927397,
"learning_rate": 1.7543487542984227e-05,
"loss": 0.9805,
"num_tokens": 2308678897.0,
"step": 307
},
{
"epoch": 1.3288263926446728,
"grad_norm": 0.6631344821261796,
"learning_rate": 1.752620641174544e-05,
"loss": 1.0121,
"num_tokens": 2316917049.0,
"step": 308
},
{
"epoch": 1.333153055705787,
"grad_norm": 0.4724599093603475,
"learning_rate": 1.750887438783808e-05,
"loss": 1.0085,
"num_tokens": 2325186123.0,
"step": 309
},
{
"epoch": 1.337479718766901,
"grad_norm": 0.5689296032553274,
"learning_rate": 1.749149160642075e-05,
"loss": 0.978,
"num_tokens": 2333446937.0,
"step": 310
},
{
"epoch": 1.341806381828015,
"grad_norm": 0.6132116514129226,
"learning_rate": 1.7474058203047863e-05,
"loss": 1.0103,
"num_tokens": 2341787109.0,
"step": 311
},
{
"epoch": 1.3461330448891293,
"grad_norm": 0.48324225857414066,
"learning_rate": 1.745657431366861e-05,
"loss": 1.0389,
"num_tokens": 2349822363.0,
"step": 312
},
{
"epoch": 1.3504597079502434,
"grad_norm": 0.6027402450479121,
"learning_rate": 1.743904007462587e-05,
"loss": 0.992,
"num_tokens": 2358047057.0,
"step": 313
},
{
"epoch": 1.3547863710113575,
"grad_norm": 0.5459463271112598,
"learning_rate": 1.742145562265516e-05,
"loss": 0.9789,
"num_tokens": 2366212866.0,
"step": 314
},
{
"epoch": 1.3591130340724715,
"grad_norm": 0.6005164633273204,
"learning_rate": 1.7403821094883572e-05,
"loss": 1.0129,
"num_tokens": 2374409556.0,
"step": 315
},
{
"epoch": 1.3634396971335856,
"grad_norm": 0.44348087107159345,
"learning_rate": 1.738613662882869e-05,
"loss": 0.9625,
"num_tokens": 2382456975.0,
"step": 316
},
{
"epoch": 1.3677663601946999,
"grad_norm": 0.5583508504708583,
"learning_rate": 1.7368402362397537e-05,
"loss": 0.9857,
"num_tokens": 2390530712.0,
"step": 317
},
{
"epoch": 1.372093023255814,
"grad_norm": 0.5361240200931625,
"learning_rate": 1.7350618433885487e-05,
"loss": 0.996,
"num_tokens": 2398702919.0,
"step": 318
},
{
"epoch": 1.376419686316928,
"grad_norm": 0.6419494735313418,
"learning_rate": 1.7332784981975183e-05,
"loss": 0.9783,
"num_tokens": 2406786796.0,
"step": 319
},
{
"epoch": 1.3807463493780423,
"grad_norm": 0.46442325392330475,
"learning_rate": 1.731490214573547e-05,
"loss": 0.9767,
"num_tokens": 2414957373.0,
"step": 320
},
{
"epoch": 1.3850730124391564,
"grad_norm": 0.5504177189685487,
"learning_rate": 1.729697006462029e-05,
"loss": 0.9564,
"num_tokens": 2423238537.0,
"step": 321
},
{
"epoch": 1.3893996755002704,
"grad_norm": 0.5568430696730932,
"learning_rate": 1.7278988878467616e-05,
"loss": 0.9875,
"num_tokens": 2431242288.0,
"step": 322
},
{
"epoch": 1.3937263385613845,
"grad_norm": 0.6944929843782134,
"learning_rate": 1.7260958727498358e-05,
"loss": 0.9879,
"num_tokens": 2439410045.0,
"step": 323
},
{
"epoch": 1.3980530016224986,
"grad_norm": 0.43050663385999666,
"learning_rate": 1.7242879752315246e-05,
"loss": 1.0351,
"num_tokens": 2447761767.0,
"step": 324
},
{
"epoch": 1.4023796646836129,
"grad_norm": 0.5645224462691939,
"learning_rate": 1.722475209390176e-05,
"loss": 0.9672,
"num_tokens": 2455907686.0,
"step": 325
},
{
"epoch": 1.406706327744727,
"grad_norm": 0.5505917448888162,
"learning_rate": 1.720657589362103e-05,
"loss": 0.9999,
"num_tokens": 2464177214.0,
"step": 326
},
{
"epoch": 1.411032990805841,
"grad_norm": 0.6503856248771197,
"learning_rate": 1.7188351293214707e-05,
"loss": 0.9932,
"num_tokens": 2472423451.0,
"step": 327
},
{
"epoch": 1.415359653866955,
"grad_norm": 0.5177449838157887,
"learning_rate": 1.7170078434801893e-05,
"loss": 0.9814,
"num_tokens": 2480548139.0,
"step": 328
},
{
"epoch": 1.4196863169280691,
"grad_norm": 0.5428671196864796,
"learning_rate": 1.7151757460878006e-05,
"loss": 0.9839,
"num_tokens": 2488700875.0,
"step": 329
},
{
"epoch": 1.4240129799891834,
"grad_norm": 0.6795948596741072,
"learning_rate": 1.713338851431368e-05,
"loss": 0.9831,
"num_tokens": 2496833920.0,
"step": 330
},
{
"epoch": 1.4283396430502975,
"grad_norm": 0.4956381502852352,
"learning_rate": 1.7114971738353652e-05,
"loss": 0.99,
"num_tokens": 2504957672.0,
"step": 331
},
{
"epoch": 1.4326663061114115,
"grad_norm": 0.5599271824993283,
"learning_rate": 1.7096507276615638e-05,
"loss": 0.9968,
"num_tokens": 2513212255.0,
"step": 332
},
{
"epoch": 1.4369929691725256,
"grad_norm": 0.5272083444046696,
"learning_rate": 1.707799527308922e-05,
"loss": 0.9881,
"num_tokens": 2521365508.0,
"step": 333
},
{
"epoch": 1.4413196322336397,
"grad_norm": 0.6114656981188148,
"learning_rate": 1.7059435872134725e-05,
"loss": 0.9658,
"num_tokens": 2529600061.0,
"step": 334
},
{
"epoch": 1.445646295294754,
"grad_norm": 0.5311777583844968,
"learning_rate": 1.7040829218482083e-05,
"loss": 1.0014,
"num_tokens": 2537565588.0,
"step": 335
},
{
"epoch": 1.449972958355868,
"grad_norm": 0.594295616323625,
"learning_rate": 1.7022175457229726e-05,
"loss": 0.9961,
"num_tokens": 2545669630.0,
"step": 336
},
{
"epoch": 1.454299621416982,
"grad_norm": 7.315914279837485,
"learning_rate": 1.7003474733843423e-05,
"loss": 1.0044,
"num_tokens": 2553827025.0,
"step": 337
},
{
"epoch": 1.4586262844780964,
"grad_norm": 0.9226883902311878,
"learning_rate": 1.6984727194155172e-05,
"loss": 0.9889,
"num_tokens": 2562035770.0,
"step": 338
},
{
"epoch": 1.4629529475392105,
"grad_norm": 0.4628985508669373,
"learning_rate": 1.696593298436206e-05,
"loss": 1.0136,
"num_tokens": 2570209243.0,
"step": 339
},
{
"epoch": 1.4672796106003245,
"grad_norm": 0.6513152028153515,
"learning_rate": 1.6947092251025103e-05,
"loss": 0.9955,
"num_tokens": 2578291736.0,
"step": 340
},
{
"epoch": 1.4716062736614386,
"grad_norm": 0.5546903898287193,
"learning_rate": 1.6928205141068125e-05,
"loss": 1.0059,
"num_tokens": 2586425114.0,
"step": 341
},
{
"epoch": 1.4759329367225527,
"grad_norm": 0.6184393569733373,
"learning_rate": 1.690927180177661e-05,
"loss": 0.9803,
"num_tokens": 2594721102.0,
"step": 342
},
{
"epoch": 1.480259599783667,
"grad_norm": 0.7432953081745481,
"learning_rate": 1.6890292380796534e-05,
"loss": 0.9996,
"num_tokens": 2602895947.0,
"step": 343
},
{
"epoch": 1.484586262844781,
"grad_norm": 0.5083465411774949,
"learning_rate": 1.687126702613324e-05,
"loss": 0.9853,
"num_tokens": 2611136812.0,
"step": 344
},
{
"epoch": 1.488912925905895,
"grad_norm": 0.6819386036131501,
"learning_rate": 1.685219588615026e-05,
"loss": 1.012,
"num_tokens": 2619141613.0,
"step": 345
},
{
"epoch": 1.4932395889670091,
"grad_norm": 0.5767934097291644,
"learning_rate": 1.683307910956818e-05,
"loss": 0.977,
"num_tokens": 2627283279.0,
"step": 346
},
{
"epoch": 1.4975662520281232,
"grad_norm": 0.6396276200966129,
"learning_rate": 1.6813916845463462e-05,
"loss": 0.9982,
"num_tokens": 2635365385.0,
"step": 347
},
{
"epoch": 1.5018929150892375,
"grad_norm": 0.5492232920660722,
"learning_rate": 1.6794709243267288e-05,
"loss": 0.9719,
"num_tokens": 2643617216.0,
"step": 348
},
{
"epoch": 1.5062195781503516,
"grad_norm": 0.6330063608519101,
"learning_rate": 1.6775456452764398e-05,
"loss": 1.01,
"num_tokens": 2651803411.0,
"step": 349
},
{
"epoch": 1.5105462412114656,
"grad_norm": 0.45914817800081437,
"learning_rate": 1.6756158624091923e-05,
"loss": 0.9947,
"num_tokens": 2660138375.0,
"step": 350
},
{
"epoch": 1.51487290427258,
"grad_norm": 0.6030719337908035,
"learning_rate": 1.673681590773821e-05,
"loss": 1.0027,
"num_tokens": 2668455705.0,
"step": 351
},
{
"epoch": 1.5191995673336938,
"grad_norm": 0.4539111994752465,
"learning_rate": 1.671742845454164e-05,
"loss": 0.9764,
"num_tokens": 2676612397.0,
"step": 352
},
{
"epoch": 1.523526230394808,
"grad_norm": 0.6279947858419637,
"learning_rate": 1.6697996415689473e-05,
"loss": 0.9975,
"num_tokens": 2684750468.0,
"step": 353
},
{
"epoch": 1.5278528934559221,
"grad_norm": 0.5588793031254649,
"learning_rate": 1.667851994271665e-05,
"loss": 0.9724,
"num_tokens": 2692847293.0,
"step": 354
},
{
"epoch": 1.5321795565170362,
"grad_norm": 0.6174071639009994,
"learning_rate": 1.6658999187504615e-05,
"loss": 0.9763,
"num_tokens": 2700888345.0,
"step": 355
},
{
"epoch": 1.5365062195781505,
"grad_norm": 0.44484731949205647,
"learning_rate": 1.6639434302280145e-05,
"loss": 0.962,
"num_tokens": 2709053608.0,
"step": 356
},
{
"epoch": 1.5408328826392643,
"grad_norm": 0.5714553401258878,
"learning_rate": 1.6619825439614143e-05,
"loss": 0.9919,
"num_tokens": 2717170050.0,
"step": 357
},
{
"epoch": 1.5451595457003786,
"grad_norm": 0.48460960422536814,
"learning_rate": 1.660017275242046e-05,
"loss": 0.9892,
"num_tokens": 2725229510.0,
"step": 358
},
{
"epoch": 1.5494862087614927,
"grad_norm": 0.5112248791874374,
"learning_rate": 1.6580476393954694e-05,
"loss": 0.9662,
"num_tokens": 2733428514.0,
"step": 359
},
{
"epoch": 1.5538128718226067,
"grad_norm": 0.4309041720695094,
"learning_rate": 1.6560736517813013e-05,
"loss": 0.9885,
"num_tokens": 2741589691.0,
"step": 360
},
{
"epoch": 1.558139534883721,
"grad_norm": 0.553317003415503,
"learning_rate": 1.6540953277930925e-05,
"loss": 1.0059,
"num_tokens": 2749928247.0,
"step": 361
},
{
"epoch": 1.562466197944835,
"grad_norm": 0.5129455597122908,
"learning_rate": 1.6521126828582118e-05,
"loss": 0.9823,
"num_tokens": 2758121545.0,
"step": 362
},
{
"epoch": 1.5667928610059492,
"grad_norm": 0.5998715255176499,
"learning_rate": 1.6501257324377227e-05,
"loss": 1.0031,
"num_tokens": 2766372133.0,
"step": 363
},
{
"epoch": 1.5711195240670632,
"grad_norm": 0.38349746741335283,
"learning_rate": 1.648134492026263e-05,
"loss": 1.0038,
"num_tokens": 2774564035.0,
"step": 364
},
{
"epoch": 1.5754461871281773,
"grad_norm": 0.5607648982620999,
"learning_rate": 1.6461389771519263e-05,
"loss": 0.9557,
"num_tokens": 2782723296.0,
"step": 365
},
{
"epoch": 1.5797728501892916,
"grad_norm": 0.5745161983634366,
"learning_rate": 1.6441392033761378e-05,
"loss": 0.9981,
"num_tokens": 2790897244.0,
"step": 366
},
{
"epoch": 1.5840995132504057,
"grad_norm": 0.4868016408983241,
"learning_rate": 1.6421351862935348e-05,
"loss": 0.995,
"num_tokens": 2799161194.0,
"step": 367
},
{
"epoch": 1.5884261763115197,
"grad_norm": 0.6663144418274797,
"learning_rate": 1.6401269415318462e-05,
"loss": 0.9716,
"num_tokens": 2807099848.0,
"step": 368
},
{
"epoch": 1.592752839372634,
"grad_norm": 0.3999865533654534,
"learning_rate": 1.6381144847517672e-05,
"loss": 0.9727,
"num_tokens": 2815387290.0,
"step": 369
},
{
"epoch": 1.5970795024337479,
"grad_norm": 0.5902182552201716,
"learning_rate": 1.6360978316468404e-05,
"loss": 0.9756,
"num_tokens": 2823576095.0,
"step": 370
},
{
"epoch": 1.6014061654948621,
"grad_norm": 0.4725880021279996,
"learning_rate": 1.6340769979433314e-05,
"loss": 0.9894,
"num_tokens": 2831697837.0,
"step": 371
},
{
"epoch": 1.6057328285559762,
"grad_norm": 0.5618830540568662,
"learning_rate": 1.632051999400108e-05,
"loss": 0.9769,
"num_tokens": 2839950847.0,
"step": 372
},
{
"epoch": 1.6100594916170903,
"grad_norm": 0.49481337447529106,
"learning_rate": 1.6300228518085148e-05,
"loss": 0.9892,
"num_tokens": 2848152749.0,
"step": 373
},
{
"epoch": 1.6143861546782046,
"grad_norm": 0.5629300935165885,
"learning_rate": 1.6279895709922534e-05,
"loss": 0.9968,
"num_tokens": 2856267040.0,
"step": 374
},
{
"epoch": 1.6187128177393184,
"grad_norm": 0.5321521440430792,
"learning_rate": 1.625952172807255e-05,
"loss": 0.9772,
"num_tokens": 2864487220.0,
"step": 375
},
{
"epoch": 1.6230394808004327,
"grad_norm": 0.4621380145986486,
"learning_rate": 1.6239106731415604e-05,
"loss": 0.9616,
"num_tokens": 2872722633.0,
"step": 376
},
{
"epoch": 1.6273661438615468,
"grad_norm": 0.5566861055001202,
"learning_rate": 1.6218650879151946e-05,
"loss": 0.992,
"num_tokens": 2880964813.0,
"step": 377
},
{
"epoch": 1.6316928069226608,
"grad_norm": 0.5691044633846349,
"learning_rate": 1.6198154330800408e-05,
"loss": 0.9852,
"num_tokens": 2889076459.0,
"step": 378
},
{
"epoch": 1.6360194699837751,
"grad_norm": 0.4585342412787512,
"learning_rate": 1.6177617246197206e-05,
"loss": 1.0049,
"num_tokens": 2897211346.0,
"step": 379
},
{
"epoch": 1.640346133044889,
"grad_norm": 0.5392035073552278,
"learning_rate": 1.615703978549464e-05,
"loss": 0.956,
"num_tokens": 2905280542.0,
"step": 380
},
{
"epoch": 1.6446727961060033,
"grad_norm": 0.5178806174754045,
"learning_rate": 1.6136422109159887e-05,
"loss": 0.9749,
"num_tokens": 2913458269.0,
"step": 381
},
{
"epoch": 1.6489994591671173,
"grad_norm": 0.46877312760108464,
"learning_rate": 1.611576437797373e-05,
"loss": 0.9773,
"num_tokens": 2921615325.0,
"step": 382
},
{
"epoch": 1.6533261222282314,
"grad_norm": 0.5155483807676432,
"learning_rate": 1.60950667530293e-05,
"loss": 0.9955,
"num_tokens": 2929887058.0,
"step": 383
},
{
"epoch": 1.6576527852893457,
"grad_norm": 0.5072802808973179,
"learning_rate": 1.607432939573084e-05,
"loss": 0.962,
"num_tokens": 2938013416.0,
"step": 384
},
{
"epoch": 1.6619794483504597,
"grad_norm": 0.5129210712159445,
"learning_rate": 1.605355246779243e-05,
"loss": 1.0206,
"num_tokens": 2946138869.0,
"step": 385
},
{
"epoch": 1.6663061114115738,
"grad_norm": 0.5839938970559614,
"learning_rate": 1.6032736131236722e-05,
"loss": 0.9575,
"num_tokens": 2954330909.0,
"step": 386
},
{
"epoch": 1.670632774472688,
"grad_norm": 0.5339695370345615,
"learning_rate": 1.6011880548393694e-05,
"loss": 0.9475,
"num_tokens": 2962652473.0,
"step": 387
},
{
"epoch": 1.674959437533802,
"grad_norm": 0.6181152846819087,
"learning_rate": 1.5990985881899367e-05,
"loss": 0.9765,
"num_tokens": 2970856021.0,
"step": 388
},
{
"epoch": 1.6792861005949162,
"grad_norm": 0.4441388775819552,
"learning_rate": 1.597005229469455e-05,
"loss": 0.9635,
"num_tokens": 2978996470.0,
"step": 389
},
{
"epoch": 1.6836127636560303,
"grad_norm": 0.7034052637679556,
"learning_rate": 1.594907995002356e-05,
"loss": 0.9745,
"num_tokens": 2987205522.0,
"step": 390
},
{
"epoch": 1.6879394267171444,
"grad_norm": 0.4820716961447039,
"learning_rate": 1.5928069011432955e-05,
"loss": 0.9565,
"num_tokens": 2995251335.0,
"step": 391
},
{
"epoch": 1.6922660897782587,
"grad_norm": 0.6096635073457451,
"learning_rate": 1.590701964277025e-05,
"loss": 0.989,
"num_tokens": 3003455824.0,
"step": 392
},
{
"epoch": 1.6965927528393725,
"grad_norm": 0.6098082087418374,
"learning_rate": 1.588593200818266e-05,
"loss": 0.9852,
"num_tokens": 3011809584.0,
"step": 393
},
{
"epoch": 1.7009194159004868,
"grad_norm": 0.706589588823252,
"learning_rate": 1.5864806272115786e-05,
"loss": 0.9742,
"num_tokens": 3020038193.0,
"step": 394
},
{
"epoch": 1.7052460789616009,
"grad_norm": 0.5332508357896025,
"learning_rate": 1.5843642599312365e-05,
"loss": 0.9658,
"num_tokens": 3028247851.0,
"step": 395
},
{
"epoch": 1.709572742022715,
"grad_norm": 0.5967126870935362,
"learning_rate": 1.582244115481097e-05,
"loss": 0.9787,
"num_tokens": 3036393672.0,
"step": 396
},
{
"epoch": 1.7138994050838292,
"grad_norm": 0.5618945234242413,
"learning_rate": 1.5801202103944725e-05,
"loss": 0.9716,
"num_tokens": 3044494253.0,
"step": 397
},
{
"epoch": 1.718226068144943,
"grad_norm": 0.6427240201436156,
"learning_rate": 1.577992561234001e-05,
"loss": 0.9905,
"num_tokens": 3052731889.0,
"step": 398
},
{
"epoch": 1.7225527312060573,
"grad_norm": 0.45664767030415754,
"learning_rate": 1.5758611845915188e-05,
"loss": 0.9914,
"num_tokens": 3060830320.0,
"step": 399
},
{
"epoch": 1.7268793942671714,
"grad_norm": 0.5680532472735643,
"learning_rate": 1.573726097087928e-05,
"loss": 0.9912,
"num_tokens": 3068973700.0,
"step": 400
},
{
"epoch": 1.7312060573282855,
"grad_norm": 0.5644498364761277,
"learning_rate": 1.5715873153730713e-05,
"loss": 0.9665,
"num_tokens": 3077224259.0,
"step": 401
},
{
"epoch": 1.7355327203893998,
"grad_norm": 0.5014411330180275,
"learning_rate": 1.5694448561255972e-05,
"loss": 0.9652,
"num_tokens": 3085594237.0,
"step": 402
},
{
"epoch": 1.7398593834505138,
"grad_norm": 0.41568492870972895,
"learning_rate": 1.5672987360528334e-05,
"loss": 0.9553,
"num_tokens": 3093879360.0,
"step": 403
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.5967844492183373,
"learning_rate": 1.5651489718906553e-05,
"loss": 0.9832,
"num_tokens": 3102164647.0,
"step": 404
},
{
"epoch": 1.7485127095727422,
"grad_norm": 0.5543655378915199,
"learning_rate": 1.5629955804033558e-05,
"loss": 0.9806,
"num_tokens": 3110263161.0,
"step": 405
},
{
"epoch": 1.752839372633856,
"grad_norm": 0.4435918962836691,
"learning_rate": 1.5608385783835145e-05,
"loss": 0.9604,
"num_tokens": 3118470439.0,
"step": 406
},
{
"epoch": 1.7571660356949703,
"grad_norm": 0.5005651315002034,
"learning_rate": 1.558677982651866e-05,
"loss": 0.9688,
"num_tokens": 3126716914.0,
"step": 407
},
{
"epoch": 1.7614926987560844,
"grad_norm": 4.391443015936278,
"learning_rate": 1.5565138100571703e-05,
"loss": 0.9629,
"num_tokens": 3134906980.0,
"step": 408
},
{
"epoch": 1.7658193618171985,
"grad_norm": 0.9040714523990061,
"learning_rate": 1.5543460774760798e-05,
"loss": 0.981,
"num_tokens": 3143124349.0,
"step": 409
},
{
"epoch": 1.7701460248783127,
"grad_norm": 0.43080204624761487,
"learning_rate": 1.5521748018130082e-05,
"loss": 1.0155,
"num_tokens": 3151490545.0,
"step": 410
},
{
"epoch": 1.7744726879394266,
"grad_norm": 0.763386547386101,
"learning_rate": 1.55e-05,
"loss": 1.0146,
"num_tokens": 3159636609.0,
"step": 411
},
{
"epoch": 1.7787993510005409,
"grad_norm": 0.6119279005971348,
"learning_rate": 1.5478216889965965e-05,
"loss": 0.9982,
"num_tokens": 3167775862.0,
"step": 412
},
{
"epoch": 1.783126014061655,
"grad_norm": 0.5651533282088765,
"learning_rate": 1.545639885789704e-05,
"loss": 0.9818,
"num_tokens": 3176055047.0,
"step": 413
},
{
"epoch": 1.787452677122769,
"grad_norm": 0.7405859257267926,
"learning_rate": 1.5434546073934625e-05,
"loss": 0.9995,
"num_tokens": 3184104527.0,
"step": 414
},
{
"epoch": 1.7917793401838833,
"grad_norm": 0.5233140162738117,
"learning_rate": 1.541265870849112e-05,
"loss": 0.9852,
"num_tokens": 3192240853.0,
"step": 415
},
{
"epoch": 1.7961060032449971,
"grad_norm": 0.7809493822032231,
"learning_rate": 1.5390736932248595e-05,
"loss": 0.9343,
"num_tokens": 3200252152.0,
"step": 416
},
{
"epoch": 1.8004326663061114,
"grad_norm": 0.5756707174156201,
"learning_rate": 1.5368780916157466e-05,
"loss": 0.9817,
"num_tokens": 3208314962.0,
"step": 417
},
{
"epoch": 1.8047593293672255,
"grad_norm": 0.8443572669557455,
"learning_rate": 1.5346790831435157e-05,
"loss": 1.0031,
"num_tokens": 3216354701.0,
"step": 418
},
{
"epoch": 1.8090859924283396,
"grad_norm": 0.7330225783691384,
"learning_rate": 1.5324766849564766e-05,
"loss": 0.9701,
"num_tokens": 3224541175.0,
"step": 419
},
{
"epoch": 1.8134126554894539,
"grad_norm": 0.8597981617306653,
"learning_rate": 1.5302709142293732e-05,
"loss": 1.003,
"num_tokens": 3232690124.0,
"step": 420
},
{
"epoch": 1.817739318550568,
"grad_norm": 0.7597629364718675,
"learning_rate": 1.528061788163248e-05,
"loss": 0.9418,
"num_tokens": 3240925095.0,
"step": 421
},
{
"epoch": 1.822065981611682,
"grad_norm": 0.7515345846987643,
"learning_rate": 1.52584932398531e-05,
"loss": 0.9852,
"num_tokens": 3249017852.0,
"step": 422
},
{
"epoch": 1.826392644672796,
"grad_norm": 0.6736337890470313,
"learning_rate": 1.5236335389487997e-05,
"loss": 0.937,
"num_tokens": 3257064643.0,
"step": 423
},
{
"epoch": 1.8307193077339101,
"grad_norm": 0.6249241728233079,
"learning_rate": 1.5214144503328532e-05,
"loss": 0.9513,
"num_tokens": 3265289767.0,
"step": 424
},
{
"epoch": 1.8350459707950244,
"grad_norm": 0.6298196557489565,
"learning_rate": 1.5191920754423698e-05,
"loss": 0.9642,
"num_tokens": 3273615215.0,
"step": 425
},
{
"epoch": 1.8393726338561385,
"grad_norm": 0.5745220636985442,
"learning_rate": 1.5169664316078758e-05,
"loss": 0.9586,
"num_tokens": 3281703406.0,
"step": 426
},
{
"epoch": 1.8436992969172525,
"grad_norm": 0.5956954636450505,
"learning_rate": 1.514737536185388e-05,
"loss": 1.0039,
"num_tokens": 3289933861.0,
"step": 427
},
{
"epoch": 1.8480259599783668,
"grad_norm": 0.5109096269242125,
"learning_rate": 1.512505406556281e-05,
"loss": 0.9607,
"num_tokens": 3298102935.0,
"step": 428
},
{
"epoch": 1.8523526230394807,
"grad_norm": 0.6631068486693279,
"learning_rate": 1.5102700601271503e-05,
"loss": 0.9675,
"num_tokens": 3306272221.0,
"step": 429
},
{
"epoch": 1.856679286100595,
"grad_norm": 0.4409566460742665,
"learning_rate": 1.5080315143296758e-05,
"loss": 0.9777,
"num_tokens": 3314431095.0,
"step": 430
},
{
"epoch": 1.861005949161709,
"grad_norm": 0.8262060971600915,
"learning_rate": 1.5057897866204878e-05,
"loss": 0.9601,
"num_tokens": 3322663426.0,
"step": 431
},
{
"epoch": 1.865332612222823,
"grad_norm": 0.5726708983593121,
"learning_rate": 1.5035448944810293e-05,
"loss": 0.9925,
"num_tokens": 3330865224.0,
"step": 432
},
{
"epoch": 1.8696592752839374,
"grad_norm": 0.8747512008129081,
"learning_rate": 1.5012968554174198e-05,
"loss": 0.9698,
"num_tokens": 3339107321.0,
"step": 433
},
{
"epoch": 1.8739859383450512,
"grad_norm": 0.7598171132270467,
"learning_rate": 1.4990456869603193e-05,
"loss": 0.9853,
"num_tokens": 3347289915.0,
"step": 434
},
{
"epoch": 1.8783126014061655,
"grad_norm": 0.7035073136630965,
"learning_rate": 1.4967914066647928e-05,
"loss": 0.9693,
"num_tokens": 3355448024.0,
"step": 435
},
{
"epoch": 1.8826392644672796,
"grad_norm": 0.734295536740874,
"learning_rate": 1.4945340321101698e-05,
"loss": 0.9864,
"num_tokens": 3363675135.0,
"step": 436
},
{
"epoch": 1.8869659275283936,
"grad_norm": 0.5988812777750793,
"learning_rate": 1.4922735808999107e-05,
"loss": 0.9649,
"num_tokens": 3371726378.0,
"step": 437
},
{
"epoch": 1.891292590589508,
"grad_norm": 0.8750638771397959,
"learning_rate": 1.4900100706614686e-05,
"loss": 0.9901,
"num_tokens": 3379870297.0,
"step": 438
},
{
"epoch": 1.895619253650622,
"grad_norm": 0.5808382556577097,
"learning_rate": 1.4877435190461506e-05,
"loss": 0.9744,
"num_tokens": 3388200725.0,
"step": 439
},
{
"epoch": 1.899945916711736,
"grad_norm": 0.9363462985119361,
"learning_rate": 1.4854739437289814e-05,
"loss": 0.9861,
"num_tokens": 3396309659.0,
"step": 440
},
{
"epoch": 1.9042725797728501,
"grad_norm": 0.7948490424737458,
"learning_rate": 1.4832013624085654e-05,
"loss": 0.9837,
"num_tokens": 3404510694.0,
"step": 441
},
{
"epoch": 1.9085992428339642,
"grad_norm": 0.8980882451384175,
"learning_rate": 1.4809257928069487e-05,
"loss": 1.008,
"num_tokens": 3412645391.0,
"step": 442
},
{
"epoch": 1.9129259058950785,
"grad_norm": 0.7040381990057594,
"learning_rate": 1.4786472526694795e-05,
"loss": 0.9592,
"num_tokens": 3420985745.0,
"step": 443
},
{
"epoch": 1.9172525689561926,
"grad_norm": 0.8991768615059288,
"learning_rate": 1.4763657597646713e-05,
"loss": 0.9968,
"num_tokens": 3429248541.0,
"step": 444
},
{
"epoch": 1.9215792320173066,
"grad_norm": 0.6633916419975918,
"learning_rate": 1.4740813318840652e-05,
"loss": 0.9784,
"num_tokens": 3437348661.0,
"step": 445
},
{
"epoch": 1.925905895078421,
"grad_norm": 0.8965515298346237,
"learning_rate": 1.4717939868420878e-05,
"loss": 0.9531,
"num_tokens": 3445445221.0,
"step": 446
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.6871072191147697,
"learning_rate": 1.4695037424759153e-05,
"loss": 0.9747,
"num_tokens": 3453746865.0,
"step": 447
},
{
"epoch": 1.934559221200649,
"grad_norm": 0.9238825067701328,
"learning_rate": 1.4672106166453337e-05,
"loss": 0.977,
"num_tokens": 3461909596.0,
"step": 448
},
{
"epoch": 1.9388858842617631,
"grad_norm": 0.7490553984675871,
"learning_rate": 1.4649146272325984e-05,
"loss": 0.9329,
"num_tokens": 3470039928.0,
"step": 449
},
{
"epoch": 1.9432125473228772,
"grad_norm": 0.8854672895396621,
"learning_rate": 1.4626157921422965e-05,
"loss": 0.9569,
"num_tokens": 3478190617.0,
"step": 450
},
{
"epoch": 1.9475392103839915,
"grad_norm": 0.6923182092043201,
"learning_rate": 1.4603141293012057e-05,
"loss": 0.9862,
"num_tokens": 3486403151.0,
"step": 451
},
{
"epoch": 1.9518658734451053,
"grad_norm": 0.8231453812124571,
"learning_rate": 1.458009656658155e-05,
"loss": 0.9803,
"num_tokens": 3494717203.0,
"step": 452
},
{
"epoch": 1.9561925365062196,
"grad_norm": 0.6255900621201269,
"learning_rate": 1.4557023921838851e-05,
"loss": 0.9653,
"num_tokens": 3502823483.0,
"step": 453
},
{
"epoch": 1.9605191995673337,
"grad_norm": 0.8440131825774589,
"learning_rate": 1.4533923538709076e-05,
"loss": 1.0024,
"num_tokens": 3510882681.0,
"step": 454
},
{
"epoch": 1.9648458626284477,
"grad_norm": 0.7692850246575329,
"learning_rate": 1.4510795597333658e-05,
"loss": 0.9875,
"num_tokens": 3519029644.0,
"step": 455
},
{
"epoch": 1.969172525689562,
"grad_norm": 0.7640584934065241,
"learning_rate": 1.4487640278068929e-05,
"loss": 0.9599,
"num_tokens": 3527262793.0,
"step": 456
},
{
"epoch": 1.973499188750676,
"grad_norm": 0.6469547825051354,
"learning_rate": 1.4464457761484716e-05,
"loss": 0.9485,
"num_tokens": 3535520575.0,
"step": 457
},
{
"epoch": 1.9778258518117902,
"grad_norm": 0.7154622216746662,
"learning_rate": 1.4441248228362943e-05,
"loss": 0.9432,
"num_tokens": 3543566968.0,
"step": 458
},
{
"epoch": 1.9821525148729042,
"grad_norm": 0.5753194167751761,
"learning_rate": 1.4418011859696213e-05,
"loss": 0.9916,
"num_tokens": 3551798760.0,
"step": 459
},
{
"epoch": 1.9864791779340183,
"grad_norm": 0.820941286931545,
"learning_rate": 1.4394748836686392e-05,
"loss": 0.981,
"num_tokens": 3559961473.0,
"step": 460
},
{
"epoch": 1.9908058409951326,
"grad_norm": 0.6759926818924319,
"learning_rate": 1.437145934074321e-05,
"loss": 0.9613,
"num_tokens": 3568259851.0,
"step": 461
},
{
"epoch": 1.9951325040562466,
"grad_norm": 0.7636814498744505,
"learning_rate": 1.4348143553482834e-05,
"loss": 0.9542,
"num_tokens": 3576256828.0,
"step": 462
},
{
"epoch": 1.9994591671173607,
"grad_norm": 0.6253415385490834,
"learning_rate": 1.4324801656726457e-05,
"loss": 1.0001,
"num_tokens": 3584445935.0,
"step": 463
},
{
"epoch": 2.0,
"grad_norm": 0.8654452684362129,
"learning_rate": 1.4301433832498879e-05,
"loss": 1.0153,
"num_tokens": 3585494269.0,
"step": 464
},
{
"epoch": 2.0043266630611143,
"grad_norm": 0.7804102878180353,
"learning_rate": 1.4278040263027087e-05,
"loss": 0.9711,
"num_tokens": 3593559452.0,
"step": 465
},
{
"epoch": 2.008653326122228,
"grad_norm": 0.616918692608059,
"learning_rate": 1.425462113073883e-05,
"loss": 0.9421,
"num_tokens": 3601663035.0,
"step": 466
},
{
"epoch": 2.0129799891833424,
"grad_norm": 0.6370584091299962,
"learning_rate": 1.4231176618261218e-05,
"loss": 0.9627,
"num_tokens": 3609794842.0,
"step": 467
},
{
"epoch": 2.0173066522444563,
"grad_norm": 0.5439959676935536,
"learning_rate": 1.4207706908419257e-05,
"loss": 0.9312,
"num_tokens": 3618030710.0,
"step": 468
},
{
"epoch": 2.0216333153055706,
"grad_norm": 0.8536853956647672,
"learning_rate": 1.4184212184234465e-05,
"loss": 0.9631,
"num_tokens": 3626167487.0,
"step": 469
},
{
"epoch": 2.025959978366685,
"grad_norm": 0.5640997140033263,
"learning_rate": 1.416069262892342e-05,
"loss": 0.978,
"num_tokens": 3634141707.0,
"step": 470
},
{
"epoch": 2.0302866414277987,
"grad_norm": 0.7089110329625673,
"learning_rate": 1.4137148425896338e-05,
"loss": 0.9813,
"num_tokens": 3642347731.0,
"step": 471
},
{
"epoch": 2.034613304488913,
"grad_norm": 0.5076945792366204,
"learning_rate": 1.4113579758755645e-05,
"loss": 0.9475,
"num_tokens": 3650536256.0,
"step": 472
},
{
"epoch": 2.038939967550027,
"grad_norm": 0.7641036202893745,
"learning_rate": 1.4089986811294537e-05,
"loss": 0.9695,
"num_tokens": 3658660767.0,
"step": 473
},
{
"epoch": 2.043266630611141,
"grad_norm": 0.6084043720101941,
"learning_rate": 1.4066369767495567e-05,
"loss": 0.953,
"num_tokens": 3666730262.0,
"step": 474
},
{
"epoch": 2.0475932936722554,
"grad_norm": 0.6900255247384617,
"learning_rate": 1.4042728811529175e-05,
"loss": 0.9338,
"num_tokens": 3674874052.0,
"step": 475
},
{
"epoch": 2.0519199567333692,
"grad_norm": 0.6673406272001278,
"learning_rate": 1.4019064127752298e-05,
"loss": 0.9583,
"num_tokens": 3682866698.0,
"step": 476
},
{
"epoch": 2.0562466197944835,
"grad_norm": 0.5619696739721406,
"learning_rate": 1.399537590070688e-05,
"loss": 0.9444,
"num_tokens": 3691093555.0,
"step": 477
},
{
"epoch": 2.060573282855598,
"grad_norm": 0.5401104501685228,
"learning_rate": 1.3971664315118483e-05,
"loss": 0.9158,
"num_tokens": 3699284128.0,
"step": 478
},
{
"epoch": 2.0648999459167117,
"grad_norm": 0.6082865072416918,
"learning_rate": 1.3947929555894813e-05,
"loss": 0.9866,
"num_tokens": 3707572868.0,
"step": 479
},
{
"epoch": 2.069226608977826,
"grad_norm": 0.5219700556096115,
"learning_rate": 1.392417180812429e-05,
"loss": 0.925,
"num_tokens": 3715837638.0,
"step": 480
},
{
"epoch": 2.07355327203894,
"grad_norm": 0.5818687691162563,
"learning_rate": 1.3900391257074601e-05,
"loss": 0.9313,
"num_tokens": 3723956355.0,
"step": 481
},
{
"epoch": 2.077879935100054,
"grad_norm": 0.6243227814666679,
"learning_rate": 1.3876588088191264e-05,
"loss": 0.9702,
"num_tokens": 3732169409.0,
"step": 482
},
{
"epoch": 2.0822065981611684,
"grad_norm": 0.4599940187012167,
"learning_rate": 1.3852762487096168e-05,
"loss": 0.9626,
"num_tokens": 3740297617.0,
"step": 483
},
{
"epoch": 2.086533261222282,
"grad_norm": 0.7390777191418446,
"learning_rate": 1.3828914639586138e-05,
"loss": 0.988,
"num_tokens": 3748584821.0,
"step": 484
},
{
"epoch": 2.0908599242833965,
"grad_norm": 1.288123858274811,
"learning_rate": 1.3805044731631475e-05,
"loss": 0.9624,
"num_tokens": 3756811621.0,
"step": 485
},
{
"epoch": 2.0951865873445104,
"grad_norm": 0.6711188741890706,
"learning_rate": 1.3781152949374527e-05,
"loss": 0.9752,
"num_tokens": 3764852654.0,
"step": 486
},
{
"epoch": 2.0995132504056246,
"grad_norm": 0.4902348109671292,
"learning_rate": 1.3757239479128204e-05,
"loss": 0.9257,
"num_tokens": 3773125548.0,
"step": 487
},
{
"epoch": 2.103839913466739,
"grad_norm": 0.5653073702582646,
"learning_rate": 1.373330450737455e-05,
"loss": 0.9357,
"num_tokens": 3781308940.0,
"step": 488
},
{
"epoch": 2.1081665765278528,
"grad_norm": 0.4760945896815548,
"learning_rate": 1.3709348220763287e-05,
"loss": 0.956,
"num_tokens": 3789493303.0,
"step": 489
},
{
"epoch": 2.112493239588967,
"grad_norm": 2.455426675675134,
"learning_rate": 1.3685370806110343e-05,
"loss": 0.9677,
"num_tokens": 3797741283.0,
"step": 490
},
{
"epoch": 2.1168199026500814,
"grad_norm": 2.0574524671779053,
"learning_rate": 1.3661372450396422e-05,
"loss": 0.9932,
"num_tokens": 3805823170.0,
"step": 491
},
{
"epoch": 2.121146565711195,
"grad_norm": 0.6481290112552625,
"learning_rate": 1.3637353340765518e-05,
"loss": 0.9522,
"num_tokens": 3813836268.0,
"step": 492
},
{
"epoch": 2.1254732287723095,
"grad_norm": 0.6162414056852944,
"learning_rate": 1.3613313664523476e-05,
"loss": 0.9827,
"num_tokens": 3822125722.0,
"step": 493
},
{
"epoch": 2.1297998918334233,
"grad_norm": 0.590791719137404,
"learning_rate": 1.3589253609136517e-05,
"loss": 0.9612,
"num_tokens": 3830333623.0,
"step": 494
},
{
"epoch": 2.1341265548945376,
"grad_norm": 0.5684707116662864,
"learning_rate": 1.3565173362229787e-05,
"loss": 0.984,
"num_tokens": 3838471268.0,
"step": 495
},
{
"epoch": 2.138453217955652,
"grad_norm": 0.6053160814163213,
"learning_rate": 1.354107311158589e-05,
"loss": 0.9667,
"num_tokens": 3846752488.0,
"step": 496
},
{
"epoch": 2.1427798810167658,
"grad_norm": 0.47119322042756134,
"learning_rate": 1.3516953045143421e-05,
"loss": 1.0044,
"num_tokens": 3855015535.0,
"step": 497
},
{
"epoch": 2.14710654407788,
"grad_norm": 0.6890877937333092,
"learning_rate": 1.3492813350995501e-05,
"loss": 0.9558,
"num_tokens": 3863332871.0,
"step": 498
},
{
"epoch": 2.151433207138994,
"grad_norm": 0.4720532087871396,
"learning_rate": 1.3468654217388322e-05,
"loss": 0.9438,
"num_tokens": 3871417760.0,
"step": 499
},
{
"epoch": 2.155759870200108,
"grad_norm": 0.6530842825033443,
"learning_rate": 1.344447583271965e-05,
"loss": 0.9335,
"num_tokens": 3879494222.0,
"step": 500
},
{
"epoch": 2.1600865332612225,
"grad_norm": 0.48813659380264507,
"learning_rate": 1.342027838553739e-05,
"loss": 0.9807,
"num_tokens": 3887572575.0,
"step": 501
},
{
"epoch": 2.1644131963223363,
"grad_norm": 0.5432512528502169,
"learning_rate": 1.3396062064538103e-05,
"loss": 0.9508,
"num_tokens": 3895875651.0,
"step": 502
},
{
"epoch": 2.1687398593834506,
"grad_norm": 0.5246641442927332,
"learning_rate": 1.3371827058565517e-05,
"loss": 0.9335,
"num_tokens": 3903948482.0,
"step": 503
},
{
"epoch": 2.1730665224445644,
"grad_norm": 0.37219239494720535,
"learning_rate": 1.3347573556609075e-05,
"loss": 0.9336,
"num_tokens": 3912132117.0,
"step": 504
},
{
"epoch": 2.1773931855056787,
"grad_norm": 0.5512319309802416,
"learning_rate": 1.332330174780246e-05,
"loss": 0.9435,
"num_tokens": 3920270330.0,
"step": 505
},
{
"epoch": 2.181719848566793,
"grad_norm": 0.4318054234173368,
"learning_rate": 1.3299011821422116e-05,
"loss": 0.9372,
"num_tokens": 3928514380.0,
"step": 506
},
{
"epoch": 2.186046511627907,
"grad_norm": 0.48811090489725933,
"learning_rate": 1.3274703966885765e-05,
"loss": 0.9624,
"num_tokens": 3936610571.0,
"step": 507
},
{
"epoch": 2.190373174689021,
"grad_norm": 0.48047513202626985,
"learning_rate": 1.3250378373750941e-05,
"loss": 0.9579,
"num_tokens": 3944799565.0,
"step": 508
},
{
"epoch": 2.194699837750135,
"grad_norm": 0.4381062126465775,
"learning_rate": 1.3226035231713504e-05,
"loss": 0.9609,
"num_tokens": 3952965563.0,
"step": 509
},
{
"epoch": 2.1990265008112493,
"grad_norm": 0.3842482894518455,
"learning_rate": 1.3201674730606166e-05,
"loss": 0.9454,
"num_tokens": 3960955314.0,
"step": 510
},
{
"epoch": 2.2033531638723636,
"grad_norm": 0.4028195527619942,
"learning_rate": 1.317729706039701e-05,
"loss": 0.9569,
"num_tokens": 3969007594.0,
"step": 511
},
{
"epoch": 2.2076798269334774,
"grad_norm": 0.5267165379312942,
"learning_rate": 1.3152902411188007e-05,
"loss": 0.9383,
"num_tokens": 3977305210.0,
"step": 512
},
{
"epoch": 2.2120064899945917,
"grad_norm": 0.408161580284809,
"learning_rate": 1.3128490973213523e-05,
"loss": 0.9651,
"num_tokens": 3985475847.0,
"step": 513
},
{
"epoch": 2.2163331530557056,
"grad_norm": 0.5824772012067178,
"learning_rate": 1.3104062936838863e-05,
"loss": 0.9904,
"num_tokens": 3993521666.0,
"step": 514
},
{
"epoch": 2.22065981611682,
"grad_norm": 0.4592171198699058,
"learning_rate": 1.3079618492558763e-05,
"loss": 0.9509,
"num_tokens": 4001881055.0,
"step": 515
},
{
"epoch": 2.224986479177934,
"grad_norm": 0.4288405177420417,
"learning_rate": 1.3055157830995904e-05,
"loss": 0.9267,
"num_tokens": 4010098333.0,
"step": 516
},
{
"epoch": 2.229313142239048,
"grad_norm": 0.62245881444073,
"learning_rate": 1.3030681142899437e-05,
"loss": 0.924,
"num_tokens": 4018409777.0,
"step": 517
},
{
"epoch": 2.2336398053001623,
"grad_norm": 0.40777593363684145,
"learning_rate": 1.3006188619143505e-05,
"loss": 0.9726,
"num_tokens": 4026647609.0,
"step": 518
},
{
"epoch": 2.2379664683612766,
"grad_norm": 0.522692181425156,
"learning_rate": 1.2981680450725715e-05,
"loss": 0.9621,
"num_tokens": 4034775036.0,
"step": 519
},
{
"epoch": 2.2422931314223904,
"grad_norm": 0.47960290009387085,
"learning_rate": 1.2957156828765694e-05,
"loss": 0.9189,
"num_tokens": 4043062480.0,
"step": 520
},
{
"epoch": 2.2466197944835047,
"grad_norm": 0.4678126732331299,
"learning_rate": 1.2932617944503572e-05,
"loss": 0.9546,
"num_tokens": 4051272045.0,
"step": 521
},
{
"epoch": 2.2509464575446185,
"grad_norm": 0.398362030110093,
"learning_rate": 1.2908063989298493e-05,
"loss": 0.9287,
"num_tokens": 4059530591.0,
"step": 522
},
{
"epoch": 2.255273120605733,
"grad_norm": 0.45585060066098454,
"learning_rate": 1.2883495154627138e-05,
"loss": 0.9418,
"num_tokens": 4067674515.0,
"step": 523
},
{
"epoch": 2.259599783666847,
"grad_norm": 0.4335281616940614,
"learning_rate": 1.2858911632082211e-05,
"loss": 0.9743,
"num_tokens": 4075850527.0,
"step": 524
},
{
"epoch": 2.263926446727961,
"grad_norm": 0.45201891721310733,
"learning_rate": 1.2834313613370966e-05,
"loss": 0.9391,
"num_tokens": 4083981564.0,
"step": 525
},
{
"epoch": 2.2682531097890752,
"grad_norm": 0.44150737521062516,
"learning_rate": 1.2809701290313683e-05,
"loss": 0.9467,
"num_tokens": 4092119819.0,
"step": 526
},
{
"epoch": 2.2725797728501895,
"grad_norm": 0.45433913386085706,
"learning_rate": 1.278507485484221e-05,
"loss": 0.9393,
"num_tokens": 4100346271.0,
"step": 527
},
{
"epoch": 2.2769064359113034,
"grad_norm": 0.5507166636281073,
"learning_rate": 1.2760434498998434e-05,
"loss": 0.9391,
"num_tokens": 4108661518.0,
"step": 528
},
{
"epoch": 2.2812330989724177,
"grad_norm": 0.37701447868392113,
"learning_rate": 1.27357804149328e-05,
"loss": 0.9524,
"num_tokens": 4116748380.0,
"step": 529
},
{
"epoch": 2.2855597620335315,
"grad_norm": 0.5746317779731845,
"learning_rate": 1.2711112794902813e-05,
"loss": 0.9421,
"num_tokens": 4124999062.0,
"step": 530
},
{
"epoch": 2.289886425094646,
"grad_norm": 0.4463101017830629,
"learning_rate": 1.2686431831271523e-05,
"loss": 0.9625,
"num_tokens": 4133186868.0,
"step": 531
},
{
"epoch": 2.29421308815576,
"grad_norm": 0.5370113028092661,
"learning_rate": 1.2661737716506043e-05,
"loss": 0.9885,
"num_tokens": 4141449821.0,
"step": 532
},
{
"epoch": 2.298539751216874,
"grad_norm": 0.4423097943357427,
"learning_rate": 1.2637030643176042e-05,
"loss": 0.9542,
"num_tokens": 4149652432.0,
"step": 533
},
{
"epoch": 2.302866414277988,
"grad_norm": 0.5511789232845956,
"learning_rate": 1.2612310803952244e-05,
"loss": 0.9465,
"num_tokens": 4157808329.0,
"step": 534
},
{
"epoch": 2.307193077339102,
"grad_norm": 0.3876488127130369,
"learning_rate": 1.2587578391604913e-05,
"loss": 0.9499,
"num_tokens": 4165809160.0,
"step": 535
},
{
"epoch": 2.3115197404002163,
"grad_norm": 0.6177035631999804,
"learning_rate": 1.2562833599002376e-05,
"loss": 0.9677,
"num_tokens": 4174000454.0,
"step": 536
},
{
"epoch": 2.3158464034613306,
"grad_norm": 0.42991099207796024,
"learning_rate": 1.2538076619109492e-05,
"loss": 0.9427,
"num_tokens": 4182078542.0,
"step": 537
},
{
"epoch": 2.3201730665224445,
"grad_norm": 0.641966921774766,
"learning_rate": 1.2513307644986165e-05,
"loss": 0.9263,
"num_tokens": 4190219796.0,
"step": 538
},
{
"epoch": 2.3244997295835588,
"grad_norm": 0.5474359287041366,
"learning_rate": 1.2488526869785831e-05,
"loss": 0.9759,
"num_tokens": 4198442007.0,
"step": 539
},
{
"epoch": 2.3288263926446726,
"grad_norm": 0.6031392642669225,
"learning_rate": 1.2463734486753953e-05,
"loss": 0.9768,
"num_tokens": 4206693320.0,
"step": 540
},
{
"epoch": 2.333153055705787,
"grad_norm": 0.6408527604910021,
"learning_rate": 1.2438930689226516e-05,
"loss": 0.9794,
"num_tokens": 4214990068.0,
"step": 541
},
{
"epoch": 2.337479718766901,
"grad_norm": 0.5038473523210545,
"learning_rate": 1.241411567062851e-05,
"loss": 0.971,
"num_tokens": 4223038722.0,
"step": 542
},
{
"epoch": 2.341806381828015,
"grad_norm": 0.6388179575100953,
"learning_rate": 1.2389289624472443e-05,
"loss": 0.9506,
"num_tokens": 4231285793.0,
"step": 543
},
{
"epoch": 2.3461330448891293,
"grad_norm": 0.4013689490010426,
"learning_rate": 1.2364452744356803e-05,
"loss": 0.9847,
"num_tokens": 4239600068.0,
"step": 544
},
{
"epoch": 2.350459707950243,
"grad_norm": 0.770106065718662,
"learning_rate": 1.2339605223964571e-05,
"loss": 0.949,
"num_tokens": 4247861801.0,
"step": 545
},
{
"epoch": 2.3547863710113575,
"grad_norm": 0.577361696338379,
"learning_rate": 1.2314747257061705e-05,
"loss": 0.9571,
"num_tokens": 4256038814.0,
"step": 546
},
{
"epoch": 2.3591130340724717,
"grad_norm": 0.6764182668606968,
"learning_rate": 1.2289879037495626e-05,
"loss": 0.9526,
"num_tokens": 4264362015.0,
"step": 547
},
{
"epoch": 2.3634396971335856,
"grad_norm": 0.6826859929166379,
"learning_rate": 1.22650007591937e-05,
"loss": 0.9867,
"num_tokens": 4272552088.0,
"step": 548
},
{
"epoch": 2.3677663601947,
"grad_norm": 1.1712639273168959,
"learning_rate": 1.2240112616161743e-05,
"loss": 0.9291,
"num_tokens": 4280783431.0,
"step": 549
},
{
"epoch": 2.3720930232558137,
"grad_norm": 0.9029578260961084,
"learning_rate": 1.2215214802482493e-05,
"loss": 0.9471,
"num_tokens": 4288988956.0,
"step": 550
},
{
"epoch": 2.376419686316928,
"grad_norm": 0.5373017530946762,
"learning_rate": 1.2190307512314104e-05,
"loss": 0.948,
"num_tokens": 4297187620.0,
"step": 551
},
{
"epoch": 2.3807463493780423,
"grad_norm": 0.7957685972548623,
"learning_rate": 1.2165390939888622e-05,
"loss": 0.9799,
"num_tokens": 4305382112.0,
"step": 552
},
{
"epoch": 2.385073012439156,
"grad_norm": 0.6252518036830393,
"learning_rate": 1.2140465279510494e-05,
"loss": 1.0004,
"num_tokens": 4313476472.0,
"step": 553
},
{
"epoch": 2.3893996755002704,
"grad_norm": 0.6705980809309716,
"learning_rate": 1.2115530725555016e-05,
"loss": 0.9856,
"num_tokens": 4321660199.0,
"step": 554
},
{
"epoch": 2.3937263385613847,
"grad_norm": 0.5375506977246428,
"learning_rate": 1.2090587472466857e-05,
"loss": 0.9443,
"num_tokens": 4329933962.0,
"step": 555
},
{
"epoch": 2.3980530016224986,
"grad_norm": 0.6972561334867685,
"learning_rate": 1.2065635714758513e-05,
"loss": 0.9612,
"num_tokens": 4338257005.0,
"step": 556
},
{
"epoch": 2.402379664683613,
"grad_norm": 0.5367223757844836,
"learning_rate": 1.2040675647008796e-05,
"loss": 0.9479,
"num_tokens": 4346287588.0,
"step": 557
},
{
"epoch": 2.4067063277447267,
"grad_norm": 0.6651025311923312,
"learning_rate": 1.2015707463861334e-05,
"loss": 0.9591,
"num_tokens": 4354313732.0,
"step": 558
},
{
"epoch": 2.411032990805841,
"grad_norm": 0.5230642046498125,
"learning_rate": 1.199073136002304e-05,
"loss": 0.9895,
"num_tokens": 4362453217.0,
"step": 559
},
{
"epoch": 2.4153596538669553,
"grad_norm": 0.6436791423085466,
"learning_rate": 1.1965747530262581e-05,
"loss": 0.9888,
"num_tokens": 4370690238.0,
"step": 560
},
{
"epoch": 2.419686316928069,
"grad_norm": 0.5418843658820947,
"learning_rate": 1.1940756169408882e-05,
"loss": 0.9744,
"num_tokens": 4378964382.0,
"step": 561
},
{
"epoch": 2.4240129799891834,
"grad_norm": 0.6124584539065394,
"learning_rate": 1.1915757472349598e-05,
"loss": 0.9637,
"num_tokens": 4386973954.0,
"step": 562
},
{
"epoch": 2.4283396430502977,
"grad_norm": 0.5060160404773808,
"learning_rate": 1.1890751634029586e-05,
"loss": 0.9693,
"num_tokens": 4395090749.0,
"step": 563
},
{
"epoch": 2.4326663061114115,
"grad_norm": 0.6267585754010649,
"learning_rate": 1.18657388494494e-05,
"loss": 0.9309,
"num_tokens": 4403333165.0,
"step": 564
},
{
"epoch": 2.436992969172526,
"grad_norm": 0.554981345863305,
"learning_rate": 1.1840719313663758e-05,
"loss": 0.9704,
"num_tokens": 4411451209.0,
"step": 565
},
{
"epoch": 2.4413196322336397,
"grad_norm": 0.6155646667621201,
"learning_rate": 1.1815693221780024e-05,
"loss": 0.9698,
"num_tokens": 4419790504.0,
"step": 566
},
{
"epoch": 2.445646295294754,
"grad_norm": 0.5336569352551696,
"learning_rate": 1.1790660768956692e-05,
"loss": 0.9662,
"num_tokens": 4427867123.0,
"step": 567
},
{
"epoch": 2.4499729583558683,
"grad_norm": 0.6203324832845126,
"learning_rate": 1.1765622150401855e-05,
"loss": 0.9797,
"num_tokens": 4436107636.0,
"step": 568
},
{
"epoch": 2.454299621416982,
"grad_norm": 0.5526647902620689,
"learning_rate": 1.1740577561371692e-05,
"loss": 0.931,
"num_tokens": 4444241142.0,
"step": 569
},
{
"epoch": 2.4586262844780964,
"grad_norm": 0.6147177036547514,
"learning_rate": 1.1715527197168938e-05,
"loss": 0.9559,
"num_tokens": 4452511750.0,
"step": 570
},
{
"epoch": 2.4629529475392102,
"grad_norm": 0.5300983630062421,
"learning_rate": 1.1690471253141368e-05,
"loss": 0.9497,
"num_tokens": 4460721683.0,
"step": 571
},
{
"epoch": 2.4672796106003245,
"grad_norm": 0.6023194661956321,
"learning_rate": 1.1665409924680266e-05,
"loss": 0.9793,
"num_tokens": 4468873356.0,
"step": 572
},
{
"epoch": 2.471606273661439,
"grad_norm": 0.5295856874230167,
"learning_rate": 1.1640343407218904e-05,
"loss": 0.9503,
"num_tokens": 4476919366.0,
"step": 573
},
{
"epoch": 2.4759329367225527,
"grad_norm": 0.5665385218247417,
"learning_rate": 1.1615271896231019e-05,
"loss": 0.9662,
"num_tokens": 4485019312.0,
"step": 574
},
{
"epoch": 2.480259599783667,
"grad_norm": 0.49532958355297396,
"learning_rate": 1.1590195587229297e-05,
"loss": 0.9853,
"num_tokens": 4493371908.0,
"step": 575
},
{
"epoch": 2.484586262844781,
"grad_norm": 0.5916742957768434,
"learning_rate": 1.1565114675763823e-05,
"loss": 0.9583,
"num_tokens": 4501379320.0,
"step": 576
},
{
"epoch": 2.488912925905895,
"grad_norm": 0.5199993454816884,
"learning_rate": 1.1540029357420588e-05,
"loss": 0.9358,
"num_tokens": 4509554917.0,
"step": 577
},
{
"epoch": 2.4932395889670094,
"grad_norm": 0.6131431462830838,
"learning_rate": 1.1514939827819945e-05,
"loss": 0.9703,
"num_tokens": 4517700650.0,
"step": 578
},
{
"epoch": 2.497566252028123,
"grad_norm": 0.5276906368093617,
"learning_rate": 1.1489846282615083e-05,
"loss": 0.945,
"num_tokens": 4525942676.0,
"step": 579
},
{
"epoch": 2.5018929150892375,
"grad_norm": 0.6310140050988171,
"learning_rate": 1.1464748917490512e-05,
"loss": 0.9712,
"num_tokens": 4534150166.0,
"step": 580
},
{
"epoch": 2.5062195781503513,
"grad_norm": 0.5659729405686769,
"learning_rate": 1.1439647928160523e-05,
"loss": 0.9645,
"num_tokens": 4542343152.0,
"step": 581
},
{
"epoch": 2.5105462412114656,
"grad_norm": 0.5200842069223084,
"learning_rate": 1.1414543510367673e-05,
"loss": 0.9576,
"num_tokens": 4550593514.0,
"step": 582
},
{
"epoch": 2.51487290427258,
"grad_norm": 0.48643885067096815,
"learning_rate": 1.1389435859881255e-05,
"loss": 0.9514,
"num_tokens": 4558736992.0,
"step": 583
},
{
"epoch": 2.5191995673336938,
"grad_norm": 0.5621639097985508,
"learning_rate": 1.1364325172495773e-05,
"loss": 0.9563,
"num_tokens": 4566863792.0,
"step": 584
},
{
"epoch": 2.523526230394808,
"grad_norm": 0.46925979555832403,
"learning_rate": 1.1339211644029412e-05,
"loss": 0.919,
"num_tokens": 4575114098.0,
"step": 585
},
{
"epoch": 2.527852893455922,
"grad_norm": 0.6021494127734743,
"learning_rate": 1.1314095470322512e-05,
"loss": 0.9139,
"num_tokens": 4583292940.0,
"step": 586
},
{
"epoch": 2.532179556517036,
"grad_norm": 0.5826582566216992,
"learning_rate": 1.1288976847236034e-05,
"loss": 0.9488,
"num_tokens": 4591542916.0,
"step": 587
},
{
"epoch": 2.5365062195781505,
"grad_norm": 0.5045409853325099,
"learning_rate": 1.1263855970650058e-05,
"loss": 0.9779,
"num_tokens": 4599599139.0,
"step": 588
},
{
"epoch": 2.5408328826392643,
"grad_norm": 0.5657692884716502,
"learning_rate": 1.1238733036462215e-05,
"loss": 0.9818,
"num_tokens": 4607817199.0,
"step": 589
},
{
"epoch": 2.5451595457003786,
"grad_norm": 0.46862372757938014,
"learning_rate": 1.1213608240586201e-05,
"loss": 0.9498,
"num_tokens": 4616026307.0,
"step": 590
},
{
"epoch": 2.5494862087614925,
"grad_norm": 0.4860095951265856,
"learning_rate": 1.1188481778950214e-05,
"loss": 0.9385,
"num_tokens": 4624047379.0,
"step": 591
},
{
"epoch": 2.5538128718226067,
"grad_norm": 0.45733906332352875,
"learning_rate": 1.1163353847495455e-05,
"loss": 0.9218,
"num_tokens": 4632297344.0,
"step": 592
},
{
"epoch": 2.558139534883721,
"grad_norm": 0.4447225224407655,
"learning_rate": 1.1138224642174578e-05,
"loss": 0.9314,
"num_tokens": 4640502412.0,
"step": 593
},
{
"epoch": 2.5624661979448353,
"grad_norm": 0.4769586032532646,
"learning_rate": 1.1113094358950177e-05,
"loss": 0.962,
"num_tokens": 4648602406.0,
"step": 594
},
{
"epoch": 2.566792861005949,
"grad_norm": 0.42700328299134344,
"learning_rate": 1.1087963193793242e-05,
"loss": 0.9326,
"num_tokens": 4656901673.0,
"step": 595
},
{
"epoch": 2.571119524067063,
"grad_norm": 0.4504153309965811,
"learning_rate": 1.1062831342681655e-05,
"loss": 0.929,
"num_tokens": 4665058500.0,
"step": 596
},
{
"epoch": 2.5754461871281773,
"grad_norm": 0.4964922520196292,
"learning_rate": 1.1037699001598636e-05,
"loss": 0.935,
"num_tokens": 4673282001.0,
"step": 597
},
{
"epoch": 2.5797728501892916,
"grad_norm": 0.40215258351767974,
"learning_rate": 1.1012566366531232e-05,
"loss": 0.9728,
"num_tokens": 4681534289.0,
"step": 598
},
{
"epoch": 2.584099513250406,
"grad_norm": 0.4191004702486287,
"learning_rate": 1.0987433633468771e-05,
"loss": 0.9745,
"num_tokens": 4689651407.0,
"step": 599
},
{
"epoch": 2.5884261763115197,
"grad_norm": 0.37353628363917635,
"learning_rate": 1.0962300998401367e-05,
"loss": 0.9079,
"num_tokens": 4697947274.0,
"step": 600
},
{
"epoch": 2.592752839372634,
"grad_norm": 0.3812708767310222,
"learning_rate": 1.0937168657318347e-05,
"loss": 0.9597,
"num_tokens": 4705880972.0,
"step": 601
},
{
"epoch": 2.597079502433748,
"grad_norm": 0.43583790957520196,
"learning_rate": 1.091203680620676e-05,
"loss": 0.9573,
"num_tokens": 4714073064.0,
"step": 602
},
{
"epoch": 2.601406165494862,
"grad_norm": 0.35271036617589024,
"learning_rate": 1.0886905641049828e-05,
"loss": 0.9197,
"num_tokens": 4722198446.0,
"step": 603
},
{
"epoch": 2.6057328285559764,
"grad_norm": 0.3781948396692369,
"learning_rate": 1.0861775357825424e-05,
"loss": 0.9524,
"num_tokens": 4730250511.0,
"step": 604
},
{
"epoch": 2.6100594916170903,
"grad_norm": 0.4533417243971015,
"learning_rate": 1.0836646152504548e-05,
"loss": 0.9757,
"num_tokens": 4738501206.0,
"step": 605
},
{
"epoch": 2.6143861546782046,
"grad_norm": 0.45748276165170915,
"learning_rate": 1.0811518221049787e-05,
"loss": 0.9298,
"num_tokens": 4746530561.0,
"step": 606
},
{
"epoch": 2.6187128177393184,
"grad_norm": 0.41575087815553385,
"learning_rate": 1.0786391759413805e-05,
"loss": 0.9356,
"num_tokens": 4754522348.0,
"step": 607
},
{
"epoch": 2.6230394808004327,
"grad_norm": 0.3970192273773917,
"learning_rate": 1.0761266963537786e-05,
"loss": 0.9725,
"num_tokens": 4762779442.0,
"step": 608
},
{
"epoch": 2.627366143861547,
"grad_norm": 0.37169659490918194,
"learning_rate": 1.0736144029349947e-05,
"loss": 0.9404,
"num_tokens": 4771069539.0,
"step": 609
},
{
"epoch": 2.631692806922661,
"grad_norm": 0.37442651321213727,
"learning_rate": 1.0711023152763967e-05,
"loss": 0.931,
"num_tokens": 4779346220.0,
"step": 610
},
{
"epoch": 2.636019469983775,
"grad_norm": 0.44735405291732344,
"learning_rate": 1.0685904529677496e-05,
"loss": 0.9761,
"num_tokens": 4787563265.0,
"step": 611
},
{
"epoch": 2.640346133044889,
"grad_norm": 0.35959081797177606,
"learning_rate": 1.066078835597059e-05,
"loss": 0.9422,
"num_tokens": 4795789059.0,
"step": 612
},
{
"epoch": 2.6446727961060033,
"grad_norm": 0.34980726494078923,
"learning_rate": 1.063567482750423e-05,
"loss": 0.977,
"num_tokens": 4804010832.0,
"step": 613
},
{
"epoch": 2.6489994591671175,
"grad_norm": 0.3907706043819872,
"learning_rate": 1.061056414011875e-05,
"loss": 0.933,
"num_tokens": 4812294976.0,
"step": 614
},
{
"epoch": 2.6533261222282314,
"grad_norm": 0.4119640894365054,
"learning_rate": 1.0585456489632328e-05,
"loss": 0.9311,
"num_tokens": 4820531707.0,
"step": 615
},
{
"epoch": 2.6576527852893457,
"grad_norm": 0.354677452859528,
"learning_rate": 1.0560352071839482e-05,
"loss": 0.9395,
"num_tokens": 4828837649.0,
"step": 616
},
{
"epoch": 2.6619794483504595,
"grad_norm": 0.3239073861757467,
"learning_rate": 1.0535251082509493e-05,
"loss": 0.9754,
"num_tokens": 4836950312.0,
"step": 617
},
{
"epoch": 2.666306111411574,
"grad_norm": 0.4230298637290067,
"learning_rate": 1.0510153717384922e-05,
"loss": 0.9206,
"num_tokens": 4845196377.0,
"step": 618
},
{
"epoch": 2.670632774472688,
"grad_norm": 0.4854466095116207,
"learning_rate": 1.0485060172180058e-05,
"loss": 0.9677,
"num_tokens": 4853465214.0,
"step": 619
},
{
"epoch": 2.674959437533802,
"grad_norm": 0.3941871664505198,
"learning_rate": 1.0459970642579419e-05,
"loss": 0.9754,
"num_tokens": 4861677592.0,
"step": 620
},
{
"epoch": 2.6792861005949162,
"grad_norm": 0.33437251990630384,
"learning_rate": 1.0434885324236182e-05,
"loss": 0.932,
"num_tokens": 4869954573.0,
"step": 621
},
{
"epoch": 2.68361276365603,
"grad_norm": 0.4097442193963148,
"learning_rate": 1.040980441277071e-05,
"loss": 0.9558,
"num_tokens": 4878037940.0,
"step": 622
},
{
"epoch": 2.6879394267171444,
"grad_norm": 0.3468569242173382,
"learning_rate": 1.0384728103768984e-05,
"loss": 0.9744,
"num_tokens": 4886162108.0,
"step": 623
},
{
"epoch": 2.6922660897782587,
"grad_norm": 0.35424420890206254,
"learning_rate": 1.0359656592781102e-05,
"loss": 0.9671,
"num_tokens": 4893976133.0,
"step": 624
},
{
"epoch": 2.6965927528393725,
"grad_norm": 0.3616987336239977,
"learning_rate": 1.0334590075319737e-05,
"loss": 0.9529,
"num_tokens": 4902247512.0,
"step": 625
},
{
"epoch": 2.700919415900487,
"grad_norm": 1.5398266966086605,
"learning_rate": 1.0309528746858633e-05,
"loss": 0.9405,
"num_tokens": 4910413738.0,
"step": 626
},
{
"epoch": 2.7052460789616006,
"grad_norm": 0.63781417279165,
"learning_rate": 1.0284472802831064e-05,
"loss": 0.994,
"num_tokens": 4918714880.0,
"step": 627
},
{
"epoch": 2.709572742022715,
"grad_norm": 0.3631737849921556,
"learning_rate": 1.0259422438628311e-05,
"loss": 0.9463,
"num_tokens": 4926916759.0,
"step": 628
},
{
"epoch": 2.713899405083829,
"grad_norm": 0.40687223170267717,
"learning_rate": 1.023437784959815e-05,
"loss": 0.8911,
"num_tokens": 4935201255.0,
"step": 629
},
{
"epoch": 2.718226068144943,
"grad_norm": 0.48334368577273396,
"learning_rate": 1.0209339231043314e-05,
"loss": 0.979,
"num_tokens": 4943203704.0,
"step": 630
},
{
"epoch": 2.7225527312060573,
"grad_norm": 0.38429152843144504,
"learning_rate": 1.0184306778219982e-05,
"loss": 0.9614,
"num_tokens": 4951361362.0,
"step": 631
},
{
"epoch": 2.726879394267171,
"grad_norm": 0.3375014960801925,
"learning_rate": 1.0159280686336247e-05,
"loss": 0.9464,
"num_tokens": 4959475675.0,
"step": 632
},
{
"epoch": 2.7312060573282855,
"grad_norm": 0.5278716176582829,
"learning_rate": 1.0134261150550607e-05,
"loss": 0.9578,
"num_tokens": 4967495353.0,
"step": 633
},
{
"epoch": 2.7355327203893998,
"grad_norm": 0.38762155738765125,
"learning_rate": 1.0109248365970417e-05,
"loss": 0.9471,
"num_tokens": 4975635351.0,
"step": 634
},
{
"epoch": 2.739859383450514,
"grad_norm": 0.42752590625715914,
"learning_rate": 1.0084242527650405e-05,
"loss": 0.9337,
"num_tokens": 4983586441.0,
"step": 635
},
{
"epoch": 2.744186046511628,
"grad_norm": 0.3486963485067961,
"learning_rate": 1.005924383059112e-05,
"loss": 0.9427,
"num_tokens": 4991751016.0,
"step": 636
},
{
"epoch": 2.748512709572742,
"grad_norm": 0.34837213958685875,
"learning_rate": 1.003425246973742e-05,
"loss": 0.9311,
"num_tokens": 4999725885.0,
"step": 637
},
{
"epoch": 2.752839372633856,
"grad_norm": 0.4023922893858053,
"learning_rate": 1.0009268639976963e-05,
"loss": 0.9673,
"num_tokens": 5008028648.0,
"step": 638
},
{
"epoch": 2.7571660356949703,
"grad_norm": 0.4231190144779909,
"learning_rate": 9.984292536138667e-06,
"loss": 0.937,
"num_tokens": 5016272386.0,
"step": 639
},
{
"epoch": 2.7614926987560846,
"grad_norm": 3.0302353492150087,
"learning_rate": 9.959324352991208e-06,
"loss": 0.9676,
"num_tokens": 5024339120.0,
"step": 640
},
{
"epoch": 2.7658193618171985,
"grad_norm": 0.5891540777938301,
"learning_rate": 9.934364285241492e-06,
"loss": 0.9493,
"num_tokens": 5032678352.0,
"step": 641
},
{
"epoch": 2.7701460248783127,
"grad_norm": 0.3713633204848213,
"learning_rate": 9.90941252753315e-06,
"loss": 0.9253,
"num_tokens": 5040939116.0,
"step": 642
},
{
"epoch": 2.7744726879394266,
"grad_norm": 0.5678305665625419,
"learning_rate": 9.884469274444985e-06,
"loss": 0.9397,
"num_tokens": 5049128904.0,
"step": 643
},
{
"epoch": 2.778799351000541,
"grad_norm": 0.5210583704451124,
"learning_rate": 9.859534720489512e-06,
"loss": 0.9197,
"num_tokens": 5057221429.0,
"step": 644
},
{
"epoch": 2.783126014061655,
"grad_norm": 0.5058583173880178,
"learning_rate": 9.834609060111379e-06,
"loss": 0.9345,
"num_tokens": 5065603969.0,
"step": 645
},
{
"epoch": 2.787452677122769,
"grad_norm": 1.065747933692024,
"learning_rate": 9.809692487685897e-06,
"loss": 0.9641,
"num_tokens": 5073922039.0,
"step": 646
},
{
"epoch": 2.7917793401838833,
"grad_norm": 0.6392727901163089,
"learning_rate": 9.784785197517508e-06,
"loss": 0.978,
"num_tokens": 5082066062.0,
"step": 647
},
{
"epoch": 2.796106003244997,
"grad_norm": 0.47665360710824717,
"learning_rate": 9.75988738383826e-06,
"loss": 0.9659,
"num_tokens": 5090277224.0,
"step": 648
},
{
"epoch": 2.8004326663061114,
"grad_norm": 0.4296714120604327,
"learning_rate": 9.734999240806305e-06,
"loss": 0.9419,
"num_tokens": 5098498265.0,
"step": 649
},
{
"epoch": 2.8047593293672257,
"grad_norm": 0.46022607243851776,
"learning_rate": 9.710120962504377e-06,
"loss": 0.9299,
"num_tokens": 5106762065.0,
"step": 650
},
{
"epoch": 2.8090859924283396,
"grad_norm": 0.38037491268678614,
"learning_rate": 9.685252742938298e-06,
"loss": 0.9604,
"num_tokens": 5114866977.0,
"step": 651
},
{
"epoch": 2.813412655489454,
"grad_norm": 0.4450482562419481,
"learning_rate": 9.660394776035432e-06,
"loss": 0.9645,
"num_tokens": 5123156278.0,
"step": 652
},
{
"epoch": 2.8177393185505677,
"grad_norm": 0.39576092998234047,
"learning_rate": 9.635547255643203e-06,
"loss": 0.9446,
"num_tokens": 5131331083.0,
"step": 653
},
{
"epoch": 2.822065981611682,
"grad_norm": 0.41154398069393333,
"learning_rate": 9.610710375527561e-06,
"loss": 0.9246,
"num_tokens": 5139347026.0,
"step": 654
},
{
"epoch": 2.8263926446727963,
"grad_norm": 0.3421006708746006,
"learning_rate": 9.585884329371496e-06,
"loss": 0.9861,
"num_tokens": 5147540492.0,
"step": 655
},
{
"epoch": 2.83071930773391,
"grad_norm": 0.4747580560478711,
"learning_rate": 9.561069310773487e-06,
"loss": 0.9542,
"num_tokens": 5155870243.0,
"step": 656
},
{
"epoch": 2.8350459707950244,
"grad_norm": 0.4224754159893733,
"learning_rate": 9.536265513246048e-06,
"loss": 0.935,
"num_tokens": 5163926726.0,
"step": 657
},
{
"epoch": 2.8393726338561383,
"grad_norm": 0.46762870636929993,
"learning_rate": 9.511473130214173e-06,
"loss": 0.9705,
"num_tokens": 5172107458.0,
"step": 658
},
{
"epoch": 2.8436992969172525,
"grad_norm": 0.4965576656745177,
"learning_rate": 9.486692355013838e-06,
"loss": 0.9183,
"num_tokens": 5180290484.0,
"step": 659
},
{
"epoch": 2.848025959978367,
"grad_norm": 0.3970648154104729,
"learning_rate": 9.461923380890513e-06,
"loss": 0.9421,
"num_tokens": 5188522043.0,
"step": 660
},
{
"epoch": 2.8523526230394807,
"grad_norm": 0.4619137564242492,
"learning_rate": 9.437166400997629e-06,
"loss": 0.9724,
"num_tokens": 5196656199.0,
"step": 661
},
{
"epoch": 2.856679286100595,
"grad_norm": 0.38039081459659835,
"learning_rate": 9.41242160839509e-06,
"loss": 0.9309,
"num_tokens": 5204830052.0,
"step": 662
},
{
"epoch": 2.861005949161709,
"grad_norm": 0.3969836134364504,
"learning_rate": 9.387689196047761e-06,
"loss": 0.9708,
"num_tokens": 5212973545.0,
"step": 663
},
{
"epoch": 2.865332612222823,
"grad_norm": 0.367110555698261,
"learning_rate": 9.36296935682396e-06,
"loss": 0.9443,
"num_tokens": 5221092259.0,
"step": 664
},
{
"epoch": 2.8696592752839374,
"grad_norm": 0.34785384103882466,
"learning_rate": 9.33826228349396e-06,
"loss": 0.9454,
"num_tokens": 5229402791.0,
"step": 665
},
{
"epoch": 2.8739859383450512,
"grad_norm": 0.40112917048533714,
"learning_rate": 9.313568168728478e-06,
"loss": 0.9502,
"num_tokens": 5237456315.0,
"step": 666
},
{
"epoch": 2.8783126014061655,
"grad_norm": 0.3426746194857651,
"learning_rate": 9.28888720509719e-06,
"loss": 0.9071,
"num_tokens": 5245758513.0,
"step": 667
},
{
"epoch": 2.8826392644672794,
"grad_norm": 0.41050977689909746,
"learning_rate": 9.264219585067197e-06,
"loss": 0.9598,
"num_tokens": 5254017826.0,
"step": 668
},
{
"epoch": 2.8869659275283936,
"grad_norm": 0.34041904977731363,
"learning_rate": 9.239565501001568e-06,
"loss": 0.9455,
"num_tokens": 5262031305.0,
"step": 669
},
{
"epoch": 2.891292590589508,
"grad_norm": 0.31648195039745797,
"learning_rate": 9.214925145157793e-06,
"loss": 0.952,
"num_tokens": 5270250950.0,
"step": 670
},
{
"epoch": 2.8956192536506222,
"grad_norm": 0.3463504968110696,
"learning_rate": 9.190298709686321e-06,
"loss": 0.9338,
"num_tokens": 5278464277.0,
"step": 671
},
{
"epoch": 2.899945916711736,
"grad_norm": 0.28534318933782366,
"learning_rate": 9.165686386629039e-06,
"loss": 0.9476,
"num_tokens": 5286697113.0,
"step": 672
},
{
"epoch": 2.90427257977285,
"grad_norm": 0.3397081347153792,
"learning_rate": 9.141088367917792e-06,
"loss": 0.9433,
"num_tokens": 5294847314.0,
"step": 673
},
{
"epoch": 2.908599242833964,
"grad_norm": 0.3230949625825632,
"learning_rate": 9.116504845372865e-06,
"loss": 0.9141,
"num_tokens": 5302924987.0,
"step": 674
},
{
"epoch": 2.9129259058950785,
"grad_norm": 0.34833673066450366,
"learning_rate": 9.091936010701513e-06,
"loss": 0.9698,
"num_tokens": 5311177487.0,
"step": 675
},
{
"epoch": 2.917252568956193,
"grad_norm": 0.3619472901098368,
"learning_rate": 9.067382055496431e-06,
"loss": 0.9719,
"num_tokens": 5319219559.0,
"step": 676
},
{
"epoch": 2.9215792320173066,
"grad_norm": 0.350338044609913,
"learning_rate": 9.042843171234307e-06,
"loss": 0.9576,
"num_tokens": 5327511456.0,
"step": 677
},
{
"epoch": 2.925905895078421,
"grad_norm": 0.3088705516097823,
"learning_rate": 9.018319549274288e-06,
"loss": 0.9562,
"num_tokens": 5335791238.0,
"step": 678
},
{
"epoch": 2.9302325581395348,
"grad_norm": 0.33745754789789145,
"learning_rate": 8.993811380856496e-06,
"loss": 0.9341,
"num_tokens": 5343893681.0,
"step": 679
},
{
"epoch": 2.934559221200649,
"grad_norm": 0.34258505632081965,
"learning_rate": 8.969318857100564e-06,
"loss": 0.9429,
"num_tokens": 5352079321.0,
"step": 680
},
{
"epoch": 2.9388858842617633,
"grad_norm": 0.2995800360605643,
"learning_rate": 8.944842169004099e-06,
"loss": 0.9589,
"num_tokens": 5360263368.0,
"step": 681
},
{
"epoch": 2.943212547322877,
"grad_norm": 0.3773143251992146,
"learning_rate": 8.920381507441243e-06,
"loss": 0.9362,
"num_tokens": 5368504895.0,
"step": 682
},
{
"epoch": 2.9475392103839915,
"grad_norm": 0.34689165094951335,
"learning_rate": 8.89593706316114e-06,
"loss": 0.9264,
"num_tokens": 5376846780.0,
"step": 683
},
{
"epoch": 2.9518658734451053,
"grad_norm": 0.41412476158382777,
"learning_rate": 8.87150902678648e-06,
"loss": 0.9594,
"num_tokens": 5384933248.0,
"step": 684
},
{
"epoch": 2.9561925365062196,
"grad_norm": 0.36680127928889933,
"learning_rate": 8.847097588811998e-06,
"loss": 0.9589,
"num_tokens": 5393187704.0,
"step": 685
},
{
"epoch": 2.960519199567334,
"grad_norm": 0.34424202273538157,
"learning_rate": 8.822702939602991e-06,
"loss": 0.9542,
"num_tokens": 5401348320.0,
"step": 686
},
{
"epoch": 2.9648458626284477,
"grad_norm": 0.3173741339592272,
"learning_rate": 8.798325269393837e-06,
"loss": 0.9313,
"num_tokens": 5409586273.0,
"step": 687
},
{
"epoch": 2.969172525689562,
"grad_norm": 0.3244123403034035,
"learning_rate": 8.773964768286496e-06,
"loss": 0.9612,
"num_tokens": 5417804753.0,
"step": 688
},
{
"epoch": 2.973499188750676,
"grad_norm": 0.32961010771746146,
"learning_rate": 8.749621626249064e-06,
"loss": 0.9268,
"num_tokens": 5426038605.0,
"step": 689
},
{
"epoch": 2.97782585181179,
"grad_norm": 0.3463195021104508,
"learning_rate": 8.725296033114236e-06,
"loss": 0.9314,
"num_tokens": 5434234877.0,
"step": 690
},
{
"epoch": 2.9821525148729044,
"grad_norm": 0.3050357082850999,
"learning_rate": 8.700988178577887e-06,
"loss": 0.9222,
"num_tokens": 5442351769.0,
"step": 691
},
{
"epoch": 2.9864791779340183,
"grad_norm": 0.3170878559410219,
"learning_rate": 8.676698252197542e-06,
"loss": 0.9628,
"num_tokens": 5450580632.0,
"step": 692
},
{
"epoch": 2.9908058409951326,
"grad_norm": 0.3366257932732851,
"learning_rate": 8.652426443390931e-06,
"loss": 0.9407,
"num_tokens": 5458798186.0,
"step": 693
},
{
"epoch": 2.9951325040562464,
"grad_norm": 0.4018191360229351,
"learning_rate": 8.628172941434488e-06,
"loss": 0.9606,
"num_tokens": 5466877716.0,
"step": 694
},
{
"epoch": 2.9994591671173607,
"grad_norm": 0.3274075286425149,
"learning_rate": 8.603937935461901e-06,
"loss": 0.9164,
"num_tokens": 5475055549.0,
"step": 695
},
{
"epoch": 3.0,
"grad_norm": 0.5614038305725747,
"learning_rate": 8.57972161446261e-06,
"loss": 0.9008,
"num_tokens": 5476104008.0,
"step": 696
},
{
"epoch": 3.0043266630611143,
"grad_norm": 0.4105885304776379,
"learning_rate": 8.55552416728035e-06,
"loss": 0.913,
"num_tokens": 5484223267.0,
"step": 697
},
{
"epoch": 3.008653326122228,
"grad_norm": 0.3773905264475136,
"learning_rate": 8.531345782611683e-06,
"loss": 0.9167,
"num_tokens": 5492414464.0,
"step": 698
},
{
"epoch": 3.0129799891833424,
"grad_norm": 0.3566408355562363,
"learning_rate": 8.5071866490045e-06,
"loss": 0.9194,
"num_tokens": 5500623429.0,
"step": 699
},
{
"epoch": 3.0173066522444563,
"grad_norm": 0.4193673926911609,
"learning_rate": 8.483046954856585e-06,
"loss": 0.9491,
"num_tokens": 5508774636.0,
"step": 700
},
{
"epoch": 3.0216333153055706,
"grad_norm": 0.35556886195846743,
"learning_rate": 8.458926888414112e-06,
"loss": 0.9589,
"num_tokens": 5517065103.0,
"step": 701
},
{
"epoch": 3.025959978366685,
"grad_norm": 0.41653240041710277,
"learning_rate": 8.434826637770217e-06,
"loss": 0.9173,
"num_tokens": 5525187881.0,
"step": 702
},
{
"epoch": 3.0302866414277987,
"grad_norm": 0.3146167010937598,
"learning_rate": 8.410746390863487e-06,
"loss": 0.9006,
"num_tokens": 5533313938.0,
"step": 703
},
{
"epoch": 3.034613304488913,
"grad_norm": 0.4044270250363667,
"learning_rate": 8.386686335476529e-06,
"loss": 0.923,
"num_tokens": 5541377157.0,
"step": 704
},
{
"epoch": 3.038939967550027,
"grad_norm": 0.3395022183255716,
"learning_rate": 8.362646659234485e-06,
"loss": 0.8981,
"num_tokens": 5549611519.0,
"step": 705
},
{
"epoch": 3.043266630611141,
"grad_norm": 0.4471600294802068,
"learning_rate": 8.338627549603585e-06,
"loss": 0.9222,
"num_tokens": 5557919196.0,
"step": 706
},
{
"epoch": 3.0475932936722554,
"grad_norm": 0.5305326751152935,
"learning_rate": 8.314629193889658e-06,
"loss": 0.9397,
"num_tokens": 5566104499.0,
"step": 707
},
{
"epoch": 3.0519199567333692,
"grad_norm": 0.35027612291144045,
"learning_rate": 8.290651779236718e-06,
"loss": 0.9131,
"num_tokens": 5574322504.0,
"step": 708
},
{
"epoch": 3.0562466197944835,
"grad_norm": 0.33827746335457937,
"learning_rate": 8.266695492625454e-06,
"loss": 0.9456,
"num_tokens": 5582427975.0,
"step": 709
},
{
"epoch": 3.060573282855598,
"grad_norm": 0.40908561698414975,
"learning_rate": 8.242760520871797e-06,
"loss": 0.9471,
"num_tokens": 5590629219.0,
"step": 710
},
{
"epoch": 3.0648999459167117,
"grad_norm": 0.3186114149151915,
"learning_rate": 8.218847050625476e-06,
"loss": 0.9024,
"num_tokens": 5598721452.0,
"step": 711
},
{
"epoch": 3.069226608977826,
"grad_norm": 0.3081259118888582,
"learning_rate": 8.194955268368526e-06,
"loss": 0.9243,
"num_tokens": 5606863531.0,
"step": 712
},
{
"epoch": 3.07355327203894,
"grad_norm": 0.35787737365668276,
"learning_rate": 8.171085360413867e-06,
"loss": 0.9459,
"num_tokens": 5614997329.0,
"step": 713
},
{
"epoch": 3.077879935100054,
"grad_norm": 0.32537016650973943,
"learning_rate": 8.147237512903834e-06,
"loss": 0.9094,
"num_tokens": 5622970021.0,
"step": 714
},
{
"epoch": 3.0822065981611684,
"grad_norm": 0.3105881415673291,
"learning_rate": 8.12341191180874e-06,
"loss": 0.927,
"num_tokens": 5631238351.0,
"step": 715
},
{
"epoch": 3.086533261222282,
"grad_norm": 0.3008636618396721,
"learning_rate": 8.099608742925403e-06,
"loss": 0.9215,
"num_tokens": 5639467774.0,
"step": 716
},
{
"epoch": 3.0908599242833965,
"grad_norm": 0.326147375459268,
"learning_rate": 8.075828191875714e-06,
"loss": 0.931,
"num_tokens": 5647579841.0,
"step": 717
},
{
"epoch": 3.0951865873445104,
"grad_norm": 0.3522548972988388,
"learning_rate": 8.052070444105188e-06,
"loss": 0.9095,
"num_tokens": 5655755448.0,
"step": 718
},
{
"epoch": 3.0995132504056246,
"grad_norm": 3.89200308036797,
"learning_rate": 8.028335684881517e-06,
"loss": 0.9562,
"num_tokens": 5663831787.0,
"step": 719
},
{
"epoch": 3.103839913466739,
"grad_norm": 0.5290111236398576,
"learning_rate": 8.00462409929312e-06,
"loss": 0.9091,
"num_tokens": 5672081053.0,
"step": 720
},
{
"epoch": 3.1081665765278528,
"grad_norm": 0.31017972774713937,
"learning_rate": 7.980935872247706e-06,
"loss": 0.8981,
"num_tokens": 5680229315.0,
"step": 721
},
{
"epoch": 3.112493239588967,
"grad_norm": 0.44238544431516175,
"learning_rate": 7.957271188470828e-06,
"loss": 0.9383,
"num_tokens": 5688587865.0,
"step": 722
},
{
"epoch": 3.1168199026500814,
"grad_norm": 0.36162049774260335,
"learning_rate": 7.933630232504437e-06,
"loss": 0.9364,
"num_tokens": 5696702957.0,
"step": 723
},
{
"epoch": 3.121146565711195,
"grad_norm": 0.3305075065326217,
"learning_rate": 7.910013188705464e-06,
"loss": 0.9394,
"num_tokens": 5704940872.0,
"step": 724
},
{
"epoch": 3.1254732287723095,
"grad_norm": 0.3612306221470166,
"learning_rate": 7.88642024124436e-06,
"loss": 0.9533,
"num_tokens": 5713220714.0,
"step": 725
},
{
"epoch": 3.1297998918334233,
"grad_norm": 0.35107569970160807,
"learning_rate": 7.862851574103665e-06,
"loss": 0.9291,
"num_tokens": 5721074137.0,
"step": 726
},
{
"epoch": 3.1341265548945376,
"grad_norm": 0.368464722379813,
"learning_rate": 7.839307371076581e-06,
"loss": 0.9288,
"num_tokens": 5729151610.0,
"step": 727
},
{
"epoch": 3.138453217955652,
"grad_norm": 0.34514054184426424,
"learning_rate": 7.815787815765536e-06,
"loss": 0.9613,
"num_tokens": 5737530913.0,
"step": 728
},
{
"epoch": 3.1427798810167658,
"grad_norm": 0.35482210860728847,
"learning_rate": 7.792293091580746e-06,
"loss": 0.955,
"num_tokens": 5745744229.0,
"step": 729
},
{
"epoch": 3.14710654407788,
"grad_norm": 0.32475020191847437,
"learning_rate": 7.768823381738786e-06,
"loss": 0.9209,
"num_tokens": 5753821386.0,
"step": 730
},
{
"epoch": 3.151433207138994,
"grad_norm": 0.34317842748446037,
"learning_rate": 7.74537886926117e-06,
"loss": 0.9305,
"num_tokens": 5762051038.0,
"step": 731
},
{
"epoch": 3.155759870200108,
"grad_norm": 0.28148438370865,
"learning_rate": 7.721959736972918e-06,
"loss": 0.9517,
"num_tokens": 5770277494.0,
"step": 732
},
{
"epoch": 3.1600865332612225,
"grad_norm": 0.3857808901571952,
"learning_rate": 7.698566167501124e-06,
"loss": 0.9133,
"num_tokens": 5778496571.0,
"step": 733
},
{
"epoch": 3.1644131963223363,
"grad_norm": 0.2789904847537803,
"learning_rate": 7.675198343273546e-06,
"loss": 0.9086,
"num_tokens": 5786685495.0,
"step": 734
},
{
"epoch": 3.1687398593834506,
"grad_norm": 0.37671200165653507,
"learning_rate": 7.651856446517172e-06,
"loss": 0.9147,
"num_tokens": 5794832046.0,
"step": 735
},
{
"epoch": 3.1730665224445644,
"grad_norm": 0.30040878970925106,
"learning_rate": 7.628540659256791e-06,
"loss": 0.9403,
"num_tokens": 5803097014.0,
"step": 736
},
{
"epoch": 3.1773931855056787,
"grad_norm": 0.3219509548910601,
"learning_rate": 7.605251163313614e-06,
"loss": 0.9388,
"num_tokens": 5811278088.0,
"step": 737
},
{
"epoch": 3.181719848566793,
"grad_norm": 0.36758751240035475,
"learning_rate": 7.581988140303791e-06,
"loss": 0.9206,
"num_tokens": 5819328655.0,
"step": 738
},
{
"epoch": 3.186046511627907,
"grad_norm": 0.29512444869432597,
"learning_rate": 7.558751771637059e-06,
"loss": 0.9167,
"num_tokens": 5827380764.0,
"step": 739
},
{
"epoch": 3.190373174689021,
"grad_norm": 2.0124288170095754,
"learning_rate": 7.535542238515285e-06,
"loss": 0.9128,
"num_tokens": 5835351467.0,
"step": 740
},
{
"epoch": 3.194699837750135,
"grad_norm": 0.4989076582507637,
"learning_rate": 7.512359721931075e-06,
"loss": 0.967,
"num_tokens": 5843312078.0,
"step": 741
},
{
"epoch": 3.1990265008112493,
"grad_norm": 0.3140029576082317,
"learning_rate": 7.489204402666344e-06,
"loss": 0.9486,
"num_tokens": 5851393198.0,
"step": 742
},
{
"epoch": 3.2033531638723636,
"grad_norm": 0.4125226908546305,
"learning_rate": 7.466076461290925e-06,
"loss": 0.9468,
"num_tokens": 5859740781.0,
"step": 743
},
{
"epoch": 3.2076798269334774,
"grad_norm": 0.39438107603387274,
"learning_rate": 7.442976078161155e-06,
"loss": 0.9247,
"num_tokens": 5868046866.0,
"step": 744
},
{
"epoch": 3.2120064899945917,
"grad_norm": 0.35730355992230906,
"learning_rate": 7.419903433418454e-06,
"loss": 0.9599,
"num_tokens": 5876294797.0,
"step": 745
},
{
"epoch": 3.2163331530557056,
"grad_norm": 0.3483111629570251,
"learning_rate": 7.396858706987948e-06,
"loss": 0.939,
"num_tokens": 5884474850.0,
"step": 746
},
{
"epoch": 3.22065981611682,
"grad_norm": 0.36151168683199636,
"learning_rate": 7.373842078577038e-06,
"loss": 0.9431,
"num_tokens": 5892667501.0,
"step": 747
},
{
"epoch": 3.224986479177934,
"grad_norm": 0.3099927796740935,
"learning_rate": 7.350853727674019e-06,
"loss": 0.9233,
"num_tokens": 5900881285.0,
"step": 748
},
{
"epoch": 3.229313142239048,
"grad_norm": 0.33503892378294126,
"learning_rate": 7.327893833546666e-06,
"loss": 0.9496,
"num_tokens": 5909032821.0,
"step": 749
},
{
"epoch": 3.2336398053001623,
"grad_norm": 0.30206423338613264,
"learning_rate": 7.3049625752408485e-06,
"loss": 0.9302,
"num_tokens": 5917206619.0,
"step": 750
},
{
"epoch": 3.2379664683612766,
"grad_norm": 0.3494142163645988,
"learning_rate": 7.282060131579125e-06,
"loss": 0.9062,
"num_tokens": 5925492396.0,
"step": 751
},
{
"epoch": 3.2422931314223904,
"grad_norm": 0.35685720307280216,
"learning_rate": 7.259186681159349e-06,
"loss": 0.908,
"num_tokens": 5933754893.0,
"step": 752
},
{
"epoch": 3.2466197944835047,
"grad_norm": 0.3282045615510198,
"learning_rate": 7.236342402353289e-06,
"loss": 0.9339,
"num_tokens": 5941936261.0,
"step": 753
},
{
"epoch": 3.2509464575446185,
"grad_norm": 0.33739095848560247,
"learning_rate": 7.213527473305211e-06,
"loss": 0.9322,
"num_tokens": 5950172687.0,
"step": 754
},
{
"epoch": 3.255273120605733,
"grad_norm": 0.3740851819217219,
"learning_rate": 7.1907420719305185e-06,
"loss": 0.917,
"num_tokens": 5958489000.0,
"step": 755
},
{
"epoch": 3.259599783666847,
"grad_norm": 0.298804748301486,
"learning_rate": 7.167986375914347e-06,
"loss": 0.9376,
"num_tokens": 5966823523.0,
"step": 756
},
{
"epoch": 3.263926446727961,
"grad_norm": 0.36616478845543216,
"learning_rate": 7.145260562710188e-06,
"loss": 0.9445,
"num_tokens": 5975019052.0,
"step": 757
},
{
"epoch": 3.2682531097890752,
"grad_norm": 0.35620532611163425,
"learning_rate": 7.1225648095384994e-06,
"loss": 0.951,
"num_tokens": 5983270411.0,
"step": 758
},
{
"epoch": 3.2725797728501895,
"grad_norm": 0.2988864186906242,
"learning_rate": 7.099899293385317e-06,
"loss": 0.9334,
"num_tokens": 5991383255.0,
"step": 759
},
{
"epoch": 3.2769064359113034,
"grad_norm": 0.3325133244022023,
"learning_rate": 7.077264191000895e-06,
"loss": 0.9277,
"num_tokens": 5999661742.0,
"step": 760
},
{
"epoch": 3.2812330989724177,
"grad_norm": 0.35243717348743153,
"learning_rate": 7.054659678898304e-06,
"loss": 0.9398,
"num_tokens": 6007956049.0,
"step": 761
},
{
"epoch": 3.2855597620335315,
"grad_norm": 0.28359237790060177,
"learning_rate": 7.032085933352075e-06,
"loss": 0.9716,
"num_tokens": 6016079855.0,
"step": 762
},
{
"epoch": 3.289886425094646,
"grad_norm": 0.2917404250644013,
"learning_rate": 7.0095431303968035e-06,
"loss": 0.9161,
"num_tokens": 6024330663.0,
"step": 763
},
{
"epoch": 3.29421308815576,
"grad_norm": 0.27888385505990193,
"learning_rate": 6.987031445825805e-06,
"loss": 0.9304,
"num_tokens": 6032564116.0,
"step": 764
},
{
"epoch": 3.298539751216874,
"grad_norm": 0.3196527440559267,
"learning_rate": 6.964551055189712e-06,
"loss": 0.9095,
"num_tokens": 6040658380.0,
"step": 765
},
{
"epoch": 3.302866414277988,
"grad_norm": 0.7134391554970761,
"learning_rate": 6.942102133795126e-06,
"loss": 0.9254,
"num_tokens": 6048731691.0,
"step": 766
},
{
"epoch": 3.307193077339102,
"grad_norm": 0.4023470106512136,
"learning_rate": 6.919684856703244e-06,
"loss": 0.9311,
"num_tokens": 6056818812.0,
"step": 767
},
{
"epoch": 3.3115197404002163,
"grad_norm": 0.3052498648114602,
"learning_rate": 6.897299398728503e-06,
"loss": 0.941,
"num_tokens": 6064982756.0,
"step": 768
},
{
"epoch": 3.3158464034613306,
"grad_norm": 0.2785893249101237,
"learning_rate": 6.874945934437192e-06,
"loss": 0.8953,
"num_tokens": 6073271792.0,
"step": 769
},
{
"epoch": 3.3201730665224445,
"grad_norm": 0.3294134296277513,
"learning_rate": 6.852624638146123e-06,
"loss": 0.9185,
"num_tokens": 6081302170.0,
"step": 770
},
{
"epoch": 3.3244997295835588,
"grad_norm": 0.27742083830120734,
"learning_rate": 6.830335683921248e-06,
"loss": 0.9409,
"num_tokens": 6089584899.0,
"step": 771
},
{
"epoch": 3.3288263926446726,
"grad_norm": 0.29411318866548547,
"learning_rate": 6.808079245576303e-06,
"loss": 0.9518,
"num_tokens": 6097617556.0,
"step": 772
},
{
"epoch": 3.333153055705787,
"grad_norm": 0.26948329262472964,
"learning_rate": 6.785855496671471e-06,
"loss": 0.946,
"num_tokens": 6105889261.0,
"step": 773
},
{
"epoch": 3.337479718766901,
"grad_norm": 0.293679847613122,
"learning_rate": 6.763664610512007e-06,
"loss": 0.8957,
"num_tokens": 6114035928.0,
"step": 774
},
{
"epoch": 3.341806381828015,
"grad_norm": 0.32098790507703917,
"learning_rate": 6.741506760146903e-06,
"loss": 0.9222,
"num_tokens": 6122198138.0,
"step": 775
},
{
"epoch": 3.3461330448891293,
"grad_norm": 0.3050861139648036,
"learning_rate": 6.719382118367523e-06,
"loss": 0.9565,
"num_tokens": 6130496867.0,
"step": 776
},
{
"epoch": 3.350459707950243,
"grad_norm": 0.2953351225139781,
"learning_rate": 6.697290857706271e-06,
"loss": 0.943,
"num_tokens": 6138729262.0,
"step": 777
},
{
"epoch": 3.3547863710113575,
"grad_norm": 0.3084437070089497,
"learning_rate": 6.675233150435234e-06,
"loss": 0.9066,
"num_tokens": 6147052020.0,
"step": 778
},
{
"epoch": 3.3591130340724717,
"grad_norm": 0.30274593086650614,
"learning_rate": 6.653209168564847e-06,
"loss": 0.9322,
"num_tokens": 6155200347.0,
"step": 779
},
{
"epoch": 3.3634396971335856,
"grad_norm": 0.29878082533309175,
"learning_rate": 6.631219083842535e-06,
"loss": 0.9203,
"num_tokens": 6163279432.0,
"step": 780
},
{
"epoch": 3.3677663601947,
"grad_norm": 0.29441364527784664,
"learning_rate": 6.609263067751406e-06,
"loss": 0.9471,
"num_tokens": 6171453047.0,
"step": 781
},
{
"epoch": 3.3720930232558137,
"grad_norm": 0.33375117226467504,
"learning_rate": 6.587341291508882e-06,
"loss": 0.9152,
"num_tokens": 6179561278.0,
"step": 782
},
{
"epoch": 3.376419686316928,
"grad_norm": 0.2801957802924117,
"learning_rate": 6.565453926065377e-06,
"loss": 0.9172,
"num_tokens": 6187682586.0,
"step": 783
},
{
"epoch": 3.3807463493780423,
"grad_norm": 0.3024317409157831,
"learning_rate": 6.543601142102964e-06,
"loss": 0.9159,
"num_tokens": 6195851861.0,
"step": 784
},
{
"epoch": 3.385073012439156,
"grad_norm": 0.32720306934594906,
"learning_rate": 6.521783110034038e-06,
"loss": 0.925,
"num_tokens": 6203979372.0,
"step": 785
},
{
"epoch": 3.3893996755002704,
"grad_norm": 0.3038756358264362,
"learning_rate": 6.500000000000003e-06,
"loss": 0.9009,
"num_tokens": 6212201742.0,
"step": 786
},
{
"epoch": 3.3937263385613847,
"grad_norm": 0.27865244384411847,
"learning_rate": 6.478251981869919e-06,
"loss": 0.93,
"num_tokens": 6220292265.0,
"step": 787
},
{
"epoch": 3.3980530016224986,
"grad_norm": 0.27563984049065876,
"learning_rate": 6.4565392252392066e-06,
"loss": 0.9167,
"num_tokens": 6228496839.0,
"step": 788
},
{
"epoch": 3.402379664683613,
"grad_norm": 0.3179833299526435,
"learning_rate": 6.434861899428299e-06,
"loss": 0.9263,
"num_tokens": 6236728852.0,
"step": 789
},
{
"epoch": 3.4067063277447267,
"grad_norm": 0.2769787878341213,
"learning_rate": 6.41322017348134e-06,
"loss": 0.9465,
"num_tokens": 6244700023.0,
"step": 790
},
{
"epoch": 3.411032990805841,
"grad_norm": 0.3394752547637937,
"learning_rate": 6.391614216164859e-06,
"loss": 0.9564,
"num_tokens": 6252905728.0,
"step": 791
},
{
"epoch": 3.4153596538669553,
"grad_norm": 0.29740964104918016,
"learning_rate": 6.370044195966443e-06,
"loss": 0.9377,
"num_tokens": 6260996249.0,
"step": 792
},
{
"epoch": 3.419686316928069,
"grad_norm": 0.29478500973407357,
"learning_rate": 6.3485102810934495e-06,
"loss": 0.9603,
"num_tokens": 6269249711.0,
"step": 793
},
{
"epoch": 3.4240129799891834,
"grad_norm": 0.3291226660785669,
"learning_rate": 6.327012639471668e-06,
"loss": 0.9345,
"num_tokens": 6277425640.0,
"step": 794
},
{
"epoch": 3.4283396430502977,
"grad_norm": 0.29781461848718044,
"learning_rate": 6.305551438744031e-06,
"loss": 0.9278,
"num_tokens": 6285662840.0,
"step": 795
},
{
"epoch": 3.4326663061114115,
"grad_norm": 0.2869526690619102,
"learning_rate": 6.2841268462692894e-06,
"loss": 0.9404,
"num_tokens": 6293843869.0,
"step": 796
},
{
"epoch": 3.436992969172526,
"grad_norm": 0.3520399233785471,
"learning_rate": 6.262739029120721e-06,
"loss": 0.9328,
"num_tokens": 6302081427.0,
"step": 797
},
{
"epoch": 3.4413196322336397,
"grad_norm": 0.277183264753593,
"learning_rate": 6.241388154084815e-06,
"loss": 0.8792,
"num_tokens": 6310350558.0,
"step": 798
},
{
"epoch": 3.445646295294754,
"grad_norm": 0.3559461779938261,
"learning_rate": 6.220074387659993e-06,
"loss": 0.9365,
"num_tokens": 6318527498.0,
"step": 799
},
{
"epoch": 3.4499729583558683,
"grad_norm": 0.3120065081521411,
"learning_rate": 6.198797896055277e-06,
"loss": 0.9108,
"num_tokens": 6326734548.0,
"step": 800
},
{
"epoch": 3.454299621416982,
"grad_norm": 0.2948366567102916,
"learning_rate": 6.177558845189029e-06,
"loss": 0.9409,
"num_tokens": 6334913287.0,
"step": 801
},
{
"epoch": 3.4586262844780964,
"grad_norm": 0.3130208095797164,
"learning_rate": 6.156357400687636e-06,
"loss": 0.9381,
"num_tokens": 6342974418.0,
"step": 802
},
{
"epoch": 3.4629529475392102,
"grad_norm": 0.2780789055689837,
"learning_rate": 6.135193727884217e-06,
"loss": 0.9134,
"num_tokens": 6351115042.0,
"step": 803
},
{
"epoch": 3.4672796106003245,
"grad_norm": 0.33413747564114543,
"learning_rate": 6.114067991817345e-06,
"loss": 0.9162,
"num_tokens": 6359218103.0,
"step": 804
},
{
"epoch": 3.471606273661439,
"grad_norm": 0.2856448915194415,
"learning_rate": 6.09298035722975e-06,
"loss": 0.9469,
"num_tokens": 6367267149.0,
"step": 805
},
{
"epoch": 3.4759329367225527,
"grad_norm": 0.3010782239547219,
"learning_rate": 6.07193098856705e-06,
"loss": 0.9281,
"num_tokens": 6375383838.0,
"step": 806
},
{
"epoch": 3.480259599783667,
"grad_norm": 0.3598745028912282,
"learning_rate": 6.050920049976443e-06,
"loss": 0.9297,
"num_tokens": 6383520730.0,
"step": 807
},
{
"epoch": 3.484586262844781,
"grad_norm": 0.2655948499376127,
"learning_rate": 6.029947705305453e-06,
"loss": 0.9118,
"num_tokens": 6391710224.0,
"step": 808
},
{
"epoch": 3.488912925905895,
"grad_norm": 0.374075695585731,
"learning_rate": 6.009014118100638e-06,
"loss": 0.9175,
"num_tokens": 6399847271.0,
"step": 809
},
{
"epoch": 3.4932395889670094,
"grad_norm": 0.3031326750096311,
"learning_rate": 5.988119451606312e-06,
"loss": 0.936,
"num_tokens": 6407983914.0,
"step": 810
},
{
"epoch": 3.497566252028123,
"grad_norm": 0.36971692188767286,
"learning_rate": 5.96726386876328e-06,
"loss": 0.9025,
"num_tokens": 6416189403.0,
"step": 811
},
{
"epoch": 3.5018929150892375,
"grad_norm": 0.3146277238724712,
"learning_rate": 5.946447532207571e-06,
"loss": 0.8989,
"num_tokens": 6424489803.0,
"step": 812
},
{
"epoch": 3.5062195781503513,
"grad_norm": 0.2982112963563615,
"learning_rate": 5.92567060426916e-06,
"loss": 0.9487,
"num_tokens": 6432747661.0,
"step": 813
},
{
"epoch": 3.5105462412114656,
"grad_norm": 0.37960863086345786,
"learning_rate": 5.904933246970699e-06,
"loss": 0.9156,
"num_tokens": 6440886009.0,
"step": 814
},
{
"epoch": 3.51487290427258,
"grad_norm": 0.296061478455817,
"learning_rate": 5.884235622026278e-06,
"loss": 0.9234,
"num_tokens": 6448957359.0,
"step": 815
},
{
"epoch": 3.5191995673336938,
"grad_norm": 0.3382022290934504,
"learning_rate": 5.863577890840116e-06,
"loss": 0.9509,
"num_tokens": 6457102027.0,
"step": 816
},
{
"epoch": 3.523526230394808,
"grad_norm": 0.290191146124253,
"learning_rate": 5.842960214505366e-06,
"loss": 0.942,
"num_tokens": 6465359395.0,
"step": 817
},
{
"epoch": 3.527852893455922,
"grad_norm": 0.36396926595892626,
"learning_rate": 5.8223827538027974e-06,
"loss": 0.9527,
"num_tokens": 6473562045.0,
"step": 818
},
{
"epoch": 3.532179556517036,
"grad_norm": 0.26519025564463417,
"learning_rate": 5.801845669199594e-06,
"loss": 0.9286,
"num_tokens": 6481741265.0,
"step": 819
},
{
"epoch": 3.5365062195781505,
"grad_norm": 0.3007276045464739,
"learning_rate": 5.781349120848057e-06,
"loss": 0.9206,
"num_tokens": 6489853496.0,
"step": 820
},
{
"epoch": 3.5408328826392643,
"grad_norm": 0.3152801349177596,
"learning_rate": 5.760893268584398e-06,
"loss": 0.9349,
"num_tokens": 6498038082.0,
"step": 821
},
{
"epoch": 3.5451595457003786,
"grad_norm": 0.31400051598257284,
"learning_rate": 5.740478271927452e-06,
"loss": 0.9645,
"num_tokens": 6506234847.0,
"step": 822
},
{
"epoch": 3.5494862087614925,
"grad_norm": 0.2831939350731783,
"learning_rate": 5.720104290077469e-06,
"loss": 0.9127,
"num_tokens": 6514345357.0,
"step": 823
},
{
"epoch": 3.5538128718226067,
"grad_norm": 0.31626659365486787,
"learning_rate": 5.6997714819148534e-06,
"loss": 0.9327,
"num_tokens": 6522489578.0,
"step": 824
},
{
"epoch": 3.558139534883721,
"grad_norm": 0.3035258072630229,
"learning_rate": 5.679480005998923e-06,
"loss": 0.9202,
"num_tokens": 6530750993.0,
"step": 825
},
{
"epoch": 3.5624661979448353,
"grad_norm": 0.3255604103790486,
"learning_rate": 5.659230020566689e-06,
"loss": 0.952,
"num_tokens": 6539008159.0,
"step": 826
},
{
"epoch": 3.566792861005949,
"grad_norm": 0.28719166832906956,
"learning_rate": 5.639021683531598e-06,
"loss": 0.9219,
"num_tokens": 6547267835.0,
"step": 827
},
{
"epoch": 3.571119524067063,
"grad_norm": 0.279274924202361,
"learning_rate": 5.618855152482334e-06,
"loss": 0.9245,
"num_tokens": 6555511093.0,
"step": 828
},
{
"epoch": 3.5754461871281773,
"grad_norm": 0.31504033599617814,
"learning_rate": 5.5987305846815425e-06,
"loss": 0.948,
"num_tokens": 6563644940.0,
"step": 829
},
{
"epoch": 3.5797728501892916,
"grad_norm": 0.2885737141200188,
"learning_rate": 5.578648137064655e-06,
"loss": 0.9383,
"num_tokens": 6571675207.0,
"step": 830
},
{
"epoch": 3.584099513250406,
"grad_norm": 0.287602171748228,
"learning_rate": 5.558607966238627e-06,
"loss": 0.9322,
"num_tokens": 6579800832.0,
"step": 831
},
{
"epoch": 3.5884261763115197,
"grad_norm": 0.323812546545481,
"learning_rate": 5.5386102284807395e-06,
"loss": 0.9254,
"num_tokens": 6587945088.0,
"step": 832
},
{
"epoch": 3.592752839372634,
"grad_norm": 0.28501015265860297,
"learning_rate": 5.518655079737371e-06,
"loss": 0.9275,
"num_tokens": 6596138413.0,
"step": 833
},
{
"epoch": 3.597079502433748,
"grad_norm": 0.2844443740370572,
"learning_rate": 5.498742675622777e-06,
"loss": 0.9795,
"num_tokens": 6604206973.0,
"step": 834
},
{
"epoch": 3.601406165494862,
"grad_norm": 0.2716290558748706,
"learning_rate": 5.478873171417884e-06,
"loss": 0.9244,
"num_tokens": 6612245627.0,
"step": 835
},
{
"epoch": 3.6057328285559764,
"grad_norm": 0.2660089227467234,
"learning_rate": 5.459046722069077e-06,
"loss": 0.943,
"num_tokens": 6620394090.0,
"step": 836
},
{
"epoch": 3.6100594916170903,
"grad_norm": 0.27963142700479227,
"learning_rate": 5.439263482186993e-06,
"loss": 0.9504,
"num_tokens": 6628667370.0,
"step": 837
},
{
"epoch": 3.6143861546782046,
"grad_norm": 0.30095431039169446,
"learning_rate": 5.419523606045307e-06,
"loss": 0.9128,
"num_tokens": 6636964851.0,
"step": 838
},
{
"epoch": 3.6187128177393184,
"grad_norm": 0.29117050215907364,
"learning_rate": 5.399827247579543e-06,
"loss": 0.9421,
"num_tokens": 6645115078.0,
"step": 839
},
{
"epoch": 3.6230394808004327,
"grad_norm": 0.27927887812944785,
"learning_rate": 5.3801745603858606e-06,
"loss": 0.9252,
"num_tokens": 6653355252.0,
"step": 840
},
{
"epoch": 3.627366143861547,
"grad_norm": 0.28681655737112005,
"learning_rate": 5.36056569771986e-06,
"loss": 0.9202,
"num_tokens": 6661518176.0,
"step": 841
},
{
"epoch": 3.631692806922661,
"grad_norm": 0.2994382383584901,
"learning_rate": 5.341000812495387e-06,
"loss": 0.9006,
"num_tokens": 6669758163.0,
"step": 842
},
{
"epoch": 3.636019469983775,
"grad_norm": 0.2859783006285974,
"learning_rate": 5.3214800572833535e-06,
"loss": 0.9562,
"num_tokens": 6678002025.0,
"step": 843
},
{
"epoch": 3.640346133044889,
"grad_norm": 0.3007253324040871,
"learning_rate": 5.302003584310531e-06,
"loss": 0.9235,
"num_tokens": 6686250664.0,
"step": 844
},
{
"epoch": 3.6446727961060033,
"grad_norm": 0.29129203810321896,
"learning_rate": 5.282571545458361e-06,
"loss": 0.9554,
"num_tokens": 6694483330.0,
"step": 845
},
{
"epoch": 3.6489994591671175,
"grad_norm": 0.29382448761915053,
"learning_rate": 5.263184092261793e-06,
"loss": 0.9242,
"num_tokens": 6702400177.0,
"step": 846
},
{
"epoch": 3.6533261222282314,
"grad_norm": 0.27678930676739016,
"learning_rate": 5.243841375908079e-06,
"loss": 0.9637,
"num_tokens": 6710452713.0,
"step": 847
},
{
"epoch": 3.6576527852893457,
"grad_norm": 0.30873496314841525,
"learning_rate": 5.2245435472356075e-06,
"loss": 0.9235,
"num_tokens": 6718658666.0,
"step": 848
},
{
"epoch": 3.6619794483504595,
"grad_norm": 0.2702325244264808,
"learning_rate": 5.205290756732717e-06,
"loss": 0.9308,
"num_tokens": 6726891839.0,
"step": 849
},
{
"epoch": 3.666306111411574,
"grad_norm": 0.3630867653307208,
"learning_rate": 5.186083154536545e-06,
"loss": 0.946,
"num_tokens": 6735027364.0,
"step": 850
},
{
"epoch": 3.670632774472688,
"grad_norm": 0.2969734274544958,
"learning_rate": 5.166920890431822e-06,
"loss": 0.917,
"num_tokens": 6743405824.0,
"step": 851
},
{
"epoch": 3.674959437533802,
"grad_norm": 0.3437347470803757,
"learning_rate": 5.147804113849739e-06,
"loss": 0.9249,
"num_tokens": 6751720174.0,
"step": 852
},
{
"epoch": 3.6792861005949162,
"grad_norm": 0.3299578442270291,
"learning_rate": 5.128732973866764e-06,
"loss": 0.9397,
"num_tokens": 6759962969.0,
"step": 853
},
{
"epoch": 3.68361276365603,
"grad_norm": 0.2881740077030876,
"learning_rate": 5.109707619203468e-06,
"loss": 0.9229,
"num_tokens": 6768214898.0,
"step": 854
},
{
"epoch": 3.6879394267171444,
"grad_norm": 0.31574582431617765,
"learning_rate": 5.090728198223393e-06,
"loss": 0.9161,
"num_tokens": 6776411515.0,
"step": 855
},
{
"epoch": 3.6922660897782587,
"grad_norm": 0.27411621043292045,
"learning_rate": 5.071794858931875e-06,
"loss": 0.9408,
"num_tokens": 6784748226.0,
"step": 856
},
{
"epoch": 3.6965927528393725,
"grad_norm": 0.30409723396559724,
"learning_rate": 5.052907748974902e-06,
"loss": 0.8973,
"num_tokens": 6792880288.0,
"step": 857
},
{
"epoch": 3.700919415900487,
"grad_norm": 0.26352582652848877,
"learning_rate": 5.034067015637945e-06,
"loss": 0.9587,
"num_tokens": 6801081860.0,
"step": 858
},
{
"epoch": 3.7052460789616006,
"grad_norm": 0.2948163224706812,
"learning_rate": 5.015272805844829e-06,
"loss": 0.959,
"num_tokens": 6809396324.0,
"step": 859
},
{
"epoch": 3.709572742022715,
"grad_norm": 0.26728114208333664,
"learning_rate": 4.996525266156582e-06,
"loss": 0.9194,
"num_tokens": 6817556880.0,
"step": 860
},
{
"epoch": 3.713899405083829,
"grad_norm": 0.24917090858865323,
"learning_rate": 4.977824542770279e-06,
"loss": 0.8967,
"num_tokens": 6825826880.0,
"step": 861
},
{
"epoch": 3.718226068144943,
"grad_norm": 0.26223408456264524,
"learning_rate": 4.959170781517917e-06,
"loss": 0.8977,
"num_tokens": 6834120960.0,
"step": 862
},
{
"epoch": 3.7225527312060573,
"grad_norm": 0.24834835176052955,
"learning_rate": 4.940564127865276e-06,
"loss": 0.9078,
"num_tokens": 6842342130.0,
"step": 863
},
{
"epoch": 3.726879394267171,
"grad_norm": 0.28897403903709534,
"learning_rate": 4.92200472691078e-06,
"loss": 0.9533,
"num_tokens": 6850501851.0,
"step": 864
},
{
"epoch": 3.7312060573282855,
"grad_norm": 0.24176033099925953,
"learning_rate": 4.903492723384366e-06,
"loss": 0.9427,
"num_tokens": 6858679110.0,
"step": 865
},
{
"epoch": 3.7355327203893998,
"grad_norm": 1.2200892765164042,
"learning_rate": 4.885028261646354e-06,
"loss": 0.9313,
"num_tokens": 6866722996.0,
"step": 866
},
{
"epoch": 3.739859383450514,
"grad_norm": 0.4354124277947738,
"learning_rate": 4.866611485686323e-06,
"loss": 0.9647,
"num_tokens": 6874881319.0,
"step": 867
},
{
"epoch": 3.744186046511628,
"grad_norm": 0.3097484640168433,
"learning_rate": 4.848242539121998e-06,
"loss": 0.9158,
"num_tokens": 6882948710.0,
"step": 868
},
{
"epoch": 3.748512709572742,
"grad_norm": 0.3408052167056257,
"learning_rate": 4.8299215651981095e-06,
"loss": 0.9496,
"num_tokens": 6891090626.0,
"step": 869
},
{
"epoch": 3.752839372633856,
"grad_norm": 0.35810397924998577,
"learning_rate": 4.8116487067852945e-06,
"loss": 0.901,
"num_tokens": 6899242627.0,
"step": 870
},
{
"epoch": 3.7571660356949703,
"grad_norm": 0.34291620033093606,
"learning_rate": 4.793424106378972e-06,
"loss": 0.9192,
"num_tokens": 6907463249.0,
"step": 871
},
{
"epoch": 3.7614926987560846,
"grad_norm": 0.33243753034162843,
"learning_rate": 4.77524790609824e-06,
"loss": 0.9461,
"num_tokens": 6915673495.0,
"step": 872
},
{
"epoch": 3.7658193618171985,
"grad_norm": 0.32596987523166643,
"learning_rate": 4.7571202476847575e-06,
"loss": 0.8984,
"num_tokens": 6923786563.0,
"step": 873
},
{
"epoch": 3.7701460248783127,
"grad_norm": 0.33239664166194055,
"learning_rate": 4.739041272501643e-06,
"loss": 0.9011,
"num_tokens": 6931994777.0,
"step": 874
},
{
"epoch": 3.7744726879394266,
"grad_norm": 0.3224636410025114,
"learning_rate": 4.721011121532384e-06,
"loss": 0.9437,
"num_tokens": 6940150249.0,
"step": 875
},
{
"epoch": 3.778799351000541,
"grad_norm": 0.29121879157929753,
"learning_rate": 4.703029935379711e-06,
"loss": 0.9497,
"num_tokens": 6948396232.0,
"step": 876
},
{
"epoch": 3.783126014061655,
"grad_norm": 0.2916266453802922,
"learning_rate": 4.685097854264535e-06,
"loss": 0.9295,
"num_tokens": 6956364532.0,
"step": 877
},
{
"epoch": 3.787452677122769,
"grad_norm": 0.28830831706624727,
"learning_rate": 4.66721501802482e-06,
"loss": 0.9037,
"num_tokens": 6964475231.0,
"step": 878
},
{
"epoch": 3.7917793401838833,
"grad_norm": 0.25385471758857564,
"learning_rate": 4.649381566114517e-06,
"loss": 0.9157,
"num_tokens": 6972702655.0,
"step": 879
},
{
"epoch": 3.796106003244997,
"grad_norm": 0.2783858674776144,
"learning_rate": 4.631597637602465e-06,
"loss": 0.8843,
"num_tokens": 6980708146.0,
"step": 880
},
{
"epoch": 3.8004326663061114,
"grad_norm": 0.25631955554373037,
"learning_rate": 4.613863371171314e-06,
"loss": 0.9488,
"num_tokens": 6988956749.0,
"step": 881
},
{
"epoch": 3.8047593293672257,
"grad_norm": 0.2370426912422331,
"learning_rate": 4.5961789051164325e-06,
"loss": 0.9294,
"num_tokens": 6997191073.0,
"step": 882
},
{
"epoch": 3.8090859924283396,
"grad_norm": 0.25552438858767745,
"learning_rate": 4.578544377344841e-06,
"loss": 0.9112,
"num_tokens": 7005392482.0,
"step": 883
},
{
"epoch": 3.813412655489454,
"grad_norm": 0.2399956009221735,
"learning_rate": 4.560959925374133e-06,
"loss": 0.9182,
"num_tokens": 7013476301.0,
"step": 884
},
{
"epoch": 3.8177393185505677,
"grad_norm": 0.26400729271667883,
"learning_rate": 4.543425686331394e-06,
"loss": 0.9093,
"num_tokens": 7021567003.0,
"step": 885
},
{
"epoch": 3.822065981611682,
"grad_norm": 0.25155595917456375,
"learning_rate": 4.525941796952142e-06,
"loss": 0.9759,
"num_tokens": 7029905460.0,
"step": 886
},
{
"epoch": 3.8263926446727963,
"grad_norm": 0.2748577842617266,
"learning_rate": 4.5085083935792566e-06,
"loss": 0.9306,
"num_tokens": 7038155146.0,
"step": 887
},
{
"epoch": 3.83071930773391,
"grad_norm": 0.23872591495204412,
"learning_rate": 4.491125612161924e-06,
"loss": 0.9194,
"num_tokens": 7046355670.0,
"step": 888
},
{
"epoch": 3.8350459707950244,
"grad_norm": 0.2746878100367909,
"learning_rate": 4.47379358825456e-06,
"loss": 0.9471,
"num_tokens": 7054615035.0,
"step": 889
},
{
"epoch": 3.8393726338561383,
"grad_norm": 0.257786053865835,
"learning_rate": 4.456512457015775e-06,
"loss": 0.9375,
"num_tokens": 7062921452.0,
"step": 890
},
{
"epoch": 3.8436992969172525,
"grad_norm": 0.2546439958259274,
"learning_rate": 4.4392823532072984e-06,
"loss": 0.9681,
"num_tokens": 7071058240.0,
"step": 891
},
{
"epoch": 3.848025959978367,
"grad_norm": 0.25787071531179456,
"learning_rate": 4.422103411192941e-06,
"loss": 0.9433,
"num_tokens": 7079255328.0,
"step": 892
},
{
"epoch": 3.8523526230394807,
"grad_norm": 0.26573100227653684,
"learning_rate": 4.404975764937541e-06,
"loss": 0.9419,
"num_tokens": 7087533637.0,
"step": 893
},
{
"epoch": 3.856679286100595,
"grad_norm": 0.26310774628285044,
"learning_rate": 4.387899548005927e-06,
"loss": 0.9418,
"num_tokens": 7095658511.0,
"step": 894
},
{
"epoch": 3.861005949161709,
"grad_norm": 0.282182857985642,
"learning_rate": 4.370874893561872e-06,
"loss": 0.9541,
"num_tokens": 7103829697.0,
"step": 895
},
{
"epoch": 3.865332612222823,
"grad_norm": 0.2619523701917039,
"learning_rate": 4.353901934367045e-06,
"loss": 0.926,
"num_tokens": 7111825762.0,
"step": 896
},
{
"epoch": 3.8696592752839374,
"grad_norm": 0.264850106481059,
"learning_rate": 4.336980802779998e-06,
"loss": 0.9393,
"num_tokens": 7119948260.0,
"step": 897
},
{
"epoch": 3.8739859383450512,
"grad_norm": 0.2704427422262429,
"learning_rate": 4.320111630755109e-06,
"loss": 0.9488,
"num_tokens": 7128064886.0,
"step": 898
},
{
"epoch": 3.8783126014061655,
"grad_norm": 0.24688426282829962,
"learning_rate": 4.303294549841573e-06,
"loss": 0.9644,
"num_tokens": 7136276452.0,
"step": 899
},
{
"epoch": 3.8826392644672794,
"grad_norm": 0.24921212539281543,
"learning_rate": 4.286529691182362e-06,
"loss": 0.9675,
"num_tokens": 7144530885.0,
"step": 900
},
{
"epoch": 3.8869659275283936,
"grad_norm": 0.2491156891345615,
"learning_rate": 4.269817185513215e-06,
"loss": 0.9272,
"num_tokens": 7152678990.0,
"step": 901
},
{
"epoch": 3.891292590589508,
"grad_norm": 0.23412745486849854,
"learning_rate": 4.253157163161605e-06,
"loss": 0.8895,
"num_tokens": 7160980893.0,
"step": 902
},
{
"epoch": 3.8956192536506222,
"grad_norm": 0.2655564126452194,
"learning_rate": 4.236549754045737e-06,
"loss": 0.9073,
"num_tokens": 7169195089.0,
"step": 903
},
{
"epoch": 3.899945916711736,
"grad_norm": 0.2482515063514521,
"learning_rate": 4.2199950876735215e-06,
"loss": 0.9411,
"num_tokens": 7177386797.0,
"step": 904
},
{
"epoch": 3.90427257977285,
"grad_norm": 0.28405164933859445,
"learning_rate": 4.203493293141569e-06,
"loss": 0.9504,
"num_tokens": 7185698847.0,
"step": 905
},
{
"epoch": 3.908599242833964,
"grad_norm": 0.30931512212107404,
"learning_rate": 4.187044499134194e-06,
"loss": 0.9361,
"num_tokens": 7193881926.0,
"step": 906
},
{
"epoch": 3.9129259058950785,
"grad_norm": 0.2547147190501147,
"learning_rate": 4.170648833922391e-06,
"loss": 0.9185,
"num_tokens": 7202143917.0,
"step": 907
},
{
"epoch": 3.917252568956193,
"grad_norm": 0.2883629979901297,
"learning_rate": 4.154306425362856e-06,
"loss": 0.9031,
"num_tokens": 7210349690.0,
"step": 908
},
{
"epoch": 3.9215792320173066,
"grad_norm": 0.248809007780818,
"learning_rate": 4.1380174008969685e-06,
"loss": 0.9212,
"num_tokens": 7218471788.0,
"step": 909
},
{
"epoch": 3.925905895078421,
"grad_norm": 0.29335764844457096,
"learning_rate": 4.121781887549819e-06,
"loss": 0.9547,
"num_tokens": 7226737841.0,
"step": 910
},
{
"epoch": 3.9302325581395348,
"grad_norm": 0.23301711632951558,
"learning_rate": 4.1056000119292e-06,
"loss": 0.9091,
"num_tokens": 7234929843.0,
"step": 911
},
{
"epoch": 3.934559221200649,
"grad_norm": 0.2681121586933482,
"learning_rate": 4.089471900224625e-06,
"loss": 0.9547,
"num_tokens": 7243103469.0,
"step": 912
},
{
"epoch": 3.9388858842617633,
"grad_norm": 0.24013450588798013,
"learning_rate": 4.07339767820635e-06,
"loss": 0.9566,
"num_tokens": 7251346629.0,
"step": 913
},
{
"epoch": 3.943212547322877,
"grad_norm": 0.2774200992515359,
"learning_rate": 4.057377471224389e-06,
"loss": 0.9225,
"num_tokens": 7259431732.0,
"step": 914
},
{
"epoch": 3.9475392103839915,
"grad_norm": 0.2455641564191236,
"learning_rate": 4.041411404207534e-06,
"loss": 0.8953,
"num_tokens": 7267640813.0,
"step": 915
},
{
"epoch": 3.9518658734451053,
"grad_norm": 0.2966275021843376,
"learning_rate": 4.025499601662385e-06,
"loss": 0.922,
"num_tokens": 7275749532.0,
"step": 916
},
{
"epoch": 3.9561925365062196,
"grad_norm": 0.2640929917423771,
"learning_rate": 4.009642187672371e-06,
"loss": 0.9231,
"num_tokens": 7283963651.0,
"step": 917
},
{
"epoch": 3.960519199567334,
"grad_norm": 0.3246691150518756,
"learning_rate": 3.99383928589679e-06,
"loss": 0.901,
"num_tokens": 7292247142.0,
"step": 918
},
{
"epoch": 3.9648458626284477,
"grad_norm": 0.26463531126722656,
"learning_rate": 3.9780910195698505e-06,
"loss": 0.8923,
"num_tokens": 7300417116.0,
"step": 919
},
{
"epoch": 3.969172525689562,
"grad_norm": 0.35122691809047024,
"learning_rate": 3.9623975114996905e-06,
"loss": 0.932,
"num_tokens": 7308379545.0,
"step": 920
},
{
"epoch": 3.973499188750676,
"grad_norm": 0.2894341921143494,
"learning_rate": 3.946758884067443e-06,
"loss": 0.9398,
"num_tokens": 7316480967.0,
"step": 921
},
{
"epoch": 3.97782585181179,
"grad_norm": 0.30340316959991614,
"learning_rate": 3.9311752592262636e-06,
"loss": 0.9314,
"num_tokens": 7324483988.0,
"step": 922
},
{
"epoch": 3.9821525148729044,
"grad_norm": 0.2592529824891951,
"learning_rate": 3.915646758500391e-06,
"loss": 0.9429,
"num_tokens": 7332700887.0,
"step": 923
},
{
"epoch": 3.9864791779340183,
"grad_norm": 0.2684948193663238,
"learning_rate": 3.900173502984195e-06,
"loss": 0.9254,
"num_tokens": 7340983465.0,
"step": 924
},
{
"epoch": 3.9908058409951326,
"grad_norm": 0.24383973325622357,
"learning_rate": 3.884755613341223e-06,
"loss": 0.9226,
"num_tokens": 7349197836.0,
"step": 925
},
{
"epoch": 3.9951325040562464,
"grad_norm": 0.267690631400667,
"learning_rate": 3.8693932098032845e-06,
"loss": 0.9218,
"num_tokens": 7357431144.0,
"step": 926
},
{
"epoch": 3.9994591671173607,
"grad_norm": 0.2588911360407846,
"learning_rate": 3.854086412169482e-06,
"loss": 0.9086,
"num_tokens": 7365719207.0,
"step": 927
},
{
"epoch": 4.0,
"grad_norm": 0.5005024065190141,
"learning_rate": 3.838835339805301e-06,
"loss": 0.9425,
"num_tokens": 7366713790.0,
"step": 928
},
{
"epoch": 4.004326663061114,
"grad_norm": 0.3225210487220433,
"learning_rate": 3.8236401116416686e-06,
"loss": 0.9273,
"num_tokens": 7375010232.0,
"step": 929
},
{
"epoch": 4.008653326122229,
"grad_norm": 0.2579915323141952,
"learning_rate": 3.8085008461740245e-06,
"loss": 0.9168,
"num_tokens": 7383228901.0,
"step": 930
},
{
"epoch": 4.012979989183342,
"grad_norm": 0.2831742045415681,
"learning_rate": 3.7934176614614004e-06,
"loss": 0.9198,
"num_tokens": 7391491934.0,
"step": 931
},
{
"epoch": 4.017306652244456,
"grad_norm": 0.2737796783844379,
"learning_rate": 3.778390675125503e-06,
"loss": 0.9441,
"num_tokens": 7399677041.0,
"step": 932
},
{
"epoch": 4.0216333153055706,
"grad_norm": 0.255136002812028,
"learning_rate": 3.7634200043497886e-06,
"loss": 0.9298,
"num_tokens": 7407919210.0,
"step": 933
},
{
"epoch": 4.025959978366685,
"grad_norm": 0.26259596411462577,
"learning_rate": 3.7485057658785564e-06,
"loss": 0.92,
"num_tokens": 7415945637.0,
"step": 934
},
{
"epoch": 4.030286641427799,
"grad_norm": 0.24948824793026556,
"learning_rate": 3.733648076016035e-06,
"loss": 0.911,
"num_tokens": 7423795891.0,
"step": 935
},
{
"epoch": 4.0346133044889125,
"grad_norm": 0.29219230996482654,
"learning_rate": 3.7188470506254746e-06,
"loss": 0.9126,
"num_tokens": 7431984437.0,
"step": 936
},
{
"epoch": 4.038939967550027,
"grad_norm": 0.26890911344982393,
"learning_rate": 3.704102805128242e-06,
"loss": 0.9187,
"num_tokens": 7440332143.0,
"step": 937
},
{
"epoch": 4.043266630611141,
"grad_norm": 0.2798260115969542,
"learning_rate": 3.6894154545029255e-06,
"loss": 0.9472,
"num_tokens": 7448558223.0,
"step": 938
},
{
"epoch": 4.047593293672255,
"grad_norm": 0.2849548143296411,
"learning_rate": 3.6747851132844392e-06,
"loss": 0.9024,
"num_tokens": 7456775767.0,
"step": 939
},
{
"epoch": 4.05191995673337,
"grad_norm": 0.2604631125958381,
"learning_rate": 3.660211895563117e-06,
"loss": 0.9086,
"num_tokens": 7464832642.0,
"step": 940
},
{
"epoch": 4.056246619794483,
"grad_norm": 0.27136813930111225,
"learning_rate": 3.6456959149838443e-06,
"loss": 0.8951,
"num_tokens": 7472913404.0,
"step": 941
},
{
"epoch": 4.060573282855597,
"grad_norm": 0.2321722818439474,
"learning_rate": 3.6312372847451503e-06,
"loss": 0.9225,
"num_tokens": 7481058731.0,
"step": 942
},
{
"epoch": 4.064899945916712,
"grad_norm": 0.24363607914962288,
"learning_rate": 3.61683611759834e-06,
"loss": 0.8979,
"num_tokens": 7489381773.0,
"step": 943
},
{
"epoch": 4.069226608977826,
"grad_norm": 0.2512840779101482,
"learning_rate": 3.6024925258466028e-06,
"loss": 0.9016,
"num_tokens": 7497474453.0,
"step": 944
},
{
"epoch": 4.07355327203894,
"grad_norm": 0.2650459708817925,
"learning_rate": 3.5882066213441537e-06,
"loss": 0.9292,
"num_tokens": 7505713748.0,
"step": 945
},
{
"epoch": 4.077879935100054,
"grad_norm": 0.2628128175408556,
"learning_rate": 3.573978515495345e-06,
"loss": 0.9476,
"num_tokens": 7513871362.0,
"step": 946
},
{
"epoch": 4.082206598161168,
"grad_norm": 0.25397010876410847,
"learning_rate": 3.559808319253801e-06,
"loss": 0.9151,
"num_tokens": 7522106850.0,
"step": 947
},
{
"epoch": 4.086533261222282,
"grad_norm": 0.263659923128617,
"learning_rate": 3.545696143121563e-06,
"loss": 0.9468,
"num_tokens": 7530473282.0,
"step": 948
},
{
"epoch": 4.0908599242833965,
"grad_norm": 0.23015134804471984,
"learning_rate": 3.5316420971482133e-06,
"loss": 0.9174,
"num_tokens": 7538547933.0,
"step": 949
},
{
"epoch": 4.095186587344511,
"grad_norm": 0.26513242851859053,
"learning_rate": 3.5176462909300257e-06,
"loss": 0.9153,
"num_tokens": 7546860011.0,
"step": 950
},
{
"epoch": 4.099513250405625,
"grad_norm": 0.24911847035989185,
"learning_rate": 3.5037088336091054e-06,
"loss": 0.9399,
"num_tokens": 7555038327.0,
"step": 951
},
{
"epoch": 4.1038399134667385,
"grad_norm": 0.2528255498389864,
"learning_rate": 3.48982983387255e-06,
"loss": 0.9401,
"num_tokens": 7563202397.0,
"step": 952
},
{
"epoch": 4.108166576527853,
"grad_norm": 0.2784976002129969,
"learning_rate": 3.476009399951582e-06,
"loss": 0.9086,
"num_tokens": 7571320131.0,
"step": 953
},
{
"epoch": 4.112493239588967,
"grad_norm": 0.23426691089213464,
"learning_rate": 3.4622476396207254e-06,
"loss": 0.9384,
"num_tokens": 7579479607.0,
"step": 954
},
{
"epoch": 4.116819902650081,
"grad_norm": 0.24357078878404217,
"learning_rate": 3.4485446601969507e-06,
"loss": 0.8955,
"num_tokens": 7587468144.0,
"step": 955
},
{
"epoch": 4.121146565711196,
"grad_norm": 0.22163907535402036,
"learning_rate": 3.4349005685388424e-06,
"loss": 0.8934,
"num_tokens": 7595579912.0,
"step": 956
},
{
"epoch": 4.125473228772309,
"grad_norm": 0.23264492656762667,
"learning_rate": 3.4213154710457708e-06,
"loss": 0.9391,
"num_tokens": 7603924326.0,
"step": 957
},
{
"epoch": 4.129799891833423,
"grad_norm": 0.2433223472459945,
"learning_rate": 3.4077894736570515e-06,
"loss": 0.8828,
"num_tokens": 7611982955.0,
"step": 958
},
{
"epoch": 4.134126554894538,
"grad_norm": 0.23069127266197514,
"learning_rate": 3.3943226818511333e-06,
"loss": 0.8969,
"num_tokens": 7620306000.0,
"step": 959
},
{
"epoch": 4.138453217955652,
"grad_norm": 0.22614376605829753,
"learning_rate": 3.3809152006447587e-06,
"loss": 0.9137,
"num_tokens": 7628430275.0,
"step": 960
},
{
"epoch": 4.142779881016766,
"grad_norm": 0.24947724722258544,
"learning_rate": 3.367567134592167e-06,
"loss": 0.8879,
"num_tokens": 7636687515.0,
"step": 961
},
{
"epoch": 4.14710654407788,
"grad_norm": 0.21810329623060942,
"learning_rate": 3.354278587784253e-06,
"loss": 0.9061,
"num_tokens": 7644885409.0,
"step": 962
},
{
"epoch": 4.151433207138994,
"grad_norm": 0.23579980625448524,
"learning_rate": 3.341049663847775e-06,
"loss": 0.9298,
"num_tokens": 7653226663.0,
"step": 963
},
{
"epoch": 4.155759870200108,
"grad_norm": 0.2210900639251755,
"learning_rate": 3.3278804659445384e-06,
"loss": 0.9015,
"num_tokens": 7661515588.0,
"step": 964
},
{
"epoch": 4.1600865332612225,
"grad_norm": 0.25256619991347495,
"learning_rate": 3.3147710967705948e-06,
"loss": 0.9,
"num_tokens": 7669557188.0,
"step": 965
},
{
"epoch": 4.164413196322337,
"grad_norm": 0.2344894675908079,
"learning_rate": 3.301721658555441e-06,
"loss": 0.8805,
"num_tokens": 7677831073.0,
"step": 966
},
{
"epoch": 4.16873985938345,
"grad_norm": 0.252979642536075,
"learning_rate": 3.288732253061214e-06,
"loss": 0.9336,
"num_tokens": 7686006093.0,
"step": 967
},
{
"epoch": 4.173066522444564,
"grad_norm": 0.24604867580703016,
"learning_rate": 3.2758029815819105e-06,
"loss": 0.9023,
"num_tokens": 7694213953.0,
"step": 968
},
{
"epoch": 4.177393185505679,
"grad_norm": 0.2228523611045584,
"learning_rate": 3.2629339449425813e-06,
"loss": 0.9303,
"num_tokens": 7702384350.0,
"step": 969
},
{
"epoch": 4.181719848566793,
"grad_norm": 0.22509827369260485,
"learning_rate": 3.2501252434985642e-06,
"loss": 0.9264,
"num_tokens": 7710606640.0,
"step": 970
},
{
"epoch": 4.186046511627907,
"grad_norm": 0.2667456008002394,
"learning_rate": 3.237376977134683e-06,
"loss": 0.9378,
"num_tokens": 7718777881.0,
"step": 971
},
{
"epoch": 4.190373174689021,
"grad_norm": 0.22092392495232688,
"learning_rate": 3.2246892452644827e-06,
"loss": 0.9203,
"num_tokens": 7726989645.0,
"step": 972
},
{
"epoch": 4.194699837750135,
"grad_norm": 0.23367876249626124,
"learning_rate": 3.212062146829442e-06,
"loss": 0.8967,
"num_tokens": 7735155541.0,
"step": 973
},
{
"epoch": 4.199026500811249,
"grad_norm": 0.24919500016307755,
"learning_rate": 3.1994957802982153e-06,
"loss": 0.9472,
"num_tokens": 7743378650.0,
"step": 974
},
{
"epoch": 4.203353163872364,
"grad_norm": 0.2362455094117848,
"learning_rate": 3.1869902436658484e-06,
"loss": 0.9017,
"num_tokens": 7751497549.0,
"step": 975
},
{
"epoch": 4.207679826933478,
"grad_norm": 0.2344310291886097,
"learning_rate": 3.1745456344530296e-06,
"loss": 0.8933,
"num_tokens": 7759791744.0,
"step": 976
},
{
"epoch": 4.212006489994591,
"grad_norm": 0.25066147270392997,
"learning_rate": 3.1621620497053225e-06,
"loss": 0.9159,
"num_tokens": 7767921426.0,
"step": 977
},
{
"epoch": 4.2163331530557056,
"grad_norm": 0.23902995246327793,
"learning_rate": 3.149839585992407e-06,
"loss": 0.9059,
"num_tokens": 7776086986.0,
"step": 978
},
{
"epoch": 4.22065981611682,
"grad_norm": 0.25258997276156464,
"learning_rate": 3.1375783394073323e-06,
"loss": 0.9102,
"num_tokens": 7784151941.0,
"step": 979
},
{
"epoch": 4.224986479177934,
"grad_norm": 0.2643483528443113,
"learning_rate": 3.125378405565762e-06,
"loss": 0.9229,
"num_tokens": 7792420160.0,
"step": 980
},
{
"epoch": 4.229313142239048,
"grad_norm": 0.24031178404950676,
"learning_rate": 3.11323987960523e-06,
"loss": 0.9035,
"num_tokens": 7800617397.0,
"step": 981
},
{
"epoch": 4.233639805300163,
"grad_norm": 0.23933163316740128,
"learning_rate": 3.1011628561843982e-06,
"loss": 0.8876,
"num_tokens": 7808759242.0,
"step": 982
},
{
"epoch": 4.237966468361276,
"grad_norm": 0.25169308565149046,
"learning_rate": 3.0891474294823253e-06,
"loss": 0.9071,
"num_tokens": 7816911001.0,
"step": 983
},
{
"epoch": 4.24229313142239,
"grad_norm": 0.237314725332469,
"learning_rate": 3.0771936931977185e-06,
"loss": 0.8977,
"num_tokens": 7825012529.0,
"step": 984
},
{
"epoch": 4.246619794483505,
"grad_norm": 0.2173955562906754,
"learning_rate": 3.065301740548219e-06,
"loss": 0.9154,
"num_tokens": 7833313011.0,
"step": 985
},
{
"epoch": 4.250946457544619,
"grad_norm": 0.22898129847081045,
"learning_rate": 3.053471664269658e-06,
"loss": 0.8967,
"num_tokens": 7841562459.0,
"step": 986
},
{
"epoch": 4.255273120605732,
"grad_norm": 0.24221863163289162,
"learning_rate": 3.0417035566153498e-06,
"loss": 0.9092,
"num_tokens": 7849697964.0,
"step": 987
},
{
"epoch": 4.259599783666847,
"grad_norm": 0.2462244118669248,
"learning_rate": 3.029997509355361e-06,
"loss": 0.9092,
"num_tokens": 7857816562.0,
"step": 988
},
{
"epoch": 4.263926446727961,
"grad_norm": 0.23906292297331386,
"learning_rate": 3.018353613775798e-06,
"loss": 0.9227,
"num_tokens": 7866172435.0,
"step": 989
},
{
"epoch": 4.268253109789075,
"grad_norm": 0.26238496146659623,
"learning_rate": 3.0067719606781e-06,
"loss": 0.8959,
"num_tokens": 7874266917.0,
"step": 990
},
{
"epoch": 4.2725797728501895,
"grad_norm": 0.2406443914678014,
"learning_rate": 2.9952526403783227e-06,
"loss": 0.9318,
"num_tokens": 7882511531.0,
"step": 991
},
{
"epoch": 4.276906435911304,
"grad_norm": 0.2453279920036015,
"learning_rate": 2.9837957427064413e-06,
"loss": 0.9088,
"num_tokens": 7890688249.0,
"step": 992
},
{
"epoch": 4.281233098972417,
"grad_norm": 0.2513178827661643,
"learning_rate": 2.9724013570056436e-06,
"loss": 0.9286,
"num_tokens": 7898728597.0,
"step": 993
},
{
"epoch": 4.2855597620335315,
"grad_norm": 0.22432182690828315,
"learning_rate": 2.9610695721316396e-06,
"loss": 0.8988,
"num_tokens": 7906976032.0,
"step": 994
},
{
"epoch": 4.289886425094646,
"grad_norm": 0.2524437800844289,
"learning_rate": 2.949800476451963e-06,
"loss": 0.8918,
"num_tokens": 7915251932.0,
"step": 995
},
{
"epoch": 4.29421308815576,
"grad_norm": 0.23484108974910797,
"learning_rate": 2.9385941578452842e-06,
"loss": 0.9183,
"num_tokens": 7923323128.0,
"step": 996
},
{
"epoch": 4.298539751216874,
"grad_norm": 0.25577170783084596,
"learning_rate": 2.9274507037007312e-06,
"loss": 0.9555,
"num_tokens": 7931421185.0,
"step": 997
},
{
"epoch": 4.302866414277988,
"grad_norm": 0.2760603667425678,
"learning_rate": 2.9163702009171936e-06,
"loss": 0.9093,
"num_tokens": 7939532377.0,
"step": 998
},
{
"epoch": 4.307193077339102,
"grad_norm": 0.22875893145403134,
"learning_rate": 2.9053527359026605e-06,
"loss": 0.8907,
"num_tokens": 7947766698.0,
"step": 999
},
{
"epoch": 4.311519740400216,
"grad_norm": 0.2583150353816036,
"learning_rate": 2.8943983945735375e-06,
"loss": 0.9276,
"num_tokens": 7955887724.0,
"step": 1000
},
{
"epoch": 4.315846403461331,
"grad_norm": 0.24320219179355962,
"learning_rate": 2.883507262353977e-06,
"loss": 0.9101,
"num_tokens": 7963982677.0,
"step": 1001
},
{
"epoch": 4.320173066522445,
"grad_norm": 0.2833332765864319,
"learning_rate": 2.8726794241752163e-06,
"loss": 0.9044,
"num_tokens": 7972210481.0,
"step": 1002
},
{
"epoch": 4.324499729583558,
"grad_norm": 0.24214965298560892,
"learning_rate": 2.861914964474913e-06,
"loss": 0.9446,
"num_tokens": 7980398111.0,
"step": 1003
},
{
"epoch": 4.328826392644673,
"grad_norm": 0.24570245569395735,
"learning_rate": 2.8512139671964844e-06,
"loss": 0.9193,
"num_tokens": 7988615985.0,
"step": 1004
},
{
"epoch": 4.333153055705787,
"grad_norm": 0.2677223593998129,
"learning_rate": 2.8405765157884615e-06,
"loss": 0.9208,
"num_tokens": 7996771846.0,
"step": 1005
},
{
"epoch": 4.337479718766901,
"grad_norm": 0.22595326917303651,
"learning_rate": 2.830002693203823e-06,
"loss": 0.9052,
"num_tokens": 8004877666.0,
"step": 1006
},
{
"epoch": 4.3418063818280155,
"grad_norm": 0.2776838569441126,
"learning_rate": 2.8194925818993617e-06,
"loss": 0.8786,
"num_tokens": 8012805147.0,
"step": 1007
},
{
"epoch": 4.346133044889129,
"grad_norm": 0.23838814276876832,
"learning_rate": 2.8090462638350397e-06,
"loss": 0.9403,
"num_tokens": 8020848497.0,
"step": 1008
},
{
"epoch": 4.350459707950243,
"grad_norm": 0.23575983788350954,
"learning_rate": 2.7986638204733407e-06,
"loss": 0.9112,
"num_tokens": 8029064640.0,
"step": 1009
},
{
"epoch": 4.3547863710113575,
"grad_norm": 0.2451032311621356,
"learning_rate": 2.788345332778646e-06,
"loss": 0.9097,
"num_tokens": 8037282567.0,
"step": 1010
},
{
"epoch": 4.359113034072472,
"grad_norm": 1.1599064195215374,
"learning_rate": 2.778090881216592e-06,
"loss": 0.9209,
"num_tokens": 8045431558.0,
"step": 1011
},
{
"epoch": 4.363439697133586,
"grad_norm": 0.30873535879406216,
"learning_rate": 2.7679005457534557e-06,
"loss": 0.9147,
"num_tokens": 8053635644.0,
"step": 1012
},
{
"epoch": 4.367766360194699,
"grad_norm": 0.24702430360632943,
"learning_rate": 2.757774405855519e-06,
"loss": 0.8908,
"num_tokens": 8061854534.0,
"step": 1013
},
{
"epoch": 4.372093023255814,
"grad_norm": 0.2710714812381768,
"learning_rate": 2.747712540488454e-06,
"loss": 0.8975,
"num_tokens": 8070195917.0,
"step": 1014
},
{
"epoch": 4.376419686316928,
"grad_norm": 0.26762222390109924,
"learning_rate": 2.737715028116707e-06,
"loss": 0.9346,
"num_tokens": 8078364915.0,
"step": 1015
},
{
"epoch": 4.380746349378042,
"grad_norm": 0.24113215046758327,
"learning_rate": 2.727781946702891e-06,
"loss": 0.9001,
"num_tokens": 8086275582.0,
"step": 1016
},
{
"epoch": 4.385073012439157,
"grad_norm": 0.29603018439057366,
"learning_rate": 2.717913373707167e-06,
"loss": 0.9007,
"num_tokens": 8094238589.0,
"step": 1017
},
{
"epoch": 4.38939967550027,
"grad_norm": 0.2440085316018602,
"learning_rate": 2.708109386086653e-06,
"loss": 0.8845,
"num_tokens": 8102346728.0,
"step": 1018
},
{
"epoch": 4.393726338561384,
"grad_norm": 0.26675724694934916,
"learning_rate": 2.6983700602948116e-06,
"loss": 0.9249,
"num_tokens": 8110578253.0,
"step": 1019
},
{
"epoch": 4.398053001622499,
"grad_norm": 0.2681087433723698,
"learning_rate": 2.68869547228086e-06,
"loss": 0.8965,
"num_tokens": 8118504559.0,
"step": 1020
},
{
"epoch": 4.402379664683613,
"grad_norm": 0.21927043163651067,
"learning_rate": 2.679085697489183e-06,
"loss": 0.8941,
"num_tokens": 8126743920.0,
"step": 1021
},
{
"epoch": 4.406706327744727,
"grad_norm": 0.27323929746962095,
"learning_rate": 2.6695408108587314e-06,
"loss": 0.9277,
"num_tokens": 8135026189.0,
"step": 1022
},
{
"epoch": 4.411032990805841,
"grad_norm": 0.22857677582074945,
"learning_rate": 2.6600608868224516e-06,
"loss": 0.9125,
"num_tokens": 8143175762.0,
"step": 1023
},
{
"epoch": 4.415359653866955,
"grad_norm": 0.25364165355402224,
"learning_rate": 2.6506459993066918e-06,
"loss": 0.9513,
"num_tokens": 8151388760.0,
"step": 1024
},
{
"epoch": 4.419686316928069,
"grad_norm": 0.22639641169387734,
"learning_rate": 2.6412962217306415e-06,
"loss": 0.9668,
"num_tokens": 8159573555.0,
"step": 1025
},
{
"epoch": 4.424012979989183,
"grad_norm": 0.22430407380048098,
"learning_rate": 2.632011627005738e-06,
"loss": 0.8929,
"num_tokens": 8167719180.0,
"step": 1026
},
{
"epoch": 4.428339643050298,
"grad_norm": 0.23948485782246384,
"learning_rate": 2.6227922875351196e-06,
"loss": 0.8916,
"num_tokens": 8175831208.0,
"step": 1027
},
{
"epoch": 4.432666306111411,
"grad_norm": 0.24643284137803,
"learning_rate": 2.6136382752130486e-06,
"loss": 0.938,
"num_tokens": 8184024547.0,
"step": 1028
},
{
"epoch": 4.436992969172525,
"grad_norm": 0.24582088544418418,
"learning_rate": 2.6045496614243485e-06,
"loss": 0.8963,
"num_tokens": 8192200295.0,
"step": 1029
},
{
"epoch": 4.44131963223364,
"grad_norm": 0.2440944566577538,
"learning_rate": 2.5955265170438593e-06,
"loss": 0.8883,
"num_tokens": 8200316579.0,
"step": 1030
},
{
"epoch": 4.445646295294754,
"grad_norm": 0.24904469869515644,
"learning_rate": 2.5865689124358707e-06,
"loss": 0.9436,
"num_tokens": 8208558171.0,
"step": 1031
},
{
"epoch": 4.449972958355868,
"grad_norm": 0.24911259576594413,
"learning_rate": 2.5776769174535835e-06,
"loss": 0.9147,
"num_tokens": 8216669746.0,
"step": 1032
},
{
"epoch": 4.4542996214169825,
"grad_norm": 0.2234007333497611,
"learning_rate": 2.56885060143856e-06,
"loss": 0.9154,
"num_tokens": 8224995031.0,
"step": 1033
},
{
"epoch": 4.458626284478096,
"grad_norm": 0.24266587214310398,
"learning_rate": 2.560090033220187e-06,
"loss": 0.8957,
"num_tokens": 8233114355.0,
"step": 1034
},
{
"epoch": 4.46295294753921,
"grad_norm": 0.24147325182416088,
"learning_rate": 2.5513952811151338e-06,
"loss": 0.9117,
"num_tokens": 8241441116.0,
"step": 1035
},
{
"epoch": 4.4672796106003245,
"grad_norm": 0.2370461402183908,
"learning_rate": 2.5427664129268253e-06,
"loss": 0.8981,
"num_tokens": 8249541761.0,
"step": 1036
},
{
"epoch": 4.471606273661439,
"grad_norm": 0.23708579704850133,
"learning_rate": 2.5342034959449075e-06,
"loss": 0.9417,
"num_tokens": 8257714724.0,
"step": 1037
},
{
"epoch": 4.475932936722553,
"grad_norm": 0.23455593822165277,
"learning_rate": 2.5257065969447297e-06,
"loss": 0.9377,
"num_tokens": 8265816974.0,
"step": 1038
},
{
"epoch": 4.4802595997836665,
"grad_norm": 0.21487013012395628,
"learning_rate": 2.5172757821868144e-06,
"loss": 0.9327,
"num_tokens": 8273974273.0,
"step": 1039
},
{
"epoch": 4.484586262844781,
"grad_norm": 0.2254553909706869,
"learning_rate": 2.5089111174163483e-06,
"loss": 0.934,
"num_tokens": 8282132554.0,
"step": 1040
},
{
"epoch": 4.488912925905895,
"grad_norm": 0.2131771890264051,
"learning_rate": 2.5006126678626714e-06,
"loss": 0.8974,
"num_tokens": 8290234332.0,
"step": 1041
},
{
"epoch": 4.493239588967009,
"grad_norm": 0.22236682158923896,
"learning_rate": 2.492380498238756e-06,
"loss": 0.927,
"num_tokens": 8298413944.0,
"step": 1042
},
{
"epoch": 4.497566252028124,
"grad_norm": 0.2121975495704575,
"learning_rate": 2.4842146727407173e-06,
"loss": 0.9284,
"num_tokens": 8306583885.0,
"step": 1043
},
{
"epoch": 4.501892915089237,
"grad_norm": 0.2169856691232569,
"learning_rate": 2.4761152550473024e-06,
"loss": 0.9212,
"num_tokens": 8314795632.0,
"step": 1044
},
{
"epoch": 4.506219578150351,
"grad_norm": 0.22019170298331212,
"learning_rate": 2.468082308319397e-06,
"loss": 0.906,
"num_tokens": 8323018451.0,
"step": 1045
},
{
"epoch": 4.510546241211466,
"grad_norm": 0.2198386279437345,
"learning_rate": 2.4601158951995314e-06,
"loss": 0.9064,
"num_tokens": 8330954402.0,
"step": 1046
},
{
"epoch": 4.51487290427258,
"grad_norm": 0.21923668460774162,
"learning_rate": 2.4522160778113993e-06,
"loss": 0.9015,
"num_tokens": 8339229368.0,
"step": 1047
},
{
"epoch": 4.519199567333694,
"grad_norm": 0.21892201525761043,
"learning_rate": 2.44438291775936e-06,
"loss": 0.9273,
"num_tokens": 8347465643.0,
"step": 1048
},
{
"epoch": 4.523526230394808,
"grad_norm": 0.21551779191440174,
"learning_rate": 2.4366164761279707e-06,
"loss": 0.9187,
"num_tokens": 8355626064.0,
"step": 1049
},
{
"epoch": 4.527852893455922,
"grad_norm": 0.22282930039690238,
"learning_rate": 2.4289168134815065e-06,
"loss": 0.9576,
"num_tokens": 8363857534.0,
"step": 1050
},
{
"epoch": 4.532179556517036,
"grad_norm": 0.20982941027778246,
"learning_rate": 2.421283989863478e-06,
"loss": 0.8772,
"num_tokens": 8372105463.0,
"step": 1051
},
{
"epoch": 4.5365062195781505,
"grad_norm": 0.22455423628788954,
"learning_rate": 2.41371806479618e-06,
"loss": 0.9402,
"num_tokens": 8380368203.0,
"step": 1052
},
{
"epoch": 4.540832882639265,
"grad_norm": 0.23091228772734626,
"learning_rate": 2.406219097280214e-06,
"loss": 0.9479,
"num_tokens": 8388521209.0,
"step": 1053
},
{
"epoch": 4.545159545700379,
"grad_norm": 0.2398615660642649,
"learning_rate": 2.398787145794037e-06,
"loss": 0.9416,
"num_tokens": 8396627451.0,
"step": 1054
},
{
"epoch": 4.5494862087614925,
"grad_norm": 0.23939806557885646,
"learning_rate": 2.3914222682934986e-06,
"loss": 0.9161,
"num_tokens": 8404837650.0,
"step": 1055
},
{
"epoch": 4.553812871822607,
"grad_norm": 0.24446417367897916,
"learning_rate": 2.3841245222113953e-06,
"loss": 0.9306,
"num_tokens": 8412812351.0,
"step": 1056
},
{
"epoch": 4.558139534883721,
"grad_norm": 0.22572000773778622,
"learning_rate": 2.3768939644570143e-06,
"loss": 0.8796,
"num_tokens": 8420971902.0,
"step": 1057
},
{
"epoch": 4.562466197944835,
"grad_norm": 0.24474536426880072,
"learning_rate": 2.3697306514156978e-06,
"loss": 0.9244,
"num_tokens": 8429225388.0,
"step": 1058
},
{
"epoch": 4.566792861005949,
"grad_norm": 0.21928892468156305,
"learning_rate": 2.3626346389484005e-06,
"loss": 0.9096,
"num_tokens": 8437382172.0,
"step": 1059
},
{
"epoch": 4.571119524067063,
"grad_norm": 0.22842290654280803,
"learning_rate": 2.3556059823912524e-06,
"loss": 0.8858,
"num_tokens": 8445391658.0,
"step": 1060
},
{
"epoch": 4.575446187128177,
"grad_norm": 0.23251949213565723,
"learning_rate": 2.34864473655513e-06,
"loss": 0.9307,
"num_tokens": 8453520637.0,
"step": 1061
},
{
"epoch": 4.579772850189292,
"grad_norm": 0.21342174591976593,
"learning_rate": 2.341750955725227e-06,
"loss": 0.9321,
"num_tokens": 8461655842.0,
"step": 1062
},
{
"epoch": 4.584099513250406,
"grad_norm": 0.2542248578765479,
"learning_rate": 2.334924693660631e-06,
"loss": 0.9073,
"num_tokens": 8469896692.0,
"step": 1063
},
{
"epoch": 4.58842617631152,
"grad_norm": 0.24425754732071317,
"learning_rate": 2.328166003593904e-06,
"loss": 0.9106,
"num_tokens": 8478037823.0,
"step": 1064
},
{
"epoch": 4.592752839372634,
"grad_norm": 0.23248234618828792,
"learning_rate": 2.3214749382306696e-06,
"loss": 0.9215,
"num_tokens": 8486264180.0,
"step": 1065
},
{
"epoch": 4.597079502433748,
"grad_norm": 0.23757606431595135,
"learning_rate": 2.3148515497491976e-06,
"loss": 0.9219,
"num_tokens": 8494182868.0,
"step": 1066
},
{
"epoch": 4.601406165494862,
"grad_norm": 0.2561573985410888,
"learning_rate": 2.308295889800004e-06,
"loss": 0.9215,
"num_tokens": 8502445035.0,
"step": 1067
},
{
"epoch": 4.605732828555976,
"grad_norm": 0.23162663966204003,
"learning_rate": 2.3018080095054418e-06,
"loss": 0.9111,
"num_tokens": 8510724601.0,
"step": 1068
},
{
"epoch": 4.61005949161709,
"grad_norm": 0.26717575358601775,
"learning_rate": 2.2953879594593046e-06,
"loss": 0.9489,
"num_tokens": 8518808391.0,
"step": 1069
},
{
"epoch": 4.614386154678204,
"grad_norm": 0.2529372514931457,
"learning_rate": 2.2890357897264325e-06,
"loss": 0.928,
"num_tokens": 8526913901.0,
"step": 1070
},
{
"epoch": 4.618712817739318,
"grad_norm": 0.21355116829146173,
"learning_rate": 2.2827515498423204e-06,
"loss": 0.9649,
"num_tokens": 8534983780.0,
"step": 1071
},
{
"epoch": 4.623039480800433,
"grad_norm": 0.2456451804025508,
"learning_rate": 2.276535288812734e-06,
"loss": 0.9306,
"num_tokens": 8543253072.0,
"step": 1072
},
{
"epoch": 4.627366143861547,
"grad_norm": 0.230007736093212,
"learning_rate": 2.2703870551133246e-06,
"loss": 0.9239,
"num_tokens": 8551528178.0,
"step": 1073
},
{
"epoch": 4.631692806922661,
"grad_norm": 0.22045377494054158,
"learning_rate": 2.264306896689255e-06,
"loss": 0.9196,
"num_tokens": 8559817119.0,
"step": 1074
},
{
"epoch": 4.636019469983775,
"grad_norm": 0.2556158783264956,
"learning_rate": 2.2582948609548205e-06,
"loss": 0.9377,
"num_tokens": 8567832779.0,
"step": 1075
},
{
"epoch": 4.640346133044889,
"grad_norm": 0.2648793592168965,
"learning_rate": 2.2523509947930847e-06,
"loss": 0.9121,
"num_tokens": 8576038992.0,
"step": 1076
},
{
"epoch": 4.644672796106003,
"grad_norm": 0.2425175080351857,
"learning_rate": 2.2464753445555083e-06,
"loss": 0.9136,
"num_tokens": 8584231271.0,
"step": 1077
},
{
"epoch": 4.6489994591671175,
"grad_norm": 0.24879039753969864,
"learning_rate": 2.2406679560615948e-06,
"loss": 0.9131,
"num_tokens": 8592443058.0,
"step": 1078
},
{
"epoch": 4.653326122228232,
"grad_norm": 0.22681872861351485,
"learning_rate": 2.2349288745985235e-06,
"loss": 0.8941,
"num_tokens": 8600701259.0,
"step": 1079
},
{
"epoch": 4.657652785289345,
"grad_norm": 0.2513200301630142,
"learning_rate": 2.229258144920805e-06,
"loss": 0.8976,
"num_tokens": 8609010693.0,
"step": 1080
},
{
"epoch": 4.6619794483504595,
"grad_norm": 0.22219846006857522,
"learning_rate": 2.223655811249931e-06,
"loss": 0.952,
"num_tokens": 8617103908.0,
"step": 1081
},
{
"epoch": 4.666306111411574,
"grad_norm": 0.22978741078825024,
"learning_rate": 2.218121917274023e-06,
"loss": 0.9114,
"num_tokens": 8625308325.0,
"step": 1082
},
{
"epoch": 4.670632774472688,
"grad_norm": 0.23640956812538738,
"learning_rate": 2.2126565061474972e-06,
"loss": 0.9322,
"num_tokens": 8633613797.0,
"step": 1083
},
{
"epoch": 4.674959437533802,
"grad_norm": 0.23087974818902274,
"learning_rate": 2.207259620490727e-06,
"loss": 0.8983,
"num_tokens": 8641914517.0,
"step": 1084
},
{
"epoch": 4.679286100594916,
"grad_norm": 0.32959187451424254,
"learning_rate": 2.2019313023897142e-06,
"loss": 0.8854,
"num_tokens": 8649959913.0,
"step": 1085
},
{
"epoch": 4.68361276365603,
"grad_norm": 0.24230388409008827,
"learning_rate": 2.1966715933957493e-06,
"loss": 0.9461,
"num_tokens": 8657909497.0,
"step": 1086
},
{
"epoch": 4.687939426717144,
"grad_norm": 0.22642078906501323,
"learning_rate": 2.191480534525103e-06,
"loss": 0.886,
"num_tokens": 8666053678.0,
"step": 1087
},
{
"epoch": 4.692266089778259,
"grad_norm": 0.22694813074036918,
"learning_rate": 2.1863581662586945e-06,
"loss": 0.9011,
"num_tokens": 8674210962.0,
"step": 1088
},
{
"epoch": 4.696592752839373,
"grad_norm": 0.2298824888061183,
"learning_rate": 2.1813045285417785e-06,
"loss": 0.9267,
"num_tokens": 8682409942.0,
"step": 1089
},
{
"epoch": 4.700919415900486,
"grad_norm": 0.23919688355935076,
"learning_rate": 2.1763196607836393e-06,
"loss": 0.9098,
"num_tokens": 8690511024.0,
"step": 1090
},
{
"epoch": 4.705246078961601,
"grad_norm": 0.21956784819647854,
"learning_rate": 2.1714036018572764e-06,
"loss": 0.9384,
"num_tokens": 8698559389.0,
"step": 1091
},
{
"epoch": 4.709572742022715,
"grad_norm": 0.24610989261964578,
"learning_rate": 2.1665563900991043e-06,
"loss": 0.9225,
"num_tokens": 8706586074.0,
"step": 1092
},
{
"epoch": 4.713899405083829,
"grad_norm": 0.21977533775137006,
"learning_rate": 2.1617780633086545e-06,
"loss": 0.9376,
"num_tokens": 8714666218.0,
"step": 1093
},
{
"epoch": 4.7182260681449435,
"grad_norm": 0.213849406897767,
"learning_rate": 2.1570686587482796e-06,
"loss": 0.9224,
"num_tokens": 8722796038.0,
"step": 1094
},
{
"epoch": 4.722552731206058,
"grad_norm": 0.22286159150427315,
"learning_rate": 2.1524282131428615e-06,
"loss": 0.939,
"num_tokens": 8730944398.0,
"step": 1095
},
{
"epoch": 4.726879394267171,
"grad_norm": 0.21529351177056805,
"learning_rate": 2.147856762679528e-06,
"loss": 0.938,
"num_tokens": 8739279574.0,
"step": 1096
},
{
"epoch": 4.7312060573282855,
"grad_norm": 0.20965774958349334,
"learning_rate": 2.143354343007367e-06,
"loss": 0.9062,
"num_tokens": 8747329404.0,
"step": 1097
},
{
"epoch": 4.7355327203894,
"grad_norm": 0.20617803221216705,
"learning_rate": 2.1389209892371525e-06,
"loss": 0.923,
"num_tokens": 8755606713.0,
"step": 1098
},
{
"epoch": 4.739859383450514,
"grad_norm": 0.21016031766306656,
"learning_rate": 2.1345567359410665e-06,
"loss": 0.8719,
"num_tokens": 8763860984.0,
"step": 1099
},
{
"epoch": 4.7441860465116275,
"grad_norm": 0.20384128159589965,
"learning_rate": 2.1302616171524356e-06,
"loss": 0.9181,
"num_tokens": 8772171202.0,
"step": 1100
},
{
"epoch": 4.748512709572742,
"grad_norm": 0.20893323187166496,
"learning_rate": 2.1260356663654562e-06,
"loss": 0.9057,
"num_tokens": 8780360379.0,
"step": 1101
},
{
"epoch": 4.752839372633856,
"grad_norm": 0.20612438954780368,
"learning_rate": 2.1218789165349425e-06,
"loss": 0.9152,
"num_tokens": 8788673205.0,
"step": 1102
},
{
"epoch": 4.75716603569497,
"grad_norm": 0.2125809442142549,
"learning_rate": 2.117791400076065e-06,
"loss": 0.9329,
"num_tokens": 8796892652.0,
"step": 1103
},
{
"epoch": 4.761492698756085,
"grad_norm": 0.20743329950388847,
"learning_rate": 2.113773148864097e-06,
"loss": 0.9149,
"num_tokens": 8805045905.0,
"step": 1104
},
{
"epoch": 4.765819361817199,
"grad_norm": 0.20929881513400472,
"learning_rate": 2.1098241942341703e-06,
"loss": 0.905,
"num_tokens": 8813300823.0,
"step": 1105
},
{
"epoch": 4.770146024878312,
"grad_norm": 0.21793262700048058,
"learning_rate": 2.105944566981025e-06,
"loss": 0.919,
"num_tokens": 8821469096.0,
"step": 1106
},
{
"epoch": 4.774472687939427,
"grad_norm": 0.21402361258647987,
"learning_rate": 2.1021342973587747e-06,
"loss": 0.9063,
"num_tokens": 8829574419.0,
"step": 1107
},
{
"epoch": 4.778799351000541,
"grad_norm": 0.20268959562105737,
"learning_rate": 2.098393415080667e-06,
"loss": 0.9449,
"num_tokens": 8837909937.0,
"step": 1108
},
{
"epoch": 4.783126014061655,
"grad_norm": 0.21873853332943569,
"learning_rate": 2.0947219493188515e-06,
"loss": 0.8878,
"num_tokens": 8845922366.0,
"step": 1109
},
{
"epoch": 4.7874526771227695,
"grad_norm": 0.22427043627143187,
"learning_rate": 2.0911199287041585e-06,
"loss": 0.9175,
"num_tokens": 8854195314.0,
"step": 1110
},
{
"epoch": 4.791779340183883,
"grad_norm": 0.22242942729652762,
"learning_rate": 2.087587381325867e-06,
"loss": 0.9302,
"num_tokens": 8862237356.0,
"step": 1111
},
{
"epoch": 4.796106003244997,
"grad_norm": 0.22691459369076583,
"learning_rate": 2.0841243347314926e-06,
"loss": 0.9207,
"num_tokens": 8870498074.0,
"step": 1112
},
{
"epoch": 4.800432666306111,
"grad_norm": 0.21029802017820462,
"learning_rate": 2.080730815926566e-06,
"loss": 0.908,
"num_tokens": 8878725976.0,
"step": 1113
},
{
"epoch": 4.804759329367226,
"grad_norm": 0.2238375831744874,
"learning_rate": 2.0774068513744294e-06,
"loss": 0.9154,
"num_tokens": 8886863134.0,
"step": 1114
},
{
"epoch": 4.80908599242834,
"grad_norm": 0.23640277975937646,
"learning_rate": 2.0741524669960258e-06,
"loss": 0.9058,
"num_tokens": 8894933619.0,
"step": 1115
},
{
"epoch": 4.813412655489453,
"grad_norm": 0.20220296661913104,
"learning_rate": 2.0709676881697004e-06,
"loss": 0.8861,
"num_tokens": 8903264869.0,
"step": 1116
},
{
"epoch": 4.817739318550568,
"grad_norm": 0.23727722007354193,
"learning_rate": 2.0678525397309945e-06,
"loss": 0.9271,
"num_tokens": 8911421588.0,
"step": 1117
},
{
"epoch": 4.822065981611682,
"grad_norm": 0.23235343963895627,
"learning_rate": 2.0648070459724656e-06,
"loss": 0.924,
"num_tokens": 8919591119.0,
"step": 1118
},
{
"epoch": 4.826392644672796,
"grad_norm": 0.2199249951981579,
"learning_rate": 2.061831230643482e-06,
"loss": 0.9049,
"num_tokens": 8927950607.0,
"step": 1119
},
{
"epoch": 4.830719307733911,
"grad_norm": 0.2194764604982523,
"learning_rate": 2.0589251169500524e-06,
"loss": 0.921,
"num_tokens": 8936155001.0,
"step": 1120
},
{
"epoch": 4.835045970795024,
"grad_norm": 0.23153319677802373,
"learning_rate": 2.056088727554633e-06,
"loss": 0.8995,
"num_tokens": 8944442434.0,
"step": 1121
},
{
"epoch": 4.839372633856138,
"grad_norm": 0.22249165496560577,
"learning_rate": 2.0533220845759586e-06,
"loss": 0.8921,
"num_tokens": 8952767111.0,
"step": 1122
},
{
"epoch": 4.8436992969172525,
"grad_norm": 0.19670623434078507,
"learning_rate": 2.0506252095888685e-06,
"loss": 0.8599,
"num_tokens": 8961063920.0,
"step": 1123
},
{
"epoch": 4.848025959978367,
"grad_norm": 0.20818885095654868,
"learning_rate": 2.0479981236241335e-06,
"loss": 0.9202,
"num_tokens": 8969286529.0,
"step": 1124
},
{
"epoch": 4.852352623039481,
"grad_norm": 0.2512842270408484,
"learning_rate": 2.0454408471682986e-06,
"loss": 0.9138,
"num_tokens": 8977525391.0,
"step": 1125
},
{
"epoch": 4.856679286100595,
"grad_norm": 0.20124864644138032,
"learning_rate": 2.0429534001635194e-06,
"loss": 0.9072,
"num_tokens": 8985751690.0,
"step": 1126
},
{
"epoch": 4.861005949161709,
"grad_norm": 0.22703475290609854,
"learning_rate": 2.0405358020074076e-06,
"loss": 0.9098,
"num_tokens": 8993976990.0,
"step": 1127
},
{
"epoch": 4.865332612222823,
"grad_norm": 0.23749997007783907,
"learning_rate": 2.0381880715528786e-06,
"loss": 0.8907,
"num_tokens": 9002137261.0,
"step": 1128
},
{
"epoch": 4.869659275283937,
"grad_norm": 0.21976926195972712,
"learning_rate": 2.0359102271080062e-06,
"loss": 0.9438,
"num_tokens": 9010389288.0,
"step": 1129
},
{
"epoch": 4.873985938345052,
"grad_norm": 0.24792727403538053,
"learning_rate": 2.0337022864358786e-06,
"loss": 0.906,
"num_tokens": 9018658763.0,
"step": 1130
},
{
"epoch": 4.878312601406165,
"grad_norm": 0.2614860295951354,
"learning_rate": 2.031564266754461e-06,
"loss": 0.914,
"num_tokens": 9026699766.0,
"step": 1131
},
{
"epoch": 4.882639264467279,
"grad_norm": 0.2450275316189634,
"learning_rate": 2.0294961847364616e-06,
"loss": 0.9317,
"num_tokens": 9034918076.0,
"step": 1132
},
{
"epoch": 4.886965927528394,
"grad_norm": 0.24200253390740495,
"learning_rate": 2.0274980565091975e-06,
"loss": 0.8878,
"num_tokens": 9043115999.0,
"step": 1133
},
{
"epoch": 4.891292590589508,
"grad_norm": 0.2320164712307313,
"learning_rate": 2.025569897654475e-06,
"loss": 0.9276,
"num_tokens": 9051203798.0,
"step": 1134
},
{
"epoch": 4.895619253650622,
"grad_norm": 0.243473369606954,
"learning_rate": 2.0237117232084633e-06,
"loss": 0.9349,
"num_tokens": 9059325975.0,
"step": 1135
},
{
"epoch": 4.8999459167117365,
"grad_norm": 0.23928329522353953,
"learning_rate": 2.0219235476615828e-06,
"loss": 0.9132,
"num_tokens": 9067540449.0,
"step": 1136
},
{
"epoch": 4.90427257977285,
"grad_norm": 0.22430652521916916,
"learning_rate": 2.0202053849583807e-06,
"loss": 0.9032,
"num_tokens": 9075738214.0,
"step": 1137
},
{
"epoch": 4.908599242833964,
"grad_norm": 0.2096685655932549,
"learning_rate": 2.0185572484974404e-06,
"loss": 0.9093,
"num_tokens": 9083846921.0,
"step": 1138
},
{
"epoch": 4.9129259058950785,
"grad_norm": 0.21341345052548796,
"learning_rate": 2.0169791511312564e-06,
"loss": 0.9164,
"num_tokens": 9091982051.0,
"step": 1139
},
{
"epoch": 4.917252568956193,
"grad_norm": 0.24338222425544925,
"learning_rate": 2.0154711051661524e-06,
"loss": 0.9296,
"num_tokens": 9100243162.0,
"step": 1140
},
{
"epoch": 4.921579232017306,
"grad_norm": 0.22192657024672127,
"learning_rate": 2.014033122362171e-06,
"loss": 0.8941,
"num_tokens": 9108550126.0,
"step": 1141
},
{
"epoch": 4.9259058950784205,
"grad_norm": 0.22250281726737248,
"learning_rate": 2.0126652139329934e-06,
"loss": 0.9376,
"num_tokens": 9116730576.0,
"step": 1142
},
{
"epoch": 4.930232558139535,
"grad_norm": 0.21835212746251859,
"learning_rate": 2.0113673905458433e-06,
"loss": 0.9288,
"num_tokens": 9125114079.0,
"step": 1143
},
{
"epoch": 4.934559221200649,
"grad_norm": 0.2359765746718839,
"learning_rate": 2.0101396623214068e-06,
"loss": 0.937,
"num_tokens": 9133266569.0,
"step": 1144
},
{
"epoch": 4.938885884261763,
"grad_norm": 0.22227350570305626,
"learning_rate": 2.008982038833758e-06,
"loss": 0.9288,
"num_tokens": 9141497993.0,
"step": 1145
},
{
"epoch": 4.943212547322878,
"grad_norm": 0.2312417625381218,
"learning_rate": 2.0078945291102746e-06,
"loss": 0.8898,
"num_tokens": 9149640610.0,
"step": 1146
},
{
"epoch": 4.947539210383991,
"grad_norm": 0.22458979646702876,
"learning_rate": 2.0068771416315785e-06,
"loss": 0.9012,
"num_tokens": 9157856842.0,
"step": 1147
},
{
"epoch": 4.951865873445105,
"grad_norm": 0.22778028587745122,
"learning_rate": 2.0059298843314594e-06,
"loss": 0.9213,
"num_tokens": 9166201687.0,
"step": 1148
},
{
"epoch": 4.95619253650622,
"grad_norm": 0.22936925584883466,
"learning_rate": 2.005052764596822e-06,
"loss": 0.945,
"num_tokens": 9174460788.0,
"step": 1149
},
{
"epoch": 4.960519199567334,
"grad_norm": 0.21176895797925974,
"learning_rate": 2.0042457892676203e-06,
"loss": 0.905,
"num_tokens": 9182715086.0,
"step": 1150
},
{
"epoch": 4.964845862628448,
"grad_norm": 0.2267907836662072,
"learning_rate": 2.003508964636811e-06,
"loss": 0.9195,
"num_tokens": 9190927902.0,
"step": 1151
},
{
"epoch": 4.969172525689562,
"grad_norm": 0.20904877816094458,
"learning_rate": 2.0028422964503007e-06,
"loss": 0.9175,
"num_tokens": 9199252989.0,
"step": 1152
},
{
"epoch": 4.973499188750676,
"grad_norm": 0.21745305262154316,
"learning_rate": 2.002245789906901e-06,
"loss": 0.9088,
"num_tokens": 9207627705.0,
"step": 1153
},
{
"epoch": 4.97782585181179,
"grad_norm": 0.21658469544242928,
"learning_rate": 2.0017194496582903e-06,
"loss": 0.9116,
"num_tokens": 9215752545.0,
"step": 1154
},
{
"epoch": 4.9821525148729044,
"grad_norm": 0.228091048385714,
"learning_rate": 2.001263279808977e-06,
"loss": 0.9311,
"num_tokens": 9223826485.0,
"step": 1155
},
{
"epoch": 4.986479177934019,
"grad_norm": 0.22068068132470603,
"learning_rate": 2.0008772839162623e-06,
"loss": 0.9244,
"num_tokens": 9232130653.0,
"step": 1156
},
{
"epoch": 4.990805840995132,
"grad_norm": 0.21601781584175028,
"learning_rate": 2.000561464990222e-06,
"loss": 0.9161,
"num_tokens": 9240121021.0,
"step": 1157
},
{
"epoch": 4.995132504056246,
"grad_norm": 0.21668791628808873,
"learning_rate": 2.0003158254936748e-06,
"loss": 0.8818,
"num_tokens": 9248287497.0,
"step": 1158
},
{
"epoch": 4.999459167117361,
"grad_norm": 0.2256692786070936,
"learning_rate": 2.000140367342166e-06,
"loss": 0.9174,
"num_tokens": 9256321099.0,
"step": 1159
},
{
"epoch": 5.0,
"grad_norm": 0.5189925985904169,
"learning_rate": 2.000035091903955e-06,
"loss": 0.8958,
"num_tokens": 9257323549.0,
"step": 1160
},
{
"epoch": 5.0,
"step": 1160,
"total_flos": 3832398801043456.0,
"train_loss": 0.9466197261522556,
"train_runtime": 88419.3231,
"train_samples_per_second": 1.673,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 1160,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 24,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3832398801043456.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}