eac123's picture
Upload final checkpoint (checkpoint-804)
2274c96 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1336015462875366,
"epoch": 0.003738317757009346,
"grad_norm": 0.4115395247936249,
"learning_rate": 0.0002,
"loss": 2.4710798263549805,
"mean_token_accuracy": 0.5324664115905762,
"num_tokens": 16496.0,
"step": 1
},
{
"entropy": 1.2463930547237396,
"epoch": 0.007476635514018692,
"grad_norm": 0.3692863881587982,
"learning_rate": 0.0002,
"loss": 2.165541648864746,
"mean_token_accuracy": 0.5610552132129669,
"num_tokens": 32901.0,
"step": 2
},
{
"entropy": 1.4113854467868805,
"epoch": 0.011214953271028037,
"grad_norm": 0.2915845811367035,
"learning_rate": 0.0002,
"loss": 1.7357215881347656,
"mean_token_accuracy": 0.5886629670858383,
"num_tokens": 49245.0,
"step": 3
},
{
"entropy": 1.379658043384552,
"epoch": 0.014953271028037384,
"grad_norm": 0.23361942172050476,
"learning_rate": 0.0002,
"loss": 1.410735011100769,
"mean_token_accuracy": 0.6355755776166916,
"num_tokens": 65811.0,
"step": 4
},
{
"entropy": 1.3623565435409546,
"epoch": 0.018691588785046728,
"grad_norm": 0.26191750168800354,
"learning_rate": 0.0002,
"loss": 1.2986161708831787,
"mean_token_accuracy": 0.6415031999349594,
"num_tokens": 82189.0,
"step": 5
},
{
"entropy": 1.2727859914302826,
"epoch": 0.022429906542056073,
"grad_norm": 0.1533316969871521,
"learning_rate": 0.0002,
"loss": 1.1948474645614624,
"mean_token_accuracy": 0.6546026170253754,
"num_tokens": 98489.0,
"step": 6
},
{
"entropy": 1.2184827625751495,
"epoch": 0.026168224299065422,
"grad_norm": 0.10424298793077469,
"learning_rate": 0.0002,
"loss": 1.1188591718673706,
"mean_token_accuracy": 0.6631771177053452,
"num_tokens": 114851.0,
"step": 7
},
{
"entropy": 1.1237380504608154,
"epoch": 0.029906542056074768,
"grad_norm": 0.10689449310302734,
"learning_rate": 0.0002,
"loss": 1.0371830463409424,
"mean_token_accuracy": 0.6718492060899734,
"num_tokens": 131220.0,
"step": 8
},
{
"entropy": 1.0455615520477295,
"epoch": 0.03364485981308411,
"grad_norm": 0.12944048643112183,
"learning_rate": 0.0002,
"loss": 0.9913585782051086,
"mean_token_accuracy": 0.6828599572181702,
"num_tokens": 147616.0,
"step": 9
},
{
"entropy": 0.9801072925329208,
"epoch": 0.037383177570093455,
"grad_norm": 0.1291113793849945,
"learning_rate": 0.0002,
"loss": 0.9284825325012207,
"mean_token_accuracy": 0.7001921981573105,
"num_tokens": 164002.0,
"step": 10
},
{
"entropy": 0.953565314412117,
"epoch": 0.041121495327102804,
"grad_norm": 0.10645624995231628,
"learning_rate": 0.0002,
"loss": 0.8795915842056274,
"mean_token_accuracy": 0.7043117135763168,
"num_tokens": 180220.0,
"step": 11
},
{
"entropy": 0.9155157953500748,
"epoch": 0.044859813084112146,
"grad_norm": 0.11287244409322739,
"learning_rate": 0.0002,
"loss": 0.8326205015182495,
"mean_token_accuracy": 0.7109687179327011,
"num_tokens": 196521.0,
"step": 12
},
{
"entropy": 0.8468948155641556,
"epoch": 0.048598130841121495,
"grad_norm": 0.10245727747678757,
"learning_rate": 0.0002,
"loss": 0.8009377121925354,
"mean_token_accuracy": 0.7149728387594223,
"num_tokens": 212778.0,
"step": 13
},
{
"entropy": 0.7708506435155869,
"epoch": 0.052336448598130844,
"grad_norm": 0.09908365458250046,
"learning_rate": 0.0002,
"loss": 0.7473602890968323,
"mean_token_accuracy": 0.7281823754310608,
"num_tokens": 228942.0,
"step": 14
},
{
"entropy": 0.7574831545352936,
"epoch": 0.056074766355140186,
"grad_norm": 0.10171845555305481,
"learning_rate": 0.0002,
"loss": 0.7353494167327881,
"mean_token_accuracy": 0.7308090776205063,
"num_tokens": 245256.0,
"step": 15
},
{
"entropy": 0.6849008500576019,
"epoch": 0.059813084112149535,
"grad_norm": 0.08664627373218536,
"learning_rate": 0.0002,
"loss": 0.6817273497581482,
"mean_token_accuracy": 0.7445196211338043,
"num_tokens": 261288.0,
"step": 16
},
{
"entropy": 0.6784532964229584,
"epoch": 0.06355140186915888,
"grad_norm": 0.08904161304235458,
"learning_rate": 0.0002,
"loss": 0.6835237741470337,
"mean_token_accuracy": 0.7402277588844299,
"num_tokens": 277473.0,
"step": 17
},
{
"entropy": 0.6737232953310013,
"epoch": 0.06728971962616823,
"grad_norm": 0.08908089250326157,
"learning_rate": 0.0002,
"loss": 0.6696494817733765,
"mean_token_accuracy": 0.7452213168144226,
"num_tokens": 293986.0,
"step": 18
},
{
"entropy": 0.676809772849083,
"epoch": 0.07102803738317758,
"grad_norm": 0.08826066553592682,
"learning_rate": 0.0002,
"loss": 0.6623877286911011,
"mean_token_accuracy": 0.747529536485672,
"num_tokens": 310269.0,
"step": 19
},
{
"entropy": 0.6532965898513794,
"epoch": 0.07476635514018691,
"grad_norm": 0.08917281031608582,
"learning_rate": 0.0002,
"loss": 0.6443736553192139,
"mean_token_accuracy": 0.7480695396661758,
"num_tokens": 326491.0,
"step": 20
},
{
"entropy": 0.6552709937095642,
"epoch": 0.07850467289719626,
"grad_norm": 0.08073496073484421,
"learning_rate": 0.0002,
"loss": 0.6399368643760681,
"mean_token_accuracy": 0.7507821917533875,
"num_tokens": 342841.0,
"step": 21
},
{
"entropy": 0.6378396600484848,
"epoch": 0.08224299065420561,
"grad_norm": 0.063417449593544,
"learning_rate": 0.0002,
"loss": 0.6258761882781982,
"mean_token_accuracy": 0.7539727091789246,
"num_tokens": 359584.0,
"step": 22
},
{
"entropy": 0.6046861261129379,
"epoch": 0.08598130841121496,
"grad_norm": 0.06905008107423782,
"learning_rate": 0.0002,
"loss": 0.6049938201904297,
"mean_token_accuracy": 0.7625735104084015,
"num_tokens": 375502.0,
"step": 23
},
{
"entropy": 0.6043607741594315,
"epoch": 0.08971962616822429,
"grad_norm": 0.0712490975856781,
"learning_rate": 0.0002,
"loss": 0.6081230640411377,
"mean_token_accuracy": 0.761991336941719,
"num_tokens": 391668.0,
"step": 24
},
{
"entropy": 0.5921229273080826,
"epoch": 0.09345794392523364,
"grad_norm": 0.06059383973479271,
"learning_rate": 0.0002,
"loss": 0.5966373682022095,
"mean_token_accuracy": 0.7640610188245773,
"num_tokens": 408064.0,
"step": 25
},
{
"entropy": 0.6013955473899841,
"epoch": 0.09719626168224299,
"grad_norm": 0.05800875276327133,
"learning_rate": 0.0002,
"loss": 0.6032594442367554,
"mean_token_accuracy": 0.7606146037578583,
"num_tokens": 424308.0,
"step": 26
},
{
"entropy": 0.6059402525424957,
"epoch": 0.10093457943925234,
"grad_norm": 0.05799295753240585,
"learning_rate": 0.0002,
"loss": 0.6014454960823059,
"mean_token_accuracy": 0.7633127868175507,
"num_tokens": 440626.0,
"step": 27
},
{
"entropy": 0.6059208810329437,
"epoch": 0.10467289719626169,
"grad_norm": 0.06835797429084778,
"learning_rate": 0.0002,
"loss": 0.5960400104522705,
"mean_token_accuracy": 0.7644040137529373,
"num_tokens": 457127.0,
"step": 28
},
{
"entropy": 0.6063490360975266,
"epoch": 0.10841121495327102,
"grad_norm": 0.08442196249961853,
"learning_rate": 0.0002,
"loss": 0.5988196730613708,
"mean_token_accuracy": 0.7642622292041779,
"num_tokens": 473449.0,
"step": 29
},
{
"entropy": 0.6044150143861771,
"epoch": 0.11214953271028037,
"grad_norm": 0.05611753463745117,
"learning_rate": 0.0002,
"loss": 0.5849661231040955,
"mean_token_accuracy": 0.7694830596446991,
"num_tokens": 489953.0,
"step": 30
},
{
"entropy": 0.5886638015508652,
"epoch": 0.11588785046728972,
"grad_norm": 0.055090922862291336,
"learning_rate": 0.0002,
"loss": 0.5829939842224121,
"mean_token_accuracy": 0.769635483622551,
"num_tokens": 506414.0,
"step": 31
},
{
"entropy": 0.5746142864227295,
"epoch": 0.11962616822429907,
"grad_norm": 0.049661796540021896,
"learning_rate": 0.0002,
"loss": 0.5790735483169556,
"mean_token_accuracy": 0.7714909315109253,
"num_tokens": 522742.0,
"step": 32
},
{
"entropy": 0.5767629146575928,
"epoch": 0.1233644859813084,
"grad_norm": 0.04847181588411331,
"learning_rate": 0.0002,
"loss": 0.580193281173706,
"mean_token_accuracy": 0.7714395672082901,
"num_tokens": 539199.0,
"step": 33
},
{
"entropy": 0.5745265781879425,
"epoch": 0.12710280373831775,
"grad_norm": 0.05860326439142227,
"learning_rate": 0.0002,
"loss": 0.5901641845703125,
"mean_token_accuracy": 0.7679091691970825,
"num_tokens": 555326.0,
"step": 34
},
{
"entropy": 0.567798376083374,
"epoch": 0.1308411214953271,
"grad_norm": 0.05234525725245476,
"learning_rate": 0.0002,
"loss": 0.5799325704574585,
"mean_token_accuracy": 0.766155481338501,
"num_tokens": 571808.0,
"step": 35
},
{
"entropy": 0.5698586851358414,
"epoch": 0.13457943925233645,
"grad_norm": 0.041219986975193024,
"learning_rate": 0.0002,
"loss": 0.573387086391449,
"mean_token_accuracy": 0.769883319735527,
"num_tokens": 588161.0,
"step": 36
},
{
"entropy": 0.5851186513900757,
"epoch": 0.1383177570093458,
"grad_norm": 0.04337616264820099,
"learning_rate": 0.0002,
"loss": 0.5821909308433533,
"mean_token_accuracy": 0.7661230564117432,
"num_tokens": 604598.0,
"step": 37
},
{
"entropy": 0.5961429327726364,
"epoch": 0.14205607476635515,
"grad_norm": 0.05468963831663132,
"learning_rate": 0.0002,
"loss": 0.5940048098564148,
"mean_token_accuracy": 0.7601669579744339,
"num_tokens": 620746.0,
"step": 38
},
{
"entropy": 0.5826456397771835,
"epoch": 0.14579439252336449,
"grad_norm": 0.047812167555093765,
"learning_rate": 0.0002,
"loss": 0.5687558054924011,
"mean_token_accuracy": 0.771986335515976,
"num_tokens": 637151.0,
"step": 39
},
{
"entropy": 0.5903666168451309,
"epoch": 0.14953271028037382,
"grad_norm": 0.044994354248046875,
"learning_rate": 0.0002,
"loss": 0.5762028098106384,
"mean_token_accuracy": 0.7677688300609589,
"num_tokens": 653530.0,
"step": 40
},
{
"entropy": 0.5751803368330002,
"epoch": 0.15327102803738318,
"grad_norm": 0.04342395439743996,
"learning_rate": 0.0002,
"loss": 0.5721427798271179,
"mean_token_accuracy": 0.7731492966413498,
"num_tokens": 669957.0,
"step": 41
},
{
"entropy": 0.5582813173532486,
"epoch": 0.15700934579439252,
"grad_norm": 0.05154528096318245,
"learning_rate": 0.0002,
"loss": 0.5713383555412292,
"mean_token_accuracy": 0.7701951861381531,
"num_tokens": 685933.0,
"step": 42
},
{
"entropy": 0.5747530311346054,
"epoch": 0.16074766355140188,
"grad_norm": 0.05052989348769188,
"learning_rate": 0.0002,
"loss": 0.5861970782279968,
"mean_token_accuracy": 0.7652492970228195,
"num_tokens": 702131.0,
"step": 43
},
{
"entropy": 0.5861315429210663,
"epoch": 0.16448598130841122,
"grad_norm": 0.043960776180028915,
"learning_rate": 0.0002,
"loss": 0.5891501903533936,
"mean_token_accuracy": 0.7628277689218521,
"num_tokens": 718330.0,
"step": 44
},
{
"entropy": 0.5868926346302032,
"epoch": 0.16822429906542055,
"grad_norm": 0.035861797630786896,
"learning_rate": 0.0002,
"loss": 0.5814363360404968,
"mean_token_accuracy": 0.7670950144529343,
"num_tokens": 734754.0,
"step": 45
},
{
"entropy": 0.5696061849594116,
"epoch": 0.17196261682242991,
"grad_norm": 0.03567943349480629,
"learning_rate": 0.0002,
"loss": 0.5582084655761719,
"mean_token_accuracy": 0.7754767388105392,
"num_tokens": 750952.0,
"step": 46
},
{
"entropy": 0.5884592086076736,
"epoch": 0.17570093457943925,
"grad_norm": 0.04051043465733528,
"learning_rate": 0.0002,
"loss": 0.5837826132774353,
"mean_token_accuracy": 0.7652305215597153,
"num_tokens": 767136.0,
"step": 47
},
{
"entropy": 0.568819597363472,
"epoch": 0.17943925233644858,
"grad_norm": 0.04234869405627251,
"learning_rate": 0.0002,
"loss": 0.5664035081863403,
"mean_token_accuracy": 0.7719341665506363,
"num_tokens": 783513.0,
"step": 48
},
{
"entropy": 0.553595632314682,
"epoch": 0.18317757009345795,
"grad_norm": 0.04170480743050575,
"learning_rate": 0.0002,
"loss": 0.564354658126831,
"mean_token_accuracy": 0.7749540507793427,
"num_tokens": 799703.0,
"step": 49
},
{
"entropy": 0.5621031820774078,
"epoch": 0.18691588785046728,
"grad_norm": 0.042460180819034576,
"learning_rate": 0.0002,
"loss": 0.576507568359375,
"mean_token_accuracy": 0.7702780216932297,
"num_tokens": 815979.0,
"step": 50
},
{
"entropy": 0.5803797841072083,
"epoch": 0.19065420560747665,
"grad_norm": 0.036130718886852264,
"learning_rate": 0.0002,
"loss": 0.5826534628868103,
"mean_token_accuracy": 0.767243430018425,
"num_tokens": 832435.0,
"step": 51
},
{
"entropy": 0.5492766499519348,
"epoch": 0.19439252336448598,
"grad_norm": 0.04120517149567604,
"learning_rate": 0.0002,
"loss": 0.5535300374031067,
"mean_token_accuracy": 0.7766350656747818,
"num_tokens": 848601.0,
"step": 52
},
{
"entropy": 0.5690171420574188,
"epoch": 0.19813084112149532,
"grad_norm": 0.03631429374217987,
"learning_rate": 0.0002,
"loss": 0.5688353776931763,
"mean_token_accuracy": 0.7699357271194458,
"num_tokens": 864779.0,
"step": 53
},
{
"entropy": 0.5830478370189667,
"epoch": 0.20186915887850468,
"grad_norm": 0.03915117308497429,
"learning_rate": 0.0002,
"loss": 0.5719392895698547,
"mean_token_accuracy": 0.7702472358942032,
"num_tokens": 881366.0,
"step": 54
},
{
"entropy": 0.5905578434467316,
"epoch": 0.205607476635514,
"grad_norm": 0.038457099348306656,
"learning_rate": 0.0002,
"loss": 0.5855496525764465,
"mean_token_accuracy": 0.7646182626485825,
"num_tokens": 897955.0,
"step": 55
},
{
"entropy": 0.5837848633527756,
"epoch": 0.20934579439252338,
"grad_norm": 0.04033343121409416,
"learning_rate": 0.0002,
"loss": 0.5784925222396851,
"mean_token_accuracy": 0.7649644762277603,
"num_tokens": 914164.0,
"step": 56
},
{
"entropy": 0.5470199286937714,
"epoch": 0.2130841121495327,
"grad_norm": 0.036680735647678375,
"learning_rate": 0.0002,
"loss": 0.5427253246307373,
"mean_token_accuracy": 0.7822186052799225,
"num_tokens": 930444.0,
"step": 57
},
{
"entropy": 0.5544598549604416,
"epoch": 0.21682242990654205,
"grad_norm": 0.04701124131679535,
"learning_rate": 0.0002,
"loss": 0.569618821144104,
"mean_token_accuracy": 0.771122008562088,
"num_tokens": 946567.0,
"step": 58
},
{
"entropy": 0.5725786834955215,
"epoch": 0.2205607476635514,
"grad_norm": 0.04193125665187836,
"learning_rate": 0.0002,
"loss": 0.5894483923912048,
"mean_token_accuracy": 0.7642552405595779,
"num_tokens": 962894.0,
"step": 59
},
{
"entropy": 0.5668687969446182,
"epoch": 0.22429906542056074,
"grad_norm": 0.033951517194509506,
"learning_rate": 0.0002,
"loss": 0.5699459314346313,
"mean_token_accuracy": 0.7729462385177612,
"num_tokens": 979210.0,
"step": 60
},
{
"entropy": 0.5792391896247864,
"epoch": 0.22803738317757008,
"grad_norm": 0.041912537068128586,
"learning_rate": 0.0002,
"loss": 0.5683349370956421,
"mean_token_accuracy": 0.7706285119056702,
"num_tokens": 995540.0,
"step": 61
},
{
"entropy": 0.5809753388166428,
"epoch": 0.23177570093457944,
"grad_norm": 0.036393389105796814,
"learning_rate": 0.0002,
"loss": 0.5727679133415222,
"mean_token_accuracy": 0.7684315294027328,
"num_tokens": 1011805.0,
"step": 62
},
{
"entropy": 0.5670438855886459,
"epoch": 0.23551401869158878,
"grad_norm": 0.03674926608800888,
"learning_rate": 0.0002,
"loss": 0.5604680776596069,
"mean_token_accuracy": 0.7723257541656494,
"num_tokens": 1028009.0,
"step": 63
},
{
"entropy": 0.5653442144393921,
"epoch": 0.23925233644859814,
"grad_norm": 0.03534647822380066,
"learning_rate": 0.0002,
"loss": 0.5580601096153259,
"mean_token_accuracy": 0.7755836397409439,
"num_tokens": 1044521.0,
"step": 64
},
{
"entropy": 0.5762730091810226,
"epoch": 0.24299065420560748,
"grad_norm": 0.03369547426700592,
"learning_rate": 0.0002,
"loss": 0.5709710121154785,
"mean_token_accuracy": 0.7710799872875214,
"num_tokens": 1060984.0,
"step": 65
},
{
"entropy": 0.56136754155159,
"epoch": 0.2467289719626168,
"grad_norm": 0.050162531435489655,
"learning_rate": 0.0002,
"loss": 0.5662704706192017,
"mean_token_accuracy": 0.7702763229608536,
"num_tokens": 1077512.0,
"step": 66
},
{
"entropy": 0.5493937730789185,
"epoch": 0.2504672897196262,
"grad_norm": 0.0446079783141613,
"learning_rate": 0.0002,
"loss": 0.563389778137207,
"mean_token_accuracy": 0.7724475711584091,
"num_tokens": 1093860.0,
"step": 67
},
{
"entropy": 0.5527212023735046,
"epoch": 0.2542056074766355,
"grad_norm": 0.04445589333772659,
"learning_rate": 0.0002,
"loss": 0.553238034248352,
"mean_token_accuracy": 0.777790442109108,
"num_tokens": 1109927.0,
"step": 68
},
{
"entropy": 0.5742960721254349,
"epoch": 0.25794392523364484,
"grad_norm": 0.03155473247170448,
"learning_rate": 0.0002,
"loss": 0.5755714774131775,
"mean_token_accuracy": 0.7682003676891327,
"num_tokens": 1126507.0,
"step": 69
},
{
"entropy": 0.570902407169342,
"epoch": 0.2616822429906542,
"grad_norm": 0.03776158019900322,
"learning_rate": 0.0002,
"loss": 0.5687341094017029,
"mean_token_accuracy": 0.7690709233283997,
"num_tokens": 1142690.0,
"step": 70
},
{
"entropy": 0.5869749188423157,
"epoch": 0.26542056074766357,
"grad_norm": 0.03637450933456421,
"learning_rate": 0.0002,
"loss": 0.5745267271995544,
"mean_token_accuracy": 0.7675913572311401,
"num_tokens": 1158998.0,
"step": 71
},
{
"entropy": 0.5770464688539505,
"epoch": 0.2691588785046729,
"grad_norm": 0.03824329748749733,
"learning_rate": 0.0002,
"loss": 0.5806713104248047,
"mean_token_accuracy": 0.765295684337616,
"num_tokens": 1175369.0,
"step": 72
},
{
"entropy": 0.5496443659067154,
"epoch": 0.27289719626168224,
"grad_norm": 0.03833479806780815,
"learning_rate": 0.0002,
"loss": 0.552317202091217,
"mean_token_accuracy": 0.7775600254535675,
"num_tokens": 1191776.0,
"step": 73
},
{
"entropy": 0.5672993659973145,
"epoch": 0.2766355140186916,
"grad_norm": 0.035141605883836746,
"learning_rate": 0.0002,
"loss": 0.5738911032676697,
"mean_token_accuracy": 0.769673228263855,
"num_tokens": 1208289.0,
"step": 74
},
{
"entropy": 0.5747457444667816,
"epoch": 0.2803738317757009,
"grad_norm": 0.03779706731438637,
"learning_rate": 0.0002,
"loss": 0.580111026763916,
"mean_token_accuracy": 0.7651933431625366,
"num_tokens": 1224804.0,
"step": 75
},
{
"entropy": 0.5685230642557144,
"epoch": 0.2841121495327103,
"grad_norm": 0.03369152173399925,
"learning_rate": 0.0002,
"loss": 0.571203351020813,
"mean_token_accuracy": 0.7706969380378723,
"num_tokens": 1240994.0,
"step": 76
},
{
"entropy": 0.5724664479494095,
"epoch": 0.28785046728971964,
"grad_norm": 0.03279148414731026,
"learning_rate": 0.0002,
"loss": 0.5703553557395935,
"mean_token_accuracy": 0.7710930705070496,
"num_tokens": 1257180.0,
"step": 77
},
{
"entropy": 0.570750430226326,
"epoch": 0.29158878504672897,
"grad_norm": 0.035474326461553574,
"learning_rate": 0.0002,
"loss": 0.57155442237854,
"mean_token_accuracy": 0.7676969021558762,
"num_tokens": 1273176.0,
"step": 78
},
{
"entropy": 0.5746997892856598,
"epoch": 0.2953271028037383,
"grad_norm": 0.03326554223895073,
"learning_rate": 0.0002,
"loss": 0.5764865279197693,
"mean_token_accuracy": 0.7667145133018494,
"num_tokens": 1289572.0,
"step": 79
},
{
"entropy": 0.5560239851474762,
"epoch": 0.29906542056074764,
"grad_norm": 0.033652499318122864,
"learning_rate": 0.0002,
"loss": 0.5541852712631226,
"mean_token_accuracy": 0.7752721756696701,
"num_tokens": 1305646.0,
"step": 80
},
{
"entropy": 0.5700062215328217,
"epoch": 0.30280373831775703,
"grad_norm": 0.036336466670036316,
"learning_rate": 0.0002,
"loss": 0.5715289115905762,
"mean_token_accuracy": 0.7702216506004333,
"num_tokens": 1322328.0,
"step": 81
},
{
"entropy": 0.5599597245454788,
"epoch": 0.30654205607476637,
"grad_norm": 0.032290052622556686,
"learning_rate": 0.0002,
"loss": 0.5614467859268188,
"mean_token_accuracy": 0.7732760310173035,
"num_tokens": 1338359.0,
"step": 82
},
{
"entropy": 0.5446556061506271,
"epoch": 0.3102803738317757,
"grad_norm": 0.03226450830698013,
"learning_rate": 0.0002,
"loss": 0.5512461066246033,
"mean_token_accuracy": 0.7779420912265778,
"num_tokens": 1354321.0,
"step": 83
},
{
"entropy": 0.5505060404539108,
"epoch": 0.31401869158878504,
"grad_norm": 0.035315077751874924,
"learning_rate": 0.0002,
"loss": 0.5553967952728271,
"mean_token_accuracy": 0.7761841863393784,
"num_tokens": 1370409.0,
"step": 84
},
{
"entropy": 0.5602358281612396,
"epoch": 0.3177570093457944,
"grad_norm": 0.031360018998384476,
"learning_rate": 0.0002,
"loss": 0.5553810596466064,
"mean_token_accuracy": 0.7750610113143921,
"num_tokens": 1386951.0,
"step": 85
},
{
"entropy": 0.5592145472764969,
"epoch": 0.32149532710280376,
"grad_norm": 0.03307170048356056,
"learning_rate": 0.0002,
"loss": 0.5547728538513184,
"mean_token_accuracy": 0.7769513875246048,
"num_tokens": 1403318.0,
"step": 86
},
{
"entropy": 0.5478426665067673,
"epoch": 0.3252336448598131,
"grad_norm": 0.03468095511198044,
"learning_rate": 0.0002,
"loss": 0.5475176572799683,
"mean_token_accuracy": 0.7787642478942871,
"num_tokens": 1419588.0,
"step": 87
},
{
"entropy": 0.5575945675373077,
"epoch": 0.32897196261682243,
"grad_norm": 0.0372730977833271,
"learning_rate": 0.0002,
"loss": 0.5592425465583801,
"mean_token_accuracy": 0.7753143310546875,
"num_tokens": 1435879.0,
"step": 88
},
{
"entropy": 0.5516618192195892,
"epoch": 0.33271028037383177,
"grad_norm": 0.03459680825471878,
"learning_rate": 0.0002,
"loss": 0.5590015649795532,
"mean_token_accuracy": 0.7763092070817947,
"num_tokens": 1452255.0,
"step": 89
},
{
"entropy": 0.5537828356027603,
"epoch": 0.3364485981308411,
"grad_norm": 0.037478331476449966,
"learning_rate": 0.0002,
"loss": 0.5628093481063843,
"mean_token_accuracy": 0.7731254547834396,
"num_tokens": 1468440.0,
"step": 90
},
{
"entropy": 0.5597833395004272,
"epoch": 0.3401869158878505,
"grad_norm": 0.03566694259643555,
"learning_rate": 0.0002,
"loss": 0.5576118230819702,
"mean_token_accuracy": 0.7733734101057053,
"num_tokens": 1484803.0,
"step": 91
},
{
"entropy": 0.5624473690986633,
"epoch": 0.34392523364485983,
"grad_norm": 0.038208235055208206,
"learning_rate": 0.0002,
"loss": 0.5643529891967773,
"mean_token_accuracy": 0.773946151137352,
"num_tokens": 1500849.0,
"step": 92
},
{
"entropy": 0.5809104889631271,
"epoch": 0.34766355140186916,
"grad_norm": 0.03173667564988136,
"learning_rate": 0.0002,
"loss": 0.5739686489105225,
"mean_token_accuracy": 0.7694463729858398,
"num_tokens": 1517263.0,
"step": 93
},
{
"entropy": 0.5697960555553436,
"epoch": 0.3514018691588785,
"grad_norm": 0.03167756646871567,
"learning_rate": 0.0002,
"loss": 0.5665271878242493,
"mean_token_accuracy": 0.7699908316135406,
"num_tokens": 1533648.0,
"step": 94
},
{
"entropy": 0.5966296941041946,
"epoch": 0.35514018691588783,
"grad_norm": 0.036720361560583115,
"learning_rate": 0.0002,
"loss": 0.5901257395744324,
"mean_token_accuracy": 0.7647226899862289,
"num_tokens": 1550084.0,
"step": 95
},
{
"entropy": 0.5599866956472397,
"epoch": 0.35887850467289717,
"grad_norm": 0.03618223965167999,
"learning_rate": 0.0002,
"loss": 0.5656697750091553,
"mean_token_accuracy": 0.7732058614492416,
"num_tokens": 1566526.0,
"step": 96
},
{
"entropy": 0.5660023838281631,
"epoch": 0.36261682242990656,
"grad_norm": 0.037616875022649765,
"learning_rate": 0.0002,
"loss": 0.5731638669967651,
"mean_token_accuracy": 0.7681225687265396,
"num_tokens": 1582887.0,
"step": 97
},
{
"entropy": 0.5692461878061295,
"epoch": 0.3663551401869159,
"grad_norm": 0.04291412979364395,
"learning_rate": 0.0002,
"loss": 0.5790476202964783,
"mean_token_accuracy": 0.7658884823322296,
"num_tokens": 1599367.0,
"step": 98
},
{
"entropy": 0.5626956224441528,
"epoch": 0.37009345794392523,
"grad_norm": 0.03269932419061661,
"learning_rate": 0.0002,
"loss": 0.5623303651809692,
"mean_token_accuracy": 0.7726950198411942,
"num_tokens": 1615716.0,
"step": 99
},
{
"entropy": 0.5417574644088745,
"epoch": 0.37383177570093457,
"grad_norm": 0.029643645510077477,
"learning_rate": 0.0002,
"loss": 0.5503037571907043,
"mean_token_accuracy": 0.7786638289690018,
"num_tokens": 1631985.0,
"step": 100
},
{
"entropy": 0.5644317716360092,
"epoch": 0.3775700934579439,
"grad_norm": 0.03810103237628937,
"learning_rate": 0.0002,
"loss": 0.5641601085662842,
"mean_token_accuracy": 0.7715529501438141,
"num_tokens": 1648148.0,
"step": 101
},
{
"entropy": 0.5648799985647202,
"epoch": 0.3813084112149533,
"grad_norm": 0.02914907969534397,
"learning_rate": 0.0002,
"loss": 0.5619527101516724,
"mean_token_accuracy": 0.7744928747415543,
"num_tokens": 1664554.0,
"step": 102
},
{
"entropy": 0.5753660798072815,
"epoch": 0.3850467289719626,
"grad_norm": 0.02887723594903946,
"learning_rate": 0.0002,
"loss": 0.5688785314559937,
"mean_token_accuracy": 0.7692504674196243,
"num_tokens": 1680782.0,
"step": 103
},
{
"entropy": 0.561363086104393,
"epoch": 0.38878504672897196,
"grad_norm": 0.028774583712220192,
"learning_rate": 0.0002,
"loss": 0.560323178768158,
"mean_token_accuracy": 0.7716943174600601,
"num_tokens": 1696855.0,
"step": 104
},
{
"entropy": 0.5558189004659653,
"epoch": 0.3925233644859813,
"grad_norm": 0.030897047370672226,
"learning_rate": 0.0002,
"loss": 0.5627227425575256,
"mean_token_accuracy": 0.7728832811117172,
"num_tokens": 1713092.0,
"step": 105
},
{
"entropy": 0.5579479783773422,
"epoch": 0.39626168224299063,
"grad_norm": 0.03168272599577904,
"learning_rate": 0.0002,
"loss": 0.5611063241958618,
"mean_token_accuracy": 0.7737848162651062,
"num_tokens": 1729174.0,
"step": 106
},
{
"entropy": 0.5593132227659225,
"epoch": 0.4,
"grad_norm": 0.030001681298017502,
"learning_rate": 0.0002,
"loss": 0.5634371638298035,
"mean_token_accuracy": 0.7737011611461639,
"num_tokens": 1745387.0,
"step": 107
},
{
"entropy": 0.5454982221126556,
"epoch": 0.40373831775700936,
"grad_norm": 0.033263012766838074,
"learning_rate": 0.0002,
"loss": 0.5490332841873169,
"mean_token_accuracy": 0.7772792428731918,
"num_tokens": 1761446.0,
"step": 108
},
{
"entropy": 0.5551732182502747,
"epoch": 0.4074766355140187,
"grad_norm": 0.030698338523507118,
"learning_rate": 0.0002,
"loss": 0.5535954236984253,
"mean_token_accuracy": 0.773947462439537,
"num_tokens": 1778105.0,
"step": 109
},
{
"entropy": 0.5650522261857986,
"epoch": 0.411214953271028,
"grad_norm": 0.02939177118241787,
"learning_rate": 0.0002,
"loss": 0.5615048408508301,
"mean_token_accuracy": 0.7712746411561966,
"num_tokens": 1794562.0,
"step": 110
},
{
"entropy": 0.5696343183517456,
"epoch": 0.41495327102803736,
"grad_norm": 0.03011537715792656,
"learning_rate": 0.0002,
"loss": 0.5706506967544556,
"mean_token_accuracy": 0.7699969708919525,
"num_tokens": 1810779.0,
"step": 111
},
{
"entropy": 0.5387005656957626,
"epoch": 0.41869158878504675,
"grad_norm": 0.033464495092630386,
"learning_rate": 0.0002,
"loss": 0.5423218607902527,
"mean_token_accuracy": 0.7795679718255997,
"num_tokens": 1827208.0,
"step": 112
},
{
"entropy": 0.5597733706235886,
"epoch": 0.4224299065420561,
"grad_norm": 0.029017142951488495,
"learning_rate": 0.0002,
"loss": 0.5561181306838989,
"mean_token_accuracy": 0.7743376046419144,
"num_tokens": 1843649.0,
"step": 113
},
{
"entropy": 0.5541809946298599,
"epoch": 0.4261682242990654,
"grad_norm": 0.030042298138141632,
"learning_rate": 0.0002,
"loss": 0.5544824600219727,
"mean_token_accuracy": 0.7773302495479584,
"num_tokens": 1859919.0,
"step": 114
},
{
"entropy": 0.5697837471961975,
"epoch": 0.42990654205607476,
"grad_norm": 0.029710182920098305,
"learning_rate": 0.0002,
"loss": 0.5684210658073425,
"mean_token_accuracy": 0.7717447876930237,
"num_tokens": 1876288.0,
"step": 115
},
{
"entropy": 0.5591758489608765,
"epoch": 0.4336448598130841,
"grad_norm": 0.031515248119831085,
"learning_rate": 0.0002,
"loss": 0.5618751645088196,
"mean_token_accuracy": 0.77419513463974,
"num_tokens": 1892685.0,
"step": 116
},
{
"entropy": 0.5360209345817566,
"epoch": 0.4373831775700935,
"grad_norm": 0.036333996802568436,
"learning_rate": 0.0002,
"loss": 0.5519132614135742,
"mean_token_accuracy": 0.77690489590168,
"num_tokens": 1908983.0,
"step": 117
},
{
"entropy": 0.5584719926118851,
"epoch": 0.4411214953271028,
"grad_norm": 0.03057498298585415,
"learning_rate": 0.0002,
"loss": 0.5668904185295105,
"mean_token_accuracy": 0.7719320356845856,
"num_tokens": 1925134.0,
"step": 118
},
{
"entropy": 0.5634136199951172,
"epoch": 0.44485981308411215,
"grad_norm": 0.038503021001815796,
"learning_rate": 0.0002,
"loss": 0.5522302389144897,
"mean_token_accuracy": 0.7777165621519089,
"num_tokens": 1941319.0,
"step": 119
},
{
"entropy": 0.5695697367191315,
"epoch": 0.4485981308411215,
"grad_norm": 0.02690051682293415,
"learning_rate": 0.0002,
"loss": 0.5623375773429871,
"mean_token_accuracy": 0.7749422192573547,
"num_tokens": 1957576.0,
"step": 120
},
{
"entropy": 0.5670370161533356,
"epoch": 0.4523364485981308,
"grad_norm": 0.030103027820587158,
"learning_rate": 0.0002,
"loss": 0.5645368695259094,
"mean_token_accuracy": 0.7715286463499069,
"num_tokens": 1973598.0,
"step": 121
},
{
"entropy": 0.5673844367265701,
"epoch": 0.45607476635514016,
"grad_norm": 0.03927698731422424,
"learning_rate": 0.0002,
"loss": 0.5738642811775208,
"mean_token_accuracy": 0.7676763832569122,
"num_tokens": 1989896.0,
"step": 122
},
{
"entropy": 0.5642601549625397,
"epoch": 0.45981308411214955,
"grad_norm": 0.040063194930553436,
"learning_rate": 0.0002,
"loss": 0.5772222280502319,
"mean_token_accuracy": 0.7651336938142776,
"num_tokens": 2006217.0,
"step": 123
},
{
"entropy": 0.5646145790815353,
"epoch": 0.4635514018691589,
"grad_norm": 0.02972179837524891,
"learning_rate": 0.0002,
"loss": 0.5596722960472107,
"mean_token_accuracy": 0.7738584727048874,
"num_tokens": 2022407.0,
"step": 124
},
{
"entropy": 0.5680184960365295,
"epoch": 0.4672897196261682,
"grad_norm": 0.03161488473415375,
"learning_rate": 0.0002,
"loss": 0.5569790601730347,
"mean_token_accuracy": 0.7752905040979385,
"num_tokens": 2038990.0,
"step": 125
},
{
"entropy": 0.5721628367900848,
"epoch": 0.47102803738317756,
"grad_norm": 0.03150559216737747,
"learning_rate": 0.0002,
"loss": 0.56056147813797,
"mean_token_accuracy": 0.7753510475158691,
"num_tokens": 2055485.0,
"step": 126
},
{
"entropy": 0.5526139587163925,
"epoch": 0.4747663551401869,
"grad_norm": 0.02876976877450943,
"learning_rate": 0.0002,
"loss": 0.555187463760376,
"mean_token_accuracy": 0.7740543335676193,
"num_tokens": 2071792.0,
"step": 127
},
{
"entropy": 0.542378157377243,
"epoch": 0.4785046728971963,
"grad_norm": 0.03460092097520828,
"learning_rate": 0.0002,
"loss": 0.5530366897583008,
"mean_token_accuracy": 0.7747022658586502,
"num_tokens": 2087874.0,
"step": 128
},
{
"entropy": 0.5451681464910507,
"epoch": 0.4822429906542056,
"grad_norm": 0.02991570346057415,
"learning_rate": 0.0002,
"loss": 0.549987256526947,
"mean_token_accuracy": 0.7774564474821091,
"num_tokens": 2104238.0,
"step": 129
},
{
"entropy": 0.5554285645484924,
"epoch": 0.48598130841121495,
"grad_norm": 0.0326702855527401,
"learning_rate": 0.0002,
"loss": 0.5605641603469849,
"mean_token_accuracy": 0.7726142853498459,
"num_tokens": 2120477.0,
"step": 130
},
{
"entropy": 0.555129811167717,
"epoch": 0.4897196261682243,
"grad_norm": 0.031020283699035645,
"learning_rate": 0.0002,
"loss": 0.5525497198104858,
"mean_token_accuracy": 0.7749627828598022,
"num_tokens": 2136857.0,
"step": 131
},
{
"entropy": 0.5660799294710159,
"epoch": 0.4934579439252336,
"grad_norm": 0.03083673305809498,
"learning_rate": 0.0002,
"loss": 0.5555440187454224,
"mean_token_accuracy": 0.7719593346118927,
"num_tokens": 2153526.0,
"step": 132
},
{
"entropy": 0.5561708807945251,
"epoch": 0.497196261682243,
"grad_norm": 0.031476520001888275,
"learning_rate": 0.0002,
"loss": 0.555605411529541,
"mean_token_accuracy": 0.7762354910373688,
"num_tokens": 2169651.0,
"step": 133
},
{
"entropy": 0.525283932685852,
"epoch": 0.5009345794392523,
"grad_norm": 0.03160262852907181,
"learning_rate": 0.0002,
"loss": 0.5320227742195129,
"mean_token_accuracy": 0.7818241119384766,
"num_tokens": 2185700.0,
"step": 134
},
{
"entropy": 0.5597178190946579,
"epoch": 0.5046728971962616,
"grad_norm": 0.03169814869761467,
"learning_rate": 0.0002,
"loss": 0.5603609681129456,
"mean_token_accuracy": 0.7734936475753784,
"num_tokens": 2201832.0,
"step": 135
},
{
"entropy": 0.5660498142242432,
"epoch": 0.508411214953271,
"grad_norm": 0.03322802484035492,
"learning_rate": 0.0002,
"loss": 0.570435643196106,
"mean_token_accuracy": 0.7702528983354568,
"num_tokens": 2218197.0,
"step": 136
},
{
"entropy": 0.5471976101398468,
"epoch": 0.5121495327102804,
"grad_norm": 0.031250759959220886,
"learning_rate": 0.0002,
"loss": 0.5555264353752136,
"mean_token_accuracy": 0.7744151949882507,
"num_tokens": 2234366.0,
"step": 137
},
{
"entropy": 0.5514054894447327,
"epoch": 0.5158878504672897,
"grad_norm": 0.026281429454684258,
"learning_rate": 0.0002,
"loss": 0.5531660318374634,
"mean_token_accuracy": 0.7755394726991653,
"num_tokens": 2250665.0,
"step": 138
},
{
"entropy": 0.5651220381259918,
"epoch": 0.5196261682242991,
"grad_norm": 0.031022025272250175,
"learning_rate": 0.0002,
"loss": 0.564669132232666,
"mean_token_accuracy": 0.773309201002121,
"num_tokens": 2266978.0,
"step": 139
},
{
"entropy": 0.5677877366542816,
"epoch": 0.5233644859813084,
"grad_norm": 0.030657587572932243,
"learning_rate": 0.0002,
"loss": 0.564283013343811,
"mean_token_accuracy": 0.7711436003446579,
"num_tokens": 2283321.0,
"step": 140
},
{
"entropy": 0.5454884767532349,
"epoch": 0.5271028037383177,
"grad_norm": 0.029621724039316177,
"learning_rate": 0.0002,
"loss": 0.5448048710823059,
"mean_token_accuracy": 0.7774412333965302,
"num_tokens": 2299654.0,
"step": 141
},
{
"entropy": 0.5593066215515137,
"epoch": 0.5308411214953271,
"grad_norm": 0.03370071202516556,
"learning_rate": 0.0002,
"loss": 0.5656630992889404,
"mean_token_accuracy": 0.7700357884168625,
"num_tokens": 2315917.0,
"step": 142
},
{
"entropy": 0.5630017071962357,
"epoch": 0.5345794392523364,
"grad_norm": 0.03445977345108986,
"learning_rate": 0.0002,
"loss": 0.5749462842941284,
"mean_token_accuracy": 0.7682285755872726,
"num_tokens": 2332053.0,
"step": 143
},
{
"entropy": 0.5692644715309143,
"epoch": 0.5383177570093458,
"grad_norm": 0.034105394035577774,
"learning_rate": 0.0002,
"loss": 0.5713233351707458,
"mean_token_accuracy": 0.7670455425977707,
"num_tokens": 2348321.0,
"step": 144
},
{
"entropy": 0.5742600113153458,
"epoch": 0.5420560747663551,
"grad_norm": 0.031007220968604088,
"learning_rate": 0.0002,
"loss": 0.571353554725647,
"mean_token_accuracy": 0.76962810754776,
"num_tokens": 2364386.0,
"step": 145
},
{
"entropy": 0.5725259482860565,
"epoch": 0.5457943925233645,
"grad_norm": 0.030071116983890533,
"learning_rate": 0.0002,
"loss": 0.5640747547149658,
"mean_token_accuracy": 0.7740518748760223,
"num_tokens": 2380815.0,
"step": 146
},
{
"entropy": 0.5748542249202728,
"epoch": 0.5495327102803739,
"grad_norm": 0.03353971987962723,
"learning_rate": 0.0002,
"loss": 0.5691145062446594,
"mean_token_accuracy": 0.7703811824321747,
"num_tokens": 2396915.0,
"step": 147
},
{
"entropy": 0.5501144975423813,
"epoch": 0.5532710280373832,
"grad_norm": 0.029002781957387924,
"learning_rate": 0.0002,
"loss": 0.5473450422286987,
"mean_token_accuracy": 0.7768280953168869,
"num_tokens": 2412894.0,
"step": 148
},
{
"entropy": 0.5640593320131302,
"epoch": 0.5570093457943925,
"grad_norm": 0.0339277982711792,
"learning_rate": 0.0002,
"loss": 0.568105936050415,
"mean_token_accuracy": 0.7686444222927094,
"num_tokens": 2429333.0,
"step": 149
},
{
"entropy": 0.5358926355838776,
"epoch": 0.5607476635514018,
"grad_norm": 0.03321727365255356,
"learning_rate": 0.0002,
"loss": 0.5451691150665283,
"mean_token_accuracy": 0.7813747376203537,
"num_tokens": 2445547.0,
"step": 150
},
{
"entropy": 0.575822114944458,
"epoch": 0.5644859813084112,
"grad_norm": 0.028913335874676704,
"learning_rate": 0.0002,
"loss": 0.5796110033988953,
"mean_token_accuracy": 0.7663715481758118,
"num_tokens": 2461739.0,
"step": 151
},
{
"entropy": 0.5666410624980927,
"epoch": 0.5682242990654206,
"grad_norm": 0.030346350744366646,
"learning_rate": 0.0002,
"loss": 0.5563742518424988,
"mean_token_accuracy": 0.7750760018825531,
"num_tokens": 2478290.0,
"step": 152
},
{
"entropy": 0.5700524747371674,
"epoch": 0.5719626168224299,
"grad_norm": 0.03455440327525139,
"learning_rate": 0.0002,
"loss": 0.5611424446105957,
"mean_token_accuracy": 0.7719277888536453,
"num_tokens": 2494845.0,
"step": 153
},
{
"entropy": 0.561910405755043,
"epoch": 0.5757009345794393,
"grad_norm": 0.029596278443932533,
"learning_rate": 0.0002,
"loss": 0.5637333393096924,
"mean_token_accuracy": 0.771451935172081,
"num_tokens": 2511497.0,
"step": 154
},
{
"entropy": 0.5496856719255447,
"epoch": 0.5794392523364486,
"grad_norm": 0.02896132506430149,
"learning_rate": 0.0002,
"loss": 0.5627070665359497,
"mean_token_accuracy": 0.7726458758115768,
"num_tokens": 2527582.0,
"step": 155
},
{
"entropy": 0.5563309341669083,
"epoch": 0.5831775700934579,
"grad_norm": 0.04145891219377518,
"learning_rate": 0.0002,
"loss": 0.5785839557647705,
"mean_token_accuracy": 0.7629837244749069,
"num_tokens": 2543948.0,
"step": 156
},
{
"entropy": 0.5635025650262833,
"epoch": 0.5869158878504673,
"grad_norm": 0.028125908225774765,
"learning_rate": 0.0002,
"loss": 0.5688048005104065,
"mean_token_accuracy": 0.7708674967288971,
"num_tokens": 2560174.0,
"step": 157
},
{
"entropy": 0.5650362074375153,
"epoch": 0.5906542056074766,
"grad_norm": 0.031838495284318924,
"learning_rate": 0.0002,
"loss": 0.5594847798347473,
"mean_token_accuracy": 0.7728245556354523,
"num_tokens": 2576418.0,
"step": 158
},
{
"entropy": 0.5560010820627213,
"epoch": 0.594392523364486,
"grad_norm": 0.03514372557401657,
"learning_rate": 0.0002,
"loss": 0.5445454120635986,
"mean_token_accuracy": 0.7787751257419586,
"num_tokens": 2592454.0,
"step": 159
},
{
"entropy": 0.552829384803772,
"epoch": 0.5981308411214953,
"grad_norm": 0.028390226885676384,
"learning_rate": 0.0002,
"loss": 0.5493785738945007,
"mean_token_accuracy": 0.7761707901954651,
"num_tokens": 2608586.0,
"step": 160
},
{
"entropy": 0.5553926527500153,
"epoch": 0.6018691588785047,
"grad_norm": 0.02847958728671074,
"learning_rate": 0.0002,
"loss": 0.5555365681648254,
"mean_token_accuracy": 0.7766669541597366,
"num_tokens": 2624962.0,
"step": 161
},
{
"entropy": 0.551996037364006,
"epoch": 0.6056074766355141,
"grad_norm": 0.03402937948703766,
"learning_rate": 0.0002,
"loss": 0.557694673538208,
"mean_token_accuracy": 0.7744593769311905,
"num_tokens": 2641382.0,
"step": 162
},
{
"entropy": 0.5671762228012085,
"epoch": 0.6093457943925233,
"grad_norm": 0.03495490923523903,
"learning_rate": 0.0002,
"loss": 0.5758394002914429,
"mean_token_accuracy": 0.7660740315914154,
"num_tokens": 2657986.0,
"step": 163
},
{
"entropy": 0.5575901418924332,
"epoch": 0.6130841121495327,
"grad_norm": 0.03418085724115372,
"learning_rate": 0.0002,
"loss": 0.5583428740501404,
"mean_token_accuracy": 0.7739714235067368,
"num_tokens": 2673995.0,
"step": 164
},
{
"entropy": 0.5644998699426651,
"epoch": 0.616822429906542,
"grad_norm": 0.028694115579128265,
"learning_rate": 0.0002,
"loss": 0.5556347370147705,
"mean_token_accuracy": 0.775534600019455,
"num_tokens": 2690249.0,
"step": 165
},
{
"entropy": 0.5767987668514252,
"epoch": 0.6205607476635514,
"grad_norm": 0.03323300555348396,
"learning_rate": 0.0002,
"loss": 0.5688591003417969,
"mean_token_accuracy": 0.7711433321237564,
"num_tokens": 2706818.0,
"step": 166
},
{
"entropy": 0.5557750165462494,
"epoch": 0.6242990654205608,
"grad_norm": 0.030084028840065002,
"learning_rate": 0.0002,
"loss": 0.5595380067825317,
"mean_token_accuracy": 0.7722294181585312,
"num_tokens": 2722820.0,
"step": 167
},
{
"entropy": 0.562026247382164,
"epoch": 0.6280373831775701,
"grad_norm": 0.03125706687569618,
"learning_rate": 0.0002,
"loss": 0.5637321472167969,
"mean_token_accuracy": 0.7692414969205856,
"num_tokens": 2739398.0,
"step": 168
},
{
"entropy": 0.5448627471923828,
"epoch": 0.6317757009345795,
"grad_norm": 0.03390555456280708,
"learning_rate": 0.0002,
"loss": 0.5494401454925537,
"mean_token_accuracy": 0.7776045203208923,
"num_tokens": 2755453.0,
"step": 169
},
{
"entropy": 0.5523964762687683,
"epoch": 0.6355140186915887,
"grad_norm": 0.03687772527337074,
"learning_rate": 0.0002,
"loss": 0.5620272159576416,
"mean_token_accuracy": 0.7718589901924133,
"num_tokens": 2771533.0,
"step": 170
},
{
"entropy": 0.5672519207000732,
"epoch": 0.6392523364485981,
"grad_norm": 0.035152945667505264,
"learning_rate": 0.0002,
"loss": 0.5725542306900024,
"mean_token_accuracy": 0.768815353512764,
"num_tokens": 2787816.0,
"step": 171
},
{
"entropy": 0.5715326368808746,
"epoch": 0.6429906542056075,
"grad_norm": 0.032671887427568436,
"learning_rate": 0.0002,
"loss": 0.5690709352493286,
"mean_token_accuracy": 0.7705206274986267,
"num_tokens": 2804253.0,
"step": 172
},
{
"entropy": 0.5771492570638657,
"epoch": 0.6467289719626168,
"grad_norm": 0.03344012424349785,
"learning_rate": 0.0002,
"loss": 0.5672138929367065,
"mean_token_accuracy": 0.7719729393720627,
"num_tokens": 2820473.0,
"step": 173
},
{
"entropy": 0.5444837659597397,
"epoch": 0.6504672897196262,
"grad_norm": 0.029676884412765503,
"learning_rate": 0.0002,
"loss": 0.5400466322898865,
"mean_token_accuracy": 0.7845920622348785,
"num_tokens": 2836738.0,
"step": 174
},
{
"entropy": 0.5679149776697159,
"epoch": 0.6542056074766355,
"grad_norm": 0.03190155327320099,
"learning_rate": 0.0002,
"loss": 0.5703109502792358,
"mean_token_accuracy": 0.7677883356809616,
"num_tokens": 2853015.0,
"step": 175
},
{
"entropy": 0.5386882424354553,
"epoch": 0.6579439252336449,
"grad_norm": 0.03156553953886032,
"learning_rate": 0.0002,
"loss": 0.5451309680938721,
"mean_token_accuracy": 0.7785861194133759,
"num_tokens": 2869326.0,
"step": 176
},
{
"entropy": 0.5546389669179916,
"epoch": 0.6616822429906543,
"grad_norm": 0.03298742696642876,
"learning_rate": 0.0002,
"loss": 0.5598126649856567,
"mean_token_accuracy": 0.7714642137289047,
"num_tokens": 2885638.0,
"step": 177
},
{
"entropy": 0.5554563403129578,
"epoch": 0.6654205607476635,
"grad_norm": 0.034988123923540115,
"learning_rate": 0.0002,
"loss": 0.5639896392822266,
"mean_token_accuracy": 0.7712263017892838,
"num_tokens": 2902116.0,
"step": 178
},
{
"entropy": 0.5492645055055618,
"epoch": 0.6691588785046729,
"grad_norm": 0.03213873505592346,
"learning_rate": 0.0002,
"loss": 0.5490330457687378,
"mean_token_accuracy": 0.7778918445110321,
"num_tokens": 2918514.0,
"step": 179
},
{
"entropy": 0.5809471905231476,
"epoch": 0.6728971962616822,
"grad_norm": 0.02829456329345703,
"learning_rate": 0.0002,
"loss": 0.5780236721038818,
"mean_token_accuracy": 0.7631959617137909,
"num_tokens": 2935180.0,
"step": 180
},
{
"entropy": 0.5545472204685211,
"epoch": 0.6766355140186916,
"grad_norm": 0.026784643530845642,
"learning_rate": 0.0002,
"loss": 0.5539122819900513,
"mean_token_accuracy": 0.7744273245334625,
"num_tokens": 2951485.0,
"step": 181
},
{
"entropy": 0.5583300441503525,
"epoch": 0.680373831775701,
"grad_norm": 0.028181226924061775,
"learning_rate": 0.0002,
"loss": 0.5567899942398071,
"mean_token_accuracy": 0.7753158956766129,
"num_tokens": 2967799.0,
"step": 182
},
{
"entropy": 0.5597800463438034,
"epoch": 0.6841121495327103,
"grad_norm": 0.027700597420334816,
"learning_rate": 0.0002,
"loss": 0.559861958026886,
"mean_token_accuracy": 0.772071048617363,
"num_tokens": 2984240.0,
"step": 183
},
{
"entropy": 0.5409596711397171,
"epoch": 0.6878504672897197,
"grad_norm": 0.030223077163100243,
"learning_rate": 0.0002,
"loss": 0.5486294031143188,
"mean_token_accuracy": 0.7773659527301788,
"num_tokens": 3000681.0,
"step": 184
},
{
"entropy": 0.5551634728908539,
"epoch": 0.6915887850467289,
"grad_norm": 0.02896454744040966,
"learning_rate": 0.0002,
"loss": 0.5600041151046753,
"mean_token_accuracy": 0.7721187323331833,
"num_tokens": 3017042.0,
"step": 185
},
{
"entropy": 0.5551397949457169,
"epoch": 0.6953271028037383,
"grad_norm": 0.02665393240749836,
"learning_rate": 0.0002,
"loss": 0.556494414806366,
"mean_token_accuracy": 0.7747326493263245,
"num_tokens": 3033356.0,
"step": 186
},
{
"entropy": 0.5497598797082901,
"epoch": 0.6990654205607477,
"grad_norm": 0.026862069964408875,
"learning_rate": 0.0002,
"loss": 0.5495949983596802,
"mean_token_accuracy": 0.7788131833076477,
"num_tokens": 3049609.0,
"step": 187
},
{
"entropy": 0.5756572186946869,
"epoch": 0.702803738317757,
"grad_norm": 0.028672486543655396,
"learning_rate": 0.0002,
"loss": 0.5735815763473511,
"mean_token_accuracy": 0.7667711675167084,
"num_tokens": 3065873.0,
"step": 188
},
{
"entropy": 0.560253381729126,
"epoch": 0.7065420560747664,
"grad_norm": 0.029232166707515717,
"learning_rate": 0.0002,
"loss": 0.5650488138198853,
"mean_token_accuracy": 0.768238291144371,
"num_tokens": 3081904.0,
"step": 189
},
{
"entropy": 0.5659812092781067,
"epoch": 0.7102803738317757,
"grad_norm": 0.028001444414258003,
"learning_rate": 0.0002,
"loss": 0.563786506652832,
"mean_token_accuracy": 0.7705834209918976,
"num_tokens": 3098208.0,
"step": 190
},
{
"entropy": 0.5397079735994339,
"epoch": 0.7140186915887851,
"grad_norm": 0.030035637319087982,
"learning_rate": 0.0002,
"loss": 0.5431380271911621,
"mean_token_accuracy": 0.7773479521274567,
"num_tokens": 3114448.0,
"step": 191
},
{
"entropy": 0.5607352703809738,
"epoch": 0.7177570093457943,
"grad_norm": 0.026054881513118744,
"learning_rate": 0.0002,
"loss": 0.5583080649375916,
"mean_token_accuracy": 0.7758101969957352,
"num_tokens": 3130755.0,
"step": 192
},
{
"entropy": 0.551689624786377,
"epoch": 0.7214953271028037,
"grad_norm": 0.02845809981226921,
"learning_rate": 0.0002,
"loss": 0.5481313467025757,
"mean_token_accuracy": 0.7777986079454422,
"num_tokens": 3147133.0,
"step": 193
},
{
"entropy": 0.5639677792787552,
"epoch": 0.7252336448598131,
"grad_norm": 0.029969094321131706,
"learning_rate": 0.0002,
"loss": 0.5681430697441101,
"mean_token_accuracy": 0.7705964744091034,
"num_tokens": 3163582.0,
"step": 194
},
{
"entropy": 0.5548544675111771,
"epoch": 0.7289719626168224,
"grad_norm": 0.026430293917655945,
"learning_rate": 0.0002,
"loss": 0.5528862476348877,
"mean_token_accuracy": 0.7741632461547852,
"num_tokens": 3180102.0,
"step": 195
},
{
"entropy": 0.5530348271131516,
"epoch": 0.7327102803738318,
"grad_norm": 0.026484189555048943,
"learning_rate": 0.0002,
"loss": 0.5540847778320312,
"mean_token_accuracy": 0.7735424339771271,
"num_tokens": 3196312.0,
"step": 196
},
{
"entropy": 0.5409010052680969,
"epoch": 0.7364485981308411,
"grad_norm": 0.030766047537326813,
"learning_rate": 0.0002,
"loss": 0.5487144589424133,
"mean_token_accuracy": 0.7778207361698151,
"num_tokens": 3212408.0,
"step": 197
},
{
"entropy": 0.5607801675796509,
"epoch": 0.7401869158878505,
"grad_norm": 0.029135972261428833,
"learning_rate": 0.0002,
"loss": 0.5579065680503845,
"mean_token_accuracy": 0.7756243348121643,
"num_tokens": 3228688.0,
"step": 198
},
{
"entropy": 0.5638224929571152,
"epoch": 0.7439252336448599,
"grad_norm": 0.028466643765568733,
"learning_rate": 0.0002,
"loss": 0.5634393095970154,
"mean_token_accuracy": 0.770130917429924,
"num_tokens": 3244856.0,
"step": 199
},
{
"entropy": 0.5390120446681976,
"epoch": 0.7476635514018691,
"grad_norm": 0.029409240931272507,
"learning_rate": 0.0002,
"loss": 0.5443782210350037,
"mean_token_accuracy": 0.7796739190816879,
"num_tokens": 3261004.0,
"step": 200
},
{
"entropy": 0.5513757616281509,
"epoch": 0.7514018691588785,
"grad_norm": 0.032466452568769455,
"learning_rate": 0.0002,
"loss": 0.5502808690071106,
"mean_token_accuracy": 0.7751527577638626,
"num_tokens": 3277310.0,
"step": 201
},
{
"entropy": 0.5808768719434738,
"epoch": 0.7551401869158878,
"grad_norm": 0.02947174198925495,
"learning_rate": 0.0002,
"loss": 0.5795295238494873,
"mean_token_accuracy": 0.7640405744314194,
"num_tokens": 3293719.0,
"step": 202
},
{
"entropy": 0.5713460445404053,
"epoch": 0.7588785046728972,
"grad_norm": 0.02874363400042057,
"learning_rate": 0.0002,
"loss": 0.5726850032806396,
"mean_token_accuracy": 0.7662371546030045,
"num_tokens": 3310262.0,
"step": 203
},
{
"entropy": 0.5619738698005676,
"epoch": 0.7626168224299066,
"grad_norm": 0.028361184522509575,
"learning_rate": 0.0002,
"loss": 0.5660584568977356,
"mean_token_accuracy": 0.7703312337398529,
"num_tokens": 3326670.0,
"step": 204
},
{
"entropy": 0.5531926304101944,
"epoch": 0.7663551401869159,
"grad_norm": 0.029734794050455093,
"learning_rate": 0.0002,
"loss": 0.5551853775978088,
"mean_token_accuracy": 0.7757412046194077,
"num_tokens": 3343182.0,
"step": 205
},
{
"entropy": 0.5436140149831772,
"epoch": 0.7700934579439253,
"grad_norm": 0.027612119913101196,
"learning_rate": 0.0002,
"loss": 0.5460025668144226,
"mean_token_accuracy": 0.7787571996450424,
"num_tokens": 3359734.0,
"step": 206
},
{
"entropy": 0.5484267920255661,
"epoch": 0.7738317757009345,
"grad_norm": 0.0273665152490139,
"learning_rate": 0.0002,
"loss": 0.5512120723724365,
"mean_token_accuracy": 0.7762885689735413,
"num_tokens": 3375965.0,
"step": 207
},
{
"entropy": 0.5604408234357834,
"epoch": 0.7775700934579439,
"grad_norm": 0.03310655429959297,
"learning_rate": 0.0002,
"loss": 0.5644571185112,
"mean_token_accuracy": 0.7733126729726791,
"num_tokens": 3392102.0,
"step": 208
},
{
"entropy": 0.5418381690979004,
"epoch": 0.7813084112149533,
"grad_norm": 0.03232184052467346,
"learning_rate": 0.0002,
"loss": 0.5521958470344543,
"mean_token_accuracy": 0.7741148620843887,
"num_tokens": 3408306.0,
"step": 209
},
{
"entropy": 0.5678922086954117,
"epoch": 0.7850467289719626,
"grad_norm": 0.02696731500327587,
"learning_rate": 0.0002,
"loss": 0.5638433694839478,
"mean_token_accuracy": 0.7702384293079376,
"num_tokens": 3424846.0,
"step": 210
},
{
"entropy": 0.5885234028100967,
"epoch": 0.788785046728972,
"grad_norm": 0.032732248306274414,
"learning_rate": 0.0002,
"loss": 0.5857526659965515,
"mean_token_accuracy": 0.7618716955184937,
"num_tokens": 3441315.0,
"step": 211
},
{
"entropy": 0.5481836199760437,
"epoch": 0.7925233644859813,
"grad_norm": 0.03158198669552803,
"learning_rate": 0.0002,
"loss": 0.5456998348236084,
"mean_token_accuracy": 0.7771993726491928,
"num_tokens": 3457579.0,
"step": 212
},
{
"entropy": 0.5607763081789017,
"epoch": 0.7962616822429907,
"grad_norm": 0.03416353091597557,
"learning_rate": 0.0002,
"loss": 0.5663735270500183,
"mean_token_accuracy": 0.7718233168125153,
"num_tokens": 3474205.0,
"step": 213
},
{
"entropy": 0.5533930957317352,
"epoch": 0.8,
"grad_norm": 0.02877282351255417,
"learning_rate": 0.0002,
"loss": 0.5556164383888245,
"mean_token_accuracy": 0.7742215096950531,
"num_tokens": 3490438.0,
"step": 214
},
{
"entropy": 0.5604168176651001,
"epoch": 0.8037383177570093,
"grad_norm": 0.026928121224045753,
"learning_rate": 0.0002,
"loss": 0.5551791191101074,
"mean_token_accuracy": 0.77230204641819,
"num_tokens": 3506851.0,
"step": 215
},
{
"entropy": 0.5647037774324417,
"epoch": 0.8074766355140187,
"grad_norm": 0.03445446118712425,
"learning_rate": 0.0002,
"loss": 0.5678783655166626,
"mean_token_accuracy": 0.7699416279792786,
"num_tokens": 3523043.0,
"step": 216
},
{
"entropy": 0.571955680847168,
"epoch": 0.811214953271028,
"grad_norm": 0.028322864323854446,
"learning_rate": 0.0002,
"loss": 0.5738518238067627,
"mean_token_accuracy": 0.7654245793819427,
"num_tokens": 3539365.0,
"step": 217
},
{
"entropy": 0.5523362904787064,
"epoch": 0.8149532710280374,
"grad_norm": 0.033752068877220154,
"learning_rate": 0.0002,
"loss": 0.5535821914672852,
"mean_token_accuracy": 0.7761557102203369,
"num_tokens": 3555412.0,
"step": 218
},
{
"entropy": 0.5571073293685913,
"epoch": 0.8186915887850468,
"grad_norm": 0.03274444863200188,
"learning_rate": 0.0002,
"loss": 0.5591251850128174,
"mean_token_accuracy": 0.7738742381334305,
"num_tokens": 3571607.0,
"step": 219
},
{
"entropy": 0.5460310876369476,
"epoch": 0.822429906542056,
"grad_norm": 0.03267780691385269,
"learning_rate": 0.0002,
"loss": 0.5483282208442688,
"mean_token_accuracy": 0.774459958076477,
"num_tokens": 3588112.0,
"step": 220
},
{
"entropy": 0.5458645969629288,
"epoch": 0.8261682242990654,
"grad_norm": 0.029655037447810173,
"learning_rate": 0.0002,
"loss": 0.553710401058197,
"mean_token_accuracy": 0.7749865502119064,
"num_tokens": 3604422.0,
"step": 221
},
{
"entropy": 0.5589277297258377,
"epoch": 0.8299065420560747,
"grad_norm": 0.0299095269292593,
"learning_rate": 0.0002,
"loss": 0.5621532201766968,
"mean_token_accuracy": 0.7721328884363174,
"num_tokens": 3620586.0,
"step": 222
},
{
"entropy": 0.5576933324337006,
"epoch": 0.8336448598130841,
"grad_norm": 0.031302373856306076,
"learning_rate": 0.0002,
"loss": 0.5637439489364624,
"mean_token_accuracy": 0.7706159353256226,
"num_tokens": 3636859.0,
"step": 223
},
{
"entropy": 0.5583267956972122,
"epoch": 0.8373831775700935,
"grad_norm": 0.02684536948800087,
"learning_rate": 0.0002,
"loss": 0.5605804920196533,
"mean_token_accuracy": 0.7703929096460342,
"num_tokens": 3653154.0,
"step": 224
},
{
"entropy": 0.5555603951215744,
"epoch": 0.8411214953271028,
"grad_norm": 0.025324055925011635,
"learning_rate": 0.0002,
"loss": 0.5553929805755615,
"mean_token_accuracy": 0.773400217294693,
"num_tokens": 3669474.0,
"step": 225
},
{
"entropy": 0.5502129048109055,
"epoch": 0.8448598130841122,
"grad_norm": 0.03151983022689819,
"learning_rate": 0.0002,
"loss": 0.5402862429618835,
"mean_token_accuracy": 0.7839637249708176,
"num_tokens": 3685885.0,
"step": 226
},
{
"entropy": 0.5631079375743866,
"epoch": 0.8485981308411215,
"grad_norm": 0.026639366522431374,
"learning_rate": 0.0002,
"loss": 0.5603518486022949,
"mean_token_accuracy": 0.7707885354757309,
"num_tokens": 3702475.0,
"step": 227
},
{
"entropy": 0.5576464682817459,
"epoch": 0.8523364485981308,
"grad_norm": 0.028526777401566505,
"learning_rate": 0.0002,
"loss": 0.5615932941436768,
"mean_token_accuracy": 0.7698924392461777,
"num_tokens": 3718675.0,
"step": 228
},
{
"entropy": 0.5553766041994095,
"epoch": 0.8560747663551402,
"grad_norm": 0.028387868776917458,
"learning_rate": 0.0002,
"loss": 0.5598117709159851,
"mean_token_accuracy": 0.7748202681541443,
"num_tokens": 3734973.0,
"step": 229
},
{
"entropy": 0.5636192113161087,
"epoch": 0.8598130841121495,
"grad_norm": 0.029663704335689545,
"learning_rate": 0.0002,
"loss": 0.5619429349899292,
"mean_token_accuracy": 0.7697723060846329,
"num_tokens": 3751197.0,
"step": 230
},
{
"entropy": 0.5656130164861679,
"epoch": 0.8635514018691589,
"grad_norm": 0.027196481823921204,
"learning_rate": 0.0002,
"loss": 0.559482753276825,
"mean_token_accuracy": 0.7736194878816605,
"num_tokens": 3767681.0,
"step": 231
},
{
"entropy": 0.5610507130622864,
"epoch": 0.8672897196261682,
"grad_norm": 0.02665848098695278,
"learning_rate": 0.0002,
"loss": 0.5574455857276917,
"mean_token_accuracy": 0.7723447382450104,
"num_tokens": 3784223.0,
"step": 232
},
{
"entropy": 0.5565789192914963,
"epoch": 0.8710280373831776,
"grad_norm": 0.029676776379346848,
"learning_rate": 0.0002,
"loss": 0.5581963062286377,
"mean_token_accuracy": 0.7723328024148941,
"num_tokens": 3800606.0,
"step": 233
},
{
"entropy": 0.5488535314798355,
"epoch": 0.874766355140187,
"grad_norm": 0.026432445272803307,
"learning_rate": 0.0002,
"loss": 0.5548264384269714,
"mean_token_accuracy": 0.776095449924469,
"num_tokens": 3817211.0,
"step": 234
},
{
"entropy": 0.5432089567184448,
"epoch": 0.8785046728971962,
"grad_norm": 0.028454309329390526,
"learning_rate": 0.0002,
"loss": 0.5551573038101196,
"mean_token_accuracy": 0.7737965285778046,
"num_tokens": 3833562.0,
"step": 235
},
{
"entropy": 0.5564523041248322,
"epoch": 0.8822429906542056,
"grad_norm": 0.03045317530632019,
"learning_rate": 0.0002,
"loss": 0.5593273043632507,
"mean_token_accuracy": 0.7728880196809769,
"num_tokens": 3849716.0,
"step": 236
},
{
"entropy": 0.5449672043323517,
"epoch": 0.8859813084112149,
"grad_norm": 0.026425793766975403,
"learning_rate": 0.0002,
"loss": 0.5469970107078552,
"mean_token_accuracy": 0.777935191988945,
"num_tokens": 3865915.0,
"step": 237
},
{
"entropy": 0.5773142129182816,
"epoch": 0.8897196261682243,
"grad_norm": 0.024763669818639755,
"learning_rate": 0.0002,
"loss": 0.5751665830612183,
"mean_token_accuracy": 0.7665848284959793,
"num_tokens": 3882374.0,
"step": 238
},
{
"entropy": 0.5337313264608383,
"epoch": 0.8934579439252337,
"grad_norm": 0.027221228927373886,
"learning_rate": 0.0002,
"loss": 0.5295661687850952,
"mean_token_accuracy": 0.7860913276672363,
"num_tokens": 3898501.0,
"step": 239
},
{
"entropy": 0.5395989120006561,
"epoch": 0.897196261682243,
"grad_norm": 0.026916388422250748,
"learning_rate": 0.0002,
"loss": 0.5377291440963745,
"mean_token_accuracy": 0.7827803045511246,
"num_tokens": 3914802.0,
"step": 240
},
{
"entropy": 0.56096251308918,
"epoch": 0.9009345794392524,
"grad_norm": 0.03178329020738602,
"learning_rate": 0.0002,
"loss": 0.5572348237037659,
"mean_token_accuracy": 0.774958074092865,
"num_tokens": 3931307.0,
"step": 241
},
{
"entropy": 0.5351977944374084,
"epoch": 0.9046728971962616,
"grad_norm": 0.027758494019508362,
"learning_rate": 0.0002,
"loss": 0.5389144420623779,
"mean_token_accuracy": 0.7842132151126862,
"num_tokens": 3947818.0,
"step": 242
},
{
"entropy": 0.5689495801925659,
"epoch": 0.908411214953271,
"grad_norm": 0.028313076123595238,
"learning_rate": 0.0002,
"loss": 0.5732687711715698,
"mean_token_accuracy": 0.7685291916131973,
"num_tokens": 3964238.0,
"step": 243
},
{
"entropy": 0.5562418401241302,
"epoch": 0.9121495327102803,
"grad_norm": 0.028738385066390038,
"learning_rate": 0.0002,
"loss": 0.5559317469596863,
"mean_token_accuracy": 0.7747041881084442,
"num_tokens": 3980625.0,
"step": 244
},
{
"entropy": 0.5630334913730621,
"epoch": 0.9158878504672897,
"grad_norm": 0.024547314271330833,
"learning_rate": 0.0002,
"loss": 0.560680627822876,
"mean_token_accuracy": 0.7717334777116776,
"num_tokens": 3997248.0,
"step": 245
},
{
"entropy": 0.5409311354160309,
"epoch": 0.9196261682242991,
"grad_norm": 0.029392484575510025,
"learning_rate": 0.0002,
"loss": 0.5488813519477844,
"mean_token_accuracy": 0.7771373838186264,
"num_tokens": 4013356.0,
"step": 246
},
{
"entropy": 0.5529599785804749,
"epoch": 0.9233644859813084,
"grad_norm": 0.024964116513729095,
"learning_rate": 0.0002,
"loss": 0.5492331385612488,
"mean_token_accuracy": 0.778782069683075,
"num_tokens": 4029521.0,
"step": 247
},
{
"entropy": 0.5397895872592926,
"epoch": 0.9271028037383178,
"grad_norm": 0.026621561497449875,
"learning_rate": 0.0002,
"loss": 0.5443588495254517,
"mean_token_accuracy": 0.7782554626464844,
"num_tokens": 4045913.0,
"step": 248
},
{
"entropy": 0.5582248121500015,
"epoch": 0.930841121495327,
"grad_norm": 0.02803446725010872,
"learning_rate": 0.0002,
"loss": 0.5627061128616333,
"mean_token_accuracy": 0.7742072343826294,
"num_tokens": 4062448.0,
"step": 249
},
{
"entropy": 0.5673990696668625,
"epoch": 0.9345794392523364,
"grad_norm": 0.03014424815773964,
"learning_rate": 0.0002,
"loss": 0.5727946162223816,
"mean_token_accuracy": 0.7685662358999252,
"num_tokens": 4078711.0,
"step": 250
},
{
"entropy": 0.566023588180542,
"epoch": 0.9383177570093458,
"grad_norm": 0.030524935573339462,
"learning_rate": 0.0002,
"loss": 0.5595183372497559,
"mean_token_accuracy": 0.7738057672977448,
"num_tokens": 4095240.0,
"step": 251
},
{
"entropy": 0.5499134510755539,
"epoch": 0.9420560747663551,
"grad_norm": 0.02502668835222721,
"learning_rate": 0.0002,
"loss": 0.5446998476982117,
"mean_token_accuracy": 0.7789950519800186,
"num_tokens": 4111687.0,
"step": 252
},
{
"entropy": 0.5639411062002182,
"epoch": 0.9457943925233645,
"grad_norm": 0.03420841693878174,
"learning_rate": 0.0002,
"loss": 0.5659236311912537,
"mean_token_accuracy": 0.7703807950019836,
"num_tokens": 4128093.0,
"step": 253
},
{
"entropy": 0.5703455805778503,
"epoch": 0.9495327102803738,
"grad_norm": 0.0303607527166605,
"learning_rate": 0.0002,
"loss": 0.5696687698364258,
"mean_token_accuracy": 0.7690610140562057,
"num_tokens": 4144612.0,
"step": 254
},
{
"entropy": 0.558226928114891,
"epoch": 0.9532710280373832,
"grad_norm": 0.03168858587741852,
"learning_rate": 0.0002,
"loss": 0.5676078200340271,
"mean_token_accuracy": 0.7693912833929062,
"num_tokens": 4161169.0,
"step": 255
},
{
"entropy": 0.5530082136392593,
"epoch": 0.9570093457943926,
"grad_norm": 0.027083205059170723,
"learning_rate": 0.0002,
"loss": 0.5579201579093933,
"mean_token_accuracy": 0.772939920425415,
"num_tokens": 4177454.0,
"step": 256
},
{
"entropy": 0.5732781291007996,
"epoch": 0.9607476635514018,
"grad_norm": 0.025865184143185616,
"learning_rate": 0.0002,
"loss": 0.5745596289634705,
"mean_token_accuracy": 0.7667286545038223,
"num_tokens": 4193733.0,
"step": 257
},
{
"entropy": 0.5650701373815536,
"epoch": 0.9644859813084112,
"grad_norm": 0.03244631364941597,
"learning_rate": 0.0002,
"loss": 0.5617667436599731,
"mean_token_accuracy": 0.7715478390455246,
"num_tokens": 4209843.0,
"step": 258
},
{
"entropy": 0.5724828094244003,
"epoch": 0.9682242990654205,
"grad_norm": 0.02807115763425827,
"learning_rate": 0.0002,
"loss": 0.5692450404167175,
"mean_token_accuracy": 0.76779405772686,
"num_tokens": 4226262.0,
"step": 259
},
{
"entropy": 0.5677514672279358,
"epoch": 0.9719626168224299,
"grad_norm": 0.024189095944166183,
"learning_rate": 0.0002,
"loss": 0.5623309016227722,
"mean_token_accuracy": 0.7734705060720444,
"num_tokens": 4242877.0,
"step": 260
},
{
"entropy": 0.56018927693367,
"epoch": 0.9757009345794393,
"grad_norm": 0.030152512714266777,
"learning_rate": 0.0002,
"loss": 0.5675455927848816,
"mean_token_accuracy": 0.7673967182636261,
"num_tokens": 4259432.0,
"step": 261
},
{
"entropy": 0.5601605176925659,
"epoch": 0.9794392523364486,
"grad_norm": 0.0288025364279747,
"learning_rate": 0.0002,
"loss": 0.5698415040969849,
"mean_token_accuracy": 0.7686598151922226,
"num_tokens": 4275917.0,
"step": 262
},
{
"entropy": 0.5593424290418625,
"epoch": 0.983177570093458,
"grad_norm": 0.024790652096271515,
"learning_rate": 0.0002,
"loss": 0.5574150085449219,
"mean_token_accuracy": 0.7770240753889084,
"num_tokens": 4292310.0,
"step": 263
},
{
"entropy": 0.5394274890422821,
"epoch": 0.9869158878504672,
"grad_norm": 0.02477172389626503,
"learning_rate": 0.0002,
"loss": 0.5407758951187134,
"mean_token_accuracy": 0.780282586812973,
"num_tokens": 4308380.0,
"step": 264
},
{
"entropy": 0.5651121735572815,
"epoch": 0.9906542056074766,
"grad_norm": 0.028029976412653923,
"learning_rate": 0.0002,
"loss": 0.5648099184036255,
"mean_token_accuracy": 0.7703951746225357,
"num_tokens": 4324834.0,
"step": 265
},
{
"entropy": 0.5426322817802429,
"epoch": 0.994392523364486,
"grad_norm": 0.025631116703152657,
"learning_rate": 0.0002,
"loss": 0.5393193364143372,
"mean_token_accuracy": 0.7813181281089783,
"num_tokens": 4341233.0,
"step": 266
},
{
"entropy": 0.5464787781238556,
"epoch": 0.9981308411214953,
"grad_norm": 0.029863541945815086,
"learning_rate": 0.0002,
"loss": 0.5550025701522827,
"mean_token_accuracy": 0.7747247219085693,
"num_tokens": 4357682.0,
"step": 267
},
{
"entropy": 0.5607179999351501,
"epoch": 1.0,
"grad_norm": 0.03738218545913696,
"learning_rate": 0.0002,
"loss": 0.5586302876472473,
"mean_token_accuracy": 0.7706243097782135,
"num_tokens": 4364958.0,
"step": 268
},
{
"entropy": 0.5429188311100006,
"epoch": 1.0037383177570094,
"grad_norm": 0.031045127660036087,
"learning_rate": 0.0002,
"loss": 0.5379543900489807,
"mean_token_accuracy": 0.7818119078874588,
"num_tokens": 4381160.0,
"step": 269
},
{
"entropy": 0.5693697482347488,
"epoch": 1.0074766355140188,
"grad_norm": 0.034702617675065994,
"learning_rate": 0.0002,
"loss": 0.5631182789802551,
"mean_token_accuracy": 0.7740933299064636,
"num_tokens": 4397580.0,
"step": 270
},
{
"entropy": 0.5556007027626038,
"epoch": 1.011214953271028,
"grad_norm": 0.029613088816404343,
"learning_rate": 0.0002,
"loss": 0.5564326643943787,
"mean_token_accuracy": 0.7747503072023392,
"num_tokens": 4413970.0,
"step": 271
},
{
"entropy": 0.5529852658510208,
"epoch": 1.0149532710280373,
"grad_norm": 0.028977181762456894,
"learning_rate": 0.0002,
"loss": 0.5552069544792175,
"mean_token_accuracy": 0.7720492333173752,
"num_tokens": 4430293.0,
"step": 272
},
{
"entropy": 0.5520482361316681,
"epoch": 1.0186915887850467,
"grad_norm": 0.03374192863702774,
"learning_rate": 0.0002,
"loss": 0.5517052412033081,
"mean_token_accuracy": 0.7761924266815186,
"num_tokens": 4446900.0,
"step": 273
},
{
"entropy": 0.5477887243032455,
"epoch": 1.0224299065420561,
"grad_norm": 0.02954636886715889,
"learning_rate": 0.0002,
"loss": 0.5459023714065552,
"mean_token_accuracy": 0.7766608893871307,
"num_tokens": 4463329.0,
"step": 274
},
{
"entropy": 0.5484108775854111,
"epoch": 1.0261682242990655,
"grad_norm": 0.029792649671435356,
"learning_rate": 0.0002,
"loss": 0.553299069404602,
"mean_token_accuracy": 0.7751943320035934,
"num_tokens": 4479679.0,
"step": 275
},
{
"entropy": 0.5480824410915375,
"epoch": 1.0299065420560747,
"grad_norm": 0.03428385779261589,
"learning_rate": 0.0002,
"loss": 0.54673171043396,
"mean_token_accuracy": 0.7777809202671051,
"num_tokens": 4496261.0,
"step": 276
},
{
"entropy": 0.5371964275836945,
"epoch": 1.033644859813084,
"grad_norm": 0.027453402057290077,
"learning_rate": 0.0002,
"loss": 0.5412828922271729,
"mean_token_accuracy": 0.7782962769269943,
"num_tokens": 4512363.0,
"step": 277
},
{
"entropy": 0.5626021921634674,
"epoch": 1.0373831775700935,
"grad_norm": 0.03147402033209801,
"learning_rate": 0.0002,
"loss": 0.5639899373054504,
"mean_token_accuracy": 0.772662416100502,
"num_tokens": 4528687.0,
"step": 278
},
{
"entropy": 0.5309132784605026,
"epoch": 1.0411214953271029,
"grad_norm": 0.03592999279499054,
"learning_rate": 0.0002,
"loss": 0.5408714413642883,
"mean_token_accuracy": 0.7803217619657516,
"num_tokens": 4544861.0,
"step": 279
},
{
"entropy": 0.5621335506439209,
"epoch": 1.0448598130841122,
"grad_norm": 0.027180444449186325,
"learning_rate": 0.0002,
"loss": 0.5557287931442261,
"mean_token_accuracy": 0.7766296565532684,
"num_tokens": 4561446.0,
"step": 280
},
{
"entropy": 0.5597621351480484,
"epoch": 1.0485981308411214,
"grad_norm": 0.030723722651600838,
"learning_rate": 0.0002,
"loss": 0.5488376617431641,
"mean_token_accuracy": 0.7752789407968521,
"num_tokens": 4577902.0,
"step": 281
},
{
"entropy": 0.5447895377874374,
"epoch": 1.0523364485981308,
"grad_norm": 0.03346191346645355,
"learning_rate": 0.0002,
"loss": 0.54459547996521,
"mean_token_accuracy": 0.7764092832803726,
"num_tokens": 4593907.0,
"step": 282
},
{
"entropy": 0.5376723855733871,
"epoch": 1.0560747663551402,
"grad_norm": 0.029941193759441376,
"learning_rate": 0.0002,
"loss": 0.5396949052810669,
"mean_token_accuracy": 0.7800134569406509,
"num_tokens": 4610281.0,
"step": 283
},
{
"entropy": 0.532968744635582,
"epoch": 1.0598130841121496,
"grad_norm": 0.03566444665193558,
"learning_rate": 0.0002,
"loss": 0.5449310541152954,
"mean_token_accuracy": 0.7814425081014633,
"num_tokens": 4626569.0,
"step": 284
},
{
"entropy": 0.5349016040563583,
"epoch": 1.063551401869159,
"grad_norm": 0.03160771727561951,
"learning_rate": 0.0002,
"loss": 0.5422961115837097,
"mean_token_accuracy": 0.7798893004655838,
"num_tokens": 4643058.0,
"step": 285
},
{
"entropy": 0.533850871026516,
"epoch": 1.0672897196261681,
"grad_norm": 0.036520425230264664,
"learning_rate": 0.0002,
"loss": 0.5418434739112854,
"mean_token_accuracy": 0.7801807075738907,
"num_tokens": 4659171.0,
"step": 286
},
{
"entropy": 0.5512394160032272,
"epoch": 1.0710280373831775,
"grad_norm": 0.030453668907284737,
"learning_rate": 0.0002,
"loss": 0.547731339931488,
"mean_token_accuracy": 0.77372145652771,
"num_tokens": 4675372.0,
"step": 287
},
{
"entropy": 0.5371382534503937,
"epoch": 1.074766355140187,
"grad_norm": 0.031432170420885086,
"learning_rate": 0.0002,
"loss": 0.5252817869186401,
"mean_token_accuracy": 0.7852388918399811,
"num_tokens": 4691895.0,
"step": 288
},
{
"entropy": 0.5536183714866638,
"epoch": 1.0785046728971963,
"grad_norm": 0.036878716200590134,
"learning_rate": 0.0002,
"loss": 0.5542073249816895,
"mean_token_accuracy": 0.7766832113265991,
"num_tokens": 4708579.0,
"step": 289
},
{
"entropy": 0.5479064285755157,
"epoch": 1.0822429906542057,
"grad_norm": 0.031178997829556465,
"learning_rate": 0.0002,
"loss": 0.5539444088935852,
"mean_token_accuracy": 0.7733383923768997,
"num_tokens": 4725006.0,
"step": 290
},
{
"entropy": 0.5490889102220535,
"epoch": 1.0859813084112149,
"grad_norm": 0.03600861504673958,
"learning_rate": 0.0002,
"loss": 0.5477103590965271,
"mean_token_accuracy": 0.7760229259729385,
"num_tokens": 4741146.0,
"step": 291
},
{
"entropy": 0.5331408083438873,
"epoch": 1.0897196261682243,
"grad_norm": 0.029067492112517357,
"learning_rate": 0.0002,
"loss": 0.5310513377189636,
"mean_token_accuracy": 0.7808917611837387,
"num_tokens": 4757405.0,
"step": 292
},
{
"entropy": 0.5732952356338501,
"epoch": 1.0934579439252337,
"grad_norm": 0.027897845953702927,
"learning_rate": 0.0002,
"loss": 0.5689205527305603,
"mean_token_accuracy": 0.7669987231492996,
"num_tokens": 4773935.0,
"step": 293
},
{
"entropy": 0.5514747202396393,
"epoch": 1.097196261682243,
"grad_norm": 0.03678213432431221,
"learning_rate": 0.0002,
"loss": 0.5475887060165405,
"mean_token_accuracy": 0.7782610803842545,
"num_tokens": 4790197.0,
"step": 294
},
{
"entropy": 0.5528618544340134,
"epoch": 1.1009345794392524,
"grad_norm": 0.03136972337961197,
"learning_rate": 0.0002,
"loss": 0.5539395213127136,
"mean_token_accuracy": 0.7734730243682861,
"num_tokens": 4806625.0,
"step": 295
},
{
"entropy": 0.5395589917898178,
"epoch": 1.1046728971962616,
"grad_norm": 0.030648380517959595,
"learning_rate": 0.0002,
"loss": 0.5440752506256104,
"mean_token_accuracy": 0.7809486091136932,
"num_tokens": 4823046.0,
"step": 296
},
{
"entropy": 0.5670987218618393,
"epoch": 1.108411214953271,
"grad_norm": 0.028722837567329407,
"learning_rate": 0.0002,
"loss": 0.5669575929641724,
"mean_token_accuracy": 0.7682226747274399,
"num_tokens": 4839449.0,
"step": 297
},
{
"entropy": 0.5453528463840485,
"epoch": 1.1121495327102804,
"grad_norm": 0.03358433395624161,
"learning_rate": 0.0002,
"loss": 0.5394450426101685,
"mean_token_accuracy": 0.7793479263782501,
"num_tokens": 4855702.0,
"step": 298
},
{
"entropy": 0.5313688218593597,
"epoch": 1.1158878504672898,
"grad_norm": 0.031751058995723724,
"learning_rate": 0.0002,
"loss": 0.5339279174804688,
"mean_token_accuracy": 0.7852170914411545,
"num_tokens": 4872035.0,
"step": 299
},
{
"entropy": 0.5542233884334564,
"epoch": 1.1196261682242992,
"grad_norm": 0.030381185933947563,
"learning_rate": 0.0002,
"loss": 0.5629603862762451,
"mean_token_accuracy": 0.76924729347229,
"num_tokens": 4888405.0,
"step": 300
},
{
"entropy": 0.5514146685600281,
"epoch": 1.1233644859813083,
"grad_norm": 0.028884021565318108,
"learning_rate": 0.0002,
"loss": 0.550013542175293,
"mean_token_accuracy": 0.7766973823308945,
"num_tokens": 4904871.0,
"step": 301
},
{
"entropy": 0.5544252693653107,
"epoch": 1.1271028037383177,
"grad_norm": 0.03688167408108711,
"learning_rate": 0.0002,
"loss": 0.5589375495910645,
"mean_token_accuracy": 0.7750934660434723,
"num_tokens": 4921370.0,
"step": 302
},
{
"entropy": 0.5409253090620041,
"epoch": 1.1308411214953271,
"grad_norm": 0.026449156925082207,
"learning_rate": 0.0002,
"loss": 0.5402511358261108,
"mean_token_accuracy": 0.7794521301984787,
"num_tokens": 4937635.0,
"step": 303
},
{
"entropy": 0.5496914833784103,
"epoch": 1.1345794392523365,
"grad_norm": 0.030888745561242104,
"learning_rate": 0.0002,
"loss": 0.5520302653312683,
"mean_token_accuracy": 0.7741389274597168,
"num_tokens": 4953795.0,
"step": 304
},
{
"entropy": 0.5356033593416214,
"epoch": 1.1383177570093457,
"grad_norm": 0.030453680083155632,
"learning_rate": 0.0002,
"loss": 0.5415939092636108,
"mean_token_accuracy": 0.7807344794273376,
"num_tokens": 4970296.0,
"step": 305
},
{
"entropy": 0.53813037276268,
"epoch": 1.142056074766355,
"grad_norm": 0.03046366199851036,
"learning_rate": 0.0002,
"loss": 0.5416396856307983,
"mean_token_accuracy": 0.7764643579721451,
"num_tokens": 4986502.0,
"step": 306
},
{
"entropy": 0.5428405404090881,
"epoch": 1.1457943925233645,
"grad_norm": 0.03174874931573868,
"learning_rate": 0.0002,
"loss": 0.5486522912979126,
"mean_token_accuracy": 0.7775285989046097,
"num_tokens": 5002702.0,
"step": 307
},
{
"entropy": 0.5566747784614563,
"epoch": 1.1495327102803738,
"grad_norm": 0.028818320482969284,
"learning_rate": 0.0002,
"loss": 0.5562471151351929,
"mean_token_accuracy": 0.77483069896698,
"num_tokens": 5019050.0,
"step": 308
},
{
"entropy": 0.5498685240745544,
"epoch": 1.1532710280373832,
"grad_norm": 0.028088422492146492,
"learning_rate": 0.0002,
"loss": 0.5427108407020569,
"mean_token_accuracy": 0.7781059741973877,
"num_tokens": 5035367.0,
"step": 309
},
{
"entropy": 0.5676623731851578,
"epoch": 1.1570093457943926,
"grad_norm": 0.02635916881263256,
"learning_rate": 0.0002,
"loss": 0.5621261596679688,
"mean_token_accuracy": 0.7690412253141403,
"num_tokens": 5051623.0,
"step": 310
},
{
"entropy": 0.5571839809417725,
"epoch": 1.1607476635514018,
"grad_norm": 0.030562767758965492,
"learning_rate": 0.0002,
"loss": 0.5547442436218262,
"mean_token_accuracy": 0.773685023188591,
"num_tokens": 5067784.0,
"step": 311
},
{
"entropy": 0.5521961599588394,
"epoch": 1.1644859813084112,
"grad_norm": 0.02953186444938183,
"learning_rate": 0.0002,
"loss": 0.5498039722442627,
"mean_token_accuracy": 0.7766331732273102,
"num_tokens": 5084198.0,
"step": 312
},
{
"entropy": 0.5448037981987,
"epoch": 1.1682242990654206,
"grad_norm": 0.04071420431137085,
"learning_rate": 0.0002,
"loss": 0.5559482574462891,
"mean_token_accuracy": 0.7727169245481491,
"num_tokens": 5100585.0,
"step": 313
},
{
"entropy": 0.5439905822277069,
"epoch": 1.17196261682243,
"grad_norm": 0.031825143843889236,
"learning_rate": 0.0002,
"loss": 0.5438477396965027,
"mean_token_accuracy": 0.7780765742063522,
"num_tokens": 5116856.0,
"step": 314
},
{
"entropy": 0.5614278465509415,
"epoch": 1.1757009345794391,
"grad_norm": 0.03391456976532936,
"learning_rate": 0.0002,
"loss": 0.5585231781005859,
"mean_token_accuracy": 0.774724468588829,
"num_tokens": 5133123.0,
"step": 315
},
{
"entropy": 0.5348840728402138,
"epoch": 1.1794392523364485,
"grad_norm": 0.030404910445213318,
"learning_rate": 0.0002,
"loss": 0.5299553275108337,
"mean_token_accuracy": 0.7871359586715698,
"num_tokens": 5149505.0,
"step": 316
},
{
"entropy": 0.5417611449956894,
"epoch": 1.183177570093458,
"grad_norm": 0.03005358763039112,
"learning_rate": 0.0002,
"loss": 0.5521109700202942,
"mean_token_accuracy": 0.7752534449100494,
"num_tokens": 5165665.0,
"step": 317
},
{
"entropy": 0.5467934459447861,
"epoch": 1.1869158878504673,
"grad_norm": 0.030464891344308853,
"learning_rate": 0.0002,
"loss": 0.5535311698913574,
"mean_token_accuracy": 0.7757606655359268,
"num_tokens": 5182312.0,
"step": 318
},
{
"entropy": 0.55706687271595,
"epoch": 1.1906542056074767,
"grad_norm": 0.03402930125594139,
"learning_rate": 0.0002,
"loss": 0.56557697057724,
"mean_token_accuracy": 0.773482084274292,
"num_tokens": 5198753.0,
"step": 319
},
{
"entropy": 0.5285287350416183,
"epoch": 1.194392523364486,
"grad_norm": 0.03398562967777252,
"learning_rate": 0.0002,
"loss": 0.5356812477111816,
"mean_token_accuracy": 0.781065508723259,
"num_tokens": 5214716.0,
"step": 320
},
{
"entropy": 0.5561061501502991,
"epoch": 1.1981308411214953,
"grad_norm": 0.04313025251030922,
"learning_rate": 0.0002,
"loss": 0.5472796559333801,
"mean_token_accuracy": 0.7778294533491135,
"num_tokens": 5230933.0,
"step": 321
},
{
"entropy": 0.556538999080658,
"epoch": 1.2018691588785047,
"grad_norm": 0.03227441757917404,
"learning_rate": 0.0002,
"loss": 0.5438181161880493,
"mean_token_accuracy": 0.7791680693626404,
"num_tokens": 5247202.0,
"step": 322
},
{
"entropy": 0.5609522461891174,
"epoch": 1.205607476635514,
"grad_norm": 0.03183369338512421,
"learning_rate": 0.0002,
"loss": 0.5561162829399109,
"mean_token_accuracy": 0.7751743495464325,
"num_tokens": 5263696.0,
"step": 323
},
{
"entropy": 0.5427358001470566,
"epoch": 1.2093457943925234,
"grad_norm": 0.03253727778792381,
"learning_rate": 0.0002,
"loss": 0.5515695214271545,
"mean_token_accuracy": 0.7756281793117523,
"num_tokens": 5280141.0,
"step": 324
},
{
"entropy": 0.5160750597715378,
"epoch": 1.2130841121495326,
"grad_norm": 0.03668288141489029,
"learning_rate": 0.0002,
"loss": 0.526226282119751,
"mean_token_accuracy": 0.7851300984621048,
"num_tokens": 5296198.0,
"step": 325
},
{
"entropy": 0.5500008910894394,
"epoch": 1.216822429906542,
"grad_norm": 0.03275466337800026,
"learning_rate": 0.0002,
"loss": 0.5556660890579224,
"mean_token_accuracy": 0.7739221006631851,
"num_tokens": 5312653.0,
"step": 326
},
{
"entropy": 0.5459257364273071,
"epoch": 1.2205607476635514,
"grad_norm": 0.02891591377556324,
"learning_rate": 0.0002,
"loss": 0.5413340330123901,
"mean_token_accuracy": 0.781257688999176,
"num_tokens": 5328926.0,
"step": 327
},
{
"entropy": 0.5695579349994659,
"epoch": 1.2242990654205608,
"grad_norm": 0.0299241840839386,
"learning_rate": 0.0002,
"loss": 0.5636513233184814,
"mean_token_accuracy": 0.7732590138912201,
"num_tokens": 5345213.0,
"step": 328
},
{
"entropy": 0.5591664463281631,
"epoch": 1.2280373831775702,
"grad_norm": 0.034591834992170334,
"learning_rate": 0.0002,
"loss": 0.5587798953056335,
"mean_token_accuracy": 0.7725549340248108,
"num_tokens": 5361493.0,
"step": 329
},
{
"entropy": 0.5631786286830902,
"epoch": 1.2317757009345796,
"grad_norm": 0.03143571689724922,
"learning_rate": 0.0002,
"loss": 0.5540720224380493,
"mean_token_accuracy": 0.7765887975692749,
"num_tokens": 5378085.0,
"step": 330
},
{
"entropy": 0.5508914291858673,
"epoch": 1.2355140186915887,
"grad_norm": 0.032595690339803696,
"learning_rate": 0.0002,
"loss": 0.5526955723762512,
"mean_token_accuracy": 0.7747674286365509,
"num_tokens": 5394458.0,
"step": 331
},
{
"entropy": 0.536909781396389,
"epoch": 1.2392523364485981,
"grad_norm": 0.033028744161129,
"learning_rate": 0.0002,
"loss": 0.5481626987457275,
"mean_token_accuracy": 0.7782605588436127,
"num_tokens": 5410880.0,
"step": 332
},
{
"entropy": 0.5499342679977417,
"epoch": 1.2429906542056075,
"grad_norm": 0.03855755180120468,
"learning_rate": 0.0002,
"loss": 0.5627814531326294,
"mean_token_accuracy": 0.7700037658214569,
"num_tokens": 5426885.0,
"step": 333
},
{
"entropy": 0.5494136810302734,
"epoch": 1.246728971962617,
"grad_norm": 0.03397782891988754,
"learning_rate": 0.0002,
"loss": 0.5508397817611694,
"mean_token_accuracy": 0.7756514847278595,
"num_tokens": 5443330.0,
"step": 334
},
{
"entropy": 0.5679187029600143,
"epoch": 1.250467289719626,
"grad_norm": 0.03217748925089836,
"learning_rate": 0.0002,
"loss": 0.5683805346488953,
"mean_token_accuracy": 0.770328551530838,
"num_tokens": 5459602.0,
"step": 335
},
{
"entropy": 0.5620801448822021,
"epoch": 1.2542056074766355,
"grad_norm": 0.03699919581413269,
"learning_rate": 0.0002,
"loss": 0.556020200252533,
"mean_token_accuracy": 0.7749847769737244,
"num_tokens": 5475920.0,
"step": 336
},
{
"entropy": 0.5483541190624237,
"epoch": 1.2579439252336448,
"grad_norm": 0.027093922719359398,
"learning_rate": 0.0002,
"loss": 0.5420067310333252,
"mean_token_accuracy": 0.7774698734283447,
"num_tokens": 5492418.0,
"step": 337
},
{
"entropy": 0.5432356148958206,
"epoch": 1.2616822429906542,
"grad_norm": 0.029740024358034134,
"learning_rate": 0.0002,
"loss": 0.5436828136444092,
"mean_token_accuracy": 0.7754241824150085,
"num_tokens": 5508720.0,
"step": 338
},
{
"entropy": 0.5282722562551498,
"epoch": 1.2654205607476636,
"grad_norm": 0.02825041115283966,
"learning_rate": 0.0002,
"loss": 0.5287445783615112,
"mean_token_accuracy": 0.785777822136879,
"num_tokens": 5524810.0,
"step": 339
},
{
"entropy": 0.5574855506420135,
"epoch": 1.269158878504673,
"grad_norm": 0.03507409617304802,
"learning_rate": 0.0002,
"loss": 0.5642590522766113,
"mean_token_accuracy": 0.7694929391145706,
"num_tokens": 5541154.0,
"step": 340
},
{
"entropy": 0.5311331301927567,
"epoch": 1.2728971962616822,
"grad_norm": 0.029530638828873634,
"learning_rate": 0.0002,
"loss": 0.5375971794128418,
"mean_token_accuracy": 0.7804928719997406,
"num_tokens": 5557415.0,
"step": 341
},
{
"entropy": 0.5492513477802277,
"epoch": 1.2766355140186916,
"grad_norm": 0.03299937769770622,
"learning_rate": 0.0002,
"loss": 0.5487713813781738,
"mean_token_accuracy": 0.7776053845882416,
"num_tokens": 5573593.0,
"step": 342
},
{
"entropy": 0.5501092821359634,
"epoch": 1.280373831775701,
"grad_norm": 0.03342421352863312,
"learning_rate": 0.0002,
"loss": 0.5497907996177673,
"mean_token_accuracy": 0.7747702449560165,
"num_tokens": 5590001.0,
"step": 343
},
{
"entropy": 0.5520797073841095,
"epoch": 1.2841121495327104,
"grad_norm": 0.029625268653035164,
"learning_rate": 0.0002,
"loss": 0.5493736267089844,
"mean_token_accuracy": 0.7800589352846146,
"num_tokens": 5606174.0,
"step": 344
},
{
"entropy": 0.5360356196761131,
"epoch": 1.2878504672897195,
"grad_norm": 0.03089168108999729,
"learning_rate": 0.0002,
"loss": 0.5362368226051331,
"mean_token_accuracy": 0.7833685129880905,
"num_tokens": 5622436.0,
"step": 345
},
{
"entropy": 0.5267095118761063,
"epoch": 1.291588785046729,
"grad_norm": 0.03297918289899826,
"learning_rate": 0.0002,
"loss": 0.5281186699867249,
"mean_token_accuracy": 0.7881515920162201,
"num_tokens": 5638451.0,
"step": 346
},
{
"entropy": 0.5502850115299225,
"epoch": 1.2953271028037383,
"grad_norm": 0.047267865389585495,
"learning_rate": 0.0002,
"loss": 0.5505760312080383,
"mean_token_accuracy": 0.7761109918355942,
"num_tokens": 5655041.0,
"step": 347
},
{
"entropy": 0.5508257895708084,
"epoch": 1.2990654205607477,
"grad_norm": 0.028140036389231682,
"learning_rate": 0.0002,
"loss": 0.5515832304954529,
"mean_token_accuracy": 0.7750399112701416,
"num_tokens": 5671677.0,
"step": 348
},
{
"entropy": 0.5565541088581085,
"epoch": 1.302803738317757,
"grad_norm": 0.032449062913656235,
"learning_rate": 0.0002,
"loss": 0.5538536310195923,
"mean_token_accuracy": 0.7736092507839203,
"num_tokens": 5688187.0,
"step": 349
},
{
"entropy": 0.5361721217632294,
"epoch": 1.3065420560747665,
"grad_norm": 0.029190748929977417,
"learning_rate": 0.0002,
"loss": 0.5377737879753113,
"mean_token_accuracy": 0.7808200567960739,
"num_tokens": 5704636.0,
"step": 350
},
{
"entropy": 0.5346792191267014,
"epoch": 1.3102803738317756,
"grad_norm": 0.03473074361681938,
"learning_rate": 0.0002,
"loss": 0.5417028665542603,
"mean_token_accuracy": 0.778437003493309,
"num_tokens": 5721160.0,
"step": 351
},
{
"entropy": 0.5305602103471756,
"epoch": 1.314018691588785,
"grad_norm": 0.03426121547818184,
"learning_rate": 0.0002,
"loss": 0.5302631258964539,
"mean_token_accuracy": 0.7822723984718323,
"num_tokens": 5737508.0,
"step": 352
},
{
"entropy": 0.5443065613508224,
"epoch": 1.3177570093457944,
"grad_norm": 0.031232863664627075,
"learning_rate": 0.0002,
"loss": 0.5438801050186157,
"mean_token_accuracy": 0.7807773351669312,
"num_tokens": 5753931.0,
"step": 353
},
{
"entropy": 0.5547338575124741,
"epoch": 1.3214953271028038,
"grad_norm": 0.03515113145112991,
"learning_rate": 0.0002,
"loss": 0.5590701103210449,
"mean_token_accuracy": 0.7718778848648071,
"num_tokens": 5770396.0,
"step": 354
},
{
"entropy": 0.5776932686567307,
"epoch": 1.325233644859813,
"grad_norm": 0.031292639672756195,
"learning_rate": 0.0002,
"loss": 0.5758817791938782,
"mean_token_accuracy": 0.76340052485466,
"num_tokens": 5786743.0,
"step": 355
},
{
"entropy": 0.5471627116203308,
"epoch": 1.3289719626168224,
"grad_norm": 0.02935577929019928,
"learning_rate": 0.0002,
"loss": 0.5406426787376404,
"mean_token_accuracy": 0.7801960557699203,
"num_tokens": 5803296.0,
"step": 356
},
{
"entropy": 0.5335498154163361,
"epoch": 1.3327102803738318,
"grad_norm": 0.029476149007678032,
"learning_rate": 0.0002,
"loss": 0.5379401445388794,
"mean_token_accuracy": 0.7807924002408981,
"num_tokens": 5819523.0,
"step": 357
},
{
"entropy": 0.571747362613678,
"epoch": 1.3364485981308412,
"grad_norm": 0.030969126150012016,
"learning_rate": 0.0002,
"loss": 0.5734298825263977,
"mean_token_accuracy": 0.7665233165025711,
"num_tokens": 5835904.0,
"step": 358
},
{
"entropy": 0.5278273224830627,
"epoch": 1.3401869158878505,
"grad_norm": 0.035017624497413635,
"learning_rate": 0.0002,
"loss": 0.5390288233757019,
"mean_token_accuracy": 0.7818515002727509,
"num_tokens": 5852087.0,
"step": 359
},
{
"entropy": 0.5494511723518372,
"epoch": 1.34392523364486,
"grad_norm": 0.0332498699426651,
"learning_rate": 0.0002,
"loss": 0.5546149611473083,
"mean_token_accuracy": 0.7754078060388565,
"num_tokens": 5868313.0,
"step": 360
},
{
"entropy": 0.5656353235244751,
"epoch": 1.347663551401869,
"grad_norm": 0.029156476259231567,
"learning_rate": 0.0002,
"loss": 0.5639902353286743,
"mean_token_accuracy": 0.7691005319356918,
"num_tokens": 5884673.0,
"step": 361
},
{
"entropy": 0.5517591834068298,
"epoch": 1.3514018691588785,
"grad_norm": 0.033162813633680344,
"learning_rate": 0.0002,
"loss": 0.5487698316574097,
"mean_token_accuracy": 0.7762563526630402,
"num_tokens": 5901026.0,
"step": 362
},
{
"entropy": 0.5693054497241974,
"epoch": 1.355140186915888,
"grad_norm": 0.03303493186831474,
"learning_rate": 0.0002,
"loss": 0.5636650323867798,
"mean_token_accuracy": 0.7702258229255676,
"num_tokens": 5917299.0,
"step": 363
},
{
"entropy": 0.5485306680202484,
"epoch": 1.358878504672897,
"grad_norm": 0.028174106031656265,
"learning_rate": 0.0002,
"loss": 0.5443013310432434,
"mean_token_accuracy": 0.7785944491624832,
"num_tokens": 5933711.0,
"step": 364
},
{
"entropy": 0.5455866008996964,
"epoch": 1.3626168224299064,
"grad_norm": 0.03680690750479698,
"learning_rate": 0.0002,
"loss": 0.5549443364143372,
"mean_token_accuracy": 0.7760016471147537,
"num_tokens": 5949851.0,
"step": 365
},
{
"entropy": 0.5625369846820831,
"epoch": 1.3663551401869158,
"grad_norm": 0.03274211287498474,
"learning_rate": 0.0002,
"loss": 0.5614032745361328,
"mean_token_accuracy": 0.7710064649581909,
"num_tokens": 5966219.0,
"step": 366
},
{
"entropy": 0.5512880086898804,
"epoch": 1.3700934579439252,
"grad_norm": 0.029914218932390213,
"learning_rate": 0.0002,
"loss": 0.5541912317276001,
"mean_token_accuracy": 0.7744521200656891,
"num_tokens": 5982685.0,
"step": 367
},
{
"entropy": 0.5462228506803513,
"epoch": 1.3738317757009346,
"grad_norm": 0.03740010783076286,
"learning_rate": 0.0002,
"loss": 0.542587161064148,
"mean_token_accuracy": 0.7833080589771271,
"num_tokens": 5999012.0,
"step": 368
},
{
"entropy": 0.5561699420213699,
"epoch": 1.377570093457944,
"grad_norm": 0.03154682740569115,
"learning_rate": 0.0002,
"loss": 0.5543806552886963,
"mean_token_accuracy": 0.7729498744010925,
"num_tokens": 6015418.0,
"step": 369
},
{
"entropy": 0.5295282006263733,
"epoch": 1.3813084112149534,
"grad_norm": 0.029992269352078438,
"learning_rate": 0.0002,
"loss": 0.5347234010696411,
"mean_token_accuracy": 0.7826734483242035,
"num_tokens": 6031664.0,
"step": 370
},
{
"entropy": 0.5307233035564423,
"epoch": 1.3850467289719626,
"grad_norm": 0.0387556329369545,
"learning_rate": 0.0002,
"loss": 0.5442472696304321,
"mean_token_accuracy": 0.7788428515195847,
"num_tokens": 6047789.0,
"step": 371
},
{
"entropy": 0.5666087120771408,
"epoch": 1.388785046728972,
"grad_norm": 0.03485598787665367,
"learning_rate": 0.0002,
"loss": 0.5701879858970642,
"mean_token_accuracy": 0.7664644569158554,
"num_tokens": 6064072.0,
"step": 372
},
{
"entropy": 0.5600801408290863,
"epoch": 1.3925233644859814,
"grad_norm": 0.030468204990029335,
"learning_rate": 0.0002,
"loss": 0.557839035987854,
"mean_token_accuracy": 0.7774783074855804,
"num_tokens": 6080233.0,
"step": 373
},
{
"entropy": 0.5573039948940277,
"epoch": 1.3962616822429905,
"grad_norm": 0.03327672928571701,
"learning_rate": 0.0002,
"loss": 0.5551377534866333,
"mean_token_accuracy": 0.7740774154663086,
"num_tokens": 6096552.0,
"step": 374
},
{
"entropy": 0.5559895187616348,
"epoch": 1.4,
"grad_norm": 0.029464859515428543,
"learning_rate": 0.0002,
"loss": 0.5499491691589355,
"mean_token_accuracy": 0.778936430811882,
"num_tokens": 6112721.0,
"step": 375
},
{
"entropy": 0.5373993217945099,
"epoch": 1.4037383177570093,
"grad_norm": 0.033405598253011703,
"learning_rate": 0.0002,
"loss": 0.5378676652908325,
"mean_token_accuracy": 0.78409743309021,
"num_tokens": 6128876.0,
"step": 376
},
{
"entropy": 0.5293000936508179,
"epoch": 1.4074766355140187,
"grad_norm": 0.03749069571495056,
"learning_rate": 0.0002,
"loss": 0.5442302823066711,
"mean_token_accuracy": 0.7793403714895248,
"num_tokens": 6145070.0,
"step": 377
},
{
"entropy": 0.5288459360599518,
"epoch": 1.411214953271028,
"grad_norm": 0.0304460097104311,
"learning_rate": 0.0002,
"loss": 0.5322169661521912,
"mean_token_accuracy": 0.7845710813999176,
"num_tokens": 6161358.0,
"step": 378
},
{
"entropy": 0.5396905541419983,
"epoch": 1.4149532710280375,
"grad_norm": 0.0334291011095047,
"learning_rate": 0.0002,
"loss": 0.536848783493042,
"mean_token_accuracy": 0.7786440551280975,
"num_tokens": 6177744.0,
"step": 379
},
{
"entropy": 0.5749261528253555,
"epoch": 1.4186915887850469,
"grad_norm": 0.03149184212088585,
"learning_rate": 0.0002,
"loss": 0.5657936334609985,
"mean_token_accuracy": 0.7711158096790314,
"num_tokens": 6194294.0,
"step": 380
},
{
"entropy": 0.5584524124860764,
"epoch": 1.422429906542056,
"grad_norm": 0.03502335026860237,
"learning_rate": 0.0002,
"loss": 0.5578019618988037,
"mean_token_accuracy": 0.7754084765911102,
"num_tokens": 6210591.0,
"step": 381
},
{
"entropy": 0.5385516434907913,
"epoch": 1.4261682242990654,
"grad_norm": 0.029922619462013245,
"learning_rate": 0.0002,
"loss": 0.5379009246826172,
"mean_token_accuracy": 0.7822572886943817,
"num_tokens": 6226836.0,
"step": 382
},
{
"entropy": 0.5303553491830826,
"epoch": 1.4299065420560748,
"grad_norm": 0.03207620605826378,
"learning_rate": 0.0002,
"loss": 0.5399402379989624,
"mean_token_accuracy": 0.7848275154829025,
"num_tokens": 6243140.0,
"step": 383
},
{
"entropy": 0.5435499548912048,
"epoch": 1.433644859813084,
"grad_norm": 0.034929681569337845,
"learning_rate": 0.0002,
"loss": 0.5510104298591614,
"mean_token_accuracy": 0.7754337340593338,
"num_tokens": 6259135.0,
"step": 384
},
{
"entropy": 0.5495016276836395,
"epoch": 1.4373831775700934,
"grad_norm": 0.02961392141878605,
"learning_rate": 0.0002,
"loss": 0.5518282651901245,
"mean_token_accuracy": 0.7770158797502518,
"num_tokens": 6275478.0,
"step": 385
},
{
"entropy": 0.5597821772098541,
"epoch": 1.4411214953271028,
"grad_norm": 0.03038998879492283,
"learning_rate": 0.0002,
"loss": 0.5598548650741577,
"mean_token_accuracy": 0.7717087864875793,
"num_tokens": 6292022.0,
"step": 386
},
{
"entropy": 0.5554857552051544,
"epoch": 1.4448598130841122,
"grad_norm": 0.034831635653972626,
"learning_rate": 0.0002,
"loss": 0.5589088201522827,
"mean_token_accuracy": 0.7742104977369308,
"num_tokens": 6308395.0,
"step": 387
},
{
"entropy": 0.5330976247787476,
"epoch": 1.4485981308411215,
"grad_norm": 0.03864655643701553,
"learning_rate": 0.0002,
"loss": 0.5340397357940674,
"mean_token_accuracy": 0.7843937277793884,
"num_tokens": 6324443.0,
"step": 388
},
{
"entropy": 0.5459477603435516,
"epoch": 1.452336448598131,
"grad_norm": 0.03552354499697685,
"learning_rate": 0.0002,
"loss": 0.546898603439331,
"mean_token_accuracy": 0.7767336070537567,
"num_tokens": 6340452.0,
"step": 389
},
{
"entropy": 0.555869922041893,
"epoch": 1.45607476635514,
"grad_norm": 0.042999885976314545,
"learning_rate": 0.0002,
"loss": 0.5562218427658081,
"mean_token_accuracy": 0.772677481174469,
"num_tokens": 6356737.0,
"step": 390
},
{
"entropy": 0.5476373881101608,
"epoch": 1.4598130841121495,
"grad_norm": 0.034353937953710556,
"learning_rate": 0.0002,
"loss": 0.5502485632896423,
"mean_token_accuracy": 0.7757505625486374,
"num_tokens": 6372959.0,
"step": 391
},
{
"entropy": 0.5542000085115433,
"epoch": 1.4635514018691589,
"grad_norm": 0.030675135552883148,
"learning_rate": 0.0002,
"loss": 0.5507063865661621,
"mean_token_accuracy": 0.7746506035327911,
"num_tokens": 6389285.0,
"step": 392
},
{
"entropy": 0.5308681577444077,
"epoch": 1.4672897196261683,
"grad_norm": 0.03328751027584076,
"learning_rate": 0.0002,
"loss": 0.5308902263641357,
"mean_token_accuracy": 0.7832993865013123,
"num_tokens": 6405473.0,
"step": 393
},
{
"entropy": 0.5490089803934097,
"epoch": 1.4710280373831774,
"grad_norm": 0.03258799389004707,
"learning_rate": 0.0002,
"loss": 0.5524098873138428,
"mean_token_accuracy": 0.7753634303808212,
"num_tokens": 6421682.0,
"step": 394
},
{
"entropy": 0.5617490261793137,
"epoch": 1.4747663551401868,
"grad_norm": 0.03237268701195717,
"learning_rate": 0.0002,
"loss": 0.5609363913536072,
"mean_token_accuracy": 0.7727462351322174,
"num_tokens": 6438225.0,
"step": 395
},
{
"entropy": 0.5548438280820847,
"epoch": 1.4785046728971962,
"grad_norm": 0.0355081707239151,
"learning_rate": 0.0002,
"loss": 0.5486972332000732,
"mean_token_accuracy": 0.7752490490674973,
"num_tokens": 6454558.0,
"step": 396
},
{
"entropy": 0.539698138833046,
"epoch": 1.4822429906542056,
"grad_norm": 0.03101828694343567,
"learning_rate": 0.0002,
"loss": 0.5438753366470337,
"mean_token_accuracy": 0.776269868016243,
"num_tokens": 6470673.0,
"step": 397
},
{
"entropy": 0.5318429321050644,
"epoch": 1.485981308411215,
"grad_norm": 0.040831487625837326,
"learning_rate": 0.0002,
"loss": 0.5361422300338745,
"mean_token_accuracy": 0.7855317145586014,
"num_tokens": 6486739.0,
"step": 398
},
{
"entropy": 0.5382596254348755,
"epoch": 1.4897196261682244,
"grad_norm": 0.03325575962662697,
"learning_rate": 0.0002,
"loss": 0.5401434302330017,
"mean_token_accuracy": 0.7797534018754959,
"num_tokens": 6502900.0,
"step": 399
},
{
"entropy": 0.5596988648176193,
"epoch": 1.4934579439252336,
"grad_norm": 0.028764478862285614,
"learning_rate": 0.0002,
"loss": 0.5577390193939209,
"mean_token_accuracy": 0.7748348712921143,
"num_tokens": 6519408.0,
"step": 400
},
{
"entropy": 0.5493527054786682,
"epoch": 1.497196261682243,
"grad_norm": 0.028892861679196358,
"learning_rate": 0.0002,
"loss": 0.5473135709762573,
"mean_token_accuracy": 0.777830645442009,
"num_tokens": 6535811.0,
"step": 401
},
{
"entropy": 0.5402602255344391,
"epoch": 1.5009345794392523,
"grad_norm": 0.03191126883029938,
"learning_rate": 0.0002,
"loss": 0.5474570989608765,
"mean_token_accuracy": 0.7774458974599838,
"num_tokens": 6552173.0,
"step": 402
},
{
"entropy": 0.540817379951477,
"epoch": 1.5046728971962615,
"grad_norm": 0.03177822753787041,
"learning_rate": 0.0002,
"loss": 0.548837423324585,
"mean_token_accuracy": 0.7776143550872803,
"num_tokens": 6568527.0,
"step": 403
},
{
"entropy": 0.5428208336234093,
"epoch": 1.508411214953271,
"grad_norm": 0.030568130314350128,
"learning_rate": 0.0002,
"loss": 0.5432289242744446,
"mean_token_accuracy": 0.7798717468976974,
"num_tokens": 6584756.0,
"step": 404
},
{
"entropy": 0.5466499626636505,
"epoch": 1.5121495327102803,
"grad_norm": 0.032929882407188416,
"learning_rate": 0.0002,
"loss": 0.5407195687294006,
"mean_token_accuracy": 0.7786379009485245,
"num_tokens": 6601082.0,
"step": 405
},
{
"entropy": 0.5593132823705673,
"epoch": 1.5158878504672897,
"grad_norm": 0.03837394341826439,
"learning_rate": 0.0002,
"loss": 0.5646262168884277,
"mean_token_accuracy": 0.771564781665802,
"num_tokens": 6617429.0,
"step": 406
},
{
"entropy": 0.5453289300203323,
"epoch": 1.519626168224299,
"grad_norm": 0.03576509654521942,
"learning_rate": 0.0002,
"loss": 0.5487722158432007,
"mean_token_accuracy": 0.7768426388502121,
"num_tokens": 6633826.0,
"step": 407
},
{
"entropy": 0.53939288854599,
"epoch": 1.5233644859813085,
"grad_norm": 0.032857585698366165,
"learning_rate": 0.0002,
"loss": 0.5385522246360779,
"mean_token_accuracy": 0.7790959179401398,
"num_tokens": 6650240.0,
"step": 408
},
{
"entropy": 0.5520011931657791,
"epoch": 1.5271028037383179,
"grad_norm": 0.030627621337771416,
"learning_rate": 0.0002,
"loss": 0.5516581535339355,
"mean_token_accuracy": 0.7760986834764481,
"num_tokens": 6666454.0,
"step": 409
},
{
"entropy": 0.5406108945608139,
"epoch": 1.5308411214953273,
"grad_norm": 0.036952704191207886,
"learning_rate": 0.0002,
"loss": 0.545346736907959,
"mean_token_accuracy": 0.7765967845916748,
"num_tokens": 6682741.0,
"step": 410
},
{
"entropy": 0.5551878213882446,
"epoch": 1.5345794392523364,
"grad_norm": 0.02871653437614441,
"learning_rate": 0.0002,
"loss": 0.54979008436203,
"mean_token_accuracy": 0.7789790332317352,
"num_tokens": 6699160.0,
"step": 411
},
{
"entropy": 0.5512814819812775,
"epoch": 1.5383177570093458,
"grad_norm": 0.03201194107532501,
"learning_rate": 0.0002,
"loss": 0.5527634620666504,
"mean_token_accuracy": 0.7734574526548386,
"num_tokens": 6715511.0,
"step": 412
},
{
"entropy": 0.5432283580303192,
"epoch": 1.542056074766355,
"grad_norm": 0.040297310799360275,
"learning_rate": 0.0002,
"loss": 0.5455228686332703,
"mean_token_accuracy": 0.7767939269542694,
"num_tokens": 6731688.0,
"step": 413
},
{
"entropy": 0.5464504212141037,
"epoch": 1.5457943925233644,
"grad_norm": 0.03343544527888298,
"learning_rate": 0.0002,
"loss": 0.543891191482544,
"mean_token_accuracy": 0.7797385454177856,
"num_tokens": 6747995.0,
"step": 414
},
{
"entropy": 0.5669636428356171,
"epoch": 1.5495327102803738,
"grad_norm": 0.03769576549530029,
"learning_rate": 0.0002,
"loss": 0.5689972639083862,
"mean_token_accuracy": 0.7693852484226227,
"num_tokens": 6764353.0,
"step": 415
},
{
"entropy": 0.5392922759056091,
"epoch": 1.5532710280373832,
"grad_norm": 0.03238385543227196,
"learning_rate": 0.0002,
"loss": 0.5441082715988159,
"mean_token_accuracy": 0.779180720448494,
"num_tokens": 6780896.0,
"step": 416
},
{
"entropy": 0.530147522687912,
"epoch": 1.5570093457943925,
"grad_norm": 0.040036849677562714,
"learning_rate": 0.0002,
"loss": 0.5422973036766052,
"mean_token_accuracy": 0.7789286226034164,
"num_tokens": 6797151.0,
"step": 417
},
{
"entropy": 0.5386764258146286,
"epoch": 1.560747663551402,
"grad_norm": 0.03689395636320114,
"learning_rate": 0.0002,
"loss": 0.5467624068260193,
"mean_token_accuracy": 0.7778990417718887,
"num_tokens": 6813386.0,
"step": 418
},
{
"entropy": 0.5509621798992157,
"epoch": 1.5644859813084113,
"grad_norm": 0.029403693974018097,
"learning_rate": 0.0002,
"loss": 0.5459365248680115,
"mean_token_accuracy": 0.7784391641616821,
"num_tokens": 6829627.0,
"step": 419
},
{
"entropy": 0.5576108992099762,
"epoch": 1.5682242990654207,
"grad_norm": 0.03426877036690712,
"learning_rate": 0.0002,
"loss": 0.5519037246704102,
"mean_token_accuracy": 0.7766879051923752,
"num_tokens": 6845675.0,
"step": 420
},
{
"entropy": 0.5511836111545563,
"epoch": 1.5719626168224299,
"grad_norm": 0.03294205665588379,
"learning_rate": 0.0002,
"loss": 0.5434479117393494,
"mean_token_accuracy": 0.7805502861738205,
"num_tokens": 6861921.0,
"step": 421
},
{
"entropy": 0.5404133796691895,
"epoch": 1.5757009345794393,
"grad_norm": 0.032488446682691574,
"learning_rate": 0.0002,
"loss": 0.5410423278808594,
"mean_token_accuracy": 0.7808396965265274,
"num_tokens": 6877883.0,
"step": 422
},
{
"entropy": 0.5403463542461395,
"epoch": 1.5794392523364484,
"grad_norm": 0.03610778972506523,
"learning_rate": 0.0002,
"loss": 0.5484398603439331,
"mean_token_accuracy": 0.775899812579155,
"num_tokens": 6894361.0,
"step": 423
},
{
"entropy": 0.5344756990671158,
"epoch": 1.5831775700934578,
"grad_norm": 0.040382951498031616,
"learning_rate": 0.0002,
"loss": 0.5388015508651733,
"mean_token_accuracy": 0.7805848121643066,
"num_tokens": 6910715.0,
"step": 424
},
{
"entropy": 0.5353002026677132,
"epoch": 1.5869158878504672,
"grad_norm": 0.03316662460565567,
"learning_rate": 0.0002,
"loss": 0.5393432974815369,
"mean_token_accuracy": 0.7816650718450546,
"num_tokens": 6927150.0,
"step": 425
},
{
"entropy": 0.5770704746246338,
"epoch": 1.5906542056074766,
"grad_norm": 0.034545231610536575,
"learning_rate": 0.0002,
"loss": 0.579833984375,
"mean_token_accuracy": 0.7628369480371475,
"num_tokens": 6943549.0,
"step": 426
},
{
"entropy": 0.5552347898483276,
"epoch": 1.594392523364486,
"grad_norm": 0.03268204629421234,
"learning_rate": 0.0002,
"loss": 0.5537080764770508,
"mean_token_accuracy": 0.7791409194469452,
"num_tokens": 6959832.0,
"step": 427
},
{
"entropy": 0.5671118795871735,
"epoch": 1.5981308411214954,
"grad_norm": 0.025902021676301956,
"learning_rate": 0.0002,
"loss": 0.5616373419761658,
"mean_token_accuracy": 0.771975114941597,
"num_tokens": 6976368.0,
"step": 428
},
{
"entropy": 0.5544670224189758,
"epoch": 1.6018691588785048,
"grad_norm": 0.0315086655318737,
"learning_rate": 0.0002,
"loss": 0.5545330047607422,
"mean_token_accuracy": 0.7738883197307587,
"num_tokens": 6992718.0,
"step": 429
},
{
"entropy": 0.5558904558420181,
"epoch": 1.6056074766355142,
"grad_norm": 0.033460259437561035,
"learning_rate": 0.0002,
"loss": 0.5574325323104858,
"mean_token_accuracy": 0.772273600101471,
"num_tokens": 7009062.0,
"step": 430
},
{
"entropy": 0.5590114444494247,
"epoch": 1.6093457943925233,
"grad_norm": 0.029064292088150978,
"learning_rate": 0.0002,
"loss": 0.5580740571022034,
"mean_token_accuracy": 0.7744424343109131,
"num_tokens": 7025645.0,
"step": 431
},
{
"entropy": 0.5402631610631943,
"epoch": 1.6130841121495327,
"grad_norm": 0.04296636953949928,
"learning_rate": 0.0002,
"loss": 0.5493630170822144,
"mean_token_accuracy": 0.7780915945768356,
"num_tokens": 7041830.0,
"step": 432
},
{
"entropy": 0.5555061250925064,
"epoch": 1.616822429906542,
"grad_norm": 0.03312353044748306,
"learning_rate": 0.0002,
"loss": 0.5578774809837341,
"mean_token_accuracy": 0.7739899456501007,
"num_tokens": 7058231.0,
"step": 433
},
{
"entropy": 0.5563363283872604,
"epoch": 1.6205607476635513,
"grad_norm": 0.03301616013050079,
"learning_rate": 0.0002,
"loss": 0.5517432689666748,
"mean_token_accuracy": 0.7788877487182617,
"num_tokens": 7074655.0,
"step": 434
},
{
"entropy": 0.5507991462945938,
"epoch": 1.6242990654205607,
"grad_norm": 0.03195936232805252,
"learning_rate": 0.0002,
"loss": 0.5476133227348328,
"mean_token_accuracy": 0.7775176912546158,
"num_tokens": 7090766.0,
"step": 435
},
{
"entropy": 0.5565993189811707,
"epoch": 1.62803738317757,
"grad_norm": 0.03229626268148422,
"learning_rate": 0.0002,
"loss": 0.5532009601593018,
"mean_token_accuracy": 0.7752693891525269,
"num_tokens": 7106963.0,
"step": 436
},
{
"entropy": 0.5465118885040283,
"epoch": 1.6317757009345795,
"grad_norm": 0.034706246107816696,
"learning_rate": 0.0002,
"loss": 0.551576554775238,
"mean_token_accuracy": 0.7718321233987808,
"num_tokens": 7122926.0,
"step": 437
},
{
"entropy": 0.5443113446235657,
"epoch": 1.6355140186915889,
"grad_norm": 0.04082060605287552,
"learning_rate": 0.0002,
"loss": 0.5574634671211243,
"mean_token_accuracy": 0.7741082310676575,
"num_tokens": 7139165.0,
"step": 438
},
{
"entropy": 0.5489460676908493,
"epoch": 1.6392523364485982,
"grad_norm": 0.03261584788560867,
"learning_rate": 0.0002,
"loss": 0.5546178817749023,
"mean_token_accuracy": 0.7754340916872025,
"num_tokens": 7155500.0,
"step": 439
},
{
"entropy": 0.5663624107837677,
"epoch": 1.6429906542056076,
"grad_norm": 0.030861368402838707,
"learning_rate": 0.0002,
"loss": 0.564441442489624,
"mean_token_accuracy": 0.7708708792924881,
"num_tokens": 7171927.0,
"step": 440
},
{
"entropy": 0.5702053755521774,
"epoch": 1.6467289719626168,
"grad_norm": 0.03468736633658409,
"learning_rate": 0.0002,
"loss": 0.5645827651023865,
"mean_token_accuracy": 0.768431767821312,
"num_tokens": 7188341.0,
"step": 441
},
{
"entropy": 0.5505633056163788,
"epoch": 1.6504672897196262,
"grad_norm": 0.03153201565146446,
"learning_rate": 0.0002,
"loss": 0.5395671725273132,
"mean_token_accuracy": 0.7812985777854919,
"num_tokens": 7204527.0,
"step": 442
},
{
"entropy": 0.5565541088581085,
"epoch": 1.6542056074766354,
"grad_norm": 0.033020708709955215,
"learning_rate": 0.0002,
"loss": 0.557956874370575,
"mean_token_accuracy": 0.7709688693284988,
"num_tokens": 7220831.0,
"step": 443
},
{
"entropy": 0.5384746044874191,
"epoch": 1.6579439252336448,
"grad_norm": 0.0418318547308445,
"learning_rate": 0.0002,
"loss": 0.5513378977775574,
"mean_token_accuracy": 0.7791547626256943,
"num_tokens": 7236949.0,
"step": 444
},
{
"entropy": 0.5353372693061829,
"epoch": 1.6616822429906541,
"grad_norm": 0.03820660710334778,
"learning_rate": 0.0002,
"loss": 0.5490580201148987,
"mean_token_accuracy": 0.7749721854925156,
"num_tokens": 7253242.0,
"step": 445
},
{
"entropy": 0.5484792143106461,
"epoch": 1.6654205607476635,
"grad_norm": 0.03215263411402702,
"learning_rate": 0.0002,
"loss": 0.5497522354125977,
"mean_token_accuracy": 0.7769928872585297,
"num_tokens": 7269457.0,
"step": 446
},
{
"entropy": 0.5664080828428268,
"epoch": 1.669158878504673,
"grad_norm": 0.02815551683306694,
"learning_rate": 0.0002,
"loss": 0.5563632249832153,
"mean_token_accuracy": 0.7749156504869461,
"num_tokens": 7285879.0,
"step": 447
},
{
"entropy": 0.5464235991239548,
"epoch": 1.6728971962616823,
"grad_norm": 0.02781211957335472,
"learning_rate": 0.0002,
"loss": 0.5405099391937256,
"mean_token_accuracy": 0.781552255153656,
"num_tokens": 7302263.0,
"step": 448
},
{
"entropy": 0.5339583903551102,
"epoch": 1.6766355140186917,
"grad_norm": 0.02980860136449337,
"learning_rate": 0.0002,
"loss": 0.5369037985801697,
"mean_token_accuracy": 0.7814508825540543,
"num_tokens": 7318270.0,
"step": 449
},
{
"entropy": 0.5407254248857498,
"epoch": 1.680373831775701,
"grad_norm": 0.03138496354222298,
"learning_rate": 0.0002,
"loss": 0.5460474491119385,
"mean_token_accuracy": 0.7780201584100723,
"num_tokens": 7334492.0,
"step": 450
},
{
"entropy": 0.5503694117069244,
"epoch": 1.6841121495327103,
"grad_norm": 0.033992450684309006,
"learning_rate": 0.0002,
"loss": 0.5556005239486694,
"mean_token_accuracy": 0.7745715081691742,
"num_tokens": 7350627.0,
"step": 451
},
{
"entropy": 0.5451936274766922,
"epoch": 1.6878504672897197,
"grad_norm": 0.03251323476433754,
"learning_rate": 0.0002,
"loss": 0.5443669557571411,
"mean_token_accuracy": 0.7780810743570328,
"num_tokens": 7367005.0,
"step": 452
},
{
"entropy": 0.5657957345247269,
"epoch": 1.6915887850467288,
"grad_norm": 0.034646324813365936,
"learning_rate": 0.0002,
"loss": 0.5615976452827454,
"mean_token_accuracy": 0.7718859612941742,
"num_tokens": 7383262.0,
"step": 453
},
{
"entropy": 0.5525887459516525,
"epoch": 1.6953271028037382,
"grad_norm": 0.04024709016084671,
"learning_rate": 0.0002,
"loss": 0.5542372465133667,
"mean_token_accuracy": 0.7756317108869553,
"num_tokens": 7399750.0,
"step": 454
},
{
"entropy": 0.5493184924125671,
"epoch": 1.6990654205607476,
"grad_norm": 0.030978472903370857,
"learning_rate": 0.0002,
"loss": 0.5475279688835144,
"mean_token_accuracy": 0.7762274444103241,
"num_tokens": 7415800.0,
"step": 455
},
{
"entropy": 0.5400003641843796,
"epoch": 1.702803738317757,
"grad_norm": 0.03376868739724159,
"learning_rate": 0.0002,
"loss": 0.5407789349555969,
"mean_token_accuracy": 0.7818103283643723,
"num_tokens": 7431961.0,
"step": 456
},
{
"entropy": 0.535884216427803,
"epoch": 1.7065420560747664,
"grad_norm": 0.031221890822052956,
"learning_rate": 0.0002,
"loss": 0.5440670847892761,
"mean_token_accuracy": 0.7796338200569153,
"num_tokens": 7448202.0,
"step": 457
},
{
"entropy": 0.5389861762523651,
"epoch": 1.7102803738317758,
"grad_norm": 0.035680338740348816,
"learning_rate": 0.0002,
"loss": 0.5449787974357605,
"mean_token_accuracy": 0.7797497361898422,
"num_tokens": 7464671.0,
"step": 458
},
{
"entropy": 0.5451969653367996,
"epoch": 1.7140186915887852,
"grad_norm": 0.03255719691514969,
"learning_rate": 0.0002,
"loss": 0.5538266897201538,
"mean_token_accuracy": 0.776149570941925,
"num_tokens": 7480992.0,
"step": 459
},
{
"entropy": 0.5643452405929565,
"epoch": 1.7177570093457943,
"grad_norm": 0.03378691151738167,
"learning_rate": 0.0002,
"loss": 0.5571281313896179,
"mean_token_accuracy": 0.7731311619281769,
"num_tokens": 7497232.0,
"step": 460
},
{
"entropy": 0.5346335917711258,
"epoch": 1.7214953271028037,
"grad_norm": 0.03035924583673477,
"learning_rate": 0.0002,
"loss": 0.5269172191619873,
"mean_token_accuracy": 0.7836929112672806,
"num_tokens": 7513644.0,
"step": 461
},
{
"entropy": 0.5628820955753326,
"epoch": 1.7252336448598131,
"grad_norm": 0.03539309278130531,
"learning_rate": 0.0002,
"loss": 0.5605576634407043,
"mean_token_accuracy": 0.7706831097602844,
"num_tokens": 7529830.0,
"step": 462
},
{
"entropy": 0.5182670503854752,
"epoch": 1.7289719626168223,
"grad_norm": 0.036859650164842606,
"learning_rate": 0.0002,
"loss": 0.5209002494812012,
"mean_token_accuracy": 0.7879375368356705,
"num_tokens": 7545846.0,
"step": 463
},
{
"entropy": 0.5474621504545212,
"epoch": 1.7327102803738317,
"grad_norm": 0.037796422839164734,
"learning_rate": 0.0002,
"loss": 0.5536765456199646,
"mean_token_accuracy": 0.7753565907478333,
"num_tokens": 7562267.0,
"step": 464
},
{
"entropy": 0.5636439174413681,
"epoch": 1.736448598130841,
"grad_norm": 0.037271831184625626,
"learning_rate": 0.0002,
"loss": 0.5606362223625183,
"mean_token_accuracy": 0.7704486697912216,
"num_tokens": 7578670.0,
"step": 465
},
{
"entropy": 0.5483116805553436,
"epoch": 1.7401869158878505,
"grad_norm": 0.031047314405441284,
"learning_rate": 0.0002,
"loss": 0.5489611029624939,
"mean_token_accuracy": 0.7756731957197189,
"num_tokens": 7595113.0,
"step": 466
},
{
"entropy": 0.5289314538240433,
"epoch": 1.7439252336448599,
"grad_norm": 0.035078927874565125,
"learning_rate": 0.0002,
"loss": 0.5344489216804504,
"mean_token_accuracy": 0.7853281199932098,
"num_tokens": 7611153.0,
"step": 467
},
{
"entropy": 0.541694313287735,
"epoch": 1.7476635514018692,
"grad_norm": 0.030235178768634796,
"learning_rate": 0.0002,
"loss": 0.5412616729736328,
"mean_token_accuracy": 0.7781483829021454,
"num_tokens": 7627712.0,
"step": 468
},
{
"entropy": 0.5554275363683701,
"epoch": 1.7514018691588786,
"grad_norm": 0.036943912506103516,
"learning_rate": 0.0002,
"loss": 0.5531514286994934,
"mean_token_accuracy": 0.7756786197423935,
"num_tokens": 7643922.0,
"step": 469
},
{
"entropy": 0.5472631007432938,
"epoch": 1.7551401869158878,
"grad_norm": 0.030970100313425064,
"learning_rate": 0.0002,
"loss": 0.5467809438705444,
"mean_token_accuracy": 0.780939131975174,
"num_tokens": 7660096.0,
"step": 470
},
{
"entropy": 0.525331124663353,
"epoch": 1.7588785046728972,
"grad_norm": 0.04763743281364441,
"learning_rate": 0.0002,
"loss": 0.5361969470977783,
"mean_token_accuracy": 0.782649889588356,
"num_tokens": 7676237.0,
"step": 471
},
{
"entropy": 0.5514428466558456,
"epoch": 1.7626168224299066,
"grad_norm": 0.02942316047847271,
"learning_rate": 0.0002,
"loss": 0.5563341975212097,
"mean_token_accuracy": 0.773899495601654,
"num_tokens": 7692848.0,
"step": 472
},
{
"entropy": 0.5428648442029953,
"epoch": 1.7663551401869158,
"grad_norm": 0.038572002202272415,
"learning_rate": 0.0002,
"loss": 0.5449008941650391,
"mean_token_accuracy": 0.7810295820236206,
"num_tokens": 7708895.0,
"step": 473
},
{
"entropy": 0.5526584386825562,
"epoch": 1.7700934579439251,
"grad_norm": 0.03303026407957077,
"learning_rate": 0.0002,
"loss": 0.5465356111526489,
"mean_token_accuracy": 0.7774733603000641,
"num_tokens": 7725206.0,
"step": 474
},
{
"entropy": 0.5638225227594376,
"epoch": 1.7738317757009345,
"grad_norm": 0.029633166268467903,
"learning_rate": 0.0002,
"loss": 0.5624324083328247,
"mean_token_accuracy": 0.7697116434574127,
"num_tokens": 7741838.0,
"step": 475
},
{
"entropy": 0.5561016201972961,
"epoch": 1.777570093457944,
"grad_norm": 0.0328570231795311,
"learning_rate": 0.0002,
"loss": 0.5563735961914062,
"mean_token_accuracy": 0.7721449285745621,
"num_tokens": 7758049.0,
"step": 476
},
{
"entropy": 0.5516675412654877,
"epoch": 1.7813084112149533,
"grad_norm": 0.03453238308429718,
"learning_rate": 0.0002,
"loss": 0.5518988370895386,
"mean_token_accuracy": 0.7777107656002045,
"num_tokens": 7774257.0,
"step": 477
},
{
"entropy": 0.5394668728113174,
"epoch": 1.7850467289719627,
"grad_norm": 0.03409087657928467,
"learning_rate": 0.0002,
"loss": 0.5432859659194946,
"mean_token_accuracy": 0.7796248197555542,
"num_tokens": 7790837.0,
"step": 478
},
{
"entropy": 0.5491889864206314,
"epoch": 1.788785046728972,
"grad_norm": 0.03139546513557434,
"learning_rate": 0.0002,
"loss": 0.5477681159973145,
"mean_token_accuracy": 0.7775027453899384,
"num_tokens": 7807302.0,
"step": 479
},
{
"entropy": 0.5528343021869659,
"epoch": 1.7925233644859813,
"grad_norm": 0.031248709186911583,
"learning_rate": 0.0002,
"loss": 0.5557167530059814,
"mean_token_accuracy": 0.7744993418455124,
"num_tokens": 7823635.0,
"step": 480
},
{
"entropy": 0.5458249896764755,
"epoch": 1.7962616822429907,
"grad_norm": 0.03402215987443924,
"learning_rate": 0.0002,
"loss": 0.5505017042160034,
"mean_token_accuracy": 0.7759317308664322,
"num_tokens": 7839914.0,
"step": 481
},
{
"entropy": 0.552555724978447,
"epoch": 1.8,
"grad_norm": 0.030951669439673424,
"learning_rate": 0.0002,
"loss": 0.560877799987793,
"mean_token_accuracy": 0.77203568816185,
"num_tokens": 7856194.0,
"step": 482
},
{
"entropy": 0.5391200333833694,
"epoch": 1.8037383177570092,
"grad_norm": 0.04003436490893364,
"learning_rate": 0.0002,
"loss": 0.5390163660049438,
"mean_token_accuracy": 0.7827838510274887,
"num_tokens": 7872434.0,
"step": 483
},
{
"entropy": 0.5392342656850815,
"epoch": 1.8074766355140186,
"grad_norm": 0.03150493651628494,
"learning_rate": 0.0002,
"loss": 0.5406180620193481,
"mean_token_accuracy": 0.7828439474105835,
"num_tokens": 7888751.0,
"step": 484
},
{
"entropy": 0.5622579157352448,
"epoch": 1.811214953271028,
"grad_norm": 0.03376127406954765,
"learning_rate": 0.0002,
"loss": 0.5644164681434631,
"mean_token_accuracy": 0.7707268595695496,
"num_tokens": 7905072.0,
"step": 485
},
{
"entropy": 0.5327235907316208,
"epoch": 1.8149532710280374,
"grad_norm": 0.028277890756726265,
"learning_rate": 0.0002,
"loss": 0.5303685069084167,
"mean_token_accuracy": 0.7862435132265091,
"num_tokens": 7921459.0,
"step": 486
},
{
"entropy": 0.5588890165090561,
"epoch": 1.8186915887850468,
"grad_norm": 0.03095029853284359,
"learning_rate": 0.0002,
"loss": 0.5525569915771484,
"mean_token_accuracy": 0.7770346105098724,
"num_tokens": 7937961.0,
"step": 487
},
{
"entropy": 0.5573548376560211,
"epoch": 1.8224299065420562,
"grad_norm": 0.03045843541622162,
"learning_rate": 0.0002,
"loss": 0.5535331964492798,
"mean_token_accuracy": 0.7766827940940857,
"num_tokens": 7954609.0,
"step": 488
},
{
"entropy": 0.5567604452371597,
"epoch": 1.8261682242990656,
"grad_norm": 0.029482809826731682,
"learning_rate": 0.0002,
"loss": 0.5576134324073792,
"mean_token_accuracy": 0.772316038608551,
"num_tokens": 7971097.0,
"step": 489
},
{
"entropy": 0.5545413047075272,
"epoch": 1.8299065420560747,
"grad_norm": 0.03891676291823387,
"learning_rate": 0.0002,
"loss": 0.5648533701896667,
"mean_token_accuracy": 0.7718105167150497,
"num_tokens": 7987377.0,
"step": 490
},
{
"entropy": 0.5786599218845367,
"epoch": 1.8336448598130841,
"grad_norm": 0.030758248642086983,
"learning_rate": 0.0002,
"loss": 0.5835361480712891,
"mean_token_accuracy": 0.762917771935463,
"num_tokens": 8003799.0,
"step": 491
},
{
"entropy": 0.5397150218486786,
"epoch": 1.8373831775700935,
"grad_norm": 0.03965795785188675,
"learning_rate": 0.0002,
"loss": 0.538779616355896,
"mean_token_accuracy": 0.7839108556509018,
"num_tokens": 8020279.0,
"step": 492
},
{
"entropy": 0.5535183995962143,
"epoch": 1.8411214953271027,
"grad_norm": 0.03004513867199421,
"learning_rate": 0.0002,
"loss": 0.5507811903953552,
"mean_token_accuracy": 0.7755124121904373,
"num_tokens": 8036491.0,
"step": 493
},
{
"entropy": 0.5442592799663544,
"epoch": 1.844859813084112,
"grad_norm": 0.03522132337093353,
"learning_rate": 0.0002,
"loss": 0.5478004217147827,
"mean_token_accuracy": 0.7766154408454895,
"num_tokens": 8052807.0,
"step": 494
},
{
"entropy": 0.5266854241490364,
"epoch": 1.8485981308411215,
"grad_norm": 0.030206192284822464,
"learning_rate": 0.0002,
"loss": 0.529688835144043,
"mean_token_accuracy": 0.7819836139678955,
"num_tokens": 8068712.0,
"step": 495
},
{
"entropy": 0.5283671095967293,
"epoch": 1.8523364485981308,
"grad_norm": 0.03329138457775116,
"learning_rate": 0.0002,
"loss": 0.5376101136207581,
"mean_token_accuracy": 0.7793748378753662,
"num_tokens": 8085084.0,
"step": 496
},
{
"entropy": 0.5712718665599823,
"epoch": 1.8560747663551402,
"grad_norm": 0.0325874425470829,
"learning_rate": 0.0002,
"loss": 0.5709162950515747,
"mean_token_accuracy": 0.7662056684494019,
"num_tokens": 8101731.0,
"step": 497
},
{
"entropy": 0.5663121491670609,
"epoch": 1.8598130841121496,
"grad_norm": 0.03357568010687828,
"learning_rate": 0.0002,
"loss": 0.5650657415390015,
"mean_token_accuracy": 0.7691219747066498,
"num_tokens": 8118244.0,
"step": 498
},
{
"entropy": 0.5427432358264923,
"epoch": 1.863551401869159,
"grad_norm": 0.03203551098704338,
"learning_rate": 0.0002,
"loss": 0.5398803949356079,
"mean_token_accuracy": 0.7808598130941391,
"num_tokens": 8134657.0,
"step": 499
},
{
"entropy": 0.5573120266199112,
"epoch": 1.8672897196261682,
"grad_norm": 0.029932986944913864,
"learning_rate": 0.0002,
"loss": 0.5522656440734863,
"mean_token_accuracy": 0.7727643102407455,
"num_tokens": 8151058.0,
"step": 500
},
{
"entropy": 0.5573428720235825,
"epoch": 1.8710280373831776,
"grad_norm": 0.02661440148949623,
"learning_rate": 0.0002,
"loss": 0.5512294173240662,
"mean_token_accuracy": 0.7765780538320541,
"num_tokens": 8167736.0,
"step": 501
},
{
"entropy": 0.5472890585660934,
"epoch": 1.874766355140187,
"grad_norm": 0.028882022947072983,
"learning_rate": 0.0002,
"loss": 0.5479044318199158,
"mean_token_accuracy": 0.777178093791008,
"num_tokens": 8183857.0,
"step": 502
},
{
"entropy": 0.5511818528175354,
"epoch": 1.8785046728971961,
"grad_norm": 0.032389186322689056,
"learning_rate": 0.0002,
"loss": 0.5552236437797546,
"mean_token_accuracy": 0.7762337774038315,
"num_tokens": 8199955.0,
"step": 503
},
{
"entropy": 0.546854555606842,
"epoch": 1.8822429906542055,
"grad_norm": 0.0336172878742218,
"learning_rate": 0.0002,
"loss": 0.55290687084198,
"mean_token_accuracy": 0.7735693603754044,
"num_tokens": 8216221.0,
"step": 504
},
{
"entropy": 0.5447833836078644,
"epoch": 1.885981308411215,
"grad_norm": 0.0326668806374073,
"learning_rate": 0.0002,
"loss": 0.5433166027069092,
"mean_token_accuracy": 0.7759248912334442,
"num_tokens": 8232519.0,
"step": 505
},
{
"entropy": 0.5311590135097504,
"epoch": 1.8897196261682243,
"grad_norm": 0.0328470915555954,
"learning_rate": 0.0002,
"loss": 0.5332115888595581,
"mean_token_accuracy": 0.7827264666557312,
"num_tokens": 8248973.0,
"step": 506
},
{
"entropy": 0.5405398160219193,
"epoch": 1.8934579439252337,
"grad_norm": 0.03319946303963661,
"learning_rate": 0.0002,
"loss": 0.5498695969581604,
"mean_token_accuracy": 0.7756136506795883,
"num_tokens": 8265054.0,
"step": 507
},
{
"entropy": 0.5590761750936508,
"epoch": 1.897196261682243,
"grad_norm": 0.03323895111680031,
"learning_rate": 0.0002,
"loss": 0.5674346685409546,
"mean_token_accuracy": 0.7680935710668564,
"num_tokens": 8281659.0,
"step": 508
},
{
"entropy": 0.5502993315458298,
"epoch": 1.9009345794392525,
"grad_norm": 0.036393504589796066,
"learning_rate": 0.0002,
"loss": 0.5518926382064819,
"mean_token_accuracy": 0.7772549986839294,
"num_tokens": 8298120.0,
"step": 509
},
{
"entropy": 0.5434653609991074,
"epoch": 1.9046728971962616,
"grad_norm": 0.030826875939965248,
"learning_rate": 0.0002,
"loss": 0.5373662710189819,
"mean_token_accuracy": 0.7814789414405823,
"num_tokens": 8314165.0,
"step": 510
},
{
"entropy": 0.5616354942321777,
"epoch": 1.908411214953271,
"grad_norm": 0.03320663422346115,
"learning_rate": 0.0002,
"loss": 0.5573338866233826,
"mean_token_accuracy": 0.7744273245334625,
"num_tokens": 8330561.0,
"step": 511
},
{
"entropy": 0.5629893988370895,
"epoch": 1.9121495327102802,
"grad_norm": 0.03727097064256668,
"learning_rate": 0.0002,
"loss": 0.5611152648925781,
"mean_token_accuracy": 0.773328885436058,
"num_tokens": 8346708.0,
"step": 512
},
{
"entropy": 0.5592319965362549,
"epoch": 1.9158878504672896,
"grad_norm": 0.03037538379430771,
"learning_rate": 0.0002,
"loss": 0.5616269111633301,
"mean_token_accuracy": 0.7723426669836044,
"num_tokens": 8362957.0,
"step": 513
},
{
"entropy": 0.549030601978302,
"epoch": 1.919626168224299,
"grad_norm": 0.03563016280531883,
"learning_rate": 0.0002,
"loss": 0.5529686808586121,
"mean_token_accuracy": 0.7743269205093384,
"num_tokens": 8379387.0,
"step": 514
},
{
"entropy": 0.5441324412822723,
"epoch": 1.9233644859813084,
"grad_norm": 0.031737376004457474,
"learning_rate": 0.0002,
"loss": 0.5500344038009644,
"mean_token_accuracy": 0.7763906866312027,
"num_tokens": 8395747.0,
"step": 515
},
{
"entropy": 0.5507270097732544,
"epoch": 1.9271028037383178,
"grad_norm": 0.03285627067089081,
"learning_rate": 0.0002,
"loss": 0.5587583780288696,
"mean_token_accuracy": 0.7742376923561096,
"num_tokens": 8412181.0,
"step": 516
},
{
"entropy": 0.5456591248512268,
"epoch": 1.9308411214953272,
"grad_norm": 0.03147684410214424,
"learning_rate": 0.0002,
"loss": 0.5484343767166138,
"mean_token_accuracy": 0.7780278623104095,
"num_tokens": 8428664.0,
"step": 517
},
{
"entropy": 0.5484454035758972,
"epoch": 1.9345794392523366,
"grad_norm": 0.036278773099184036,
"learning_rate": 0.0002,
"loss": 0.5547294616699219,
"mean_token_accuracy": 0.7715467214584351,
"num_tokens": 8444942.0,
"step": 518
},
{
"entropy": 0.5404845178127289,
"epoch": 1.938317757009346,
"grad_norm": 0.032282162457704544,
"learning_rate": 0.0002,
"loss": 0.5409780740737915,
"mean_token_accuracy": 0.779809907078743,
"num_tokens": 8461035.0,
"step": 519
},
{
"entropy": 0.5527531504631042,
"epoch": 1.9420560747663551,
"grad_norm": 0.03141535073518753,
"learning_rate": 0.0002,
"loss": 0.5439110994338989,
"mean_token_accuracy": 0.7789405584335327,
"num_tokens": 8477333.0,
"step": 520
},
{
"entropy": 0.5531508475542068,
"epoch": 1.9457943925233645,
"grad_norm": 0.032372504472732544,
"learning_rate": 0.0002,
"loss": 0.5456727147102356,
"mean_token_accuracy": 0.7779283076524734,
"num_tokens": 8493646.0,
"step": 521
},
{
"entropy": 0.558539628982544,
"epoch": 1.9495327102803737,
"grad_norm": 0.03805968537926674,
"learning_rate": 0.0002,
"loss": 0.5575815439224243,
"mean_token_accuracy": 0.7742009460926056,
"num_tokens": 8510069.0,
"step": 522
},
{
"entropy": 0.5543098747730255,
"epoch": 1.953271028037383,
"grad_norm": 0.03495538979768753,
"learning_rate": 0.0002,
"loss": 0.558309018611908,
"mean_token_accuracy": 0.7752062678337097,
"num_tokens": 8526687.0,
"step": 523
},
{
"entropy": 0.5394291281700134,
"epoch": 1.9570093457943925,
"grad_norm": 0.060034435242414474,
"learning_rate": 0.0002,
"loss": 0.5499407649040222,
"mean_token_accuracy": 0.7747859209775925,
"num_tokens": 8543194.0,
"step": 524
},
{
"entropy": 0.5493269860744476,
"epoch": 1.9607476635514018,
"grad_norm": 0.03242463245987892,
"learning_rate": 0.0002,
"loss": 0.5581871271133423,
"mean_token_accuracy": 0.7717736065387726,
"num_tokens": 8559635.0,
"step": 525
},
{
"entropy": 0.5709338933229446,
"epoch": 1.9644859813084112,
"grad_norm": 0.09612691402435303,
"learning_rate": 0.0002,
"loss": 0.5857856273651123,
"mean_token_accuracy": 0.7716985046863556,
"num_tokens": 8575682.0,
"step": 526
},
{
"entropy": 0.5535681843757629,
"epoch": 1.9682242990654206,
"grad_norm": 0.03228386864066124,
"learning_rate": 0.0002,
"loss": 0.5427148342132568,
"mean_token_accuracy": 0.7775698453187943,
"num_tokens": 8591993.0,
"step": 527
},
{
"entropy": 0.5595205128192902,
"epoch": 1.97196261682243,
"grad_norm": 0.05833456665277481,
"learning_rate": 0.0002,
"loss": 0.5632327795028687,
"mean_token_accuracy": 0.7714700251817703,
"num_tokens": 8608390.0,
"step": 528
},
{
"entropy": 0.5412962287664413,
"epoch": 1.9757009345794394,
"grad_norm": 0.04238782078027725,
"learning_rate": 0.0002,
"loss": 0.5416378378868103,
"mean_token_accuracy": 0.7781312763690948,
"num_tokens": 8624553.0,
"step": 529
},
{
"entropy": 0.5466502904891968,
"epoch": 1.9794392523364486,
"grad_norm": 0.038432635366916656,
"learning_rate": 0.0002,
"loss": 0.5546246767044067,
"mean_token_accuracy": 0.7747474908828735,
"num_tokens": 8640859.0,
"step": 530
},
{
"entropy": 0.5358689278364182,
"epoch": 1.983177570093458,
"grad_norm": 0.03189871460199356,
"learning_rate": 0.0002,
"loss": 0.5437639355659485,
"mean_token_accuracy": 0.7790816277265549,
"num_tokens": 8657164.0,
"step": 531
},
{
"entropy": 0.5428293794393539,
"epoch": 1.9869158878504671,
"grad_norm": 0.031927406787872314,
"learning_rate": 0.0002,
"loss": 0.5386630892753601,
"mean_token_accuracy": 0.7813318967819214,
"num_tokens": 8673653.0,
"step": 532
},
{
"entropy": 0.5520585179328918,
"epoch": 1.9906542056074765,
"grad_norm": 0.036430567502975464,
"learning_rate": 0.0002,
"loss": 0.5499424338340759,
"mean_token_accuracy": 0.7754785418510437,
"num_tokens": 8689872.0,
"step": 533
},
{
"entropy": 0.5492618307471275,
"epoch": 1.994392523364486,
"grad_norm": 0.03422766923904419,
"learning_rate": 0.0002,
"loss": 0.5523169040679932,
"mean_token_accuracy": 0.7751457393169403,
"num_tokens": 8706316.0,
"step": 534
},
{
"entropy": 0.5318035036325455,
"epoch": 1.9981308411214953,
"grad_norm": 0.029748188331723213,
"learning_rate": 0.0002,
"loss": 0.5351126790046692,
"mean_token_accuracy": 0.7828892469406128,
"num_tokens": 8722797.0,
"step": 535
},
{
"entropy": 0.5385511517524719,
"epoch": 2.0,
"grad_norm": 0.05353870987892151,
"learning_rate": 0.0002,
"loss": 0.5426778197288513,
"mean_token_accuracy": 0.7800059914588928,
"num_tokens": 8729632.0,
"step": 536
},
{
"entropy": 0.5657109320163727,
"epoch": 2.0037383177570094,
"grad_norm": 0.03845514729619026,
"learning_rate": 0.0002,
"loss": 0.5532518029212952,
"mean_token_accuracy": 0.7752802222967148,
"num_tokens": 8746094.0,
"step": 537
},
{
"entropy": 0.5414439141750336,
"epoch": 2.007476635514019,
"grad_norm": 0.030992809683084488,
"learning_rate": 0.0002,
"loss": 0.5374770164489746,
"mean_token_accuracy": 0.7807809114456177,
"num_tokens": 8762553.0,
"step": 538
},
{
"entropy": 0.520616352558136,
"epoch": 2.011214953271028,
"grad_norm": 0.03543594852089882,
"learning_rate": 0.0002,
"loss": 0.5239126086235046,
"mean_token_accuracy": 0.7860341370105743,
"num_tokens": 8778649.0,
"step": 539
},
{
"entropy": 0.5175309702754021,
"epoch": 2.0149532710280376,
"grad_norm": 0.03473593294620514,
"learning_rate": 0.0002,
"loss": 0.5261198282241821,
"mean_token_accuracy": 0.7881468534469604,
"num_tokens": 8794905.0,
"step": 540
},
{
"entropy": 0.5151422992348671,
"epoch": 2.0186915887850465,
"grad_norm": 0.038792964071035385,
"learning_rate": 0.0002,
"loss": 0.5288342833518982,
"mean_token_accuracy": 0.7841326147317886,
"num_tokens": 8811277.0,
"step": 541
},
{
"entropy": 0.5424266159534454,
"epoch": 2.022429906542056,
"grad_norm": 0.03833077475428581,
"learning_rate": 0.0002,
"loss": 0.5454620718955994,
"mean_token_accuracy": 0.7795733213424683,
"num_tokens": 8827670.0,
"step": 542
},
{
"entropy": 0.533804714679718,
"epoch": 2.0261682242990653,
"grad_norm": 0.03583015128970146,
"learning_rate": 0.0002,
"loss": 0.5267578959465027,
"mean_token_accuracy": 0.7867784053087234,
"num_tokens": 8843733.0,
"step": 543
},
{
"entropy": 0.5466929823160172,
"epoch": 2.0299065420560747,
"grad_norm": 0.03870777040719986,
"learning_rate": 0.0002,
"loss": 0.5435931086540222,
"mean_token_accuracy": 0.7770351320505142,
"num_tokens": 8860036.0,
"step": 544
},
{
"entropy": 0.5408391207456589,
"epoch": 2.033644859813084,
"grad_norm": 0.03353007137775421,
"learning_rate": 0.0002,
"loss": 0.5323169827461243,
"mean_token_accuracy": 0.7834465950727463,
"num_tokens": 8876470.0,
"step": 545
},
{
"entropy": 0.5217868834733963,
"epoch": 2.0373831775700935,
"grad_norm": 0.036939021199941635,
"learning_rate": 0.0002,
"loss": 0.5216724276542664,
"mean_token_accuracy": 0.7884602099657059,
"num_tokens": 8892628.0,
"step": 546
},
{
"entropy": 0.5368964821100235,
"epoch": 2.041121495327103,
"grad_norm": 0.043159015476703644,
"learning_rate": 0.0002,
"loss": 0.5444939136505127,
"mean_token_accuracy": 0.778968021273613,
"num_tokens": 8909028.0,
"step": 547
},
{
"entropy": 0.5433569848537445,
"epoch": 2.0448598130841122,
"grad_norm": 0.03701786324381828,
"learning_rate": 0.0002,
"loss": 0.5439752340316772,
"mean_token_accuracy": 0.7791613191366196,
"num_tokens": 8925310.0,
"step": 548
},
{
"entropy": 0.5270129442214966,
"epoch": 2.0485981308411216,
"grad_norm": 0.04250190034508705,
"learning_rate": 0.0002,
"loss": 0.5210642218589783,
"mean_token_accuracy": 0.7867415547370911,
"num_tokens": 8941225.0,
"step": 549
},
{
"entropy": 0.5519801378250122,
"epoch": 2.052336448598131,
"grad_norm": 0.03549535945057869,
"learning_rate": 0.0002,
"loss": 0.550297200679779,
"mean_token_accuracy": 0.7756542861461639,
"num_tokens": 8957662.0,
"step": 550
},
{
"entropy": 0.5188534706830978,
"epoch": 2.05607476635514,
"grad_norm": 0.03532535210251808,
"learning_rate": 0.0002,
"loss": 0.5225726962089539,
"mean_token_accuracy": 0.7875347584486008,
"num_tokens": 8973986.0,
"step": 551
},
{
"entropy": 0.5331487953662872,
"epoch": 2.0598130841121494,
"grad_norm": 0.0401851125061512,
"learning_rate": 0.0002,
"loss": 0.5345657467842102,
"mean_token_accuracy": 0.7807552814483643,
"num_tokens": 8990453.0,
"step": 552
},
{
"entropy": 0.5301813259720802,
"epoch": 2.0635514018691588,
"grad_norm": 0.04093443974852562,
"learning_rate": 0.0002,
"loss": 0.536128580570221,
"mean_token_accuracy": 0.781855434179306,
"num_tokens": 9006810.0,
"step": 553
},
{
"entropy": 0.5511504411697388,
"epoch": 2.067289719626168,
"grad_norm": 0.04108293727040291,
"learning_rate": 0.0002,
"loss": 0.547398567199707,
"mean_token_accuracy": 0.7787968963384628,
"num_tokens": 9023044.0,
"step": 554
},
{
"entropy": 0.5452945232391357,
"epoch": 2.0710280373831775,
"grad_norm": 0.04133358225226402,
"learning_rate": 0.0002,
"loss": 0.5406649112701416,
"mean_token_accuracy": 0.7804151326417923,
"num_tokens": 9039300.0,
"step": 555
},
{
"entropy": 0.5133676081895828,
"epoch": 2.074766355140187,
"grad_norm": 0.0368187241256237,
"learning_rate": 0.0002,
"loss": 0.510840892791748,
"mean_token_accuracy": 0.7948838770389557,
"num_tokens": 9055408.0,
"step": 556
},
{
"entropy": 0.5286162942647934,
"epoch": 2.0785046728971963,
"grad_norm": 0.037287503480911255,
"learning_rate": 0.0002,
"loss": 0.5286219120025635,
"mean_token_accuracy": 0.7867581397294998,
"num_tokens": 9071847.0,
"step": 557
},
{
"entropy": 0.5187130272388458,
"epoch": 2.0822429906542057,
"grad_norm": 0.03932078555226326,
"learning_rate": 0.0002,
"loss": 0.5252044200897217,
"mean_token_accuracy": 0.788768544793129,
"num_tokens": 9088062.0,
"step": 558
},
{
"entropy": 0.5239534676074982,
"epoch": 2.085981308411215,
"grad_norm": 0.04231242835521698,
"learning_rate": 0.0002,
"loss": 0.535202145576477,
"mean_token_accuracy": 0.7852179259061813,
"num_tokens": 9104468.0,
"step": 559
},
{
"entropy": 0.528278037905693,
"epoch": 2.0897196261682245,
"grad_norm": 0.03444297984242439,
"learning_rate": 0.0002,
"loss": 0.5238081812858582,
"mean_token_accuracy": 0.7863867878913879,
"num_tokens": 9120622.0,
"step": 560
},
{
"entropy": 0.5545478612184525,
"epoch": 2.0934579439252334,
"grad_norm": 0.04182487353682518,
"learning_rate": 0.0002,
"loss": 0.5527917742729187,
"mean_token_accuracy": 0.7766451835632324,
"num_tokens": 9137031.0,
"step": 561
},
{
"entropy": 0.521744892001152,
"epoch": 2.097196261682243,
"grad_norm": 0.03438956290483475,
"learning_rate": 0.0002,
"loss": 0.5255383849143982,
"mean_token_accuracy": 0.7855681478977203,
"num_tokens": 9153374.0,
"step": 562
},
{
"entropy": 0.5317307189106941,
"epoch": 2.100934579439252,
"grad_norm": 0.04259387031197548,
"learning_rate": 0.0002,
"loss": 0.530976414680481,
"mean_token_accuracy": 0.7861284911632538,
"num_tokens": 9169379.0,
"step": 563
},
{
"entropy": 0.5382358431816101,
"epoch": 2.1046728971962616,
"grad_norm": 0.03778582066297531,
"learning_rate": 0.0002,
"loss": 0.5446516871452332,
"mean_token_accuracy": 0.7786799967288971,
"num_tokens": 9185673.0,
"step": 564
},
{
"entropy": 0.5174337849020958,
"epoch": 2.108411214953271,
"grad_norm": 0.03816930949687958,
"learning_rate": 0.0002,
"loss": 0.5179592967033386,
"mean_token_accuracy": 0.7912393063306808,
"num_tokens": 9201995.0,
"step": 565
},
{
"entropy": 0.5279374569654465,
"epoch": 2.1121495327102804,
"grad_norm": 0.038216955959796906,
"learning_rate": 0.0002,
"loss": 0.5243582129478455,
"mean_token_accuracy": 0.7866894006729126,
"num_tokens": 9218133.0,
"step": 566
},
{
"entropy": 0.5245715379714966,
"epoch": 2.1158878504672898,
"grad_norm": 0.03613874316215515,
"learning_rate": 0.0002,
"loss": 0.5249512791633606,
"mean_token_accuracy": 0.7851840853691101,
"num_tokens": 9234342.0,
"step": 567
},
{
"entropy": 0.5192612558603287,
"epoch": 2.119626168224299,
"grad_norm": 0.04042578116059303,
"learning_rate": 0.0002,
"loss": 0.5259383320808411,
"mean_token_accuracy": 0.7858112007379532,
"num_tokens": 9250696.0,
"step": 568
},
{
"entropy": 0.5262997299432755,
"epoch": 2.1233644859813086,
"grad_norm": 0.04460779204964638,
"learning_rate": 0.0002,
"loss": 0.5308440923690796,
"mean_token_accuracy": 0.7877162247896194,
"num_tokens": 9266979.0,
"step": 569
},
{
"entropy": 0.5224001705646515,
"epoch": 2.127102803738318,
"grad_norm": 0.03817397728562355,
"learning_rate": 0.0002,
"loss": 0.5229726433753967,
"mean_token_accuracy": 0.7861741036176682,
"num_tokens": 9283280.0,
"step": 570
},
{
"entropy": 0.5274494737386703,
"epoch": 2.130841121495327,
"grad_norm": 0.04161069914698601,
"learning_rate": 0.0002,
"loss": 0.5270024538040161,
"mean_token_accuracy": 0.7860408127307892,
"num_tokens": 9299630.0,
"step": 571
},
{
"entropy": 0.5552078932523727,
"epoch": 2.1345794392523363,
"grad_norm": 0.04526656121015549,
"learning_rate": 0.0002,
"loss": 0.547661542892456,
"mean_token_accuracy": 0.77776238322258,
"num_tokens": 9316114.0,
"step": 572
},
{
"entropy": 0.5352555364370346,
"epoch": 2.1383177570093457,
"grad_norm": 0.037117403000593185,
"learning_rate": 0.0002,
"loss": 0.5322074294090271,
"mean_token_accuracy": 0.7845579087734222,
"num_tokens": 9332486.0,
"step": 573
},
{
"entropy": 0.5299685597419739,
"epoch": 2.142056074766355,
"grad_norm": 0.04335174337029457,
"learning_rate": 0.0002,
"loss": 0.5333051085472107,
"mean_token_accuracy": 0.7831422835588455,
"num_tokens": 9348999.0,
"step": 574
},
{
"entropy": 0.5251427963376045,
"epoch": 2.1457943925233645,
"grad_norm": 0.04729305952787399,
"learning_rate": 0.0002,
"loss": 0.5304993987083435,
"mean_token_accuracy": 0.7857193797826767,
"num_tokens": 9365291.0,
"step": 575
},
{
"entropy": 0.5248839110136032,
"epoch": 2.149532710280374,
"grad_norm": 0.04293828830122948,
"learning_rate": 0.0002,
"loss": 0.5300874710083008,
"mean_token_accuracy": 0.784340038895607,
"num_tokens": 9381734.0,
"step": 576
},
{
"entropy": 0.5214874297380447,
"epoch": 2.1532710280373832,
"grad_norm": 0.04350607469677925,
"learning_rate": 0.0002,
"loss": 0.5177597403526306,
"mean_token_accuracy": 0.7909844070672989,
"num_tokens": 9397955.0,
"step": 577
},
{
"entropy": 0.5421570688486099,
"epoch": 2.1570093457943926,
"grad_norm": 0.042496006935834885,
"learning_rate": 0.0002,
"loss": 0.5425592660903931,
"mean_token_accuracy": 0.7795795798301697,
"num_tokens": 9414143.0,
"step": 578
},
{
"entropy": 0.535075232386589,
"epoch": 2.160747663551402,
"grad_norm": 0.049906548112630844,
"learning_rate": 0.0002,
"loss": 0.5370741486549377,
"mean_token_accuracy": 0.7806216180324554,
"num_tokens": 9430295.0,
"step": 579
},
{
"entropy": 0.535729855298996,
"epoch": 2.1644859813084114,
"grad_norm": 0.04840796813368797,
"learning_rate": 0.0002,
"loss": 0.5347393155097961,
"mean_token_accuracy": 0.7850737869739532,
"num_tokens": 9446633.0,
"step": 580
},
{
"entropy": 0.5312991067767143,
"epoch": 2.1682242990654204,
"grad_norm": 0.04839569702744484,
"learning_rate": 0.0002,
"loss": 0.5378549098968506,
"mean_token_accuracy": 0.7815908044576645,
"num_tokens": 9462924.0,
"step": 581
},
{
"entropy": 0.5284993052482605,
"epoch": 2.1719626168224297,
"grad_norm": 0.04563288018107414,
"learning_rate": 0.0002,
"loss": 0.5385716557502747,
"mean_token_accuracy": 0.7814656347036362,
"num_tokens": 9479222.0,
"step": 582
},
{
"entropy": 0.535816490650177,
"epoch": 2.175700934579439,
"grad_norm": 0.05489310622215271,
"learning_rate": 0.0002,
"loss": 0.5382475256919861,
"mean_token_accuracy": 0.7812406271696091,
"num_tokens": 9495589.0,
"step": 583
},
{
"entropy": 0.549729734659195,
"epoch": 2.1794392523364485,
"grad_norm": 0.0424075648188591,
"learning_rate": 0.0002,
"loss": 0.539716899394989,
"mean_token_accuracy": 0.7819323092699051,
"num_tokens": 9511725.0,
"step": 584
},
{
"entropy": 0.5317162126302719,
"epoch": 2.183177570093458,
"grad_norm": 0.03563420847058296,
"learning_rate": 0.0002,
"loss": 0.5235797166824341,
"mean_token_accuracy": 0.7905198931694031,
"num_tokens": 9527971.0,
"step": 585
},
{
"entropy": 0.5211209952831268,
"epoch": 2.1869158878504673,
"grad_norm": 0.048658616840839386,
"learning_rate": 0.0002,
"loss": 0.5268206000328064,
"mean_token_accuracy": 0.7845446020364761,
"num_tokens": 9544253.0,
"step": 586
},
{
"entropy": 0.5116122514009476,
"epoch": 2.1906542056074767,
"grad_norm": 0.04198598116636276,
"learning_rate": 0.0002,
"loss": 0.5190539360046387,
"mean_token_accuracy": 0.7874016612768173,
"num_tokens": 9560518.0,
"step": 587
},
{
"entropy": 0.5246260613203049,
"epoch": 2.194392523364486,
"grad_norm": 0.03876075521111488,
"learning_rate": 0.0002,
"loss": 0.5228715538978577,
"mean_token_accuracy": 0.7850266695022583,
"num_tokens": 9576775.0,
"step": 588
},
{
"entropy": 0.5278798937797546,
"epoch": 2.1981308411214955,
"grad_norm": 0.04761234670877457,
"learning_rate": 0.0002,
"loss": 0.5265949964523315,
"mean_token_accuracy": 0.7893748730421066,
"num_tokens": 9593040.0,
"step": 589
},
{
"entropy": 0.548830658197403,
"epoch": 2.201869158878505,
"grad_norm": 0.04078621417284012,
"learning_rate": 0.0002,
"loss": 0.5517237186431885,
"mean_token_accuracy": 0.778541699051857,
"num_tokens": 9609499.0,
"step": 590
},
{
"entropy": 0.5336392223834991,
"epoch": 2.205607476635514,
"grad_norm": 0.04143911972641945,
"learning_rate": 0.0002,
"loss": 0.5296382308006287,
"mean_token_accuracy": 0.7824793457984924,
"num_tokens": 9625911.0,
"step": 591
},
{
"entropy": 0.5379772335290909,
"epoch": 2.209345794392523,
"grad_norm": 0.03608503192663193,
"learning_rate": 0.0002,
"loss": 0.5343111753463745,
"mean_token_accuracy": 0.7822979539632797,
"num_tokens": 9642395.0,
"step": 592
},
{
"entropy": 0.5172793120145798,
"epoch": 2.2130841121495326,
"grad_norm": 0.034696269780397415,
"learning_rate": 0.0002,
"loss": 0.5195714235305786,
"mean_token_accuracy": 0.7902600318193436,
"num_tokens": 9658662.0,
"step": 593
},
{
"entropy": 0.5202511548995972,
"epoch": 2.216822429906542,
"grad_norm": 0.0416097529232502,
"learning_rate": 0.0002,
"loss": 0.5290377736091614,
"mean_token_accuracy": 0.7843390554189682,
"num_tokens": 9674880.0,
"step": 594
},
{
"entropy": 0.5413576662540436,
"epoch": 2.2205607476635514,
"grad_norm": 0.0419846810400486,
"learning_rate": 0.0002,
"loss": 0.5517836809158325,
"mean_token_accuracy": 0.7757999449968338,
"num_tokens": 9691443.0,
"step": 595
},
{
"entropy": 0.5511815398931503,
"epoch": 2.2242990654205608,
"grad_norm": 0.042311880737543106,
"learning_rate": 0.0002,
"loss": 0.5441216230392456,
"mean_token_accuracy": 0.7797399759292603,
"num_tokens": 9707667.0,
"step": 596
},
{
"entropy": 0.5390328615903854,
"epoch": 2.22803738317757,
"grad_norm": 0.04130427911877632,
"learning_rate": 0.0002,
"loss": 0.5381530523300171,
"mean_token_accuracy": 0.7850432395935059,
"num_tokens": 9723670.0,
"step": 597
},
{
"entropy": 0.5145308524370193,
"epoch": 2.2317757009345796,
"grad_norm": 0.04054151102900505,
"learning_rate": 0.0002,
"loss": 0.5153539776802063,
"mean_token_accuracy": 0.7911680340766907,
"num_tokens": 9740111.0,
"step": 598
},
{
"entropy": 0.5264055132865906,
"epoch": 2.235514018691589,
"grad_norm": 0.04768845811486244,
"learning_rate": 0.0002,
"loss": 0.5321245193481445,
"mean_token_accuracy": 0.7862783521413803,
"num_tokens": 9756445.0,
"step": 599
},
{
"entropy": 0.5161085873842239,
"epoch": 2.2392523364485983,
"grad_norm": 0.047890279442071915,
"learning_rate": 0.0002,
"loss": 0.5329167246818542,
"mean_token_accuracy": 0.7836614698171616,
"num_tokens": 9772513.0,
"step": 600
},
{
"entropy": 0.5542461574077606,
"epoch": 2.2429906542056073,
"grad_norm": 0.04093446582555771,
"learning_rate": 0.0002,
"loss": 0.555320680141449,
"mean_token_accuracy": 0.7749381363391876,
"num_tokens": 9789085.0,
"step": 601
},
{
"entropy": 0.5521011054515839,
"epoch": 2.2467289719626167,
"grad_norm": 0.0422159768640995,
"learning_rate": 0.0002,
"loss": 0.5415031313896179,
"mean_token_accuracy": 0.7801210135221481,
"num_tokens": 9805542.0,
"step": 602
},
{
"entropy": 0.5508425533771515,
"epoch": 2.250467289719626,
"grad_norm": 0.04688411206007004,
"learning_rate": 0.0002,
"loss": 0.5387436151504517,
"mean_token_accuracy": 0.7821325659751892,
"num_tokens": 9821923.0,
"step": 603
},
{
"entropy": 0.5507242232561111,
"epoch": 2.2542056074766355,
"grad_norm": 0.035407017916440964,
"learning_rate": 0.0002,
"loss": 0.5444649457931519,
"mean_token_accuracy": 0.7809951901435852,
"num_tokens": 9838298.0,
"step": 604
},
{
"entropy": 0.5216517895460129,
"epoch": 2.257943925233645,
"grad_norm": 0.041920073330402374,
"learning_rate": 0.0002,
"loss": 0.5264837741851807,
"mean_token_accuracy": 0.7897377163171768,
"num_tokens": 9854659.0,
"step": 605
},
{
"entropy": 0.5258049964904785,
"epoch": 2.2616822429906542,
"grad_norm": 0.0534173846244812,
"learning_rate": 0.0002,
"loss": 0.5415172576904297,
"mean_token_accuracy": 0.7817163467407227,
"num_tokens": 9870877.0,
"step": 606
},
{
"entropy": 0.5240575075149536,
"epoch": 2.2654205607476636,
"grad_norm": 0.03395333141088486,
"learning_rate": 0.0002,
"loss": 0.5256165862083435,
"mean_token_accuracy": 0.7837403416633606,
"num_tokens": 9887224.0,
"step": 607
},
{
"entropy": 0.5454617738723755,
"epoch": 2.269158878504673,
"grad_norm": 0.034148454666137695,
"learning_rate": 0.0002,
"loss": 0.5424824953079224,
"mean_token_accuracy": 0.7791529148817062,
"num_tokens": 9903786.0,
"step": 608
},
{
"entropy": 0.5350487977266312,
"epoch": 2.2728971962616824,
"grad_norm": 0.042522136121988297,
"learning_rate": 0.0002,
"loss": 0.5272009372711182,
"mean_token_accuracy": 0.7874994874000549,
"num_tokens": 9920053.0,
"step": 609
},
{
"entropy": 0.5338039100170135,
"epoch": 2.2766355140186914,
"grad_norm": 0.036921191960573196,
"learning_rate": 0.0002,
"loss": 0.5227792859077454,
"mean_token_accuracy": 0.7891070544719696,
"num_tokens": 9936211.0,
"step": 610
},
{
"entropy": 0.5317139476537704,
"epoch": 2.2803738317757007,
"grad_norm": 0.038269490003585815,
"learning_rate": 0.0002,
"loss": 0.5253998637199402,
"mean_token_accuracy": 0.7870776653289795,
"num_tokens": 9952725.0,
"step": 611
},
{
"entropy": 0.5196784734725952,
"epoch": 2.28411214953271,
"grad_norm": 0.03972024843096733,
"learning_rate": 0.0002,
"loss": 0.5251049995422363,
"mean_token_accuracy": 0.7839716672897339,
"num_tokens": 9969316.0,
"step": 612
},
{
"entropy": 0.5095352083444595,
"epoch": 2.2878504672897195,
"grad_norm": 0.0507940798997879,
"learning_rate": 0.0002,
"loss": 0.5290789008140564,
"mean_token_accuracy": 0.7861248552799225,
"num_tokens": 9985447.0,
"step": 613
},
{
"entropy": 0.5270750820636749,
"epoch": 2.291588785046729,
"grad_norm": 0.04321181774139404,
"learning_rate": 0.0002,
"loss": 0.5311838984489441,
"mean_token_accuracy": 0.7838535755872726,
"num_tokens": 10001725.0,
"step": 614
},
{
"entropy": 0.5379711389541626,
"epoch": 2.2953271028037383,
"grad_norm": 0.040656980127096176,
"learning_rate": 0.0002,
"loss": 0.5385247468948364,
"mean_token_accuracy": 0.7803602814674377,
"num_tokens": 10018134.0,
"step": 615
},
{
"entropy": 0.5364449620246887,
"epoch": 2.2990654205607477,
"grad_norm": 0.044270358979701996,
"learning_rate": 0.0002,
"loss": 0.5303220748901367,
"mean_token_accuracy": 0.7875775545835495,
"num_tokens": 10034256.0,
"step": 616
},
{
"entropy": 0.5223758369684219,
"epoch": 2.302803738317757,
"grad_norm": 0.04040619730949402,
"learning_rate": 0.0002,
"loss": 0.5194275379180908,
"mean_token_accuracy": 0.7908173054456711,
"num_tokens": 10050260.0,
"step": 617
},
{
"entropy": 0.5754473656415939,
"epoch": 2.3065420560747665,
"grad_norm": 0.0413733534514904,
"learning_rate": 0.0002,
"loss": 0.5673513412475586,
"mean_token_accuracy": 0.7693175226449966,
"num_tokens": 10066439.0,
"step": 618
},
{
"entropy": 0.5494302958250046,
"epoch": 2.310280373831776,
"grad_norm": 0.04788622632622719,
"learning_rate": 0.0002,
"loss": 0.5560729503631592,
"mean_token_accuracy": 0.7737975120544434,
"num_tokens": 10082592.0,
"step": 619
},
{
"entropy": 0.5400004386901855,
"epoch": 2.3140186915887853,
"grad_norm": 0.04467733949422836,
"learning_rate": 0.0002,
"loss": 0.5475805997848511,
"mean_token_accuracy": 0.7767456918954849,
"num_tokens": 10098902.0,
"step": 620
},
{
"entropy": 0.5090039819478989,
"epoch": 2.317757009345794,
"grad_norm": 0.04413570463657379,
"learning_rate": 0.0002,
"loss": 0.5152875781059265,
"mean_token_accuracy": 0.792495995759964,
"num_tokens": 10115273.0,
"step": 621
},
{
"entropy": 0.5372920483350754,
"epoch": 2.3214953271028036,
"grad_norm": 0.037302058190107346,
"learning_rate": 0.0002,
"loss": 0.5321361422538757,
"mean_token_accuracy": 0.7862480282783508,
"num_tokens": 10131501.0,
"step": 622
},
{
"entropy": 0.5543005019426346,
"epoch": 2.325233644859813,
"grad_norm": 0.03829365596175194,
"learning_rate": 0.0002,
"loss": 0.5508820414543152,
"mean_token_accuracy": 0.7745321840047836,
"num_tokens": 10147998.0,
"step": 623
},
{
"entropy": 0.5153163969516754,
"epoch": 2.3289719626168224,
"grad_norm": 0.045321445912122726,
"learning_rate": 0.0002,
"loss": 0.5118069052696228,
"mean_token_accuracy": 0.7935506701469421,
"num_tokens": 10164126.0,
"step": 624
},
{
"entropy": 0.5008471608161926,
"epoch": 2.3327102803738318,
"grad_norm": 0.04449000954627991,
"learning_rate": 0.0002,
"loss": 0.5082967877388,
"mean_token_accuracy": 0.7942900061607361,
"num_tokens": 10180274.0,
"step": 625
},
{
"entropy": 0.532206118106842,
"epoch": 2.336448598130841,
"grad_norm": 0.05191594734787941,
"learning_rate": 0.0002,
"loss": 0.5367388129234314,
"mean_token_accuracy": 0.7808051854372025,
"num_tokens": 10196609.0,
"step": 626
},
{
"entropy": 0.5258989185094833,
"epoch": 2.3401869158878505,
"grad_norm": 0.044721271842718124,
"learning_rate": 0.0002,
"loss": 0.5331224203109741,
"mean_token_accuracy": 0.7829412668943405,
"num_tokens": 10212895.0,
"step": 627
},
{
"entropy": 0.5370120704174042,
"epoch": 2.34392523364486,
"grad_norm": 0.041769906878471375,
"learning_rate": 0.0002,
"loss": 0.5412429571151733,
"mean_token_accuracy": 0.7827376574277878,
"num_tokens": 10229237.0,
"step": 628
},
{
"entropy": 0.5400294661521912,
"epoch": 2.3476635514018693,
"grad_norm": 0.040269553661346436,
"learning_rate": 0.0002,
"loss": 0.5357171893119812,
"mean_token_accuracy": 0.7816246598958969,
"num_tokens": 10245453.0,
"step": 629
},
{
"entropy": 0.5325844436883926,
"epoch": 2.3514018691588783,
"grad_norm": 0.04499928280711174,
"learning_rate": 0.0002,
"loss": 0.5283193588256836,
"mean_token_accuracy": 0.7859142124652863,
"num_tokens": 10261777.0,
"step": 630
},
{
"entropy": 0.5282296687364578,
"epoch": 2.3551401869158877,
"grad_norm": 0.04336896538734436,
"learning_rate": 0.0002,
"loss": 0.5254157781600952,
"mean_token_accuracy": 0.789379209280014,
"num_tokens": 10278007.0,
"step": 631
},
{
"entropy": 0.5453646928071976,
"epoch": 2.358878504672897,
"grad_norm": 0.05249177664518356,
"learning_rate": 0.0002,
"loss": 0.5468531250953674,
"mean_token_accuracy": 0.7771991342306137,
"num_tokens": 10294331.0,
"step": 632
},
{
"entropy": 0.543931856751442,
"epoch": 2.3626168224299064,
"grad_norm": 0.037500377744436264,
"learning_rate": 0.0002,
"loss": 0.5477216839790344,
"mean_token_accuracy": 0.7776368409395218,
"num_tokens": 10310976.0,
"step": 633
},
{
"entropy": 0.5300342440605164,
"epoch": 2.366355140186916,
"grad_norm": 0.04039130359888077,
"learning_rate": 0.0002,
"loss": 0.5305655002593994,
"mean_token_accuracy": 0.7832176089286804,
"num_tokens": 10327256.0,
"step": 634
},
{
"entropy": 0.5378967821598053,
"epoch": 2.3700934579439252,
"grad_norm": 0.04444447159767151,
"learning_rate": 0.0002,
"loss": 0.5362187027931213,
"mean_token_accuracy": 0.7842839509248734,
"num_tokens": 10343608.0,
"step": 635
},
{
"entropy": 0.5510306656360626,
"epoch": 2.3738317757009346,
"grad_norm": 0.04542792961001396,
"learning_rate": 0.0002,
"loss": 0.5493132472038269,
"mean_token_accuracy": 0.7786229699850082,
"num_tokens": 10359923.0,
"step": 636
},
{
"entropy": 0.5210727900266647,
"epoch": 2.377570093457944,
"grad_norm": 0.043661415576934814,
"learning_rate": 0.0002,
"loss": 0.5236334800720215,
"mean_token_accuracy": 0.7890983521938324,
"num_tokens": 10376100.0,
"step": 637
},
{
"entropy": 0.5260880589485168,
"epoch": 2.3813084112149534,
"grad_norm": 0.04262132570147514,
"learning_rate": 0.0002,
"loss": 0.5248558521270752,
"mean_token_accuracy": 0.7902341783046722,
"num_tokens": 10392698.0,
"step": 638
},
{
"entropy": 0.5457091331481934,
"epoch": 2.385046728971963,
"grad_norm": 0.04899441823363304,
"learning_rate": 0.0002,
"loss": 0.5536708235740662,
"mean_token_accuracy": 0.7760955542325974,
"num_tokens": 10409076.0,
"step": 639
},
{
"entropy": 0.5321961939334869,
"epoch": 2.388785046728972,
"grad_norm": 0.045906826853752136,
"learning_rate": 0.0002,
"loss": 0.5316425561904907,
"mean_token_accuracy": 0.7848930060863495,
"num_tokens": 10425501.0,
"step": 640
},
{
"entropy": 0.5476334244012833,
"epoch": 2.392523364485981,
"grad_norm": 0.038592927157878876,
"learning_rate": 0.0002,
"loss": 0.5469234585762024,
"mean_token_accuracy": 0.7766659259796143,
"num_tokens": 10441907.0,
"step": 641
},
{
"entropy": 0.514763131737709,
"epoch": 2.3962616822429905,
"grad_norm": 0.04247188940644264,
"learning_rate": 0.0002,
"loss": 0.5191242098808289,
"mean_token_accuracy": 0.7888349145650864,
"num_tokens": 10458019.0,
"step": 642
},
{
"entropy": 0.5377763360738754,
"epoch": 2.4,
"grad_norm": 0.037420280277729034,
"learning_rate": 0.0002,
"loss": 0.5363115072250366,
"mean_token_accuracy": 0.7803380340337753,
"num_tokens": 10474412.0,
"step": 643
},
{
"entropy": 0.5383724719285965,
"epoch": 2.4037383177570093,
"grad_norm": 0.038523126393556595,
"learning_rate": 0.0002,
"loss": 0.5415539145469666,
"mean_token_accuracy": 0.7787618041038513,
"num_tokens": 10490995.0,
"step": 644
},
{
"entropy": 0.5374136418104172,
"epoch": 2.4074766355140187,
"grad_norm": 0.03964264318346977,
"learning_rate": 0.0002,
"loss": 0.5468027591705322,
"mean_token_accuracy": 0.779059037566185,
"num_tokens": 10507482.0,
"step": 645
},
{
"entropy": 0.5512133836746216,
"epoch": 2.411214953271028,
"grad_norm": 0.0391349270939827,
"learning_rate": 0.0002,
"loss": 0.5508245825767517,
"mean_token_accuracy": 0.7754583358764648,
"num_tokens": 10523993.0,
"step": 646
},
{
"entropy": 0.5193808674812317,
"epoch": 2.4149532710280375,
"grad_norm": 0.03556473180651665,
"learning_rate": 0.0002,
"loss": 0.5196793675422668,
"mean_token_accuracy": 0.78975510597229,
"num_tokens": 10540005.0,
"step": 647
},
{
"entropy": 0.5471558570861816,
"epoch": 2.418691588785047,
"grad_norm": 0.04553184658288956,
"learning_rate": 0.0002,
"loss": 0.547728419303894,
"mean_token_accuracy": 0.7780675292015076,
"num_tokens": 10555891.0,
"step": 648
},
{
"entropy": 0.519458457827568,
"epoch": 2.4224299065420563,
"grad_norm": 0.045790717005729675,
"learning_rate": 0.0002,
"loss": 0.5232809782028198,
"mean_token_accuracy": 0.7882662564516068,
"num_tokens": 10572109.0,
"step": 649
},
{
"entropy": 0.5270252674818039,
"epoch": 2.426168224299065,
"grad_norm": 0.04227881506085396,
"learning_rate": 0.0002,
"loss": 0.5288085341453552,
"mean_token_accuracy": 0.7866526395082474,
"num_tokens": 10588192.0,
"step": 650
},
{
"entropy": 0.548214852809906,
"epoch": 2.4299065420560746,
"grad_norm": 0.04126811400055885,
"learning_rate": 0.0002,
"loss": 0.5440689325332642,
"mean_token_accuracy": 0.779522180557251,
"num_tokens": 10604498.0,
"step": 651
},
{
"entropy": 0.5452295988798141,
"epoch": 2.433644859813084,
"grad_norm": 0.044819604605436325,
"learning_rate": 0.0002,
"loss": 0.547234833240509,
"mean_token_accuracy": 0.7796365767717361,
"num_tokens": 10620949.0,
"step": 652
},
{
"entropy": 0.5525990724563599,
"epoch": 2.4373831775700934,
"grad_norm": 0.042418453842401505,
"learning_rate": 0.0002,
"loss": 0.5493718385696411,
"mean_token_accuracy": 0.7783072590827942,
"num_tokens": 10637398.0,
"step": 653
},
{
"entropy": 0.5338578671216965,
"epoch": 2.4411214953271028,
"grad_norm": 0.048241496086120605,
"learning_rate": 0.0002,
"loss": 0.5348434448242188,
"mean_token_accuracy": 0.7853177338838577,
"num_tokens": 10653827.0,
"step": 654
},
{
"entropy": 0.5247549116611481,
"epoch": 2.444859813084112,
"grad_norm": 0.03876890614628792,
"learning_rate": 0.0002,
"loss": 0.5283288359642029,
"mean_token_accuracy": 0.7865240424871445,
"num_tokens": 10670227.0,
"step": 655
},
{
"entropy": 0.5525484532117844,
"epoch": 2.4485981308411215,
"grad_norm": 0.04079402610659599,
"learning_rate": 0.0002,
"loss": 0.5510199069976807,
"mean_token_accuracy": 0.7765209227800369,
"num_tokens": 10686514.0,
"step": 656
},
{
"entropy": 0.5248308256268501,
"epoch": 2.452336448598131,
"grad_norm": 0.03220357000827789,
"learning_rate": 0.0002,
"loss": 0.5197701454162598,
"mean_token_accuracy": 0.7878830432891846,
"num_tokens": 10702613.0,
"step": 657
},
{
"entropy": 0.5264022424817085,
"epoch": 2.4560747663551403,
"grad_norm": 0.038926877081394196,
"learning_rate": 0.0002,
"loss": 0.5227438807487488,
"mean_token_accuracy": 0.7853628695011139,
"num_tokens": 10718690.0,
"step": 658
},
{
"entropy": 0.5430135428905487,
"epoch": 2.4598130841121497,
"grad_norm": 0.04270581528544426,
"learning_rate": 0.0002,
"loss": 0.5455408096313477,
"mean_token_accuracy": 0.7791119664907455,
"num_tokens": 10735135.0,
"step": 659
},
{
"entropy": 0.5284547656774521,
"epoch": 2.463551401869159,
"grad_norm": 0.04039589315652847,
"learning_rate": 0.0002,
"loss": 0.5309383273124695,
"mean_token_accuracy": 0.784732460975647,
"num_tokens": 10751298.0,
"step": 660
},
{
"entropy": 0.5267135500907898,
"epoch": 2.467289719626168,
"grad_norm": 0.042588524520397186,
"learning_rate": 0.0002,
"loss": 0.5272895097732544,
"mean_token_accuracy": 0.7885420620441437,
"num_tokens": 10767947.0,
"step": 661
},
{
"entropy": 0.5294100195169449,
"epoch": 2.4710280373831774,
"grad_norm": 0.04541191831231117,
"learning_rate": 0.0002,
"loss": 0.5415511727333069,
"mean_token_accuracy": 0.7802952826023102,
"num_tokens": 10784155.0,
"step": 662
},
{
"entropy": 0.5230477377772331,
"epoch": 2.474766355140187,
"grad_norm": 0.04615366831421852,
"learning_rate": 0.0002,
"loss": 0.5295774936676025,
"mean_token_accuracy": 0.7873392999172211,
"num_tokens": 10800552.0,
"step": 663
},
{
"entropy": 0.5188637897372246,
"epoch": 2.4785046728971962,
"grad_norm": 0.03992808610200882,
"learning_rate": 0.0002,
"loss": 0.5195883512496948,
"mean_token_accuracy": 0.7883334010839462,
"num_tokens": 10816926.0,
"step": 664
},
{
"entropy": 0.5323937982320786,
"epoch": 2.4822429906542056,
"grad_norm": 0.04497828707098961,
"learning_rate": 0.0002,
"loss": 0.5278034210205078,
"mean_token_accuracy": 0.7848539501428604,
"num_tokens": 10833159.0,
"step": 665
},
{
"entropy": 0.5480016022920609,
"epoch": 2.485981308411215,
"grad_norm": 0.0394604429602623,
"learning_rate": 0.0002,
"loss": 0.5437833070755005,
"mean_token_accuracy": 0.7807918637990952,
"num_tokens": 10849417.0,
"step": 666
},
{
"entropy": 0.5170062035322189,
"epoch": 2.4897196261682244,
"grad_norm": 0.041445329785346985,
"learning_rate": 0.0002,
"loss": 0.517329216003418,
"mean_token_accuracy": 0.7887666076421738,
"num_tokens": 10865715.0,
"step": 667
},
{
"entropy": 0.5371522009372711,
"epoch": 2.493457943925234,
"grad_norm": 0.042152535170316696,
"learning_rate": 0.0002,
"loss": 0.5461167693138123,
"mean_token_accuracy": 0.7759047448635101,
"num_tokens": 10881891.0,
"step": 668
},
{
"entropy": 0.522216372191906,
"epoch": 2.497196261682243,
"grad_norm": 0.04944324120879173,
"learning_rate": 0.0002,
"loss": 0.5293608903884888,
"mean_token_accuracy": 0.7865939140319824,
"num_tokens": 10898086.0,
"step": 669
},
{
"entropy": 0.5419133603572845,
"epoch": 2.500934579439252,
"grad_norm": 0.03869049996137619,
"learning_rate": 0.0002,
"loss": 0.5435135364532471,
"mean_token_accuracy": 0.7788238078355789,
"num_tokens": 10914630.0,
"step": 670
},
{
"entropy": 0.543552428483963,
"epoch": 2.5046728971962615,
"grad_norm": 0.040104418992996216,
"learning_rate": 0.0002,
"loss": 0.5451544523239136,
"mean_token_accuracy": 0.7762735784053802,
"num_tokens": 10931142.0,
"step": 671
},
{
"entropy": 0.5488818436861038,
"epoch": 2.508411214953271,
"grad_norm": 0.03650939092040062,
"learning_rate": 0.0002,
"loss": 0.5461534857749939,
"mean_token_accuracy": 0.7810324132442474,
"num_tokens": 10947432.0,
"step": 672
},
{
"entropy": 0.5514579713344574,
"epoch": 2.5121495327102803,
"grad_norm": 0.035640496760606766,
"learning_rate": 0.0002,
"loss": 0.5461341142654419,
"mean_token_accuracy": 0.7758427411317825,
"num_tokens": 10963793.0,
"step": 673
},
{
"entropy": 0.5298633724451065,
"epoch": 2.5158878504672897,
"grad_norm": 0.036869630217552185,
"learning_rate": 0.0002,
"loss": 0.5271415710449219,
"mean_token_accuracy": 0.7874128669500351,
"num_tokens": 10980238.0,
"step": 674
},
{
"entropy": 0.5178606957197189,
"epoch": 2.519626168224299,
"grad_norm": 0.04496290162205696,
"learning_rate": 0.0002,
"loss": 0.5193417072296143,
"mean_token_accuracy": 0.7885989248752594,
"num_tokens": 10996365.0,
"step": 675
},
{
"entropy": 0.5270267352461815,
"epoch": 2.5233644859813085,
"grad_norm": 0.04544811695814133,
"learning_rate": 0.0002,
"loss": 0.5387653112411499,
"mean_token_accuracy": 0.7800068855285645,
"num_tokens": 11012575.0,
"step": 676
},
{
"entropy": 0.527735561132431,
"epoch": 2.527102803738318,
"grad_norm": 0.04031702131032944,
"learning_rate": 0.0002,
"loss": 0.5367462635040283,
"mean_token_accuracy": 0.7821540981531143,
"num_tokens": 11028942.0,
"step": 677
},
{
"entropy": 0.5479142069816589,
"epoch": 2.5308411214953273,
"grad_norm": 0.042728912085294724,
"learning_rate": 0.0002,
"loss": 0.5432093739509583,
"mean_token_accuracy": 0.7799795567989349,
"num_tokens": 11045296.0,
"step": 678
},
{
"entropy": 0.5360302478075027,
"epoch": 2.5345794392523366,
"grad_norm": 0.040872231125831604,
"learning_rate": 0.0002,
"loss": 0.5265986323356628,
"mean_token_accuracy": 0.7887827455997467,
"num_tokens": 11061450.0,
"step": 679
},
{
"entropy": 0.5468751043081284,
"epoch": 2.538317757009346,
"grad_norm": 0.0408024825155735,
"learning_rate": 0.0002,
"loss": 0.5442636609077454,
"mean_token_accuracy": 0.7790944874286652,
"num_tokens": 11077540.0,
"step": 680
},
{
"entropy": 0.530633345246315,
"epoch": 2.542056074766355,
"grad_norm": 0.04209808632731438,
"learning_rate": 0.0002,
"loss": 0.5363141894340515,
"mean_token_accuracy": 0.7819496542215347,
"num_tokens": 11093632.0,
"step": 681
},
{
"entropy": 0.5098425000905991,
"epoch": 2.5457943925233644,
"grad_norm": 0.04276811331510544,
"learning_rate": 0.0002,
"loss": 0.5222542881965637,
"mean_token_accuracy": 0.7871226519346237,
"num_tokens": 11110142.0,
"step": 682
},
{
"entropy": 0.5203486457467079,
"epoch": 2.5495327102803738,
"grad_norm": 0.04667636379599571,
"learning_rate": 0.0002,
"loss": 0.52687668800354,
"mean_token_accuracy": 0.7876535356044769,
"num_tokens": 11126405.0,
"step": 683
},
{
"entropy": 0.5424248725175858,
"epoch": 2.553271028037383,
"grad_norm": 0.03960704430937767,
"learning_rate": 0.0002,
"loss": 0.5351195335388184,
"mean_token_accuracy": 0.7820920497179031,
"num_tokens": 11142681.0,
"step": 684
},
{
"entropy": 0.5479930490255356,
"epoch": 2.5570093457943925,
"grad_norm": 0.03865355625748634,
"learning_rate": 0.0002,
"loss": 0.5381141901016235,
"mean_token_accuracy": 0.7842580229043961,
"num_tokens": 11158981.0,
"step": 685
},
{
"entropy": 0.5378328114748001,
"epoch": 2.560747663551402,
"grad_norm": 0.0406392477452755,
"learning_rate": 0.0002,
"loss": 0.5395403504371643,
"mean_token_accuracy": 0.7812999784946442,
"num_tokens": 11175185.0,
"step": 686
},
{
"entropy": 0.5591647922992706,
"epoch": 2.5644859813084113,
"grad_norm": 0.042679473757743835,
"learning_rate": 0.0002,
"loss": 0.5618141889572144,
"mean_token_accuracy": 0.7730479836463928,
"num_tokens": 11191516.0,
"step": 687
},
{
"entropy": 0.540540523827076,
"epoch": 2.5682242990654207,
"grad_norm": 0.0401788055896759,
"learning_rate": 0.0002,
"loss": 0.5431095957756042,
"mean_token_accuracy": 0.7800974696874619,
"num_tokens": 11207897.0,
"step": 688
},
{
"entropy": 0.5273384600877762,
"epoch": 2.5719626168224297,
"grad_norm": 0.04009004309773445,
"learning_rate": 0.0002,
"loss": 0.5236154794692993,
"mean_token_accuracy": 0.7862724959850311,
"num_tokens": 11224233.0,
"step": 689
},
{
"entropy": 0.5341546684503555,
"epoch": 2.575700934579439,
"grad_norm": 0.045469239354133606,
"learning_rate": 0.0002,
"loss": 0.5359405875205994,
"mean_token_accuracy": 0.7828920185565948,
"num_tokens": 11240583.0,
"step": 690
},
{
"entropy": 0.516716443002224,
"epoch": 2.5794392523364484,
"grad_norm": 0.03841989487409592,
"learning_rate": 0.0002,
"loss": 0.5178863406181335,
"mean_token_accuracy": 0.7926649451255798,
"num_tokens": 11256814.0,
"step": 691
},
{
"entropy": 0.5300464928150177,
"epoch": 2.583177570093458,
"grad_norm": 0.043383657932281494,
"learning_rate": 0.0002,
"loss": 0.534642219543457,
"mean_token_accuracy": 0.7844998836517334,
"num_tokens": 11273092.0,
"step": 692
},
{
"entropy": 0.5270805209875107,
"epoch": 2.586915887850467,
"grad_norm": 0.042948167771101,
"learning_rate": 0.0002,
"loss": 0.5318405628204346,
"mean_token_accuracy": 0.7814630717039108,
"num_tokens": 11289382.0,
"step": 693
},
{
"entropy": 0.5576307624578476,
"epoch": 2.5906542056074766,
"grad_norm": 0.04289550706744194,
"learning_rate": 0.0002,
"loss": 0.5595361590385437,
"mean_token_accuracy": 0.77448670566082,
"num_tokens": 11305822.0,
"step": 694
},
{
"entropy": 0.5350489318370819,
"epoch": 2.594392523364486,
"grad_norm": 0.036010973155498505,
"learning_rate": 0.0002,
"loss": 0.5320281982421875,
"mean_token_accuracy": 0.7841717451810837,
"num_tokens": 11322116.0,
"step": 695
},
{
"entropy": 0.5389258116483688,
"epoch": 2.5981308411214954,
"grad_norm": 0.036538656800985336,
"learning_rate": 0.0002,
"loss": 0.5332745313644409,
"mean_token_accuracy": 0.7836548089981079,
"num_tokens": 11338486.0,
"step": 696
},
{
"entropy": 0.5357422530651093,
"epoch": 2.601869158878505,
"grad_norm": 0.03977203741669655,
"learning_rate": 0.0002,
"loss": 0.5403972864151001,
"mean_token_accuracy": 0.7783884555101395,
"num_tokens": 11355126.0,
"step": 697
},
{
"entropy": 0.5224239528179169,
"epoch": 2.605607476635514,
"grad_norm": 0.03854282945394516,
"learning_rate": 0.0002,
"loss": 0.5209836363792419,
"mean_token_accuracy": 0.7890230715274811,
"num_tokens": 11371642.0,
"step": 698
},
{
"entropy": 0.527114674448967,
"epoch": 2.6093457943925236,
"grad_norm": 0.03806879743933678,
"learning_rate": 0.0002,
"loss": 0.5328760743141174,
"mean_token_accuracy": 0.7834767252206802,
"num_tokens": 11388018.0,
"step": 699
},
{
"entropy": 0.5207114219665527,
"epoch": 2.613084112149533,
"grad_norm": 0.04797474667429924,
"learning_rate": 0.0002,
"loss": 0.5281696915626526,
"mean_token_accuracy": 0.7842787057161331,
"num_tokens": 11404304.0,
"step": 700
},
{
"entropy": 0.5329904109239578,
"epoch": 2.616822429906542,
"grad_norm": 0.04143727570772171,
"learning_rate": 0.0002,
"loss": 0.5371139645576477,
"mean_token_accuracy": 0.7831498682498932,
"num_tokens": 11420561.0,
"step": 701
},
{
"entropy": 0.5422161221504211,
"epoch": 2.6205607476635513,
"grad_norm": 0.04683515056967735,
"learning_rate": 0.0002,
"loss": 0.5436529517173767,
"mean_token_accuracy": 0.7796959728002548,
"num_tokens": 11436820.0,
"step": 702
},
{
"entropy": 0.5309348404407501,
"epoch": 2.6242990654205607,
"grad_norm": 0.036559656262397766,
"learning_rate": 0.0002,
"loss": 0.5223227143287659,
"mean_token_accuracy": 0.7849199175834656,
"num_tokens": 11453134.0,
"step": 703
},
{
"entropy": 0.5515079498291016,
"epoch": 2.62803738317757,
"grad_norm": 0.047568727284669876,
"learning_rate": 0.0002,
"loss": 0.5509875416755676,
"mean_token_accuracy": 0.7774451673030853,
"num_tokens": 11469442.0,
"step": 704
},
{
"entropy": 0.5654275268316269,
"epoch": 2.6317757009345795,
"grad_norm": 0.03854409605264664,
"learning_rate": 0.0002,
"loss": 0.559022068977356,
"mean_token_accuracy": 0.7747441530227661,
"num_tokens": 11485880.0,
"step": 705
},
{
"entropy": 0.5369984805583954,
"epoch": 2.635514018691589,
"grad_norm": 0.04869009181857109,
"learning_rate": 0.0002,
"loss": 0.5361051559448242,
"mean_token_accuracy": 0.780804455280304,
"num_tokens": 11502359.0,
"step": 706
},
{
"entropy": 0.542375922203064,
"epoch": 2.6392523364485982,
"grad_norm": 0.045840587466955185,
"learning_rate": 0.0002,
"loss": 0.5502850413322449,
"mean_token_accuracy": 0.7759635299444199,
"num_tokens": 11518813.0,
"step": 707
},
{
"entropy": 0.5237139612436295,
"epoch": 2.6429906542056076,
"grad_norm": 0.043406110256910324,
"learning_rate": 0.0002,
"loss": 0.5281059741973877,
"mean_token_accuracy": 0.7859614938497543,
"num_tokens": 11535188.0,
"step": 708
},
{
"entropy": 0.5367631316184998,
"epoch": 2.6467289719626166,
"grad_norm": 0.04024430736899376,
"learning_rate": 0.0002,
"loss": 0.5387470126152039,
"mean_token_accuracy": 0.7812274694442749,
"num_tokens": 11551645.0,
"step": 709
},
{
"entropy": 0.5330280810594559,
"epoch": 2.650467289719626,
"grad_norm": 0.0389426052570343,
"learning_rate": 0.0002,
"loss": 0.5361229181289673,
"mean_token_accuracy": 0.7837622314691544,
"num_tokens": 11567892.0,
"step": 710
},
{
"entropy": 0.5259372144937515,
"epoch": 2.6542056074766354,
"grad_norm": 0.03997652605175972,
"learning_rate": 0.0002,
"loss": 0.5267660617828369,
"mean_token_accuracy": 0.7850897163152695,
"num_tokens": 11584153.0,
"step": 711
},
{
"entropy": 0.5390958487987518,
"epoch": 2.6579439252336448,
"grad_norm": 0.04180564358830452,
"learning_rate": 0.0002,
"loss": 0.5372406244277954,
"mean_token_accuracy": 0.7838725447654724,
"num_tokens": 11600597.0,
"step": 712
},
{
"entropy": 0.5279987677931786,
"epoch": 2.661682242990654,
"grad_norm": 0.03591061756014824,
"learning_rate": 0.0002,
"loss": 0.5308532118797302,
"mean_token_accuracy": 0.785730242729187,
"num_tokens": 11616881.0,
"step": 713
},
{
"entropy": 0.5563876032829285,
"epoch": 2.6654205607476635,
"grad_norm": 0.03892669454216957,
"learning_rate": 0.0002,
"loss": 0.5556321144104004,
"mean_token_accuracy": 0.7758439630270004,
"num_tokens": 11633329.0,
"step": 714
},
{
"entropy": 0.5373513847589493,
"epoch": 2.669158878504673,
"grad_norm": 0.03863142430782318,
"learning_rate": 0.0002,
"loss": 0.5352209806442261,
"mean_token_accuracy": 0.7836543023586273,
"num_tokens": 11649751.0,
"step": 715
},
{
"entropy": 0.5123810023069382,
"epoch": 2.6728971962616823,
"grad_norm": 0.04038078337907791,
"learning_rate": 0.0002,
"loss": 0.5158439874649048,
"mean_token_accuracy": 0.7905206978321075,
"num_tokens": 11665928.0,
"step": 716
},
{
"entropy": 0.5479727983474731,
"epoch": 2.6766355140186917,
"grad_norm": 0.04204852879047394,
"learning_rate": 0.0002,
"loss": 0.5506036281585693,
"mean_token_accuracy": 0.7781369537115097,
"num_tokens": 11682349.0,
"step": 717
},
{
"entropy": 0.5410658866167068,
"epoch": 2.680373831775701,
"grad_norm": 0.04252674803137779,
"learning_rate": 0.0002,
"loss": 0.5433157086372375,
"mean_token_accuracy": 0.776948869228363,
"num_tokens": 11698941.0,
"step": 718
},
{
"entropy": 0.5443103611469269,
"epoch": 2.6841121495327105,
"grad_norm": 0.044883646070957184,
"learning_rate": 0.0002,
"loss": 0.5470229983329773,
"mean_token_accuracy": 0.7803091257810593,
"num_tokens": 11715434.0,
"step": 719
},
{
"entropy": 0.5390113294124603,
"epoch": 2.68785046728972,
"grad_norm": 0.04012865573167801,
"learning_rate": 0.0002,
"loss": 0.5320149660110474,
"mean_token_accuracy": 0.7860948741436005,
"num_tokens": 11731697.0,
"step": 720
},
{
"entropy": 0.5281476825475693,
"epoch": 2.691588785046729,
"grad_norm": 0.04816235229372978,
"learning_rate": 0.0002,
"loss": 0.5312087535858154,
"mean_token_accuracy": 0.7858725935220718,
"num_tokens": 11747788.0,
"step": 721
},
{
"entropy": 0.5142519026994705,
"epoch": 2.695327102803738,
"grad_norm": 0.0394207127392292,
"learning_rate": 0.0002,
"loss": 0.5175022482872009,
"mean_token_accuracy": 0.7914264351129532,
"num_tokens": 11763802.0,
"step": 722
},
{
"entropy": 0.5183316618204117,
"epoch": 2.6990654205607476,
"grad_norm": 0.04731175675988197,
"learning_rate": 0.0002,
"loss": 0.5275416374206543,
"mean_token_accuracy": 0.7866149395704269,
"num_tokens": 11779759.0,
"step": 723
},
{
"entropy": 0.5322978273034096,
"epoch": 2.702803738317757,
"grad_norm": 0.045594654977321625,
"learning_rate": 0.0002,
"loss": 0.5377396941184998,
"mean_token_accuracy": 0.7802564948797226,
"num_tokens": 11795656.0,
"step": 724
},
{
"entropy": 0.5265089273452759,
"epoch": 2.7065420560747664,
"grad_norm": 0.04707048460841179,
"learning_rate": 0.0002,
"loss": 0.5340720415115356,
"mean_token_accuracy": 0.7816154807806015,
"num_tokens": 11811757.0,
"step": 725
},
{
"entropy": 0.5486596673727036,
"epoch": 2.710280373831776,
"grad_norm": 0.04378875717520714,
"learning_rate": 0.0002,
"loss": 0.5447016358375549,
"mean_token_accuracy": 0.7777462303638458,
"num_tokens": 11828249.0,
"step": 726
},
{
"entropy": 0.5557577461004257,
"epoch": 2.714018691588785,
"grad_norm": 0.044526614248752594,
"learning_rate": 0.0002,
"loss": 0.5464760661125183,
"mean_token_accuracy": 0.7786324173212051,
"num_tokens": 11844645.0,
"step": 727
},
{
"entropy": 0.5483285784721375,
"epoch": 2.717757009345794,
"grad_norm": 0.05415434390306473,
"learning_rate": 0.0002,
"loss": 0.5537320971488953,
"mean_token_accuracy": 0.774675577878952,
"num_tokens": 11860972.0,
"step": 728
},
{
"entropy": 0.5311020910739899,
"epoch": 2.7214953271028035,
"grad_norm": 0.043242573738098145,
"learning_rate": 0.0002,
"loss": 0.5344421863555908,
"mean_token_accuracy": 0.7838677763938904,
"num_tokens": 11876848.0,
"step": 729
},
{
"entropy": 0.5571545660495758,
"epoch": 2.725233644859813,
"grad_norm": 0.04775959998369217,
"learning_rate": 0.0002,
"loss": 0.5543075799942017,
"mean_token_accuracy": 0.7767691016197205,
"num_tokens": 11893101.0,
"step": 730
},
{
"entropy": 0.5632807910442352,
"epoch": 2.7289719626168223,
"grad_norm": 0.040951792150735855,
"learning_rate": 0.0002,
"loss": 0.556804895401001,
"mean_token_accuracy": 0.7738458663225174,
"num_tokens": 11909248.0,
"step": 731
},
{
"entropy": 0.5437204986810684,
"epoch": 2.7327102803738317,
"grad_norm": 0.041280943900346756,
"learning_rate": 0.0002,
"loss": 0.5405519604682922,
"mean_token_accuracy": 0.7808393985033035,
"num_tokens": 11925644.0,
"step": 732
},
{
"entropy": 0.5410651564598083,
"epoch": 2.736448598130841,
"grad_norm": 0.04410838708281517,
"learning_rate": 0.0002,
"loss": 0.5487910509109497,
"mean_token_accuracy": 0.7771375328302383,
"num_tokens": 11941579.0,
"step": 733
},
{
"entropy": 0.543538823723793,
"epoch": 2.7401869158878505,
"grad_norm": 0.04985618218779564,
"learning_rate": 0.0002,
"loss": 0.5518176555633545,
"mean_token_accuracy": 0.775468647480011,
"num_tokens": 11957981.0,
"step": 734
},
{
"entropy": 0.5253164023160934,
"epoch": 2.74392523364486,
"grad_norm": 0.04087154567241669,
"learning_rate": 0.0002,
"loss": 0.5267685651779175,
"mean_token_accuracy": 0.7876032888889313,
"num_tokens": 11974282.0,
"step": 735
},
{
"entropy": 0.5454862713813782,
"epoch": 2.7476635514018692,
"grad_norm": 0.04045165702700615,
"learning_rate": 0.0002,
"loss": 0.5382283926010132,
"mean_token_accuracy": 0.7811629176139832,
"num_tokens": 11990945.0,
"step": 736
},
{
"entropy": 0.5417391657829285,
"epoch": 2.7514018691588786,
"grad_norm": 0.042311448603868484,
"learning_rate": 0.0002,
"loss": 0.540289044380188,
"mean_token_accuracy": 0.7793714255094528,
"num_tokens": 12007392.0,
"step": 737
},
{
"entropy": 0.5214735865592957,
"epoch": 2.755140186915888,
"grad_norm": 0.04158855974674225,
"learning_rate": 0.0002,
"loss": 0.5217651128768921,
"mean_token_accuracy": 0.7852792292833328,
"num_tokens": 12023581.0,
"step": 738
},
{
"entropy": 0.5328553915023804,
"epoch": 2.7588785046728974,
"grad_norm": 0.038325536996126175,
"learning_rate": 0.0002,
"loss": 0.5344902873039246,
"mean_token_accuracy": 0.7842058092355728,
"num_tokens": 12039885.0,
"step": 739
},
{
"entropy": 0.5496254563331604,
"epoch": 2.762616822429907,
"grad_norm": 0.04375292733311653,
"learning_rate": 0.0002,
"loss": 0.55174720287323,
"mean_token_accuracy": 0.7766779661178589,
"num_tokens": 12056371.0,
"step": 740
},
{
"entropy": 0.558516189455986,
"epoch": 2.7663551401869158,
"grad_norm": 0.049271486699581146,
"learning_rate": 0.0002,
"loss": 0.561238169670105,
"mean_token_accuracy": 0.77435702085495,
"num_tokens": 12072839.0,
"step": 741
},
{
"entropy": 0.5472046732902527,
"epoch": 2.770093457943925,
"grad_norm": 0.04255034402012825,
"learning_rate": 0.0002,
"loss": 0.5455073714256287,
"mean_token_accuracy": 0.7776911556720734,
"num_tokens": 12089121.0,
"step": 742
},
{
"entropy": 0.5307886898517609,
"epoch": 2.7738317757009345,
"grad_norm": 0.04008355364203453,
"learning_rate": 0.0002,
"loss": 0.5308167934417725,
"mean_token_accuracy": 0.785127267241478,
"num_tokens": 12105321.0,
"step": 743
},
{
"entropy": 0.5314194560050964,
"epoch": 2.777570093457944,
"grad_norm": 0.043235525488853455,
"learning_rate": 0.0002,
"loss": 0.5316693186759949,
"mean_token_accuracy": 0.7851164489984512,
"num_tokens": 12121581.0,
"step": 744
},
{
"entropy": 0.5243879109621048,
"epoch": 2.7813084112149533,
"grad_norm": 0.0358644537627697,
"learning_rate": 0.0002,
"loss": 0.5208507776260376,
"mean_token_accuracy": 0.7896229773759842,
"num_tokens": 12138064.0,
"step": 745
},
{
"entropy": 0.5349021703004837,
"epoch": 2.7850467289719627,
"grad_norm": 0.04395059868693352,
"learning_rate": 0.0002,
"loss": 0.541559100151062,
"mean_token_accuracy": 0.7818141132593155,
"num_tokens": 12154580.0,
"step": 746
},
{
"entropy": 0.5464755445718765,
"epoch": 2.788785046728972,
"grad_norm": 0.03772180154919624,
"learning_rate": 0.0002,
"loss": 0.5500795245170593,
"mean_token_accuracy": 0.7745375484228134,
"num_tokens": 12170944.0,
"step": 747
},
{
"entropy": 0.5316334664821625,
"epoch": 2.792523364485981,
"grad_norm": 0.042537569999694824,
"learning_rate": 0.0002,
"loss": 0.5385891795158386,
"mean_token_accuracy": 0.7813721299171448,
"num_tokens": 12187183.0,
"step": 748
},
{
"entropy": 0.5325866043567657,
"epoch": 2.7962616822429904,
"grad_norm": 0.03928552195429802,
"learning_rate": 0.0002,
"loss": 0.5372824668884277,
"mean_token_accuracy": 0.782025933265686,
"num_tokens": 12203656.0,
"step": 749
},
{
"entropy": 0.5230025053024292,
"epoch": 2.8,
"grad_norm": 0.045356832444667816,
"learning_rate": 0.0002,
"loss": 0.5221288204193115,
"mean_token_accuracy": 0.7879509478807449,
"num_tokens": 12220217.0,
"step": 750
},
{
"entropy": 0.5552905946969986,
"epoch": 2.803738317757009,
"grad_norm": 0.03520367294549942,
"learning_rate": 0.0002,
"loss": 0.5458053350448608,
"mean_token_accuracy": 0.7801086604595184,
"num_tokens": 12236926.0,
"step": 751
},
{
"entropy": 0.5284090638160706,
"epoch": 2.8074766355140186,
"grad_norm": 0.04301855340600014,
"learning_rate": 0.0002,
"loss": 0.5322295427322388,
"mean_token_accuracy": 0.7865041345357895,
"num_tokens": 12253231.0,
"step": 752
},
{
"entropy": 0.5464428961277008,
"epoch": 2.811214953271028,
"grad_norm": 0.04177437350153923,
"learning_rate": 0.0002,
"loss": 0.5503079295158386,
"mean_token_accuracy": 0.7759024053812027,
"num_tokens": 12269564.0,
"step": 753
},
{
"entropy": 0.5288181900978088,
"epoch": 2.8149532710280374,
"grad_norm": 0.04611227661371231,
"learning_rate": 0.0002,
"loss": 0.5422286987304688,
"mean_token_accuracy": 0.7793826460838318,
"num_tokens": 12285764.0,
"step": 754
},
{
"entropy": 0.538264587521553,
"epoch": 2.8186915887850468,
"grad_norm": 0.039094604551792145,
"learning_rate": 0.0002,
"loss": 0.5421559810638428,
"mean_token_accuracy": 0.7824651896953583,
"num_tokens": 12301975.0,
"step": 755
},
{
"entropy": 0.5448143184185028,
"epoch": 2.822429906542056,
"grad_norm": 0.03843825310468674,
"learning_rate": 0.0002,
"loss": 0.5424494743347168,
"mean_token_accuracy": 0.7786366790533066,
"num_tokens": 12318265.0,
"step": 756
},
{
"entropy": 0.5362522453069687,
"epoch": 2.8261682242990656,
"grad_norm": 0.037981439381837845,
"learning_rate": 0.0002,
"loss": 0.5347139835357666,
"mean_token_accuracy": 0.7820651233196259,
"num_tokens": 12334596.0,
"step": 757
},
{
"entropy": 0.5419719219207764,
"epoch": 2.829906542056075,
"grad_norm": 0.03768031671643257,
"learning_rate": 0.0002,
"loss": 0.540343701839447,
"mean_token_accuracy": 0.779738038778305,
"num_tokens": 12351022.0,
"step": 758
},
{
"entropy": 0.5576566010713577,
"epoch": 2.8336448598130843,
"grad_norm": 0.03845515102148056,
"learning_rate": 0.0002,
"loss": 0.556204617023468,
"mean_token_accuracy": 0.7719219624996185,
"num_tokens": 12367469.0,
"step": 759
},
{
"entropy": 0.5245185047388077,
"epoch": 2.8373831775700937,
"grad_norm": 0.04210665449500084,
"learning_rate": 0.0002,
"loss": 0.5240767598152161,
"mean_token_accuracy": 0.7867787629365921,
"num_tokens": 12383664.0,
"step": 760
},
{
"entropy": 0.5366124212741852,
"epoch": 2.8411214953271027,
"grad_norm": 0.039727386087179184,
"learning_rate": 0.0002,
"loss": 0.5391771197319031,
"mean_token_accuracy": 0.7799243628978729,
"num_tokens": 12399816.0,
"step": 761
},
{
"entropy": 0.5430543571710587,
"epoch": 2.844859813084112,
"grad_norm": 0.04284166544675827,
"learning_rate": 0.0002,
"loss": 0.555898129940033,
"mean_token_accuracy": 0.7769357264041901,
"num_tokens": 12416232.0,
"step": 762
},
{
"entropy": 0.5447599291801453,
"epoch": 2.8485981308411215,
"grad_norm": 0.04133335128426552,
"learning_rate": 0.0002,
"loss": 0.5458224415779114,
"mean_token_accuracy": 0.7791205793619156,
"num_tokens": 12432772.0,
"step": 763
},
{
"entropy": 0.5463473051786423,
"epoch": 2.852336448598131,
"grad_norm": 0.04293463006615639,
"learning_rate": 0.0002,
"loss": 0.5410310626029968,
"mean_token_accuracy": 0.7824665307998657,
"num_tokens": 12449390.0,
"step": 764
},
{
"entropy": 0.5433794260025024,
"epoch": 2.8560747663551402,
"grad_norm": 0.0383763313293457,
"learning_rate": 0.0002,
"loss": 0.5330025553703308,
"mean_token_accuracy": 0.786294624209404,
"num_tokens": 12465761.0,
"step": 765
},
{
"entropy": 0.5348140597343445,
"epoch": 2.8598130841121496,
"grad_norm": 0.038813136518001556,
"learning_rate": 0.0002,
"loss": 0.5356075167655945,
"mean_token_accuracy": 0.7799220532178879,
"num_tokens": 12481995.0,
"step": 766
},
{
"entropy": 0.5310825854539871,
"epoch": 2.863551401869159,
"grad_norm": 0.04623069986701012,
"learning_rate": 0.0002,
"loss": 0.5389203429222107,
"mean_token_accuracy": 0.7763766050338745,
"num_tokens": 12498209.0,
"step": 767
},
{
"entropy": 0.5357654541730881,
"epoch": 2.867289719626168,
"grad_norm": 0.03819035738706589,
"learning_rate": 0.0002,
"loss": 0.5394827723503113,
"mean_token_accuracy": 0.7809223681688309,
"num_tokens": 12514712.0,
"step": 768
},
{
"entropy": 0.543551579117775,
"epoch": 2.8710280373831774,
"grad_norm": 0.043649353086948395,
"learning_rate": 0.0002,
"loss": 0.5464720129966736,
"mean_token_accuracy": 0.7787970453500748,
"num_tokens": 12531249.0,
"step": 769
},
{
"entropy": 0.5389954522252083,
"epoch": 2.8747663551401867,
"grad_norm": 0.036311469972133636,
"learning_rate": 0.0002,
"loss": 0.5379980206489563,
"mean_token_accuracy": 0.7832965403795242,
"num_tokens": 12547833.0,
"step": 770
},
{
"entropy": 0.5408525764942169,
"epoch": 2.878504672897196,
"grad_norm": 0.03780903294682503,
"learning_rate": 0.0002,
"loss": 0.539055585861206,
"mean_token_accuracy": 0.7843980342149734,
"num_tokens": 12564468.0,
"step": 771
},
{
"entropy": 0.5521610230207443,
"epoch": 2.8822429906542055,
"grad_norm": 0.042727869004011154,
"learning_rate": 0.0002,
"loss": 0.5518633723258972,
"mean_token_accuracy": 0.7730461955070496,
"num_tokens": 12580822.0,
"step": 772
},
{
"entropy": 0.5392657667398453,
"epoch": 2.885981308411215,
"grad_norm": 0.042652204632759094,
"learning_rate": 0.0002,
"loss": 0.5403409004211426,
"mean_token_accuracy": 0.7833160161972046,
"num_tokens": 12597306.0,
"step": 773
},
{
"entropy": 0.5409767031669617,
"epoch": 2.8897196261682243,
"grad_norm": 0.04756668955087662,
"learning_rate": 0.0002,
"loss": 0.5477514266967773,
"mean_token_accuracy": 0.7775042653083801,
"num_tokens": 12613430.0,
"step": 774
},
{
"entropy": 0.529184103012085,
"epoch": 2.8934579439252337,
"grad_norm": 0.040852271020412445,
"learning_rate": 0.0002,
"loss": 0.5368978381156921,
"mean_token_accuracy": 0.7799389064311981,
"num_tokens": 12629734.0,
"step": 775
},
{
"entropy": 0.5528028011322021,
"epoch": 2.897196261682243,
"grad_norm": 0.04610953480005264,
"learning_rate": 0.0002,
"loss": 0.5489134788513184,
"mean_token_accuracy": 0.7778203934431076,
"num_tokens": 12646051.0,
"step": 776
},
{
"entropy": 0.5398439168930054,
"epoch": 2.9009345794392525,
"grad_norm": 0.03999875858426094,
"learning_rate": 0.0002,
"loss": 0.5301113128662109,
"mean_token_accuracy": 0.786536455154419,
"num_tokens": 12662398.0,
"step": 777
},
{
"entropy": 0.5450849235057831,
"epoch": 2.904672897196262,
"grad_norm": 0.04052022844552994,
"learning_rate": 0.0002,
"loss": 0.5446597933769226,
"mean_token_accuracy": 0.7773038446903229,
"num_tokens": 12679053.0,
"step": 778
},
{
"entropy": 0.5272800028324127,
"epoch": 2.9084112149532713,
"grad_norm": 0.041017524898052216,
"learning_rate": 0.0002,
"loss": 0.5308842062950134,
"mean_token_accuracy": 0.7858325839042664,
"num_tokens": 12695608.0,
"step": 779
},
{
"entropy": 0.5401904284954071,
"epoch": 2.91214953271028,
"grad_norm": 0.04053664207458496,
"learning_rate": 0.0002,
"loss": 0.5450324416160583,
"mean_token_accuracy": 0.7785527408123016,
"num_tokens": 12712035.0,
"step": 780
},
{
"entropy": 0.5284470915794373,
"epoch": 2.9158878504672896,
"grad_norm": 0.04656258225440979,
"learning_rate": 0.0002,
"loss": 0.5301587581634521,
"mean_token_accuracy": 0.781079113483429,
"num_tokens": 12728285.0,
"step": 781
},
{
"entropy": 0.5552389323711395,
"epoch": 2.919626168224299,
"grad_norm": 0.043133046478033066,
"learning_rate": 0.0002,
"loss": 0.5493855476379395,
"mean_token_accuracy": 0.7788817882537842,
"num_tokens": 12744626.0,
"step": 782
},
{
"entropy": 0.536635085940361,
"epoch": 2.9233644859813084,
"grad_norm": 0.04232388734817505,
"learning_rate": 0.0002,
"loss": 0.5350582599639893,
"mean_token_accuracy": 0.784316211938858,
"num_tokens": 12760817.0,
"step": 783
},
{
"entropy": 0.5175309851765633,
"epoch": 2.9271028037383178,
"grad_norm": 0.05120910704135895,
"learning_rate": 0.0002,
"loss": 0.5239328742027283,
"mean_token_accuracy": 0.7904608845710754,
"num_tokens": 12777129.0,
"step": 784
},
{
"entropy": 0.5613889098167419,
"epoch": 2.930841121495327,
"grad_norm": 0.04064096510410309,
"learning_rate": 0.0002,
"loss": 0.5573512315750122,
"mean_token_accuracy": 0.7735461741685867,
"num_tokens": 12793633.0,
"step": 785
},
{
"entropy": 0.540812149643898,
"epoch": 2.9345794392523366,
"grad_norm": 0.04686618968844414,
"learning_rate": 0.0002,
"loss": 0.5428805947303772,
"mean_token_accuracy": 0.7786334455013275,
"num_tokens": 12809886.0,
"step": 786
},
{
"entropy": 0.5354818254709244,
"epoch": 2.938317757009346,
"grad_norm": 0.04068305343389511,
"learning_rate": 0.0002,
"loss": 0.5409020185470581,
"mean_token_accuracy": 0.781467393040657,
"num_tokens": 12826079.0,
"step": 787
},
{
"entropy": 0.5340152084827423,
"epoch": 2.942056074766355,
"grad_norm": 0.04302098974585533,
"learning_rate": 0.0002,
"loss": 0.5352627038955688,
"mean_token_accuracy": 0.7827621698379517,
"num_tokens": 12842255.0,
"step": 788
},
{
"entropy": 0.5471729636192322,
"epoch": 2.9457943925233643,
"grad_norm": 0.03707803413271904,
"learning_rate": 0.0002,
"loss": 0.5461200475692749,
"mean_token_accuracy": 0.7784449309110641,
"num_tokens": 12859013.0,
"step": 789
},
{
"entropy": 0.5401621907949448,
"epoch": 2.9495327102803737,
"grad_norm": 0.044071633368730545,
"learning_rate": 0.0002,
"loss": 0.5385332107543945,
"mean_token_accuracy": 0.783258393406868,
"num_tokens": 12875373.0,
"step": 790
},
{
"entropy": 0.5508020371198654,
"epoch": 2.953271028037383,
"grad_norm": 0.03822047635912895,
"learning_rate": 0.0002,
"loss": 0.5456752181053162,
"mean_token_accuracy": 0.7771204560995102,
"num_tokens": 12891653.0,
"step": 791
},
{
"entropy": 0.5405401140451431,
"epoch": 2.9570093457943925,
"grad_norm": 0.05170199275016785,
"learning_rate": 0.0002,
"loss": 0.5398849248886108,
"mean_token_accuracy": 0.7820375263690948,
"num_tokens": 12908131.0,
"step": 792
},
{
"entropy": 0.5514362305402756,
"epoch": 2.960747663551402,
"grad_norm": 0.036166463047266006,
"learning_rate": 0.0002,
"loss": 0.5504743456840515,
"mean_token_accuracy": 0.7789987325668335,
"num_tokens": 12924376.0,
"step": 793
},
{
"entropy": 0.5308372974395752,
"epoch": 2.9644859813084112,
"grad_norm": 0.04786797612905502,
"learning_rate": 0.0002,
"loss": 0.5306717753410339,
"mean_token_accuracy": 0.7853545248508453,
"num_tokens": 12940776.0,
"step": 794
},
{
"entropy": 0.532660722732544,
"epoch": 2.9682242990654206,
"grad_norm": 0.045564983040094376,
"learning_rate": 0.0002,
"loss": 0.5463993549346924,
"mean_token_accuracy": 0.777183935046196,
"num_tokens": 12957326.0,
"step": 795
},
{
"entropy": 0.5434572845697403,
"epoch": 2.97196261682243,
"grad_norm": 0.04280655458569527,
"learning_rate": 0.0002,
"loss": 0.5493361353874207,
"mean_token_accuracy": 0.776650920510292,
"num_tokens": 12973820.0,
"step": 796
},
{
"entropy": 0.5530060529708862,
"epoch": 2.9757009345794394,
"grad_norm": 0.04003579169511795,
"learning_rate": 0.0002,
"loss": 0.5533372759819031,
"mean_token_accuracy": 0.7766715437173843,
"num_tokens": 12990177.0,
"step": 797
},
{
"entropy": 0.5516588985919952,
"epoch": 2.979439252336449,
"grad_norm": 0.0351371206343174,
"learning_rate": 0.0002,
"loss": 0.5491815209388733,
"mean_token_accuracy": 0.7761321365833282,
"num_tokens": 13006638.0,
"step": 798
},
{
"entropy": 0.5496395230293274,
"epoch": 2.983177570093458,
"grad_norm": 0.03455950319766998,
"learning_rate": 0.0002,
"loss": 0.5390848517417908,
"mean_token_accuracy": 0.7827516794204712,
"num_tokens": 13022895.0,
"step": 799
},
{
"entropy": 0.5255894213914871,
"epoch": 2.986915887850467,
"grad_norm": 0.0403040274977684,
"learning_rate": 0.0002,
"loss": 0.5258710980415344,
"mean_token_accuracy": 0.7874301820993423,
"num_tokens": 13039127.0,
"step": 800
},
{
"entropy": 0.5152293890714645,
"epoch": 2.9906542056074765,
"grad_norm": 0.04018184915184975,
"learning_rate": 0.0002,
"loss": 0.5248207449913025,
"mean_token_accuracy": 0.789091631770134,
"num_tokens": 13055038.0,
"step": 801
},
{
"entropy": 0.5260308086872101,
"epoch": 2.994392523364486,
"grad_norm": 0.04690062627196312,
"learning_rate": 0.0002,
"loss": 0.5380572080612183,
"mean_token_accuracy": 0.7809655517339706,
"num_tokens": 13070955.0,
"step": 802
},
{
"entropy": 0.5523715615272522,
"epoch": 2.9981308411214953,
"grad_norm": 0.040551379323005676,
"learning_rate": 0.0002,
"loss": 0.5491956472396851,
"mean_token_accuracy": 0.7785847187042236,
"num_tokens": 13087325.0,
"step": 803
},
{
"entropy": 0.5784902274608612,
"epoch": 3.0,
"grad_norm": 0.04703172296285629,
"learning_rate": 0.0002,
"loss": 0.5652958750724792,
"mean_token_accuracy": 0.7655995786190033,
"num_tokens": 13094423.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2209408416111657e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}