eac123's picture
Upload final checkpoint (checkpoint-804)
15a0cbd verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1481057405471802,
"epoch": 0.0037418147801683817,
"grad_norm": 0.40896540880203247,
"learning_rate": 0.0002,
"loss": 2.499051332473755,
"mean_token_accuracy": 0.5305689871311188,
"num_tokens": 16123.0,
"step": 1
},
{
"entropy": 1.239521712064743,
"epoch": 0.007483629560336763,
"grad_norm": 0.3786088228225708,
"learning_rate": 0.0002,
"loss": 2.1649975776672363,
"mean_token_accuracy": 0.5674073547124863,
"num_tokens": 32231.0,
"step": 2
},
{
"entropy": 1.4065836369991302,
"epoch": 0.011225444340505144,
"grad_norm": 0.2935435175895691,
"learning_rate": 0.0002,
"loss": 1.7277326583862305,
"mean_token_accuracy": 0.5904076844453812,
"num_tokens": 48717.0,
"step": 3
},
{
"entropy": 1.3739030063152313,
"epoch": 0.014967259120673527,
"grad_norm": 0.24068056046962738,
"learning_rate": 0.0002,
"loss": 1.4146925210952759,
"mean_token_accuracy": 0.6330391019582748,
"num_tokens": 64917.0,
"step": 4
},
{
"entropy": 1.3624942004680634,
"epoch": 0.018709073900841908,
"grad_norm": 0.2722117602825165,
"learning_rate": 0.0002,
"loss": 1.2977211475372314,
"mean_token_accuracy": 0.6365498602390289,
"num_tokens": 81360.0,
"step": 5
},
{
"entropy": 1.268439620733261,
"epoch": 0.02245088868101029,
"grad_norm": 0.13346025347709656,
"learning_rate": 0.0002,
"loss": 1.1922200918197632,
"mean_token_accuracy": 0.6591676026582718,
"num_tokens": 98033.0,
"step": 6
},
{
"entropy": 1.187461495399475,
"epoch": 0.026192703461178673,
"grad_norm": 0.10905587673187256,
"learning_rate": 0.0002,
"loss": 1.090636134147644,
"mean_token_accuracy": 0.6683961004018784,
"num_tokens": 114410.0,
"step": 7
},
{
"entropy": 1.1027202904224396,
"epoch": 0.029934518241347054,
"grad_norm": 0.10468754172325134,
"learning_rate": 0.0002,
"loss": 1.0090222358703613,
"mean_token_accuracy": 0.6826278865337372,
"num_tokens": 130663.0,
"step": 8
},
{
"entropy": 1.0241433680057526,
"epoch": 0.03367633302151544,
"grad_norm": 0.13387203216552734,
"learning_rate": 0.0002,
"loss": 0.9953913688659668,
"mean_token_accuracy": 0.6843951940536499,
"num_tokens": 147024.0,
"step": 9
},
{
"entropy": 1.0002675652503967,
"epoch": 0.037418147801683815,
"grad_norm": 0.1420045644044876,
"learning_rate": 0.0002,
"loss": 0.9541152119636536,
"mean_token_accuracy": 0.6879138201475143,
"num_tokens": 163186.0,
"step": 10
},
{
"entropy": 0.9888490438461304,
"epoch": 0.0411599625818522,
"grad_norm": 0.10480759292840958,
"learning_rate": 0.0002,
"loss": 0.8834772706031799,
"mean_token_accuracy": 0.7008452415466309,
"num_tokens": 179486.0,
"step": 11
},
{
"entropy": 0.9587634801864624,
"epoch": 0.04490177736202058,
"grad_norm": 0.1189962700009346,
"learning_rate": 0.0002,
"loss": 0.8404299020767212,
"mean_token_accuracy": 0.7084675431251526,
"num_tokens": 195940.0,
"step": 12
},
{
"entropy": 0.8834698051214218,
"epoch": 0.04864359214218896,
"grad_norm": 0.1070038452744484,
"learning_rate": 0.0002,
"loss": 0.816959798336029,
"mean_token_accuracy": 0.7068669199943542,
"num_tokens": 212384.0,
"step": 13
},
{
"entropy": 0.7648728787899017,
"epoch": 0.052385406922357346,
"grad_norm": 1.0202980041503906,
"learning_rate": 0.0002,
"loss": 0.7703532576560974,
"mean_token_accuracy": 0.721884474158287,
"num_tokens": 228462.0,
"step": 14
},
{
"entropy": 0.7483080476522446,
"epoch": 0.05612722170252572,
"grad_norm": 0.12461339682340622,
"learning_rate": 0.0002,
"loss": 0.745843231678009,
"mean_token_accuracy": 0.7246550768613815,
"num_tokens": 244599.0,
"step": 15
},
{
"entropy": 0.7499705106019974,
"epoch": 0.05986903648269411,
"grad_norm": 0.13838888704776764,
"learning_rate": 0.0002,
"loss": 0.7328222990036011,
"mean_token_accuracy": 0.7272029221057892,
"num_tokens": 261162.0,
"step": 16
},
{
"entropy": 0.7162831723690033,
"epoch": 0.06361085126286249,
"grad_norm": 0.0821700468659401,
"learning_rate": 0.0002,
"loss": 0.700190007686615,
"mean_token_accuracy": 0.7368839830160141,
"num_tokens": 277513.0,
"step": 17
},
{
"entropy": 0.66506028175354,
"epoch": 0.06735266604303088,
"grad_norm": 0.08271524310112,
"learning_rate": 0.0002,
"loss": 0.6616584062576294,
"mean_token_accuracy": 0.7501807361841202,
"num_tokens": 293628.0,
"step": 18
},
{
"entropy": 0.6652649641036987,
"epoch": 0.07109448082319925,
"grad_norm": 0.10451149940490723,
"learning_rate": 0.0002,
"loss": 0.6696457266807556,
"mean_token_accuracy": 0.7403630912303925,
"num_tokens": 309771.0,
"step": 19
},
{
"entropy": 0.671489492058754,
"epoch": 0.07483629560336763,
"grad_norm": 0.08111453801393509,
"learning_rate": 0.0002,
"loss": 0.6523128747940063,
"mean_token_accuracy": 0.7449511885643005,
"num_tokens": 326252.0,
"step": 20
},
{
"entropy": 0.6829328835010529,
"epoch": 0.07857811038353602,
"grad_norm": 0.07855828106403351,
"learning_rate": 0.0002,
"loss": 0.6548086404800415,
"mean_token_accuracy": 0.7431468367576599,
"num_tokens": 342569.0,
"step": 21
},
{
"entropy": 0.6616033613681793,
"epoch": 0.0823199251637044,
"grad_norm": 0.07543554902076721,
"learning_rate": 0.0002,
"loss": 0.6394403576850891,
"mean_token_accuracy": 0.7484261393547058,
"num_tokens": 359156.0,
"step": 22
},
{
"entropy": 0.6383623033761978,
"epoch": 0.08606173994387278,
"grad_norm": 0.07246740162372589,
"learning_rate": 0.0002,
"loss": 0.6292484998703003,
"mean_token_accuracy": 0.7550594955682755,
"num_tokens": 375388.0,
"step": 23
},
{
"entropy": 0.6223422735929489,
"epoch": 0.08980355472404115,
"grad_norm": 0.08016548305749893,
"learning_rate": 0.0002,
"loss": 0.6264731884002686,
"mean_token_accuracy": 0.7548545002937317,
"num_tokens": 391528.0,
"step": 24
},
{
"entropy": 0.5979716777801514,
"epoch": 0.09354536950420954,
"grad_norm": 0.07842142134904861,
"learning_rate": 0.0002,
"loss": 0.6038044691085815,
"mean_token_accuracy": 0.764473095536232,
"num_tokens": 407673.0,
"step": 25
},
{
"entropy": 0.5976411253213882,
"epoch": 0.09728718428437792,
"grad_norm": 0.0749603658914566,
"learning_rate": 0.0002,
"loss": 0.5980632305145264,
"mean_token_accuracy": 0.7644072473049164,
"num_tokens": 423781.0,
"step": 26
},
{
"entropy": 0.5957016050815582,
"epoch": 0.10102899906454631,
"grad_norm": 0.061034828424453735,
"learning_rate": 0.0002,
"loss": 0.5909260511398315,
"mean_token_accuracy": 0.7682853490114212,
"num_tokens": 439927.0,
"step": 27
},
{
"entropy": 0.6109822690486908,
"epoch": 0.10477081384471469,
"grad_norm": 0.061578188091516495,
"learning_rate": 0.0002,
"loss": 0.5998508334159851,
"mean_token_accuracy": 0.7658420503139496,
"num_tokens": 456218.0,
"step": 28
},
{
"entropy": 0.601639524102211,
"epoch": 0.10851262862488306,
"grad_norm": 0.0625869631767273,
"learning_rate": 0.0002,
"loss": 0.592888355255127,
"mean_token_accuracy": 0.7679047584533691,
"num_tokens": 472672.0,
"step": 29
},
{
"entropy": 0.5943656265735626,
"epoch": 0.11225444340505145,
"grad_norm": 0.05583951249718666,
"learning_rate": 0.0002,
"loss": 0.5944483280181885,
"mean_token_accuracy": 0.7622693479061127,
"num_tokens": 489114.0,
"step": 30
},
{
"entropy": 0.5988462120294571,
"epoch": 0.11599625818521983,
"grad_norm": 0.0581178143620491,
"learning_rate": 0.0002,
"loss": 0.6067461967468262,
"mean_token_accuracy": 0.7607288658618927,
"num_tokens": 505426.0,
"step": 31
},
{
"entropy": 0.5756160020828247,
"epoch": 0.11973807296538821,
"grad_norm": 0.05917786434292793,
"learning_rate": 0.0002,
"loss": 0.5832271575927734,
"mean_token_accuracy": 0.770146518945694,
"num_tokens": 521632.0,
"step": 32
},
{
"entropy": 0.5860312879085541,
"epoch": 0.1234798877455566,
"grad_norm": 0.057717982679605484,
"learning_rate": 0.0002,
"loss": 0.592366635799408,
"mean_token_accuracy": 0.7664856016635895,
"num_tokens": 538173.0,
"step": 33
},
{
"entropy": 0.5932987481355667,
"epoch": 0.12722170252572498,
"grad_norm": 0.051627833396196365,
"learning_rate": 0.0002,
"loss": 0.5942224860191345,
"mean_token_accuracy": 0.7634450048208237,
"num_tokens": 554522.0,
"step": 34
},
{
"entropy": 0.5781913548707962,
"epoch": 0.13096351730589337,
"grad_norm": 0.053737979382276535,
"learning_rate": 0.0002,
"loss": 0.5713843107223511,
"mean_token_accuracy": 0.7748462855815887,
"num_tokens": 570944.0,
"step": 35
},
{
"entropy": 0.5928207337856293,
"epoch": 0.13470533208606175,
"grad_norm": 0.0513126477599144,
"learning_rate": 0.0002,
"loss": 0.5946991443634033,
"mean_token_accuracy": 0.7643233835697174,
"num_tokens": 587342.0,
"step": 36
},
{
"entropy": 0.5689480155706406,
"epoch": 0.1384471468662301,
"grad_norm": 0.0563691221177578,
"learning_rate": 0.0002,
"loss": 0.5712450742721558,
"mean_token_accuracy": 0.7735907435417175,
"num_tokens": 603727.0,
"step": 37
},
{
"entropy": 0.5871619284152985,
"epoch": 0.1421889616463985,
"grad_norm": 0.043151870369911194,
"learning_rate": 0.0002,
"loss": 0.5806025862693787,
"mean_token_accuracy": 0.768414631485939,
"num_tokens": 620304.0,
"step": 38
},
{
"entropy": 0.5789511501789093,
"epoch": 0.14593077642656688,
"grad_norm": 0.057180438190698624,
"learning_rate": 0.0002,
"loss": 0.5829247832298279,
"mean_token_accuracy": 0.7660035490989685,
"num_tokens": 636613.0,
"step": 39
},
{
"entropy": 0.5511189699172974,
"epoch": 0.14967259120673526,
"grad_norm": 0.04785468429327011,
"learning_rate": 0.0002,
"loss": 0.5596879124641418,
"mean_token_accuracy": 0.7737152278423309,
"num_tokens": 652836.0,
"step": 40
},
{
"entropy": 0.5728544592857361,
"epoch": 0.15341440598690365,
"grad_norm": 0.047032520174980164,
"learning_rate": 0.0002,
"loss": 0.5756531953811646,
"mean_token_accuracy": 0.7682489305734634,
"num_tokens": 669348.0,
"step": 41
},
{
"entropy": 0.5809888541698456,
"epoch": 0.15715622076707203,
"grad_norm": 0.04996408522129059,
"learning_rate": 0.0002,
"loss": 0.5856860280036926,
"mean_token_accuracy": 0.7646850347518921,
"num_tokens": 685771.0,
"step": 42
},
{
"entropy": 0.5943491905927658,
"epoch": 0.16089803554724041,
"grad_norm": 0.04490286856889725,
"learning_rate": 0.0002,
"loss": 0.5864270329475403,
"mean_token_accuracy": 0.7636495530605316,
"num_tokens": 702211.0,
"step": 43
},
{
"entropy": 0.5895421206951141,
"epoch": 0.1646398503274088,
"grad_norm": 0.051186852157115936,
"learning_rate": 0.0002,
"loss": 0.5863322019577026,
"mean_token_accuracy": 0.7648472040891647,
"num_tokens": 718539.0,
"step": 44
},
{
"entropy": 0.573004424571991,
"epoch": 0.16838166510757718,
"grad_norm": 0.044179223477840424,
"learning_rate": 0.0002,
"loss": 0.5632967352867126,
"mean_token_accuracy": 0.7742049247026443,
"num_tokens": 734943.0,
"step": 45
},
{
"entropy": 0.5616976916790009,
"epoch": 0.17212347988774557,
"grad_norm": 0.04744846373796463,
"learning_rate": 0.0002,
"loss": 0.5611750483512878,
"mean_token_accuracy": 0.7748160660266876,
"num_tokens": 751206.0,
"step": 46
},
{
"entropy": 0.5663218796253204,
"epoch": 0.17586529466791395,
"grad_norm": 0.05421765521168709,
"learning_rate": 0.0002,
"loss": 0.5719538927078247,
"mean_token_accuracy": 0.7716761082410812,
"num_tokens": 767602.0,
"step": 47
},
{
"entropy": 0.5845721065998077,
"epoch": 0.1796071094480823,
"grad_norm": 0.04122321680188179,
"learning_rate": 0.0002,
"loss": 0.5887588858604431,
"mean_token_accuracy": 0.7646526545286179,
"num_tokens": 784029.0,
"step": 48
},
{
"entropy": 0.5674261897802353,
"epoch": 0.1833489242282507,
"grad_norm": 0.05335045978426933,
"learning_rate": 0.0002,
"loss": 0.5763436555862427,
"mean_token_accuracy": 0.7674090713262558,
"num_tokens": 800207.0,
"step": 49
},
{
"entropy": 0.5922754108905792,
"epoch": 0.18709073900841908,
"grad_norm": 0.04774358496069908,
"learning_rate": 0.0002,
"loss": 0.592854917049408,
"mean_token_accuracy": 0.7636804282665253,
"num_tokens": 816757.0,
"step": 50
},
{
"entropy": 0.5675703585147858,
"epoch": 0.19083255378858746,
"grad_norm": 0.046180881559848785,
"learning_rate": 0.0002,
"loss": 0.5643646121025085,
"mean_token_accuracy": 0.7744234651327133,
"num_tokens": 833143.0,
"step": 51
},
{
"entropy": 0.5735020041465759,
"epoch": 0.19457436856875585,
"grad_norm": 0.04306558147072792,
"learning_rate": 0.0002,
"loss": 0.5688086748123169,
"mean_token_accuracy": 0.7720673680305481,
"num_tokens": 849533.0,
"step": 52
},
{
"entropy": 0.5725302696228027,
"epoch": 0.19831618334892423,
"grad_norm": 0.044849518686532974,
"learning_rate": 0.0002,
"loss": 0.5705700516700745,
"mean_token_accuracy": 0.7675163745880127,
"num_tokens": 865711.0,
"step": 53
},
{
"entropy": 0.568488135933876,
"epoch": 0.20205799812909261,
"grad_norm": 0.03932643309235573,
"learning_rate": 0.0002,
"loss": 0.5707889795303345,
"mean_token_accuracy": 0.7687725275754929,
"num_tokens": 882150.0,
"step": 54
},
{
"entropy": 0.5733406245708466,
"epoch": 0.205799812909261,
"grad_norm": 0.044968072324991226,
"learning_rate": 0.0002,
"loss": 0.5740039348602295,
"mean_token_accuracy": 0.7688336670398712,
"num_tokens": 898618.0,
"step": 55
},
{
"entropy": 0.5666982084512711,
"epoch": 0.20954162768942938,
"grad_norm": 0.03931398317217827,
"learning_rate": 0.0002,
"loss": 0.5738785266876221,
"mean_token_accuracy": 0.7679219394922256,
"num_tokens": 914939.0,
"step": 56
},
{
"entropy": 0.5663618296384811,
"epoch": 0.21328344246959777,
"grad_norm": 0.0373641774058342,
"learning_rate": 0.0002,
"loss": 0.5636038780212402,
"mean_token_accuracy": 0.7741107642650604,
"num_tokens": 931291.0,
"step": 57
},
{
"entropy": 0.557570144534111,
"epoch": 0.21702525724976612,
"grad_norm": 0.04060584679245949,
"learning_rate": 0.0002,
"loss": 0.5589414238929749,
"mean_token_accuracy": 0.7753962129354477,
"num_tokens": 947611.0,
"step": 58
},
{
"entropy": 0.5627644211053848,
"epoch": 0.2207670720299345,
"grad_norm": 0.037169281393289566,
"learning_rate": 0.0002,
"loss": 0.5654425621032715,
"mean_token_accuracy": 0.7718145698308945,
"num_tokens": 963820.0,
"step": 59
},
{
"entropy": 0.58712999522686,
"epoch": 0.2245088868101029,
"grad_norm": 0.03782787546515465,
"learning_rate": 0.0002,
"loss": 0.5898170471191406,
"mean_token_accuracy": 0.7635077238082886,
"num_tokens": 980402.0,
"step": 60
},
{
"entropy": 0.5586348623037338,
"epoch": 0.22825070159027128,
"grad_norm": 0.03953346982598305,
"learning_rate": 0.0002,
"loss": 0.5562594532966614,
"mean_token_accuracy": 0.7752978503704071,
"num_tokens": 996502.0,
"step": 61
},
{
"entropy": 0.5691598951816559,
"epoch": 0.23199251637043966,
"grad_norm": 0.04252421110868454,
"learning_rate": 0.0002,
"loss": 0.5684412717819214,
"mean_token_accuracy": 0.7712201923131943,
"num_tokens": 1012676.0,
"step": 62
},
{
"entropy": 0.5714918673038483,
"epoch": 0.23573433115060805,
"grad_norm": 0.036386385560035706,
"learning_rate": 0.0002,
"loss": 0.5729389190673828,
"mean_token_accuracy": 0.768106073141098,
"num_tokens": 1028906.0,
"step": 63
},
{
"entropy": 0.5666227042675018,
"epoch": 0.23947614593077643,
"grad_norm": 0.037684470415115356,
"learning_rate": 0.0002,
"loss": 0.5600223541259766,
"mean_token_accuracy": 0.7734655141830444,
"num_tokens": 1045328.0,
"step": 64
},
{
"entropy": 0.5651632696390152,
"epoch": 0.2432179607109448,
"grad_norm": 0.03333243355154991,
"learning_rate": 0.0002,
"loss": 0.5639563798904419,
"mean_token_accuracy": 0.771888479590416,
"num_tokens": 1061791.0,
"step": 65
},
{
"entropy": 0.5851249843835831,
"epoch": 0.2469597754911132,
"grad_norm": 0.04036445543169975,
"learning_rate": 0.0002,
"loss": 0.5847532749176025,
"mean_token_accuracy": 0.7656708210706711,
"num_tokens": 1078293.0,
"step": 66
},
{
"entropy": 0.5670823901891708,
"epoch": 0.2507015902712816,
"grad_norm": 0.04222024604678154,
"learning_rate": 0.0002,
"loss": 0.5660995244979858,
"mean_token_accuracy": 0.7720949500799179,
"num_tokens": 1094672.0,
"step": 67
},
{
"entropy": 0.581654280424118,
"epoch": 0.25444340505144997,
"grad_norm": 0.03967028483748436,
"learning_rate": 0.0002,
"loss": 0.5889865159988403,
"mean_token_accuracy": 0.760698065161705,
"num_tokens": 1111068.0,
"step": 68
},
{
"entropy": 0.5533672720193863,
"epoch": 0.25818521983161835,
"grad_norm": 0.03658512607216835,
"learning_rate": 0.0002,
"loss": 0.5615257024765015,
"mean_token_accuracy": 0.7765155285596848,
"num_tokens": 1127289.0,
"step": 69
},
{
"entropy": 0.5607704222202301,
"epoch": 0.26192703461178674,
"grad_norm": 0.0379711352288723,
"learning_rate": 0.0002,
"loss": 0.5662075281143188,
"mean_token_accuracy": 0.7751724272966385,
"num_tokens": 1143569.0,
"step": 70
},
{
"entropy": 0.5778918713331223,
"epoch": 0.2656688493919551,
"grad_norm": 0.038288865238428116,
"learning_rate": 0.0002,
"loss": 0.5817552804946899,
"mean_token_accuracy": 0.7655211091041565,
"num_tokens": 1159646.0,
"step": 71
},
{
"entropy": 0.573161169886589,
"epoch": 0.2694106641721235,
"grad_norm": 0.038547221571207047,
"learning_rate": 0.0002,
"loss": 0.5695617198944092,
"mean_token_accuracy": 0.7739016711711884,
"num_tokens": 1175923.0,
"step": 72
},
{
"entropy": 0.5844559669494629,
"epoch": 0.2731524789522919,
"grad_norm": 0.03487812727689743,
"learning_rate": 0.0002,
"loss": 0.5778559446334839,
"mean_token_accuracy": 0.7675636559724808,
"num_tokens": 1192471.0,
"step": 73
},
{
"entropy": 0.578565388917923,
"epoch": 0.2768942937324602,
"grad_norm": 0.03859493136405945,
"learning_rate": 0.0002,
"loss": 0.5707017779350281,
"mean_token_accuracy": 0.7693561762571335,
"num_tokens": 1208749.0,
"step": 74
},
{
"entropy": 0.5591824799776077,
"epoch": 0.2806361085126286,
"grad_norm": 0.03378773108124733,
"learning_rate": 0.0002,
"loss": 0.557567298412323,
"mean_token_accuracy": 0.7764061838388443,
"num_tokens": 1224922.0,
"step": 75
},
{
"entropy": 0.568041980266571,
"epoch": 0.284377923292797,
"grad_norm": 0.03862875699996948,
"learning_rate": 0.0002,
"loss": 0.570695698261261,
"mean_token_accuracy": 0.7686833739280701,
"num_tokens": 1241294.0,
"step": 76
},
{
"entropy": 0.5530785471200943,
"epoch": 0.28811973807296537,
"grad_norm": 0.03997069224715233,
"learning_rate": 0.0002,
"loss": 0.5623512268066406,
"mean_token_accuracy": 0.7745240479707718,
"num_tokens": 1257616.0,
"step": 77
},
{
"entropy": 0.5595529079437256,
"epoch": 0.29186155285313375,
"grad_norm": 0.03598308190703392,
"learning_rate": 0.0002,
"loss": 0.5686611533164978,
"mean_token_accuracy": 0.7718778103590012,
"num_tokens": 1274217.0,
"step": 78
},
{
"entropy": 0.5654617100954056,
"epoch": 0.29560336763330214,
"grad_norm": 0.03698718175292015,
"learning_rate": 0.0002,
"loss": 0.5718352794647217,
"mean_token_accuracy": 0.7710111141204834,
"num_tokens": 1290502.0,
"step": 79
},
{
"entropy": 0.5769922882318497,
"epoch": 0.2993451824134705,
"grad_norm": 0.03608345612883568,
"learning_rate": 0.0002,
"loss": 0.5771495699882507,
"mean_token_accuracy": 0.7671397477388382,
"num_tokens": 1307057.0,
"step": 80
},
{
"entropy": 0.5775998532772064,
"epoch": 0.3030869971936389,
"grad_norm": 0.04129846766591072,
"learning_rate": 0.0002,
"loss": 0.5648953318595886,
"mean_token_accuracy": 0.7740987688302994,
"num_tokens": 1323158.0,
"step": 81
},
{
"entropy": 0.578661784529686,
"epoch": 0.3068288119738073,
"grad_norm": 0.04035583510994911,
"learning_rate": 0.0002,
"loss": 0.572229266166687,
"mean_token_accuracy": 0.769649401307106,
"num_tokens": 1339671.0,
"step": 82
},
{
"entropy": 0.5630823224782944,
"epoch": 0.3105706267539757,
"grad_norm": 0.035164687782526016,
"learning_rate": 0.0002,
"loss": 0.5634369254112244,
"mean_token_accuracy": 0.7725345641374588,
"num_tokens": 1355922.0,
"step": 83
},
{
"entropy": 0.5712268948554993,
"epoch": 0.31431244153414406,
"grad_norm": 0.038266371935606,
"learning_rate": 0.0002,
"loss": 0.5790088772773743,
"mean_token_accuracy": 0.7660410851240158,
"num_tokens": 1372241.0,
"step": 84
},
{
"entropy": 0.5503551959991455,
"epoch": 0.31805425631431244,
"grad_norm": 0.04355614632368088,
"learning_rate": 0.0002,
"loss": 0.5594754815101624,
"mean_token_accuracy": 0.7743213176727295,
"num_tokens": 1388447.0,
"step": 85
},
{
"entropy": 0.5567754805088043,
"epoch": 0.32179607109448083,
"grad_norm": 0.034040167927742004,
"learning_rate": 0.0002,
"loss": 0.5562305450439453,
"mean_token_accuracy": 0.7782892882823944,
"num_tokens": 1404595.0,
"step": 86
},
{
"entropy": 0.5897853374481201,
"epoch": 0.3255378858746492,
"grad_norm": 0.04141312837600708,
"learning_rate": 0.0002,
"loss": 0.5811256766319275,
"mean_token_accuracy": 0.7645350694656372,
"num_tokens": 1421046.0,
"step": 87
},
{
"entropy": 0.5651004612445831,
"epoch": 0.3292797006548176,
"grad_norm": 0.039186883717775345,
"learning_rate": 0.0002,
"loss": 0.5626670122146606,
"mean_token_accuracy": 0.771001011133194,
"num_tokens": 1437307.0,
"step": 88
},
{
"entropy": 0.5479820519685745,
"epoch": 0.333021515434986,
"grad_norm": 0.038090839982032776,
"learning_rate": 0.0002,
"loss": 0.5517987012863159,
"mean_token_accuracy": 0.7779913991689682,
"num_tokens": 1453625.0,
"step": 89
},
{
"entropy": 0.5513372272253036,
"epoch": 0.33676333021515437,
"grad_norm": 0.033073123544454575,
"learning_rate": 0.0002,
"loss": 0.5521109700202942,
"mean_token_accuracy": 0.7770368456840515,
"num_tokens": 1470001.0,
"step": 90
},
{
"entropy": 0.5538579821586609,
"epoch": 0.34050514499532275,
"grad_norm": 0.03432928025722504,
"learning_rate": 0.0002,
"loss": 0.5595468878746033,
"mean_token_accuracy": 0.7756330221891403,
"num_tokens": 1486202.0,
"step": 91
},
{
"entropy": 0.5441462099552155,
"epoch": 0.34424695977549113,
"grad_norm": 0.03260473906993866,
"learning_rate": 0.0002,
"loss": 0.5527001023292542,
"mean_token_accuracy": 0.7777194529771805,
"num_tokens": 1502337.0,
"step": 92
},
{
"entropy": 0.5642740428447723,
"epoch": 0.3479887745556595,
"grad_norm": 0.041720353066921234,
"learning_rate": 0.0002,
"loss": 0.5752084255218506,
"mean_token_accuracy": 0.7667101472616196,
"num_tokens": 1518821.0,
"step": 93
},
{
"entropy": 0.565082237124443,
"epoch": 0.3517305893358279,
"grad_norm": 0.03507543355226517,
"learning_rate": 0.0002,
"loss": 0.5699793696403503,
"mean_token_accuracy": 0.770054817199707,
"num_tokens": 1535163.0,
"step": 94
},
{
"entropy": 0.5870088040828705,
"epoch": 0.35547240411599623,
"grad_norm": 0.034236419945955276,
"learning_rate": 0.0002,
"loss": 0.5850114226341248,
"mean_token_accuracy": 0.7608266621828079,
"num_tokens": 1551565.0,
"step": 95
},
{
"entropy": 0.5530053824186325,
"epoch": 0.3592142188961646,
"grad_norm": 0.03369399905204773,
"learning_rate": 0.0002,
"loss": 0.5534529685974121,
"mean_token_accuracy": 0.7759882658720016,
"num_tokens": 1567750.0,
"step": 96
},
{
"entropy": 0.5754924863576889,
"epoch": 0.362956033676333,
"grad_norm": 0.036406002938747406,
"learning_rate": 0.0002,
"loss": 0.5705168843269348,
"mean_token_accuracy": 0.7698172330856323,
"num_tokens": 1584023.0,
"step": 97
},
{
"entropy": 0.5771925449371338,
"epoch": 0.3666978484565014,
"grad_norm": 0.032233767211437225,
"learning_rate": 0.0002,
"loss": 0.5738174319267273,
"mean_token_accuracy": 0.7679109573364258,
"num_tokens": 1600377.0,
"step": 98
},
{
"entropy": 0.566839799284935,
"epoch": 0.37043966323666977,
"grad_norm": 0.029388124123215675,
"learning_rate": 0.0002,
"loss": 0.5624303817749023,
"mean_token_accuracy": 0.771264523267746,
"num_tokens": 1616664.0,
"step": 99
},
{
"entropy": 0.5605880320072174,
"epoch": 0.37418147801683815,
"grad_norm": 0.034897759556770325,
"learning_rate": 0.0002,
"loss": 0.5609456896781921,
"mean_token_accuracy": 0.7745639681816101,
"num_tokens": 1632981.0,
"step": 100
},
{
"entropy": 0.5694979727268219,
"epoch": 0.37792329279700654,
"grad_norm": 0.03481722250580788,
"learning_rate": 0.0002,
"loss": 0.5728567838668823,
"mean_token_accuracy": 0.7689409404993057,
"num_tokens": 1649432.0,
"step": 101
},
{
"entropy": 0.5804490298032761,
"epoch": 0.3816651075771749,
"grad_norm": 0.03589940071105957,
"learning_rate": 0.0002,
"loss": 0.5847839713096619,
"mean_token_accuracy": 0.7632083743810654,
"num_tokens": 1666031.0,
"step": 102
},
{
"entropy": 0.5580839961767197,
"epoch": 0.3854069223573433,
"grad_norm": 0.031488265842199326,
"learning_rate": 0.0002,
"loss": 0.5667596459388733,
"mean_token_accuracy": 0.7720794081687927,
"num_tokens": 1682406.0,
"step": 103
},
{
"entropy": 0.5474104434251785,
"epoch": 0.3891487371375117,
"grad_norm": 0.03187083452939987,
"learning_rate": 0.0002,
"loss": 0.5499236583709717,
"mean_token_accuracy": 0.7772009670734406,
"num_tokens": 1698795.0,
"step": 104
},
{
"entropy": 0.5527014136314392,
"epoch": 0.3928905519176801,
"grad_norm": 0.03492984548211098,
"learning_rate": 0.0002,
"loss": 0.5512747168540955,
"mean_token_accuracy": 0.776108130812645,
"num_tokens": 1715480.0,
"step": 105
},
{
"entropy": 0.579165443778038,
"epoch": 0.39663236669784846,
"grad_norm": 0.03257554769515991,
"learning_rate": 0.0002,
"loss": 0.5810192823410034,
"mean_token_accuracy": 0.7663566768169403,
"num_tokens": 1731889.0,
"step": 106
},
{
"entropy": 0.5633712112903595,
"epoch": 0.40037418147801684,
"grad_norm": 0.03179244324564934,
"learning_rate": 0.0002,
"loss": 0.5622086524963379,
"mean_token_accuracy": 0.7680526524782181,
"num_tokens": 1748318.0,
"step": 107
},
{
"entropy": 0.5600844174623489,
"epoch": 0.40411599625818523,
"grad_norm": 0.029808223247528076,
"learning_rate": 0.0002,
"loss": 0.5606282949447632,
"mean_token_accuracy": 0.7708232551813126,
"num_tokens": 1764619.0,
"step": 108
},
{
"entropy": 0.5492478907108307,
"epoch": 0.4078578110383536,
"grad_norm": 0.031120680272579193,
"learning_rate": 0.0002,
"loss": 0.5484419465065002,
"mean_token_accuracy": 0.775683268904686,
"num_tokens": 1780851.0,
"step": 109
},
{
"entropy": 0.5517283380031586,
"epoch": 0.411599625818522,
"grad_norm": 0.03694352135062218,
"learning_rate": 0.0002,
"loss": 0.5580882430076599,
"mean_token_accuracy": 0.774466261267662,
"num_tokens": 1796890.0,
"step": 110
},
{
"entropy": 0.5656300336122513,
"epoch": 0.4153414405986904,
"grad_norm": 0.03588038682937622,
"learning_rate": 0.0002,
"loss": 0.5704593658447266,
"mean_token_accuracy": 0.7691588401794434,
"num_tokens": 1813404.0,
"step": 111
},
{
"entropy": 0.564102292060852,
"epoch": 0.41908325537885877,
"grad_norm": 0.03264907747507095,
"learning_rate": 0.0002,
"loss": 0.5655107498168945,
"mean_token_accuracy": 0.7724602967500687,
"num_tokens": 1829724.0,
"step": 112
},
{
"entropy": 0.5644495040178299,
"epoch": 0.42282507015902715,
"grad_norm": 0.03256542608141899,
"learning_rate": 0.0002,
"loss": 0.5646591782569885,
"mean_token_accuracy": 0.7743334770202637,
"num_tokens": 1846177.0,
"step": 113
},
{
"entropy": 0.545789897441864,
"epoch": 0.42656688493919553,
"grad_norm": 0.034160368144512177,
"learning_rate": 0.0002,
"loss": 0.5457491874694824,
"mean_token_accuracy": 0.7793226093053818,
"num_tokens": 1862412.0,
"step": 114
},
{
"entropy": 0.5670842975378036,
"epoch": 0.4303086997193639,
"grad_norm": 0.02954726107418537,
"learning_rate": 0.0002,
"loss": 0.5644434690475464,
"mean_token_accuracy": 0.7711858153343201,
"num_tokens": 1878518.0,
"step": 115
},
{
"entropy": 0.5647070705890656,
"epoch": 0.43405051449953225,
"grad_norm": 0.028261123225092888,
"learning_rate": 0.0002,
"loss": 0.5621106624603271,
"mean_token_accuracy": 0.776775136590004,
"num_tokens": 1895135.0,
"step": 116
},
{
"entropy": 0.529420793056488,
"epoch": 0.43779232927970063,
"grad_norm": 0.03301499783992767,
"learning_rate": 0.0002,
"loss": 0.536541759967804,
"mean_token_accuracy": 0.7836042046546936,
"num_tokens": 1911161.0,
"step": 117
},
{
"entropy": 0.5451334565877914,
"epoch": 0.441534144059869,
"grad_norm": 0.033271510154008865,
"learning_rate": 0.0002,
"loss": 0.5523592829704285,
"mean_token_accuracy": 0.7769709676504135,
"num_tokens": 1927550.0,
"step": 118
},
{
"entropy": 0.536512017250061,
"epoch": 0.4452759588400374,
"grad_norm": 0.03425843268632889,
"learning_rate": 0.0002,
"loss": 0.5380823612213135,
"mean_token_accuracy": 0.780797928571701,
"num_tokens": 1943788.0,
"step": 119
},
{
"entropy": 0.536301851272583,
"epoch": 0.4490177736202058,
"grad_norm": 0.03248719125986099,
"learning_rate": 0.0002,
"loss": 0.5470737218856812,
"mean_token_accuracy": 0.7803975343704224,
"num_tokens": 1959878.0,
"step": 120
},
{
"entropy": 0.5517153441905975,
"epoch": 0.45275958840037417,
"grad_norm": 0.03530304506421089,
"learning_rate": 0.0002,
"loss": 0.5577021241188049,
"mean_token_accuracy": 0.7733452618122101,
"num_tokens": 1976131.0,
"step": 121
},
{
"entropy": 0.5619277656078339,
"epoch": 0.45650140318054255,
"grad_norm": 0.03460797667503357,
"learning_rate": 0.0002,
"loss": 0.5516164898872375,
"mean_token_accuracy": 0.7756523787975311,
"num_tokens": 1992627.0,
"step": 122
},
{
"entropy": 0.5761916935443878,
"epoch": 0.46024321796071094,
"grad_norm": 0.03172283619642258,
"learning_rate": 0.0002,
"loss": 0.571029543876648,
"mean_token_accuracy": 0.7667981088161469,
"num_tokens": 2009019.0,
"step": 123
},
{
"entropy": 0.5743123888969421,
"epoch": 0.4639850327408793,
"grad_norm": 0.0364689975976944,
"learning_rate": 0.0002,
"loss": 0.5712283849716187,
"mean_token_accuracy": 0.7701593190431595,
"num_tokens": 2025188.0,
"step": 124
},
{
"entropy": 0.5582910478115082,
"epoch": 0.4677268475210477,
"grad_norm": 0.03056769073009491,
"learning_rate": 0.0002,
"loss": 0.56070876121521,
"mean_token_accuracy": 0.7755492180585861,
"num_tokens": 2041572.0,
"step": 125
},
{
"entropy": 0.5542439967393875,
"epoch": 0.4714686623012161,
"grad_norm": 0.03697546571493149,
"learning_rate": 0.0002,
"loss": 0.5604549646377563,
"mean_token_accuracy": 0.7751918882131577,
"num_tokens": 2057989.0,
"step": 126
},
{
"entropy": 0.5463303178548813,
"epoch": 0.4752104770813845,
"grad_norm": 0.033879246562719345,
"learning_rate": 0.0002,
"loss": 0.5539431571960449,
"mean_token_accuracy": 0.7758707851171494,
"num_tokens": 2074129.0,
"step": 127
},
{
"entropy": 0.5522827506065369,
"epoch": 0.47895229186155286,
"grad_norm": 0.03316348418593407,
"learning_rate": 0.0002,
"loss": 0.5581960082054138,
"mean_token_accuracy": 0.7748778462409973,
"num_tokens": 2090225.0,
"step": 128
},
{
"entropy": 0.5740112662315369,
"epoch": 0.48269410664172124,
"grad_norm": 0.03274102881550789,
"learning_rate": 0.0002,
"loss": 0.5653910040855408,
"mean_token_accuracy": 0.7719868570566177,
"num_tokens": 2106644.0,
"step": 129
},
{
"entropy": 0.5553925186395645,
"epoch": 0.4864359214218896,
"grad_norm": 0.028283284977078438,
"learning_rate": 0.0002,
"loss": 0.5513849258422852,
"mean_token_accuracy": 0.7774856984615326,
"num_tokens": 2123137.0,
"step": 130
},
{
"entropy": 0.5579676181077957,
"epoch": 0.490177736202058,
"grad_norm": 0.029911885038018227,
"learning_rate": 0.0002,
"loss": 0.5568463802337646,
"mean_token_accuracy": 0.7730498015880585,
"num_tokens": 2139285.0,
"step": 131
},
{
"entropy": 0.5664242058992386,
"epoch": 0.4939195509822264,
"grad_norm": 0.03227100148797035,
"learning_rate": 0.0002,
"loss": 0.5754393339157104,
"mean_token_accuracy": 0.7667475491762161,
"num_tokens": 2155517.0,
"step": 132
},
{
"entropy": 0.5501858294010162,
"epoch": 0.4976613657623948,
"grad_norm": 0.03013962134718895,
"learning_rate": 0.0002,
"loss": 0.5513433218002319,
"mean_token_accuracy": 0.7747298777103424,
"num_tokens": 2171722.0,
"step": 133
},
{
"entropy": 0.5627453327178955,
"epoch": 0.5014031805425632,
"grad_norm": 0.034450363367795944,
"learning_rate": 0.0002,
"loss": 0.5604255199432373,
"mean_token_accuracy": 0.7740208506584167,
"num_tokens": 2188054.0,
"step": 134
},
{
"entropy": 0.5634363293647766,
"epoch": 0.5051449953227315,
"grad_norm": 0.03803717717528343,
"learning_rate": 0.0002,
"loss": 0.558170735836029,
"mean_token_accuracy": 0.7775739133358002,
"num_tokens": 2204313.0,
"step": 135
},
{
"entropy": 0.5590767562389374,
"epoch": 0.5088868101028999,
"grad_norm": 0.029813330620527267,
"learning_rate": 0.0002,
"loss": 0.5652009844779968,
"mean_token_accuracy": 0.7706311643123627,
"num_tokens": 2220687.0,
"step": 136
},
{
"entropy": 0.5706852972507477,
"epoch": 0.5126286248830683,
"grad_norm": 0.0418686643242836,
"learning_rate": 0.0002,
"loss": 0.5734685063362122,
"mean_token_accuracy": 0.7665899097919464,
"num_tokens": 2237258.0,
"step": 137
},
{
"entropy": 0.5638300180435181,
"epoch": 0.5163704396632367,
"grad_norm": 0.03304136171936989,
"learning_rate": 0.0002,
"loss": 0.5663323402404785,
"mean_token_accuracy": 0.7701692581176758,
"num_tokens": 2253553.0,
"step": 138
},
{
"entropy": 0.5560389012098312,
"epoch": 0.520112254443405,
"grad_norm": 0.032340649515390396,
"learning_rate": 0.0002,
"loss": 0.5557302832603455,
"mean_token_accuracy": 0.7773910611867905,
"num_tokens": 2269787.0,
"step": 139
},
{
"entropy": 0.5491623729467392,
"epoch": 0.5238540692235735,
"grad_norm": 0.03743594512343407,
"learning_rate": 0.0002,
"loss": 0.5475925803184509,
"mean_token_accuracy": 0.7796913385391235,
"num_tokens": 2286052.0,
"step": 140
},
{
"entropy": 0.5624114125967026,
"epoch": 0.5275958840037418,
"grad_norm": 0.03084268979728222,
"learning_rate": 0.0002,
"loss": 0.5612790584564209,
"mean_token_accuracy": 0.7745496481657028,
"num_tokens": 2302516.0,
"step": 141
},
{
"entropy": 0.5638779103755951,
"epoch": 0.5313376987839102,
"grad_norm": 0.02851773053407669,
"learning_rate": 0.0002,
"loss": 0.568551778793335,
"mean_token_accuracy": 0.7703356891870499,
"num_tokens": 2318761.0,
"step": 142
},
{
"entropy": 0.5524759441614151,
"epoch": 0.5350795135640786,
"grad_norm": 0.03449970856308937,
"learning_rate": 0.0002,
"loss": 0.5582625865936279,
"mean_token_accuracy": 0.7745357155799866,
"num_tokens": 2335227.0,
"step": 143
},
{
"entropy": 0.5538729876279831,
"epoch": 0.538821328344247,
"grad_norm": 0.036926597356796265,
"learning_rate": 0.0002,
"loss": 0.5551813840866089,
"mean_token_accuracy": 0.7734793871641159,
"num_tokens": 2351743.0,
"step": 144
},
{
"entropy": 0.556109830737114,
"epoch": 0.5425631431244153,
"grad_norm": 0.032143596559762955,
"learning_rate": 0.0002,
"loss": 0.5621770620346069,
"mean_token_accuracy": 0.7720111310482025,
"num_tokens": 2368312.0,
"step": 145
},
{
"entropy": 0.5528390407562256,
"epoch": 0.5463049579045838,
"grad_norm": 0.027878830209374428,
"learning_rate": 0.0002,
"loss": 0.551728367805481,
"mean_token_accuracy": 0.7765467911958694,
"num_tokens": 2384834.0,
"step": 146
},
{
"entropy": 0.569217711687088,
"epoch": 0.5500467726847521,
"grad_norm": 0.03398638963699341,
"learning_rate": 0.0002,
"loss": 0.5663697123527527,
"mean_token_accuracy": 0.7732102274894714,
"num_tokens": 2401144.0,
"step": 147
},
{
"entropy": 0.5385106950998306,
"epoch": 0.5537885874649204,
"grad_norm": 0.034567005932331085,
"learning_rate": 0.0002,
"loss": 0.5383309721946716,
"mean_token_accuracy": 0.781255692243576,
"num_tokens": 2417158.0,
"step": 148
},
{
"entropy": 0.5630964189767838,
"epoch": 0.5575304022450889,
"grad_norm": 0.029897838830947876,
"learning_rate": 0.0002,
"loss": 0.5677754282951355,
"mean_token_accuracy": 0.7685458660125732,
"num_tokens": 2433487.0,
"step": 149
},
{
"entropy": 0.5507898777723312,
"epoch": 0.5612722170252572,
"grad_norm": 0.02974529378116131,
"learning_rate": 0.0002,
"loss": 0.5534771680831909,
"mean_token_accuracy": 0.7748892605304718,
"num_tokens": 2449770.0,
"step": 150
},
{
"entropy": 0.5639528781175613,
"epoch": 0.5650140318054256,
"grad_norm": 0.03235238045454025,
"learning_rate": 0.0002,
"loss": 0.5681154131889343,
"mean_token_accuracy": 0.7700216770172119,
"num_tokens": 2466229.0,
"step": 151
},
{
"entropy": 0.5683706551790237,
"epoch": 0.568755846585594,
"grad_norm": 0.028963793069124222,
"learning_rate": 0.0002,
"loss": 0.569283127784729,
"mean_token_accuracy": 0.7688962519168854,
"num_tokens": 2482737.0,
"step": 152
},
{
"entropy": 0.5595172494649887,
"epoch": 0.5724976613657624,
"grad_norm": 0.02971002459526062,
"learning_rate": 0.0002,
"loss": 0.5543393492698669,
"mean_token_accuracy": 0.7762883901596069,
"num_tokens": 2499145.0,
"step": 153
},
{
"entropy": 0.55421943962574,
"epoch": 0.5762394761459307,
"grad_norm": 0.030361918732523918,
"learning_rate": 0.0002,
"loss": 0.5593795776367188,
"mean_token_accuracy": 0.7707612812519073,
"num_tokens": 2515460.0,
"step": 154
},
{
"entropy": 0.5604497343301773,
"epoch": 0.5799812909260992,
"grad_norm": 0.03249987214803696,
"learning_rate": 0.0002,
"loss": 0.559572696685791,
"mean_token_accuracy": 0.7736714631319046,
"num_tokens": 2531731.0,
"step": 155
},
{
"entropy": 0.5572012811899185,
"epoch": 0.5837231057062675,
"grad_norm": 0.028877906501293182,
"learning_rate": 0.0002,
"loss": 0.5557632446289062,
"mean_token_accuracy": 0.7749307751655579,
"num_tokens": 2547934.0,
"step": 156
},
{
"entropy": 0.5711070001125336,
"epoch": 0.587464920486436,
"grad_norm": 0.030351407825946808,
"learning_rate": 0.0002,
"loss": 0.5682122707366943,
"mean_token_accuracy": 0.7715558409690857,
"num_tokens": 2564252.0,
"step": 157
},
{
"entropy": 0.5656052529811859,
"epoch": 0.5912067352666043,
"grad_norm": 0.029292697086930275,
"learning_rate": 0.0002,
"loss": 0.5643728375434875,
"mean_token_accuracy": 0.7730299234390259,
"num_tokens": 2580465.0,
"step": 158
},
{
"entropy": 0.5565295219421387,
"epoch": 0.5949485500467727,
"grad_norm": 0.028714049607515335,
"learning_rate": 0.0002,
"loss": 0.5634271502494812,
"mean_token_accuracy": 0.7702697217464447,
"num_tokens": 2596985.0,
"step": 159
},
{
"entropy": 0.5631282031536102,
"epoch": 0.598690364826941,
"grad_norm": 0.030091576278209686,
"learning_rate": 0.0002,
"loss": 0.5721826553344727,
"mean_token_accuracy": 0.7689475417137146,
"num_tokens": 2613206.0,
"step": 160
},
{
"entropy": 0.5607286393642426,
"epoch": 0.6024321796071095,
"grad_norm": 0.03013305738568306,
"learning_rate": 0.0002,
"loss": 0.5609285235404968,
"mean_token_accuracy": 0.7740870416164398,
"num_tokens": 2629766.0,
"step": 161
},
{
"entropy": 0.5548760294914246,
"epoch": 0.6061739943872778,
"grad_norm": 0.03615036979317665,
"learning_rate": 0.0002,
"loss": 0.561907172203064,
"mean_token_accuracy": 0.7704312056303024,
"num_tokens": 2645841.0,
"step": 162
},
{
"entropy": 0.5578597337007523,
"epoch": 0.6099158091674463,
"grad_norm": 0.029693420976400375,
"learning_rate": 0.0002,
"loss": 0.5573199391365051,
"mean_token_accuracy": 0.7728497833013535,
"num_tokens": 2662175.0,
"step": 163
},
{
"entropy": 0.5612762272357941,
"epoch": 0.6136576239476146,
"grad_norm": 0.030115241184830666,
"learning_rate": 0.0002,
"loss": 0.5610560178756714,
"mean_token_accuracy": 0.7720479369163513,
"num_tokens": 2678456.0,
"step": 164
},
{
"entropy": 0.5692281126976013,
"epoch": 0.617399438727783,
"grad_norm": 0.030713427811861038,
"learning_rate": 0.0002,
"loss": 0.567272961139679,
"mean_token_accuracy": 0.7701284140348434,
"num_tokens": 2694886.0,
"step": 165
},
{
"entropy": 0.5571814477443695,
"epoch": 0.6211412535079514,
"grad_norm": 0.030081165954470634,
"learning_rate": 0.0002,
"loss": 0.5578005313873291,
"mean_token_accuracy": 0.7734847068786621,
"num_tokens": 2711066.0,
"step": 166
},
{
"entropy": 0.5701806098222733,
"epoch": 0.6248830682881198,
"grad_norm": 0.024519717320799828,
"learning_rate": 0.0002,
"loss": 0.5707820057868958,
"mean_token_accuracy": 0.765745609998703,
"num_tokens": 2727604.0,
"step": 167
},
{
"entropy": 0.546685203909874,
"epoch": 0.6286248830682881,
"grad_norm": 0.030948853120207787,
"learning_rate": 0.0002,
"loss": 0.5538927912712097,
"mean_token_accuracy": 0.7749418467283249,
"num_tokens": 2743937.0,
"step": 168
},
{
"entropy": 0.5537951737642288,
"epoch": 0.6323666978484564,
"grad_norm": 0.03693117946386337,
"learning_rate": 0.0002,
"loss": 0.5586614608764648,
"mean_token_accuracy": 0.7715347409248352,
"num_tokens": 2760525.0,
"step": 169
},
{
"entropy": 0.5430830717086792,
"epoch": 0.6361085126286249,
"grad_norm": 0.029782412573695183,
"learning_rate": 0.0002,
"loss": 0.5412864685058594,
"mean_token_accuracy": 0.7784539759159088,
"num_tokens": 2776721.0,
"step": 170
},
{
"entropy": 0.5351588726043701,
"epoch": 0.6398503274087932,
"grad_norm": 0.03263084217905998,
"learning_rate": 0.0002,
"loss": 0.5388463139533997,
"mean_token_accuracy": 0.781808465719223,
"num_tokens": 2792933.0,
"step": 171
},
{
"entropy": 0.5568130016326904,
"epoch": 0.6435921421889617,
"grad_norm": 0.031154213473200798,
"learning_rate": 0.0002,
"loss": 0.5626617670059204,
"mean_token_accuracy": 0.7720103710889816,
"num_tokens": 2809451.0,
"step": 172
},
{
"entropy": 0.5607169568538666,
"epoch": 0.64733395696913,
"grad_norm": 0.03371235355734825,
"learning_rate": 0.0002,
"loss": 0.5647063255310059,
"mean_token_accuracy": 0.7718498706817627,
"num_tokens": 2825932.0,
"step": 173
},
{
"entropy": 0.555529311299324,
"epoch": 0.6510757717492984,
"grad_norm": 0.030816521495580673,
"learning_rate": 0.0002,
"loss": 0.5564374327659607,
"mean_token_accuracy": 0.7758121490478516,
"num_tokens": 2842314.0,
"step": 174
},
{
"entropy": 0.5513110458850861,
"epoch": 0.6548175865294668,
"grad_norm": 0.02944033220410347,
"learning_rate": 0.0002,
"loss": 0.5524051189422607,
"mean_token_accuracy": 0.77901391685009,
"num_tokens": 2858741.0,
"step": 175
},
{
"entropy": 0.5570909082889557,
"epoch": 0.6585594013096352,
"grad_norm": 0.030563851818442345,
"learning_rate": 0.0002,
"loss": 0.552980899810791,
"mean_token_accuracy": 0.7785744369029999,
"num_tokens": 2874790.0,
"step": 176
},
{
"entropy": 0.5531197637319565,
"epoch": 0.6623012160898035,
"grad_norm": 0.026769133284687996,
"learning_rate": 0.0002,
"loss": 0.5503875017166138,
"mean_token_accuracy": 0.7756068855524063,
"num_tokens": 2890991.0,
"step": 177
},
{
"entropy": 0.5576685070991516,
"epoch": 0.666043030869972,
"grad_norm": 0.031243668869137764,
"learning_rate": 0.0002,
"loss": 0.5595083236694336,
"mean_token_accuracy": 0.7736776024103165,
"num_tokens": 2907372.0,
"step": 178
},
{
"entropy": 0.561943918466568,
"epoch": 0.6697848456501403,
"grad_norm": 0.029022254049777985,
"learning_rate": 0.0002,
"loss": 0.5671570301055908,
"mean_token_accuracy": 0.7722343951463699,
"num_tokens": 2923921.0,
"step": 179
},
{
"entropy": 0.5484957844018936,
"epoch": 0.6735266604303087,
"grad_norm": 0.030121706426143646,
"learning_rate": 0.0002,
"loss": 0.5546964406967163,
"mean_token_accuracy": 0.7751270681619644,
"num_tokens": 2940247.0,
"step": 180
},
{
"entropy": 0.554192379117012,
"epoch": 0.6772684752104771,
"grad_norm": 0.030762923881411552,
"learning_rate": 0.0002,
"loss": 0.5602478981018066,
"mean_token_accuracy": 0.7732126861810684,
"num_tokens": 2956527.0,
"step": 181
},
{
"entropy": 0.5684338361024857,
"epoch": 0.6810102899906455,
"grad_norm": 0.036885276436805725,
"learning_rate": 0.0002,
"loss": 0.5655561685562134,
"mean_token_accuracy": 0.769650399684906,
"num_tokens": 2972654.0,
"step": 182
},
{
"entropy": 0.5733159780502319,
"epoch": 0.6847521047708138,
"grad_norm": 0.03168238326907158,
"learning_rate": 0.0002,
"loss": 0.5698360800743103,
"mean_token_accuracy": 0.7700367867946625,
"num_tokens": 2989101.0,
"step": 183
},
{
"entropy": 0.556915819644928,
"epoch": 0.6884939195509823,
"grad_norm": 0.03091347962617874,
"learning_rate": 0.0002,
"loss": 0.5448244214057922,
"mean_token_accuracy": 0.7791603803634644,
"num_tokens": 3005335.0,
"step": 184
},
{
"entropy": 0.5490943491458893,
"epoch": 0.6922357343311506,
"grad_norm": 0.032818131148815155,
"learning_rate": 0.0002,
"loss": 0.5487899780273438,
"mean_token_accuracy": 0.7768953591585159,
"num_tokens": 3021621.0,
"step": 185
},
{
"entropy": 0.5296357423067093,
"epoch": 0.695977549111319,
"grad_norm": 0.03200080245733261,
"learning_rate": 0.0002,
"loss": 0.5386063456535339,
"mean_token_accuracy": 0.7796643227338791,
"num_tokens": 3037785.0,
"step": 186
},
{
"entropy": 0.5606788247823715,
"epoch": 0.6997193638914874,
"grad_norm": 0.03352601081132889,
"learning_rate": 0.0002,
"loss": 0.5720128417015076,
"mean_token_accuracy": 0.7676278650760651,
"num_tokens": 3053806.0,
"step": 187
},
{
"entropy": 0.5525215566158295,
"epoch": 0.7034611786716558,
"grad_norm": 0.03217856585979462,
"learning_rate": 0.0002,
"loss": 0.5599426627159119,
"mean_token_accuracy": 0.7706687748432159,
"num_tokens": 3070070.0,
"step": 188
},
{
"entropy": 0.5785647034645081,
"epoch": 0.7072029934518241,
"grad_norm": 0.03108043409883976,
"learning_rate": 0.0002,
"loss": 0.5753121376037598,
"mean_token_accuracy": 0.7674888074398041,
"num_tokens": 3086407.0,
"step": 189
},
{
"entropy": 0.572156235575676,
"epoch": 0.7109448082319925,
"grad_norm": 0.036022067070007324,
"learning_rate": 0.0002,
"loss": 0.5567526817321777,
"mean_token_accuracy": 0.7726783901453018,
"num_tokens": 3102575.0,
"step": 190
},
{
"entropy": 0.5531092137098312,
"epoch": 0.7146866230121609,
"grad_norm": 0.028695300221443176,
"learning_rate": 0.0002,
"loss": 0.545417070388794,
"mean_token_accuracy": 0.7790848612785339,
"num_tokens": 3118942.0,
"step": 191
},
{
"entropy": 0.542072057723999,
"epoch": 0.7184284377923292,
"grad_norm": 0.02768511138856411,
"learning_rate": 0.0002,
"loss": 0.5424788594245911,
"mean_token_accuracy": 0.7790149599313736,
"num_tokens": 3134996.0,
"step": 192
},
{
"entropy": 0.5440382957458496,
"epoch": 0.7221702525724977,
"grad_norm": 0.044699691236019135,
"learning_rate": 0.0002,
"loss": 0.5630879402160645,
"mean_token_accuracy": 0.7720867395401001,
"num_tokens": 3151144.0,
"step": 193
},
{
"entropy": 0.5484438389539719,
"epoch": 0.725912067352666,
"grad_norm": 0.033284809440374374,
"learning_rate": 0.0002,
"loss": 0.5586625933647156,
"mean_token_accuracy": 0.7742896676063538,
"num_tokens": 3167431.0,
"step": 194
},
{
"entropy": 0.5585122853517532,
"epoch": 0.7296538821328344,
"grad_norm": 0.029940789565443993,
"learning_rate": 0.0002,
"loss": 0.5640571117401123,
"mean_token_accuracy": 0.7736721932888031,
"num_tokens": 3183584.0,
"step": 195
},
{
"entropy": 0.5803828984498978,
"epoch": 0.7333956969130028,
"grad_norm": 0.03922640532255173,
"learning_rate": 0.0002,
"loss": 0.5756028294563293,
"mean_token_accuracy": 0.7650134712457657,
"num_tokens": 3199936.0,
"step": 196
},
{
"entropy": 0.5695553570985794,
"epoch": 0.7371375116931712,
"grad_norm": 0.02914128266274929,
"learning_rate": 0.0002,
"loss": 0.5552971959114075,
"mean_token_accuracy": 0.7738740146160126,
"num_tokens": 3216327.0,
"step": 197
},
{
"entropy": 0.5402019023895264,
"epoch": 0.7408793264733395,
"grad_norm": 0.02753686159849167,
"learning_rate": 0.0002,
"loss": 0.5362023711204529,
"mean_token_accuracy": 0.7808489948511124,
"num_tokens": 3232411.0,
"step": 198
},
{
"entropy": 0.5661509037017822,
"epoch": 0.744621141253508,
"grad_norm": 0.029173044487833977,
"learning_rate": 0.0002,
"loss": 0.5666989088058472,
"mean_token_accuracy": 0.7697858512401581,
"num_tokens": 3248516.0,
"step": 199
},
{
"entropy": 0.5394262075424194,
"epoch": 0.7483629560336763,
"grad_norm": 0.03222000226378441,
"learning_rate": 0.0002,
"loss": 0.5493192076683044,
"mean_token_accuracy": 0.7756218761205673,
"num_tokens": 3264724.0,
"step": 200
},
{
"entropy": 0.5624162256717682,
"epoch": 0.7521047708138447,
"grad_norm": 0.03587524592876434,
"learning_rate": 0.0002,
"loss": 0.5728610157966614,
"mean_token_accuracy": 0.7661173194646835,
"num_tokens": 3280953.0,
"step": 201
},
{
"entropy": 0.5574640333652496,
"epoch": 0.7558465855940131,
"grad_norm": 0.030263541266322136,
"learning_rate": 0.0002,
"loss": 0.5545740127563477,
"mean_token_accuracy": 0.7747018188238144,
"num_tokens": 3297315.0,
"step": 202
},
{
"entropy": 0.5598777681589127,
"epoch": 0.7595884003741815,
"grad_norm": 0.0284356027841568,
"learning_rate": 0.0002,
"loss": 0.5577300190925598,
"mean_token_accuracy": 0.7724722474813461,
"num_tokens": 3313688.0,
"step": 203
},
{
"entropy": 0.5658386498689651,
"epoch": 0.7633302151543498,
"grad_norm": 0.03470136970281601,
"learning_rate": 0.0002,
"loss": 0.5591439008712769,
"mean_token_accuracy": 0.7761197835206985,
"num_tokens": 3329826.0,
"step": 204
},
{
"entropy": 0.5585865527391434,
"epoch": 0.7670720299345183,
"grad_norm": 0.027583830058574677,
"learning_rate": 0.0002,
"loss": 0.5561191439628601,
"mean_token_accuracy": 0.7717861980199814,
"num_tokens": 3346401.0,
"step": 205
},
{
"entropy": 0.5518056601285934,
"epoch": 0.7708138447146866,
"grad_norm": 0.034380193799734116,
"learning_rate": 0.0002,
"loss": 0.56368488073349,
"mean_token_accuracy": 0.7690371572971344,
"num_tokens": 3362862.0,
"step": 206
},
{
"entropy": 0.5423950105905533,
"epoch": 0.774555659494855,
"grad_norm": 0.027748677879571915,
"learning_rate": 0.0002,
"loss": 0.5500733256340027,
"mean_token_accuracy": 0.7782405465841293,
"num_tokens": 3379133.0,
"step": 207
},
{
"entropy": 0.5392836630344391,
"epoch": 0.7782974742750234,
"grad_norm": 0.030424097552895546,
"learning_rate": 0.0002,
"loss": 0.5452281832695007,
"mean_token_accuracy": 0.7790029048919678,
"num_tokens": 3395406.0,
"step": 208
},
{
"entropy": 0.5665347129106522,
"epoch": 0.7820392890551918,
"grad_norm": 0.02836509235203266,
"learning_rate": 0.0002,
"loss": 0.5655370950698853,
"mean_token_accuracy": 0.768405556678772,
"num_tokens": 3411686.0,
"step": 209
},
{
"entropy": 0.5624722540378571,
"epoch": 0.7857811038353602,
"grad_norm": 0.028227761387825012,
"learning_rate": 0.0002,
"loss": 0.5540167689323425,
"mean_token_accuracy": 0.7740924656391144,
"num_tokens": 3427914.0,
"step": 210
},
{
"entropy": 0.555148720741272,
"epoch": 0.7895229186155285,
"grad_norm": 0.03054502047598362,
"learning_rate": 0.0002,
"loss": 0.5572685599327087,
"mean_token_accuracy": 0.7746326923370361,
"num_tokens": 3444170.0,
"step": 211
},
{
"entropy": 0.5449056923389435,
"epoch": 0.7932647333956969,
"grad_norm": 0.03224708139896393,
"learning_rate": 0.0002,
"loss": 0.5572819113731384,
"mean_token_accuracy": 0.7724157273769379,
"num_tokens": 3460305.0,
"step": 212
},
{
"entropy": 0.5533578097820282,
"epoch": 0.7970065481758652,
"grad_norm": 0.031917959451675415,
"learning_rate": 0.0002,
"loss": 0.557055652141571,
"mean_token_accuracy": 0.7715483158826828,
"num_tokens": 3476772.0,
"step": 213
},
{
"entropy": 0.5611972808837891,
"epoch": 0.8007483629560337,
"grad_norm": 0.031701650470495224,
"learning_rate": 0.0002,
"loss": 0.5658101439476013,
"mean_token_accuracy": 0.7677106559276581,
"num_tokens": 3493499.0,
"step": 214
},
{
"entropy": 0.5572656095027924,
"epoch": 0.804490177736202,
"grad_norm": 0.02719227597117424,
"learning_rate": 0.0002,
"loss": 0.5549203157424927,
"mean_token_accuracy": 0.774790808558464,
"num_tokens": 3509811.0,
"step": 215
},
{
"entropy": 0.5471508800983429,
"epoch": 0.8082319925163705,
"grad_norm": 0.025823380798101425,
"learning_rate": 0.0002,
"loss": 0.5506555438041687,
"mean_token_accuracy": 0.7770570069551468,
"num_tokens": 3526157.0,
"step": 216
},
{
"entropy": 0.5587919056415558,
"epoch": 0.8119738072965388,
"grad_norm": 0.027526551857590675,
"learning_rate": 0.0002,
"loss": 0.5553531050682068,
"mean_token_accuracy": 0.7733194231987,
"num_tokens": 3542353.0,
"step": 217
},
{
"entropy": 0.5590764433145523,
"epoch": 0.8157156220767072,
"grad_norm": 0.027686061337590218,
"learning_rate": 0.0002,
"loss": 0.553832471370697,
"mean_token_accuracy": 0.7726568281650543,
"num_tokens": 3558723.0,
"step": 218
},
{
"entropy": 0.5684271901845932,
"epoch": 0.8194574368568756,
"grad_norm": 0.027071600779891014,
"learning_rate": 0.0002,
"loss": 0.5699101686477661,
"mean_token_accuracy": 0.7687496989965439,
"num_tokens": 3575290.0,
"step": 219
},
{
"entropy": 0.5384210348129272,
"epoch": 0.823199251637044,
"grad_norm": 0.030755044892430305,
"learning_rate": 0.0002,
"loss": 0.5439192652702332,
"mean_token_accuracy": 0.7772842049598694,
"num_tokens": 3591563.0,
"step": 220
},
{
"entropy": 0.524935394525528,
"epoch": 0.8269410664172123,
"grad_norm": 0.02740432508289814,
"learning_rate": 0.0002,
"loss": 0.529310941696167,
"mean_token_accuracy": 0.784336507320404,
"num_tokens": 3607814.0,
"step": 221
},
{
"entropy": 0.5532049238681793,
"epoch": 0.8306828811973808,
"grad_norm": 0.034083202481269836,
"learning_rate": 0.0002,
"loss": 0.5611142516136169,
"mean_token_accuracy": 0.7706895172595978,
"num_tokens": 3624047.0,
"step": 222
},
{
"entropy": 0.5380610376596451,
"epoch": 0.8344246959775491,
"grad_norm": 0.029454410076141357,
"learning_rate": 0.0002,
"loss": 0.5438103675842285,
"mean_token_accuracy": 0.7790344655513763,
"num_tokens": 3640194.0,
"step": 223
},
{
"entropy": 0.5661721527576447,
"epoch": 0.8381665107577175,
"grad_norm": 0.029397280886769295,
"learning_rate": 0.0002,
"loss": 0.558972954750061,
"mean_token_accuracy": 0.7724218964576721,
"num_tokens": 3656608.0,
"step": 224
},
{
"entropy": 0.5514093935489655,
"epoch": 0.8419083255378859,
"grad_norm": 0.029793422669172287,
"learning_rate": 0.0002,
"loss": 0.550917387008667,
"mean_token_accuracy": 0.7733565121889114,
"num_tokens": 3672523.0,
"step": 225
},
{
"entropy": 0.5508118569850922,
"epoch": 0.8456501403180543,
"grad_norm": 0.030908716842532158,
"learning_rate": 0.0002,
"loss": 0.5537383556365967,
"mean_token_accuracy": 0.7725334316492081,
"num_tokens": 3688658.0,
"step": 226
},
{
"entropy": 0.5521706193685532,
"epoch": 0.8493919550982226,
"grad_norm": 0.03186751529574394,
"learning_rate": 0.0002,
"loss": 0.5577634572982788,
"mean_token_accuracy": 0.7732146978378296,
"num_tokens": 3704875.0,
"step": 227
},
{
"entropy": 0.543274000287056,
"epoch": 0.8531337698783911,
"grad_norm": 0.030743638053536415,
"learning_rate": 0.0002,
"loss": 0.5453194379806519,
"mean_token_accuracy": 0.7776961177587509,
"num_tokens": 3720936.0,
"step": 228
},
{
"entropy": 0.5507763624191284,
"epoch": 0.8568755846585594,
"grad_norm": 0.030140401795506477,
"learning_rate": 0.0002,
"loss": 0.5504044890403748,
"mean_token_accuracy": 0.7767813801765442,
"num_tokens": 3737279.0,
"step": 229
},
{
"entropy": 0.5462870597839355,
"epoch": 0.8606173994387278,
"grad_norm": 0.026473646983504295,
"learning_rate": 0.0002,
"loss": 0.5481734275817871,
"mean_token_accuracy": 0.7772915065288544,
"num_tokens": 3753415.0,
"step": 230
},
{
"entropy": 0.5563444495201111,
"epoch": 0.8643592142188962,
"grad_norm": 0.02921387553215027,
"learning_rate": 0.0002,
"loss": 0.5546942949295044,
"mean_token_accuracy": 0.7731446027755737,
"num_tokens": 3769803.0,
"step": 231
},
{
"entropy": 0.559598296880722,
"epoch": 0.8681010289990645,
"grad_norm": 0.03972897306084633,
"learning_rate": 0.0002,
"loss": 0.5572680234909058,
"mean_token_accuracy": 0.773430734872818,
"num_tokens": 3785892.0,
"step": 232
},
{
"entropy": 0.539952963590622,
"epoch": 0.8718428437792329,
"grad_norm": 0.028981171548366547,
"learning_rate": 0.0002,
"loss": 0.5390475988388062,
"mean_token_accuracy": 0.7811980247497559,
"num_tokens": 3802184.0,
"step": 233
},
{
"entropy": 0.5387761145830154,
"epoch": 0.8755846585594013,
"grad_norm": 0.026351595297455788,
"learning_rate": 0.0002,
"loss": 0.5407798290252686,
"mean_token_accuracy": 0.7787132114171982,
"num_tokens": 3818418.0,
"step": 234
},
{
"entropy": 0.5693282037973404,
"epoch": 0.8793264733395697,
"grad_norm": 0.033158186823129654,
"learning_rate": 0.0002,
"loss": 0.5714267492294312,
"mean_token_accuracy": 0.7690801620483398,
"num_tokens": 3834874.0,
"step": 235
},
{
"entropy": 0.5534514784812927,
"epoch": 0.883068288119738,
"grad_norm": 0.0280459001660347,
"learning_rate": 0.0002,
"loss": 0.5574108362197876,
"mean_token_accuracy": 0.7764205187559128,
"num_tokens": 3851261.0,
"step": 236
},
{
"entropy": 0.5554600358009338,
"epoch": 0.8868101028999065,
"grad_norm": 0.027284014970064163,
"learning_rate": 0.0002,
"loss": 0.5592954754829407,
"mean_token_accuracy": 0.7728679180145264,
"num_tokens": 3867826.0,
"step": 237
},
{
"entropy": 0.5611312091350555,
"epoch": 0.8905519176800748,
"grad_norm": 0.027675554156303406,
"learning_rate": 0.0002,
"loss": 0.5633160471916199,
"mean_token_accuracy": 0.7716223746538162,
"num_tokens": 3884424.0,
"step": 238
},
{
"entropy": 0.5698042660951614,
"epoch": 0.8942937324602432,
"grad_norm": 0.02734820544719696,
"learning_rate": 0.0002,
"loss": 0.5722016096115112,
"mean_token_accuracy": 0.767684668302536,
"num_tokens": 3900993.0,
"step": 239
},
{
"entropy": 0.5487347990274429,
"epoch": 0.8980355472404116,
"grad_norm": 0.030463971197605133,
"learning_rate": 0.0002,
"loss": 0.5459187626838684,
"mean_token_accuracy": 0.7788650244474411,
"num_tokens": 3917455.0,
"step": 240
},
{
"entropy": 0.5684353709220886,
"epoch": 0.90177736202058,
"grad_norm": 0.028492476791143417,
"learning_rate": 0.0002,
"loss": 0.5674321055412292,
"mean_token_accuracy": 0.7663144171237946,
"num_tokens": 3934049.0,
"step": 241
},
{
"entropy": 0.5689758509397507,
"epoch": 0.9055191768007483,
"grad_norm": 0.02926958166062832,
"learning_rate": 0.0002,
"loss": 0.5745148658752441,
"mean_token_accuracy": 0.7678453773260117,
"num_tokens": 3950533.0,
"step": 242
},
{
"entropy": 0.549301877617836,
"epoch": 0.9092609915809168,
"grad_norm": 0.03295575827360153,
"learning_rate": 0.0002,
"loss": 0.5597534775733948,
"mean_token_accuracy": 0.7714426666498184,
"num_tokens": 3966986.0,
"step": 243
},
{
"entropy": 0.5338816940784454,
"epoch": 0.9130028063610851,
"grad_norm": 0.030206363648176193,
"learning_rate": 0.0002,
"loss": 0.5326100587844849,
"mean_token_accuracy": 0.7836355268955231,
"num_tokens": 3983434.0,
"step": 244
},
{
"entropy": 0.5674562901258469,
"epoch": 0.9167446211412535,
"grad_norm": 0.026608271524310112,
"learning_rate": 0.0002,
"loss": 0.5644797682762146,
"mean_token_accuracy": 0.7716486304998398,
"num_tokens": 3999756.0,
"step": 245
},
{
"entropy": 0.5831885486841202,
"epoch": 0.9204864359214219,
"grad_norm": 0.03711472824215889,
"learning_rate": 0.0002,
"loss": 0.5693003535270691,
"mean_token_accuracy": 0.7677270174026489,
"num_tokens": 4016084.0,
"step": 246
},
{
"entropy": 0.5590741783380508,
"epoch": 0.9242282507015903,
"grad_norm": 0.027594709768891335,
"learning_rate": 0.0002,
"loss": 0.5590558052062988,
"mean_token_accuracy": 0.7732381373643875,
"num_tokens": 4032464.0,
"step": 247
},
{
"entropy": 0.5414686352014542,
"epoch": 0.9279700654817586,
"grad_norm": 0.037102047353982925,
"learning_rate": 0.0002,
"loss": 0.5545523762702942,
"mean_token_accuracy": 0.775322362780571,
"num_tokens": 4048853.0,
"step": 248
},
{
"entropy": 0.5506337434053421,
"epoch": 0.9317118802619271,
"grad_norm": 0.03612777963280678,
"learning_rate": 0.0002,
"loss": 0.5673890709877014,
"mean_token_accuracy": 0.7688823938369751,
"num_tokens": 4065031.0,
"step": 249
},
{
"entropy": 0.542187824845314,
"epoch": 0.9354536950420954,
"grad_norm": 0.031235933303833008,
"learning_rate": 0.0002,
"loss": 0.5464475750923157,
"mean_token_accuracy": 0.7789596170186996,
"num_tokens": 4081635.0,
"step": 250
},
{
"entropy": 0.5568290203809738,
"epoch": 0.9391955098222639,
"grad_norm": 0.027413224801421165,
"learning_rate": 0.0002,
"loss": 0.5562602877616882,
"mean_token_accuracy": 0.7737423926591873,
"num_tokens": 4098011.0,
"step": 251
},
{
"entropy": 0.558889165520668,
"epoch": 0.9429373246024322,
"grad_norm": 0.029295574873685837,
"learning_rate": 0.0002,
"loss": 0.5547473430633545,
"mean_token_accuracy": 0.7740904539823532,
"num_tokens": 4114268.0,
"step": 252
},
{
"entropy": 0.5764719247817993,
"epoch": 0.9466791393826005,
"grad_norm": 0.03225071728229523,
"learning_rate": 0.0002,
"loss": 0.5729030966758728,
"mean_token_accuracy": 0.7659229934215546,
"num_tokens": 4130552.0,
"step": 253
},
{
"entropy": 0.5606585443019867,
"epoch": 0.950420954162769,
"grad_norm": 0.02834608033299446,
"learning_rate": 0.0002,
"loss": 0.5623061656951904,
"mean_token_accuracy": 0.7708321511745453,
"num_tokens": 4146844.0,
"step": 254
},
{
"entropy": 0.5444774627685547,
"epoch": 0.9541627689429373,
"grad_norm": 0.03255439177155495,
"learning_rate": 0.0002,
"loss": 0.5524637699127197,
"mean_token_accuracy": 0.7744161784648895,
"num_tokens": 4163084.0,
"step": 255
},
{
"entropy": 0.5229519456624985,
"epoch": 0.9579045837231057,
"grad_norm": 0.027845216915011406,
"learning_rate": 0.0002,
"loss": 0.5284432768821716,
"mean_token_accuracy": 0.785067692399025,
"num_tokens": 4179192.0,
"step": 256
},
{
"entropy": 0.5287301391363144,
"epoch": 0.961646398503274,
"grad_norm": 0.03511723130941391,
"learning_rate": 0.0002,
"loss": 0.5364463329315186,
"mean_token_accuracy": 0.7782928943634033,
"num_tokens": 4195604.0,
"step": 257
},
{
"entropy": 0.5621770173311234,
"epoch": 0.9653882132834425,
"grad_norm": 0.02962673269212246,
"learning_rate": 0.0002,
"loss": 0.5591749548912048,
"mean_token_accuracy": 0.7710652500391006,
"num_tokens": 4211743.0,
"step": 258
},
{
"entropy": 0.5636511147022247,
"epoch": 0.9691300280636108,
"grad_norm": 0.04087170958518982,
"learning_rate": 0.0002,
"loss": 0.5626160502433777,
"mean_token_accuracy": 0.771452471613884,
"num_tokens": 4228198.0,
"step": 259
},
{
"entropy": 0.5522175580263138,
"epoch": 0.9728718428437793,
"grad_norm": 0.029492903500795364,
"learning_rate": 0.0002,
"loss": 0.5516583323478699,
"mean_token_accuracy": 0.7742890268564224,
"num_tokens": 4244501.0,
"step": 260
},
{
"entropy": 0.5577979236841202,
"epoch": 0.9766136576239476,
"grad_norm": 0.02768765017390251,
"learning_rate": 0.0002,
"loss": 0.5573770403862,
"mean_token_accuracy": 0.7728449106216431,
"num_tokens": 4260800.0,
"step": 261
},
{
"entropy": 0.5833724588155746,
"epoch": 0.980355472404116,
"grad_norm": 0.030149318277835846,
"learning_rate": 0.0002,
"loss": 0.5790048837661743,
"mean_token_accuracy": 0.7645868510007858,
"num_tokens": 4277242.0,
"step": 262
},
{
"entropy": 0.5686817467212677,
"epoch": 0.9840972871842844,
"grad_norm": 0.03200973942875862,
"learning_rate": 0.0002,
"loss": 0.5704789161682129,
"mean_token_accuracy": 0.7688680738210678,
"num_tokens": 4293490.0,
"step": 263
},
{
"entropy": 0.5522599965333939,
"epoch": 0.9878391019644528,
"grad_norm": 0.02735111489892006,
"learning_rate": 0.0002,
"loss": 0.5483981370925903,
"mean_token_accuracy": 0.7776431888341904,
"num_tokens": 4309713.0,
"step": 264
},
{
"entropy": 0.5510786324739456,
"epoch": 0.9915809167446211,
"grad_norm": 0.027222398668527603,
"learning_rate": 0.0002,
"loss": 0.5519858598709106,
"mean_token_accuracy": 0.7740090191364288,
"num_tokens": 4325978.0,
"step": 265
},
{
"entropy": 0.5590775907039642,
"epoch": 0.9953227315247896,
"grad_norm": 0.030459199100732803,
"learning_rate": 0.0002,
"loss": 0.5638831853866577,
"mean_token_accuracy": 0.7691285163164139,
"num_tokens": 4342145.0,
"step": 266
},
{
"entropy": 0.5396278500556946,
"epoch": 0.9990645463049579,
"grad_norm": 0.029775220900774002,
"learning_rate": 0.0002,
"loss": 0.551082968711853,
"mean_token_accuracy": 0.777344822883606,
"num_tokens": 4358366.0,
"step": 267
},
{
"entropy": 0.5386617183685303,
"epoch": 1.0,
"grad_norm": 0.05107063427567482,
"learning_rate": 0.0002,
"loss": 0.56319260597229,
"mean_token_accuracy": 0.7758007049560547,
"num_tokens": 4359498.0,
"step": 268
},
{
"entropy": 0.5456036031246185,
"epoch": 1.0037418147801684,
"grad_norm": 0.034975565969944,
"learning_rate": 0.0002,
"loss": 0.5444031953811646,
"mean_token_accuracy": 0.7782553881406784,
"num_tokens": 4375874.0,
"step": 269
},
{
"entropy": 0.554328516125679,
"epoch": 1.0074836295603367,
"grad_norm": 0.030762778595089912,
"learning_rate": 0.0002,
"loss": 0.5493590235710144,
"mean_token_accuracy": 0.7769091576337814,
"num_tokens": 4392309.0,
"step": 270
},
{
"entropy": 0.544586181640625,
"epoch": 1.011225444340505,
"grad_norm": 0.027982227504253387,
"learning_rate": 0.0002,
"loss": 0.5366782546043396,
"mean_token_accuracy": 0.7823053598403931,
"num_tokens": 4408365.0,
"step": 271
},
{
"entropy": 0.5558233559131622,
"epoch": 1.0149672591206735,
"grad_norm": 0.029144754633307457,
"learning_rate": 0.0002,
"loss": 0.5538930296897888,
"mean_token_accuracy": 0.7747932523488998,
"num_tokens": 4424690.0,
"step": 272
},
{
"entropy": 0.5521434098482132,
"epoch": 1.018709073900842,
"grad_norm": 0.031630512326955795,
"learning_rate": 0.0002,
"loss": 0.5583912134170532,
"mean_token_accuracy": 0.773905873298645,
"num_tokens": 4441085.0,
"step": 273
},
{
"entropy": 0.5409824252128601,
"epoch": 1.0224508886810102,
"grad_norm": 0.03298581764101982,
"learning_rate": 0.0002,
"loss": 0.5436674356460571,
"mean_token_accuracy": 0.7784581035375595,
"num_tokens": 4457337.0,
"step": 274
},
{
"entropy": 0.5269698351621628,
"epoch": 1.0261927034611786,
"grad_norm": 0.03633208945393562,
"learning_rate": 0.0002,
"loss": 0.530029833316803,
"mean_token_accuracy": 0.786719799041748,
"num_tokens": 4473532.0,
"step": 275
},
{
"entropy": 0.572344645857811,
"epoch": 1.029934518241347,
"grad_norm": 0.03007793240249157,
"learning_rate": 0.0002,
"loss": 0.5664374828338623,
"mean_token_accuracy": 0.768335297703743,
"num_tokens": 4489887.0,
"step": 276
},
{
"entropy": 0.5445250272750854,
"epoch": 1.0336763330215155,
"grad_norm": 0.027243314310908318,
"learning_rate": 0.0002,
"loss": 0.5401641726493835,
"mean_token_accuracy": 0.7808064818382263,
"num_tokens": 4505862.0,
"step": 277
},
{
"entropy": 0.5509742796421051,
"epoch": 1.0374181478016837,
"grad_norm": 0.032545655965805054,
"learning_rate": 0.0002,
"loss": 0.5521466732025146,
"mean_token_accuracy": 0.7762803286314011,
"num_tokens": 4522135.0,
"step": 278
},
{
"entropy": 0.5502415001392365,
"epoch": 1.0411599625818522,
"grad_norm": 0.030756743624806404,
"learning_rate": 0.0002,
"loss": 0.5506622195243835,
"mean_token_accuracy": 0.7758103907108307,
"num_tokens": 4538594.0,
"step": 279
},
{
"entropy": 0.5414353311061859,
"epoch": 1.0449017773620206,
"grad_norm": 0.030841531231999397,
"learning_rate": 0.0002,
"loss": 0.5470583438873291,
"mean_token_accuracy": 0.7776292413473129,
"num_tokens": 4555119.0,
"step": 280
},
{
"entropy": 0.5487425029277802,
"epoch": 1.048643592142189,
"grad_norm": 0.03335481509566307,
"learning_rate": 0.0002,
"loss": 0.5511153936386108,
"mean_token_accuracy": 0.7753961086273193,
"num_tokens": 4571676.0,
"step": 281
},
{
"entropy": 0.5364932715892792,
"epoch": 1.0523854069223573,
"grad_norm": 0.03433723747730255,
"learning_rate": 0.0002,
"loss": 0.5388063788414001,
"mean_token_accuracy": 0.7791535705327988,
"num_tokens": 4587803.0,
"step": 282
},
{
"entropy": 0.5218682438135147,
"epoch": 1.0561272217025257,
"grad_norm": 0.03049764409661293,
"learning_rate": 0.0002,
"loss": 0.5254226922988892,
"mean_token_accuracy": 0.7847179919481277,
"num_tokens": 4603856.0,
"step": 283
},
{
"entropy": 0.5384526699781418,
"epoch": 1.0598690364826941,
"grad_norm": 0.02954094670712948,
"learning_rate": 0.0002,
"loss": 0.5442904829978943,
"mean_token_accuracy": 0.7810987532138824,
"num_tokens": 4619957.0,
"step": 284
},
{
"entropy": 0.5648271888494492,
"epoch": 1.0636108512628626,
"grad_norm": 0.029273223131895065,
"learning_rate": 0.0002,
"loss": 0.565851628780365,
"mean_token_accuracy": 0.7694031447172165,
"num_tokens": 4636366.0,
"step": 285
},
{
"entropy": 0.5445346832275391,
"epoch": 1.0673526660430308,
"grad_norm": 0.04154031351208687,
"learning_rate": 0.0002,
"loss": 0.5437869429588318,
"mean_token_accuracy": 0.7786456942558289,
"num_tokens": 4652409.0,
"step": 286
},
{
"entropy": 0.5666444450616837,
"epoch": 1.0710944808231992,
"grad_norm": 0.027274858206510544,
"learning_rate": 0.0002,
"loss": 0.5619191527366638,
"mean_token_accuracy": 0.7713726609945297,
"num_tokens": 4668805.0,
"step": 287
},
{
"entropy": 0.5560373812913895,
"epoch": 1.0748362956033677,
"grad_norm": 0.03042946569621563,
"learning_rate": 0.0002,
"loss": 0.5536933541297913,
"mean_token_accuracy": 0.7707109302282333,
"num_tokens": 4685281.0,
"step": 288
},
{
"entropy": 0.5522497296333313,
"epoch": 1.078578110383536,
"grad_norm": 0.026407577097415924,
"learning_rate": 0.0002,
"loss": 0.554541826248169,
"mean_token_accuracy": 0.7723578214645386,
"num_tokens": 4701429.0,
"step": 289
},
{
"entropy": 0.5493666082620621,
"epoch": 1.0823199251637043,
"grad_norm": 0.03922448307275772,
"learning_rate": 0.0002,
"loss": 0.5535799860954285,
"mean_token_accuracy": 0.7752141654491425,
"num_tokens": 4717787.0,
"step": 290
},
{
"entropy": 0.5579231083393097,
"epoch": 1.0860617399438728,
"grad_norm": 0.029233764857053757,
"learning_rate": 0.0002,
"loss": 0.5569900274276733,
"mean_token_accuracy": 0.7733462601900101,
"num_tokens": 4734144.0,
"step": 291
},
{
"entropy": 0.544972226023674,
"epoch": 1.0898035547240412,
"grad_norm": 0.030961396172642708,
"learning_rate": 0.0002,
"loss": 0.5413874983787537,
"mean_token_accuracy": 0.7801695913076401,
"num_tokens": 4750509.0,
"step": 292
},
{
"entropy": 0.550209566950798,
"epoch": 1.0935453695042094,
"grad_norm": 0.03252837061882019,
"learning_rate": 0.0002,
"loss": 0.5514767169952393,
"mean_token_accuracy": 0.7740490287542343,
"num_tokens": 4766708.0,
"step": 293
},
{
"entropy": 0.545928418636322,
"epoch": 1.0972871842843779,
"grad_norm": 0.02844078466296196,
"learning_rate": 0.0002,
"loss": 0.5454370975494385,
"mean_token_accuracy": 0.7802854478359222,
"num_tokens": 4783110.0,
"step": 294
},
{
"entropy": 0.550410658121109,
"epoch": 1.1010289990645463,
"grad_norm": 0.0395023413002491,
"learning_rate": 0.0002,
"loss": 0.5610683560371399,
"mean_token_accuracy": 0.7725012004375458,
"num_tokens": 4799492.0,
"step": 295
},
{
"entropy": 0.5291745737195015,
"epoch": 1.1047708138447148,
"grad_norm": 0.028669750317931175,
"learning_rate": 0.0002,
"loss": 0.5332962274551392,
"mean_token_accuracy": 0.7820043116807938,
"num_tokens": 4815864.0,
"step": 296
},
{
"entropy": 0.5454689562320709,
"epoch": 1.108512628624883,
"grad_norm": 0.02827887050807476,
"learning_rate": 0.0002,
"loss": 0.5511517524719238,
"mean_token_accuracy": 0.7747574001550674,
"num_tokens": 4832267.0,
"step": 297
},
{
"entropy": 0.5417342334985733,
"epoch": 1.1122544434050514,
"grad_norm": 0.026385854929685593,
"learning_rate": 0.0002,
"loss": 0.5412203669548035,
"mean_token_accuracy": 0.780335083603859,
"num_tokens": 4848653.0,
"step": 298
},
{
"entropy": 0.5629215389490128,
"epoch": 1.1159962581852199,
"grad_norm": 0.030779633671045303,
"learning_rate": 0.0002,
"loss": 0.5625781416893005,
"mean_token_accuracy": 0.7703746110200882,
"num_tokens": 4865192.0,
"step": 299
},
{
"entropy": 0.5278398767113686,
"epoch": 1.1197380729653883,
"grad_norm": 0.02865917608141899,
"learning_rate": 0.0002,
"loss": 0.5246303081512451,
"mean_token_accuracy": 0.7881903648376465,
"num_tokens": 4881315.0,
"step": 300
},
{
"entropy": 0.5360843688249588,
"epoch": 1.1234798877455565,
"grad_norm": 0.02863423153758049,
"learning_rate": 0.0002,
"loss": 0.5405621528625488,
"mean_token_accuracy": 0.7765359878540039,
"num_tokens": 4897572.0,
"step": 301
},
{
"entropy": 0.5270702391862869,
"epoch": 1.127221702525725,
"grad_norm": 0.027807647362351418,
"learning_rate": 0.0002,
"loss": 0.5271122455596924,
"mean_token_accuracy": 0.7830122262239456,
"num_tokens": 4913718.0,
"step": 302
},
{
"entropy": 0.5291232466697693,
"epoch": 1.1309635173058934,
"grad_norm": 0.03156433254480362,
"learning_rate": 0.0002,
"loss": 0.5328850746154785,
"mean_token_accuracy": 0.7853387147188187,
"num_tokens": 4930253.0,
"step": 303
},
{
"entropy": 0.5468447655439377,
"epoch": 1.1347053320860618,
"grad_norm": 0.033552881330251694,
"learning_rate": 0.0002,
"loss": 0.5545834898948669,
"mean_token_accuracy": 0.7716294378042221,
"num_tokens": 4946382.0,
"step": 304
},
{
"entropy": 0.5517953187227249,
"epoch": 1.13844714686623,
"grad_norm": 0.030561944469809532,
"learning_rate": 0.0002,
"loss": 0.5540879964828491,
"mean_token_accuracy": 0.7759448438882828,
"num_tokens": 4962652.0,
"step": 305
},
{
"entropy": 0.544833779335022,
"epoch": 1.1421889616463985,
"grad_norm": 0.030571507290005684,
"learning_rate": 0.0002,
"loss": 0.5443115234375,
"mean_token_accuracy": 0.7782190293073654,
"num_tokens": 4978959.0,
"step": 306
},
{
"entropy": 0.5475269705057144,
"epoch": 1.145930776426567,
"grad_norm": 0.0296931229531765,
"learning_rate": 0.0002,
"loss": 0.541431188583374,
"mean_token_accuracy": 0.7753712236881256,
"num_tokens": 4995357.0,
"step": 307
},
{
"entropy": 0.5446912348270416,
"epoch": 1.1496725912067354,
"grad_norm": 0.025116927921772003,
"learning_rate": 0.0002,
"loss": 0.5437968373298645,
"mean_token_accuracy": 0.7787619084119797,
"num_tokens": 5011590.0,
"step": 308
},
{
"entropy": 0.5292570069432259,
"epoch": 1.1534144059869036,
"grad_norm": 0.027315491810441017,
"learning_rate": 0.0002,
"loss": 0.5277875065803528,
"mean_token_accuracy": 0.7833113670349121,
"num_tokens": 5027873.0,
"step": 309
},
{
"entropy": 0.5242628306150436,
"epoch": 1.157156220767072,
"grad_norm": 0.027830073609948158,
"learning_rate": 0.0002,
"loss": 0.523070752620697,
"mean_token_accuracy": 0.7879849672317505,
"num_tokens": 5044361.0,
"step": 310
},
{
"entropy": 0.536102682352066,
"epoch": 1.1608980355472405,
"grad_norm": 0.031033379957079887,
"learning_rate": 0.0002,
"loss": 0.5378351211547852,
"mean_token_accuracy": 0.7815344035625458,
"num_tokens": 5060644.0,
"step": 311
},
{
"entropy": 0.5573316812515259,
"epoch": 1.1646398503274087,
"grad_norm": 0.03297853097319603,
"learning_rate": 0.0002,
"loss": 0.5643618106842041,
"mean_token_accuracy": 0.7715043723583221,
"num_tokens": 5077003.0,
"step": 312
},
{
"entropy": 0.526486948132515,
"epoch": 1.1683816651075771,
"grad_norm": 0.029532574117183685,
"learning_rate": 0.0002,
"loss": 0.5367429256439209,
"mean_token_accuracy": 0.7818453460931778,
"num_tokens": 5093120.0,
"step": 313
},
{
"entropy": 0.545007973909378,
"epoch": 1.1721234798877456,
"grad_norm": 0.0302292387932539,
"learning_rate": 0.0002,
"loss": 0.5474991798400879,
"mean_token_accuracy": 0.7770297825336456,
"num_tokens": 5109333.0,
"step": 314
},
{
"entropy": 0.5457079261541367,
"epoch": 1.175865294667914,
"grad_norm": 0.03628959506750107,
"learning_rate": 0.0002,
"loss": 0.5456429719924927,
"mean_token_accuracy": 0.779505044221878,
"num_tokens": 5125459.0,
"step": 315
},
{
"entropy": 0.5526050478219986,
"epoch": 1.1796071094480822,
"grad_norm": 0.031634826213121414,
"learning_rate": 0.0002,
"loss": 0.5504459738731384,
"mean_token_accuracy": 0.7756629437208176,
"num_tokens": 5141755.0,
"step": 316
},
{
"entropy": 0.5621381402015686,
"epoch": 1.1833489242282507,
"grad_norm": 0.02932395227253437,
"learning_rate": 0.0002,
"loss": 0.5631870627403259,
"mean_token_accuracy": 0.767949178814888,
"num_tokens": 5158305.0,
"step": 317
},
{
"entropy": 0.5412058234214783,
"epoch": 1.187090739008419,
"grad_norm": 0.03077547252178192,
"learning_rate": 0.0002,
"loss": 0.5441724061965942,
"mean_token_accuracy": 0.7769438326358795,
"num_tokens": 5174825.0,
"step": 318
},
{
"entropy": 0.5375640690326691,
"epoch": 1.1908325537885875,
"grad_norm": 0.0300463754683733,
"learning_rate": 0.0002,
"loss": 0.5393084287643433,
"mean_token_accuracy": 0.782392755150795,
"num_tokens": 5190829.0,
"step": 319
},
{
"entropy": 0.5544911473989487,
"epoch": 1.1945743685687558,
"grad_norm": 0.03089406155049801,
"learning_rate": 0.0002,
"loss": 0.5512977838516235,
"mean_token_accuracy": 0.7745725959539413,
"num_tokens": 5207283.0,
"step": 320
},
{
"entropy": 0.5496610552072525,
"epoch": 1.1983161833489242,
"grad_norm": 0.03022005409002304,
"learning_rate": 0.0002,
"loss": 0.5407426357269287,
"mean_token_accuracy": 0.7819069474935532,
"num_tokens": 5223759.0,
"step": 321
},
{
"entropy": 0.5536633729934692,
"epoch": 1.2020579981290926,
"grad_norm": 0.03297387808561325,
"learning_rate": 0.0002,
"loss": 0.5543879866600037,
"mean_token_accuracy": 0.7727649062871933,
"num_tokens": 5240096.0,
"step": 322
},
{
"entropy": 0.5441806763410568,
"epoch": 1.205799812909261,
"grad_norm": 0.029116200283169746,
"learning_rate": 0.0002,
"loss": 0.5444720387458801,
"mean_token_accuracy": 0.7814431339502335,
"num_tokens": 5256670.0,
"step": 323
},
{
"entropy": 0.5429923981428146,
"epoch": 1.2095416276894293,
"grad_norm": 0.03505397588014603,
"learning_rate": 0.0002,
"loss": 0.5506747961044312,
"mean_token_accuracy": 0.7763912379741669,
"num_tokens": 5272766.0,
"step": 324
},
{
"entropy": 0.5270697474479675,
"epoch": 1.2132834424695977,
"grad_norm": 0.039405617862939835,
"learning_rate": 0.0002,
"loss": 0.5409681797027588,
"mean_token_accuracy": 0.7786189615726471,
"num_tokens": 5289123.0,
"step": 325
},
{
"entropy": 0.558641791343689,
"epoch": 1.2170252572497662,
"grad_norm": 0.029413288459181786,
"learning_rate": 0.0002,
"loss": 0.5564137697219849,
"mean_token_accuracy": 0.7740890085697174,
"num_tokens": 5305503.0,
"step": 326
},
{
"entropy": 0.5550449192523956,
"epoch": 1.2207670720299344,
"grad_norm": 0.031028373166918755,
"learning_rate": 0.0002,
"loss": 0.5544853210449219,
"mean_token_accuracy": 0.7716324329376221,
"num_tokens": 5321885.0,
"step": 327
},
{
"entropy": 0.5564998090267181,
"epoch": 1.2245088868101028,
"grad_norm": 0.034970104694366455,
"learning_rate": 0.0002,
"loss": 0.5547239184379578,
"mean_token_accuracy": 0.7719462513923645,
"num_tokens": 5338376.0,
"step": 328
},
{
"entropy": 0.5593426823616028,
"epoch": 1.2282507015902713,
"grad_norm": 0.030654314905405045,
"learning_rate": 0.0002,
"loss": 0.5594889521598816,
"mean_token_accuracy": 0.7690505534410477,
"num_tokens": 5354745.0,
"step": 329
},
{
"entropy": 0.5594028532505035,
"epoch": 1.2319925163704397,
"grad_norm": 0.02985675260424614,
"learning_rate": 0.0002,
"loss": 0.560926079750061,
"mean_token_accuracy": 0.771067887544632,
"num_tokens": 5371364.0,
"step": 330
},
{
"entropy": 0.5444284975528717,
"epoch": 1.2357343311506082,
"grad_norm": 0.0331130288541317,
"learning_rate": 0.0002,
"loss": 0.5528807044029236,
"mean_token_accuracy": 0.7744182050228119,
"num_tokens": 5387884.0,
"step": 331
},
{
"entropy": 0.5535553693771362,
"epoch": 1.2394761459307764,
"grad_norm": 0.035860270261764526,
"learning_rate": 0.0002,
"loss": 0.5612154603004456,
"mean_token_accuracy": 0.7728609591722488,
"num_tokens": 5404143.0,
"step": 332
},
{
"entropy": 0.5594320446252823,
"epoch": 1.2432179607109448,
"grad_norm": 0.030857175588607788,
"learning_rate": 0.0002,
"loss": 0.5495461225509644,
"mean_token_accuracy": 0.7783895283937454,
"num_tokens": 5420613.0,
"step": 333
},
{
"entropy": 0.5738644152879715,
"epoch": 1.2469597754911133,
"grad_norm": 0.02752659097313881,
"learning_rate": 0.0002,
"loss": 0.5670571327209473,
"mean_token_accuracy": 0.7706948518753052,
"num_tokens": 5437025.0,
"step": 334
},
{
"entropy": 0.5468066483736038,
"epoch": 1.2507015902712815,
"grad_norm": 0.030105959624052048,
"learning_rate": 0.0002,
"loss": 0.5448632836341858,
"mean_token_accuracy": 0.7777069211006165,
"num_tokens": 5453431.0,
"step": 335
},
{
"entropy": 0.5508809983730316,
"epoch": 1.25444340505145,
"grad_norm": 0.031137077137827873,
"learning_rate": 0.0002,
"loss": 0.5581130981445312,
"mean_token_accuracy": 0.7730289697647095,
"num_tokens": 5469727.0,
"step": 336
},
{
"entropy": 0.5199557095766068,
"epoch": 1.2581852198316184,
"grad_norm": 0.033218562602996826,
"learning_rate": 0.0002,
"loss": 0.5353677272796631,
"mean_token_accuracy": 0.7836348563432693,
"num_tokens": 5485615.0,
"step": 337
},
{
"entropy": 0.5402327626943588,
"epoch": 1.2619270346117868,
"grad_norm": 0.02909061312675476,
"learning_rate": 0.0002,
"loss": 0.5445257425308228,
"mean_token_accuracy": 0.7775768637657166,
"num_tokens": 5501846.0,
"step": 338
},
{
"entropy": 0.5657909214496613,
"epoch": 1.2656688493919552,
"grad_norm": 0.03052118793129921,
"learning_rate": 0.0002,
"loss": 0.5672930479049683,
"mean_token_accuracy": 0.7675611525774002,
"num_tokens": 5518365.0,
"step": 339
},
{
"entropy": 0.5483649671077728,
"epoch": 1.2694106641721234,
"grad_norm": 0.02786743827164173,
"learning_rate": 0.0002,
"loss": 0.5456503033638,
"mean_token_accuracy": 0.7791422605514526,
"num_tokens": 5534639.0,
"step": 340
},
{
"entropy": 0.5500437468290329,
"epoch": 1.2731524789522919,
"grad_norm": 0.03155668452382088,
"learning_rate": 0.0002,
"loss": 0.545000433921814,
"mean_token_accuracy": 0.7803118973970413,
"num_tokens": 5551093.0,
"step": 341
},
{
"entropy": 0.5697951167821884,
"epoch": 1.27689429373246,
"grad_norm": 0.03075268305838108,
"learning_rate": 0.0002,
"loss": 0.5609626173973083,
"mean_token_accuracy": 0.7723665684461594,
"num_tokens": 5567707.0,
"step": 342
},
{
"entropy": 0.544351652264595,
"epoch": 1.2806361085126285,
"grad_norm": 0.03238390013575554,
"learning_rate": 0.0002,
"loss": 0.5533734560012817,
"mean_token_accuracy": 0.7754608392715454,
"num_tokens": 5584155.0,
"step": 343
},
{
"entropy": 0.5441059172153473,
"epoch": 1.284377923292797,
"grad_norm": 0.02793728932738304,
"learning_rate": 0.0002,
"loss": 0.5470475554466248,
"mean_token_accuracy": 0.7781476378440857,
"num_tokens": 5600585.0,
"step": 344
},
{
"entropy": 0.5576403886079788,
"epoch": 1.2881197380729654,
"grad_norm": 0.0332297645509243,
"learning_rate": 0.0002,
"loss": 0.5591012835502625,
"mean_token_accuracy": 0.7717157751321793,
"num_tokens": 5616865.0,
"step": 345
},
{
"entropy": 0.5582529455423355,
"epoch": 1.2918615528531339,
"grad_norm": 0.028861626982688904,
"learning_rate": 0.0002,
"loss": 0.5597870349884033,
"mean_token_accuracy": 0.7722600847482681,
"num_tokens": 5633131.0,
"step": 346
},
{
"entropy": 0.5537585616111755,
"epoch": 1.295603367633302,
"grad_norm": 0.027739623561501503,
"learning_rate": 0.0002,
"loss": 0.5517114996910095,
"mean_token_accuracy": 0.7751765549182892,
"num_tokens": 5649621.0,
"step": 347
},
{
"entropy": 0.5722759366035461,
"epoch": 1.2993451824134705,
"grad_norm": 0.029868733137845993,
"learning_rate": 0.0002,
"loss": 0.5697493553161621,
"mean_token_accuracy": 0.769178032875061,
"num_tokens": 5666058.0,
"step": 348
},
{
"entropy": 0.5482298284769058,
"epoch": 1.303086997193639,
"grad_norm": 0.02905650995671749,
"learning_rate": 0.0002,
"loss": 0.5505189895629883,
"mean_token_accuracy": 0.7772009968757629,
"num_tokens": 5682272.0,
"step": 349
},
{
"entropy": 0.5623439997434616,
"epoch": 1.3068288119738072,
"grad_norm": 0.028680406510829926,
"learning_rate": 0.0002,
"loss": 0.5615631937980652,
"mean_token_accuracy": 0.7712025493383408,
"num_tokens": 5698796.0,
"step": 350
},
{
"entropy": 0.5541074424982071,
"epoch": 1.3105706267539756,
"grad_norm": 0.03431180492043495,
"learning_rate": 0.0002,
"loss": 0.5617666244506836,
"mean_token_accuracy": 0.7705400139093399,
"num_tokens": 5714994.0,
"step": 351
},
{
"entropy": 0.5405305176973343,
"epoch": 1.314312441534144,
"grad_norm": 0.03283194825053215,
"learning_rate": 0.0002,
"loss": 0.538750946521759,
"mean_token_accuracy": 0.7778624445199966,
"num_tokens": 5731263.0,
"step": 352
},
{
"entropy": 0.5537361800670624,
"epoch": 1.3180542563143125,
"grad_norm": 0.03157467022538185,
"learning_rate": 0.0002,
"loss": 0.556831955909729,
"mean_token_accuracy": 0.7720046639442444,
"num_tokens": 5747576.0,
"step": 353
},
{
"entropy": 0.5540541112422943,
"epoch": 1.321796071094481,
"grad_norm": 0.03315872326493263,
"learning_rate": 0.0002,
"loss": 0.5560564398765564,
"mean_token_accuracy": 0.7747179567813873,
"num_tokens": 5763875.0,
"step": 354
},
{
"entropy": 0.5485205948352814,
"epoch": 1.3255378858746492,
"grad_norm": 0.029158933088183403,
"learning_rate": 0.0002,
"loss": 0.5474769473075867,
"mean_token_accuracy": 0.7769359052181244,
"num_tokens": 5780494.0,
"step": 355
},
{
"entropy": 0.5560560077428818,
"epoch": 1.3292797006548176,
"grad_norm": 0.03023948147892952,
"learning_rate": 0.0002,
"loss": 0.5578330159187317,
"mean_token_accuracy": 0.7706339210271835,
"num_tokens": 5796776.0,
"step": 356
},
{
"entropy": 0.5549474805593491,
"epoch": 1.333021515434986,
"grad_norm": 0.03123750351369381,
"learning_rate": 0.0002,
"loss": 0.5531733632087708,
"mean_token_accuracy": 0.7738355994224548,
"num_tokens": 5813225.0,
"step": 357
},
{
"entropy": 0.5446926355361938,
"epoch": 1.3367633302151543,
"grad_norm": 0.03854469954967499,
"learning_rate": 0.0002,
"loss": 0.5561398863792419,
"mean_token_accuracy": 0.7719077616930008,
"num_tokens": 5829411.0,
"step": 358
},
{
"entropy": 0.5601906925439835,
"epoch": 1.3405051449953227,
"grad_norm": 0.025615639984607697,
"learning_rate": 0.0002,
"loss": 0.5579116940498352,
"mean_token_accuracy": 0.7725162506103516,
"num_tokens": 5845753.0,
"step": 359
},
{
"entropy": 0.557614728808403,
"epoch": 1.3442469597754911,
"grad_norm": 0.026924598962068558,
"learning_rate": 0.0002,
"loss": 0.5500644445419312,
"mean_token_accuracy": 0.7740714848041534,
"num_tokens": 5861927.0,
"step": 360
},
{
"entropy": 0.5535576045513153,
"epoch": 1.3479887745556596,
"grad_norm": 0.031272657215595245,
"learning_rate": 0.0002,
"loss": 0.5418438911437988,
"mean_token_accuracy": 0.780152902007103,
"num_tokens": 5878289.0,
"step": 361
},
{
"entropy": 0.5407048761844635,
"epoch": 1.351730589335828,
"grad_norm": 0.031007423996925354,
"learning_rate": 0.0002,
"loss": 0.5493313670158386,
"mean_token_accuracy": 0.7764623165130615,
"num_tokens": 5894592.0,
"step": 362
},
{
"entropy": 0.5239751785993576,
"epoch": 1.3554724041159962,
"grad_norm": 0.03374086320400238,
"learning_rate": 0.0002,
"loss": 0.5344395041465759,
"mean_token_accuracy": 0.7812817394733429,
"num_tokens": 5910863.0,
"step": 363
},
{
"entropy": 0.5377437621355057,
"epoch": 1.3592142188961647,
"grad_norm": 0.04066803306341171,
"learning_rate": 0.0002,
"loss": 0.5502558946609497,
"mean_token_accuracy": 0.7735230922698975,
"num_tokens": 5927169.0,
"step": 364
},
{
"entropy": 0.5404135584831238,
"epoch": 1.362956033676333,
"grad_norm": 0.030103564262390137,
"learning_rate": 0.0002,
"loss": 0.5431765913963318,
"mean_token_accuracy": 0.780334860086441,
"num_tokens": 5943288.0,
"step": 365
},
{
"entropy": 0.5349705293774605,
"epoch": 1.3666978484565013,
"grad_norm": 0.031804051250219345,
"learning_rate": 0.0002,
"loss": 0.5298077464103699,
"mean_token_accuracy": 0.7834766954183578,
"num_tokens": 5959662.0,
"step": 366
},
{
"entropy": 0.5429814159870148,
"epoch": 1.3704396632366698,
"grad_norm": 0.04628051444888115,
"learning_rate": 0.0002,
"loss": 0.5361793041229248,
"mean_token_accuracy": 0.7793655544519424,
"num_tokens": 5976139.0,
"step": 367
},
{
"entropy": 0.5505317896604538,
"epoch": 1.3741814780168382,
"grad_norm": 0.03267182409763336,
"learning_rate": 0.0002,
"loss": 0.5444616675376892,
"mean_token_accuracy": 0.7798040062189102,
"num_tokens": 5992476.0,
"step": 368
},
{
"entropy": 0.5407690107822418,
"epoch": 1.3779232927970066,
"grad_norm": 0.0353633388876915,
"learning_rate": 0.0002,
"loss": 0.5501353740692139,
"mean_token_accuracy": 0.7760691344738007,
"num_tokens": 6008641.0,
"step": 369
},
{
"entropy": 0.5465443283319473,
"epoch": 1.3816651075771749,
"grad_norm": 0.044324446469545364,
"learning_rate": 0.0002,
"loss": 0.5564755201339722,
"mean_token_accuracy": 0.775538980960846,
"num_tokens": 6024769.0,
"step": 370
},
{
"entropy": 0.5609740614891052,
"epoch": 1.3854069223573433,
"grad_norm": 0.03593122959136963,
"learning_rate": 0.0002,
"loss": 0.5629419088363647,
"mean_token_accuracy": 0.7691068351268768,
"num_tokens": 6041060.0,
"step": 371
},
{
"entropy": 0.5421721637248993,
"epoch": 1.3891487371375117,
"grad_norm": 0.03346877172589302,
"learning_rate": 0.0002,
"loss": 0.5368991494178772,
"mean_token_accuracy": 0.7809954136610031,
"num_tokens": 6057328.0,
"step": 372
},
{
"entropy": 0.5421962440013885,
"epoch": 1.39289055191768,
"grad_norm": 0.036160413175821304,
"learning_rate": 0.0002,
"loss": 0.5371009111404419,
"mean_token_accuracy": 0.7804526090621948,
"num_tokens": 6073633.0,
"step": 373
},
{
"entropy": 0.5545593798160553,
"epoch": 1.3966323666978484,
"grad_norm": 0.03285996615886688,
"learning_rate": 0.0002,
"loss": 0.5528316497802734,
"mean_token_accuracy": 0.7778345346450806,
"num_tokens": 6090142.0,
"step": 374
},
{
"entropy": 0.5461311042308807,
"epoch": 1.4003741814780168,
"grad_norm": 0.03481744974851608,
"learning_rate": 0.0002,
"loss": 0.5470185279846191,
"mean_token_accuracy": 0.7769876271486282,
"num_tokens": 6106491.0,
"step": 375
},
{
"entropy": 0.5363553166389465,
"epoch": 1.4041159962581853,
"grad_norm": 0.029494671151041985,
"learning_rate": 0.0002,
"loss": 0.5371567010879517,
"mean_token_accuracy": 0.78060382604599,
"num_tokens": 6122724.0,
"step": 376
},
{
"entropy": 0.5401545614004135,
"epoch": 1.4078578110383537,
"grad_norm": 0.030447613447904587,
"learning_rate": 0.0002,
"loss": 0.5506365299224854,
"mean_token_accuracy": 0.7772665321826935,
"num_tokens": 6139127.0,
"step": 377
},
{
"entropy": 0.5432114005088806,
"epoch": 1.411599625818522,
"grad_norm": 0.03443232551217079,
"learning_rate": 0.0002,
"loss": 0.5483974814414978,
"mean_token_accuracy": 0.7753057479858398,
"num_tokens": 6155228.0,
"step": 378
},
{
"entropy": 0.5419820547103882,
"epoch": 1.4153414405986904,
"grad_norm": 0.030418474227190018,
"learning_rate": 0.0002,
"loss": 0.5432078838348389,
"mean_token_accuracy": 0.7786633670330048,
"num_tokens": 6171661.0,
"step": 379
},
{
"entropy": 0.5554294884204865,
"epoch": 1.4190832553788588,
"grad_norm": 0.028558963909745216,
"learning_rate": 0.0002,
"loss": 0.5531105995178223,
"mean_token_accuracy": 0.7719776481389999,
"num_tokens": 6187948.0,
"step": 380
},
{
"entropy": 0.5308730006217957,
"epoch": 1.422825070159027,
"grad_norm": 0.03490149602293968,
"learning_rate": 0.0002,
"loss": 0.5338871479034424,
"mean_token_accuracy": 0.7831013798713684,
"num_tokens": 6203996.0,
"step": 381
},
{
"entropy": 0.5621105879545212,
"epoch": 1.4265668849391955,
"grad_norm": 0.03489487245678902,
"learning_rate": 0.0002,
"loss": 0.5650954246520996,
"mean_token_accuracy": 0.7674195319414139,
"num_tokens": 6220346.0,
"step": 382
},
{
"entropy": 0.5624908655881882,
"epoch": 1.430308699719364,
"grad_norm": 0.02940392680466175,
"learning_rate": 0.0002,
"loss": 0.5624366998672485,
"mean_token_accuracy": 0.769148588180542,
"num_tokens": 6236743.0,
"step": 383
},
{
"entropy": 0.5363715589046478,
"epoch": 1.4340505144995324,
"grad_norm": 0.028942115604877472,
"learning_rate": 0.0002,
"loss": 0.5339908599853516,
"mean_token_accuracy": 0.7834934592247009,
"num_tokens": 6252708.0,
"step": 384
},
{
"entropy": 0.5408411026000977,
"epoch": 1.4377923292797006,
"grad_norm": 0.0305769219994545,
"learning_rate": 0.0002,
"loss": 0.5352215766906738,
"mean_token_accuracy": 0.7860714495182037,
"num_tokens": 6268903.0,
"step": 385
},
{
"entropy": 0.5410628318786621,
"epoch": 1.441534144059869,
"grad_norm": 0.029285579919815063,
"learning_rate": 0.0002,
"loss": 0.5426855087280273,
"mean_token_accuracy": 0.7768432199954987,
"num_tokens": 6284894.0,
"step": 386
},
{
"entropy": 0.5362880975008011,
"epoch": 1.4452759588400375,
"grad_norm": 0.03178134933114052,
"learning_rate": 0.0002,
"loss": 0.5503253936767578,
"mean_token_accuracy": 0.7759049534797668,
"num_tokens": 6301216.0,
"step": 387
},
{
"entropy": 0.5453620404005051,
"epoch": 1.4490177736202057,
"grad_norm": 0.029615160077810287,
"learning_rate": 0.0002,
"loss": 0.5539615154266357,
"mean_token_accuracy": 0.7736871391534805,
"num_tokens": 6317584.0,
"step": 388
},
{
"entropy": 0.5552696138620377,
"epoch": 1.4527595884003741,
"grad_norm": 0.03214653581380844,
"learning_rate": 0.0002,
"loss": 0.5597580671310425,
"mean_token_accuracy": 0.7707493901252747,
"num_tokens": 6333884.0,
"step": 389
},
{
"entropy": 0.553122490644455,
"epoch": 1.4565014031805426,
"grad_norm": 0.029804600402712822,
"learning_rate": 0.0002,
"loss": 0.552976131439209,
"mean_token_accuracy": 0.778336301445961,
"num_tokens": 6350141.0,
"step": 390
},
{
"entropy": 0.5826992094516754,
"epoch": 1.460243217960711,
"grad_norm": 0.03438711538910866,
"learning_rate": 0.0002,
"loss": 0.5765487551689148,
"mean_token_accuracy": 0.7643037289381027,
"num_tokens": 6366374.0,
"step": 391
},
{
"entropy": 0.5606750249862671,
"epoch": 1.4639850327408794,
"grad_norm": 0.030389849096536636,
"learning_rate": 0.0002,
"loss": 0.5595695376396179,
"mean_token_accuracy": 0.7718200087547302,
"num_tokens": 6382848.0,
"step": 392
},
{
"entropy": 0.5619854032993317,
"epoch": 1.4677268475210477,
"grad_norm": 0.032461296766996384,
"learning_rate": 0.0002,
"loss": 0.5576058030128479,
"mean_token_accuracy": 0.7746401876211166,
"num_tokens": 6399173.0,
"step": 393
},
{
"entropy": 0.5408260822296143,
"epoch": 1.471468662301216,
"grad_norm": 0.03529435396194458,
"learning_rate": 0.0002,
"loss": 0.5456345081329346,
"mean_token_accuracy": 0.7788489162921906,
"num_tokens": 6415565.0,
"step": 394
},
{
"entropy": 0.5425965934991837,
"epoch": 1.4752104770813845,
"grad_norm": 0.03692852333188057,
"learning_rate": 0.0002,
"loss": 0.5488424301147461,
"mean_token_accuracy": 0.7782263904809952,
"num_tokens": 6431912.0,
"step": 395
},
{
"entropy": 0.5516340583562851,
"epoch": 1.4789522918615527,
"grad_norm": 0.031000891700387,
"learning_rate": 0.0002,
"loss": 0.5553445219993591,
"mean_token_accuracy": 0.7752650529146194,
"num_tokens": 6448548.0,
"step": 396
},
{
"entropy": 0.538574829697609,
"epoch": 1.4826941066417212,
"grad_norm": 0.030864855274558067,
"learning_rate": 0.0002,
"loss": 0.5368215441703796,
"mean_token_accuracy": 0.7809993326663971,
"num_tokens": 6465030.0,
"step": 397
},
{
"entropy": 0.5717963427305222,
"epoch": 1.4864359214218896,
"grad_norm": 0.033221229910850525,
"learning_rate": 0.0002,
"loss": 0.571186363697052,
"mean_token_accuracy": 0.7653579860925674,
"num_tokens": 6481528.0,
"step": 398
},
{
"entropy": 0.5418017208576202,
"epoch": 1.490177736202058,
"grad_norm": 0.04067196696996689,
"learning_rate": 0.0002,
"loss": 0.5442001223564148,
"mean_token_accuracy": 0.7763307839632034,
"num_tokens": 6497840.0,
"step": 399
},
{
"entropy": 0.5547621697187424,
"epoch": 1.4939195509822265,
"grad_norm": 0.03348267823457718,
"learning_rate": 0.0002,
"loss": 0.5626781582832336,
"mean_token_accuracy": 0.7712242007255554,
"num_tokens": 6514349.0,
"step": 400
},
{
"entropy": 0.5494479835033417,
"epoch": 1.4976613657623947,
"grad_norm": 0.03362090513110161,
"learning_rate": 0.0002,
"loss": 0.548977792263031,
"mean_token_accuracy": 0.7767577767372131,
"num_tokens": 6530749.0,
"step": 401
},
{
"entropy": 0.5626181960105896,
"epoch": 1.5014031805425632,
"grad_norm": 0.03137248754501343,
"learning_rate": 0.0002,
"loss": 0.5654096603393555,
"mean_token_accuracy": 0.7723931819200516,
"num_tokens": 6547276.0,
"step": 402
},
{
"entropy": 0.5499662905931473,
"epoch": 1.5051449953227314,
"grad_norm": 0.034359052777290344,
"learning_rate": 0.0002,
"loss": 0.5508401393890381,
"mean_token_accuracy": 0.7756681442260742,
"num_tokens": 6563580.0,
"step": 403
},
{
"entropy": 0.5658421665430069,
"epoch": 1.5088868101028998,
"grad_norm": 0.030933788046240807,
"learning_rate": 0.0002,
"loss": 0.5622308254241943,
"mean_token_accuracy": 0.769567608833313,
"num_tokens": 6579736.0,
"step": 404
},
{
"entropy": 0.547087088227272,
"epoch": 1.5126286248830683,
"grad_norm": 0.030160700902342796,
"learning_rate": 0.0002,
"loss": 0.5470564961433411,
"mean_token_accuracy": 0.7781479358673096,
"num_tokens": 6596131.0,
"step": 405
},
{
"entropy": 0.5563077032566071,
"epoch": 1.5163704396632367,
"grad_norm": 0.029513506218791008,
"learning_rate": 0.0002,
"loss": 0.5557488799095154,
"mean_token_accuracy": 0.7776722609996796,
"num_tokens": 6612499.0,
"step": 406
},
{
"entropy": 0.5473329573869705,
"epoch": 1.5201122544434051,
"grad_norm": 0.031187692657113075,
"learning_rate": 0.0002,
"loss": 0.5444590449333191,
"mean_token_accuracy": 0.7770859450101852,
"num_tokens": 6628905.0,
"step": 407
},
{
"entropy": 0.5493151396512985,
"epoch": 1.5238540692235736,
"grad_norm": 0.027274703606963158,
"learning_rate": 0.0002,
"loss": 0.5559489130973816,
"mean_token_accuracy": 0.774099811911583,
"num_tokens": 6645207.0,
"step": 408
},
{
"entropy": 0.5369315445423126,
"epoch": 1.5275958840037418,
"grad_norm": 0.03280489146709442,
"learning_rate": 0.0002,
"loss": 0.5494750738143921,
"mean_token_accuracy": 0.7781352549791336,
"num_tokens": 6661441.0,
"step": 409
},
{
"entropy": 0.543188214302063,
"epoch": 1.5313376987839102,
"grad_norm": 0.0317704938352108,
"learning_rate": 0.0002,
"loss": 0.548348069190979,
"mean_token_accuracy": 0.7779366374015808,
"num_tokens": 6677890.0,
"step": 410
},
{
"entropy": 0.5514375120401382,
"epoch": 1.5350795135640785,
"grad_norm": 0.02904539741575718,
"learning_rate": 0.0002,
"loss": 0.5532687902450562,
"mean_token_accuracy": 0.776079460978508,
"num_tokens": 6694229.0,
"step": 411
},
{
"entropy": 0.5228893607854843,
"epoch": 1.538821328344247,
"grad_norm": 0.027841076254844666,
"learning_rate": 0.0002,
"loss": 0.522330641746521,
"mean_token_accuracy": 0.7864255011081696,
"num_tokens": 6710250.0,
"step": 412
},
{
"entropy": 0.5390310734510422,
"epoch": 1.5425631431244153,
"grad_norm": 0.02716185338795185,
"learning_rate": 0.0002,
"loss": 0.5395499467849731,
"mean_token_accuracy": 0.7826422601938248,
"num_tokens": 6726768.0,
"step": 413
},
{
"entropy": 0.5508141964673996,
"epoch": 1.5463049579045838,
"grad_norm": 0.030815092846751213,
"learning_rate": 0.0002,
"loss": 0.5503819584846497,
"mean_token_accuracy": 0.7755144089460373,
"num_tokens": 6743055.0,
"step": 414
},
{
"entropy": 0.5312939435243607,
"epoch": 1.5500467726847522,
"grad_norm": 0.028637485578656197,
"learning_rate": 0.0002,
"loss": 0.5298642516136169,
"mean_token_accuracy": 0.7852569371461868,
"num_tokens": 6759442.0,
"step": 415
},
{
"entropy": 0.5471786260604858,
"epoch": 1.5537885874649204,
"grad_norm": 0.030604762956500053,
"learning_rate": 0.0002,
"loss": 0.5502840876579285,
"mean_token_accuracy": 0.7758130580186844,
"num_tokens": 6775919.0,
"step": 416
},
{
"entropy": 0.5734788477420807,
"epoch": 1.5575304022450889,
"grad_norm": 0.033530574291944504,
"learning_rate": 0.0002,
"loss": 0.573567807674408,
"mean_token_accuracy": 0.7666918784379959,
"num_tokens": 6792496.0,
"step": 417
},
{
"entropy": 0.5556947290897369,
"epoch": 1.561272217025257,
"grad_norm": 0.029095808044075966,
"learning_rate": 0.0002,
"loss": 0.5506360530853271,
"mean_token_accuracy": 0.7765111029148102,
"num_tokens": 6809055.0,
"step": 418
},
{
"entropy": 0.5287731885910034,
"epoch": 1.5650140318054255,
"grad_norm": 0.03587370365858078,
"learning_rate": 0.0002,
"loss": 0.5343160033226013,
"mean_token_accuracy": 0.7836072146892548,
"num_tokens": 6825353.0,
"step": 419
},
{
"entropy": 0.5342409163713455,
"epoch": 1.568755846585594,
"grad_norm": 0.03603408485651016,
"learning_rate": 0.0002,
"loss": 0.5409013628959656,
"mean_token_accuracy": 0.7804750800132751,
"num_tokens": 6841745.0,
"step": 420
},
{
"entropy": 0.5486701726913452,
"epoch": 1.5724976613657624,
"grad_norm": 0.02864743210375309,
"learning_rate": 0.0002,
"loss": 0.5528161525726318,
"mean_token_accuracy": 0.7741836905479431,
"num_tokens": 6857942.0,
"step": 421
},
{
"entropy": 0.5741837024688721,
"epoch": 1.5762394761459309,
"grad_norm": 0.0320119671523571,
"learning_rate": 0.0002,
"loss": 0.5608420372009277,
"mean_token_accuracy": 0.7707283794879913,
"num_tokens": 6874193.0,
"step": 422
},
{
"entropy": 0.5495482236146927,
"epoch": 1.5799812909260993,
"grad_norm": 0.02604423463344574,
"learning_rate": 0.0002,
"loss": 0.5479333400726318,
"mean_token_accuracy": 0.7773087471723557,
"num_tokens": 6890547.0,
"step": 423
},
{
"entropy": 0.5387884378433228,
"epoch": 1.5837231057062675,
"grad_norm": 0.03170885518193245,
"learning_rate": 0.0002,
"loss": 0.5462484359741211,
"mean_token_accuracy": 0.7735171020030975,
"num_tokens": 6906920.0,
"step": 424
},
{
"entropy": 0.539916068315506,
"epoch": 1.587464920486436,
"grad_norm": 0.03372619301080704,
"learning_rate": 0.0002,
"loss": 0.542754590511322,
"mean_token_accuracy": 0.7796132117509842,
"num_tokens": 6923352.0,
"step": 425
},
{
"entropy": 0.5413663387298584,
"epoch": 1.5912067352666042,
"grad_norm": 0.02999868616461754,
"learning_rate": 0.0002,
"loss": 0.5444542765617371,
"mean_token_accuracy": 0.7786892652511597,
"num_tokens": 6939337.0,
"step": 426
},
{
"entropy": 0.556038424372673,
"epoch": 1.5949485500467726,
"grad_norm": 0.03419700264930725,
"learning_rate": 0.0002,
"loss": 0.550898015499115,
"mean_token_accuracy": 0.7760495245456696,
"num_tokens": 6955389.0,
"step": 427
},
{
"entropy": 0.5516718029975891,
"epoch": 1.598690364826941,
"grad_norm": 0.0298128854483366,
"learning_rate": 0.0002,
"loss": 0.5519053339958191,
"mean_token_accuracy": 0.7739587277173996,
"num_tokens": 6971808.0,
"step": 428
},
{
"entropy": 0.5532359778881073,
"epoch": 1.6024321796071095,
"grad_norm": 0.03213290125131607,
"learning_rate": 0.0002,
"loss": 0.5568399429321289,
"mean_token_accuracy": 0.7753729224205017,
"num_tokens": 6988128.0,
"step": 429
},
{
"entropy": 0.5382643342018127,
"epoch": 1.606173994387278,
"grad_norm": 0.031161464750766754,
"learning_rate": 0.0002,
"loss": 0.5440113544464111,
"mean_token_accuracy": 0.7779531329870224,
"num_tokens": 7004368.0,
"step": 430
},
{
"entropy": 0.5313677787780762,
"epoch": 1.6099158091674464,
"grad_norm": 0.036605071276426315,
"learning_rate": 0.0002,
"loss": 0.5367435216903687,
"mean_token_accuracy": 0.7821811884641647,
"num_tokens": 7020480.0,
"step": 431
},
{
"entropy": 0.5567297488451004,
"epoch": 1.6136576239476146,
"grad_norm": 0.027995243668556213,
"learning_rate": 0.0002,
"loss": 0.5547551512718201,
"mean_token_accuracy": 0.7722228318452835,
"num_tokens": 7036925.0,
"step": 432
},
{
"entropy": 0.5448314994573593,
"epoch": 1.617399438727783,
"grad_norm": 0.03725632280111313,
"learning_rate": 0.0002,
"loss": 0.5465018153190613,
"mean_token_accuracy": 0.7780062705278397,
"num_tokens": 7053019.0,
"step": 433
},
{
"entropy": 0.5258296579122543,
"epoch": 1.6211412535079512,
"grad_norm": 0.03214319422841072,
"learning_rate": 0.0002,
"loss": 0.5300624370574951,
"mean_token_accuracy": 0.7829313278198242,
"num_tokens": 7069021.0,
"step": 434
},
{
"entropy": 0.5569266527891159,
"epoch": 1.6248830682881197,
"grad_norm": 0.03432042896747589,
"learning_rate": 0.0002,
"loss": 0.5578755140304565,
"mean_token_accuracy": 0.7711293399333954,
"num_tokens": 7085450.0,
"step": 435
},
{
"entropy": 0.5638464391231537,
"epoch": 1.6286248830682881,
"grad_norm": 0.03862602636218071,
"learning_rate": 0.0002,
"loss": 0.5726134777069092,
"mean_token_accuracy": 0.7694450467824936,
"num_tokens": 7101666.0,
"step": 436
},
{
"entropy": 0.564548671245575,
"epoch": 1.6323666978484566,
"grad_norm": 0.032345570623874664,
"learning_rate": 0.0002,
"loss": 0.5651994943618774,
"mean_token_accuracy": 0.7711433917284012,
"num_tokens": 7117907.0,
"step": 437
},
{
"entropy": 0.5587478131055832,
"epoch": 1.636108512628625,
"grad_norm": 0.031082862988114357,
"learning_rate": 0.0002,
"loss": 0.5588955879211426,
"mean_token_accuracy": 0.7725447416305542,
"num_tokens": 7134131.0,
"step": 438
},
{
"entropy": 0.5472389608621597,
"epoch": 1.6398503274087932,
"grad_norm": 0.03695904091000557,
"learning_rate": 0.0002,
"loss": 0.5445616245269775,
"mean_token_accuracy": 0.778590515255928,
"num_tokens": 7150298.0,
"step": 439
},
{
"entropy": 0.5535961091518402,
"epoch": 1.6435921421889617,
"grad_norm": 0.031128892675042152,
"learning_rate": 0.0002,
"loss": 0.5437783598899841,
"mean_token_accuracy": 0.7785230875015259,
"num_tokens": 7166639.0,
"step": 440
},
{
"entropy": 0.5351960062980652,
"epoch": 1.6473339569691299,
"grad_norm": 0.03949431702494621,
"learning_rate": 0.0002,
"loss": 0.5358127355575562,
"mean_token_accuracy": 0.7802053093910217,
"num_tokens": 7182613.0,
"step": 441
},
{
"entropy": 0.524370513856411,
"epoch": 1.6510757717492983,
"grad_norm": 0.03402510657906532,
"learning_rate": 0.0002,
"loss": 0.5297942161560059,
"mean_token_accuracy": 0.7861316353082657,
"num_tokens": 7198598.0,
"step": 442
},
{
"entropy": 0.5440799742937088,
"epoch": 1.6548175865294668,
"grad_norm": 0.03908916562795639,
"learning_rate": 0.0002,
"loss": 0.5563719868659973,
"mean_token_accuracy": 0.773345485329628,
"num_tokens": 7214953.0,
"step": 443
},
{
"entropy": 0.5496329516172409,
"epoch": 1.6585594013096352,
"grad_norm": 0.036347340792417526,
"learning_rate": 0.0002,
"loss": 0.5566647052764893,
"mean_token_accuracy": 0.7736042439937592,
"num_tokens": 7231069.0,
"step": 444
},
{
"entropy": 0.5510213375091553,
"epoch": 1.6623012160898036,
"grad_norm": 0.027416400611400604,
"learning_rate": 0.0002,
"loss": 0.5495529174804688,
"mean_token_accuracy": 0.7757058292627335,
"num_tokens": 7247326.0,
"step": 445
},
{
"entropy": 0.5782728493213654,
"epoch": 1.666043030869972,
"grad_norm": 0.03216573968529701,
"learning_rate": 0.0002,
"loss": 0.5692035555839539,
"mean_token_accuracy": 0.7700701951980591,
"num_tokens": 7263765.0,
"step": 446
},
{
"entropy": 0.5769474655389786,
"epoch": 1.6697848456501403,
"grad_norm": 0.03461449593305588,
"learning_rate": 0.0002,
"loss": 0.5692911148071289,
"mean_token_accuracy": 0.7688308656215668,
"num_tokens": 7280095.0,
"step": 447
},
{
"entropy": 0.5636246651411057,
"epoch": 1.6735266604303087,
"grad_norm": 0.02763124369084835,
"learning_rate": 0.0002,
"loss": 0.5576487183570862,
"mean_token_accuracy": 0.7748333811759949,
"num_tokens": 7296592.0,
"step": 448
},
{
"entropy": 0.5515684485435486,
"epoch": 1.677268475210477,
"grad_norm": 0.03505739942193031,
"learning_rate": 0.0002,
"loss": 0.562554121017456,
"mean_token_accuracy": 0.7732807844877243,
"num_tokens": 7313071.0,
"step": 449
},
{
"entropy": 0.529756709933281,
"epoch": 1.6810102899906454,
"grad_norm": 0.035316504538059235,
"learning_rate": 0.0002,
"loss": 0.5393928289413452,
"mean_token_accuracy": 0.7774565666913986,
"num_tokens": 7329531.0,
"step": 450
},
{
"entropy": 0.5509119927883148,
"epoch": 1.6847521047708138,
"grad_norm": 0.03525395318865776,
"learning_rate": 0.0002,
"loss": 0.5650572180747986,
"mean_token_accuracy": 0.7679217755794525,
"num_tokens": 7345852.0,
"step": 451
},
{
"entropy": 0.5615872442722321,
"epoch": 1.6884939195509823,
"grad_norm": 0.032941099256277084,
"learning_rate": 0.0002,
"loss": 0.5626966953277588,
"mean_token_accuracy": 0.7703739553689957,
"num_tokens": 7362126.0,
"step": 452
},
{
"entropy": 0.555547222495079,
"epoch": 1.6922357343311507,
"grad_norm": 0.03228066489100456,
"learning_rate": 0.0002,
"loss": 0.544800877571106,
"mean_token_accuracy": 0.7767430245876312,
"num_tokens": 7378671.0,
"step": 453
},
{
"entropy": 0.554116278886795,
"epoch": 1.6959775491113191,
"grad_norm": 0.029597081243991852,
"learning_rate": 0.0002,
"loss": 0.5413352847099304,
"mean_token_accuracy": 0.7784619033336639,
"num_tokens": 7394967.0,
"step": 454
},
{
"entropy": 0.5580686628818512,
"epoch": 1.6997193638914874,
"grad_norm": 0.02839960716664791,
"learning_rate": 0.0002,
"loss": 0.5585195422172546,
"mean_token_accuracy": 0.7723167389631271,
"num_tokens": 7411309.0,
"step": 455
},
{
"entropy": 0.5392096787691116,
"epoch": 1.7034611786716558,
"grad_norm": 0.03588644042611122,
"learning_rate": 0.0002,
"loss": 0.5462691187858582,
"mean_token_accuracy": 0.7782226353883743,
"num_tokens": 7427429.0,
"step": 456
},
{
"entropy": 0.535987101495266,
"epoch": 1.707202993451824,
"grad_norm": 0.03534339368343353,
"learning_rate": 0.0002,
"loss": 0.549435019493103,
"mean_token_accuracy": 0.7765841335058212,
"num_tokens": 7443721.0,
"step": 457
},
{
"entropy": 0.5456487089395523,
"epoch": 1.7109448082319925,
"grad_norm": 0.03618441894650459,
"learning_rate": 0.0002,
"loss": 0.5485998392105103,
"mean_token_accuracy": 0.7757130116224289,
"num_tokens": 7460111.0,
"step": 458
},
{
"entropy": 0.5436663031578064,
"epoch": 1.714686623012161,
"grad_norm": 0.02979116700589657,
"learning_rate": 0.0002,
"loss": 0.5414945483207703,
"mean_token_accuracy": 0.7812917977571487,
"num_tokens": 7476124.0,
"step": 459
},
{
"entropy": 0.5709712207317352,
"epoch": 1.7184284377923293,
"grad_norm": 0.03200547397136688,
"learning_rate": 0.0002,
"loss": 0.5619422197341919,
"mean_token_accuracy": 0.7735306322574615,
"num_tokens": 7492499.0,
"step": 460
},
{
"entropy": 0.5626240521669388,
"epoch": 1.7221702525724978,
"grad_norm": 0.03815503418445587,
"learning_rate": 0.0002,
"loss": 0.5533303618431091,
"mean_token_accuracy": 0.7753702253103256,
"num_tokens": 7508641.0,
"step": 461
},
{
"entropy": 0.5480938106775284,
"epoch": 1.725912067352666,
"grad_norm": 0.03169892355799675,
"learning_rate": 0.0002,
"loss": 0.5524613261222839,
"mean_token_accuracy": 0.7751649022102356,
"num_tokens": 7525219.0,
"step": 462
},
{
"entropy": 0.5562078654766083,
"epoch": 1.7296538821328344,
"grad_norm": 0.03617829084396362,
"learning_rate": 0.0002,
"loss": 0.5619810819625854,
"mean_token_accuracy": 0.7714113295078278,
"num_tokens": 7541689.0,
"step": 463
},
{
"entropy": 0.5358584523200989,
"epoch": 1.7333956969130027,
"grad_norm": 0.03426409512758255,
"learning_rate": 0.0002,
"loss": 0.5471996068954468,
"mean_token_accuracy": 0.7751270979642868,
"num_tokens": 7558097.0,
"step": 464
},
{
"entropy": 0.5273950546979904,
"epoch": 1.737137511693171,
"grad_norm": 0.03135877847671509,
"learning_rate": 0.0002,
"loss": 0.5319076776504517,
"mean_token_accuracy": 0.7831837683916092,
"num_tokens": 7574193.0,
"step": 465
},
{
"entropy": 0.5745384991168976,
"epoch": 1.7408793264733395,
"grad_norm": 0.03335622698068619,
"learning_rate": 0.0002,
"loss": 0.5716018676757812,
"mean_token_accuracy": 0.7669582962989807,
"num_tokens": 7590824.0,
"step": 466
},
{
"entropy": 0.5475277155637741,
"epoch": 1.744621141253508,
"grad_norm": 0.02866513840854168,
"learning_rate": 0.0002,
"loss": 0.5436227321624756,
"mean_token_accuracy": 0.777054488658905,
"num_tokens": 7607042.0,
"step": 467
},
{
"entropy": 0.5518149137496948,
"epoch": 1.7483629560336764,
"grad_norm": 0.029388844966888428,
"learning_rate": 0.0002,
"loss": 0.5495098233222961,
"mean_token_accuracy": 0.7773433327674866,
"num_tokens": 7623420.0,
"step": 468
},
{
"entropy": 0.5374390631914139,
"epoch": 1.7521047708138449,
"grad_norm": 0.0325518473982811,
"learning_rate": 0.0002,
"loss": 0.5412787795066833,
"mean_token_accuracy": 0.7788903117179871,
"num_tokens": 7639630.0,
"step": 469
},
{
"entropy": 0.5380698144435883,
"epoch": 1.755846585594013,
"grad_norm": 0.029125649482011795,
"learning_rate": 0.0002,
"loss": 0.5411547422409058,
"mean_token_accuracy": 0.7780955582857132,
"num_tokens": 7655842.0,
"step": 470
},
{
"entropy": 0.5518491268157959,
"epoch": 1.7595884003741815,
"grad_norm": 0.03188946843147278,
"learning_rate": 0.0002,
"loss": 0.5559889674186707,
"mean_token_accuracy": 0.7736992090940475,
"num_tokens": 7672101.0,
"step": 471
},
{
"entropy": 0.5442283153533936,
"epoch": 1.7633302151543497,
"grad_norm": 0.034016743302345276,
"learning_rate": 0.0002,
"loss": 0.5500984191894531,
"mean_token_accuracy": 0.7761438190937042,
"num_tokens": 7688113.0,
"step": 472
},
{
"entropy": 0.5488689690828323,
"epoch": 1.7670720299345182,
"grad_norm": 0.02747703716158867,
"learning_rate": 0.0002,
"loss": 0.5475065112113953,
"mean_token_accuracy": 0.775134801864624,
"num_tokens": 7704497.0,
"step": 473
},
{
"entropy": 0.568826898932457,
"epoch": 1.7708138447146866,
"grad_norm": 0.03434092178940773,
"learning_rate": 0.0002,
"loss": 0.5651647448539734,
"mean_token_accuracy": 0.7715141028165817,
"num_tokens": 7720786.0,
"step": 474
},
{
"entropy": 0.5751989632844925,
"epoch": 1.774555659494855,
"grad_norm": 0.03127957507967949,
"learning_rate": 0.0002,
"loss": 0.5659101605415344,
"mean_token_accuracy": 0.7694416791200638,
"num_tokens": 7737241.0,
"step": 475
},
{
"entropy": 0.5532206594944,
"epoch": 1.7782974742750235,
"grad_norm": 0.02908439189195633,
"learning_rate": 0.0002,
"loss": 0.5514166355133057,
"mean_token_accuracy": 0.7745979428291321,
"num_tokens": 7753654.0,
"step": 476
},
{
"entropy": 0.5416929870843887,
"epoch": 1.782039289055192,
"grad_norm": 0.03806254267692566,
"learning_rate": 0.0002,
"loss": 0.5534486770629883,
"mean_token_accuracy": 0.7739390730857849,
"num_tokens": 7770019.0,
"step": 477
},
{
"entropy": 0.5363457053899765,
"epoch": 1.7857811038353602,
"grad_norm": 0.032926302403211594,
"learning_rate": 0.0002,
"loss": 0.5503825545310974,
"mean_token_accuracy": 0.7768030315637589,
"num_tokens": 7786449.0,
"step": 478
},
{
"entropy": 0.5420104712247849,
"epoch": 1.7895229186155284,
"grad_norm": 0.02965935505926609,
"learning_rate": 0.0002,
"loss": 0.5425794124603271,
"mean_token_accuracy": 0.7801303416490555,
"num_tokens": 7802671.0,
"step": 479
},
{
"entropy": 0.549240380525589,
"epoch": 1.7932647333956968,
"grad_norm": 0.029267581179738045,
"learning_rate": 0.0002,
"loss": 0.5447797179222107,
"mean_token_accuracy": 0.7785746455192566,
"num_tokens": 7819171.0,
"step": 480
},
{
"entropy": 0.5564038902521133,
"epoch": 1.7970065481758652,
"grad_norm": 0.027819465845823288,
"learning_rate": 0.0002,
"loss": 0.5569280385971069,
"mean_token_accuracy": 0.7717359662055969,
"num_tokens": 7835514.0,
"step": 481
},
{
"entropy": 0.5513341128826141,
"epoch": 1.8007483629560337,
"grad_norm": 0.032080937176942825,
"learning_rate": 0.0002,
"loss": 0.5565280318260193,
"mean_token_accuracy": 0.7745318114757538,
"num_tokens": 7851901.0,
"step": 482
},
{
"entropy": 0.5669872015714645,
"epoch": 1.8044901777362021,
"grad_norm": 0.031251415610313416,
"learning_rate": 0.0002,
"loss": 0.5653026103973389,
"mean_token_accuracy": 0.7678168416023254,
"num_tokens": 7868506.0,
"step": 483
},
{
"entropy": 0.5539208799600601,
"epoch": 1.8082319925163706,
"grad_norm": 0.02905306965112686,
"learning_rate": 0.0002,
"loss": 0.5545270442962646,
"mean_token_accuracy": 0.7701525986194611,
"num_tokens": 7884991.0,
"step": 484
},
{
"entropy": 0.5545967519283295,
"epoch": 1.8119738072965388,
"grad_norm": 0.028621984645724297,
"learning_rate": 0.0002,
"loss": 0.5514732003211975,
"mean_token_accuracy": 0.7761166989803314,
"num_tokens": 7901376.0,
"step": 485
},
{
"entropy": 0.5499511659145355,
"epoch": 1.8157156220767072,
"grad_norm": 0.03022296354174614,
"learning_rate": 0.0002,
"loss": 0.5498670339584351,
"mean_token_accuracy": 0.7770126014947891,
"num_tokens": 7917862.0,
"step": 486
},
{
"entropy": 0.5304104536771774,
"epoch": 1.8194574368568754,
"grad_norm": 0.03297071531414986,
"learning_rate": 0.0002,
"loss": 0.5350517630577087,
"mean_token_accuracy": 0.7801762819290161,
"num_tokens": 7933992.0,
"step": 487
},
{
"entropy": 0.5290692076086998,
"epoch": 1.8231992516370439,
"grad_norm": 0.03105652704834938,
"learning_rate": 0.0002,
"loss": 0.5332382917404175,
"mean_token_accuracy": 0.7827692329883575,
"num_tokens": 7949802.0,
"step": 488
},
{
"entropy": 0.5513493865728378,
"epoch": 1.8269410664172123,
"grad_norm": 0.027769237756729126,
"learning_rate": 0.0002,
"loss": 0.5537266135215759,
"mean_token_accuracy": 0.7724474370479584,
"num_tokens": 7966264.0,
"step": 489
},
{
"entropy": 0.559148445725441,
"epoch": 1.8306828811973808,
"grad_norm": 0.03133245185017586,
"learning_rate": 0.0002,
"loss": 0.5547972321510315,
"mean_token_accuracy": 0.7729021608829498,
"num_tokens": 7982562.0,
"step": 490
},
{
"entropy": 0.5613508969545364,
"epoch": 1.8344246959775492,
"grad_norm": 0.031487561762332916,
"learning_rate": 0.0002,
"loss": 0.5589193105697632,
"mean_token_accuracy": 0.7691849023103714,
"num_tokens": 7999101.0,
"step": 491
},
{
"entropy": 0.552077904343605,
"epoch": 1.8381665107577176,
"grad_norm": 0.030901558697223663,
"learning_rate": 0.0002,
"loss": 0.5548684597015381,
"mean_token_accuracy": 0.7746628671884537,
"num_tokens": 8015580.0,
"step": 492
},
{
"entropy": 0.5537288337945938,
"epoch": 1.8419083255378859,
"grad_norm": 0.032475873827934265,
"learning_rate": 0.0002,
"loss": 0.554737389087677,
"mean_token_accuracy": 0.7736551910638809,
"num_tokens": 8031933.0,
"step": 493
},
{
"entropy": 0.548131912946701,
"epoch": 1.8456501403180543,
"grad_norm": 0.034645676612854004,
"learning_rate": 0.0002,
"loss": 0.5518745183944702,
"mean_token_accuracy": 0.7750734686851501,
"num_tokens": 8048122.0,
"step": 494
},
{
"entropy": 0.5457621365785599,
"epoch": 1.8493919550982225,
"grad_norm": 0.0346519835293293,
"learning_rate": 0.0002,
"loss": 0.5511569380760193,
"mean_token_accuracy": 0.774482324719429,
"num_tokens": 8064371.0,
"step": 495
},
{
"entropy": 0.5622203350067139,
"epoch": 1.853133769878391,
"grad_norm": 0.04098769649863243,
"learning_rate": 0.0002,
"loss": 0.5641219615936279,
"mean_token_accuracy": 0.7717546820640564,
"num_tokens": 8080811.0,
"step": 496
},
{
"entropy": 0.5483545809984207,
"epoch": 1.8568755846585594,
"grad_norm": 0.03688424080610275,
"learning_rate": 0.0002,
"loss": 0.5510388612747192,
"mean_token_accuracy": 0.7764346599578857,
"num_tokens": 8097126.0,
"step": 497
},
{
"entropy": 0.5505103766918182,
"epoch": 1.8606173994387278,
"grad_norm": 0.03670699521899223,
"learning_rate": 0.0002,
"loss": 0.5573628544807434,
"mean_token_accuracy": 0.7726601958274841,
"num_tokens": 8113420.0,
"step": 498
},
{
"entropy": 0.529410183429718,
"epoch": 1.8643592142188963,
"grad_norm": 0.0299246683716774,
"learning_rate": 0.0002,
"loss": 0.5223079919815063,
"mean_token_accuracy": 0.787264496088028,
"num_tokens": 8129867.0,
"step": 499
},
{
"entropy": 0.5540086030960083,
"epoch": 1.8681010289990645,
"grad_norm": 0.03435957059264183,
"learning_rate": 0.0002,
"loss": 0.5479264259338379,
"mean_token_accuracy": 0.7777916789054871,
"num_tokens": 8146232.0,
"step": 500
},
{
"entropy": 0.5476558804512024,
"epoch": 1.871842843779233,
"grad_norm": 0.032948873937129974,
"learning_rate": 0.0002,
"loss": 0.5458691716194153,
"mean_token_accuracy": 0.7800754606723785,
"num_tokens": 8162478.0,
"step": 501
},
{
"entropy": 0.5278200954198837,
"epoch": 1.8755846585594012,
"grad_norm": 0.02974856086075306,
"learning_rate": 0.0002,
"loss": 0.5305043458938599,
"mean_token_accuracy": 0.785199910402298,
"num_tokens": 8179046.0,
"step": 502
},
{
"entropy": 0.5498995333909988,
"epoch": 1.8793264733395696,
"grad_norm": 0.035161007195711136,
"learning_rate": 0.0002,
"loss": 0.5587770342826843,
"mean_token_accuracy": 0.7729851007461548,
"num_tokens": 8195430.0,
"step": 503
},
{
"entropy": 0.5525415539741516,
"epoch": 1.883068288119738,
"grad_norm": 0.0358411967754364,
"learning_rate": 0.0002,
"loss": 0.5540306568145752,
"mean_token_accuracy": 0.7763612270355225,
"num_tokens": 8211820.0,
"step": 504
},
{
"entropy": 0.548132598400116,
"epoch": 1.8868101028999065,
"grad_norm": 0.030124109238386154,
"learning_rate": 0.0002,
"loss": 0.5509622693061829,
"mean_token_accuracy": 0.7774811685085297,
"num_tokens": 8228136.0,
"step": 505
},
{
"entropy": 0.5653504580259323,
"epoch": 1.890551917680075,
"grad_norm": 0.03144733980298042,
"learning_rate": 0.0002,
"loss": 0.5578948259353638,
"mean_token_accuracy": 0.7719802111387253,
"num_tokens": 8244600.0,
"step": 506
},
{
"entropy": 0.5680980533361435,
"epoch": 1.8942937324602434,
"grad_norm": 0.03786737844347954,
"learning_rate": 0.0002,
"loss": 0.5742643475532532,
"mean_token_accuracy": 0.7682982087135315,
"num_tokens": 8260924.0,
"step": 507
},
{
"entropy": 0.5519368350505829,
"epoch": 1.8980355472404116,
"grad_norm": 0.03175094351172447,
"learning_rate": 0.0002,
"loss": 0.553012490272522,
"mean_token_accuracy": 0.7758679240942001,
"num_tokens": 8277138.0,
"step": 508
},
{
"entropy": 0.550408124923706,
"epoch": 1.90177736202058,
"grad_norm": 0.03196226805448532,
"learning_rate": 0.0002,
"loss": 0.5527910590171814,
"mean_token_accuracy": 0.7774336487054825,
"num_tokens": 8293651.0,
"step": 509
},
{
"entropy": 0.551310807466507,
"epoch": 1.9055191768007482,
"grad_norm": 0.032158490270376205,
"learning_rate": 0.0002,
"loss": 0.5532134175300598,
"mean_token_accuracy": 0.7765610069036484,
"num_tokens": 8310166.0,
"step": 510
},
{
"entropy": 0.554396003484726,
"epoch": 1.9092609915809167,
"grad_norm": 0.03265155106782913,
"learning_rate": 0.0002,
"loss": 0.5611427426338196,
"mean_token_accuracy": 0.770960658788681,
"num_tokens": 8326460.0,
"step": 511
},
{
"entropy": 0.5533443540334702,
"epoch": 1.913002806361085,
"grad_norm": 0.03062952496111393,
"learning_rate": 0.0002,
"loss": 0.5535008311271667,
"mean_token_accuracy": 0.7743202298879623,
"num_tokens": 8342730.0,
"step": 512
},
{
"entropy": 0.557416245341301,
"epoch": 1.9167446211412535,
"grad_norm": 0.032427720725536346,
"learning_rate": 0.0002,
"loss": 0.555341899394989,
"mean_token_accuracy": 0.7736751586198807,
"num_tokens": 8358790.0,
"step": 513
},
{
"entropy": 0.5498823821544647,
"epoch": 1.920486435921422,
"grad_norm": 0.03641689941287041,
"learning_rate": 0.0002,
"loss": 0.5489510893821716,
"mean_token_accuracy": 0.7756739258766174,
"num_tokens": 8374932.0,
"step": 514
},
{
"entropy": 0.5567668825387955,
"epoch": 1.9242282507015904,
"grad_norm": 0.0356590710580349,
"learning_rate": 0.0002,
"loss": 0.5600458979606628,
"mean_token_accuracy": 0.7731840312480927,
"num_tokens": 8391373.0,
"step": 515
},
{
"entropy": 0.5492214262485504,
"epoch": 1.9279700654817586,
"grad_norm": 0.032011594623327255,
"learning_rate": 0.0002,
"loss": 0.5541006326675415,
"mean_token_accuracy": 0.7760893553495407,
"num_tokens": 8407637.0,
"step": 516
},
{
"entropy": 0.5398948937654495,
"epoch": 1.931711880261927,
"grad_norm": 0.03577565401792526,
"learning_rate": 0.0002,
"loss": 0.5467641949653625,
"mean_token_accuracy": 0.775809720158577,
"num_tokens": 8423916.0,
"step": 517
},
{
"entropy": 0.5437736511230469,
"epoch": 1.9354536950420953,
"grad_norm": 0.031068816781044006,
"learning_rate": 0.0002,
"loss": 0.5446307063102722,
"mean_token_accuracy": 0.7766688168048859,
"num_tokens": 8440387.0,
"step": 518
},
{
"entropy": 0.551026239991188,
"epoch": 1.9391955098222637,
"grad_norm": 0.03239775449037552,
"learning_rate": 0.0002,
"loss": 0.5448942184448242,
"mean_token_accuracy": 0.7764843702316284,
"num_tokens": 8456844.0,
"step": 519
},
{
"entropy": 0.5524020791053772,
"epoch": 1.9429373246024322,
"grad_norm": 0.03006759099662304,
"learning_rate": 0.0002,
"loss": 0.5508519411087036,
"mean_token_accuracy": 0.7757467180490494,
"num_tokens": 8473098.0,
"step": 520
},
{
"entropy": 0.5465254038572311,
"epoch": 1.9466791393826006,
"grad_norm": 0.03377439081668854,
"learning_rate": 0.0002,
"loss": 0.5440271496772766,
"mean_token_accuracy": 0.7764104902744293,
"num_tokens": 8489284.0,
"step": 521
},
{
"entropy": 0.5479972213506699,
"epoch": 1.950420954162769,
"grad_norm": 0.03804773464798927,
"learning_rate": 0.0002,
"loss": 0.5570059418678284,
"mean_token_accuracy": 0.7720707058906555,
"num_tokens": 8505659.0,
"step": 522
},
{
"entropy": 0.5531162023544312,
"epoch": 1.9541627689429373,
"grad_norm": 0.0431046187877655,
"learning_rate": 0.0002,
"loss": 0.5670960545539856,
"mean_token_accuracy": 0.7688823044300079,
"num_tokens": 8522329.0,
"step": 523
},
{
"entropy": 0.5688248574733734,
"epoch": 1.9579045837231057,
"grad_norm": 0.026841329410672188,
"learning_rate": 0.0002,
"loss": 0.5626019835472107,
"mean_token_accuracy": 0.7691622525453568,
"num_tokens": 8538842.0,
"step": 524
},
{
"entropy": 0.5459724515676498,
"epoch": 1.961646398503274,
"grad_norm": 0.03493349626660347,
"learning_rate": 0.0002,
"loss": 0.5443795919418335,
"mean_token_accuracy": 0.7770666480064392,
"num_tokens": 8554945.0,
"step": 525
},
{
"entropy": 0.5657712519168854,
"epoch": 1.9653882132834424,
"grad_norm": 0.03769686445593834,
"learning_rate": 0.0002,
"loss": 0.5527753829956055,
"mean_token_accuracy": 0.7778369933366776,
"num_tokens": 8570989.0,
"step": 526
},
{
"entropy": 0.550276130437851,
"epoch": 1.9691300280636108,
"grad_norm": 0.03369564935564995,
"learning_rate": 0.0002,
"loss": 0.5424638986587524,
"mean_token_accuracy": 0.7803192138671875,
"num_tokens": 8587072.0,
"step": 527
},
{
"entropy": 0.5489895343780518,
"epoch": 1.9728718428437793,
"grad_norm": 0.03569629415869713,
"learning_rate": 0.0002,
"loss": 0.559888482093811,
"mean_token_accuracy": 0.7720399796962738,
"num_tokens": 8603352.0,
"step": 528
},
{
"entropy": 0.530121460556984,
"epoch": 1.9766136576239477,
"grad_norm": 0.037291910499334335,
"learning_rate": 0.0002,
"loss": 0.5450345873832703,
"mean_token_accuracy": 0.7796709537506104,
"num_tokens": 8619760.0,
"step": 529
},
{
"entropy": 0.5523941069841385,
"epoch": 1.9803554724041161,
"grad_norm": 0.027196237817406654,
"learning_rate": 0.0002,
"loss": 0.5566985011100769,
"mean_token_accuracy": 0.773260235786438,
"num_tokens": 8636140.0,
"step": 530
},
{
"entropy": 0.5579734891653061,
"epoch": 1.9840972871842844,
"grad_norm": 0.029088523238897324,
"learning_rate": 0.0002,
"loss": 0.5540033578872681,
"mean_token_accuracy": 0.7756596505641937,
"num_tokens": 8652295.0,
"step": 531
},
{
"entropy": 0.5574969351291656,
"epoch": 1.9878391019644528,
"grad_norm": 0.029939375817775726,
"learning_rate": 0.0002,
"loss": 0.5501161217689514,
"mean_token_accuracy": 0.7750376909971237,
"num_tokens": 8668973.0,
"step": 532
},
{
"entropy": 0.5492955148220062,
"epoch": 1.991580916744621,
"grad_norm": 0.03092138096690178,
"learning_rate": 0.0002,
"loss": 0.5422185063362122,
"mean_token_accuracy": 0.7804518193006516,
"num_tokens": 8685148.0,
"step": 533
},
{
"entropy": 0.5466224402189255,
"epoch": 1.9953227315247895,
"grad_norm": 0.03692883625626564,
"learning_rate": 0.0002,
"loss": 0.5514038801193237,
"mean_token_accuracy": 0.7737534046173096,
"num_tokens": 8701543.0,
"step": 534
},
{
"entropy": 0.5537078529596329,
"epoch": 1.999064546304958,
"grad_norm": 0.03208556026220322,
"learning_rate": 0.0002,
"loss": 0.5545927286148071,
"mean_token_accuracy": 0.777570441365242,
"num_tokens": 8717790.0,
"step": 535
},
{
"entropy": 0.5328470468521118,
"epoch": 2.0,
"grad_norm": 0.056387241929769516,
"learning_rate": 0.0002,
"loss": 0.5407091379165649,
"mean_token_accuracy": 0.7980132699012756,
"num_tokens": 8719006.0,
"step": 536
},
{
"entropy": 0.5399350374937057,
"epoch": 2.0037418147801684,
"grad_norm": 0.030944975093007088,
"learning_rate": 0.0002,
"loss": 0.5385851263999939,
"mean_token_accuracy": 0.7820405662059784,
"num_tokens": 8735642.0,
"step": 537
},
{
"entropy": 0.5494481921195984,
"epoch": 2.007483629560337,
"grad_norm": 0.037696994841098785,
"learning_rate": 0.0002,
"loss": 0.5568894147872925,
"mean_token_accuracy": 0.7728834450244904,
"num_tokens": 8752037.0,
"step": 538
},
{
"entropy": 0.5218051299452782,
"epoch": 2.0112254443405053,
"grad_norm": 0.03197522833943367,
"learning_rate": 0.0002,
"loss": 0.5231513977050781,
"mean_token_accuracy": 0.7889297753572464,
"num_tokens": 8768180.0,
"step": 539
},
{
"entropy": 0.5204869955778122,
"epoch": 2.0149672591206733,
"grad_norm": 0.03365905210375786,
"learning_rate": 0.0002,
"loss": 0.5204414129257202,
"mean_token_accuracy": 0.7887504994869232,
"num_tokens": 8784385.0,
"step": 540
},
{
"entropy": 0.5250371545553207,
"epoch": 2.0187090739008418,
"grad_norm": 0.03206612914800644,
"learning_rate": 0.0002,
"loss": 0.5264713764190674,
"mean_token_accuracy": 0.7865318804979324,
"num_tokens": 8800264.0,
"step": 541
},
{
"entropy": 0.5362996757030487,
"epoch": 2.02245088868101,
"grad_norm": 0.035737182945013046,
"learning_rate": 0.0002,
"loss": 0.5328425765037537,
"mean_token_accuracy": 0.7832369208335876,
"num_tokens": 8816869.0,
"step": 542
},
{
"entropy": 0.5211998522281647,
"epoch": 2.0261927034611786,
"grad_norm": 0.03382508456707001,
"learning_rate": 0.0002,
"loss": 0.5247855186462402,
"mean_token_accuracy": 0.7869311422109604,
"num_tokens": 8833119.0,
"step": 543
},
{
"entropy": 0.5350741446018219,
"epoch": 2.029934518241347,
"grad_norm": 0.03478322923183441,
"learning_rate": 0.0002,
"loss": 0.5424962639808655,
"mean_token_accuracy": 0.7780940532684326,
"num_tokens": 8849384.0,
"step": 544
},
{
"entropy": 0.5465849786996841,
"epoch": 2.0336763330215155,
"grad_norm": 0.04140733554959297,
"learning_rate": 0.0002,
"loss": 0.5555759072303772,
"mean_token_accuracy": 0.7771580815315247,
"num_tokens": 8865580.0,
"step": 545
},
{
"entropy": 0.5315355062484741,
"epoch": 2.037418147801684,
"grad_norm": 0.037138681858778,
"learning_rate": 0.0002,
"loss": 0.5277940630912781,
"mean_token_accuracy": 0.7869007289409637,
"num_tokens": 8882160.0,
"step": 546
},
{
"entropy": 0.5415049940347672,
"epoch": 2.0411599625818524,
"grad_norm": 0.0382317453622818,
"learning_rate": 0.0002,
"loss": 0.52928626537323,
"mean_token_accuracy": 0.783332422375679,
"num_tokens": 8898284.0,
"step": 547
},
{
"entropy": 0.5444429516792297,
"epoch": 2.0449017773620204,
"grad_norm": 0.03212872892618179,
"learning_rate": 0.0002,
"loss": 0.5390786528587341,
"mean_token_accuracy": 0.7800189107656479,
"num_tokens": 8914317.0,
"step": 548
},
{
"entropy": 0.5368607640266418,
"epoch": 2.048643592142189,
"grad_norm": 0.03962872177362442,
"learning_rate": 0.0002,
"loss": 0.5424067974090576,
"mean_token_accuracy": 0.7807967215776443,
"num_tokens": 8930503.0,
"step": 549
},
{
"entropy": 0.5316442102193832,
"epoch": 2.0523854069223573,
"grad_norm": 0.04042808711528778,
"learning_rate": 0.0002,
"loss": 0.5394030809402466,
"mean_token_accuracy": 0.7808849960565567,
"num_tokens": 8946862.0,
"step": 550
},
{
"entropy": 0.5393616110086441,
"epoch": 2.0561272217025257,
"grad_norm": 0.04134383797645569,
"learning_rate": 0.0002,
"loss": 0.5422969460487366,
"mean_token_accuracy": 0.778337299823761,
"num_tokens": 8963159.0,
"step": 551
},
{
"entropy": 0.5272297635674477,
"epoch": 2.059869036482694,
"grad_norm": 0.03908038139343262,
"learning_rate": 0.0002,
"loss": 0.5269819498062134,
"mean_token_accuracy": 0.7861954718828201,
"num_tokens": 8979486.0,
"step": 552
},
{
"entropy": 0.5292486846446991,
"epoch": 2.0636108512628626,
"grad_norm": 0.03547659516334534,
"learning_rate": 0.0002,
"loss": 0.531383752822876,
"mean_token_accuracy": 0.7845012545585632,
"num_tokens": 8995728.0,
"step": 553
},
{
"entropy": 0.537693664431572,
"epoch": 2.067352666043031,
"grad_norm": 0.04505831003189087,
"learning_rate": 0.0002,
"loss": 0.5415912866592407,
"mean_token_accuracy": 0.7810403853654861,
"num_tokens": 9012262.0,
"step": 554
},
{
"entropy": 0.542693018913269,
"epoch": 2.0710944808231995,
"grad_norm": 0.03637455403804779,
"learning_rate": 0.0002,
"loss": 0.5454283356666565,
"mean_token_accuracy": 0.7768286317586899,
"num_tokens": 9028450.0,
"step": 555
},
{
"entropy": 0.5359488427639008,
"epoch": 2.0748362956033675,
"grad_norm": 0.038283299654722214,
"learning_rate": 0.0002,
"loss": 0.5341436266899109,
"mean_token_accuracy": 0.7861706465482712,
"num_tokens": 9044691.0,
"step": 556
},
{
"entropy": 0.5348773896694183,
"epoch": 2.078578110383536,
"grad_norm": 0.038720738142728806,
"learning_rate": 0.0002,
"loss": 0.5340168476104736,
"mean_token_accuracy": 0.7848398089408875,
"num_tokens": 9061090.0,
"step": 557
},
{
"entropy": 0.5301378965377808,
"epoch": 2.0823199251637043,
"grad_norm": 0.03610686585307121,
"learning_rate": 0.0002,
"loss": 0.5331196784973145,
"mean_token_accuracy": 0.7825122624635696,
"num_tokens": 9077457.0,
"step": 558
},
{
"entropy": 0.5627280175685883,
"epoch": 2.086061739943873,
"grad_norm": 0.0459170863032341,
"learning_rate": 0.0002,
"loss": 0.5622618198394775,
"mean_token_accuracy": 0.7731509357690811,
"num_tokens": 9093892.0,
"step": 559
},
{
"entropy": 0.5291252806782722,
"epoch": 2.0898035547240412,
"grad_norm": 0.03501354530453682,
"learning_rate": 0.0002,
"loss": 0.5241326689720154,
"mean_token_accuracy": 0.7903649061918259,
"num_tokens": 9110195.0,
"step": 560
},
{
"entropy": 0.5336360484361649,
"epoch": 2.0935453695042097,
"grad_norm": 0.03297366574406624,
"learning_rate": 0.0002,
"loss": 0.5302354097366333,
"mean_token_accuracy": 0.7871804982423782,
"num_tokens": 9126264.0,
"step": 561
},
{
"entropy": 0.5324128270149231,
"epoch": 2.097287184284378,
"grad_norm": 0.040097158402204514,
"learning_rate": 0.0002,
"loss": 0.5449591875076294,
"mean_token_accuracy": 0.7766915112733841,
"num_tokens": 9142405.0,
"step": 562
},
{
"entropy": 0.5327176600694656,
"epoch": 2.101028999064546,
"grad_norm": 0.03983257710933685,
"learning_rate": 0.0002,
"loss": 0.5427699089050293,
"mean_token_accuracy": 0.780575692653656,
"num_tokens": 9158550.0,
"step": 563
},
{
"entropy": 0.5298762768507004,
"epoch": 2.1047708138447145,
"grad_norm": 0.035936590284109116,
"learning_rate": 0.0002,
"loss": 0.5320777297019958,
"mean_token_accuracy": 0.7820149213075638,
"num_tokens": 9174783.0,
"step": 564
},
{
"entropy": 0.5250122100114822,
"epoch": 2.108512628624883,
"grad_norm": 0.03537021949887276,
"learning_rate": 0.0002,
"loss": 0.5220876932144165,
"mean_token_accuracy": 0.7874044477939606,
"num_tokens": 9190734.0,
"step": 565
},
{
"entropy": 0.5498971343040466,
"epoch": 2.1122544434050514,
"grad_norm": 0.03972788527607918,
"learning_rate": 0.0002,
"loss": 0.5416819453239441,
"mean_token_accuracy": 0.7811024487018585,
"num_tokens": 9207046.0,
"step": 566
},
{
"entropy": 0.5510820746421814,
"epoch": 2.11599625818522,
"grad_norm": 0.03674028813838959,
"learning_rate": 0.0002,
"loss": 0.5430952906608582,
"mean_token_accuracy": 0.7772987484931946,
"num_tokens": 9223541.0,
"step": 567
},
{
"entropy": 0.5243249386548996,
"epoch": 2.1197380729653883,
"grad_norm": 0.03868189826607704,
"learning_rate": 0.0002,
"loss": 0.5305947065353394,
"mean_token_accuracy": 0.7821440249681473,
"num_tokens": 9239944.0,
"step": 568
},
{
"entropy": 0.5186186358332634,
"epoch": 2.1234798877455567,
"grad_norm": 0.03420955687761307,
"learning_rate": 0.0002,
"loss": 0.5219792127609253,
"mean_token_accuracy": 0.787507027387619,
"num_tokens": 9256323.0,
"step": 569
},
{
"entropy": 0.5048380643129349,
"epoch": 2.127221702525725,
"grad_norm": 0.043813057243824005,
"learning_rate": 0.0002,
"loss": 0.511600911617279,
"mean_token_accuracy": 0.7919255346059799,
"num_tokens": 9272250.0,
"step": 570
},
{
"entropy": 0.5333007425069809,
"epoch": 2.130963517305893,
"grad_norm": 0.03591044992208481,
"learning_rate": 0.0002,
"loss": 0.5382859110832214,
"mean_token_accuracy": 0.7790134996175766,
"num_tokens": 9288633.0,
"step": 571
},
{
"entropy": 0.5432953387498856,
"epoch": 2.1347053320860616,
"grad_norm": 0.03850630670785904,
"learning_rate": 0.0002,
"loss": 0.5398726463317871,
"mean_token_accuracy": 0.7803007066249847,
"num_tokens": 9304977.0,
"step": 572
},
{
"entropy": 0.5424948632717133,
"epoch": 2.13844714686623,
"grad_norm": 0.042041826993227005,
"learning_rate": 0.0002,
"loss": 0.5371389389038086,
"mean_token_accuracy": 0.7817080616950989,
"num_tokens": 9321211.0,
"step": 573
},
{
"entropy": 0.5420571565628052,
"epoch": 2.1421889616463985,
"grad_norm": 0.03702463209629059,
"learning_rate": 0.0002,
"loss": 0.5405826568603516,
"mean_token_accuracy": 0.7787773013114929,
"num_tokens": 9337519.0,
"step": 574
},
{
"entropy": 0.5343386083841324,
"epoch": 2.145930776426567,
"grad_norm": 0.0367942713201046,
"learning_rate": 0.0002,
"loss": 0.5343334078788757,
"mean_token_accuracy": 0.7813169211149216,
"num_tokens": 9353930.0,
"step": 575
},
{
"entropy": 0.5107736587524414,
"epoch": 2.1496725912067354,
"grad_norm": 0.04816743731498718,
"learning_rate": 0.0002,
"loss": 0.5181273221969604,
"mean_token_accuracy": 0.790352001786232,
"num_tokens": 9370151.0,
"step": 576
},
{
"entropy": 0.5483916699886322,
"epoch": 2.153414405986904,
"grad_norm": 0.03954138606786728,
"learning_rate": 0.0002,
"loss": 0.5537930130958557,
"mean_token_accuracy": 0.7744487076997757,
"num_tokens": 9386529.0,
"step": 577
},
{
"entropy": 0.5222444832324982,
"epoch": 2.157156220767072,
"grad_norm": 0.04258863255381584,
"learning_rate": 0.0002,
"loss": 0.5331015586853027,
"mean_token_accuracy": 0.7828160971403122,
"num_tokens": 9402702.0,
"step": 578
},
{
"entropy": 0.5395079553127289,
"epoch": 2.1608980355472402,
"grad_norm": 0.036775294691324234,
"learning_rate": 0.0002,
"loss": 0.5392586588859558,
"mean_token_accuracy": 0.7785846441984177,
"num_tokens": 9418983.0,
"step": 579
},
{
"entropy": 0.5308848768472672,
"epoch": 2.1646398503274087,
"grad_norm": 0.041630957275629044,
"learning_rate": 0.0002,
"loss": 0.5223425030708313,
"mean_token_accuracy": 0.7881145030260086,
"num_tokens": 9435130.0,
"step": 580
},
{
"entropy": 0.5460510104894638,
"epoch": 2.168381665107577,
"grad_norm": 0.040873266756534576,
"learning_rate": 0.0002,
"loss": 0.5389937162399292,
"mean_token_accuracy": 0.7796555161476135,
"num_tokens": 9451384.0,
"step": 581
},
{
"entropy": 0.5144870802760124,
"epoch": 2.1721234798877456,
"grad_norm": 0.04395061731338501,
"learning_rate": 0.0002,
"loss": 0.5220937132835388,
"mean_token_accuracy": 0.7867953330278397,
"num_tokens": 9467676.0,
"step": 582
},
{
"entropy": 0.5361004173755646,
"epoch": 2.175865294667914,
"grad_norm": 0.03444032743573189,
"learning_rate": 0.0002,
"loss": 0.5381976962089539,
"mean_token_accuracy": 0.7804248631000519,
"num_tokens": 9484105.0,
"step": 583
},
{
"entropy": 0.5315199941396713,
"epoch": 2.1796071094480824,
"grad_norm": 0.04019028693437576,
"learning_rate": 0.0002,
"loss": 0.538859486579895,
"mean_token_accuracy": 0.7802779376506805,
"num_tokens": 9500441.0,
"step": 584
},
{
"entropy": 0.5049743205308914,
"epoch": 2.183348924228251,
"grad_norm": 0.038020916283130646,
"learning_rate": 0.0002,
"loss": 0.5077824592590332,
"mean_token_accuracy": 0.794673815369606,
"num_tokens": 9516632.0,
"step": 585
},
{
"entropy": 0.542245015501976,
"epoch": 2.187090739008419,
"grad_norm": 0.03803880140185356,
"learning_rate": 0.0002,
"loss": 0.5457203388214111,
"mean_token_accuracy": 0.7765202075242996,
"num_tokens": 9532790.0,
"step": 586
},
{
"entropy": 0.545234277844429,
"epoch": 2.1908325537885873,
"grad_norm": 0.03659515827894211,
"learning_rate": 0.0002,
"loss": 0.5328729748725891,
"mean_token_accuracy": 0.7851473838090897,
"num_tokens": 9549021.0,
"step": 587
},
{
"entropy": 0.5441733747720718,
"epoch": 2.1945743685687558,
"grad_norm": 0.03839794918894768,
"learning_rate": 0.0002,
"loss": 0.541313648223877,
"mean_token_accuracy": 0.7806493043899536,
"num_tokens": 9565414.0,
"step": 588
},
{
"entropy": 0.5392065942287445,
"epoch": 2.198316183348924,
"grad_norm": 0.03657695651054382,
"learning_rate": 0.0002,
"loss": 0.5446825623512268,
"mean_token_accuracy": 0.7759186178445816,
"num_tokens": 9581834.0,
"step": 589
},
{
"entropy": 0.5343391597270966,
"epoch": 2.2020579981290926,
"grad_norm": 0.03904880955815315,
"learning_rate": 0.0002,
"loss": 0.5319048166275024,
"mean_token_accuracy": 0.7858142107725143,
"num_tokens": 9598306.0,
"step": 590
},
{
"entropy": 0.5127864703536034,
"epoch": 2.205799812909261,
"grad_norm": 0.041219562292099,
"learning_rate": 0.0002,
"loss": 0.5198400616645813,
"mean_token_accuracy": 0.7894931733608246,
"num_tokens": 9614512.0,
"step": 591
},
{
"entropy": 0.5380221456289291,
"epoch": 2.2095416276894295,
"grad_norm": 0.03763064742088318,
"learning_rate": 0.0002,
"loss": 0.5350849032402039,
"mean_token_accuracy": 0.779957503080368,
"num_tokens": 9630831.0,
"step": 592
},
{
"entropy": 0.5404982268810272,
"epoch": 2.213283442469598,
"grad_norm": 0.03594009950757027,
"learning_rate": 0.0002,
"loss": 0.5446127653121948,
"mean_token_accuracy": 0.7765700072050095,
"num_tokens": 9647260.0,
"step": 593
},
{
"entropy": 0.5349030494689941,
"epoch": 2.217025257249766,
"grad_norm": 0.039131198078393936,
"learning_rate": 0.0002,
"loss": 0.5407675504684448,
"mean_token_accuracy": 0.7807668596506119,
"num_tokens": 9663454.0,
"step": 594
},
{
"entropy": 0.5357907861471176,
"epoch": 2.2207670720299344,
"grad_norm": 0.03754086792469025,
"learning_rate": 0.0002,
"loss": 0.5390987396240234,
"mean_token_accuracy": 0.7814063429832458,
"num_tokens": 9679665.0,
"step": 595
},
{
"entropy": 0.539327397942543,
"epoch": 2.224508886810103,
"grad_norm": 0.042121171951293945,
"learning_rate": 0.0002,
"loss": 0.5349074006080627,
"mean_token_accuracy": 0.7835494577884674,
"num_tokens": 9695690.0,
"step": 596
},
{
"entropy": 0.5527440309524536,
"epoch": 2.2282507015902713,
"grad_norm": 0.034759730100631714,
"learning_rate": 0.0002,
"loss": 0.546990156173706,
"mean_token_accuracy": 0.7748693376779556,
"num_tokens": 9711925.0,
"step": 597
},
{
"entropy": 0.5339156091213226,
"epoch": 2.2319925163704397,
"grad_norm": 0.03824164718389511,
"learning_rate": 0.0002,
"loss": 0.5315659642219543,
"mean_token_accuracy": 0.7847660332918167,
"num_tokens": 9728568.0,
"step": 598
},
{
"entropy": 0.5418261140584946,
"epoch": 2.235734331150608,
"grad_norm": 0.03952635079622269,
"learning_rate": 0.0002,
"loss": 0.5444273948669434,
"mean_token_accuracy": 0.7786458134651184,
"num_tokens": 9744937.0,
"step": 599
},
{
"entropy": 0.5325147211551666,
"epoch": 2.2394761459307766,
"grad_norm": 0.038507163524627686,
"learning_rate": 0.0002,
"loss": 0.538148045539856,
"mean_token_accuracy": 0.7803481221199036,
"num_tokens": 9761521.0,
"step": 600
},
{
"entropy": 0.5348295122385025,
"epoch": 2.243217960710945,
"grad_norm": 0.035764180123806,
"learning_rate": 0.0002,
"loss": 0.5350884199142456,
"mean_token_accuracy": 0.7832496911287308,
"num_tokens": 9777702.0,
"step": 601
},
{
"entropy": 0.549017146229744,
"epoch": 2.246959775491113,
"grad_norm": 0.037822045385837555,
"learning_rate": 0.0002,
"loss": 0.5440195798873901,
"mean_token_accuracy": 0.7799560874700546,
"num_tokens": 9794070.0,
"step": 602
},
{
"entropy": 0.5402355939149857,
"epoch": 2.2507015902712815,
"grad_norm": 0.04137027636170387,
"learning_rate": 0.0002,
"loss": 0.552240788936615,
"mean_token_accuracy": 0.7787455171346664,
"num_tokens": 9810307.0,
"step": 603
},
{
"entropy": 0.5575389862060547,
"epoch": 2.25444340505145,
"grad_norm": 0.03639021888375282,
"learning_rate": 0.0002,
"loss": 0.555095911026001,
"mean_token_accuracy": 0.7715982496738434,
"num_tokens": 9826944.0,
"step": 604
},
{
"entropy": 0.5453804582357407,
"epoch": 2.2581852198316184,
"grad_norm": 0.0329916886985302,
"learning_rate": 0.0002,
"loss": 0.5451047420501709,
"mean_token_accuracy": 0.778001993894577,
"num_tokens": 9843174.0,
"step": 605
},
{
"entropy": 0.5351513028144836,
"epoch": 2.261927034611787,
"grad_norm": 0.04027882218360901,
"learning_rate": 0.0002,
"loss": 0.5335583686828613,
"mean_token_accuracy": 0.7831520736217499,
"num_tokens": 9859568.0,
"step": 606
},
{
"entropy": 0.5303051620721817,
"epoch": 2.2656688493919552,
"grad_norm": 0.037942592054605484,
"learning_rate": 0.0002,
"loss": 0.5293945670127869,
"mean_token_accuracy": 0.7875201851129532,
"num_tokens": 9876127.0,
"step": 607
},
{
"entropy": 0.5205637887120247,
"epoch": 2.2694106641721237,
"grad_norm": 0.039965420961380005,
"learning_rate": 0.0002,
"loss": 0.5284023284912109,
"mean_token_accuracy": 0.7851175218820572,
"num_tokens": 9892336.0,
"step": 608
},
{
"entropy": 0.5270423293113708,
"epoch": 2.2731524789522917,
"grad_norm": 0.045534420758485794,
"learning_rate": 0.0002,
"loss": 0.5361034274101257,
"mean_token_accuracy": 0.7813378870487213,
"num_tokens": 9908677.0,
"step": 609
},
{
"entropy": 0.5461472570896149,
"epoch": 2.27689429373246,
"grad_norm": 0.03911803662776947,
"learning_rate": 0.0002,
"loss": 0.5419346690177917,
"mean_token_accuracy": 0.7793000787496567,
"num_tokens": 9925188.0,
"step": 610
},
{
"entropy": 0.5332899391651154,
"epoch": 2.2806361085126285,
"grad_norm": 0.03753461316227913,
"learning_rate": 0.0002,
"loss": 0.5261275172233582,
"mean_token_accuracy": 0.7856169193983078,
"num_tokens": 9941232.0,
"step": 611
},
{
"entropy": 0.5298324078321457,
"epoch": 2.284377923292797,
"grad_norm": 0.03578303009271622,
"learning_rate": 0.0002,
"loss": 0.525759220123291,
"mean_token_accuracy": 0.7869399040937424,
"num_tokens": 9957312.0,
"step": 612
},
{
"entropy": 0.5350215286016464,
"epoch": 2.2881197380729654,
"grad_norm": 0.04014569893479347,
"learning_rate": 0.0002,
"loss": 0.5390491485595703,
"mean_token_accuracy": 0.7834457159042358,
"num_tokens": 9973629.0,
"step": 613
},
{
"entropy": 0.5366346836090088,
"epoch": 2.291861552853134,
"grad_norm": 0.03635207563638687,
"learning_rate": 0.0002,
"loss": 0.5361836552619934,
"mean_token_accuracy": 0.7822949439287186,
"num_tokens": 9990003.0,
"step": 614
},
{
"entropy": 0.5358218550682068,
"epoch": 2.2956033676333023,
"grad_norm": 0.04499870166182518,
"learning_rate": 0.0002,
"loss": 0.5433334708213806,
"mean_token_accuracy": 0.781024381518364,
"num_tokens": 10006594.0,
"step": 615
},
{
"entropy": 0.5238985568284988,
"epoch": 2.2993451824134707,
"grad_norm": 0.041404612362384796,
"learning_rate": 0.0002,
"loss": 0.5319328308105469,
"mean_token_accuracy": 0.7816060185432434,
"num_tokens": 10022841.0,
"step": 616
},
{
"entropy": 0.5418704599142075,
"epoch": 2.3030869971936387,
"grad_norm": 0.03798811510205269,
"learning_rate": 0.0002,
"loss": 0.5385047793388367,
"mean_token_accuracy": 0.781515583395958,
"num_tokens": 10039191.0,
"step": 617
},
{
"entropy": 0.5519637167453766,
"epoch": 2.306828811973807,
"grad_norm": 0.03714706003665924,
"learning_rate": 0.0002,
"loss": 0.5444304347038269,
"mean_token_accuracy": 0.779953271150589,
"num_tokens": 10055793.0,
"step": 618
},
{
"entropy": 0.5363687425851822,
"epoch": 2.3105706267539756,
"grad_norm": 0.0435946062207222,
"learning_rate": 0.0002,
"loss": 0.538260817527771,
"mean_token_accuracy": 0.7822400480508804,
"num_tokens": 10072406.0,
"step": 619
},
{
"entropy": 0.5363148003816605,
"epoch": 2.314312441534144,
"grad_norm": 0.03934507444500923,
"learning_rate": 0.0002,
"loss": 0.5490261316299438,
"mean_token_accuracy": 0.7775698453187943,
"num_tokens": 10088893.0,
"step": 620
},
{
"entropy": 0.5337411910295486,
"epoch": 2.3180542563143125,
"grad_norm": 0.040114130824804306,
"learning_rate": 0.0002,
"loss": 0.5454047322273254,
"mean_token_accuracy": 0.7799661755561829,
"num_tokens": 10105348.0,
"step": 621
},
{
"entropy": 0.5429546684026718,
"epoch": 2.321796071094481,
"grad_norm": 0.04296046867966652,
"learning_rate": 0.0002,
"loss": 0.543846070766449,
"mean_token_accuracy": 0.7779647558927536,
"num_tokens": 10121753.0,
"step": 622
},
{
"entropy": 0.5331653952598572,
"epoch": 2.3255378858746494,
"grad_norm": 0.03862839564681053,
"learning_rate": 0.0002,
"loss": 0.5329957008361816,
"mean_token_accuracy": 0.7838963121175766,
"num_tokens": 10138069.0,
"step": 623
},
{
"entropy": 0.5332556366920471,
"epoch": 2.3292797006548174,
"grad_norm": 0.03637029603123665,
"learning_rate": 0.0002,
"loss": 0.5306488871574402,
"mean_token_accuracy": 0.7843363881111145,
"num_tokens": 10154386.0,
"step": 624
},
{
"entropy": 0.5389147102832794,
"epoch": 2.333021515434986,
"grad_norm": 0.04242001101374626,
"learning_rate": 0.0002,
"loss": 0.5379246473312378,
"mean_token_accuracy": 0.7805036455392838,
"num_tokens": 10170602.0,
"step": 625
},
{
"entropy": 0.529606968164444,
"epoch": 2.3367633302151543,
"grad_norm": 0.04366292059421539,
"learning_rate": 0.0002,
"loss": 0.5345982909202576,
"mean_token_accuracy": 0.7849325835704803,
"num_tokens": 10186681.0,
"step": 626
},
{
"entropy": 0.5343451648950577,
"epoch": 2.3405051449953227,
"grad_norm": 0.04901853948831558,
"learning_rate": 0.0002,
"loss": 0.5390074253082275,
"mean_token_accuracy": 0.7809460461139679,
"num_tokens": 10202735.0,
"step": 627
},
{
"entropy": 0.5364287346601486,
"epoch": 2.344246959775491,
"grad_norm": 0.03992681950330734,
"learning_rate": 0.0002,
"loss": 0.5428602695465088,
"mean_token_accuracy": 0.7803080379962921,
"num_tokens": 10219104.0,
"step": 628
},
{
"entropy": 0.5363292992115021,
"epoch": 2.3479887745556596,
"grad_norm": 0.04561900347471237,
"learning_rate": 0.0002,
"loss": 0.5422950983047485,
"mean_token_accuracy": 0.7803726643323898,
"num_tokens": 10235450.0,
"step": 629
},
{
"entropy": 0.5503382086753845,
"epoch": 2.351730589335828,
"grad_norm": 0.036633238196372986,
"learning_rate": 0.0002,
"loss": 0.5429909229278564,
"mean_token_accuracy": 0.777814120054245,
"num_tokens": 10251744.0,
"step": 630
},
{
"entropy": 0.5556712299585342,
"epoch": 2.3554724041159965,
"grad_norm": 0.03755469620227814,
"learning_rate": 0.0002,
"loss": 0.5372464060783386,
"mean_token_accuracy": 0.7816385924816132,
"num_tokens": 10268228.0,
"step": 631
},
{
"entropy": 0.54240882396698,
"epoch": 2.3592142188961645,
"grad_norm": 0.04244554787874222,
"learning_rate": 0.0002,
"loss": 0.5416730046272278,
"mean_token_accuracy": 0.7805517017841339,
"num_tokens": 10284594.0,
"step": 632
},
{
"entropy": 0.5457853078842163,
"epoch": 2.362956033676333,
"grad_norm": 0.03768390789628029,
"learning_rate": 0.0002,
"loss": 0.5503990054130554,
"mean_token_accuracy": 0.7760391384363174,
"num_tokens": 10300645.0,
"step": 633
},
{
"entropy": 0.5061568543314934,
"epoch": 2.3666978484565013,
"grad_norm": 0.04066069424152374,
"learning_rate": 0.0002,
"loss": 0.5147897601127625,
"mean_token_accuracy": 0.7923619449138641,
"num_tokens": 10317035.0,
"step": 634
},
{
"entropy": 0.5265238285064697,
"epoch": 2.3704396632366698,
"grad_norm": 0.045070137828588486,
"learning_rate": 0.0002,
"loss": 0.5342065691947937,
"mean_token_accuracy": 0.7828978300094604,
"num_tokens": 10333097.0,
"step": 635
},
{
"entropy": 0.5213058292865753,
"epoch": 2.374181478016838,
"grad_norm": 0.04251949489116669,
"learning_rate": 0.0002,
"loss": 0.5242940783500671,
"mean_token_accuracy": 0.7875875681638718,
"num_tokens": 10349477.0,
"step": 636
},
{
"entropy": 0.532469779253006,
"epoch": 2.3779232927970066,
"grad_norm": 0.04180033504962921,
"learning_rate": 0.0002,
"loss": 0.5338732600212097,
"mean_token_accuracy": 0.7874448299407959,
"num_tokens": 10365855.0,
"step": 637
},
{
"entropy": 0.5583899617195129,
"epoch": 2.381665107577175,
"grad_norm": 0.036461617797613144,
"learning_rate": 0.0002,
"loss": 0.5522404313087463,
"mean_token_accuracy": 0.7765318900346756,
"num_tokens": 10382454.0,
"step": 638
},
{
"entropy": 0.5361616462469101,
"epoch": 2.385406922357343,
"grad_norm": 0.03820829838514328,
"learning_rate": 0.0002,
"loss": 0.5331661701202393,
"mean_token_accuracy": 0.7812754958868027,
"num_tokens": 10398570.0,
"step": 639
},
{
"entropy": 0.5388377606868744,
"epoch": 2.3891487371375115,
"grad_norm": 0.03890148177742958,
"learning_rate": 0.0002,
"loss": 0.535783052444458,
"mean_token_accuracy": 0.7837421149015427,
"num_tokens": 10415136.0,
"step": 640
},
{
"entropy": 0.5403297692537308,
"epoch": 2.39289055191768,
"grad_norm": 0.037266530096530914,
"learning_rate": 0.0002,
"loss": 0.5458592176437378,
"mean_token_accuracy": 0.7799215018749237,
"num_tokens": 10431595.0,
"step": 641
},
{
"entropy": 0.5327188819646835,
"epoch": 2.3966323666978484,
"grad_norm": 0.04411016404628754,
"learning_rate": 0.0002,
"loss": 0.5372153520584106,
"mean_token_accuracy": 0.7820907682180405,
"num_tokens": 10448092.0,
"step": 642
},
{
"entropy": 0.5483715236186981,
"epoch": 2.400374181478017,
"grad_norm": 0.03909829258918762,
"learning_rate": 0.0002,
"loss": 0.5454411506652832,
"mean_token_accuracy": 0.781398132443428,
"num_tokens": 10464267.0,
"step": 643
},
{
"entropy": 0.5467081367969513,
"epoch": 2.4041159962581853,
"grad_norm": 0.04295220598578453,
"learning_rate": 0.0002,
"loss": 0.5442530512809753,
"mean_token_accuracy": 0.7759910225868225,
"num_tokens": 10480622.0,
"step": 644
},
{
"entropy": 0.545724093914032,
"epoch": 2.4078578110383537,
"grad_norm": 0.04099191352725029,
"learning_rate": 0.0002,
"loss": 0.5471324324607849,
"mean_token_accuracy": 0.7780001610517502,
"num_tokens": 10497093.0,
"step": 645
},
{
"entropy": 0.5526789277791977,
"epoch": 2.411599625818522,
"grad_norm": 0.03481397032737732,
"learning_rate": 0.0002,
"loss": 0.5524189472198486,
"mean_token_accuracy": 0.7738725692033768,
"num_tokens": 10513288.0,
"step": 646
},
{
"entropy": 0.5496002286672592,
"epoch": 2.4153414405986906,
"grad_norm": 0.04474830627441406,
"learning_rate": 0.0002,
"loss": 0.5568821430206299,
"mean_token_accuracy": 0.7747314423322678,
"num_tokens": 10529966.0,
"step": 647
},
{
"entropy": 0.5191539749503136,
"epoch": 2.4190832553788586,
"grad_norm": 0.04506181180477142,
"learning_rate": 0.0002,
"loss": 0.5247750878334045,
"mean_token_accuracy": 0.7888272404670715,
"num_tokens": 10546217.0,
"step": 648
},
{
"entropy": 0.5462011098861694,
"epoch": 2.422825070159027,
"grad_norm": 0.03946157172322273,
"learning_rate": 0.0002,
"loss": 0.5449219942092896,
"mean_token_accuracy": 0.7763949930667877,
"num_tokens": 10562587.0,
"step": 649
},
{
"entropy": 0.5374903529882431,
"epoch": 2.4265668849391955,
"grad_norm": 0.035694316029548645,
"learning_rate": 0.0002,
"loss": 0.5298718214035034,
"mean_token_accuracy": 0.7844248116016388,
"num_tokens": 10578673.0,
"step": 650
},
{
"entropy": 0.5490742027759552,
"epoch": 2.430308699719364,
"grad_norm": 0.040128957480192184,
"learning_rate": 0.0002,
"loss": 0.5476623773574829,
"mean_token_accuracy": 0.7761844098567963,
"num_tokens": 10594904.0,
"step": 651
},
{
"entropy": 0.5350600033998489,
"epoch": 2.4340505144995324,
"grad_norm": 0.04965779185295105,
"learning_rate": 0.0002,
"loss": 0.5467137694358826,
"mean_token_accuracy": 0.7777107208967209,
"num_tokens": 10611301.0,
"step": 652
},
{
"entropy": 0.5389928370714188,
"epoch": 2.437792329279701,
"grad_norm": 0.038716454058885574,
"learning_rate": 0.0002,
"loss": 0.5406030416488647,
"mean_token_accuracy": 0.7798842638731003,
"num_tokens": 10627924.0,
"step": 653
},
{
"entropy": 0.5396043509244919,
"epoch": 2.441534144059869,
"grad_norm": 0.04796689748764038,
"learning_rate": 0.0002,
"loss": 0.5485687255859375,
"mean_token_accuracy": 0.7767132520675659,
"num_tokens": 10643995.0,
"step": 654
},
{
"entropy": 0.5651813000440598,
"epoch": 2.4452759588400372,
"grad_norm": 0.03899235278367996,
"learning_rate": 0.0002,
"loss": 0.5558621883392334,
"mean_token_accuracy": 0.7751055210828781,
"num_tokens": 10660611.0,
"step": 655
},
{
"entropy": 0.5467101633548737,
"epoch": 2.4490177736202057,
"grad_norm": 0.041317425668239594,
"learning_rate": 0.0002,
"loss": 0.544463574886322,
"mean_token_accuracy": 0.7791299223899841,
"num_tokens": 10676939.0,
"step": 656
},
{
"entropy": 0.5405649244785309,
"epoch": 2.452759588400374,
"grad_norm": 0.03767058625817299,
"learning_rate": 0.0002,
"loss": 0.5359505414962769,
"mean_token_accuracy": 0.7838631421327591,
"num_tokens": 10693242.0,
"step": 657
},
{
"entropy": 0.5295758992433548,
"epoch": 2.4565014031805426,
"grad_norm": 0.03993664309382439,
"learning_rate": 0.0002,
"loss": 0.5338568091392517,
"mean_token_accuracy": 0.7815168350934982,
"num_tokens": 10709228.0,
"step": 658
},
{
"entropy": 0.5318661481142044,
"epoch": 2.460243217960711,
"grad_norm": 0.04673660546541214,
"learning_rate": 0.0002,
"loss": 0.5387503504753113,
"mean_token_accuracy": 0.7823595702648163,
"num_tokens": 10725743.0,
"step": 659
},
{
"entropy": 0.5362888127565384,
"epoch": 2.4639850327408794,
"grad_norm": 0.0443369522690773,
"learning_rate": 0.0002,
"loss": 0.5374599099159241,
"mean_token_accuracy": 0.7816221117973328,
"num_tokens": 10742450.0,
"step": 660
},
{
"entropy": 0.5324875563383102,
"epoch": 2.467726847521048,
"grad_norm": 0.037758708000183105,
"learning_rate": 0.0002,
"loss": 0.5326871871948242,
"mean_token_accuracy": 0.7862564772367477,
"num_tokens": 10758610.0,
"step": 661
},
{
"entropy": 0.5277500152587891,
"epoch": 2.4714686623012163,
"grad_norm": 0.042098864912986755,
"learning_rate": 0.0002,
"loss": 0.5331279635429382,
"mean_token_accuracy": 0.7840241938829422,
"num_tokens": 10774701.0,
"step": 662
},
{
"entropy": 0.5366615355014801,
"epoch": 2.4752104770813843,
"grad_norm": 0.040946412831544876,
"learning_rate": 0.0002,
"loss": 0.5397564768791199,
"mean_token_accuracy": 0.7829322069883347,
"num_tokens": 10790740.0,
"step": 663
},
{
"entropy": 0.5435209423303604,
"epoch": 2.4789522918615527,
"grad_norm": 0.04173668473958969,
"learning_rate": 0.0002,
"loss": 0.5457897186279297,
"mean_token_accuracy": 0.7782775014638901,
"num_tokens": 10806903.0,
"step": 664
},
{
"entropy": 0.5472803115844727,
"epoch": 2.482694106641721,
"grad_norm": 0.040667202323675156,
"learning_rate": 0.0002,
"loss": 0.5462859869003296,
"mean_token_accuracy": 0.7769711166620255,
"num_tokens": 10823042.0,
"step": 665
},
{
"entropy": 0.5469382554292679,
"epoch": 2.4864359214218896,
"grad_norm": 0.04248496890068054,
"learning_rate": 0.0002,
"loss": 0.5395170450210571,
"mean_token_accuracy": 0.7798823863267899,
"num_tokens": 10839340.0,
"step": 666
},
{
"entropy": 0.5202000439167023,
"epoch": 2.490177736202058,
"grad_norm": 0.03368566930294037,
"learning_rate": 0.0002,
"loss": 0.5234949588775635,
"mean_token_accuracy": 0.786568820476532,
"num_tokens": 10855502.0,
"step": 667
},
{
"entropy": 0.5273594409227371,
"epoch": 2.4939195509822265,
"grad_norm": 0.04516978561878204,
"learning_rate": 0.0002,
"loss": 0.5360161066055298,
"mean_token_accuracy": 0.7853840887546539,
"num_tokens": 10871840.0,
"step": 668
},
{
"entropy": 0.5393954515457153,
"epoch": 2.497661365762395,
"grad_norm": 0.03674040734767914,
"learning_rate": 0.0002,
"loss": 0.5378697514533997,
"mean_token_accuracy": 0.7824258059263229,
"num_tokens": 10888120.0,
"step": 669
},
{
"entropy": 0.5479197651147842,
"epoch": 2.501403180542563,
"grad_norm": 0.03727351129055023,
"learning_rate": 0.0002,
"loss": 0.5392875671386719,
"mean_token_accuracy": 0.7811300158500671,
"num_tokens": 10904483.0,
"step": 670
},
{
"entropy": 0.552995502948761,
"epoch": 2.5051449953227314,
"grad_norm": 0.036775074899196625,
"learning_rate": 0.0002,
"loss": 0.5475963950157166,
"mean_token_accuracy": 0.7784164547920227,
"num_tokens": 10920853.0,
"step": 671
},
{
"entropy": 0.5446810871362686,
"epoch": 2.5088868101029,
"grad_norm": 0.038499053567647934,
"learning_rate": 0.0002,
"loss": 0.5511402487754822,
"mean_token_accuracy": 0.7761510908603668,
"num_tokens": 10937231.0,
"step": 672
},
{
"entropy": 0.5175495520234108,
"epoch": 2.5126286248830683,
"grad_norm": 0.039775073528289795,
"learning_rate": 0.0002,
"loss": 0.5242205858230591,
"mean_token_accuracy": 0.7848553359508514,
"num_tokens": 10953429.0,
"step": 673
},
{
"entropy": 0.5237327665090561,
"epoch": 2.5163704396632367,
"grad_norm": 0.04171684384346008,
"learning_rate": 0.0002,
"loss": 0.5307218432426453,
"mean_token_accuracy": 0.7838338315486908,
"num_tokens": 10969808.0,
"step": 674
},
{
"entropy": 0.5405460149049759,
"epoch": 2.520112254443405,
"grad_norm": 0.04240800440311432,
"learning_rate": 0.0002,
"loss": 0.5408159494400024,
"mean_token_accuracy": 0.7787611186504364,
"num_tokens": 10986049.0,
"step": 675
},
{
"entropy": 0.5486787706613541,
"epoch": 2.5238540692235736,
"grad_norm": 0.039784692227840424,
"learning_rate": 0.0002,
"loss": 0.5455769896507263,
"mean_token_accuracy": 0.7784162014722824,
"num_tokens": 11002254.0,
"step": 676
},
{
"entropy": 0.5363409966230392,
"epoch": 2.527595884003742,
"grad_norm": 0.03736806660890579,
"learning_rate": 0.0002,
"loss": 0.5266451239585876,
"mean_token_accuracy": 0.7866665124893188,
"num_tokens": 11018914.0,
"step": 677
},
{
"entropy": 0.5279175043106079,
"epoch": 2.5313376987839105,
"grad_norm": 0.035363830626010895,
"learning_rate": 0.0002,
"loss": 0.5288829207420349,
"mean_token_accuracy": 0.7874743491411209,
"num_tokens": 11034952.0,
"step": 678
},
{
"entropy": 0.5376022309064865,
"epoch": 2.5350795135640785,
"grad_norm": 0.051831189543008804,
"learning_rate": 0.0002,
"loss": 0.5518858432769775,
"mean_token_accuracy": 0.7750970423221588,
"num_tokens": 11051172.0,
"step": 679
},
{
"entropy": 0.5426171720027924,
"epoch": 2.538821328344247,
"grad_norm": 0.04189771041274071,
"learning_rate": 0.0002,
"loss": 0.5544742345809937,
"mean_token_accuracy": 0.7774394005537033,
"num_tokens": 11067538.0,
"step": 680
},
{
"entropy": 0.5293037593364716,
"epoch": 2.5425631431244153,
"grad_norm": 0.04074425622820854,
"learning_rate": 0.0002,
"loss": 0.5310404896736145,
"mean_token_accuracy": 0.7826415598392487,
"num_tokens": 11083927.0,
"step": 681
},
{
"entropy": 0.5473333150148392,
"epoch": 2.5463049579045838,
"grad_norm": 0.03279516100883484,
"learning_rate": 0.0002,
"loss": 0.5383847951889038,
"mean_token_accuracy": 0.7836183458566666,
"num_tokens": 11100675.0,
"step": 682
},
{
"entropy": 0.5422270894050598,
"epoch": 2.550046772684752,
"grad_norm": 0.039768971502780914,
"learning_rate": 0.0002,
"loss": 0.543849766254425,
"mean_token_accuracy": 0.7796186804771423,
"num_tokens": 11116748.0,
"step": 683
},
{
"entropy": 0.5384610444307327,
"epoch": 2.55378858746492,
"grad_norm": 0.037385329604148865,
"learning_rate": 0.0002,
"loss": 0.54084312915802,
"mean_token_accuracy": 0.7830232381820679,
"num_tokens": 11133051.0,
"step": 684
},
{
"entropy": 0.5261296629905701,
"epoch": 2.5575304022450887,
"grad_norm": 0.039306074380874634,
"learning_rate": 0.0002,
"loss": 0.531363844871521,
"mean_token_accuracy": 0.785315752029419,
"num_tokens": 11149362.0,
"step": 685
},
{
"entropy": 0.5491520762443542,
"epoch": 2.561272217025257,
"grad_norm": 0.04143069311976433,
"learning_rate": 0.0002,
"loss": 0.5444177389144897,
"mean_token_accuracy": 0.7807131856679916,
"num_tokens": 11165746.0,
"step": 686
},
{
"entropy": 0.53914874792099,
"epoch": 2.5650140318054255,
"grad_norm": 0.03408098593354225,
"learning_rate": 0.0002,
"loss": 0.5294961929321289,
"mean_token_accuracy": 0.7870545238256454,
"num_tokens": 11182138.0,
"step": 687
},
{
"entropy": 0.5346123576164246,
"epoch": 2.568755846585594,
"grad_norm": 0.04301401227712631,
"learning_rate": 0.0002,
"loss": 0.5353041887283325,
"mean_token_accuracy": 0.784915953874588,
"num_tokens": 11198330.0,
"step": 688
},
{
"entropy": 0.5318583697080612,
"epoch": 2.5724976613657624,
"grad_norm": 0.04231448844075203,
"learning_rate": 0.0002,
"loss": 0.5399123430252075,
"mean_token_accuracy": 0.7802146077156067,
"num_tokens": 11214613.0,
"step": 689
},
{
"entropy": 0.5280211716890335,
"epoch": 2.576239476145931,
"grad_norm": 0.04549930989742279,
"learning_rate": 0.0002,
"loss": 0.5432953238487244,
"mean_token_accuracy": 0.777678519487381,
"num_tokens": 11230987.0,
"step": 690
},
{
"entropy": 0.5567438304424286,
"epoch": 2.5799812909260993,
"grad_norm": 0.03926197439432144,
"learning_rate": 0.0002,
"loss": 0.5588645339012146,
"mean_token_accuracy": 0.7713411450386047,
"num_tokens": 11247503.0,
"step": 691
},
{
"entropy": 0.542352095246315,
"epoch": 2.5837231057062677,
"grad_norm": 0.035485655069351196,
"learning_rate": 0.0002,
"loss": 0.5354308485984802,
"mean_token_accuracy": 0.7822972387075424,
"num_tokens": 11263949.0,
"step": 692
},
{
"entropy": 0.5373577028512955,
"epoch": 2.587464920486436,
"grad_norm": 0.04045470058917999,
"learning_rate": 0.0002,
"loss": 0.524779200553894,
"mean_token_accuracy": 0.785191684961319,
"num_tokens": 11280345.0,
"step": 693
},
{
"entropy": 0.5388759598135948,
"epoch": 2.591206735266604,
"grad_norm": 0.03759071230888367,
"learning_rate": 0.0002,
"loss": 0.5312530994415283,
"mean_token_accuracy": 0.7809051126241684,
"num_tokens": 11296587.0,
"step": 694
},
{
"entropy": 0.5210207849740982,
"epoch": 2.5949485500467726,
"grad_norm": 0.03664049878716469,
"learning_rate": 0.0002,
"loss": 0.526019275188446,
"mean_token_accuracy": 0.7867360413074493,
"num_tokens": 11313101.0,
"step": 695
},
{
"entropy": 0.5182994976639748,
"epoch": 2.598690364826941,
"grad_norm": 0.05368485301733017,
"learning_rate": 0.0002,
"loss": 0.5354053974151611,
"mean_token_accuracy": 0.7826909422874451,
"num_tokens": 11329367.0,
"step": 696
},
{
"entropy": 0.5452821850776672,
"epoch": 2.6024321796071095,
"grad_norm": 0.04641703888773918,
"learning_rate": 0.0002,
"loss": 0.5546022057533264,
"mean_token_accuracy": 0.7768976241350174,
"num_tokens": 11345547.0,
"step": 697
},
{
"entropy": 0.5391091257333755,
"epoch": 2.606173994387278,
"grad_norm": 0.04271511733531952,
"learning_rate": 0.0002,
"loss": 0.541153073310852,
"mean_token_accuracy": 0.7804041355848312,
"num_tokens": 11361574.0,
"step": 698
},
{
"entropy": 0.5462173670530319,
"epoch": 2.6099158091674464,
"grad_norm": 0.03939999267458916,
"learning_rate": 0.0002,
"loss": 0.5369886159896851,
"mean_token_accuracy": 0.7804831266403198,
"num_tokens": 11377812.0,
"step": 699
},
{
"entropy": 0.5714237540960312,
"epoch": 2.6136576239476144,
"grad_norm": 0.03745459020137787,
"learning_rate": 0.0002,
"loss": 0.5620177984237671,
"mean_token_accuracy": 0.7719487398862839,
"num_tokens": 11394403.0,
"step": 700
},
{
"entropy": 0.5377793908119202,
"epoch": 2.617399438727783,
"grad_norm": 0.03732477128505707,
"learning_rate": 0.0002,
"loss": 0.5375291109085083,
"mean_token_accuracy": 0.7813573330640793,
"num_tokens": 11410706.0,
"step": 701
},
{
"entropy": 0.5385070145130157,
"epoch": 2.6211412535079512,
"grad_norm": 0.04680998623371124,
"learning_rate": 0.0002,
"loss": 0.5455629825592041,
"mean_token_accuracy": 0.776125431060791,
"num_tokens": 11427143.0,
"step": 702
},
{
"entropy": 0.5411592125892639,
"epoch": 2.6248830682881197,
"grad_norm": 0.037070900201797485,
"learning_rate": 0.0002,
"loss": 0.5470774173736572,
"mean_token_accuracy": 0.7772253155708313,
"num_tokens": 11443536.0,
"step": 703
},
{
"entropy": 0.5268983989953995,
"epoch": 2.628624883068288,
"grad_norm": 0.04107747972011566,
"learning_rate": 0.0002,
"loss": 0.5320890545845032,
"mean_token_accuracy": 0.7819889187812805,
"num_tokens": 11459635.0,
"step": 704
},
{
"entropy": 0.5278744846582413,
"epoch": 2.6323666978484566,
"grad_norm": 0.03608566150069237,
"learning_rate": 0.0002,
"loss": 0.5288647413253784,
"mean_token_accuracy": 0.7842333018779755,
"num_tokens": 11476037.0,
"step": 705
},
{
"entropy": 0.5504002794623375,
"epoch": 2.636108512628625,
"grad_norm": 0.041055019944906235,
"learning_rate": 0.0002,
"loss": 0.5523802638053894,
"mean_token_accuracy": 0.7737344652414322,
"num_tokens": 11492344.0,
"step": 706
},
{
"entropy": 0.541622132062912,
"epoch": 2.6398503274087934,
"grad_norm": 0.03790360316634178,
"learning_rate": 0.0002,
"loss": 0.5410860776901245,
"mean_token_accuracy": 0.7775967717170715,
"num_tokens": 11508715.0,
"step": 707
},
{
"entropy": 0.53721022605896,
"epoch": 2.643592142188962,
"grad_norm": 0.048964016139507294,
"learning_rate": 0.0002,
"loss": 0.5369323492050171,
"mean_token_accuracy": 0.7816558331251144,
"num_tokens": 11525153.0,
"step": 708
},
{
"entropy": 0.5321754217147827,
"epoch": 2.64733395696913,
"grad_norm": 0.048466358333826065,
"learning_rate": 0.0002,
"loss": 0.5365191698074341,
"mean_token_accuracy": 0.7804320156574249,
"num_tokens": 11541270.0,
"step": 709
},
{
"entropy": 0.5573434978723526,
"epoch": 2.6510757717492983,
"grad_norm": 0.045038264244794846,
"learning_rate": 0.0002,
"loss": 0.5563772320747375,
"mean_token_accuracy": 0.7737798243761063,
"num_tokens": 11557694.0,
"step": 710
},
{
"entropy": 0.5524247735738754,
"epoch": 2.6548175865294668,
"grad_norm": 0.038673996925354004,
"learning_rate": 0.0002,
"loss": 0.5518113970756531,
"mean_token_accuracy": 0.7768261432647705,
"num_tokens": 11574308.0,
"step": 711
},
{
"entropy": 0.5358691960573196,
"epoch": 2.658559401309635,
"grad_norm": 0.03978041559457779,
"learning_rate": 0.0002,
"loss": 0.5338990688323975,
"mean_token_accuracy": 0.7842043936252594,
"num_tokens": 11590586.0,
"step": 712
},
{
"entropy": 0.5332267433404922,
"epoch": 2.6623012160898036,
"grad_norm": 0.03574821725487709,
"learning_rate": 0.0002,
"loss": 0.5405697822570801,
"mean_token_accuracy": 0.7808981388807297,
"num_tokens": 11606867.0,
"step": 713
},
{
"entropy": 0.5254797339439392,
"epoch": 2.666043030869972,
"grad_norm": 0.040162764489650726,
"learning_rate": 0.0002,
"loss": 0.5316233038902283,
"mean_token_accuracy": 0.7839036136865616,
"num_tokens": 11623321.0,
"step": 714
},
{
"entropy": 0.5194612145423889,
"epoch": 2.66978484565014,
"grad_norm": 0.0536888912320137,
"learning_rate": 0.0002,
"loss": 0.5308873057365417,
"mean_token_accuracy": 0.7844232022762299,
"num_tokens": 11639616.0,
"step": 715
},
{
"entropy": 0.5397140085697174,
"epoch": 2.6735266604303085,
"grad_norm": 0.034708283841609955,
"learning_rate": 0.0002,
"loss": 0.5418391227722168,
"mean_token_accuracy": 0.7771459370851517,
"num_tokens": 11655924.0,
"step": 716
},
{
"entropy": 0.5523687899112701,
"epoch": 2.677268475210477,
"grad_norm": 0.03549209609627724,
"learning_rate": 0.0002,
"loss": 0.5451604127883911,
"mean_token_accuracy": 0.7780284285545349,
"num_tokens": 11672448.0,
"step": 717
},
{
"entropy": 0.5573620796203613,
"epoch": 2.6810102899906454,
"grad_norm": 0.03517598658800125,
"learning_rate": 0.0002,
"loss": 0.5482261180877686,
"mean_token_accuracy": 0.7732254415750504,
"num_tokens": 11688985.0,
"step": 718
},
{
"entropy": 0.5521951466798782,
"epoch": 2.684752104770814,
"grad_norm": 0.03560207411646843,
"learning_rate": 0.0002,
"loss": 0.5395568609237671,
"mean_token_accuracy": 0.7822758108377457,
"num_tokens": 11705608.0,
"step": 719
},
{
"entropy": 0.5614044666290283,
"epoch": 2.6884939195509823,
"grad_norm": 0.04236432537436485,
"learning_rate": 0.0002,
"loss": 0.5560280084609985,
"mean_token_accuracy": 0.7751108258962631,
"num_tokens": 11721966.0,
"step": 720
},
{
"entropy": 0.5331545174121857,
"epoch": 2.6922357343311507,
"grad_norm": 0.03850049898028374,
"learning_rate": 0.0002,
"loss": 0.5384074449539185,
"mean_token_accuracy": 0.7795211225748062,
"num_tokens": 11738118.0,
"step": 721
},
{
"entropy": 0.5322619527578354,
"epoch": 2.695977549111319,
"grad_norm": 0.04224139824509621,
"learning_rate": 0.0002,
"loss": 0.5480450987815857,
"mean_token_accuracy": 0.7758100479841232,
"num_tokens": 11754350.0,
"step": 722
},
{
"entropy": 0.53462353348732,
"epoch": 2.6997193638914876,
"grad_norm": 0.03856648504734039,
"learning_rate": 0.0002,
"loss": 0.5420241355895996,
"mean_token_accuracy": 0.7794053852558136,
"num_tokens": 11770468.0,
"step": 723
},
{
"entropy": 0.5529629737138748,
"epoch": 2.703461178671656,
"grad_norm": 0.03881238028407097,
"learning_rate": 0.0002,
"loss": 0.5515606999397278,
"mean_token_accuracy": 0.777623638510704,
"num_tokens": 11786891.0,
"step": 724
},
{
"entropy": 0.5365050584077835,
"epoch": 2.707202993451824,
"grad_norm": 0.030840173363685608,
"learning_rate": 0.0002,
"loss": 0.5374981760978699,
"mean_token_accuracy": 0.7810342460870743,
"num_tokens": 11803202.0,
"step": 725
},
{
"entropy": 0.5490061491727829,
"epoch": 2.7109448082319925,
"grad_norm": 0.03318411111831665,
"learning_rate": 0.0002,
"loss": 0.5416221022605896,
"mean_token_accuracy": 0.7810187339782715,
"num_tokens": 11819633.0,
"step": 726
},
{
"entropy": 0.5287661999464035,
"epoch": 2.714686623012161,
"grad_norm": 0.033848777413368225,
"learning_rate": 0.0002,
"loss": 0.5285395383834839,
"mean_token_accuracy": 0.785768449306488,
"num_tokens": 11835951.0,
"step": 727
},
{
"entropy": 0.5228402391076088,
"epoch": 2.7184284377923293,
"grad_norm": 0.037826504558324814,
"learning_rate": 0.0002,
"loss": 0.5267374515533447,
"mean_token_accuracy": 0.7853263914585114,
"num_tokens": 11852172.0,
"step": 728
},
{
"entropy": 0.5451251715421677,
"epoch": 2.722170252572498,
"grad_norm": 0.03935185819864273,
"learning_rate": 0.0002,
"loss": 0.5431327223777771,
"mean_token_accuracy": 0.7800047546625137,
"num_tokens": 11868665.0,
"step": 729
},
{
"entropy": 0.5370529890060425,
"epoch": 2.725912067352666,
"grad_norm": 0.040121592581272125,
"learning_rate": 0.0002,
"loss": 0.5504775643348694,
"mean_token_accuracy": 0.7777304202318192,
"num_tokens": 11884782.0,
"step": 730
},
{
"entropy": 0.5336936116218567,
"epoch": 2.729653882132834,
"grad_norm": 0.046451181173324585,
"learning_rate": 0.0002,
"loss": 0.5401822328567505,
"mean_token_accuracy": 0.7810492217540741,
"num_tokens": 11900966.0,
"step": 731
},
{
"entropy": 0.5421666949987411,
"epoch": 2.7333956969130027,
"grad_norm": 0.03996991366147995,
"learning_rate": 0.0002,
"loss": 0.5425142645835876,
"mean_token_accuracy": 0.7759256362915039,
"num_tokens": 11917559.0,
"step": 732
},
{
"entropy": 0.5548020005226135,
"epoch": 2.737137511693171,
"grad_norm": 0.039705440402030945,
"learning_rate": 0.0002,
"loss": 0.5471047163009644,
"mean_token_accuracy": 0.7788440138101578,
"num_tokens": 11933791.0,
"step": 733
},
{
"entropy": 0.5459768623113632,
"epoch": 2.7408793264733395,
"grad_norm": 0.044193848967552185,
"learning_rate": 0.0002,
"loss": 0.5505638718605042,
"mean_token_accuracy": 0.7753681987524033,
"num_tokens": 11949788.0,
"step": 734
},
{
"entropy": 0.5197051167488098,
"epoch": 2.744621141253508,
"grad_norm": 0.04006953909993172,
"learning_rate": 0.0002,
"loss": 0.5269069671630859,
"mean_token_accuracy": 0.7862325310707092,
"num_tokens": 11965909.0,
"step": 735
},
{
"entropy": 0.5576485246419907,
"epoch": 2.7483629560336764,
"grad_norm": 0.03677723556756973,
"learning_rate": 0.0002,
"loss": 0.5640283823013306,
"mean_token_accuracy": 0.7697114050388336,
"num_tokens": 11982388.0,
"step": 736
},
{
"entropy": 0.5379237085580826,
"epoch": 2.752104770813845,
"grad_norm": 0.03523614630103111,
"learning_rate": 0.0002,
"loss": 0.5367957353591919,
"mean_token_accuracy": 0.7794550508260727,
"num_tokens": 11998589.0,
"step": 737
},
{
"entropy": 0.5357311069965363,
"epoch": 2.7558465855940133,
"grad_norm": 0.03599949926137924,
"learning_rate": 0.0002,
"loss": 0.5299929976463318,
"mean_token_accuracy": 0.784047082066536,
"num_tokens": 12014892.0,
"step": 738
},
{
"entropy": 0.5434677302837372,
"epoch": 2.7595884003741817,
"grad_norm": 0.03983872011303902,
"learning_rate": 0.0002,
"loss": 0.537936806678772,
"mean_token_accuracy": 0.7832438200712204,
"num_tokens": 12030925.0,
"step": 739
},
{
"entropy": 0.5472689718008041,
"epoch": 2.7633302151543497,
"grad_norm": 0.03287053480744362,
"learning_rate": 0.0002,
"loss": 0.5477735996246338,
"mean_token_accuracy": 0.7759514302015305,
"num_tokens": 12047168.0,
"step": 740
},
{
"entropy": 0.5356525778770447,
"epoch": 2.767072029934518,
"grad_norm": 0.03699969872832298,
"learning_rate": 0.0002,
"loss": 0.5401504635810852,
"mean_token_accuracy": 0.7797222137451172,
"num_tokens": 12063859.0,
"step": 741
},
{
"entropy": 0.522783175110817,
"epoch": 2.7708138447146866,
"grad_norm": 0.04751390591263771,
"learning_rate": 0.0002,
"loss": 0.5334336161613464,
"mean_token_accuracy": 0.785777673125267,
"num_tokens": 12080092.0,
"step": 742
},
{
"entropy": 0.5513002574443817,
"epoch": 2.774555659494855,
"grad_norm": 0.04812496900558472,
"learning_rate": 0.0002,
"loss": 0.5542380809783936,
"mean_token_accuracy": 0.7760861963033676,
"num_tokens": 12096314.0,
"step": 743
},
{
"entropy": 0.5436785966157913,
"epoch": 2.7782974742750235,
"grad_norm": 0.03719832003116608,
"learning_rate": 0.0002,
"loss": 0.5375255346298218,
"mean_token_accuracy": 0.7817601412534714,
"num_tokens": 12112385.0,
"step": 744
},
{
"entropy": 0.5392426550388336,
"epoch": 2.782039289055192,
"grad_norm": 0.036235589534044266,
"learning_rate": 0.0002,
"loss": 0.5315327644348145,
"mean_token_accuracy": 0.783770278096199,
"num_tokens": 12128749.0,
"step": 745
},
{
"entropy": 0.5371043086051941,
"epoch": 2.78578110383536,
"grad_norm": 0.04002665355801582,
"learning_rate": 0.0002,
"loss": 0.5355648994445801,
"mean_token_accuracy": 0.7825834453105927,
"num_tokens": 12145069.0,
"step": 746
},
{
"entropy": 0.5386099964380264,
"epoch": 2.7895229186155284,
"grad_norm": 0.0372973270714283,
"learning_rate": 0.0002,
"loss": 0.5449782609939575,
"mean_token_accuracy": 0.7772656977176666,
"num_tokens": 12161381.0,
"step": 747
},
{
"entropy": 0.49367938190698624,
"epoch": 2.793264733395697,
"grad_norm": 0.042931776493787766,
"learning_rate": 0.0002,
"loss": 0.49913763999938965,
"mean_token_accuracy": 0.795563668012619,
"num_tokens": 12177674.0,
"step": 748
},
{
"entropy": 0.5577136278152466,
"epoch": 2.7970065481758652,
"grad_norm": 0.03464139625430107,
"learning_rate": 0.0002,
"loss": 0.563284158706665,
"mean_token_accuracy": 0.7712576389312744,
"num_tokens": 12194200.0,
"step": 749
},
{
"entropy": 0.5163726359605789,
"epoch": 2.8007483629560337,
"grad_norm": 0.043806042522192,
"learning_rate": 0.0002,
"loss": 0.5230565071105957,
"mean_token_accuracy": 0.7878428548574448,
"num_tokens": 12210649.0,
"step": 750
},
{
"entropy": 0.5474874824285507,
"epoch": 2.804490177736202,
"grad_norm": 0.03748728707432747,
"learning_rate": 0.0002,
"loss": 0.5494849681854248,
"mean_token_accuracy": 0.777756467461586,
"num_tokens": 12226971.0,
"step": 751
},
{
"entropy": 0.5351517200469971,
"epoch": 2.8082319925163706,
"grad_norm": 0.045867737382650375,
"learning_rate": 0.0002,
"loss": 0.539400577545166,
"mean_token_accuracy": 0.7824986279010773,
"num_tokens": 12243263.0,
"step": 752
},
{
"entropy": 0.5563795119524002,
"epoch": 2.811973807296539,
"grad_norm": 0.03956415131688118,
"learning_rate": 0.0002,
"loss": 0.5521907210350037,
"mean_token_accuracy": 0.7774280607700348,
"num_tokens": 12259518.0,
"step": 753
},
{
"entropy": 0.56000916659832,
"epoch": 2.8157156220767074,
"grad_norm": 0.038831926882267,
"learning_rate": 0.0002,
"loss": 0.5568797588348389,
"mean_token_accuracy": 0.7727828919887543,
"num_tokens": 12276004.0,
"step": 754
},
{
"entropy": 0.5431783348321915,
"epoch": 2.8194574368568754,
"grad_norm": 0.04772892966866493,
"learning_rate": 0.0002,
"loss": 0.5474101901054382,
"mean_token_accuracy": 0.7786049693822861,
"num_tokens": 12292373.0,
"step": 755
},
{
"entropy": 0.5570650398731232,
"epoch": 2.823199251637044,
"grad_norm": 0.03613967075943947,
"learning_rate": 0.0002,
"loss": 0.5507438778877258,
"mean_token_accuracy": 0.7748661190271378,
"num_tokens": 12309010.0,
"step": 756
},
{
"entropy": 0.5275236368179321,
"epoch": 2.8269410664172123,
"grad_norm": 0.04989537596702576,
"learning_rate": 0.0002,
"loss": 0.5294247269630432,
"mean_token_accuracy": 0.7852834612131119,
"num_tokens": 12325334.0,
"step": 757
},
{
"entropy": 0.5346865504980087,
"epoch": 2.8306828811973808,
"grad_norm": 0.03763777017593384,
"learning_rate": 0.0002,
"loss": 0.536054790019989,
"mean_token_accuracy": 0.7806695699691772,
"num_tokens": 12341700.0,
"step": 758
},
{
"entropy": 0.5543745011091232,
"epoch": 2.834424695977549,
"grad_norm": 0.045101623982191086,
"learning_rate": 0.0002,
"loss": 0.5560649037361145,
"mean_token_accuracy": 0.7761011719703674,
"num_tokens": 12358184.0,
"step": 759
},
{
"entropy": 0.5500671565532684,
"epoch": 2.8381665107577176,
"grad_norm": 0.042196061462163925,
"learning_rate": 0.0002,
"loss": 0.5577619075775146,
"mean_token_accuracy": 0.7745834439992905,
"num_tokens": 12374727.0,
"step": 760
},
{
"entropy": 0.5422725081443787,
"epoch": 2.8419083255378856,
"grad_norm": 0.037925731390714645,
"learning_rate": 0.0002,
"loss": 0.5486158132553101,
"mean_token_accuracy": 0.7735314965248108,
"num_tokens": 12391054.0,
"step": 761
},
{
"entropy": 0.5447213500738144,
"epoch": 2.845650140318054,
"grad_norm": 0.039297524839639664,
"learning_rate": 0.0002,
"loss": 0.5439249277114868,
"mean_token_accuracy": 0.7782430201768875,
"num_tokens": 12407240.0,
"step": 762
},
{
"entropy": 0.5623101443052292,
"epoch": 2.8493919550982225,
"grad_norm": 0.03727223724126816,
"learning_rate": 0.0002,
"loss": 0.5529690980911255,
"mean_token_accuracy": 0.7783486098051071,
"num_tokens": 12423651.0,
"step": 763
},
{
"entropy": 0.5487337410449982,
"epoch": 2.853133769878391,
"grad_norm": 0.041605204343795776,
"learning_rate": 0.0002,
"loss": 0.5483216047286987,
"mean_token_accuracy": 0.7777005285024643,
"num_tokens": 12439865.0,
"step": 764
},
{
"entropy": 0.5403908789157867,
"epoch": 2.8568755846585594,
"grad_norm": 0.042009830474853516,
"learning_rate": 0.0002,
"loss": 0.5446419715881348,
"mean_token_accuracy": 0.7782749831676483,
"num_tokens": 12456283.0,
"step": 765
},
{
"entropy": 0.5366557389497757,
"epoch": 2.860617399438728,
"grad_norm": 0.03936697915196419,
"learning_rate": 0.0002,
"loss": 0.542513370513916,
"mean_token_accuracy": 0.7779817581176758,
"num_tokens": 12472812.0,
"step": 766
},
{
"entropy": 0.5674513280391693,
"epoch": 2.8643592142188963,
"grad_norm": 0.050604403018951416,
"learning_rate": 0.0002,
"loss": 0.5683247447013855,
"mean_token_accuracy": 0.7713179588317871,
"num_tokens": 12489449.0,
"step": 767
},
{
"entropy": 0.5182722359895706,
"epoch": 2.8681010289990647,
"grad_norm": 0.036767635494470596,
"learning_rate": 0.0002,
"loss": 0.5209700465202332,
"mean_token_accuracy": 0.7906691282987595,
"num_tokens": 12505831.0,
"step": 768
},
{
"entropy": 0.5400542318820953,
"epoch": 2.871842843779233,
"grad_norm": 0.0423893928527832,
"learning_rate": 0.0002,
"loss": 0.5363757014274597,
"mean_token_accuracy": 0.7849675416946411,
"num_tokens": 12522266.0,
"step": 769
},
{
"entropy": 0.5384216755628586,
"epoch": 2.875584658559401,
"grad_norm": 0.03423478081822395,
"learning_rate": 0.0002,
"loss": 0.539215087890625,
"mean_token_accuracy": 0.7803387194871902,
"num_tokens": 12538797.0,
"step": 770
},
{
"entropy": 0.5494250059127808,
"epoch": 2.8793264733395696,
"grad_norm": 0.03864506259560585,
"learning_rate": 0.0002,
"loss": 0.5536534786224365,
"mean_token_accuracy": 0.7749843001365662,
"num_tokens": 12554840.0,
"step": 771
},
{
"entropy": 0.5292802900075912,
"epoch": 2.883068288119738,
"grad_norm": 0.03668517246842384,
"learning_rate": 0.0002,
"loss": 0.531915009021759,
"mean_token_accuracy": 0.7857315242290497,
"num_tokens": 12571194.0,
"step": 772
},
{
"entropy": 0.5444097071886063,
"epoch": 2.8868101028999065,
"grad_norm": 0.03593030199408531,
"learning_rate": 0.0002,
"loss": 0.5466811060905457,
"mean_token_accuracy": 0.7787587195634842,
"num_tokens": 12587746.0,
"step": 773
},
{
"entropy": 0.5468859821557999,
"epoch": 2.890551917680075,
"grad_norm": 0.042690832167863846,
"learning_rate": 0.0002,
"loss": 0.5463913679122925,
"mean_token_accuracy": 0.779534175992012,
"num_tokens": 12604183.0,
"step": 774
},
{
"entropy": 0.5508814752101898,
"epoch": 2.8942937324602434,
"grad_norm": 0.04205498844385147,
"learning_rate": 0.0002,
"loss": 0.5481387376785278,
"mean_token_accuracy": 0.776447519659996,
"num_tokens": 12620732.0,
"step": 775
},
{
"entropy": 0.5370959490537643,
"epoch": 2.8980355472404113,
"grad_norm": 0.04001722112298012,
"learning_rate": 0.0002,
"loss": 0.5357980728149414,
"mean_token_accuracy": 0.7828036099672318,
"num_tokens": 12636847.0,
"step": 776
},
{
"entropy": 0.5336840003728867,
"epoch": 2.90177736202058,
"grad_norm": 0.04124586284160614,
"learning_rate": 0.0002,
"loss": 0.5350784063339233,
"mean_token_accuracy": 0.7848693281412125,
"num_tokens": 12653376.0,
"step": 777
},
{
"entropy": 0.5422462821006775,
"epoch": 2.9055191768007482,
"grad_norm": 0.04322974756360054,
"learning_rate": 0.0002,
"loss": 0.5437650680541992,
"mean_token_accuracy": 0.7811295241117477,
"num_tokens": 12669838.0,
"step": 778
},
{
"entropy": 0.5301967561244965,
"epoch": 2.9092609915809167,
"grad_norm": 0.040180791169404984,
"learning_rate": 0.0002,
"loss": 0.5413050055503845,
"mean_token_accuracy": 0.7816843837499619,
"num_tokens": 12686338.0,
"step": 779
},
{
"entropy": 0.5494007170200348,
"epoch": 2.913002806361085,
"grad_norm": 0.03727947920560837,
"learning_rate": 0.0002,
"loss": 0.551271915435791,
"mean_token_accuracy": 0.7756839543581009,
"num_tokens": 12702976.0,
"step": 780
},
{
"entropy": 0.557955801486969,
"epoch": 2.9167446211412535,
"grad_norm": 0.03641374036669731,
"learning_rate": 0.0002,
"loss": 0.5591468214988708,
"mean_token_accuracy": 0.7722364217042923,
"num_tokens": 12719319.0,
"step": 781
},
{
"entropy": 0.5437477082014084,
"epoch": 2.920486435921422,
"grad_norm": 0.03696129098534584,
"learning_rate": 0.0002,
"loss": 0.539549708366394,
"mean_token_accuracy": 0.7802012413740158,
"num_tokens": 12735691.0,
"step": 782
},
{
"entropy": 0.5459663569927216,
"epoch": 2.9242282507015904,
"grad_norm": 0.03394176810979843,
"learning_rate": 0.0002,
"loss": 0.5432969331741333,
"mean_token_accuracy": 0.7803399115800858,
"num_tokens": 12752042.0,
"step": 783
},
{
"entropy": 0.540153980255127,
"epoch": 2.927970065481759,
"grad_norm": 0.04523579031229019,
"learning_rate": 0.0002,
"loss": 0.5408099889755249,
"mean_token_accuracy": 0.7797322869300842,
"num_tokens": 12768264.0,
"step": 784
},
{
"entropy": 0.5484558641910553,
"epoch": 2.9317118802619273,
"grad_norm": 0.03857382759451866,
"learning_rate": 0.0002,
"loss": 0.554611325263977,
"mean_token_accuracy": 0.7754960358142853,
"num_tokens": 12784469.0,
"step": 785
},
{
"entropy": 0.5373403131961823,
"epoch": 2.9354536950420953,
"grad_norm": 0.04521877318620682,
"learning_rate": 0.0002,
"loss": 0.5412609577178955,
"mean_token_accuracy": 0.7812603563070297,
"num_tokens": 12800714.0,
"step": 786
},
{
"entropy": 0.5420941710472107,
"epoch": 2.9391955098222637,
"grad_norm": 0.037385161966085434,
"learning_rate": 0.0002,
"loss": 0.5446354746818542,
"mean_token_accuracy": 0.7783695161342621,
"num_tokens": 12816921.0,
"step": 787
},
{
"entropy": 0.5351656675338745,
"epoch": 2.942937324602432,
"grad_norm": 0.041876692324876785,
"learning_rate": 0.0002,
"loss": 0.5376321077346802,
"mean_token_accuracy": 0.7807199209928513,
"num_tokens": 12833350.0,
"step": 788
},
{
"entropy": 0.5680812299251556,
"epoch": 2.9466791393826006,
"grad_norm": 0.040565043687820435,
"learning_rate": 0.0002,
"loss": 0.5634538531303406,
"mean_token_accuracy": 0.7689831405878067,
"num_tokens": 12849646.0,
"step": 789
},
{
"entropy": 0.5357328206300735,
"epoch": 2.950420954162769,
"grad_norm": 0.04082103073596954,
"learning_rate": 0.0002,
"loss": 0.5352612733840942,
"mean_token_accuracy": 0.7824973464012146,
"num_tokens": 12865840.0,
"step": 790
},
{
"entropy": 0.5547877848148346,
"epoch": 2.954162768942937,
"grad_norm": 0.04521463066339493,
"learning_rate": 0.0002,
"loss": 0.5542868971824646,
"mean_token_accuracy": 0.7752365618944168,
"num_tokens": 12882266.0,
"step": 791
},
{
"entropy": 0.5343262106180191,
"epoch": 2.9579045837231055,
"grad_norm": 0.039067838340997696,
"learning_rate": 0.0002,
"loss": 0.5333149433135986,
"mean_token_accuracy": 0.783295214176178,
"num_tokens": 12898704.0,
"step": 792
},
{
"entropy": 0.5165642648935318,
"epoch": 2.961646398503274,
"grad_norm": 0.04161246493458748,
"learning_rate": 0.0002,
"loss": 0.5219287276268005,
"mean_token_accuracy": 0.790781170129776,
"num_tokens": 12914733.0,
"step": 793
},
{
"entropy": 0.5363114923238754,
"epoch": 2.9653882132834424,
"grad_norm": 0.03739769384264946,
"learning_rate": 0.0002,
"loss": 0.5376189351081848,
"mean_token_accuracy": 0.7812457233667374,
"num_tokens": 12931042.0,
"step": 794
},
{
"entropy": 0.5318800210952759,
"epoch": 2.969130028063611,
"grad_norm": 0.047191355377435684,
"learning_rate": 0.0002,
"loss": 0.5360404849052429,
"mean_token_accuracy": 0.7821078598499298,
"num_tokens": 12947442.0,
"step": 795
},
{
"entropy": 0.5284593552350998,
"epoch": 2.9728718428437793,
"grad_norm": 0.03614107519388199,
"learning_rate": 0.0002,
"loss": 0.5247491598129272,
"mean_token_accuracy": 0.7871349304914474,
"num_tokens": 12963611.0,
"step": 796
},
{
"entropy": 0.5265946090221405,
"epoch": 2.9766136576239477,
"grad_norm": 0.04248823598027229,
"learning_rate": 0.0002,
"loss": 0.53187096118927,
"mean_token_accuracy": 0.78339883685112,
"num_tokens": 12979965.0,
"step": 797
},
{
"entropy": 0.5121617913246155,
"epoch": 2.980355472404116,
"grad_norm": 0.042288120836019516,
"learning_rate": 0.0002,
"loss": 0.5201407670974731,
"mean_token_accuracy": 0.7870761901140213,
"num_tokens": 12996017.0,
"step": 798
},
{
"entropy": 0.5229809135198593,
"epoch": 2.9840972871842846,
"grad_norm": 0.040804166346788406,
"learning_rate": 0.0002,
"loss": 0.5307119488716125,
"mean_token_accuracy": 0.7831887602806091,
"num_tokens": 13012277.0,
"step": 799
},
{
"entropy": 0.5386293828487396,
"epoch": 2.987839101964453,
"grad_norm": 0.04149458184838295,
"learning_rate": 0.0002,
"loss": 0.5341092348098755,
"mean_token_accuracy": 0.783338725566864,
"num_tokens": 13028574.0,
"step": 800
},
{
"entropy": 0.5334920659661293,
"epoch": 2.991580916744621,
"grad_norm": 0.04282135143876076,
"learning_rate": 0.0002,
"loss": 0.531876802444458,
"mean_token_accuracy": 0.7834694683551788,
"num_tokens": 13044829.0,
"step": 801
},
{
"entropy": 0.5673989802598953,
"epoch": 2.9953227315247895,
"grad_norm": 0.03961246460676193,
"learning_rate": 0.0002,
"loss": 0.5678121447563171,
"mean_token_accuracy": 0.7711912542581558,
"num_tokens": 13061330.0,
"step": 802
},
{
"entropy": 0.531833752989769,
"epoch": 2.999064546304958,
"grad_norm": 0.03890501707792282,
"learning_rate": 0.0002,
"loss": 0.5328924655914307,
"mean_token_accuracy": 0.7814844250679016,
"num_tokens": 13077343.0,
"step": 803
},
{
"entropy": 0.5831514596939087,
"epoch": 3.0,
"grad_norm": 0.06591155380010605,
"learning_rate": 0.0002,
"loss": 0.5364804267883301,
"mean_token_accuracy": 0.7760791182518005,
"num_tokens": 13078463.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.218543283492356e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}