sublim-phase4-combo-03 / trainer_state.json
eac123's picture
Upload final checkpoint (checkpoint-804)
edd33b9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.138745129108429,
"epoch": 0.0037313432835820895,
"grad_norm": 1.7020611763000488,
"learning_rate": 0.0002,
"loss": 2.4721007347106934,
"mean_token_accuracy": 0.5372578650712967,
"num_tokens": 16325.0,
"step": 1
},
{
"entropy": 1.2262731790542603,
"epoch": 0.007462686567164179,
"grad_norm": 1.5422499179840088,
"learning_rate": 0.0002,
"loss": 2.1402571201324463,
"mean_token_accuracy": 0.5742411762475967,
"num_tokens": 32666.0,
"step": 2
},
{
"entropy": 1.409499078989029,
"epoch": 0.011194029850746268,
"grad_norm": 1.1927348375320435,
"learning_rate": 0.0002,
"loss": 1.7202329635620117,
"mean_token_accuracy": 0.5956366509199142,
"num_tokens": 48877.0,
"step": 3
},
{
"entropy": 1.3392578959465027,
"epoch": 0.014925373134328358,
"grad_norm": 0.9159098863601685,
"learning_rate": 0.0002,
"loss": 1.3790637254714966,
"mean_token_accuracy": 0.6494399756193161,
"num_tokens": 65097.0,
"step": 4
},
{
"entropy": 1.329741895198822,
"epoch": 0.018656716417910446,
"grad_norm": 0.9530413150787354,
"learning_rate": 0.0002,
"loss": 1.2827703952789307,
"mean_token_accuracy": 0.649653822183609,
"num_tokens": 81423.0,
"step": 5
},
{
"entropy": 1.2239453792572021,
"epoch": 0.022388059701492536,
"grad_norm": 0.45381543040275574,
"learning_rate": 0.0002,
"loss": 1.1552369594573975,
"mean_token_accuracy": 0.6654698848724365,
"num_tokens": 97674.0,
"step": 6
},
{
"entropy": 1.1408285796642303,
"epoch": 0.026119402985074626,
"grad_norm": 0.40323638916015625,
"learning_rate": 0.0002,
"loss": 1.063366413116455,
"mean_token_accuracy": 0.6731287389993668,
"num_tokens": 114207.0,
"step": 7
},
{
"entropy": 1.0467455089092255,
"epoch": 0.029850746268656716,
"grad_norm": 0.4862216114997864,
"learning_rate": 0.0002,
"loss": 0.9919917583465576,
"mean_token_accuracy": 0.6862917095422745,
"num_tokens": 130364.0,
"step": 8
},
{
"entropy": 0.9914536625146866,
"epoch": 0.033582089552238806,
"grad_norm": 0.563399612903595,
"learning_rate": 0.0002,
"loss": 0.9576236605644226,
"mean_token_accuracy": 0.6916692554950714,
"num_tokens": 146675.0,
"step": 9
},
{
"entropy": 0.9863343089818954,
"epoch": 0.03731343283582089,
"grad_norm": 0.4532151520252228,
"learning_rate": 0.0002,
"loss": 0.8861619234085083,
"mean_token_accuracy": 0.7066572606563568,
"num_tokens": 162793.0,
"step": 10
},
{
"entropy": 0.9439148157835007,
"epoch": 0.041044776119402986,
"grad_norm": 0.4917202889919281,
"learning_rate": 0.0002,
"loss": 0.8438840508460999,
"mean_token_accuracy": 0.7115702927112579,
"num_tokens": 178972.0,
"step": 11
},
{
"entropy": 0.86412213742733,
"epoch": 0.04477611940298507,
"grad_norm": 0.4633786678314209,
"learning_rate": 0.0002,
"loss": 0.8079400658607483,
"mean_token_accuracy": 0.7117275148630142,
"num_tokens": 195446.0,
"step": 12
},
{
"entropy": 0.7569762617349625,
"epoch": 0.048507462686567165,
"grad_norm": 0.4152548909187317,
"learning_rate": 0.0002,
"loss": 0.7479823231697083,
"mean_token_accuracy": 0.7288273125886917,
"num_tokens": 211604.0,
"step": 13
},
{
"entropy": 0.7370023280382156,
"epoch": 0.05223880597014925,
"grad_norm": 0.38331395387649536,
"learning_rate": 0.0002,
"loss": 0.7293781638145447,
"mean_token_accuracy": 0.7328485548496246,
"num_tokens": 228114.0,
"step": 14
},
{
"entropy": 0.6818548142910004,
"epoch": 0.055970149253731345,
"grad_norm": 0.4065186679363251,
"learning_rate": 0.0002,
"loss": 0.6948679685592651,
"mean_token_accuracy": 0.7417702227830887,
"num_tokens": 244615.0,
"step": 15
},
{
"entropy": 0.6801213175058365,
"epoch": 0.05970149253731343,
"grad_norm": 0.3765408992767334,
"learning_rate": 0.0002,
"loss": 0.6942192316055298,
"mean_token_accuracy": 0.7383946776390076,
"num_tokens": 260940.0,
"step": 16
},
{
"entropy": 0.6828830540180206,
"epoch": 0.06343283582089553,
"grad_norm": 0.31789109110832214,
"learning_rate": 0.0002,
"loss": 0.6663458347320557,
"mean_token_accuracy": 0.7480802536010742,
"num_tokens": 277198.0,
"step": 17
},
{
"entropy": 0.6609166115522385,
"epoch": 0.06716417910447761,
"grad_norm": 0.3814696669578552,
"learning_rate": 0.0002,
"loss": 0.6373794078826904,
"mean_token_accuracy": 0.7566290199756622,
"num_tokens": 293415.0,
"step": 18
},
{
"entropy": 0.6822013854980469,
"epoch": 0.0708955223880597,
"grad_norm": 0.3390759527683258,
"learning_rate": 0.0002,
"loss": 0.6543835997581482,
"mean_token_accuracy": 0.7451244294643402,
"num_tokens": 309815.0,
"step": 19
},
{
"entropy": 0.632593423128128,
"epoch": 0.07462686567164178,
"grad_norm": 0.41862595081329346,
"learning_rate": 0.0002,
"loss": 0.6299830675125122,
"mean_token_accuracy": 0.7534051537513733,
"num_tokens": 326057.0,
"step": 20
},
{
"entropy": 0.6358507871627808,
"epoch": 0.07835820895522388,
"grad_norm": 0.30084753036499023,
"learning_rate": 0.0002,
"loss": 0.62652587890625,
"mean_token_accuracy": 0.7561640441417694,
"num_tokens": 342366.0,
"step": 21
},
{
"entropy": 0.601889356970787,
"epoch": 0.08208955223880597,
"grad_norm": 0.30453744530677795,
"learning_rate": 0.0002,
"loss": 0.5936654210090637,
"mean_token_accuracy": 0.7655821740627289,
"num_tokens": 358935.0,
"step": 22
},
{
"entropy": 0.5926243662834167,
"epoch": 0.08582089552238806,
"grad_norm": 0.24678799510002136,
"learning_rate": 0.0002,
"loss": 0.5894668698310852,
"mean_token_accuracy": 0.7695567756891251,
"num_tokens": 375125.0,
"step": 23
},
{
"entropy": 0.5948957055807114,
"epoch": 0.08955223880597014,
"grad_norm": 0.26838821172714233,
"learning_rate": 0.0002,
"loss": 0.5975726246833801,
"mean_token_accuracy": 0.766963854432106,
"num_tokens": 391519.0,
"step": 24
},
{
"entropy": 0.5925572067499161,
"epoch": 0.09328358208955224,
"grad_norm": 0.24850629270076752,
"learning_rate": 0.0002,
"loss": 0.5895435214042664,
"mean_token_accuracy": 0.7683891654014587,
"num_tokens": 408003.0,
"step": 25
},
{
"entropy": 0.579643040895462,
"epoch": 0.09701492537313433,
"grad_norm": 0.24649304151535034,
"learning_rate": 0.0002,
"loss": 0.5773741006851196,
"mean_token_accuracy": 0.7704576104879379,
"num_tokens": 424170.0,
"step": 26
},
{
"entropy": 0.579850047826767,
"epoch": 0.10074626865671642,
"grad_norm": 0.24893403053283691,
"learning_rate": 0.0002,
"loss": 0.5705626010894775,
"mean_token_accuracy": 0.7733898609876633,
"num_tokens": 440584.0,
"step": 27
},
{
"entropy": 0.5937480330467224,
"epoch": 0.1044776119402985,
"grad_norm": 0.222214013338089,
"learning_rate": 0.0002,
"loss": 0.584485650062561,
"mean_token_accuracy": 0.7649911344051361,
"num_tokens": 456887.0,
"step": 28
},
{
"entropy": 0.5631287395954132,
"epoch": 0.10820895522388059,
"grad_norm": 0.26287850737571716,
"learning_rate": 0.0002,
"loss": 0.559370219707489,
"mean_token_accuracy": 0.7786488234996796,
"num_tokens": 473285.0,
"step": 29
},
{
"entropy": 0.5510498583316803,
"epoch": 0.11194029850746269,
"grad_norm": 0.2989422380924225,
"learning_rate": 0.0002,
"loss": 0.5596640110015869,
"mean_token_accuracy": 0.7761659324169159,
"num_tokens": 489394.0,
"step": 30
},
{
"entropy": 0.5780725926160812,
"epoch": 0.11567164179104478,
"grad_norm": 0.23725202679634094,
"learning_rate": 0.0002,
"loss": 0.5835093259811401,
"mean_token_accuracy": 0.7684815227985382,
"num_tokens": 505756.0,
"step": 31
},
{
"entropy": 0.5761191546916962,
"epoch": 0.11940298507462686,
"grad_norm": 0.2031526267528534,
"learning_rate": 0.0002,
"loss": 0.5835364460945129,
"mean_token_accuracy": 0.7682848125696182,
"num_tokens": 522094.0,
"step": 32
},
{
"entropy": 0.5485773086547852,
"epoch": 0.12313432835820895,
"grad_norm": 0.20444567501544952,
"learning_rate": 0.0002,
"loss": 0.5546419620513916,
"mean_token_accuracy": 0.777488186955452,
"num_tokens": 538415.0,
"step": 33
},
{
"entropy": 0.5861198753118515,
"epoch": 0.12686567164179105,
"grad_norm": 0.21942971646785736,
"learning_rate": 0.0002,
"loss": 0.5825690031051636,
"mean_token_accuracy": 0.7697215527296066,
"num_tokens": 554886.0,
"step": 34
},
{
"entropy": 0.5715848505496979,
"epoch": 0.13059701492537312,
"grad_norm": 0.20764704048633575,
"learning_rate": 0.0002,
"loss": 0.570915162563324,
"mean_token_accuracy": 0.7720184922218323,
"num_tokens": 571367.0,
"step": 35
},
{
"entropy": 0.5560943633317947,
"epoch": 0.13432835820895522,
"grad_norm": 0.20819340646266937,
"learning_rate": 0.0002,
"loss": 0.5549942851066589,
"mean_token_accuracy": 0.7778844088315964,
"num_tokens": 587594.0,
"step": 36
},
{
"entropy": 0.556964784860611,
"epoch": 0.13805970149253732,
"grad_norm": 0.17859336733818054,
"learning_rate": 0.0002,
"loss": 0.5563804507255554,
"mean_token_accuracy": 0.7767369300127029,
"num_tokens": 604052.0,
"step": 37
},
{
"entropy": 0.5532324761152267,
"epoch": 0.1417910447761194,
"grad_norm": 0.18194721639156342,
"learning_rate": 0.0002,
"loss": 0.5552038550376892,
"mean_token_accuracy": 0.7764725238084793,
"num_tokens": 620200.0,
"step": 38
},
{
"entropy": 0.5707972347736359,
"epoch": 0.1455223880597015,
"grad_norm": 0.17879748344421387,
"learning_rate": 0.0002,
"loss": 0.568923830986023,
"mean_token_accuracy": 0.7714048773050308,
"num_tokens": 636528.0,
"step": 39
},
{
"entropy": 0.5603279024362564,
"epoch": 0.14925373134328357,
"grad_norm": 0.19374136626720428,
"learning_rate": 0.0002,
"loss": 0.5574957728385925,
"mean_token_accuracy": 0.7773427516222,
"num_tokens": 652629.0,
"step": 40
},
{
"entropy": 0.5546282231807709,
"epoch": 0.15298507462686567,
"grad_norm": 0.19636894762516022,
"learning_rate": 0.0002,
"loss": 0.5532153844833374,
"mean_token_accuracy": 0.7793182134628296,
"num_tokens": 668683.0,
"step": 41
},
{
"entropy": 0.5812623649835587,
"epoch": 0.15671641791044777,
"grad_norm": 0.17162267863750458,
"learning_rate": 0.0002,
"loss": 0.5755793452262878,
"mean_token_accuracy": 0.7692758589982986,
"num_tokens": 685277.0,
"step": 42
},
{
"entropy": 0.5617634505033493,
"epoch": 0.16044776119402984,
"grad_norm": 0.16276565194129944,
"learning_rate": 0.0002,
"loss": 0.5628421306610107,
"mean_token_accuracy": 0.7769913524389267,
"num_tokens": 701728.0,
"step": 43
},
{
"entropy": 0.5570202618837357,
"epoch": 0.16417910447761194,
"grad_norm": 0.16841551661491394,
"learning_rate": 0.0002,
"loss": 0.5597431659698486,
"mean_token_accuracy": 0.7756171226501465,
"num_tokens": 718323.0,
"step": 44
},
{
"entropy": 0.5491841286420822,
"epoch": 0.16791044776119404,
"grad_norm": 0.14662496745586395,
"learning_rate": 0.0002,
"loss": 0.5556524991989136,
"mean_token_accuracy": 0.7775459736585617,
"num_tokens": 734628.0,
"step": 45
},
{
"entropy": 0.5427970439195633,
"epoch": 0.17164179104477612,
"grad_norm": 0.13948297500610352,
"learning_rate": 0.0002,
"loss": 0.5476619601249695,
"mean_token_accuracy": 0.7795768678188324,
"num_tokens": 750996.0,
"step": 46
},
{
"entropy": 0.5452166348695755,
"epoch": 0.17537313432835822,
"grad_norm": 0.17319753766059875,
"learning_rate": 0.0002,
"loss": 0.5554689168930054,
"mean_token_accuracy": 0.7776593416929245,
"num_tokens": 767284.0,
"step": 47
},
{
"entropy": 0.5613571405410767,
"epoch": 0.1791044776119403,
"grad_norm": 0.15226703882217407,
"learning_rate": 0.0002,
"loss": 0.5640038847923279,
"mean_token_accuracy": 0.7746699303388596,
"num_tokens": 783601.0,
"step": 48
},
{
"entropy": 0.5535127073526382,
"epoch": 0.1828358208955224,
"grad_norm": 0.166432186961174,
"learning_rate": 0.0002,
"loss": 0.5462499856948853,
"mean_token_accuracy": 0.7813286185264587,
"num_tokens": 799773.0,
"step": 49
},
{
"entropy": 0.5604032725095749,
"epoch": 0.1865671641791045,
"grad_norm": 0.17004649341106415,
"learning_rate": 0.0002,
"loss": 0.5530112981796265,
"mean_token_accuracy": 0.7776568233966827,
"num_tokens": 816032.0,
"step": 50
},
{
"entropy": 0.5409559532999992,
"epoch": 0.19029850746268656,
"grad_norm": 0.14887484908103943,
"learning_rate": 0.0002,
"loss": 0.5343962907791138,
"mean_token_accuracy": 0.7841377556324005,
"num_tokens": 832227.0,
"step": 51
},
{
"entropy": 0.5414481312036514,
"epoch": 0.19402985074626866,
"grad_norm": 0.20319198071956635,
"learning_rate": 0.0002,
"loss": 0.5386375188827515,
"mean_token_accuracy": 0.7845792174339294,
"num_tokens": 848643.0,
"step": 52
},
{
"entropy": 0.5497538298368454,
"epoch": 0.19776119402985073,
"grad_norm": 0.16608890891075134,
"learning_rate": 0.0002,
"loss": 0.5512281656265259,
"mean_token_accuracy": 0.7805987298488617,
"num_tokens": 865199.0,
"step": 53
},
{
"entropy": 0.545375257730484,
"epoch": 0.20149253731343283,
"grad_norm": 0.17525805532932281,
"learning_rate": 0.0002,
"loss": 0.5542587637901306,
"mean_token_accuracy": 0.7773701697587967,
"num_tokens": 881379.0,
"step": 54
},
{
"entropy": 0.5477564036846161,
"epoch": 0.20522388059701493,
"grad_norm": 0.19050806760787964,
"learning_rate": 0.0002,
"loss": 0.5655733942985535,
"mean_token_accuracy": 0.7745383828878403,
"num_tokens": 897934.0,
"step": 55
},
{
"entropy": 0.5568059235811234,
"epoch": 0.208955223880597,
"grad_norm": 0.16148774325847626,
"learning_rate": 0.0002,
"loss": 0.5592997074127197,
"mean_token_accuracy": 0.7772074788808823,
"num_tokens": 914308.0,
"step": 56
},
{
"entropy": 0.5678450167179108,
"epoch": 0.2126865671641791,
"grad_norm": 0.16515380144119263,
"learning_rate": 0.0002,
"loss": 0.569266676902771,
"mean_token_accuracy": 0.7714356333017349,
"num_tokens": 930508.0,
"step": 57
},
{
"entropy": 0.580150917172432,
"epoch": 0.21641791044776118,
"grad_norm": 0.17066031694412231,
"learning_rate": 0.0002,
"loss": 0.5749757289886475,
"mean_token_accuracy": 0.7655356675386429,
"num_tokens": 946877.0,
"step": 58
},
{
"entropy": 0.5695585310459137,
"epoch": 0.22014925373134328,
"grad_norm": 0.16599293053150177,
"learning_rate": 0.0002,
"loss": 0.5635928511619568,
"mean_token_accuracy": 0.7739954739809036,
"num_tokens": 963218.0,
"step": 59
},
{
"entropy": 0.5330293923616409,
"epoch": 0.22388059701492538,
"grad_norm": 0.14891624450683594,
"learning_rate": 0.0002,
"loss": 0.5344960689544678,
"mean_token_accuracy": 0.7841218858957291,
"num_tokens": 979460.0,
"step": 60
},
{
"entropy": 0.5383697599172592,
"epoch": 0.22761194029850745,
"grad_norm": 0.16252915561199188,
"learning_rate": 0.0002,
"loss": 0.5413715243339539,
"mean_token_accuracy": 0.7826660871505737,
"num_tokens": 995619.0,
"step": 61
},
{
"entropy": 0.5535406023263931,
"epoch": 0.23134328358208955,
"grad_norm": 0.15229789912700653,
"learning_rate": 0.0002,
"loss": 0.558712899684906,
"mean_token_accuracy": 0.7769492119550705,
"num_tokens": 1011885.0,
"step": 62
},
{
"entropy": 0.5603247284889221,
"epoch": 0.23507462686567165,
"grad_norm": 0.14967045187950134,
"learning_rate": 0.0002,
"loss": 0.5645769834518433,
"mean_token_accuracy": 0.771862581372261,
"num_tokens": 1028352.0,
"step": 63
},
{
"entropy": 0.563384547829628,
"epoch": 0.23880597014925373,
"grad_norm": 0.15884719789028168,
"learning_rate": 0.0002,
"loss": 0.5637681484222412,
"mean_token_accuracy": 0.7742781639099121,
"num_tokens": 1044550.0,
"step": 64
},
{
"entropy": 0.5692009180784225,
"epoch": 0.24253731343283583,
"grad_norm": 0.16877400875091553,
"learning_rate": 0.0002,
"loss": 0.5609120726585388,
"mean_token_accuracy": 0.7724380940198898,
"num_tokens": 1060869.0,
"step": 65
},
{
"entropy": 0.5652668327093124,
"epoch": 0.2462686567164179,
"grad_norm": 0.14263105392456055,
"learning_rate": 0.0002,
"loss": 0.5577319264411926,
"mean_token_accuracy": 0.7767308205366135,
"num_tokens": 1077318.0,
"step": 66
},
{
"entropy": 0.5624865591526031,
"epoch": 0.25,
"grad_norm": 0.1326468139886856,
"learning_rate": 0.0002,
"loss": 0.5610349774360657,
"mean_token_accuracy": 0.7767885029315948,
"num_tokens": 1093946.0,
"step": 67
},
{
"entropy": 0.5453900694847107,
"epoch": 0.2537313432835821,
"grad_norm": 0.15602754056453705,
"learning_rate": 0.0002,
"loss": 0.5474068522453308,
"mean_token_accuracy": 0.7804547101259232,
"num_tokens": 1110166.0,
"step": 68
},
{
"entropy": 0.5495888441801071,
"epoch": 0.2574626865671642,
"grad_norm": 0.16421914100646973,
"learning_rate": 0.0002,
"loss": 0.5586546063423157,
"mean_token_accuracy": 0.7761986404657364,
"num_tokens": 1126524.0,
"step": 69
},
{
"entropy": 0.5564677566289902,
"epoch": 0.26119402985074625,
"grad_norm": 0.17955079674720764,
"learning_rate": 0.0002,
"loss": 0.570371687412262,
"mean_token_accuracy": 0.7711490094661713,
"num_tokens": 1142935.0,
"step": 70
},
{
"entropy": 0.5473903864622116,
"epoch": 0.26492537313432835,
"grad_norm": 0.14180611073970795,
"learning_rate": 0.0002,
"loss": 0.549370527267456,
"mean_token_accuracy": 0.7789817303419113,
"num_tokens": 1159182.0,
"step": 71
},
{
"entropy": 0.5544993579387665,
"epoch": 0.26865671641791045,
"grad_norm": 0.1569361388683319,
"learning_rate": 0.0002,
"loss": 0.5507487058639526,
"mean_token_accuracy": 0.7766937166452408,
"num_tokens": 1175525.0,
"step": 72
},
{
"entropy": 0.5662118345499039,
"epoch": 0.27238805970149255,
"grad_norm": 0.15652883052825928,
"learning_rate": 0.0002,
"loss": 0.5632150173187256,
"mean_token_accuracy": 0.7702545374631882,
"num_tokens": 1191955.0,
"step": 73
},
{
"entropy": 0.5581929385662079,
"epoch": 0.27611940298507465,
"grad_norm": 0.1360681653022766,
"learning_rate": 0.0002,
"loss": 0.5503684878349304,
"mean_token_accuracy": 0.7764260619878769,
"num_tokens": 1208034.0,
"step": 74
},
{
"entropy": 0.5687559396028519,
"epoch": 0.2798507462686567,
"grad_norm": 0.13728748261928558,
"learning_rate": 0.0002,
"loss": 0.5678715109825134,
"mean_token_accuracy": 0.7728003114461899,
"num_tokens": 1224533.0,
"step": 75
},
{
"entropy": 0.5481379926204681,
"epoch": 0.2835820895522388,
"grad_norm": 0.16217739880084991,
"learning_rate": 0.0002,
"loss": 0.5537081956863403,
"mean_token_accuracy": 0.7751952260732651,
"num_tokens": 1240962.0,
"step": 76
},
{
"entropy": 0.5639017820358276,
"epoch": 0.2873134328358209,
"grad_norm": 0.1611357033252716,
"learning_rate": 0.0002,
"loss": 0.5741861462593079,
"mean_token_accuracy": 0.7681055814027786,
"num_tokens": 1257195.0,
"step": 77
},
{
"entropy": 0.5481198877096176,
"epoch": 0.291044776119403,
"grad_norm": 0.12783770263195038,
"learning_rate": 0.0002,
"loss": 0.5473082065582275,
"mean_token_accuracy": 0.777423769235611,
"num_tokens": 1273603.0,
"step": 78
},
{
"entropy": 0.539246067404747,
"epoch": 0.2947761194029851,
"grad_norm": 0.1314576119184494,
"learning_rate": 0.0002,
"loss": 0.5311948657035828,
"mean_token_accuracy": 0.7861492037773132,
"num_tokens": 1289837.0,
"step": 79
},
{
"entropy": 0.554696649312973,
"epoch": 0.29850746268656714,
"grad_norm": 0.1476278305053711,
"learning_rate": 0.0002,
"loss": 0.5538964867591858,
"mean_token_accuracy": 0.7750344574451447,
"num_tokens": 1306338.0,
"step": 80
},
{
"entropy": 0.5469587296247482,
"epoch": 0.30223880597014924,
"grad_norm": 0.16194719076156616,
"learning_rate": 0.0002,
"loss": 0.554472804069519,
"mean_token_accuracy": 0.7799090445041656,
"num_tokens": 1322825.0,
"step": 81
},
{
"entropy": 0.5433253645896912,
"epoch": 0.30597014925373134,
"grad_norm": 0.16987131536006927,
"learning_rate": 0.0002,
"loss": 0.5523664355278015,
"mean_token_accuracy": 0.776031419634819,
"num_tokens": 1338865.0,
"step": 82
},
{
"entropy": 0.5386127680540085,
"epoch": 0.30970149253731344,
"grad_norm": 0.14176225662231445,
"learning_rate": 0.0002,
"loss": 0.5489001870155334,
"mean_token_accuracy": 0.7799653261899948,
"num_tokens": 1355248.0,
"step": 83
},
{
"entropy": 0.5415250957012177,
"epoch": 0.31343283582089554,
"grad_norm": 0.17086099088191986,
"learning_rate": 0.0002,
"loss": 0.545318067073822,
"mean_token_accuracy": 0.7825302183628082,
"num_tokens": 1371746.0,
"step": 84
},
{
"entropy": 0.5727111548185349,
"epoch": 0.31716417910447764,
"grad_norm": 0.15196099877357483,
"learning_rate": 0.0002,
"loss": 0.5717822909355164,
"mean_token_accuracy": 0.769862562417984,
"num_tokens": 1388201.0,
"step": 85
},
{
"entropy": 0.5487467050552368,
"epoch": 0.3208955223880597,
"grad_norm": 0.12406057119369507,
"learning_rate": 0.0002,
"loss": 0.5426313877105713,
"mean_token_accuracy": 0.7817563712596893,
"num_tokens": 1404461.0,
"step": 86
},
{
"entropy": 0.5417477786540985,
"epoch": 0.3246268656716418,
"grad_norm": 0.1868571937084198,
"learning_rate": 0.0002,
"loss": 0.5441780090332031,
"mean_token_accuracy": 0.7824695259332657,
"num_tokens": 1420484.0,
"step": 87
},
{
"entropy": 0.552739754319191,
"epoch": 0.3283582089552239,
"grad_norm": 0.12260660529136658,
"learning_rate": 0.0002,
"loss": 0.5459186434745789,
"mean_token_accuracy": 0.7800513356924057,
"num_tokens": 1436981.0,
"step": 88
},
{
"entropy": 0.5539838075637817,
"epoch": 0.332089552238806,
"grad_norm": 0.19637417793273926,
"learning_rate": 0.0002,
"loss": 0.5502506494522095,
"mean_token_accuracy": 0.779677152633667,
"num_tokens": 1453360.0,
"step": 89
},
{
"entropy": 0.5393257439136505,
"epoch": 0.3358208955223881,
"grad_norm": 0.14825744926929474,
"learning_rate": 0.0002,
"loss": 0.5465800762176514,
"mean_token_accuracy": 0.7785906046628952,
"num_tokens": 1469575.0,
"step": 90
},
{
"entropy": 0.5319312065839767,
"epoch": 0.33955223880597013,
"grad_norm": 0.1817854791879654,
"learning_rate": 0.0002,
"loss": 0.5348737835884094,
"mean_token_accuracy": 0.7835152447223663,
"num_tokens": 1485763.0,
"step": 91
},
{
"entropy": 0.5510641485452652,
"epoch": 0.34328358208955223,
"grad_norm": 0.1455191969871521,
"learning_rate": 0.0002,
"loss": 0.5464341044425964,
"mean_token_accuracy": 0.7820889949798584,
"num_tokens": 1502105.0,
"step": 92
},
{
"entropy": 0.5406191498041153,
"epoch": 0.34701492537313433,
"grad_norm": 0.1273794025182724,
"learning_rate": 0.0002,
"loss": 0.5421090722084045,
"mean_token_accuracy": 0.7849924713373184,
"num_tokens": 1518477.0,
"step": 93
},
{
"entropy": 0.5232429951429367,
"epoch": 0.35074626865671643,
"grad_norm": 0.14684391021728516,
"learning_rate": 0.0002,
"loss": 0.5232130289077759,
"mean_token_accuracy": 0.7893925607204437,
"num_tokens": 1534743.0,
"step": 94
},
{
"entropy": 0.5493894517421722,
"epoch": 0.35447761194029853,
"grad_norm": 0.12976326048374176,
"learning_rate": 0.0002,
"loss": 0.5556308627128601,
"mean_token_accuracy": 0.7738792598247528,
"num_tokens": 1551015.0,
"step": 95
},
{
"entropy": 0.5568605363368988,
"epoch": 0.3582089552238806,
"grad_norm": 0.15545816719532013,
"learning_rate": 0.0002,
"loss": 0.5611149668693542,
"mean_token_accuracy": 0.7729773372411728,
"num_tokens": 1567597.0,
"step": 96
},
{
"entropy": 0.554488942027092,
"epoch": 0.3619402985074627,
"grad_norm": 0.1307706981897354,
"learning_rate": 0.0002,
"loss": 0.5501843094825745,
"mean_token_accuracy": 0.7798233777284622,
"num_tokens": 1583851.0,
"step": 97
},
{
"entropy": 0.5296479314565659,
"epoch": 0.3656716417910448,
"grad_norm": 0.1413222700357437,
"learning_rate": 0.0002,
"loss": 0.5348843932151794,
"mean_token_accuracy": 0.7847397029399872,
"num_tokens": 1599880.0,
"step": 98
},
{
"entropy": 0.5739381164312363,
"epoch": 0.3694029850746269,
"grad_norm": 0.14992888271808624,
"learning_rate": 0.0002,
"loss": 0.5711988210678101,
"mean_token_accuracy": 0.769414946436882,
"num_tokens": 1616161.0,
"step": 99
},
{
"entropy": 0.5500659346580505,
"epoch": 0.373134328358209,
"grad_norm": 0.13987883925437927,
"learning_rate": 0.0002,
"loss": 0.5535774230957031,
"mean_token_accuracy": 0.7796037644147873,
"num_tokens": 1632650.0,
"step": 100
},
{
"entropy": 0.5421769469976425,
"epoch": 0.376865671641791,
"grad_norm": 0.14819589257240295,
"learning_rate": 0.0002,
"loss": 0.5429503917694092,
"mean_token_accuracy": 0.7809022516012192,
"num_tokens": 1649147.0,
"step": 101
},
{
"entropy": 0.5444748848676682,
"epoch": 0.3805970149253731,
"grad_norm": 0.15763095021247864,
"learning_rate": 0.0002,
"loss": 0.5527257919311523,
"mean_token_accuracy": 0.7789772897958755,
"num_tokens": 1665434.0,
"step": 102
},
{
"entropy": 0.5364149361848831,
"epoch": 0.3843283582089552,
"grad_norm": 0.12937362492084503,
"learning_rate": 0.0002,
"loss": 0.5445730090141296,
"mean_token_accuracy": 0.7801977097988129,
"num_tokens": 1681628.0,
"step": 103
},
{
"entropy": 0.5520685017108917,
"epoch": 0.3880597014925373,
"grad_norm": 0.13224048912525177,
"learning_rate": 0.0002,
"loss": 0.5565529465675354,
"mean_token_accuracy": 0.7761769741773605,
"num_tokens": 1698024.0,
"step": 104
},
{
"entropy": 0.5505486279726028,
"epoch": 0.3917910447761194,
"grad_norm": 0.12523634731769562,
"learning_rate": 0.0002,
"loss": 0.5501624345779419,
"mean_token_accuracy": 0.776427686214447,
"num_tokens": 1714432.0,
"step": 105
},
{
"entropy": 0.5415863394737244,
"epoch": 0.39552238805970147,
"grad_norm": 0.12370901554822922,
"learning_rate": 0.0002,
"loss": 0.5389205813407898,
"mean_token_accuracy": 0.7835447043180466,
"num_tokens": 1730701.0,
"step": 106
},
{
"entropy": 0.535835400223732,
"epoch": 0.39925373134328357,
"grad_norm": 0.12875092029571533,
"learning_rate": 0.0002,
"loss": 0.5339052081108093,
"mean_token_accuracy": 0.7833075076341629,
"num_tokens": 1747039.0,
"step": 107
},
{
"entropy": 0.5391292423009872,
"epoch": 0.40298507462686567,
"grad_norm": 0.13361512124538422,
"learning_rate": 0.0002,
"loss": 0.5480363368988037,
"mean_token_accuracy": 0.778292641043663,
"num_tokens": 1763231.0,
"step": 108
},
{
"entropy": 0.5451123267412186,
"epoch": 0.40671641791044777,
"grad_norm": 0.12270035594701767,
"learning_rate": 0.0002,
"loss": 0.544527530670166,
"mean_token_accuracy": 0.7805600017309189,
"num_tokens": 1779643.0,
"step": 109
},
{
"entropy": 0.5353200137615204,
"epoch": 0.41044776119402987,
"grad_norm": 0.15249699354171753,
"learning_rate": 0.0002,
"loss": 0.540695309638977,
"mean_token_accuracy": 0.7809852063655853,
"num_tokens": 1795799.0,
"step": 110
},
{
"entropy": 0.5517745912075043,
"epoch": 0.4141791044776119,
"grad_norm": 0.13048961758613586,
"learning_rate": 0.0002,
"loss": 0.5428080558776855,
"mean_token_accuracy": 0.7799961864948273,
"num_tokens": 1812372.0,
"step": 111
},
{
"entropy": 0.5553679913282394,
"epoch": 0.417910447761194,
"grad_norm": 0.135862797498703,
"learning_rate": 0.0002,
"loss": 0.5515741109848022,
"mean_token_accuracy": 0.7762576192617416,
"num_tokens": 1828663.0,
"step": 112
},
{
"entropy": 0.5415378957986832,
"epoch": 0.4216417910447761,
"grad_norm": 0.17365720868110657,
"learning_rate": 0.0002,
"loss": 0.5439163446426392,
"mean_token_accuracy": 0.7816168814897537,
"num_tokens": 1845046.0,
"step": 113
},
{
"entropy": 0.5443854928016663,
"epoch": 0.4253731343283582,
"grad_norm": 0.13225306570529938,
"learning_rate": 0.0002,
"loss": 0.5523333549499512,
"mean_token_accuracy": 0.7754887640476227,
"num_tokens": 1861463.0,
"step": 114
},
{
"entropy": 0.536818191409111,
"epoch": 0.4291044776119403,
"grad_norm": 0.18661700189113617,
"learning_rate": 0.0002,
"loss": 0.5445066094398499,
"mean_token_accuracy": 0.7783756703138351,
"num_tokens": 1877488.0,
"step": 115
},
{
"entropy": 0.5401700437068939,
"epoch": 0.43283582089552236,
"grad_norm": 0.1313197761774063,
"learning_rate": 0.0002,
"loss": 0.5441405773162842,
"mean_token_accuracy": 0.779263436794281,
"num_tokens": 1893953.0,
"step": 116
},
{
"entropy": 0.5655902773141861,
"epoch": 0.43656716417910446,
"grad_norm": 0.14134129881858826,
"learning_rate": 0.0002,
"loss": 0.5561054944992065,
"mean_token_accuracy": 0.7760706096887589,
"num_tokens": 1910559.0,
"step": 117
},
{
"entropy": 0.5377545058727264,
"epoch": 0.44029850746268656,
"grad_norm": 0.1476624757051468,
"learning_rate": 0.0002,
"loss": 0.5377650260925293,
"mean_token_accuracy": 0.784254401922226,
"num_tokens": 1926798.0,
"step": 118
},
{
"entropy": 0.5710994154214859,
"epoch": 0.44402985074626866,
"grad_norm": 0.12695498764514923,
"learning_rate": 0.0002,
"loss": 0.5705847144126892,
"mean_token_accuracy": 0.7709101736545563,
"num_tokens": 1943309.0,
"step": 119
},
{
"entropy": 0.5473001599311829,
"epoch": 0.44776119402985076,
"grad_norm": 0.13190272450447083,
"learning_rate": 0.0002,
"loss": 0.5527402758598328,
"mean_token_accuracy": 0.7776251584291458,
"num_tokens": 1959914.0,
"step": 120
},
{
"entropy": 0.5332797467708588,
"epoch": 0.45149253731343286,
"grad_norm": 0.1538720279932022,
"learning_rate": 0.0002,
"loss": 0.541407585144043,
"mean_token_accuracy": 0.7805240601301193,
"num_tokens": 1976350.0,
"step": 121
},
{
"entropy": 0.5485477149486542,
"epoch": 0.4552238805970149,
"grad_norm": 0.1464855819940567,
"learning_rate": 0.0002,
"loss": 0.5562998056411743,
"mean_token_accuracy": 0.7745071202516556,
"num_tokens": 1992575.0,
"step": 122
},
{
"entropy": 0.5465153902769089,
"epoch": 0.458955223880597,
"grad_norm": 0.1392602175474167,
"learning_rate": 0.0002,
"loss": 0.5450125932693481,
"mean_token_accuracy": 0.7803204655647278,
"num_tokens": 2008818.0,
"step": 123
},
{
"entropy": 0.5216257721185684,
"epoch": 0.4626865671641791,
"grad_norm": 0.16500917077064514,
"learning_rate": 0.0002,
"loss": 0.5204989314079285,
"mean_token_accuracy": 0.7916441410779953,
"num_tokens": 2024909.0,
"step": 124
},
{
"entropy": 0.5582488030195236,
"epoch": 0.4664179104477612,
"grad_norm": 0.12797319889068604,
"learning_rate": 0.0002,
"loss": 0.5522317290306091,
"mean_token_accuracy": 0.7782706022262573,
"num_tokens": 2041274.0,
"step": 125
},
{
"entropy": 0.5451529324054718,
"epoch": 0.4701492537313433,
"grad_norm": 0.136440709233284,
"learning_rate": 0.0002,
"loss": 0.5448014736175537,
"mean_token_accuracy": 0.7787207514047623,
"num_tokens": 2057665.0,
"step": 126
},
{
"entropy": 0.5657823532819748,
"epoch": 0.47388059701492535,
"grad_norm": 0.13369601964950562,
"learning_rate": 0.0002,
"loss": 0.5634066462516785,
"mean_token_accuracy": 0.7729785293340683,
"num_tokens": 2074159.0,
"step": 127
},
{
"entropy": 0.52435402572155,
"epoch": 0.47761194029850745,
"grad_norm": 0.13124150037765503,
"learning_rate": 0.0002,
"loss": 0.5261214971542358,
"mean_token_accuracy": 0.787582278251648,
"num_tokens": 2090388.0,
"step": 128
},
{
"entropy": 0.5388573259115219,
"epoch": 0.48134328358208955,
"grad_norm": 0.1402949094772339,
"learning_rate": 0.0002,
"loss": 0.5444526672363281,
"mean_token_accuracy": 0.780138373374939,
"num_tokens": 2106895.0,
"step": 129
},
{
"entropy": 0.5594224631786346,
"epoch": 0.48507462686567165,
"grad_norm": 0.12214766442775726,
"learning_rate": 0.0002,
"loss": 0.5680845379829407,
"mean_token_accuracy": 0.7693810015916824,
"num_tokens": 2122936.0,
"step": 130
},
{
"entropy": 0.5598264634609222,
"epoch": 0.48880597014925375,
"grad_norm": 0.11836589127779007,
"learning_rate": 0.0002,
"loss": 0.5608173608779907,
"mean_token_accuracy": 0.7735486477613449,
"num_tokens": 2139356.0,
"step": 131
},
{
"entropy": 0.5484192073345184,
"epoch": 0.4925373134328358,
"grad_norm": 0.11776985228061676,
"learning_rate": 0.0002,
"loss": 0.5445444583892822,
"mean_token_accuracy": 0.7797606885433197,
"num_tokens": 2155868.0,
"step": 132
},
{
"entropy": 0.5602923631668091,
"epoch": 0.4962686567164179,
"grad_norm": 0.12020131945610046,
"learning_rate": 0.0002,
"loss": 0.5522936582565308,
"mean_token_accuracy": 0.7776170521974564,
"num_tokens": 2172336.0,
"step": 133
},
{
"entropy": 0.5583924055099487,
"epoch": 0.5,
"grad_norm": 0.1295275241136551,
"learning_rate": 0.0002,
"loss": 0.5662660002708435,
"mean_token_accuracy": 0.7716575860977173,
"num_tokens": 2188518.0,
"step": 134
},
{
"entropy": 0.5514810979366302,
"epoch": 0.503731343283582,
"grad_norm": 0.1089273989200592,
"learning_rate": 0.0002,
"loss": 0.5514034032821655,
"mean_token_accuracy": 0.7769223898649216,
"num_tokens": 2205142.0,
"step": 135
},
{
"entropy": 0.5440865606069565,
"epoch": 0.5074626865671642,
"grad_norm": 0.13056722283363342,
"learning_rate": 0.0002,
"loss": 0.5475744009017944,
"mean_token_accuracy": 0.7764044553041458,
"num_tokens": 2221743.0,
"step": 136
},
{
"entropy": 0.5476541817188263,
"epoch": 0.5111940298507462,
"grad_norm": 0.13166996836662292,
"learning_rate": 0.0002,
"loss": 0.5477900505065918,
"mean_token_accuracy": 0.7784378528594971,
"num_tokens": 2238142.0,
"step": 137
},
{
"entropy": 0.5558486729860306,
"epoch": 0.5149253731343284,
"grad_norm": 0.12133946269750595,
"learning_rate": 0.0002,
"loss": 0.5609108209609985,
"mean_token_accuracy": 0.7736046612262726,
"num_tokens": 2254456.0,
"step": 138
},
{
"entropy": 0.5566332340240479,
"epoch": 0.5186567164179104,
"grad_norm": 0.12148908525705338,
"learning_rate": 0.0002,
"loss": 0.5561110973358154,
"mean_token_accuracy": 0.7756631374359131,
"num_tokens": 2270696.0,
"step": 139
},
{
"entropy": 0.5462600067257881,
"epoch": 0.5223880597014925,
"grad_norm": 0.1129021942615509,
"learning_rate": 0.0002,
"loss": 0.5448604822158813,
"mean_token_accuracy": 0.7795793265104294,
"num_tokens": 2287025.0,
"step": 140
},
{
"entropy": 0.5399314314126968,
"epoch": 0.5261194029850746,
"grad_norm": 0.1251847892999649,
"learning_rate": 0.0002,
"loss": 0.5481414794921875,
"mean_token_accuracy": 0.778893768787384,
"num_tokens": 2303399.0,
"step": 141
},
{
"entropy": 0.5469618439674377,
"epoch": 0.5298507462686567,
"grad_norm": 0.11956755071878433,
"learning_rate": 0.0002,
"loss": 0.5474820137023926,
"mean_token_accuracy": 0.7784739285707474,
"num_tokens": 2319818.0,
"step": 142
},
{
"entropy": 0.5447351336479187,
"epoch": 0.5335820895522388,
"grad_norm": 0.14881564676761627,
"learning_rate": 0.0002,
"loss": 0.5410581827163696,
"mean_token_accuracy": 0.781320258975029,
"num_tokens": 2335949.0,
"step": 143
},
{
"entropy": 0.5449966341257095,
"epoch": 0.5373134328358209,
"grad_norm": 0.12103825062513351,
"learning_rate": 0.0002,
"loss": 0.5471005439758301,
"mean_token_accuracy": 0.7796377539634705,
"num_tokens": 2352269.0,
"step": 144
},
{
"entropy": 0.5632765144109726,
"epoch": 0.5410447761194029,
"grad_norm": 0.12277977168560028,
"learning_rate": 0.0002,
"loss": 0.5630727410316467,
"mean_token_accuracy": 0.7703763097524643,
"num_tokens": 2368674.0,
"step": 145
},
{
"entropy": 0.5339089632034302,
"epoch": 0.5447761194029851,
"grad_norm": 0.14498627185821533,
"learning_rate": 0.0002,
"loss": 0.5364416241645813,
"mean_token_accuracy": 0.7819968014955521,
"num_tokens": 2384936.0,
"step": 146
},
{
"entropy": 0.5429459661245346,
"epoch": 0.5485074626865671,
"grad_norm": 0.12051384150981903,
"learning_rate": 0.0002,
"loss": 0.5456188917160034,
"mean_token_accuracy": 0.7803860902786255,
"num_tokens": 2401292.0,
"step": 147
},
{
"entropy": 0.5626052618026733,
"epoch": 0.5522388059701493,
"grad_norm": 0.1412496566772461,
"learning_rate": 0.0002,
"loss": 0.5596410036087036,
"mean_token_accuracy": 0.7737385481595993,
"num_tokens": 2417925.0,
"step": 148
},
{
"entropy": 0.5565475225448608,
"epoch": 0.5559701492537313,
"grad_norm": 0.1441730409860611,
"learning_rate": 0.0002,
"loss": 0.5520785450935364,
"mean_token_accuracy": 0.775386318564415,
"num_tokens": 2434621.0,
"step": 149
},
{
"entropy": 0.5197634100914001,
"epoch": 0.5597014925373134,
"grad_norm": 0.12098351866006851,
"learning_rate": 0.0002,
"loss": 0.5222008228302002,
"mean_token_accuracy": 0.7903124392032623,
"num_tokens": 2450903.0,
"step": 150
},
{
"entropy": 0.5489796698093414,
"epoch": 0.5634328358208955,
"grad_norm": 0.14946326613426208,
"learning_rate": 0.0002,
"loss": 0.5559377074241638,
"mean_token_accuracy": 0.775105893611908,
"num_tokens": 2467105.0,
"step": 151
},
{
"entropy": 0.5400301665067673,
"epoch": 0.5671641791044776,
"grad_norm": 0.12906025350093842,
"learning_rate": 0.0002,
"loss": 0.5420807600021362,
"mean_token_accuracy": 0.7795381844043732,
"num_tokens": 2483456.0,
"step": 152
},
{
"entropy": 0.5474328249692917,
"epoch": 0.5708955223880597,
"grad_norm": 0.12021685391664505,
"learning_rate": 0.0002,
"loss": 0.5509780049324036,
"mean_token_accuracy": 0.7768895477056503,
"num_tokens": 2500011.0,
"step": 153
},
{
"entropy": 0.5380930155515671,
"epoch": 0.5746268656716418,
"grad_norm": 0.11843080073595047,
"learning_rate": 0.0002,
"loss": 0.5308334827423096,
"mean_token_accuracy": 0.7881843447685242,
"num_tokens": 2516780.0,
"step": 154
},
{
"entropy": 0.5460693091154099,
"epoch": 0.5783582089552238,
"grad_norm": 0.16729064285755157,
"learning_rate": 0.0002,
"loss": 0.5530881285667419,
"mean_token_accuracy": 0.7742334753274918,
"num_tokens": 2532837.0,
"step": 155
},
{
"entropy": 0.5500553995370865,
"epoch": 0.582089552238806,
"grad_norm": 0.1366872787475586,
"learning_rate": 0.0002,
"loss": 0.5533830523490906,
"mean_token_accuracy": 0.7750078588724136,
"num_tokens": 2549157.0,
"step": 156
},
{
"entropy": 0.5497538447380066,
"epoch": 0.585820895522388,
"grad_norm": 0.12214312702417374,
"learning_rate": 0.0002,
"loss": 0.5549652576446533,
"mean_token_accuracy": 0.7742869108915329,
"num_tokens": 2565745.0,
"step": 157
},
{
"entropy": 0.5520212799310684,
"epoch": 0.5895522388059702,
"grad_norm": 0.13198687136173248,
"learning_rate": 0.0002,
"loss": 0.5503985285758972,
"mean_token_accuracy": 0.7776314318180084,
"num_tokens": 2582172.0,
"step": 158
},
{
"entropy": 0.5420894026756287,
"epoch": 0.5932835820895522,
"grad_norm": 0.1303817480802536,
"learning_rate": 0.0002,
"loss": 0.545700192451477,
"mean_token_accuracy": 0.7790375500917435,
"num_tokens": 2598785.0,
"step": 159
},
{
"entropy": 0.5361281335353851,
"epoch": 0.5970149253731343,
"grad_norm": 0.13537634909152985,
"learning_rate": 0.0002,
"loss": 0.5409078598022461,
"mean_token_accuracy": 0.779214471578598,
"num_tokens": 2615324.0,
"step": 160
},
{
"entropy": 0.5633385479450226,
"epoch": 0.6007462686567164,
"grad_norm": 0.11204258352518082,
"learning_rate": 0.0002,
"loss": 0.5624291896820068,
"mean_token_accuracy": 0.7730776518583298,
"num_tokens": 2631612.0,
"step": 161
},
{
"entropy": 0.5239899605512619,
"epoch": 0.6044776119402985,
"grad_norm": 0.14660899341106415,
"learning_rate": 0.0002,
"loss": 0.5244404673576355,
"mean_token_accuracy": 0.7870436310768127,
"num_tokens": 2648098.0,
"step": 162
},
{
"entropy": 0.5414755046367645,
"epoch": 0.6082089552238806,
"grad_norm": 0.11887400597333908,
"learning_rate": 0.0002,
"loss": 0.5397330522537231,
"mean_token_accuracy": 0.7847625911235809,
"num_tokens": 2664285.0,
"step": 163
},
{
"entropy": 0.5442674309015274,
"epoch": 0.6119402985074627,
"grad_norm": 0.11572780460119247,
"learning_rate": 0.0002,
"loss": 0.5454840660095215,
"mean_token_accuracy": 0.7809286564588547,
"num_tokens": 2680551.0,
"step": 164
},
{
"entropy": 0.5371343344449997,
"epoch": 0.6156716417910447,
"grad_norm": 0.1056356355547905,
"learning_rate": 0.0002,
"loss": 0.5348964929580688,
"mean_token_accuracy": 0.7857467532157898,
"num_tokens": 2697071.0,
"step": 165
},
{
"entropy": 0.5399870425462723,
"epoch": 0.6194029850746269,
"grad_norm": 0.13278594613075256,
"learning_rate": 0.0002,
"loss": 0.5447728633880615,
"mean_token_accuracy": 0.7792245298624039,
"num_tokens": 2713461.0,
"step": 166
},
{
"entropy": 0.5350475907325745,
"epoch": 0.6231343283582089,
"grad_norm": 0.1305065155029297,
"learning_rate": 0.0002,
"loss": 0.5362796783447266,
"mean_token_accuracy": 0.7812380343675613,
"num_tokens": 2729505.0,
"step": 167
},
{
"entropy": 0.5582499951124191,
"epoch": 0.6268656716417911,
"grad_norm": 0.12587526440620422,
"learning_rate": 0.0002,
"loss": 0.5559293627738953,
"mean_token_accuracy": 0.7746618837118149,
"num_tokens": 2746287.0,
"step": 168
},
{
"entropy": 0.5586439073085785,
"epoch": 0.6305970149253731,
"grad_norm": 0.12845800817012787,
"learning_rate": 0.0002,
"loss": 0.5518544912338257,
"mean_token_accuracy": 0.7751341164112091,
"num_tokens": 2762818.0,
"step": 169
},
{
"entropy": 0.5343242138624191,
"epoch": 0.6343283582089553,
"grad_norm": 0.15256647765636444,
"learning_rate": 0.0002,
"loss": 0.5386060476303101,
"mean_token_accuracy": 0.7807702422142029,
"num_tokens": 2779199.0,
"step": 170
},
{
"entropy": 0.5373098105192184,
"epoch": 0.6380597014925373,
"grad_norm": 0.13263238966464996,
"learning_rate": 0.0002,
"loss": 0.5466636419296265,
"mean_token_accuracy": 0.7765426337718964,
"num_tokens": 2795330.0,
"step": 171
},
{
"entropy": 0.5298089534044266,
"epoch": 0.6417910447761194,
"grad_norm": 0.12450744211673737,
"learning_rate": 0.0002,
"loss": 0.5325064659118652,
"mean_token_accuracy": 0.7838508486747742,
"num_tokens": 2811566.0,
"step": 172
},
{
"entropy": 0.5550331622362137,
"epoch": 0.6455223880597015,
"grad_norm": 0.111052505671978,
"learning_rate": 0.0002,
"loss": 0.5552961230278015,
"mean_token_accuracy": 0.7752347737550735,
"num_tokens": 2827783.0,
"step": 173
},
{
"entropy": 0.5506296455860138,
"epoch": 0.6492537313432836,
"grad_norm": 0.13255524635314941,
"learning_rate": 0.0002,
"loss": 0.5490573048591614,
"mean_token_accuracy": 0.7767810970544815,
"num_tokens": 2844210.0,
"step": 174
},
{
"entropy": 0.5634674280881882,
"epoch": 0.6529850746268657,
"grad_norm": 0.11786694079637527,
"learning_rate": 0.0002,
"loss": 0.5620654225349426,
"mean_token_accuracy": 0.7710569798946381,
"num_tokens": 2860606.0,
"step": 175
},
{
"entropy": 0.5491903871297836,
"epoch": 0.6567164179104478,
"grad_norm": 0.1378813087940216,
"learning_rate": 0.0002,
"loss": 0.5544133186340332,
"mean_token_accuracy": 0.7742699533700943,
"num_tokens": 2876978.0,
"step": 176
},
{
"entropy": 0.5418348163366318,
"epoch": 0.6604477611940298,
"grad_norm": 0.1719319373369217,
"learning_rate": 0.0002,
"loss": 0.5509114265441895,
"mean_token_accuracy": 0.7738531082868576,
"num_tokens": 2893436.0,
"step": 177
},
{
"entropy": 0.5621145367622375,
"epoch": 0.664179104477612,
"grad_norm": 0.13473528623580933,
"learning_rate": 0.0002,
"loss": 0.5569881796836853,
"mean_token_accuracy": 0.7752742022275925,
"num_tokens": 2909714.0,
"step": 178
},
{
"entropy": 0.5567401647567749,
"epoch": 0.667910447761194,
"grad_norm": 0.15127326548099518,
"learning_rate": 0.0002,
"loss": 0.5531461238861084,
"mean_token_accuracy": 0.7789575010538101,
"num_tokens": 2926148.0,
"step": 179
},
{
"entropy": 0.5276759713888168,
"epoch": 0.6716417910447762,
"grad_norm": 0.1254606693983078,
"learning_rate": 0.0002,
"loss": 0.5301634669303894,
"mean_token_accuracy": 0.7837289869785309,
"num_tokens": 2942739.0,
"step": 180
},
{
"entropy": 0.5278603881597519,
"epoch": 0.6753731343283582,
"grad_norm": 0.128974050283432,
"learning_rate": 0.0002,
"loss": 0.5369632244110107,
"mean_token_accuracy": 0.7825482338666916,
"num_tokens": 2958977.0,
"step": 181
},
{
"entropy": 0.5377722084522247,
"epoch": 0.6791044776119403,
"grad_norm": 0.13316886126995087,
"learning_rate": 0.0002,
"loss": 0.5483193397521973,
"mean_token_accuracy": 0.7763564735651016,
"num_tokens": 2975274.0,
"step": 182
},
{
"entropy": 0.5507437884807587,
"epoch": 0.6828358208955224,
"grad_norm": 0.12445816397666931,
"learning_rate": 0.0002,
"loss": 0.5532326698303223,
"mean_token_accuracy": 0.7756502628326416,
"num_tokens": 2991599.0,
"step": 183
},
{
"entropy": 0.5495483875274658,
"epoch": 0.6865671641791045,
"grad_norm": 0.11616785079240799,
"learning_rate": 0.0002,
"loss": 0.5388738512992859,
"mean_token_accuracy": 0.780926913022995,
"num_tokens": 3008127.0,
"step": 184
},
{
"entropy": 0.5604113638401031,
"epoch": 0.6902985074626866,
"grad_norm": 0.10933785885572433,
"learning_rate": 0.0002,
"loss": 0.5567720532417297,
"mean_token_accuracy": 0.7762922942638397,
"num_tokens": 3024360.0,
"step": 185
},
{
"entropy": 0.5393257141113281,
"epoch": 0.6940298507462687,
"grad_norm": 0.13075008988380432,
"learning_rate": 0.0002,
"loss": 0.5377945303916931,
"mean_token_accuracy": 0.7826398611068726,
"num_tokens": 3040880.0,
"step": 186
},
{
"entropy": 0.534931406378746,
"epoch": 0.6977611940298507,
"grad_norm": 0.11783911287784576,
"learning_rate": 0.0002,
"loss": 0.5384173393249512,
"mean_token_accuracy": 0.7814484983682632,
"num_tokens": 3057215.0,
"step": 187
},
{
"entropy": 0.5480581521987915,
"epoch": 0.7014925373134329,
"grad_norm": 0.11767826229333878,
"learning_rate": 0.0002,
"loss": 0.5535053610801697,
"mean_token_accuracy": 0.7753477245569229,
"num_tokens": 3073526.0,
"step": 188
},
{
"entropy": 0.5417313128709793,
"epoch": 0.7052238805970149,
"grad_norm": 0.1221914142370224,
"learning_rate": 0.0002,
"loss": 0.5454643368721008,
"mean_token_accuracy": 0.7766887843608856,
"num_tokens": 3089677.0,
"step": 189
},
{
"entropy": 0.5625078678131104,
"epoch": 0.7089552238805971,
"grad_norm": 0.11974587291479111,
"learning_rate": 0.0002,
"loss": 0.5611926913261414,
"mean_token_accuracy": 0.7717815935611725,
"num_tokens": 3105979.0,
"step": 190
},
{
"entropy": 0.5516901463270187,
"epoch": 0.7126865671641791,
"grad_norm": 0.11311069130897522,
"learning_rate": 0.0002,
"loss": 0.5487813949584961,
"mean_token_accuracy": 0.7764030396938324,
"num_tokens": 3122320.0,
"step": 191
},
{
"entropy": 0.5541231781244278,
"epoch": 0.7164179104477612,
"grad_norm": 0.12345684319734573,
"learning_rate": 0.0002,
"loss": 0.5585082173347473,
"mean_token_accuracy": 0.774434968829155,
"num_tokens": 3138647.0,
"step": 192
},
{
"entropy": 0.5558422803878784,
"epoch": 0.7201492537313433,
"grad_norm": 0.13054387271404266,
"learning_rate": 0.0002,
"loss": 0.5540096163749695,
"mean_token_accuracy": 0.7756641954183578,
"num_tokens": 3154847.0,
"step": 193
},
{
"entropy": 0.55143603682518,
"epoch": 0.7238805970149254,
"grad_norm": 0.14231973886489868,
"learning_rate": 0.0002,
"loss": 0.5643096566200256,
"mean_token_accuracy": 0.7717767059803009,
"num_tokens": 3171336.0,
"step": 194
},
{
"entropy": 0.5277590304613113,
"epoch": 0.7276119402985075,
"grad_norm": 0.12328840047121048,
"learning_rate": 0.0002,
"loss": 0.5327441096305847,
"mean_token_accuracy": 0.7853522598743439,
"num_tokens": 3187829.0,
"step": 195
},
{
"entropy": 0.5539046078920364,
"epoch": 0.7313432835820896,
"grad_norm": 0.12686993181705475,
"learning_rate": 0.0002,
"loss": 0.5454736948013306,
"mean_token_accuracy": 0.7813247591257095,
"num_tokens": 3204100.0,
"step": 196
},
{
"entropy": 0.5553427636623383,
"epoch": 0.7350746268656716,
"grad_norm": 0.14084763824939728,
"learning_rate": 0.0002,
"loss": 0.5538918972015381,
"mean_token_accuracy": 0.7761572599411011,
"num_tokens": 3220526.0,
"step": 197
},
{
"entropy": 0.5553955286741257,
"epoch": 0.7388059701492538,
"grad_norm": 0.15137532353401184,
"learning_rate": 0.0002,
"loss": 0.5569421648979187,
"mean_token_accuracy": 0.7751066386699677,
"num_tokens": 3237005.0,
"step": 198
},
{
"entropy": 0.5306164473295212,
"epoch": 0.7425373134328358,
"grad_norm": 0.14029283821582794,
"learning_rate": 0.0002,
"loss": 0.5325392484664917,
"mean_token_accuracy": 0.7821047902107239,
"num_tokens": 3253191.0,
"step": 199
},
{
"entropy": 0.5289445072412491,
"epoch": 0.746268656716418,
"grad_norm": 0.1625203937292099,
"learning_rate": 0.0002,
"loss": 0.530889093875885,
"mean_token_accuracy": 0.7839524000883102,
"num_tokens": 3269303.0,
"step": 200
},
{
"entropy": 0.5537738502025604,
"epoch": 0.75,
"grad_norm": 0.12837141752243042,
"learning_rate": 0.0002,
"loss": 0.5496644377708435,
"mean_token_accuracy": 0.7775348573923111,
"num_tokens": 3285861.0,
"step": 201
},
{
"entropy": 0.5437710881233215,
"epoch": 0.753731343283582,
"grad_norm": 0.15969154238700867,
"learning_rate": 0.0002,
"loss": 0.5445458889007568,
"mean_token_accuracy": 0.7779001444578171,
"num_tokens": 3302531.0,
"step": 202
},
{
"entropy": 0.5435174703598022,
"epoch": 0.7574626865671642,
"grad_norm": 0.1447206735610962,
"learning_rate": 0.0002,
"loss": 0.5419492125511169,
"mean_token_accuracy": 0.782675564289093,
"num_tokens": 3318918.0,
"step": 203
},
{
"entropy": 0.5337730944156647,
"epoch": 0.7611940298507462,
"grad_norm": 0.13017146289348602,
"learning_rate": 0.0002,
"loss": 0.5400105714797974,
"mean_token_accuracy": 0.7810544222593307,
"num_tokens": 3335348.0,
"step": 204
},
{
"entropy": 0.534254178404808,
"epoch": 0.7649253731343284,
"grad_norm": 0.11939690262079239,
"learning_rate": 0.0002,
"loss": 0.5358497500419617,
"mean_token_accuracy": 0.7831085026264191,
"num_tokens": 3351607.0,
"step": 205
},
{
"entropy": 0.5295046716928482,
"epoch": 0.7686567164179104,
"grad_norm": 0.17022010684013367,
"learning_rate": 0.0002,
"loss": 0.5389232635498047,
"mean_token_accuracy": 0.7811893969774246,
"num_tokens": 3368046.0,
"step": 206
},
{
"entropy": 0.5532102882862091,
"epoch": 0.7723880597014925,
"grad_norm": 0.13207128643989563,
"learning_rate": 0.0002,
"loss": 0.556742787361145,
"mean_token_accuracy": 0.7772794514894485,
"num_tokens": 3384496.0,
"step": 207
},
{
"entropy": 0.5532752573490143,
"epoch": 0.7761194029850746,
"grad_norm": 0.16495952010154724,
"learning_rate": 0.0002,
"loss": 0.5545544624328613,
"mean_token_accuracy": 0.777538612484932,
"num_tokens": 3400918.0,
"step": 208
},
{
"entropy": 0.534032866358757,
"epoch": 0.7798507462686567,
"grad_norm": 0.1333177387714386,
"learning_rate": 0.0002,
"loss": 0.533141553401947,
"mean_token_accuracy": 0.7848780155181885,
"num_tokens": 3417300.0,
"step": 209
},
{
"entropy": 0.5450873523950577,
"epoch": 0.7835820895522388,
"grad_norm": 0.12406419962644577,
"learning_rate": 0.0002,
"loss": 0.5425257086753845,
"mean_token_accuracy": 0.781457707285881,
"num_tokens": 3433516.0,
"step": 210
},
{
"entropy": 0.5520957857370377,
"epoch": 0.7873134328358209,
"grad_norm": 0.16319960355758667,
"learning_rate": 0.0002,
"loss": 0.5528780817985535,
"mean_token_accuracy": 0.7751211673021317,
"num_tokens": 3449854.0,
"step": 211
},
{
"entropy": 0.5411545261740685,
"epoch": 0.7910447761194029,
"grad_norm": 0.11995123326778412,
"learning_rate": 0.0002,
"loss": 0.5378537178039551,
"mean_token_accuracy": 0.7797028720378876,
"num_tokens": 3466138.0,
"step": 212
},
{
"entropy": 0.5522632747888565,
"epoch": 0.7947761194029851,
"grad_norm": 0.14674413204193115,
"learning_rate": 0.0002,
"loss": 0.5561342239379883,
"mean_token_accuracy": 0.7742671966552734,
"num_tokens": 3482443.0,
"step": 213
},
{
"entropy": 0.5423247516155243,
"epoch": 0.7985074626865671,
"grad_norm": 0.1413860321044922,
"learning_rate": 0.0002,
"loss": 0.5450446605682373,
"mean_token_accuracy": 0.7770555764436722,
"num_tokens": 3498627.0,
"step": 214
},
{
"entropy": 0.5330623686313629,
"epoch": 0.8022388059701493,
"grad_norm": 0.1323142796754837,
"learning_rate": 0.0002,
"loss": 0.5411436557769775,
"mean_token_accuracy": 0.7801088243722916,
"num_tokens": 3515028.0,
"step": 215
},
{
"entropy": 0.5561616569757462,
"epoch": 0.8059701492537313,
"grad_norm": 0.14549626410007477,
"learning_rate": 0.0002,
"loss": 0.5557980537414551,
"mean_token_accuracy": 0.774229571223259,
"num_tokens": 3531502.0,
"step": 216
},
{
"entropy": 0.5611517131328583,
"epoch": 0.8097014925373134,
"grad_norm": 0.13433797657489777,
"learning_rate": 0.0002,
"loss": 0.5634274482727051,
"mean_token_accuracy": 0.7715686410665512,
"num_tokens": 3547519.0,
"step": 217
},
{
"entropy": 0.5514582842588425,
"epoch": 0.8134328358208955,
"grad_norm": 0.11890087276697159,
"learning_rate": 0.0002,
"loss": 0.5433245897293091,
"mean_token_accuracy": 0.7793933302164078,
"num_tokens": 3563773.0,
"step": 218
},
{
"entropy": 0.534797728061676,
"epoch": 0.8171641791044776,
"grad_norm": 0.1360422521829605,
"learning_rate": 0.0002,
"loss": 0.5381568670272827,
"mean_token_accuracy": 0.7809459120035172,
"num_tokens": 3580120.0,
"step": 219
},
{
"entropy": 0.5429193377494812,
"epoch": 0.8208955223880597,
"grad_norm": 0.13077932596206665,
"learning_rate": 0.0002,
"loss": 0.5535344481468201,
"mean_token_accuracy": 0.7765921354293823,
"num_tokens": 3596382.0,
"step": 220
},
{
"entropy": 0.5237333700060844,
"epoch": 0.8246268656716418,
"grad_norm": 0.1276118904352188,
"learning_rate": 0.0002,
"loss": 0.5291868448257446,
"mean_token_accuracy": 0.7849691659212112,
"num_tokens": 3612537.0,
"step": 221
},
{
"entropy": 0.5639058351516724,
"epoch": 0.8283582089552238,
"grad_norm": 0.1108359843492508,
"learning_rate": 0.0002,
"loss": 0.5600181221961975,
"mean_token_accuracy": 0.7725061029195786,
"num_tokens": 3629049.0,
"step": 222
},
{
"entropy": 0.5387094169855118,
"epoch": 0.832089552238806,
"grad_norm": 0.14372611045837402,
"learning_rate": 0.0002,
"loss": 0.5452870726585388,
"mean_token_accuracy": 0.7791440933942795,
"num_tokens": 3645497.0,
"step": 223
},
{
"entropy": 0.5521352589130402,
"epoch": 0.835820895522388,
"grad_norm": 0.1448589414358139,
"learning_rate": 0.0002,
"loss": 0.5500624775886536,
"mean_token_accuracy": 0.7766592055559158,
"num_tokens": 3661916.0,
"step": 224
},
{
"entropy": 0.5495995134115219,
"epoch": 0.8395522388059702,
"grad_norm": 0.11583460122346878,
"learning_rate": 0.0002,
"loss": 0.5486539006233215,
"mean_token_accuracy": 0.77958944439888,
"num_tokens": 3678385.0,
"step": 225
},
{
"entropy": 0.5483616590499878,
"epoch": 0.8432835820895522,
"grad_norm": 0.12950138747692108,
"learning_rate": 0.0002,
"loss": 0.550271213054657,
"mean_token_accuracy": 0.7755987495183945,
"num_tokens": 3694915.0,
"step": 226
},
{
"entropy": 0.5614336878061295,
"epoch": 0.8470149253731343,
"grad_norm": 0.1335671842098236,
"learning_rate": 0.0002,
"loss": 0.5636512041091919,
"mean_token_accuracy": 0.7719693928956985,
"num_tokens": 3710911.0,
"step": 227
},
{
"entropy": 0.5516408532857895,
"epoch": 0.8507462686567164,
"grad_norm": 0.11091525852680206,
"learning_rate": 0.0002,
"loss": 0.5478500127792358,
"mean_token_accuracy": 0.7780372649431229,
"num_tokens": 3727387.0,
"step": 228
},
{
"entropy": 0.5346055030822754,
"epoch": 0.8544776119402985,
"grad_norm": 0.1468094438314438,
"learning_rate": 0.0002,
"loss": 0.5368551015853882,
"mean_token_accuracy": 0.7816846072673798,
"num_tokens": 3743610.0,
"step": 229
},
{
"entropy": 0.5556191802024841,
"epoch": 0.8582089552238806,
"grad_norm": 0.12531019747257233,
"learning_rate": 0.0002,
"loss": 0.554017961025238,
"mean_token_accuracy": 0.775733008980751,
"num_tokens": 3759900.0,
"step": 230
},
{
"entropy": 0.5382195562124252,
"epoch": 0.8619402985074627,
"grad_norm": 0.12708726525306702,
"learning_rate": 0.0002,
"loss": 0.5370462536811829,
"mean_token_accuracy": 0.7824227660894394,
"num_tokens": 3776209.0,
"step": 231
},
{
"entropy": 0.5437551140785217,
"epoch": 0.8656716417910447,
"grad_norm": 0.14250780642032623,
"learning_rate": 0.0002,
"loss": 0.5482578277587891,
"mean_token_accuracy": 0.7775947004556656,
"num_tokens": 3792690.0,
"step": 232
},
{
"entropy": 0.5299069508910179,
"epoch": 0.8694029850746269,
"grad_norm": 0.09997344017028809,
"learning_rate": 0.0002,
"loss": 0.5321590900421143,
"mean_token_accuracy": 0.7849525660276413,
"num_tokens": 3808996.0,
"step": 233
},
{
"entropy": 0.5415566265583038,
"epoch": 0.8731343283582089,
"grad_norm": 0.14475880563259125,
"learning_rate": 0.0002,
"loss": 0.5407425165176392,
"mean_token_accuracy": 0.7812676578760147,
"num_tokens": 3825184.0,
"step": 234
},
{
"entropy": 0.5459320992231369,
"epoch": 0.8768656716417911,
"grad_norm": 0.1116221696138382,
"learning_rate": 0.0002,
"loss": 0.546471118927002,
"mean_token_accuracy": 0.779377743601799,
"num_tokens": 3841452.0,
"step": 235
},
{
"entropy": 0.5291514843702316,
"epoch": 0.8805970149253731,
"grad_norm": 0.12996730208396912,
"learning_rate": 0.0002,
"loss": 0.5327478647232056,
"mean_token_accuracy": 0.7848521023988724,
"num_tokens": 3858017.0,
"step": 236
},
{
"entropy": 0.5208889245986938,
"epoch": 0.8843283582089553,
"grad_norm": 0.16807906329631805,
"learning_rate": 0.0002,
"loss": 0.5301882028579712,
"mean_token_accuracy": 0.786228597164154,
"num_tokens": 3874064.0,
"step": 237
},
{
"entropy": 0.5617295503616333,
"epoch": 0.8880597014925373,
"grad_norm": 0.10751146823167801,
"learning_rate": 0.0002,
"loss": 0.5591222047805786,
"mean_token_accuracy": 0.7737416923046112,
"num_tokens": 3890590.0,
"step": 238
},
{
"entropy": 0.5473610609769821,
"epoch": 0.8917910447761194,
"grad_norm": 0.156968355178833,
"learning_rate": 0.0002,
"loss": 0.5408577919006348,
"mean_token_accuracy": 0.7787807583808899,
"num_tokens": 3906796.0,
"step": 239
},
{
"entropy": 0.5521116256713867,
"epoch": 0.8955223880597015,
"grad_norm": 0.1288469135761261,
"learning_rate": 0.0002,
"loss": 0.549975574016571,
"mean_token_accuracy": 0.7787336856126785,
"num_tokens": 3923243.0,
"step": 240
},
{
"entropy": 0.5367736220359802,
"epoch": 0.8992537313432836,
"grad_norm": 0.15267081558704376,
"learning_rate": 0.0002,
"loss": 0.5406203269958496,
"mean_token_accuracy": 0.7823334783315659,
"num_tokens": 3939802.0,
"step": 241
},
{
"entropy": 0.5384350121021271,
"epoch": 0.9029850746268657,
"grad_norm": 0.12661150097846985,
"learning_rate": 0.0002,
"loss": 0.5470013618469238,
"mean_token_accuracy": 0.7777878791093826,
"num_tokens": 3956169.0,
"step": 242
},
{
"entropy": 0.534332662820816,
"epoch": 0.9067164179104478,
"grad_norm": 0.1578921526670456,
"learning_rate": 0.0002,
"loss": 0.5447706580162048,
"mean_token_accuracy": 0.7791011482477188,
"num_tokens": 3972588.0,
"step": 243
},
{
"entropy": 0.5489266514778137,
"epoch": 0.9104477611940298,
"grad_norm": 0.12818928062915802,
"learning_rate": 0.0002,
"loss": 0.5481740236282349,
"mean_token_accuracy": 0.7786219567060471,
"num_tokens": 3988829.0,
"step": 244
},
{
"entropy": 0.5603043735027313,
"epoch": 0.914179104477612,
"grad_norm": 0.12620778381824493,
"learning_rate": 0.0002,
"loss": 0.5473756194114685,
"mean_token_accuracy": 0.7766416519880295,
"num_tokens": 4005147.0,
"step": 245
},
{
"entropy": 0.5429242998361588,
"epoch": 0.917910447761194,
"grad_norm": 0.12476211786270142,
"learning_rate": 0.0002,
"loss": 0.5349637269973755,
"mean_token_accuracy": 0.7825885117053986,
"num_tokens": 4021414.0,
"step": 246
},
{
"entropy": 0.5483033657073975,
"epoch": 0.9216417910447762,
"grad_norm": 0.12620662152767181,
"learning_rate": 0.0002,
"loss": 0.5528666973114014,
"mean_token_accuracy": 0.7761824727058411,
"num_tokens": 4038127.0,
"step": 247
},
{
"entropy": 0.5366939753293991,
"epoch": 0.9253731343283582,
"grad_norm": 0.14575915038585663,
"learning_rate": 0.0002,
"loss": 0.5463760495185852,
"mean_token_accuracy": 0.7789819538593292,
"num_tokens": 4054823.0,
"step": 248
},
{
"entropy": 0.5289286822080612,
"epoch": 0.9291044776119403,
"grad_norm": 0.13227254152297974,
"learning_rate": 0.0002,
"loss": 0.5342484712600708,
"mean_token_accuracy": 0.7823342829942703,
"num_tokens": 4071168.0,
"step": 249
},
{
"entropy": 0.5574782639741898,
"epoch": 0.9328358208955224,
"grad_norm": 0.11694958060979843,
"learning_rate": 0.0002,
"loss": 0.555205225944519,
"mean_token_accuracy": 0.7752824872732162,
"num_tokens": 4087486.0,
"step": 250
},
{
"entropy": 0.5487115234136581,
"epoch": 0.9365671641791045,
"grad_norm": 0.12190678715705872,
"learning_rate": 0.0002,
"loss": 0.5393535494804382,
"mean_token_accuracy": 0.7831632941961288,
"num_tokens": 4103816.0,
"step": 251
},
{
"entropy": 0.559577152132988,
"epoch": 0.9402985074626866,
"grad_norm": 0.17028383910655975,
"learning_rate": 0.0002,
"loss": 0.5525080561637878,
"mean_token_accuracy": 0.7758573293685913,
"num_tokens": 4120222.0,
"step": 252
},
{
"entropy": 0.5650424063205719,
"epoch": 0.9440298507462687,
"grad_norm": 0.11132688075304031,
"learning_rate": 0.0002,
"loss": 0.5637966394424438,
"mean_token_accuracy": 0.7707894593477249,
"num_tokens": 4136652.0,
"step": 253
},
{
"entropy": 0.5160737410187721,
"epoch": 0.9477611940298507,
"grad_norm": 0.15931887924671173,
"learning_rate": 0.0002,
"loss": 0.5282326936721802,
"mean_token_accuracy": 0.7854665815830231,
"num_tokens": 4152947.0,
"step": 254
},
{
"entropy": 0.537076398730278,
"epoch": 0.9514925373134329,
"grad_norm": 0.12814630568027496,
"learning_rate": 0.0002,
"loss": 0.5451772809028625,
"mean_token_accuracy": 0.7802058607339859,
"num_tokens": 4169503.0,
"step": 255
},
{
"entropy": 0.5342639088630676,
"epoch": 0.9552238805970149,
"grad_norm": 0.1517118364572525,
"learning_rate": 0.0002,
"loss": 0.5411078333854675,
"mean_token_accuracy": 0.7786644250154495,
"num_tokens": 4185621.0,
"step": 256
},
{
"entropy": 0.5415196269750595,
"epoch": 0.9589552238805971,
"grad_norm": 0.1379823535680771,
"learning_rate": 0.0002,
"loss": 0.5376235842704773,
"mean_token_accuracy": 0.782574325799942,
"num_tokens": 4201870.0,
"step": 257
},
{
"entropy": 0.5464203655719757,
"epoch": 0.9626865671641791,
"grad_norm": 0.11068425327539444,
"learning_rate": 0.0002,
"loss": 0.5408488512039185,
"mean_token_accuracy": 0.780770868062973,
"num_tokens": 4218151.0,
"step": 258
},
{
"entropy": 0.5458406358957291,
"epoch": 0.9664179104477612,
"grad_norm": 0.12213952839374542,
"learning_rate": 0.0002,
"loss": 0.5443609952926636,
"mean_token_accuracy": 0.7778299003839493,
"num_tokens": 4234366.0,
"step": 259
},
{
"entropy": 0.5463070273399353,
"epoch": 0.9701492537313433,
"grad_norm": 0.13273894786834717,
"learning_rate": 0.0002,
"loss": 0.5463058948516846,
"mean_token_accuracy": 0.7797796875238419,
"num_tokens": 4250736.0,
"step": 260
},
{
"entropy": 0.5530222281813622,
"epoch": 0.9738805970149254,
"grad_norm": 0.1269286721944809,
"learning_rate": 0.0002,
"loss": 0.5598427057266235,
"mean_token_accuracy": 0.7720119059085846,
"num_tokens": 4267145.0,
"step": 261
},
{
"entropy": 0.5307595282793045,
"epoch": 0.9776119402985075,
"grad_norm": 0.15041397511959076,
"learning_rate": 0.0002,
"loss": 0.5379044413566589,
"mean_token_accuracy": 0.7826298028230667,
"num_tokens": 4283482.0,
"step": 262
},
{
"entropy": 0.5570843815803528,
"epoch": 0.9813432835820896,
"grad_norm": 0.11555695533752441,
"learning_rate": 0.0002,
"loss": 0.5584969520568848,
"mean_token_accuracy": 0.7722631692886353,
"num_tokens": 4300006.0,
"step": 263
},
{
"entropy": 0.5427989065647125,
"epoch": 0.9850746268656716,
"grad_norm": 0.11381992697715759,
"learning_rate": 0.0002,
"loss": 0.5401906967163086,
"mean_token_accuracy": 0.7819131314754486,
"num_tokens": 4316285.0,
"step": 264
},
{
"entropy": 0.5244657546281815,
"epoch": 0.9888059701492538,
"grad_norm": 0.12954184412956238,
"learning_rate": 0.0002,
"loss": 0.5230352282524109,
"mean_token_accuracy": 0.7875886708498001,
"num_tokens": 4332644.0,
"step": 265
},
{
"entropy": 0.5411987751722336,
"epoch": 0.9925373134328358,
"grad_norm": 0.12008430063724518,
"learning_rate": 0.0002,
"loss": 0.5408762097358704,
"mean_token_accuracy": 0.7805971801280975,
"num_tokens": 4349014.0,
"step": 266
},
{
"entropy": 0.537212684750557,
"epoch": 0.996268656716418,
"grad_norm": 0.13956718146800995,
"learning_rate": 0.0002,
"loss": 0.5449704527854919,
"mean_token_accuracy": 0.7769150733947754,
"num_tokens": 4365397.0,
"step": 267
},
{
"entropy": 0.5412362664937973,
"epoch": 1.0,
"grad_norm": 0.11382853984832764,
"learning_rate": 0.0002,
"loss": 0.5392265319824219,
"mean_token_accuracy": 0.7833839505910873,
"num_tokens": 4381834.0,
"step": 268
},
{
"entropy": 0.5496137291193008,
"epoch": 1.0037313432835822,
"grad_norm": 0.14231012761592865,
"learning_rate": 0.0002,
"loss": 0.5489864945411682,
"mean_token_accuracy": 0.7766753733158112,
"num_tokens": 4398074.0,
"step": 269
},
{
"entropy": 0.5562388151884079,
"epoch": 1.007462686567164,
"grad_norm": 0.14497025310993195,
"learning_rate": 0.0002,
"loss": 0.5603899359703064,
"mean_token_accuracy": 0.7735977172851562,
"num_tokens": 4414424.0,
"step": 270
},
{
"entropy": 0.5293630063533783,
"epoch": 1.0111940298507462,
"grad_norm": 0.12251973897218704,
"learning_rate": 0.0002,
"loss": 0.5230416059494019,
"mean_token_accuracy": 0.7859042882919312,
"num_tokens": 4430738.0,
"step": 271
},
{
"entropy": 0.5297266095876694,
"epoch": 1.0149253731343284,
"grad_norm": 0.12865795195102692,
"learning_rate": 0.0002,
"loss": 0.5318350195884705,
"mean_token_accuracy": 0.7831861972808838,
"num_tokens": 4446854.0,
"step": 272
},
{
"entropy": 0.5223220437765121,
"epoch": 1.0186567164179103,
"grad_norm": 0.1494293063879013,
"learning_rate": 0.0002,
"loss": 0.5327814221382141,
"mean_token_accuracy": 0.7832103371620178,
"num_tokens": 4463067.0,
"step": 273
},
{
"entropy": 0.5113897025585175,
"epoch": 1.0223880597014925,
"grad_norm": 0.11985855549573898,
"learning_rate": 0.0002,
"loss": 0.5085136890411377,
"mean_token_accuracy": 0.7943005859851837,
"num_tokens": 4479208.0,
"step": 274
},
{
"entropy": 0.5331714898347855,
"epoch": 1.0261194029850746,
"grad_norm": 0.11615335196256638,
"learning_rate": 0.0002,
"loss": 0.5315767526626587,
"mean_token_accuracy": 0.7823154479265213,
"num_tokens": 4495400.0,
"step": 275
},
{
"entropy": 0.5418258756399155,
"epoch": 1.0298507462686568,
"grad_norm": 0.12503200769424438,
"learning_rate": 0.0002,
"loss": 0.5371681451797485,
"mean_token_accuracy": 0.7810330092906952,
"num_tokens": 4511712.0,
"step": 276
},
{
"entropy": 0.5291843414306641,
"epoch": 1.0335820895522387,
"grad_norm": 0.12552055716514587,
"learning_rate": 0.0002,
"loss": 0.5229098796844482,
"mean_token_accuracy": 0.7861831933259964,
"num_tokens": 4527757.0,
"step": 277
},
{
"entropy": 0.5402754694223404,
"epoch": 1.037313432835821,
"grad_norm": 0.12993621826171875,
"learning_rate": 0.0002,
"loss": 0.5389543771743774,
"mean_token_accuracy": 0.782686859369278,
"num_tokens": 4544172.0,
"step": 278
},
{
"entropy": 0.5249762684106827,
"epoch": 1.041044776119403,
"grad_norm": 0.1478368192911148,
"learning_rate": 0.0002,
"loss": 0.5288144946098328,
"mean_token_accuracy": 0.7870309799909592,
"num_tokens": 4560317.0,
"step": 279
},
{
"entropy": 0.5261744558811188,
"epoch": 1.044776119402985,
"grad_norm": 0.12392111867666245,
"learning_rate": 0.0002,
"loss": 0.5337116122245789,
"mean_token_accuracy": 0.7859398722648621,
"num_tokens": 4576552.0,
"step": 280
},
{
"entropy": 0.5196933448314667,
"epoch": 1.0485074626865671,
"grad_norm": 0.13088668882846832,
"learning_rate": 0.0002,
"loss": 0.5231020450592041,
"mean_token_accuracy": 0.7892478257417679,
"num_tokens": 4592581.0,
"step": 281
},
{
"entropy": 0.530863881111145,
"epoch": 1.0522388059701493,
"grad_norm": 0.12411776930093765,
"learning_rate": 0.0002,
"loss": 0.521477460861206,
"mean_token_accuracy": 0.7883302420377731,
"num_tokens": 4609148.0,
"step": 282
},
{
"entropy": 0.5147035792469978,
"epoch": 1.0559701492537314,
"grad_norm": 0.11664963513612747,
"learning_rate": 0.0002,
"loss": 0.5152803063392639,
"mean_token_accuracy": 0.7897714674472809,
"num_tokens": 4625339.0,
"step": 283
},
{
"entropy": 0.5241324007511139,
"epoch": 1.0597014925373134,
"grad_norm": 0.12206321954727173,
"learning_rate": 0.0002,
"loss": 0.5279011726379395,
"mean_token_accuracy": 0.7872984111309052,
"num_tokens": 4641602.0,
"step": 284
},
{
"entropy": 0.5386586785316467,
"epoch": 1.0634328358208955,
"grad_norm": 0.15844044089317322,
"learning_rate": 0.0002,
"loss": 0.5462183356285095,
"mean_token_accuracy": 0.7776554077863693,
"num_tokens": 4657935.0,
"step": 285
},
{
"entropy": 0.5212236195802689,
"epoch": 1.0671641791044777,
"grad_norm": 0.12227971851825714,
"learning_rate": 0.0002,
"loss": 0.524368941783905,
"mean_token_accuracy": 0.7889244109392166,
"num_tokens": 4674375.0,
"step": 286
},
{
"entropy": 0.5298297703266144,
"epoch": 1.0708955223880596,
"grad_norm": 0.11141645163297653,
"learning_rate": 0.0002,
"loss": 0.5300790667533875,
"mean_token_accuracy": 0.7826484590768814,
"num_tokens": 4690771.0,
"step": 287
},
{
"entropy": 0.5472451746463776,
"epoch": 1.0746268656716418,
"grad_norm": 0.12320703268051147,
"learning_rate": 0.0002,
"loss": 0.5423391461372375,
"mean_token_accuracy": 0.780271515250206,
"num_tokens": 4707429.0,
"step": 288
},
{
"entropy": 0.5120319426059723,
"epoch": 1.078358208955224,
"grad_norm": 0.12205273658037186,
"learning_rate": 0.0002,
"loss": 0.5049785375595093,
"mean_token_accuracy": 0.7964775711297989,
"num_tokens": 4723707.0,
"step": 289
},
{
"entropy": 0.5228906571865082,
"epoch": 1.0820895522388059,
"grad_norm": 0.14154046773910522,
"learning_rate": 0.0002,
"loss": 0.5207747220993042,
"mean_token_accuracy": 0.7898598164319992,
"num_tokens": 4739904.0,
"step": 290
},
{
"entropy": 0.522852934896946,
"epoch": 1.085820895522388,
"grad_norm": 0.12813158333301544,
"learning_rate": 0.0002,
"loss": 0.5229586958885193,
"mean_token_accuracy": 0.7879058122634888,
"num_tokens": 4756146.0,
"step": 291
},
{
"entropy": 0.5175448060035706,
"epoch": 1.0895522388059702,
"grad_norm": 0.1693999171257019,
"learning_rate": 0.0002,
"loss": 0.5333408713340759,
"mean_token_accuracy": 0.7839324027299881,
"num_tokens": 4772324.0,
"step": 292
},
{
"entropy": 0.5461927354335785,
"epoch": 1.0932835820895523,
"grad_norm": 0.1190054640173912,
"learning_rate": 0.0002,
"loss": 0.545452892780304,
"mean_token_accuracy": 0.7791879326105118,
"num_tokens": 4788838.0,
"step": 293
},
{
"entropy": 0.5367765128612518,
"epoch": 1.0970149253731343,
"grad_norm": 0.160573810338974,
"learning_rate": 0.0002,
"loss": 0.5323442816734314,
"mean_token_accuracy": 0.7844058275222778,
"num_tokens": 4805260.0,
"step": 294
},
{
"entropy": 0.5449754297733307,
"epoch": 1.1007462686567164,
"grad_norm": 0.13656781613826752,
"learning_rate": 0.0002,
"loss": 0.5343316793441772,
"mean_token_accuracy": 0.786631390452385,
"num_tokens": 4821651.0,
"step": 295
},
{
"entropy": 0.539639413356781,
"epoch": 1.1044776119402986,
"grad_norm": 0.15722377598285675,
"learning_rate": 0.0002,
"loss": 0.537823498249054,
"mean_token_accuracy": 0.7838342785835266,
"num_tokens": 4838086.0,
"step": 296
},
{
"entropy": 0.5071177557110786,
"epoch": 1.1082089552238805,
"grad_norm": 0.13242004811763763,
"learning_rate": 0.0002,
"loss": 0.519379198551178,
"mean_token_accuracy": 0.790022000670433,
"num_tokens": 4854421.0,
"step": 297
},
{
"entropy": 0.5327034294605255,
"epoch": 1.1119402985074627,
"grad_norm": 0.21717894077301025,
"learning_rate": 0.0002,
"loss": 0.5451952815055847,
"mean_token_accuracy": 0.7793966829776764,
"num_tokens": 4870862.0,
"step": 298
},
{
"entropy": 0.5120953842997551,
"epoch": 1.1156716417910448,
"grad_norm": 0.11570360511541367,
"learning_rate": 0.0002,
"loss": 0.5137699246406555,
"mean_token_accuracy": 0.7910549491643906,
"num_tokens": 4887047.0,
"step": 299
},
{
"entropy": 0.5416189283132553,
"epoch": 1.1194029850746268,
"grad_norm": 0.15835031867027283,
"learning_rate": 0.0002,
"loss": 0.5377160310745239,
"mean_token_accuracy": 0.7817842811346054,
"num_tokens": 4903770.0,
"step": 300
},
{
"entropy": 0.5600537657737732,
"epoch": 1.123134328358209,
"grad_norm": 0.16074593365192413,
"learning_rate": 0.0002,
"loss": 0.5558266043663025,
"mean_token_accuracy": 0.7756943106651306,
"num_tokens": 4920314.0,
"step": 301
},
{
"entropy": 0.5424332320690155,
"epoch": 1.126865671641791,
"grad_norm": 0.13547547161579132,
"learning_rate": 0.0002,
"loss": 0.5412736535072327,
"mean_token_accuracy": 0.7802875488996506,
"num_tokens": 4936795.0,
"step": 302
},
{
"entropy": 0.5479728579521179,
"epoch": 1.1305970149253732,
"grad_norm": 0.17388752102851868,
"learning_rate": 0.0002,
"loss": 0.5473156571388245,
"mean_token_accuracy": 0.7779090404510498,
"num_tokens": 4953215.0,
"step": 303
},
{
"entropy": 0.5354913771152496,
"epoch": 1.1343283582089552,
"grad_norm": 0.12070244550704956,
"learning_rate": 0.0002,
"loss": 0.5346955060958862,
"mean_token_accuracy": 0.7821491658687592,
"num_tokens": 4969473.0,
"step": 304
},
{
"entropy": 0.5357395708560944,
"epoch": 1.1380597014925373,
"grad_norm": 0.1695796698331833,
"learning_rate": 0.0002,
"loss": 0.5382478833198547,
"mean_token_accuracy": 0.7825665175914764,
"num_tokens": 4985892.0,
"step": 305
},
{
"entropy": 0.5406463518738747,
"epoch": 1.1417910447761195,
"grad_norm": 0.13278549909591675,
"learning_rate": 0.0002,
"loss": 0.5439954996109009,
"mean_token_accuracy": 0.781127467751503,
"num_tokens": 5002244.0,
"step": 306
},
{
"entropy": 0.5423679053783417,
"epoch": 1.1455223880597014,
"grad_norm": 0.1525002419948578,
"learning_rate": 0.0002,
"loss": 0.5506120324134827,
"mean_token_accuracy": 0.7751760631799698,
"num_tokens": 5018518.0,
"step": 307
},
{
"entropy": 0.5409325361251831,
"epoch": 1.1492537313432836,
"grad_norm": 0.1641884595155716,
"learning_rate": 0.0002,
"loss": 0.5398315787315369,
"mean_token_accuracy": 0.7811702787876129,
"num_tokens": 5034880.0,
"step": 308
},
{
"entropy": 0.527726948261261,
"epoch": 1.1529850746268657,
"grad_norm": 0.13098926842212677,
"learning_rate": 0.0002,
"loss": 0.5239942669868469,
"mean_token_accuracy": 0.7863958179950714,
"num_tokens": 5051492.0,
"step": 309
},
{
"entropy": 0.5603475868701935,
"epoch": 1.1567164179104479,
"grad_norm": 0.17059364914894104,
"learning_rate": 0.0002,
"loss": 0.5537184476852417,
"mean_token_accuracy": 0.7751886546611786,
"num_tokens": 5067902.0,
"step": 310
},
{
"entropy": 0.522188276052475,
"epoch": 1.1604477611940298,
"grad_norm": 0.14454245567321777,
"learning_rate": 0.0002,
"loss": 0.5286940932273865,
"mean_token_accuracy": 0.7850693166255951,
"num_tokens": 5084221.0,
"step": 311
},
{
"entropy": 0.5343948155641556,
"epoch": 1.164179104477612,
"grad_norm": 0.13227348029613495,
"learning_rate": 0.0002,
"loss": 0.5384489297866821,
"mean_token_accuracy": 0.7807275205850601,
"num_tokens": 5100663.0,
"step": 312
},
{
"entropy": 0.5275873988866806,
"epoch": 1.1679104477611941,
"grad_norm": 0.1753464788198471,
"learning_rate": 0.0002,
"loss": 0.5382294058799744,
"mean_token_accuracy": 0.7828755527734756,
"num_tokens": 5117302.0,
"step": 313
},
{
"entropy": 0.5497360378503799,
"epoch": 1.171641791044776,
"grad_norm": 0.13286371529102325,
"learning_rate": 0.0002,
"loss": 0.5496618151664734,
"mean_token_accuracy": 0.7774941623210907,
"num_tokens": 5133769.0,
"step": 314
},
{
"entropy": 0.532920241355896,
"epoch": 1.1753731343283582,
"grad_norm": 0.15036581456661224,
"learning_rate": 0.0002,
"loss": 0.5245468020439148,
"mean_token_accuracy": 0.7888032495975494,
"num_tokens": 5150119.0,
"step": 315
},
{
"entropy": 0.5440064817667007,
"epoch": 1.1791044776119404,
"grad_norm": 0.13510671257972717,
"learning_rate": 0.0002,
"loss": 0.5358728170394897,
"mean_token_accuracy": 0.7828054130077362,
"num_tokens": 5166721.0,
"step": 316
},
{
"entropy": 0.5312670171260834,
"epoch": 1.1828358208955223,
"grad_norm": 0.11371396481990814,
"learning_rate": 0.0002,
"loss": 0.5337090492248535,
"mean_token_accuracy": 0.7806256115436554,
"num_tokens": 5182960.0,
"step": 317
},
{
"entropy": 0.5359569638967514,
"epoch": 1.1865671641791045,
"grad_norm": 0.1442011594772339,
"learning_rate": 0.0002,
"loss": 0.5444678068161011,
"mean_token_accuracy": 0.7807507514953613,
"num_tokens": 5199188.0,
"step": 318
},
{
"entropy": 0.5328075140714645,
"epoch": 1.1902985074626866,
"grad_norm": 0.14832444489002228,
"learning_rate": 0.0002,
"loss": 0.5382975339889526,
"mean_token_accuracy": 0.7805762439966202,
"num_tokens": 5215650.0,
"step": 319
},
{
"entropy": 0.5216325521469116,
"epoch": 1.1940298507462686,
"grad_norm": 0.14424221217632294,
"learning_rate": 0.0002,
"loss": 0.5250576734542847,
"mean_token_accuracy": 0.7859031856060028,
"num_tokens": 5231820.0,
"step": 320
},
{
"entropy": 0.5351075977087021,
"epoch": 1.1977611940298507,
"grad_norm": 0.14221367239952087,
"learning_rate": 0.0002,
"loss": 0.5295757055282593,
"mean_token_accuracy": 0.7862369567155838,
"num_tokens": 5248279.0,
"step": 321
},
{
"entropy": 0.5397693365812302,
"epoch": 1.2014925373134329,
"grad_norm": 0.13292263448238373,
"learning_rate": 0.0002,
"loss": 0.5341707468032837,
"mean_token_accuracy": 0.7843815088272095,
"num_tokens": 5264712.0,
"step": 322
},
{
"entropy": 0.5192128270864487,
"epoch": 1.205223880597015,
"grad_norm": 0.14713309705257416,
"learning_rate": 0.0002,
"loss": 0.5247495770454407,
"mean_token_accuracy": 0.7879969924688339,
"num_tokens": 5280975.0,
"step": 323
},
{
"entropy": 0.542580246925354,
"epoch": 1.208955223880597,
"grad_norm": 0.1425526738166809,
"learning_rate": 0.0002,
"loss": 0.5457293391227722,
"mean_token_accuracy": 0.7779300808906555,
"num_tokens": 5297373.0,
"step": 324
},
{
"entropy": 0.51340202242136,
"epoch": 1.212686567164179,
"grad_norm": 0.13574931025505066,
"learning_rate": 0.0002,
"loss": 0.5158831477165222,
"mean_token_accuracy": 0.7899662852287292,
"num_tokens": 5313524.0,
"step": 325
},
{
"entropy": 0.5239507853984833,
"epoch": 1.2164179104477613,
"grad_norm": 0.1242108941078186,
"learning_rate": 0.0002,
"loss": 0.5264536142349243,
"mean_token_accuracy": 0.7876432240009308,
"num_tokens": 5330035.0,
"step": 326
},
{
"entropy": 0.5461296737194061,
"epoch": 1.2201492537313432,
"grad_norm": 0.13526761531829834,
"learning_rate": 0.0002,
"loss": 0.5456458330154419,
"mean_token_accuracy": 0.7787662595510483,
"num_tokens": 5346713.0,
"step": 327
},
{
"entropy": 0.5285127460956573,
"epoch": 1.2238805970149254,
"grad_norm": 0.1288863569498062,
"learning_rate": 0.0002,
"loss": 0.5286239385604858,
"mean_token_accuracy": 0.7839469760656357,
"num_tokens": 5362892.0,
"step": 328
},
{
"entropy": 0.5281976014375687,
"epoch": 1.2276119402985075,
"grad_norm": 0.15830843150615692,
"learning_rate": 0.0002,
"loss": 0.5338830351829529,
"mean_token_accuracy": 0.7864977121353149,
"num_tokens": 5379105.0,
"step": 329
},
{
"entropy": 0.537989154458046,
"epoch": 1.2313432835820897,
"grad_norm": 0.14264224469661713,
"learning_rate": 0.0002,
"loss": 0.5378222465515137,
"mean_token_accuracy": 0.7845461368560791,
"num_tokens": 5395557.0,
"step": 330
},
{
"entropy": 0.5446864664554596,
"epoch": 1.2350746268656716,
"grad_norm": 0.15385743975639343,
"learning_rate": 0.0002,
"loss": 0.5452708005905151,
"mean_token_accuracy": 0.7787858992815018,
"num_tokens": 5411870.0,
"step": 331
},
{
"entropy": 0.5162093490362167,
"epoch": 1.2388059701492538,
"grad_norm": 0.13330549001693726,
"learning_rate": 0.0002,
"loss": 0.5179134607315063,
"mean_token_accuracy": 0.7886767089366913,
"num_tokens": 5428174.0,
"step": 332
},
{
"entropy": 0.5166965126991272,
"epoch": 1.242537313432836,
"grad_norm": 0.13044792413711548,
"learning_rate": 0.0002,
"loss": 0.5149925947189331,
"mean_token_accuracy": 0.7877358198165894,
"num_tokens": 5444504.0,
"step": 333
},
{
"entropy": 0.5293487906455994,
"epoch": 1.2462686567164178,
"grad_norm": 0.15583756566047668,
"learning_rate": 0.0002,
"loss": 0.5320658087730408,
"mean_token_accuracy": 0.7861583828926086,
"num_tokens": 5460813.0,
"step": 334
},
{
"entropy": 0.5320923030376434,
"epoch": 1.25,
"grad_norm": 0.12959426641464233,
"learning_rate": 0.0002,
"loss": 0.5345736145973206,
"mean_token_accuracy": 0.7825423777103424,
"num_tokens": 5477333.0,
"step": 335
},
{
"entropy": 0.5326530635356903,
"epoch": 1.2537313432835822,
"grad_norm": 0.15951137244701385,
"learning_rate": 0.0002,
"loss": 0.5311124920845032,
"mean_token_accuracy": 0.7841883301734924,
"num_tokens": 5493735.0,
"step": 336
},
{
"entropy": 0.544501468539238,
"epoch": 1.2574626865671643,
"grad_norm": 0.12288819998502731,
"learning_rate": 0.0002,
"loss": 0.5451238751411438,
"mean_token_accuracy": 0.7775899171829224,
"num_tokens": 5510068.0,
"step": 337
},
{
"entropy": 0.5330418646335602,
"epoch": 1.2611940298507462,
"grad_norm": 0.13410672545433044,
"learning_rate": 0.0002,
"loss": 0.535346269607544,
"mean_token_accuracy": 0.7835884392261505,
"num_tokens": 5526452.0,
"step": 338
},
{
"entropy": 0.5434266775846481,
"epoch": 1.2649253731343284,
"grad_norm": 0.13076815009117126,
"learning_rate": 0.0002,
"loss": 0.5440234541893005,
"mean_token_accuracy": 0.7821687757968903,
"num_tokens": 5542951.0,
"step": 339
},
{
"entropy": 0.5151484906673431,
"epoch": 1.2686567164179103,
"grad_norm": 0.12828661501407623,
"learning_rate": 0.0002,
"loss": 0.5160608887672424,
"mean_token_accuracy": 0.791755273938179,
"num_tokens": 5559086.0,
"step": 340
},
{
"entropy": 0.5275644734501839,
"epoch": 1.2723880597014925,
"grad_norm": 0.13408422470092773,
"learning_rate": 0.0002,
"loss": 0.5317025184631348,
"mean_token_accuracy": 0.7861050963401794,
"num_tokens": 5575521.0,
"step": 341
},
{
"entropy": 0.5177630484104156,
"epoch": 1.2761194029850746,
"grad_norm": 0.12419670075178146,
"learning_rate": 0.0002,
"loss": 0.5191144347190857,
"mean_token_accuracy": 0.7892575412988663,
"num_tokens": 5591947.0,
"step": 342
},
{
"entropy": 0.5407169461250305,
"epoch": 1.2798507462686568,
"grad_norm": 0.1364241987466812,
"learning_rate": 0.0002,
"loss": 0.5430530309677124,
"mean_token_accuracy": 0.779339611530304,
"num_tokens": 5608447.0,
"step": 343
},
{
"entropy": 0.5262736082077026,
"epoch": 1.2835820895522387,
"grad_norm": 0.15587468445301056,
"learning_rate": 0.0002,
"loss": 0.5301055312156677,
"mean_token_accuracy": 0.7836160659790039,
"num_tokens": 5625044.0,
"step": 344
},
{
"entropy": 0.5458462238311768,
"epoch": 1.287313432835821,
"grad_norm": 0.13173708319664001,
"learning_rate": 0.0002,
"loss": 0.5517262816429138,
"mean_token_accuracy": 0.7764803022146225,
"num_tokens": 5641335.0,
"step": 345
},
{
"entropy": 0.5216450393199921,
"epoch": 1.291044776119403,
"grad_norm": 0.17484262585639954,
"learning_rate": 0.0002,
"loss": 0.5218112468719482,
"mean_token_accuracy": 0.7843209207057953,
"num_tokens": 5657347.0,
"step": 346
},
{
"entropy": 0.5498285889625549,
"epoch": 1.294776119402985,
"grad_norm": 0.12871748208999634,
"learning_rate": 0.0002,
"loss": 0.5382349491119385,
"mean_token_accuracy": 0.7812492400407791,
"num_tokens": 5673588.0,
"step": 347
},
{
"entropy": 0.5317611545324326,
"epoch": 1.2985074626865671,
"grad_norm": 0.15342608094215393,
"learning_rate": 0.0002,
"loss": 0.5276378989219666,
"mean_token_accuracy": 0.7836941033601761,
"num_tokens": 5689687.0,
"step": 348
},
{
"entropy": 0.5218729674816132,
"epoch": 1.3022388059701493,
"grad_norm": 0.1535658985376358,
"learning_rate": 0.0002,
"loss": 0.5265159606933594,
"mean_token_accuracy": 0.7863410115242004,
"num_tokens": 5705883.0,
"step": 349
},
{
"entropy": 0.5283405184745789,
"epoch": 1.3059701492537314,
"grad_norm": 0.1400662213563919,
"learning_rate": 0.0002,
"loss": 0.5348565578460693,
"mean_token_accuracy": 0.7835897505283356,
"num_tokens": 5722396.0,
"step": 350
},
{
"entropy": 0.5465448051691055,
"epoch": 1.3097014925373134,
"grad_norm": 0.1789598912000656,
"learning_rate": 0.0002,
"loss": 0.5508973002433777,
"mean_token_accuracy": 0.7770535051822662,
"num_tokens": 5738946.0,
"step": 351
},
{
"entropy": 0.5288202613592148,
"epoch": 1.3134328358208955,
"grad_norm": 0.12526051700115204,
"learning_rate": 0.0002,
"loss": 0.5298986434936523,
"mean_token_accuracy": 0.7855530083179474,
"num_tokens": 5755207.0,
"step": 352
},
{
"entropy": 0.5429712533950806,
"epoch": 1.3171641791044777,
"grad_norm": 0.12195583432912827,
"learning_rate": 0.0002,
"loss": 0.5387951731681824,
"mean_token_accuracy": 0.7802612334489822,
"num_tokens": 5771582.0,
"step": 353
},
{
"entropy": 0.5358787178993225,
"epoch": 1.3208955223880596,
"grad_norm": 0.15126559138298035,
"learning_rate": 0.0002,
"loss": 0.5349993705749512,
"mean_token_accuracy": 0.7822433114051819,
"num_tokens": 5787967.0,
"step": 354
},
{
"entropy": 0.5424338132143021,
"epoch": 1.3246268656716418,
"grad_norm": 0.1308310180902481,
"learning_rate": 0.0002,
"loss": 0.5434916615486145,
"mean_token_accuracy": 0.7826928794384003,
"num_tokens": 5804528.0,
"step": 355
},
{
"entropy": 0.5337295234203339,
"epoch": 1.328358208955224,
"grad_norm": 0.16843028366565704,
"learning_rate": 0.0002,
"loss": 0.5465773344039917,
"mean_token_accuracy": 0.777764692902565,
"num_tokens": 5820684.0,
"step": 356
},
{
"entropy": 0.504702128469944,
"epoch": 1.332089552238806,
"grad_norm": 0.1529076248407364,
"learning_rate": 0.0002,
"loss": 0.5113453269004822,
"mean_token_accuracy": 0.791937530040741,
"num_tokens": 5836988.0,
"step": 357
},
{
"entropy": 0.536053940653801,
"epoch": 1.335820895522388,
"grad_norm": 0.1379069983959198,
"learning_rate": 0.0002,
"loss": 0.5389484763145447,
"mean_token_accuracy": 0.7813952714204788,
"num_tokens": 5853542.0,
"step": 358
},
{
"entropy": 0.5438119322061539,
"epoch": 1.3395522388059702,
"grad_norm": 0.12008243054151535,
"learning_rate": 0.0002,
"loss": 0.5360631346702576,
"mean_token_accuracy": 0.7817373275756836,
"num_tokens": 5870213.0,
"step": 359
},
{
"entropy": 0.550885871052742,
"epoch": 1.3432835820895521,
"grad_norm": 0.13378706574440002,
"learning_rate": 0.0002,
"loss": 0.54970383644104,
"mean_token_accuracy": 0.7768265455961227,
"num_tokens": 5886513.0,
"step": 360
},
{
"entropy": 0.5400225073099136,
"epoch": 1.3470149253731343,
"grad_norm": 0.13530388474464417,
"learning_rate": 0.0002,
"loss": 0.5343542098999023,
"mean_token_accuracy": 0.782709077000618,
"num_tokens": 5903049.0,
"step": 361
},
{
"entropy": 0.5389147847890854,
"epoch": 1.3507462686567164,
"grad_norm": 0.12446677684783936,
"learning_rate": 0.0002,
"loss": 0.5388710498809814,
"mean_token_accuracy": 0.781377524137497,
"num_tokens": 5919403.0,
"step": 362
},
{
"entropy": 0.537296935915947,
"epoch": 1.3544776119402986,
"grad_norm": 0.13781245052814484,
"learning_rate": 0.0002,
"loss": 0.5438515543937683,
"mean_token_accuracy": 0.7785618007183075,
"num_tokens": 5935511.0,
"step": 363
},
{
"entropy": 0.5429168194532394,
"epoch": 1.3582089552238805,
"grad_norm": 0.13629309833049774,
"learning_rate": 0.0002,
"loss": 0.5453547239303589,
"mean_token_accuracy": 0.7784431874752045,
"num_tokens": 5951972.0,
"step": 364
},
{
"entropy": 0.5427183359861374,
"epoch": 1.3619402985074627,
"grad_norm": 0.1370571255683899,
"learning_rate": 0.0002,
"loss": 0.545956552028656,
"mean_token_accuracy": 0.7787607759237289,
"num_tokens": 5968229.0,
"step": 365
},
{
"entropy": 0.5378859043121338,
"epoch": 1.3656716417910448,
"grad_norm": 0.12471959739923477,
"learning_rate": 0.0002,
"loss": 0.5353823900222778,
"mean_token_accuracy": 0.7809005975723267,
"num_tokens": 5984669.0,
"step": 366
},
{
"entropy": 0.5365873426198959,
"epoch": 1.3694029850746268,
"grad_norm": 0.16501657664775848,
"learning_rate": 0.0002,
"loss": 0.5319327712059021,
"mean_token_accuracy": 0.7824555039405823,
"num_tokens": 6001027.0,
"step": 367
},
{
"entropy": 0.5265276953577995,
"epoch": 1.373134328358209,
"grad_norm": 0.12363235652446747,
"learning_rate": 0.0002,
"loss": 0.5210375785827637,
"mean_token_accuracy": 0.7883688807487488,
"num_tokens": 6017125.0,
"step": 368
},
{
"entropy": 0.5277390778064728,
"epoch": 1.376865671641791,
"grad_norm": 0.1423310935497284,
"learning_rate": 0.0002,
"loss": 0.5316471457481384,
"mean_token_accuracy": 0.7828662693500519,
"num_tokens": 6033508.0,
"step": 369
},
{
"entropy": 0.5263610854744911,
"epoch": 1.3805970149253732,
"grad_norm": 0.1381843090057373,
"learning_rate": 0.0002,
"loss": 0.5311442613601685,
"mean_token_accuracy": 0.7821517586708069,
"num_tokens": 6049886.0,
"step": 370
},
{
"entropy": 0.5286078453063965,
"epoch": 1.3843283582089552,
"grad_norm": 0.18003322184085846,
"learning_rate": 0.0002,
"loss": 0.5398144721984863,
"mean_token_accuracy": 0.7803981304168701,
"num_tokens": 6066120.0,
"step": 371
},
{
"entropy": 0.5356258824467659,
"epoch": 1.3880597014925373,
"grad_norm": 0.11802922934293747,
"learning_rate": 0.0002,
"loss": 0.53504878282547,
"mean_token_accuracy": 0.7814585119485855,
"num_tokens": 6082732.0,
"step": 372
},
{
"entropy": 0.5387788712978363,
"epoch": 1.3917910447761195,
"grad_norm": 0.13874171674251556,
"learning_rate": 0.0002,
"loss": 0.5358333587646484,
"mean_token_accuracy": 0.7825580388307571,
"num_tokens": 6099018.0,
"step": 373
},
{
"entropy": 0.5342960059642792,
"epoch": 1.3955223880597014,
"grad_norm": 0.1402461677789688,
"learning_rate": 0.0002,
"loss": 0.5348989963531494,
"mean_token_accuracy": 0.7847650349140167,
"num_tokens": 6115279.0,
"step": 374
},
{
"entropy": 0.5361053943634033,
"epoch": 1.3992537313432836,
"grad_norm": 0.11853493005037308,
"learning_rate": 0.0002,
"loss": 0.5328879356384277,
"mean_token_accuracy": 0.7853472977876663,
"num_tokens": 6131854.0,
"step": 375
},
{
"entropy": 0.5300562530755997,
"epoch": 1.4029850746268657,
"grad_norm": 0.1642550826072693,
"learning_rate": 0.0002,
"loss": 0.5330582857131958,
"mean_token_accuracy": 0.7824369519948959,
"num_tokens": 6148329.0,
"step": 376
},
{
"entropy": 0.5351111143827438,
"epoch": 1.4067164179104479,
"grad_norm": 0.13296250998973846,
"learning_rate": 0.0002,
"loss": 0.5308345556259155,
"mean_token_accuracy": 0.7840287983417511,
"num_tokens": 6164520.0,
"step": 377
},
{
"entropy": 0.549595445394516,
"epoch": 1.4104477611940298,
"grad_norm": 0.11937810480594635,
"learning_rate": 0.0002,
"loss": 0.5439208745956421,
"mean_token_accuracy": 0.7801520526409149,
"num_tokens": 6180840.0,
"step": 378
},
{
"entropy": 0.5249980017542839,
"epoch": 1.414179104477612,
"grad_norm": 0.14947783946990967,
"learning_rate": 0.0002,
"loss": 0.5214130878448486,
"mean_token_accuracy": 0.7883247882127762,
"num_tokens": 6197072.0,
"step": 379
},
{
"entropy": 0.5341014862060547,
"epoch": 1.417910447761194,
"grad_norm": 0.14708726108074188,
"learning_rate": 0.0002,
"loss": 0.5437160730361938,
"mean_token_accuracy": 0.7790101766586304,
"num_tokens": 6213410.0,
"step": 380
},
{
"entropy": 0.5305748581886292,
"epoch": 1.421641791044776,
"grad_norm": 0.15660500526428223,
"learning_rate": 0.0002,
"loss": 0.538860559463501,
"mean_token_accuracy": 0.7808915078639984,
"num_tokens": 6229812.0,
"step": 381
},
{
"entropy": 0.5335244983434677,
"epoch": 1.4253731343283582,
"grad_norm": 0.14013393223285675,
"learning_rate": 0.0002,
"loss": 0.5405108332633972,
"mean_token_accuracy": 0.7806441932916641,
"num_tokens": 6246122.0,
"step": 382
},
{
"entropy": 0.5370550155639648,
"epoch": 1.4291044776119404,
"grad_norm": 0.15498457849025726,
"learning_rate": 0.0002,
"loss": 0.5275038480758667,
"mean_token_accuracy": 0.7845180481672287,
"num_tokens": 6262400.0,
"step": 383
},
{
"entropy": 0.5388240739703178,
"epoch": 1.4328358208955223,
"grad_norm": 0.13547126948833466,
"learning_rate": 0.0002,
"loss": 0.5339113473892212,
"mean_token_accuracy": 0.7817906439304352,
"num_tokens": 6278433.0,
"step": 384
},
{
"entropy": 0.5327373743057251,
"epoch": 1.4365671641791045,
"grad_norm": 0.15488973259925842,
"learning_rate": 0.0002,
"loss": 0.536837637424469,
"mean_token_accuracy": 0.7805320471525192,
"num_tokens": 6294780.0,
"step": 385
},
{
"entropy": 0.5164054483175278,
"epoch": 1.4402985074626866,
"grad_norm": 0.13659167289733887,
"learning_rate": 0.0002,
"loss": 0.5196657180786133,
"mean_token_accuracy": 0.7893420159816742,
"num_tokens": 6310926.0,
"step": 386
},
{
"entropy": 0.5441898256540298,
"epoch": 1.4440298507462686,
"grad_norm": 0.30239349603652954,
"learning_rate": 0.0002,
"loss": 0.5498929023742676,
"mean_token_accuracy": 0.7768156677484512,
"num_tokens": 6327465.0,
"step": 387
},
{
"entropy": 0.5278986096382141,
"epoch": 1.4477611940298507,
"grad_norm": 0.16996067762374878,
"learning_rate": 0.0002,
"loss": 0.5285515785217285,
"mean_token_accuracy": 0.786761000752449,
"num_tokens": 6343503.0,
"step": 388
},
{
"entropy": 0.508112832903862,
"epoch": 1.4514925373134329,
"grad_norm": 0.14852264523506165,
"learning_rate": 0.0002,
"loss": 0.5129667520523071,
"mean_token_accuracy": 0.7919276505708694,
"num_tokens": 6359667.0,
"step": 389
},
{
"entropy": 0.5249242335557938,
"epoch": 1.455223880597015,
"grad_norm": 0.17182905972003937,
"learning_rate": 0.0002,
"loss": 0.5207914113998413,
"mean_token_accuracy": 0.7878070920705795,
"num_tokens": 6376114.0,
"step": 390
},
{
"entropy": 0.5415022522211075,
"epoch": 1.458955223880597,
"grad_norm": 0.14497698843479156,
"learning_rate": 0.0002,
"loss": 0.5450653433799744,
"mean_token_accuracy": 0.7796677798032761,
"num_tokens": 6392417.0,
"step": 391
},
{
"entropy": 0.5454135686159134,
"epoch": 1.462686567164179,
"grad_norm": 0.14885719120502472,
"learning_rate": 0.0002,
"loss": 0.5476389527320862,
"mean_token_accuracy": 0.7781424224376678,
"num_tokens": 6408701.0,
"step": 392
},
{
"entropy": 0.5305422842502594,
"epoch": 1.4664179104477613,
"grad_norm": 0.13111279904842377,
"learning_rate": 0.0002,
"loss": 0.5283982753753662,
"mean_token_accuracy": 0.786282405257225,
"num_tokens": 6425186.0,
"step": 393
},
{
"entropy": 0.519924134016037,
"epoch": 1.4701492537313432,
"grad_norm": 0.15385456383228302,
"learning_rate": 0.0002,
"loss": 0.5183860659599304,
"mean_token_accuracy": 0.7890526354312897,
"num_tokens": 6441474.0,
"step": 394
},
{
"entropy": 0.5419893115758896,
"epoch": 1.4738805970149254,
"grad_norm": 0.12959027290344238,
"learning_rate": 0.0002,
"loss": 0.5391095876693726,
"mean_token_accuracy": 0.7845679074525833,
"num_tokens": 6458137.0,
"step": 395
},
{
"entropy": 0.5297622233629227,
"epoch": 1.4776119402985075,
"grad_norm": 0.12876980006694794,
"learning_rate": 0.0002,
"loss": 0.5316991209983826,
"mean_token_accuracy": 0.783607617020607,
"num_tokens": 6474605.0,
"step": 396
},
{
"entropy": 0.5133326500654221,
"epoch": 1.4813432835820897,
"grad_norm": 0.23840782046318054,
"learning_rate": 0.0002,
"loss": 0.5223475098609924,
"mean_token_accuracy": 0.7896056026220322,
"num_tokens": 6490747.0,
"step": 397
},
{
"entropy": 0.540631890296936,
"epoch": 1.4850746268656716,
"grad_norm": 0.18176521360874176,
"learning_rate": 0.0002,
"loss": 0.5429366230964661,
"mean_token_accuracy": 0.7787415534257889,
"num_tokens": 6507149.0,
"step": 398
},
{
"entropy": 0.5534960627555847,
"epoch": 1.4888059701492538,
"grad_norm": 0.38266992568969727,
"learning_rate": 0.0002,
"loss": 0.5652564764022827,
"mean_token_accuracy": 0.7736776769161224,
"num_tokens": 6523502.0,
"step": 399
},
{
"entropy": 0.5438710153102875,
"epoch": 1.4925373134328357,
"grad_norm": 0.15845677256584167,
"learning_rate": 0.0002,
"loss": 0.5439051985740662,
"mean_token_accuracy": 0.7816531956195831,
"num_tokens": 6539815.0,
"step": 400
},
{
"entropy": 0.5452860891819,
"epoch": 1.4962686567164178,
"grad_norm": 0.19755159318447113,
"learning_rate": 0.0002,
"loss": 0.5404053926467896,
"mean_token_accuracy": 0.7815948128700256,
"num_tokens": 6555976.0,
"step": 401
},
{
"entropy": 0.5241969153285027,
"epoch": 1.5,
"grad_norm": 0.14966075122356415,
"learning_rate": 0.0002,
"loss": 0.5205419063568115,
"mean_token_accuracy": 0.7888282835483551,
"num_tokens": 6572116.0,
"step": 402
},
{
"entropy": 0.5179315954446793,
"epoch": 1.5037313432835822,
"grad_norm": 0.15208128094673157,
"learning_rate": 0.0002,
"loss": 0.5195380449295044,
"mean_token_accuracy": 0.7901398837566376,
"num_tokens": 6588360.0,
"step": 403
},
{
"entropy": 0.5443613976240158,
"epoch": 1.5074626865671643,
"grad_norm": 0.15764807164669037,
"learning_rate": 0.0002,
"loss": 0.5409551858901978,
"mean_token_accuracy": 0.7817244678735733,
"num_tokens": 6604909.0,
"step": 404
},
{
"entropy": 0.5555933266878128,
"epoch": 1.5111940298507462,
"grad_norm": 0.15518265962600708,
"learning_rate": 0.0002,
"loss": 0.5575823187828064,
"mean_token_accuracy": 0.7727370858192444,
"num_tokens": 6621271.0,
"step": 405
},
{
"entropy": 0.5448516458272934,
"epoch": 1.5149253731343284,
"grad_norm": 0.13999900221824646,
"learning_rate": 0.0002,
"loss": 0.5443175435066223,
"mean_token_accuracy": 0.7797447293996811,
"num_tokens": 6637394.0,
"step": 406
},
{
"entropy": 0.5633855164051056,
"epoch": 1.5186567164179103,
"grad_norm": 0.12512464821338654,
"learning_rate": 0.0002,
"loss": 0.5552009344100952,
"mean_token_accuracy": 0.7740202099084854,
"num_tokens": 6653670.0,
"step": 407
},
{
"entropy": 0.5442499816417694,
"epoch": 1.5223880597014925,
"grad_norm": 0.13073165714740753,
"learning_rate": 0.0002,
"loss": 0.5353500843048096,
"mean_token_accuracy": 0.7859338223934174,
"num_tokens": 6670329.0,
"step": 408
},
{
"entropy": 0.5133479535579681,
"epoch": 1.5261194029850746,
"grad_norm": 0.1424253284931183,
"learning_rate": 0.0002,
"loss": 0.5181159377098083,
"mean_token_accuracy": 0.791978657245636,
"num_tokens": 6686590.0,
"step": 409
},
{
"entropy": 0.5216629430651665,
"epoch": 1.5298507462686568,
"grad_norm": 0.15952785313129425,
"learning_rate": 0.0002,
"loss": 0.5411725640296936,
"mean_token_accuracy": 0.7812029272317886,
"num_tokens": 6702970.0,
"step": 410
},
{
"entropy": 0.5392735451459885,
"epoch": 1.533582089552239,
"grad_norm": 0.13047060370445251,
"learning_rate": 0.0002,
"loss": 0.5485432147979736,
"mean_token_accuracy": 0.7774497866630554,
"num_tokens": 6719627.0,
"step": 411
},
{
"entropy": 0.5269859135150909,
"epoch": 1.537313432835821,
"grad_norm": 0.13100764155387878,
"learning_rate": 0.0002,
"loss": 0.5288376212120056,
"mean_token_accuracy": 0.7857958972454071,
"num_tokens": 6735951.0,
"step": 412
},
{
"entropy": 0.546154260635376,
"epoch": 1.5410447761194028,
"grad_norm": 0.13160941004753113,
"learning_rate": 0.0002,
"loss": 0.5382481813430786,
"mean_token_accuracy": 0.7786583751440048,
"num_tokens": 6752564.0,
"step": 413
},
{
"entropy": 0.552439495921135,
"epoch": 1.544776119402985,
"grad_norm": 0.13911442458629608,
"learning_rate": 0.0002,
"loss": 0.5381487011909485,
"mean_token_accuracy": 0.782607913017273,
"num_tokens": 6768993.0,
"step": 414
},
{
"entropy": 0.5463637262582779,
"epoch": 1.5485074626865671,
"grad_norm": 0.12377088516950607,
"learning_rate": 0.0002,
"loss": 0.5482580661773682,
"mean_token_accuracy": 0.7775403410196304,
"num_tokens": 6785304.0,
"step": 415
},
{
"entropy": 0.5309856235980988,
"epoch": 1.5522388059701493,
"grad_norm": 0.14743956923484802,
"learning_rate": 0.0002,
"loss": 0.5372556447982788,
"mean_token_accuracy": 0.7811425626277924,
"num_tokens": 6801545.0,
"step": 416
},
{
"entropy": 0.5256488621234894,
"epoch": 1.5559701492537314,
"grad_norm": 0.13745813071727753,
"learning_rate": 0.0002,
"loss": 0.5335954427719116,
"mean_token_accuracy": 0.7857853770256042,
"num_tokens": 6817793.0,
"step": 417
},
{
"entropy": 0.5426470339298248,
"epoch": 1.5597014925373134,
"grad_norm": 0.15729817748069763,
"learning_rate": 0.0002,
"loss": 0.5557945966720581,
"mean_token_accuracy": 0.7755606323480606,
"num_tokens": 6834171.0,
"step": 418
},
{
"entropy": 0.5429180264472961,
"epoch": 1.5634328358208955,
"grad_norm": 0.1530143916606903,
"learning_rate": 0.0002,
"loss": 0.5445144176483154,
"mean_token_accuracy": 0.7793177515268326,
"num_tokens": 6850298.0,
"step": 419
},
{
"entropy": 0.5458863228559494,
"epoch": 1.5671641791044775,
"grad_norm": 0.1244051530957222,
"learning_rate": 0.0002,
"loss": 0.5383530855178833,
"mean_token_accuracy": 0.7812670916318893,
"num_tokens": 6866891.0,
"step": 420
},
{
"entropy": 0.564603790640831,
"epoch": 1.5708955223880596,
"grad_norm": 0.14283782243728638,
"learning_rate": 0.0002,
"loss": 0.5600205659866333,
"mean_token_accuracy": 0.7725525945425034,
"num_tokens": 6883247.0,
"step": 421
},
{
"entropy": 0.5389530211687088,
"epoch": 1.5746268656716418,
"grad_norm": 0.13312764465808868,
"learning_rate": 0.0002,
"loss": 0.5395158529281616,
"mean_token_accuracy": 0.7833812385797501,
"num_tokens": 6899801.0,
"step": 422
},
{
"entropy": 0.5225178450345993,
"epoch": 1.578358208955224,
"grad_norm": 0.12671785056591034,
"learning_rate": 0.0002,
"loss": 0.530681312084198,
"mean_token_accuracy": 0.7860707342624664,
"num_tokens": 6916126.0,
"step": 423
},
{
"entropy": 0.5225076675415039,
"epoch": 1.582089552238806,
"grad_norm": 0.1846325844526291,
"learning_rate": 0.0002,
"loss": 0.5287823677062988,
"mean_token_accuracy": 0.7858179211616516,
"num_tokens": 6932572.0,
"step": 424
},
{
"entropy": 0.5322756171226501,
"epoch": 1.585820895522388,
"grad_norm": 0.1279527246952057,
"learning_rate": 0.0002,
"loss": 0.5314757823944092,
"mean_token_accuracy": 0.7839424312114716,
"num_tokens": 6948915.0,
"step": 425
},
{
"entropy": 0.5399055480957031,
"epoch": 1.5895522388059702,
"grad_norm": 0.14472827315330505,
"learning_rate": 0.0002,
"loss": 0.5389757752418518,
"mean_token_accuracy": 0.781254380941391,
"num_tokens": 6965311.0,
"step": 426
},
{
"entropy": 0.543253481388092,
"epoch": 1.5932835820895521,
"grad_norm": 0.1291203647851944,
"learning_rate": 0.0002,
"loss": 0.542615532875061,
"mean_token_accuracy": 0.7801599353551865,
"num_tokens": 6981751.0,
"step": 427
},
{
"entropy": 0.5258511453866959,
"epoch": 1.5970149253731343,
"grad_norm": 0.14912551641464233,
"learning_rate": 0.0002,
"loss": 0.5212829113006592,
"mean_token_accuracy": 0.7879799157381058,
"num_tokens": 6997999.0,
"step": 428
},
{
"entropy": 0.5359253436326981,
"epoch": 1.6007462686567164,
"grad_norm": 0.13902713358402252,
"learning_rate": 0.0002,
"loss": 0.5354318618774414,
"mean_token_accuracy": 0.7819556444883347,
"num_tokens": 7014251.0,
"step": 429
},
{
"entropy": 0.5399288833141327,
"epoch": 1.6044776119402986,
"grad_norm": 0.15356454253196716,
"learning_rate": 0.0002,
"loss": 0.5459235906600952,
"mean_token_accuracy": 0.7798596769571304,
"num_tokens": 7030929.0,
"step": 430
},
{
"entropy": 0.5489939600229263,
"epoch": 1.6082089552238807,
"grad_norm": 0.16724750399589539,
"learning_rate": 0.0002,
"loss": 0.5488972663879395,
"mean_token_accuracy": 0.7782986462116241,
"num_tokens": 7047344.0,
"step": 431
},
{
"entropy": 0.5521660596132278,
"epoch": 1.6119402985074627,
"grad_norm": 0.1370435506105423,
"learning_rate": 0.0002,
"loss": 0.5541171431541443,
"mean_token_accuracy": 0.775096669793129,
"num_tokens": 7063772.0,
"step": 432
},
{
"entropy": 0.5448116213083267,
"epoch": 1.6156716417910446,
"grad_norm": 0.16458411514759064,
"learning_rate": 0.0002,
"loss": 0.5444625616073608,
"mean_token_accuracy": 0.7808038741350174,
"num_tokens": 7080008.0,
"step": 433
},
{
"entropy": 0.5336454659700394,
"epoch": 1.6194029850746268,
"grad_norm": 0.13929054141044617,
"learning_rate": 0.0002,
"loss": 0.5374733805656433,
"mean_token_accuracy": 0.7845250517129898,
"num_tokens": 7096322.0,
"step": 434
},
{
"entropy": 0.5490863621234894,
"epoch": 1.623134328358209,
"grad_norm": 0.17425119876861572,
"learning_rate": 0.0002,
"loss": 0.5510268211364746,
"mean_token_accuracy": 0.7752214223146439,
"num_tokens": 7112627.0,
"step": 435
},
{
"entropy": 0.5409643575549126,
"epoch": 1.626865671641791,
"grad_norm": 0.1438315510749817,
"learning_rate": 0.0002,
"loss": 0.5421441197395325,
"mean_token_accuracy": 0.7772217243909836,
"num_tokens": 7128753.0,
"step": 436
},
{
"entropy": 0.5132558643817902,
"epoch": 1.6305970149253732,
"grad_norm": 0.19491760432720184,
"learning_rate": 0.0002,
"loss": 0.5172038674354553,
"mean_token_accuracy": 0.7922582030296326,
"num_tokens": 7145005.0,
"step": 437
},
{
"entropy": 0.5373466610908508,
"epoch": 1.6343283582089554,
"grad_norm": 0.1514309048652649,
"learning_rate": 0.0002,
"loss": 0.5405304431915283,
"mean_token_accuracy": 0.7827999889850616,
"num_tokens": 7161264.0,
"step": 438
},
{
"entropy": 0.5462755262851715,
"epoch": 1.6380597014925373,
"grad_norm": 0.1856052726507187,
"learning_rate": 0.0002,
"loss": 0.5462319254875183,
"mean_token_accuracy": 0.7752426117658615,
"num_tokens": 7177601.0,
"step": 439
},
{
"entropy": 0.5239088907837868,
"epoch": 1.6417910447761193,
"grad_norm": 0.15442201495170593,
"learning_rate": 0.0002,
"loss": 0.5269871950149536,
"mean_token_accuracy": 0.7881719172000885,
"num_tokens": 7194088.0,
"step": 440
},
{
"entropy": 0.5473000258207321,
"epoch": 1.6455223880597014,
"grad_norm": 0.1733047217130661,
"learning_rate": 0.0002,
"loss": 0.5468770861625671,
"mean_token_accuracy": 0.7766072303056717,
"num_tokens": 7210540.0,
"step": 441
},
{
"entropy": 0.5247174948453903,
"epoch": 1.6492537313432836,
"grad_norm": 0.15060853958129883,
"learning_rate": 0.0002,
"loss": 0.5271586179733276,
"mean_token_accuracy": 0.7868671417236328,
"num_tokens": 7226800.0,
"step": 442
},
{
"entropy": 0.5296545326709747,
"epoch": 1.6529850746268657,
"grad_norm": 0.14210547506809235,
"learning_rate": 0.0002,
"loss": 0.5233073830604553,
"mean_token_accuracy": 0.7905395030975342,
"num_tokens": 7242933.0,
"step": 443
},
{
"entropy": 0.5275071337819099,
"epoch": 1.6567164179104479,
"grad_norm": 0.16420303285121918,
"learning_rate": 0.0002,
"loss": 0.5262512564659119,
"mean_token_accuracy": 0.7878832370042801,
"num_tokens": 7259229.0,
"step": 444
},
{
"entropy": 0.5286994576454163,
"epoch": 1.6604477611940298,
"grad_norm": 0.16218696534633636,
"learning_rate": 0.0002,
"loss": 0.5356262922286987,
"mean_token_accuracy": 0.781034916639328,
"num_tokens": 7275629.0,
"step": 445
},
{
"entropy": 0.5249519050121307,
"epoch": 1.664179104477612,
"grad_norm": 0.13650326430797577,
"learning_rate": 0.0002,
"loss": 0.5306994915008545,
"mean_token_accuracy": 0.7849638760089874,
"num_tokens": 7291780.0,
"step": 446
},
{
"entropy": 0.5290274769067764,
"epoch": 1.667910447761194,
"grad_norm": 0.13130812346935272,
"learning_rate": 0.0002,
"loss": 0.5366880297660828,
"mean_token_accuracy": 0.7813905030488968,
"num_tokens": 7308207.0,
"step": 447
},
{
"entropy": 0.549896240234375,
"epoch": 1.671641791044776,
"grad_norm": 0.13799095153808594,
"learning_rate": 0.0002,
"loss": 0.542113721370697,
"mean_token_accuracy": 0.7787055224180222,
"num_tokens": 7324630.0,
"step": 448
},
{
"entropy": 0.5595291256904602,
"epoch": 1.6753731343283582,
"grad_norm": 0.12968024611473083,
"learning_rate": 0.0002,
"loss": 0.5542213320732117,
"mean_token_accuracy": 0.7749587148427963,
"num_tokens": 7340980.0,
"step": 449
},
{
"entropy": 0.5328024327754974,
"epoch": 1.6791044776119404,
"grad_norm": 0.15673688054084778,
"learning_rate": 0.0002,
"loss": 0.5303700566291809,
"mean_token_accuracy": 0.7840248346328735,
"num_tokens": 7357233.0,
"step": 450
},
{
"entropy": 0.527419738471508,
"epoch": 1.6828358208955225,
"grad_norm": 0.15271416306495667,
"learning_rate": 0.0002,
"loss": 0.5339101552963257,
"mean_token_accuracy": 0.7841878533363342,
"num_tokens": 7373557.0,
"step": 451
},
{
"entropy": 0.5285895839333534,
"epoch": 1.6865671641791045,
"grad_norm": 0.1619284600019455,
"learning_rate": 0.0002,
"loss": 0.5426527261734009,
"mean_token_accuracy": 0.7801112830638885,
"num_tokens": 7389775.0,
"step": 452
},
{
"entropy": 0.5160977020859718,
"epoch": 1.6902985074626866,
"grad_norm": 0.14479905366897583,
"learning_rate": 0.0002,
"loss": 0.5143705606460571,
"mean_token_accuracy": 0.792098343372345,
"num_tokens": 7406142.0,
"step": 453
},
{
"entropy": 0.5246409177780151,
"epoch": 1.6940298507462686,
"grad_norm": 0.13829895853996277,
"learning_rate": 0.0002,
"loss": 0.5241281986236572,
"mean_token_accuracy": 0.7888348549604416,
"num_tokens": 7422123.0,
"step": 454
},
{
"entropy": 0.539468988776207,
"epoch": 1.6977611940298507,
"grad_norm": 0.14040212333202362,
"learning_rate": 0.0002,
"loss": 0.538709282875061,
"mean_token_accuracy": 0.7814967185258865,
"num_tokens": 7438449.0,
"step": 455
},
{
"entropy": 0.5327620357275009,
"epoch": 1.7014925373134329,
"grad_norm": 0.13067209720611572,
"learning_rate": 0.0002,
"loss": 0.531409740447998,
"mean_token_accuracy": 0.7817434817552567,
"num_tokens": 7454843.0,
"step": 456
},
{
"entropy": 0.5428982973098755,
"epoch": 1.705223880597015,
"grad_norm": 0.13850897550582886,
"learning_rate": 0.0002,
"loss": 0.5404822826385498,
"mean_token_accuracy": 0.7804021388292313,
"num_tokens": 7471239.0,
"step": 457
},
{
"entropy": 0.5262090265750885,
"epoch": 1.7089552238805972,
"grad_norm": 0.1596522480249405,
"learning_rate": 0.0002,
"loss": 0.5263737440109253,
"mean_token_accuracy": 0.7867833971977234,
"num_tokens": 7487626.0,
"step": 458
},
{
"entropy": 0.5263974219560623,
"epoch": 1.712686567164179,
"grad_norm": 0.1885124146938324,
"learning_rate": 0.0002,
"loss": 0.5317808985710144,
"mean_token_accuracy": 0.7860947102308273,
"num_tokens": 7504295.0,
"step": 459
},
{
"entropy": 0.5409001708030701,
"epoch": 1.716417910447761,
"grad_norm": 0.18569619953632355,
"learning_rate": 0.0002,
"loss": 0.5502086877822876,
"mean_token_accuracy": 0.7780454903841019,
"num_tokens": 7520700.0,
"step": 460
},
{
"entropy": 0.5375530123710632,
"epoch": 1.7201492537313432,
"grad_norm": 0.1682044118642807,
"learning_rate": 0.0002,
"loss": 0.5417311787605286,
"mean_token_accuracy": 0.7811579406261444,
"num_tokens": 7537296.0,
"step": 461
},
{
"entropy": 0.5499445050954819,
"epoch": 1.7238805970149254,
"grad_norm": 0.13629741966724396,
"learning_rate": 0.0002,
"loss": 0.5424147248268127,
"mean_token_accuracy": 0.7811519056558609,
"num_tokens": 7553751.0,
"step": 462
},
{
"entropy": 0.5344928205013275,
"epoch": 1.7276119402985075,
"grad_norm": 0.15897303819656372,
"learning_rate": 0.0002,
"loss": 0.5359247326850891,
"mean_token_accuracy": 0.7842150777578354,
"num_tokens": 7569929.0,
"step": 463
},
{
"entropy": 0.5554052591323853,
"epoch": 1.7313432835820897,
"grad_norm": 0.1417708843946457,
"learning_rate": 0.0002,
"loss": 0.5565856099128723,
"mean_token_accuracy": 0.7738053798675537,
"num_tokens": 7586469.0,
"step": 464
},
{
"entropy": 0.5416853874921799,
"epoch": 1.7350746268656716,
"grad_norm": 0.13722717761993408,
"learning_rate": 0.0002,
"loss": 0.5358468294143677,
"mean_token_accuracy": 0.7817960679531097,
"num_tokens": 7602590.0,
"step": 465
},
{
"entropy": 0.5408632606267929,
"epoch": 1.7388059701492538,
"grad_norm": 0.157133087515831,
"learning_rate": 0.0002,
"loss": 0.5427348017692566,
"mean_token_accuracy": 0.7806098312139511,
"num_tokens": 7618775.0,
"step": 466
},
{
"entropy": 0.5247721523046494,
"epoch": 1.7425373134328357,
"grad_norm": 0.14061616361141205,
"learning_rate": 0.0002,
"loss": 0.5321290493011475,
"mean_token_accuracy": 0.7820450663566589,
"num_tokens": 7635093.0,
"step": 467
},
{
"entropy": 0.5205557495355606,
"epoch": 1.7462686567164178,
"grad_norm": 0.16123539209365845,
"learning_rate": 0.0002,
"loss": 0.5302354097366333,
"mean_token_accuracy": 0.7854211032390594,
"num_tokens": 7651685.0,
"step": 468
},
{
"entropy": 0.5282921940088272,
"epoch": 1.75,
"grad_norm": 0.15153366327285767,
"learning_rate": 0.0002,
"loss": 0.5328198671340942,
"mean_token_accuracy": 0.7865671813488007,
"num_tokens": 7667959.0,
"step": 469
},
{
"entropy": 0.5481950640678406,
"epoch": 1.7537313432835822,
"grad_norm": 0.12894481420516968,
"learning_rate": 0.0002,
"loss": 0.5497183799743652,
"mean_token_accuracy": 0.778036966919899,
"num_tokens": 7684463.0,
"step": 470
},
{
"entropy": 0.5427480936050415,
"epoch": 1.7574626865671643,
"grad_norm": 0.13647432625293732,
"learning_rate": 0.0002,
"loss": 0.53739994764328,
"mean_token_accuracy": 0.7818431705236435,
"num_tokens": 7700823.0,
"step": 471
},
{
"entropy": 0.5326214283704758,
"epoch": 1.7611940298507462,
"grad_norm": 0.13095979392528534,
"learning_rate": 0.0002,
"loss": 0.5291880369186401,
"mean_token_accuracy": 0.7828460037708282,
"num_tokens": 7717112.0,
"step": 472
},
{
"entropy": 0.5443256497383118,
"epoch": 1.7649253731343284,
"grad_norm": 0.15335077047348022,
"learning_rate": 0.0002,
"loss": 0.5414584875106812,
"mean_token_accuracy": 0.7815631777048111,
"num_tokens": 7733478.0,
"step": 473
},
{
"entropy": 0.5510082393884659,
"epoch": 1.7686567164179103,
"grad_norm": 0.12999047338962555,
"learning_rate": 0.0002,
"loss": 0.5502053499221802,
"mean_token_accuracy": 0.7768876850605011,
"num_tokens": 7749733.0,
"step": 474
},
{
"entropy": 0.5287549048662186,
"epoch": 1.7723880597014925,
"grad_norm": 0.14021116495132446,
"learning_rate": 0.0002,
"loss": 0.5351616740226746,
"mean_token_accuracy": 0.7807136327028275,
"num_tokens": 7766232.0,
"step": 475
},
{
"entropy": 0.5237460732460022,
"epoch": 1.7761194029850746,
"grad_norm": 0.13716712594032288,
"learning_rate": 0.0002,
"loss": 0.5256913900375366,
"mean_token_accuracy": 0.7864228338003159,
"num_tokens": 7782399.0,
"step": 476
},
{
"entropy": 0.5354984253644943,
"epoch": 1.7798507462686568,
"grad_norm": 0.1459989845752716,
"learning_rate": 0.0002,
"loss": 0.544316291809082,
"mean_token_accuracy": 0.7797362506389618,
"num_tokens": 7798787.0,
"step": 477
},
{
"entropy": 0.5540675520896912,
"epoch": 1.783582089552239,
"grad_norm": 0.12925799190998077,
"learning_rate": 0.0002,
"loss": 0.5467855334281921,
"mean_token_accuracy": 0.7801081091165543,
"num_tokens": 7815176.0,
"step": 478
},
{
"entropy": 0.5433181077241898,
"epoch": 1.787313432835821,
"grad_norm": 0.14298273622989655,
"learning_rate": 0.0002,
"loss": 0.5409605503082275,
"mean_token_accuracy": 0.7799843400716782,
"num_tokens": 7831722.0,
"step": 479
},
{
"entropy": 0.5389926880598068,
"epoch": 1.7910447761194028,
"grad_norm": 0.13404588401317596,
"learning_rate": 0.0002,
"loss": 0.5434566736221313,
"mean_token_accuracy": 0.7795996069908142,
"num_tokens": 7847789.0,
"step": 480
},
{
"entropy": 0.5389460772275925,
"epoch": 1.794776119402985,
"grad_norm": 0.14891406893730164,
"learning_rate": 0.0002,
"loss": 0.5478703379631042,
"mean_token_accuracy": 0.7784013152122498,
"num_tokens": 7864039.0,
"step": 481
},
{
"entropy": 0.5258179157972336,
"epoch": 1.7985074626865671,
"grad_norm": 0.1405036896467209,
"learning_rate": 0.0002,
"loss": 0.5313145518302917,
"mean_token_accuracy": 0.7844405174255371,
"num_tokens": 7880600.0,
"step": 482
},
{
"entropy": 0.546451672911644,
"epoch": 1.8022388059701493,
"grad_norm": 0.12789376080036163,
"learning_rate": 0.0002,
"loss": 0.5392628908157349,
"mean_token_accuracy": 0.780185878276825,
"num_tokens": 7897113.0,
"step": 483
},
{
"entropy": 0.5462568253278732,
"epoch": 1.8059701492537314,
"grad_norm": 0.15970084071159363,
"learning_rate": 0.0002,
"loss": 0.5421247482299805,
"mean_token_accuracy": 0.7790002077817917,
"num_tokens": 7913715.0,
"step": 484
},
{
"entropy": 0.5245223939418793,
"epoch": 1.8097014925373134,
"grad_norm": 0.12480644881725311,
"learning_rate": 0.0002,
"loss": 0.5243803858757019,
"mean_token_accuracy": 0.7877090722322464,
"num_tokens": 7930253.0,
"step": 485
},
{
"entropy": 0.5543881952762604,
"epoch": 1.8134328358208955,
"grad_norm": 0.17440125346183777,
"learning_rate": 0.0002,
"loss": 0.5611490607261658,
"mean_token_accuracy": 0.773423507809639,
"num_tokens": 7946773.0,
"step": 486
},
{
"entropy": 0.5448231846094131,
"epoch": 1.8171641791044775,
"grad_norm": 0.1254844069480896,
"learning_rate": 0.0002,
"loss": 0.5429874658584595,
"mean_token_accuracy": 0.7773167043924332,
"num_tokens": 7963214.0,
"step": 487
},
{
"entropy": 0.5249373018741608,
"epoch": 1.8208955223880596,
"grad_norm": 0.13412347435951233,
"learning_rate": 0.0002,
"loss": 0.5265883803367615,
"mean_token_accuracy": 0.7875321507453918,
"num_tokens": 7979611.0,
"step": 488
},
{
"entropy": 0.5382010042667389,
"epoch": 1.8246268656716418,
"grad_norm": 0.16182008385658264,
"learning_rate": 0.0002,
"loss": 0.5412148237228394,
"mean_token_accuracy": 0.7788311392068863,
"num_tokens": 7996094.0,
"step": 489
},
{
"entropy": 0.5332826524972916,
"epoch": 1.828358208955224,
"grad_norm": 0.1427432894706726,
"learning_rate": 0.0002,
"loss": 0.5368761420249939,
"mean_token_accuracy": 0.7825220227241516,
"num_tokens": 8012432.0,
"step": 490
},
{
"entropy": 0.5108669325709343,
"epoch": 1.832089552238806,
"grad_norm": 0.1509285867214203,
"learning_rate": 0.0002,
"loss": 0.5119490623474121,
"mean_token_accuracy": 0.7906075417995453,
"num_tokens": 8028665.0,
"step": 491
},
{
"entropy": 0.5145807713270187,
"epoch": 1.835820895522388,
"grad_norm": 0.1396896094083786,
"learning_rate": 0.0002,
"loss": 0.5196783542633057,
"mean_token_accuracy": 0.7873106449842453,
"num_tokens": 8044855.0,
"step": 492
},
{
"entropy": 0.5123258233070374,
"epoch": 1.8395522388059702,
"grad_norm": 0.14697767794132233,
"learning_rate": 0.0002,
"loss": 0.5223352909088135,
"mean_token_accuracy": 0.7885845303535461,
"num_tokens": 8061121.0,
"step": 493
},
{
"entropy": 0.5335386842489243,
"epoch": 1.8432835820895521,
"grad_norm": 0.14804190397262573,
"learning_rate": 0.0002,
"loss": 0.534782350063324,
"mean_token_accuracy": 0.7838051915168762,
"num_tokens": 8077519.0,
"step": 494
},
{
"entropy": 0.5602670460939407,
"epoch": 1.8470149253731343,
"grad_norm": 0.13603031635284424,
"learning_rate": 0.0002,
"loss": 0.5542025566101074,
"mean_token_accuracy": 0.7756092548370361,
"num_tokens": 8093937.0,
"step": 495
},
{
"entropy": 0.5355454534292221,
"epoch": 1.8507462686567164,
"grad_norm": 0.11670524626970291,
"learning_rate": 0.0002,
"loss": 0.5269724130630493,
"mean_token_accuracy": 0.7864131927490234,
"num_tokens": 8110383.0,
"step": 496
},
{
"entropy": 0.5373311340808868,
"epoch": 1.8544776119402986,
"grad_norm": 0.13412456214427948,
"learning_rate": 0.0002,
"loss": 0.5295535326004028,
"mean_token_accuracy": 0.7874404042959213,
"num_tokens": 8126795.0,
"step": 497
},
{
"entropy": 0.5373153984546661,
"epoch": 1.8582089552238807,
"grad_norm": 0.1485511064529419,
"learning_rate": 0.0002,
"loss": 0.5427818894386292,
"mean_token_accuracy": 0.7803584039211273,
"num_tokens": 8143234.0,
"step": 498
},
{
"entropy": 0.522105023264885,
"epoch": 1.8619402985074627,
"grad_norm": 0.1580716073513031,
"learning_rate": 0.0002,
"loss": 0.5267635583877563,
"mean_token_accuracy": 0.7869967371225357,
"num_tokens": 8159687.0,
"step": 499
},
{
"entropy": 0.5215406715869904,
"epoch": 1.8656716417910446,
"grad_norm": 0.1573050171136856,
"learning_rate": 0.0002,
"loss": 0.5285288691520691,
"mean_token_accuracy": 0.7851908951997757,
"num_tokens": 8176020.0,
"step": 500
},
{
"entropy": 0.5404719114303589,
"epoch": 1.8694029850746268,
"grad_norm": 0.1411486119031906,
"learning_rate": 0.0002,
"loss": 0.5365728735923767,
"mean_token_accuracy": 0.7837002873420715,
"num_tokens": 8192551.0,
"step": 501
},
{
"entropy": 0.5438470244407654,
"epoch": 1.873134328358209,
"grad_norm": 0.130998432636261,
"learning_rate": 0.0002,
"loss": 0.5430339574813843,
"mean_token_accuracy": 0.7819307893514633,
"num_tokens": 8209082.0,
"step": 502
},
{
"entropy": 0.5403178930282593,
"epoch": 1.876865671641791,
"grad_norm": 0.1385144740343094,
"learning_rate": 0.0002,
"loss": 0.5460789203643799,
"mean_token_accuracy": 0.7790951728820801,
"num_tokens": 8225744.0,
"step": 503
},
{
"entropy": 0.5280100554227829,
"epoch": 1.8805970149253732,
"grad_norm": 0.14330939948558807,
"learning_rate": 0.0002,
"loss": 0.5235118269920349,
"mean_token_accuracy": 0.7890605628490448,
"num_tokens": 8242208.0,
"step": 504
},
{
"entropy": 0.5532096922397614,
"epoch": 1.8843283582089554,
"grad_norm": 0.1357594132423401,
"learning_rate": 0.0002,
"loss": 0.5498918890953064,
"mean_token_accuracy": 0.7760927677154541,
"num_tokens": 8258496.0,
"step": 505
},
{
"entropy": 0.5294792056083679,
"epoch": 1.8880597014925373,
"grad_norm": 0.13375437259674072,
"learning_rate": 0.0002,
"loss": 0.5297701358795166,
"mean_token_accuracy": 0.7845475971698761,
"num_tokens": 8274536.0,
"step": 506
},
{
"entropy": 0.5456722378730774,
"epoch": 1.8917910447761193,
"grad_norm": 0.14889481663703918,
"learning_rate": 0.0002,
"loss": 0.5517223477363586,
"mean_token_accuracy": 0.7756078243255615,
"num_tokens": 8290986.0,
"step": 507
},
{
"entropy": 0.5214451998472214,
"epoch": 1.8955223880597014,
"grad_norm": 0.13305895030498505,
"learning_rate": 0.0002,
"loss": 0.5249897837638855,
"mean_token_accuracy": 0.7870367765426636,
"num_tokens": 8307117.0,
"step": 508
},
{
"entropy": 0.5336883068084717,
"epoch": 1.8992537313432836,
"grad_norm": 0.13193877041339874,
"learning_rate": 0.0002,
"loss": 0.5352887511253357,
"mean_token_accuracy": 0.7798391133546829,
"num_tokens": 8323273.0,
"step": 509
},
{
"entropy": 0.5336564183235168,
"epoch": 1.9029850746268657,
"grad_norm": 0.12489310652017593,
"learning_rate": 0.0002,
"loss": 0.5302382111549377,
"mean_token_accuracy": 0.7845423817634583,
"num_tokens": 8339385.0,
"step": 510
},
{
"entropy": 0.5382219552993774,
"epoch": 1.9067164179104479,
"grad_norm": 0.1456049680709839,
"learning_rate": 0.0002,
"loss": 0.5372790694236755,
"mean_token_accuracy": 0.782544881105423,
"num_tokens": 8355706.0,
"step": 511
},
{
"entropy": 0.5403454750776291,
"epoch": 1.9104477611940298,
"grad_norm": 0.12694604694843292,
"learning_rate": 0.0002,
"loss": 0.5402185320854187,
"mean_token_accuracy": 0.7826471477746964,
"num_tokens": 8372132.0,
"step": 512
},
{
"entropy": 0.5318908393383026,
"epoch": 1.914179104477612,
"grad_norm": 0.1555122435092926,
"learning_rate": 0.0002,
"loss": 0.541782796382904,
"mean_token_accuracy": 0.7774071842432022,
"num_tokens": 8388306.0,
"step": 513
},
{
"entropy": 0.5221689939498901,
"epoch": 1.917910447761194,
"grad_norm": 0.1543516367673874,
"learning_rate": 0.0002,
"loss": 0.5357338190078735,
"mean_token_accuracy": 0.7826261073350906,
"num_tokens": 8404876.0,
"step": 514
},
{
"entropy": 0.5229770094156265,
"epoch": 1.921641791044776,
"grad_norm": 0.13613452017307281,
"learning_rate": 0.0002,
"loss": 0.5244792699813843,
"mean_token_accuracy": 0.7872123420238495,
"num_tokens": 8421349.0,
"step": 515
},
{
"entropy": 0.5398612320423126,
"epoch": 1.9253731343283582,
"grad_norm": 0.14049243927001953,
"learning_rate": 0.0002,
"loss": 0.5422282218933105,
"mean_token_accuracy": 0.7783734500408173,
"num_tokens": 8437774.0,
"step": 516
},
{
"entropy": 0.5401616841554642,
"epoch": 1.9291044776119404,
"grad_norm": 0.13164237141609192,
"learning_rate": 0.0002,
"loss": 0.5331213474273682,
"mean_token_accuracy": 0.7848468571901321,
"num_tokens": 8454123.0,
"step": 517
},
{
"entropy": 0.5214215666055679,
"epoch": 1.9328358208955225,
"grad_norm": 0.13749226927757263,
"learning_rate": 0.0002,
"loss": 0.5158907175064087,
"mean_token_accuracy": 0.7904626429080963,
"num_tokens": 8470320.0,
"step": 518
},
{
"entropy": 0.5412722826004028,
"epoch": 1.9365671641791045,
"grad_norm": 0.127340629696846,
"learning_rate": 0.0002,
"loss": 0.5443693995475769,
"mean_token_accuracy": 0.7785214781761169,
"num_tokens": 8486754.0,
"step": 519
},
{
"entropy": 0.5276665389537811,
"epoch": 1.9402985074626866,
"grad_norm": 0.13310599327087402,
"learning_rate": 0.0002,
"loss": 0.5311852693557739,
"mean_token_accuracy": 0.7849074453115463,
"num_tokens": 8503273.0,
"step": 520
},
{
"entropy": 0.5346188247203827,
"epoch": 1.9440298507462686,
"grad_norm": 0.12909531593322754,
"learning_rate": 0.0002,
"loss": 0.5408310890197754,
"mean_token_accuracy": 0.779103621840477,
"num_tokens": 8519520.0,
"step": 521
},
{
"entropy": 0.5392955094575882,
"epoch": 1.9477611940298507,
"grad_norm": 0.12654371559619904,
"learning_rate": 0.0002,
"loss": 0.5376543998718262,
"mean_token_accuracy": 0.7810464203357697,
"num_tokens": 8535688.0,
"step": 522
},
{
"entropy": 0.526744157075882,
"epoch": 1.9514925373134329,
"grad_norm": 0.11877280473709106,
"learning_rate": 0.0002,
"loss": 0.5258936882019043,
"mean_token_accuracy": 0.7875306010246277,
"num_tokens": 8551996.0,
"step": 523
},
{
"entropy": 0.5467166006565094,
"epoch": 1.955223880597015,
"grad_norm": 0.1407010555267334,
"learning_rate": 0.0002,
"loss": 0.5389098525047302,
"mean_token_accuracy": 0.7805493026971817,
"num_tokens": 8568202.0,
"step": 524
},
{
"entropy": 0.5553875267505646,
"epoch": 1.9589552238805972,
"grad_norm": 0.13490191102027893,
"learning_rate": 0.0002,
"loss": 0.5481207370758057,
"mean_token_accuracy": 0.7784747332334518,
"num_tokens": 8584625.0,
"step": 525
},
{
"entropy": 0.5178312584757805,
"epoch": 1.962686567164179,
"grad_norm": 0.14236751198768616,
"learning_rate": 0.0002,
"loss": 0.5226012468338013,
"mean_token_accuracy": 0.7866991758346558,
"num_tokens": 8600683.0,
"step": 526
},
{
"entropy": 0.5227778926491737,
"epoch": 1.966417910447761,
"grad_norm": 0.16303445398807526,
"learning_rate": 0.0002,
"loss": 0.5365378856658936,
"mean_token_accuracy": 0.7807085812091827,
"num_tokens": 8616685.0,
"step": 527
},
{
"entropy": 0.5410575568675995,
"epoch": 1.9701492537313432,
"grad_norm": 0.16557544469833374,
"learning_rate": 0.0002,
"loss": 0.5510291457176208,
"mean_token_accuracy": 0.7770103365182877,
"num_tokens": 8633088.0,
"step": 528
},
{
"entropy": 0.531767264008522,
"epoch": 1.9738805970149254,
"grad_norm": 0.16024784743785858,
"learning_rate": 0.0002,
"loss": 0.5305666327476501,
"mean_token_accuracy": 0.7834270149469376,
"num_tokens": 8649322.0,
"step": 529
},
{
"entropy": 0.5423388332128525,
"epoch": 1.9776119402985075,
"grad_norm": 0.1314675360918045,
"learning_rate": 0.0002,
"loss": 0.5316357016563416,
"mean_token_accuracy": 0.7857660055160522,
"num_tokens": 8665670.0,
"step": 530
},
{
"entropy": 0.5405716001987457,
"epoch": 1.9813432835820897,
"grad_norm": 0.1407650113105774,
"learning_rate": 0.0002,
"loss": 0.5429906845092773,
"mean_token_accuracy": 0.7817323058843613,
"num_tokens": 8681998.0,
"step": 531
},
{
"entropy": 0.5365249365568161,
"epoch": 1.9850746268656716,
"grad_norm": 0.14180989563465118,
"learning_rate": 0.0002,
"loss": 0.5345437526702881,
"mean_token_accuracy": 0.7865561246871948,
"num_tokens": 8698483.0,
"step": 532
},
{
"entropy": 0.5290075689554214,
"epoch": 1.9888059701492538,
"grad_norm": 0.1477176696062088,
"learning_rate": 0.0002,
"loss": 0.5337146520614624,
"mean_token_accuracy": 0.7824839055538177,
"num_tokens": 8714640.0,
"step": 533
},
{
"entropy": 0.5333692282438278,
"epoch": 1.9925373134328357,
"grad_norm": 0.17112773656845093,
"learning_rate": 0.0002,
"loss": 0.5424102544784546,
"mean_token_accuracy": 0.779076337814331,
"num_tokens": 8730887.0,
"step": 534
},
{
"entropy": 0.5415492355823517,
"epoch": 1.9962686567164178,
"grad_norm": 0.14943642914295197,
"learning_rate": 0.0002,
"loss": 0.5476213693618774,
"mean_token_accuracy": 0.7769679576158524,
"num_tokens": 8747309.0,
"step": 535
},
{
"entropy": 0.5581045299768448,
"epoch": 2.0,
"grad_norm": 0.15832063555717468,
"learning_rate": 0.0002,
"loss": 0.5548263788223267,
"mean_token_accuracy": 0.776277557015419,
"num_tokens": 8763550.0,
"step": 536
},
{
"entropy": 0.5369964390993118,
"epoch": 2.003731343283582,
"grad_norm": 0.15130668878555298,
"learning_rate": 0.0002,
"loss": 0.5179107189178467,
"mean_token_accuracy": 0.7907675057649612,
"num_tokens": 8779922.0,
"step": 537
},
{
"entropy": 0.5117110908031464,
"epoch": 2.0074626865671643,
"grad_norm": 0.16026535630226135,
"learning_rate": 0.0002,
"loss": 0.5020841956138611,
"mean_token_accuracy": 0.7973873615264893,
"num_tokens": 8795988.0,
"step": 538
},
{
"entropy": 0.5028296113014221,
"epoch": 2.0111940298507465,
"grad_norm": 0.1676231324672699,
"learning_rate": 0.0002,
"loss": 0.51214998960495,
"mean_token_accuracy": 0.7921472936868668,
"num_tokens": 8812261.0,
"step": 539
},
{
"entropy": 0.5081141889095306,
"epoch": 2.014925373134328,
"grad_norm": 0.21105162799358368,
"learning_rate": 0.0002,
"loss": 0.5206259489059448,
"mean_token_accuracy": 0.7869252115488052,
"num_tokens": 8828964.0,
"step": 540
},
{
"entropy": 0.5053770169615746,
"epoch": 2.0186567164179103,
"grad_norm": 0.1996072232723236,
"learning_rate": 0.0002,
"loss": 0.5146310329437256,
"mean_token_accuracy": 0.7916830629110336,
"num_tokens": 8845583.0,
"step": 541
},
{
"entropy": 0.5284380093216896,
"epoch": 2.0223880597014925,
"grad_norm": 0.14588730037212372,
"learning_rate": 0.0002,
"loss": 0.5199918150901794,
"mean_token_accuracy": 0.7893239259719849,
"num_tokens": 8861873.0,
"step": 542
},
{
"entropy": 0.5435770899057388,
"epoch": 2.0261194029850746,
"grad_norm": 0.14907799661159515,
"learning_rate": 0.0002,
"loss": 0.536811113357544,
"mean_token_accuracy": 0.7802763283252716,
"num_tokens": 8878456.0,
"step": 543
},
{
"entropy": 0.5174986571073532,
"epoch": 2.029850746268657,
"grad_norm": 0.14996512234210968,
"learning_rate": 0.0002,
"loss": 0.5144167542457581,
"mean_token_accuracy": 0.7930785864591599,
"num_tokens": 8894797.0,
"step": 544
},
{
"entropy": 0.5272421538829803,
"epoch": 2.033582089552239,
"grad_norm": 0.16765476763248444,
"learning_rate": 0.0002,
"loss": 0.5306269526481628,
"mean_token_accuracy": 0.7856330573558807,
"num_tokens": 8911217.0,
"step": 545
},
{
"entropy": 0.49972501397132874,
"epoch": 2.0373134328358207,
"grad_norm": 0.1322057694196701,
"learning_rate": 0.0002,
"loss": 0.5012874603271484,
"mean_token_accuracy": 0.7979290634393692,
"num_tokens": 8927511.0,
"step": 546
},
{
"entropy": 0.5031155720353127,
"epoch": 2.041044776119403,
"grad_norm": 0.16402538120746613,
"learning_rate": 0.0002,
"loss": 0.5100584626197815,
"mean_token_accuracy": 0.7926298826932907,
"num_tokens": 8943509.0,
"step": 547
},
{
"entropy": 0.5090021565556526,
"epoch": 2.044776119402985,
"grad_norm": 0.1516626924276352,
"learning_rate": 0.0002,
"loss": 0.51352858543396,
"mean_token_accuracy": 0.7925879657268524,
"num_tokens": 8959744.0,
"step": 548
},
{
"entropy": 0.4990556240081787,
"epoch": 2.048507462686567,
"grad_norm": 0.14189165830612183,
"learning_rate": 0.0002,
"loss": 0.5032692551612854,
"mean_token_accuracy": 0.7943097651004791,
"num_tokens": 8976001.0,
"step": 549
},
{
"entropy": 0.5276429355144501,
"epoch": 2.0522388059701493,
"grad_norm": 0.13545501232147217,
"learning_rate": 0.0002,
"loss": 0.5224078893661499,
"mean_token_accuracy": 0.7892052680253983,
"num_tokens": 8992265.0,
"step": 550
},
{
"entropy": 0.5246792286634445,
"epoch": 2.0559701492537314,
"grad_norm": 0.15987011790275574,
"learning_rate": 0.0002,
"loss": 0.5220500230789185,
"mean_token_accuracy": 0.7897221744060516,
"num_tokens": 9008612.0,
"step": 551
},
{
"entropy": 0.5142855197191238,
"epoch": 2.0597014925373136,
"grad_norm": 0.17870153486728668,
"learning_rate": 0.0002,
"loss": 0.5103524923324585,
"mean_token_accuracy": 0.7925411611795425,
"num_tokens": 9025112.0,
"step": 552
},
{
"entropy": 0.5080101564526558,
"epoch": 2.0634328358208953,
"grad_norm": 0.19365249574184418,
"learning_rate": 0.0002,
"loss": 0.5135321617126465,
"mean_token_accuracy": 0.792420819401741,
"num_tokens": 9041825.0,
"step": 553
},
{
"entropy": 0.5249690413475037,
"epoch": 2.0671641791044775,
"grad_norm": 0.17408262193202972,
"learning_rate": 0.0002,
"loss": 0.527820348739624,
"mean_token_accuracy": 0.7850991487503052,
"num_tokens": 9058218.0,
"step": 554
},
{
"entropy": 0.5355798751115799,
"epoch": 2.0708955223880596,
"grad_norm": 0.17400678992271423,
"learning_rate": 0.0002,
"loss": 0.5327027440071106,
"mean_token_accuracy": 0.7834015786647797,
"num_tokens": 9074538.0,
"step": 555
},
{
"entropy": 0.5193932577967644,
"epoch": 2.074626865671642,
"grad_norm": 0.19260965287685394,
"learning_rate": 0.0002,
"loss": 0.5203508138656616,
"mean_token_accuracy": 0.7900512516498566,
"num_tokens": 9090645.0,
"step": 556
},
{
"entropy": 0.5282454341650009,
"epoch": 2.078358208955224,
"grad_norm": 0.17010283470153809,
"learning_rate": 0.0002,
"loss": 0.5296856760978699,
"mean_token_accuracy": 0.7844990193843842,
"num_tokens": 9107205.0,
"step": 557
},
{
"entropy": 0.5335307121276855,
"epoch": 2.082089552238806,
"grad_norm": 0.18085786700248718,
"learning_rate": 0.0002,
"loss": 0.5380091667175293,
"mean_token_accuracy": 0.7830383628606796,
"num_tokens": 9123633.0,
"step": 558
},
{
"entropy": 0.5050861239433289,
"epoch": 2.0858208955223883,
"grad_norm": 0.1828233301639557,
"learning_rate": 0.0002,
"loss": 0.5116996169090271,
"mean_token_accuracy": 0.7909363359212875,
"num_tokens": 9139672.0,
"step": 559
},
{
"entropy": 0.5233924090862274,
"epoch": 2.08955223880597,
"grad_norm": 0.1721849888563156,
"learning_rate": 0.0002,
"loss": 0.5234174728393555,
"mean_token_accuracy": 0.7887046784162521,
"num_tokens": 9156329.0,
"step": 560
},
{
"entropy": 0.5096859857439995,
"epoch": 2.093283582089552,
"grad_norm": 0.13895049691200256,
"learning_rate": 0.0002,
"loss": 0.5016306638717651,
"mean_token_accuracy": 0.7958591133356094,
"num_tokens": 9172549.0,
"step": 561
},
{
"entropy": 0.5022074803709984,
"epoch": 2.0970149253731343,
"grad_norm": 0.18107853829860687,
"learning_rate": 0.0002,
"loss": 0.49785315990448,
"mean_token_accuracy": 0.7988625317811966,
"num_tokens": 9188916.0,
"step": 562
},
{
"entropy": 0.49919093400239944,
"epoch": 2.1007462686567164,
"grad_norm": 0.18361544609069824,
"learning_rate": 0.0002,
"loss": 0.5069372057914734,
"mean_token_accuracy": 0.7953463643789291,
"num_tokens": 9205116.0,
"step": 563
},
{
"entropy": 0.5179380178451538,
"epoch": 2.1044776119402986,
"grad_norm": 0.17814478278160095,
"learning_rate": 0.0002,
"loss": 0.5233405232429504,
"mean_token_accuracy": 0.7879672199487686,
"num_tokens": 9221422.0,
"step": 564
},
{
"entropy": 0.5209343507885933,
"epoch": 2.1082089552238807,
"grad_norm": 0.16368801891803741,
"learning_rate": 0.0002,
"loss": 0.5220014452934265,
"mean_token_accuracy": 0.7900985032320023,
"num_tokens": 9237878.0,
"step": 565
},
{
"entropy": 0.5203168541193008,
"epoch": 2.111940298507463,
"grad_norm": 0.18038009107112885,
"learning_rate": 0.0002,
"loss": 0.5181905627250671,
"mean_token_accuracy": 0.7902995347976685,
"num_tokens": 9254207.0,
"step": 566
},
{
"entropy": 0.5203139036893845,
"epoch": 2.1156716417910446,
"grad_norm": 0.15972773730754852,
"learning_rate": 0.0002,
"loss": 0.5092154145240784,
"mean_token_accuracy": 0.793173611164093,
"num_tokens": 9270204.0,
"step": 567
},
{
"entropy": 0.5298740118741989,
"epoch": 2.1194029850746268,
"grad_norm": 0.16917745769023895,
"learning_rate": 0.0002,
"loss": 0.521593451499939,
"mean_token_accuracy": 0.789896160364151,
"num_tokens": 9286472.0,
"step": 568
},
{
"entropy": 0.5120234042406082,
"epoch": 2.123134328358209,
"grad_norm": 0.1817537248134613,
"learning_rate": 0.0002,
"loss": 0.5180550813674927,
"mean_token_accuracy": 0.7886006981134415,
"num_tokens": 9302801.0,
"step": 569
},
{
"entropy": 0.5053592845797539,
"epoch": 2.126865671641791,
"grad_norm": 0.17402999103069305,
"learning_rate": 0.0002,
"loss": 0.5133467316627502,
"mean_token_accuracy": 0.7945185601711273,
"num_tokens": 9318994.0,
"step": 570
},
{
"entropy": 0.5077695101499557,
"epoch": 2.1305970149253732,
"grad_norm": 0.1826324611902237,
"learning_rate": 0.0002,
"loss": 0.5111861228942871,
"mean_token_accuracy": 0.7935459464788437,
"num_tokens": 9335440.0,
"step": 571
},
{
"entropy": 0.5085733756422997,
"epoch": 2.1343283582089554,
"grad_norm": 0.20258648693561554,
"learning_rate": 0.0002,
"loss": 0.5162274837493896,
"mean_token_accuracy": 0.7936873137950897,
"num_tokens": 9351752.0,
"step": 572
},
{
"entropy": 0.5466553270816803,
"epoch": 2.138059701492537,
"grad_norm": 0.21011336147785187,
"learning_rate": 0.0002,
"loss": 0.5393267273902893,
"mean_token_accuracy": 0.7812587320804596,
"num_tokens": 9368219.0,
"step": 573
},
{
"entropy": 0.5103291645646095,
"epoch": 2.1417910447761193,
"grad_norm": 0.16960836946964264,
"learning_rate": 0.0002,
"loss": 0.5084283351898193,
"mean_token_accuracy": 0.7936739772558212,
"num_tokens": 9384590.0,
"step": 574
},
{
"entropy": 0.5131630301475525,
"epoch": 2.1455223880597014,
"grad_norm": 0.17001323401927948,
"learning_rate": 0.0002,
"loss": 0.5123889446258545,
"mean_token_accuracy": 0.7904325425624847,
"num_tokens": 9400768.0,
"step": 575
},
{
"entropy": 0.5091337114572525,
"epoch": 2.1492537313432836,
"grad_norm": 0.19518889486789703,
"learning_rate": 0.0002,
"loss": 0.512664794921875,
"mean_token_accuracy": 0.7909765988588333,
"num_tokens": 9416962.0,
"step": 576
},
{
"entropy": 0.506959430873394,
"epoch": 2.1529850746268657,
"grad_norm": 0.19361013174057007,
"learning_rate": 0.0002,
"loss": 0.5145208835601807,
"mean_token_accuracy": 0.7909970581531525,
"num_tokens": 9433273.0,
"step": 577
},
{
"entropy": 0.5075285658240318,
"epoch": 2.156716417910448,
"grad_norm": 0.20014171302318573,
"learning_rate": 0.0002,
"loss": 0.5108210444450378,
"mean_token_accuracy": 0.795252114534378,
"num_tokens": 9449764.0,
"step": 578
},
{
"entropy": 0.5293942838907242,
"epoch": 2.16044776119403,
"grad_norm": 0.1974441111087799,
"learning_rate": 0.0002,
"loss": 0.5285412669181824,
"mean_token_accuracy": 0.7868294268846512,
"num_tokens": 9466170.0,
"step": 579
},
{
"entropy": 0.5336958318948746,
"epoch": 2.1641791044776117,
"grad_norm": 0.16498853266239166,
"learning_rate": 0.0002,
"loss": 0.5246227383613586,
"mean_token_accuracy": 0.7904203087091446,
"num_tokens": 9482671.0,
"step": 580
},
{
"entropy": 0.5340626388788223,
"epoch": 2.167910447761194,
"grad_norm": 0.16569171845912933,
"learning_rate": 0.0002,
"loss": 0.5292053818702698,
"mean_token_accuracy": 0.7861965000629425,
"num_tokens": 9499134.0,
"step": 581
},
{
"entropy": 0.5213732421398163,
"epoch": 2.171641791044776,
"grad_norm": 0.191435769200325,
"learning_rate": 0.0002,
"loss": 0.527378499507904,
"mean_token_accuracy": 0.7864173054695129,
"num_tokens": 9515505.0,
"step": 582
},
{
"entropy": 0.5035439431667328,
"epoch": 2.175373134328358,
"grad_norm": 0.1665230244398117,
"learning_rate": 0.0002,
"loss": 0.5038704872131348,
"mean_token_accuracy": 0.7968962043523788,
"num_tokens": 9532118.0,
"step": 583
},
{
"entropy": 0.5060234367847443,
"epoch": 2.1791044776119404,
"grad_norm": 0.16969595849514008,
"learning_rate": 0.0002,
"loss": 0.5113446712493896,
"mean_token_accuracy": 0.7920107841491699,
"num_tokens": 9548351.0,
"step": 584
},
{
"entropy": 0.5291168391704559,
"epoch": 2.1828358208955225,
"grad_norm": 0.16809239983558655,
"learning_rate": 0.0002,
"loss": 0.5360448360443115,
"mean_token_accuracy": 0.7811578214168549,
"num_tokens": 9564913.0,
"step": 585
},
{
"entropy": 0.5199222788214684,
"epoch": 2.1865671641791047,
"grad_norm": 0.15394440293312073,
"learning_rate": 0.0002,
"loss": 0.5177597403526306,
"mean_token_accuracy": 0.7905119061470032,
"num_tokens": 9581583.0,
"step": 586
},
{
"entropy": 0.5282980501651764,
"epoch": 2.1902985074626864,
"grad_norm": 0.17473557591438293,
"learning_rate": 0.0002,
"loss": 0.527908980846405,
"mean_token_accuracy": 0.7872945964336395,
"num_tokens": 9598262.0,
"step": 587
},
{
"entropy": 0.5268830358982086,
"epoch": 2.1940298507462686,
"grad_norm": 0.16386888921260834,
"learning_rate": 0.0002,
"loss": 0.5233091711997986,
"mean_token_accuracy": 0.788049191236496,
"num_tokens": 9614535.0,
"step": 588
},
{
"entropy": 0.5275766104459763,
"epoch": 2.1977611940298507,
"grad_norm": 0.17853675782680511,
"learning_rate": 0.0002,
"loss": 0.5314985513687134,
"mean_token_accuracy": 0.7853439450263977,
"num_tokens": 9630730.0,
"step": 589
},
{
"entropy": 0.5230407416820526,
"epoch": 2.201492537313433,
"grad_norm": 0.18614573776721954,
"learning_rate": 0.0002,
"loss": 0.5324023365974426,
"mean_token_accuracy": 0.7870204299688339,
"num_tokens": 9647367.0,
"step": 590
},
{
"entropy": 0.5045590102672577,
"epoch": 2.205223880597015,
"grad_norm": 0.16460436582565308,
"learning_rate": 0.0002,
"loss": 0.5095564723014832,
"mean_token_accuracy": 0.7933550179004669,
"num_tokens": 9663807.0,
"step": 591
},
{
"entropy": 0.5061227604746819,
"epoch": 2.208955223880597,
"grad_norm": 0.1727134734392166,
"learning_rate": 0.0002,
"loss": 0.50539630651474,
"mean_token_accuracy": 0.79543037712574,
"num_tokens": 9679957.0,
"step": 592
},
{
"entropy": 0.5444381237030029,
"epoch": 2.2126865671641793,
"grad_norm": 0.1631772667169571,
"learning_rate": 0.0002,
"loss": 0.5421435832977295,
"mean_token_accuracy": 0.7804461270570755,
"num_tokens": 9696269.0,
"step": 593
},
{
"entropy": 0.5140876695513725,
"epoch": 2.216417910447761,
"grad_norm": 0.14234963059425354,
"learning_rate": 0.0002,
"loss": 0.5083339214324951,
"mean_token_accuracy": 0.7940346747636795,
"num_tokens": 9712614.0,
"step": 594
},
{
"entropy": 0.5227879285812378,
"epoch": 2.220149253731343,
"grad_norm": 0.1700550764799118,
"learning_rate": 0.0002,
"loss": 0.5256499648094177,
"mean_token_accuracy": 0.788642093539238,
"num_tokens": 9729090.0,
"step": 595
},
{
"entropy": 0.5193727314472198,
"epoch": 2.2238805970149254,
"grad_norm": 0.16189917922019958,
"learning_rate": 0.0002,
"loss": 0.515200674533844,
"mean_token_accuracy": 0.7933167964220047,
"num_tokens": 9745602.0,
"step": 596
},
{
"entropy": 0.5037901103496552,
"epoch": 2.2276119402985075,
"grad_norm": 0.15295493602752686,
"learning_rate": 0.0002,
"loss": 0.5038392543792725,
"mean_token_accuracy": 0.7972543388605118,
"num_tokens": 9761880.0,
"step": 597
},
{
"entropy": 0.5051177442073822,
"epoch": 2.2313432835820897,
"grad_norm": 0.18619783222675323,
"learning_rate": 0.0002,
"loss": 0.5126343369483948,
"mean_token_accuracy": 0.794564738869667,
"num_tokens": 9778073.0,
"step": 598
},
{
"entropy": 0.5051270872354507,
"epoch": 2.235074626865672,
"grad_norm": 0.1611267328262329,
"learning_rate": 0.0002,
"loss": 0.5092532634735107,
"mean_token_accuracy": 0.7946549952030182,
"num_tokens": 9794345.0,
"step": 599
},
{
"entropy": 0.5325346812605858,
"epoch": 2.2388059701492535,
"grad_norm": 0.20552673935890198,
"learning_rate": 0.0002,
"loss": 0.5378585457801819,
"mean_token_accuracy": 0.7835244834423065,
"num_tokens": 9810716.0,
"step": 600
},
{
"entropy": 0.5362858921289444,
"epoch": 2.2425373134328357,
"grad_norm": 0.1832580715417862,
"learning_rate": 0.0002,
"loss": 0.5247851014137268,
"mean_token_accuracy": 0.7862047404050827,
"num_tokens": 9826899.0,
"step": 601
},
{
"entropy": 0.515026330947876,
"epoch": 2.246268656716418,
"grad_norm": 0.1738833785057068,
"learning_rate": 0.0002,
"loss": 0.5104220509529114,
"mean_token_accuracy": 0.7956585586071014,
"num_tokens": 9843201.0,
"step": 602
},
{
"entropy": 0.5326243042945862,
"epoch": 2.25,
"grad_norm": 0.19789133965969086,
"learning_rate": 0.0002,
"loss": 0.5377206206321716,
"mean_token_accuracy": 0.7844580560922623,
"num_tokens": 9859428.0,
"step": 603
},
{
"entropy": 0.5045425221323967,
"epoch": 2.253731343283582,
"grad_norm": 0.22017110884189606,
"learning_rate": 0.0002,
"loss": 0.5142727494239807,
"mean_token_accuracy": 0.7916774153709412,
"num_tokens": 9875509.0,
"step": 604
},
{
"entropy": 0.5083225071430206,
"epoch": 2.2574626865671643,
"grad_norm": 0.20720691978931427,
"learning_rate": 0.0002,
"loss": 0.5168294906616211,
"mean_token_accuracy": 0.7916733622550964,
"num_tokens": 9891513.0,
"step": 605
},
{
"entropy": 0.5038861483335495,
"epoch": 2.2611940298507465,
"grad_norm": 0.22461913526058197,
"learning_rate": 0.0002,
"loss": 0.5155696868896484,
"mean_token_accuracy": 0.7936981916427612,
"num_tokens": 9907970.0,
"step": 606
},
{
"entropy": 0.544201672077179,
"epoch": 2.264925373134328,
"grad_norm": 0.22078122198581696,
"learning_rate": 0.0002,
"loss": 0.5377649664878845,
"mean_token_accuracy": 0.7846001982688904,
"num_tokens": 9924358.0,
"step": 607
},
{
"entropy": 0.5319496989250183,
"epoch": 2.2686567164179103,
"grad_norm": 0.15865834057331085,
"learning_rate": 0.0002,
"loss": 0.5269988775253296,
"mean_token_accuracy": 0.7889304012060165,
"num_tokens": 9940613.0,
"step": 608
},
{
"entropy": 0.5121538639068604,
"epoch": 2.2723880597014925,
"grad_norm": 0.19707661867141724,
"learning_rate": 0.0002,
"loss": 0.5115834474563599,
"mean_token_accuracy": 0.7899812310934067,
"num_tokens": 9956900.0,
"step": 609
},
{
"entropy": 0.5339771807193756,
"epoch": 2.2761194029850746,
"grad_norm": 0.15257956087589264,
"learning_rate": 0.0002,
"loss": 0.5300955772399902,
"mean_token_accuracy": 0.785103976726532,
"num_tokens": 9973499.0,
"step": 610
},
{
"entropy": 0.5281384140253067,
"epoch": 2.279850746268657,
"grad_norm": 0.16553470492362976,
"learning_rate": 0.0002,
"loss": 0.5257382392883301,
"mean_token_accuracy": 0.7875041514635086,
"num_tokens": 9989801.0,
"step": 611
},
{
"entropy": 0.5170317441225052,
"epoch": 2.283582089552239,
"grad_norm": 0.1715046465396881,
"learning_rate": 0.0002,
"loss": 0.5181665420532227,
"mean_token_accuracy": 0.7884780019521713,
"num_tokens": 10006078.0,
"step": 612
},
{
"entropy": 0.5153259709477425,
"epoch": 2.2873134328358207,
"grad_norm": 0.1548839956521988,
"learning_rate": 0.0002,
"loss": 0.514171302318573,
"mean_token_accuracy": 0.7930748611688614,
"num_tokens": 10022246.0,
"step": 613
},
{
"entropy": 0.5224331915378571,
"epoch": 2.291044776119403,
"grad_norm": 0.1681355983018875,
"learning_rate": 0.0002,
"loss": 0.5221542119979858,
"mean_token_accuracy": 0.7877352833747864,
"num_tokens": 10038788.0,
"step": 614
},
{
"entropy": 0.5205291956663132,
"epoch": 2.294776119402985,
"grad_norm": 0.16179999709129333,
"learning_rate": 0.0002,
"loss": 0.5216364860534668,
"mean_token_accuracy": 0.7894330769777298,
"num_tokens": 10055226.0,
"step": 615
},
{
"entropy": 0.5362520515918732,
"epoch": 2.298507462686567,
"grad_norm": 0.19491799175739288,
"learning_rate": 0.0002,
"loss": 0.5382164716720581,
"mean_token_accuracy": 0.7841734141111374,
"num_tokens": 10071636.0,
"step": 616
},
{
"entropy": 0.5122754499316216,
"epoch": 2.3022388059701493,
"grad_norm": 0.15888278186321259,
"learning_rate": 0.0002,
"loss": 0.5128467082977295,
"mean_token_accuracy": 0.7957093715667725,
"num_tokens": 10087915.0,
"step": 617
},
{
"entropy": 0.530030369758606,
"epoch": 2.3059701492537314,
"grad_norm": 0.20173799991607666,
"learning_rate": 0.0002,
"loss": 0.5327577590942383,
"mean_token_accuracy": 0.7822887450456619,
"num_tokens": 10104328.0,
"step": 618
},
{
"entropy": 0.511964850127697,
"epoch": 2.3097014925373136,
"grad_norm": 0.22716699540615082,
"learning_rate": 0.0002,
"loss": 0.5194392800331116,
"mean_token_accuracy": 0.7923955619335175,
"num_tokens": 10120902.0,
"step": 619
},
{
"entropy": 0.5184068530797958,
"epoch": 2.3134328358208958,
"grad_norm": 0.1653965413570404,
"learning_rate": 0.0002,
"loss": 0.5168477892875671,
"mean_token_accuracy": 0.7927787899971008,
"num_tokens": 10137330.0,
"step": 620
},
{
"entropy": 0.5173092186450958,
"epoch": 2.3171641791044775,
"grad_norm": 0.1853804737329483,
"learning_rate": 0.0002,
"loss": 0.5189480781555176,
"mean_token_accuracy": 0.7897288352251053,
"num_tokens": 10153802.0,
"step": 621
},
{
"entropy": 0.5215531587600708,
"epoch": 2.3208955223880596,
"grad_norm": 0.1907532960176468,
"learning_rate": 0.0002,
"loss": 0.5235369801521301,
"mean_token_accuracy": 0.7906839698553085,
"num_tokens": 10170052.0,
"step": 622
},
{
"entropy": 0.5299772173166275,
"epoch": 2.324626865671642,
"grad_norm": 0.17518973350524902,
"learning_rate": 0.0002,
"loss": 0.5251893401145935,
"mean_token_accuracy": 0.7905509769916534,
"num_tokens": 10186299.0,
"step": 623
},
{
"entropy": 0.5111118629574776,
"epoch": 2.328358208955224,
"grad_norm": 0.162562295794487,
"learning_rate": 0.0002,
"loss": 0.5044469237327576,
"mean_token_accuracy": 0.793881356716156,
"num_tokens": 10202479.0,
"step": 624
},
{
"entropy": 0.5176884084939957,
"epoch": 2.332089552238806,
"grad_norm": 0.15817266702651978,
"learning_rate": 0.0002,
"loss": 0.5189487934112549,
"mean_token_accuracy": 0.7899019569158554,
"num_tokens": 10218755.0,
"step": 625
},
{
"entropy": 0.5375020056962967,
"epoch": 2.3358208955223883,
"grad_norm": 0.16503086686134338,
"learning_rate": 0.0002,
"loss": 0.5378777980804443,
"mean_token_accuracy": 0.7797044813632965,
"num_tokens": 10235308.0,
"step": 626
},
{
"entropy": 0.5069606155157089,
"epoch": 2.33955223880597,
"grad_norm": 0.19356752932071686,
"learning_rate": 0.0002,
"loss": 0.5149304866790771,
"mean_token_accuracy": 0.790899932384491,
"num_tokens": 10251410.0,
"step": 627
},
{
"entropy": 0.5025136545300484,
"epoch": 2.343283582089552,
"grad_norm": 0.1775875836610794,
"learning_rate": 0.0002,
"loss": 0.5070807933807373,
"mean_token_accuracy": 0.7955823987722397,
"num_tokens": 10267499.0,
"step": 628
},
{
"entropy": 0.5052608847618103,
"epoch": 2.3470149253731343,
"grad_norm": 0.21965590119361877,
"learning_rate": 0.0002,
"loss": 0.5101135969161987,
"mean_token_accuracy": 0.7949910014867783,
"num_tokens": 10283791.0,
"step": 629
},
{
"entropy": 0.5179193317890167,
"epoch": 2.3507462686567164,
"grad_norm": 0.19963982701301575,
"learning_rate": 0.0002,
"loss": 0.5215207934379578,
"mean_token_accuracy": 0.7893756926059723,
"num_tokens": 10299845.0,
"step": 630
},
{
"entropy": 0.5158931389451027,
"epoch": 2.3544776119402986,
"grad_norm": 0.160457581281662,
"learning_rate": 0.0002,
"loss": 0.5119190216064453,
"mean_token_accuracy": 0.7945539355278015,
"num_tokens": 10316272.0,
"step": 631
},
{
"entropy": 0.5080019608139992,
"epoch": 2.3582089552238807,
"grad_norm": 0.1729355752468109,
"learning_rate": 0.0002,
"loss": 0.5050552487373352,
"mean_token_accuracy": 0.7989319264888763,
"num_tokens": 10332919.0,
"step": 632
},
{
"entropy": 0.5174911320209503,
"epoch": 2.361940298507463,
"grad_norm": 0.1741209179162979,
"learning_rate": 0.0002,
"loss": 0.5234130024909973,
"mean_token_accuracy": 0.7888159304857254,
"num_tokens": 10349259.0,
"step": 633
},
{
"entropy": 0.5265702903270721,
"epoch": 2.3656716417910446,
"grad_norm": 0.19182217121124268,
"learning_rate": 0.0002,
"loss": 0.5293515920639038,
"mean_token_accuracy": 0.7829533070325851,
"num_tokens": 10365491.0,
"step": 634
},
{
"entropy": 0.5425137877464294,
"epoch": 2.3694029850746268,
"grad_norm": 0.16463470458984375,
"learning_rate": 0.0002,
"loss": 0.542192280292511,
"mean_token_accuracy": 0.7816719859838486,
"num_tokens": 10381847.0,
"step": 635
},
{
"entropy": 0.5144196897745132,
"epoch": 2.373134328358209,
"grad_norm": 0.16132977604866028,
"learning_rate": 0.0002,
"loss": 0.5131939053535461,
"mean_token_accuracy": 0.7919805645942688,
"num_tokens": 10398171.0,
"step": 636
},
{
"entropy": 0.5415032058954239,
"epoch": 2.376865671641791,
"grad_norm": 0.16324372589588165,
"learning_rate": 0.0002,
"loss": 0.5371772050857544,
"mean_token_accuracy": 0.7831342816352844,
"num_tokens": 10414686.0,
"step": 637
},
{
"entropy": 0.5282690078020096,
"epoch": 2.3805970149253732,
"grad_norm": 0.17967335879802704,
"learning_rate": 0.0002,
"loss": 0.5203690528869629,
"mean_token_accuracy": 0.7885807305574417,
"num_tokens": 10431126.0,
"step": 638
},
{
"entropy": 0.5216360539197922,
"epoch": 2.3843283582089554,
"grad_norm": 0.16235722601413727,
"learning_rate": 0.0002,
"loss": 0.5236966013908386,
"mean_token_accuracy": 0.7884224951267242,
"num_tokens": 10447324.0,
"step": 639
},
{
"entropy": 0.5296328365802765,
"epoch": 2.388059701492537,
"grad_norm": 0.1916787028312683,
"learning_rate": 0.0002,
"loss": 0.5376251339912415,
"mean_token_accuracy": 0.7802027314901352,
"num_tokens": 10463603.0,
"step": 640
},
{
"entropy": 0.5012985095381737,
"epoch": 2.3917910447761193,
"grad_norm": 0.19376890361309052,
"learning_rate": 0.0002,
"loss": 0.5101221203804016,
"mean_token_accuracy": 0.7951995581388474,
"num_tokens": 10479993.0,
"step": 641
},
{
"entropy": 0.5038901194930077,
"epoch": 2.3955223880597014,
"grad_norm": 0.17371249198913574,
"learning_rate": 0.0002,
"loss": 0.5146278738975525,
"mean_token_accuracy": 0.7905002534389496,
"num_tokens": 10496023.0,
"step": 642
},
{
"entropy": 0.5509473532438278,
"epoch": 2.3992537313432836,
"grad_norm": 0.15395016968250275,
"learning_rate": 0.0002,
"loss": 0.546664834022522,
"mean_token_accuracy": 0.7777733653783798,
"num_tokens": 10512527.0,
"step": 643
},
{
"entropy": 0.5174002125859261,
"epoch": 2.4029850746268657,
"grad_norm": 0.1537095606327057,
"learning_rate": 0.0002,
"loss": 0.5125638842582703,
"mean_token_accuracy": 0.7953683733940125,
"num_tokens": 10529050.0,
"step": 644
},
{
"entropy": 0.5259301066398621,
"epoch": 2.406716417910448,
"grad_norm": 0.19275200366973877,
"learning_rate": 0.0002,
"loss": 0.534030556678772,
"mean_token_accuracy": 0.7856698781251907,
"num_tokens": 10545403.0,
"step": 645
},
{
"entropy": 0.5141283497214317,
"epoch": 2.41044776119403,
"grad_norm": 0.2044205218553543,
"learning_rate": 0.0002,
"loss": 0.5202509760856628,
"mean_token_accuracy": 0.7915003001689911,
"num_tokens": 10561404.0,
"step": 646
},
{
"entropy": 0.5140255615115166,
"epoch": 2.4141791044776117,
"grad_norm": 0.17939844727516174,
"learning_rate": 0.0002,
"loss": 0.5115104913711548,
"mean_token_accuracy": 0.7907571196556091,
"num_tokens": 10577588.0,
"step": 647
},
{
"entropy": 0.5283705443143845,
"epoch": 2.417910447761194,
"grad_norm": 0.19888189435005188,
"learning_rate": 0.0002,
"loss": 0.5198178291320801,
"mean_token_accuracy": 0.7891141772270203,
"num_tokens": 10593859.0,
"step": 648
},
{
"entropy": 0.5462386906147003,
"epoch": 2.421641791044776,
"grad_norm": 0.1922907531261444,
"learning_rate": 0.0002,
"loss": 0.5396484732627869,
"mean_token_accuracy": 0.7813579887151718,
"num_tokens": 10610303.0,
"step": 649
},
{
"entropy": 0.5058758109807968,
"epoch": 2.425373134328358,
"grad_norm": 0.21254123747348785,
"learning_rate": 0.0002,
"loss": 0.5134891271591187,
"mean_token_accuracy": 0.7951326668262482,
"num_tokens": 10626628.0,
"step": 650
},
{
"entropy": 0.5051485821604729,
"epoch": 2.4291044776119404,
"grad_norm": 0.17681139707565308,
"learning_rate": 0.0002,
"loss": 0.5095136761665344,
"mean_token_accuracy": 0.7927682101726532,
"num_tokens": 10642872.0,
"step": 651
},
{
"entropy": 0.5098261535167694,
"epoch": 2.4328358208955225,
"grad_norm": 0.1644936203956604,
"learning_rate": 0.0002,
"loss": 0.5163934230804443,
"mean_token_accuracy": 0.7900458127260208,
"num_tokens": 10659143.0,
"step": 652
},
{
"entropy": 0.5026194378733635,
"epoch": 2.4365671641791042,
"grad_norm": 0.1890725940465927,
"learning_rate": 0.0002,
"loss": 0.511451244354248,
"mean_token_accuracy": 0.7927152365446091,
"num_tokens": 10675503.0,
"step": 653
},
{
"entropy": 0.5148562490940094,
"epoch": 2.4402985074626864,
"grad_norm": 0.1650211215019226,
"learning_rate": 0.0002,
"loss": 0.5156391263008118,
"mean_token_accuracy": 0.7906764894723892,
"num_tokens": 10691795.0,
"step": 654
},
{
"entropy": 0.5057827532291412,
"epoch": 2.4440298507462686,
"grad_norm": 0.1589452177286148,
"learning_rate": 0.0002,
"loss": 0.5033491849899292,
"mean_token_accuracy": 0.7994053959846497,
"num_tokens": 10707762.0,
"step": 655
},
{
"entropy": 0.5219250470399857,
"epoch": 2.4477611940298507,
"grad_norm": 0.18478544056415558,
"learning_rate": 0.0002,
"loss": 0.5219628810882568,
"mean_token_accuracy": 0.7873866856098175,
"num_tokens": 10724063.0,
"step": 656
},
{
"entropy": 0.5177232921123505,
"epoch": 2.451492537313433,
"grad_norm": 0.17303429543972015,
"learning_rate": 0.0002,
"loss": 0.5200316309928894,
"mean_token_accuracy": 0.7885988503694534,
"num_tokens": 10740399.0,
"step": 657
},
{
"entropy": 0.5319043695926666,
"epoch": 2.455223880597015,
"grad_norm": 0.18429186940193176,
"learning_rate": 0.0002,
"loss": 0.5326516032218933,
"mean_token_accuracy": 0.7862447798252106,
"num_tokens": 10756986.0,
"step": 658
},
{
"entropy": 0.5453691333532333,
"epoch": 2.458955223880597,
"grad_norm": 0.16711914539337158,
"learning_rate": 0.0002,
"loss": 0.5386096239089966,
"mean_token_accuracy": 0.7812793850898743,
"num_tokens": 10773458.0,
"step": 659
},
{
"entropy": 0.5214618891477585,
"epoch": 2.4626865671641793,
"grad_norm": 0.1909995675086975,
"learning_rate": 0.0002,
"loss": 0.518884003162384,
"mean_token_accuracy": 0.7878068089485168,
"num_tokens": 10789818.0,
"step": 660
},
{
"entropy": 0.523200586438179,
"epoch": 2.466417910447761,
"grad_norm": 0.17626361548900604,
"learning_rate": 0.0002,
"loss": 0.5212401151657104,
"mean_token_accuracy": 0.7900760471820831,
"num_tokens": 10806143.0,
"step": 661
},
{
"entropy": 0.5310025811195374,
"epoch": 2.470149253731343,
"grad_norm": 0.24172359704971313,
"learning_rate": 0.0002,
"loss": 0.5338881611824036,
"mean_token_accuracy": 0.7858817130327225,
"num_tokens": 10822437.0,
"step": 662
},
{
"entropy": 0.5151319652795792,
"epoch": 2.4738805970149254,
"grad_norm": 0.19658994674682617,
"learning_rate": 0.0002,
"loss": 0.5139521956443787,
"mean_token_accuracy": 0.7917647659778595,
"num_tokens": 10838442.0,
"step": 663
},
{
"entropy": 0.5117574036121368,
"epoch": 2.4776119402985075,
"grad_norm": 0.2189301699399948,
"learning_rate": 0.0002,
"loss": 0.513599693775177,
"mean_token_accuracy": 0.7897299826145172,
"num_tokens": 10854797.0,
"step": 664
},
{
"entropy": 0.5397205054759979,
"epoch": 2.4813432835820897,
"grad_norm": 0.2076101452112198,
"learning_rate": 0.0002,
"loss": 0.5459029078483582,
"mean_token_accuracy": 0.7777052521705627,
"num_tokens": 10871117.0,
"step": 665
},
{
"entropy": 0.525243952870369,
"epoch": 2.485074626865672,
"grad_norm": 0.1969526708126068,
"learning_rate": 0.0002,
"loss": 0.5259374380111694,
"mean_token_accuracy": 0.7870301008224487,
"num_tokens": 10887285.0,
"step": 666
},
{
"entropy": 0.521914929151535,
"epoch": 2.4888059701492535,
"grad_norm": 0.1793866604566574,
"learning_rate": 0.0002,
"loss": 0.523249626159668,
"mean_token_accuracy": 0.7908923327922821,
"num_tokens": 10903583.0,
"step": 667
},
{
"entropy": 0.5157094374299049,
"epoch": 2.4925373134328357,
"grad_norm": 0.1676340252161026,
"learning_rate": 0.0002,
"loss": 0.5196658372879028,
"mean_token_accuracy": 0.7936161011457443,
"num_tokens": 10919876.0,
"step": 668
},
{
"entropy": 0.49876970052719116,
"epoch": 2.496268656716418,
"grad_norm": 0.18448136746883392,
"learning_rate": 0.0002,
"loss": 0.49738743901252747,
"mean_token_accuracy": 0.8003499060869217,
"num_tokens": 10936091.0,
"step": 669
},
{
"entropy": 0.5243137031793594,
"epoch": 2.5,
"grad_norm": 0.1985243260860443,
"learning_rate": 0.0002,
"loss": 0.526336133480072,
"mean_token_accuracy": 0.7861499488353729,
"num_tokens": 10952522.0,
"step": 670
},
{
"entropy": 0.5277926176786423,
"epoch": 2.503731343283582,
"grad_norm": 0.15664395689964294,
"learning_rate": 0.0002,
"loss": 0.5211771726608276,
"mean_token_accuracy": 0.7905664294958115,
"num_tokens": 10968886.0,
"step": 671
},
{
"entropy": 0.5109870582818985,
"epoch": 2.5074626865671643,
"grad_norm": 0.17840486764907837,
"learning_rate": 0.0002,
"loss": 0.5104790925979614,
"mean_token_accuracy": 0.7953955680131912,
"num_tokens": 10985258.0,
"step": 672
},
{
"entropy": 0.4981943815946579,
"epoch": 2.5111940298507465,
"grad_norm": 0.15788039565086365,
"learning_rate": 0.0002,
"loss": 0.5019396543502808,
"mean_token_accuracy": 0.7957722395658493,
"num_tokens": 11001537.0,
"step": 673
},
{
"entropy": 0.4992476552724838,
"epoch": 2.5149253731343286,
"grad_norm": 0.20122262835502625,
"learning_rate": 0.0002,
"loss": 0.5123214721679688,
"mean_token_accuracy": 0.7936280071735382,
"num_tokens": 11017858.0,
"step": 674
},
{
"entropy": 0.5326351076364517,
"epoch": 2.5186567164179103,
"grad_norm": 0.15370923280715942,
"learning_rate": 0.0002,
"loss": 0.5299698114395142,
"mean_token_accuracy": 0.7864175289869308,
"num_tokens": 11034251.0,
"step": 675
},
{
"entropy": 0.5276974588632584,
"epoch": 2.5223880597014925,
"grad_norm": 0.16408182680606842,
"learning_rate": 0.0002,
"loss": 0.5256198644638062,
"mean_token_accuracy": 0.7864832729101181,
"num_tokens": 11050538.0,
"step": 676
},
{
"entropy": 0.5174605995416641,
"epoch": 2.5261194029850746,
"grad_norm": 0.1726282238960266,
"learning_rate": 0.0002,
"loss": 0.5166889429092407,
"mean_token_accuracy": 0.7903372198343277,
"num_tokens": 11066909.0,
"step": 677
},
{
"entropy": 0.5096773952245712,
"epoch": 2.529850746268657,
"grad_norm": 0.18736550211906433,
"learning_rate": 0.0002,
"loss": 0.5147178173065186,
"mean_token_accuracy": 0.7915707528591156,
"num_tokens": 11083296.0,
"step": 678
},
{
"entropy": 0.5143576934933662,
"epoch": 2.533582089552239,
"grad_norm": 0.18496522307395935,
"learning_rate": 0.0002,
"loss": 0.5202215909957886,
"mean_token_accuracy": 0.7876331657171249,
"num_tokens": 11099735.0,
"step": 679
},
{
"entropy": 0.5062269270420074,
"epoch": 2.5373134328358207,
"grad_norm": 0.18014365434646606,
"learning_rate": 0.0002,
"loss": 0.5091406106948853,
"mean_token_accuracy": 0.7964621633291245,
"num_tokens": 11116208.0,
"step": 680
},
{
"entropy": 0.5146580412983894,
"epoch": 2.541044776119403,
"grad_norm": 0.15533168613910675,
"learning_rate": 0.0002,
"loss": 0.5158394575119019,
"mean_token_accuracy": 0.7913824915885925,
"num_tokens": 11132744.0,
"step": 681
},
{
"entropy": 0.5299884453415871,
"epoch": 2.544776119402985,
"grad_norm": 0.19397816061973572,
"learning_rate": 0.0002,
"loss": 0.5282403826713562,
"mean_token_accuracy": 0.7865999937057495,
"num_tokens": 11149385.0,
"step": 682
},
{
"entropy": 0.5197403728961945,
"epoch": 2.548507462686567,
"grad_norm": 0.1893748939037323,
"learning_rate": 0.0002,
"loss": 0.5172282457351685,
"mean_token_accuracy": 0.7889421880245209,
"num_tokens": 11165536.0,
"step": 683
},
{
"entropy": 0.5483877509832382,
"epoch": 2.5522388059701493,
"grad_norm": 0.1692439764738083,
"learning_rate": 0.0002,
"loss": 0.5408689975738525,
"mean_token_accuracy": 0.7819931209087372,
"num_tokens": 11182199.0,
"step": 684
},
{
"entropy": 0.5187435150146484,
"epoch": 2.5559701492537314,
"grad_norm": 0.16838251054286957,
"learning_rate": 0.0002,
"loss": 0.5220701098442078,
"mean_token_accuracy": 0.7913226187229156,
"num_tokens": 11198351.0,
"step": 685
},
{
"entropy": 0.5129819363355637,
"epoch": 2.5597014925373136,
"grad_norm": 0.18473690748214722,
"learning_rate": 0.0002,
"loss": 0.5199850797653198,
"mean_token_accuracy": 0.7907718271017075,
"num_tokens": 11214899.0,
"step": 686
},
{
"entropy": 0.5174092352390289,
"epoch": 2.5634328358208958,
"grad_norm": 0.18355096876621246,
"learning_rate": 0.0002,
"loss": 0.5231988430023193,
"mean_token_accuracy": 0.7854581624269485,
"num_tokens": 11231316.0,
"step": 687
},
{
"entropy": 0.5146564170718193,
"epoch": 2.5671641791044775,
"grad_norm": 0.20094642043113708,
"learning_rate": 0.0002,
"loss": 0.5167846083641052,
"mean_token_accuracy": 0.7892555296421051,
"num_tokens": 11247525.0,
"step": 688
},
{
"entropy": 0.5073134675621986,
"epoch": 2.5708955223880596,
"grad_norm": 0.17776694893836975,
"learning_rate": 0.0002,
"loss": 0.5059224963188171,
"mean_token_accuracy": 0.7938186377286911,
"num_tokens": 11263630.0,
"step": 689
},
{
"entropy": 0.51164161413908,
"epoch": 2.574626865671642,
"grad_norm": 0.23441171646118164,
"learning_rate": 0.0002,
"loss": 0.5132524371147156,
"mean_token_accuracy": 0.7924985736608505,
"num_tokens": 11279891.0,
"step": 690
},
{
"entropy": 0.5324152410030365,
"epoch": 2.578358208955224,
"grad_norm": 0.1964472234249115,
"learning_rate": 0.0002,
"loss": 0.5321142673492432,
"mean_token_accuracy": 0.7884731590747833,
"num_tokens": 11296194.0,
"step": 691
},
{
"entropy": 0.5136373415589333,
"epoch": 2.582089552238806,
"grad_norm": 0.23449179530143738,
"learning_rate": 0.0002,
"loss": 0.5196998715400696,
"mean_token_accuracy": 0.7908406853675842,
"num_tokens": 11312615.0,
"step": 692
},
{
"entropy": 0.5276090502738953,
"epoch": 2.585820895522388,
"grad_norm": 0.16686299443244934,
"learning_rate": 0.0002,
"loss": 0.5247229337692261,
"mean_token_accuracy": 0.7879517525434494,
"num_tokens": 11329158.0,
"step": 693
},
{
"entropy": 0.5419809222221375,
"epoch": 2.58955223880597,
"grad_norm": 0.19849538803100586,
"learning_rate": 0.0002,
"loss": 0.5328899621963501,
"mean_token_accuracy": 0.7848672121763229,
"num_tokens": 11345724.0,
"step": 694
},
{
"entropy": 0.5273312255740166,
"epoch": 2.593283582089552,
"grad_norm": 0.15091370046138763,
"learning_rate": 0.0002,
"loss": 0.5279825925827026,
"mean_token_accuracy": 0.7853807210922241,
"num_tokens": 11362189.0,
"step": 695
},
{
"entropy": 0.5198656767606735,
"epoch": 2.5970149253731343,
"grad_norm": 0.23191620409488678,
"learning_rate": 0.0002,
"loss": 0.5321477651596069,
"mean_token_accuracy": 0.7849823385477066,
"num_tokens": 11378807.0,
"step": 696
},
{
"entropy": 0.5051373466849327,
"epoch": 2.6007462686567164,
"grad_norm": 0.16530166566371918,
"learning_rate": 0.0002,
"loss": 0.5118955373764038,
"mean_token_accuracy": 0.7921792417764664,
"num_tokens": 11395066.0,
"step": 697
},
{
"entropy": 0.5375550240278244,
"epoch": 2.6044776119402986,
"grad_norm": 0.16651837527751923,
"learning_rate": 0.0002,
"loss": 0.5333649516105652,
"mean_token_accuracy": 0.7834018468856812,
"num_tokens": 11411502.0,
"step": 698
},
{
"entropy": 0.509097121655941,
"epoch": 2.6082089552238807,
"grad_norm": 0.19326747953891754,
"learning_rate": 0.0002,
"loss": 0.5079880952835083,
"mean_token_accuracy": 0.7902690321207047,
"num_tokens": 11427527.0,
"step": 699
},
{
"entropy": 0.5243344008922577,
"epoch": 2.611940298507463,
"grad_norm": 0.17708131670951843,
"learning_rate": 0.0002,
"loss": 0.527232825756073,
"mean_token_accuracy": 0.78766830265522,
"num_tokens": 11443934.0,
"step": 700
},
{
"entropy": 0.5099955424666405,
"epoch": 2.6156716417910446,
"grad_norm": 0.22393395006656647,
"learning_rate": 0.0002,
"loss": 0.5181647539138794,
"mean_token_accuracy": 0.7911688387393951,
"num_tokens": 11460041.0,
"step": 701
},
{
"entropy": 0.5081977397203445,
"epoch": 2.6194029850746268,
"grad_norm": 0.19041450321674347,
"learning_rate": 0.0002,
"loss": 0.5169417262077332,
"mean_token_accuracy": 0.7914475202560425,
"num_tokens": 11476118.0,
"step": 702
},
{
"entropy": 0.531707689166069,
"epoch": 2.623134328358209,
"grad_norm": 0.1838483214378357,
"learning_rate": 0.0002,
"loss": 0.5199188590049744,
"mean_token_accuracy": 0.7899897545576096,
"num_tokens": 11492660.0,
"step": 703
},
{
"entropy": 0.5364825427532196,
"epoch": 2.626865671641791,
"grad_norm": 0.1751444786787033,
"learning_rate": 0.0002,
"loss": 0.5356893539428711,
"mean_token_accuracy": 0.7835856378078461,
"num_tokens": 11509081.0,
"step": 704
},
{
"entropy": 0.5187056511640549,
"epoch": 2.6305970149253732,
"grad_norm": 0.17921118438243866,
"learning_rate": 0.0002,
"loss": 0.5232405066490173,
"mean_token_accuracy": 0.7884531170129776,
"num_tokens": 11525499.0,
"step": 705
},
{
"entropy": 0.5242651104927063,
"epoch": 2.6343283582089554,
"grad_norm": 0.18693575263023376,
"learning_rate": 0.0002,
"loss": 0.5285453796386719,
"mean_token_accuracy": 0.786514088511467,
"num_tokens": 11541734.0,
"step": 706
},
{
"entropy": 0.516477108001709,
"epoch": 2.638059701492537,
"grad_norm": 0.1994662582874298,
"learning_rate": 0.0002,
"loss": 0.5184328556060791,
"mean_token_accuracy": 0.79111048579216,
"num_tokens": 11558204.0,
"step": 707
},
{
"entropy": 0.5288708806037903,
"epoch": 2.6417910447761193,
"grad_norm": 0.16373923420906067,
"learning_rate": 0.0002,
"loss": 0.5213331580162048,
"mean_token_accuracy": 0.7881525307893753,
"num_tokens": 11574434.0,
"step": 708
},
{
"entropy": 0.5072719901800156,
"epoch": 2.6455223880597014,
"grad_norm": 0.1917801946401596,
"learning_rate": 0.0002,
"loss": 0.509112536907196,
"mean_token_accuracy": 0.7960505336523056,
"num_tokens": 11590646.0,
"step": 709
},
{
"entropy": 0.5356978923082352,
"epoch": 2.6492537313432836,
"grad_norm": 0.19294337928295135,
"learning_rate": 0.0002,
"loss": 0.5388337969779968,
"mean_token_accuracy": 0.7824567407369614,
"num_tokens": 11606979.0,
"step": 710
},
{
"entropy": 0.5163687542080879,
"epoch": 2.6529850746268657,
"grad_norm": 0.1852083057165146,
"learning_rate": 0.0002,
"loss": 0.5158357620239258,
"mean_token_accuracy": 0.7907344847917557,
"num_tokens": 11623404.0,
"step": 711
},
{
"entropy": 0.5283653736114502,
"epoch": 2.656716417910448,
"grad_norm": 0.17565470933914185,
"learning_rate": 0.0002,
"loss": 0.5322569608688354,
"mean_token_accuracy": 0.7860839515924454,
"num_tokens": 11639756.0,
"step": 712
},
{
"entropy": 0.5301189422607422,
"epoch": 2.66044776119403,
"grad_norm": 0.18470223248004913,
"learning_rate": 0.0002,
"loss": 0.5344855785369873,
"mean_token_accuracy": 0.7831524461507797,
"num_tokens": 11656115.0,
"step": 713
},
{
"entropy": 0.5131835639476776,
"epoch": 2.664179104477612,
"grad_norm": 0.14412830770015717,
"learning_rate": 0.0002,
"loss": 0.5086023211479187,
"mean_token_accuracy": 0.7938779592514038,
"num_tokens": 11672197.0,
"step": 714
},
{
"entropy": 0.5248347520828247,
"epoch": 2.667910447761194,
"grad_norm": 0.1623944342136383,
"learning_rate": 0.0002,
"loss": 0.5236642360687256,
"mean_token_accuracy": 0.78847536444664,
"num_tokens": 11688778.0,
"step": 715
},
{
"entropy": 0.5317736268043518,
"epoch": 2.671641791044776,
"grad_norm": 0.17043523490428925,
"learning_rate": 0.0002,
"loss": 0.5294151306152344,
"mean_token_accuracy": 0.7867350727319717,
"num_tokens": 11704972.0,
"step": 716
},
{
"entropy": 0.5292799472808838,
"epoch": 2.675373134328358,
"grad_norm": 0.21420958638191223,
"learning_rate": 0.0002,
"loss": 0.5348944664001465,
"mean_token_accuracy": 0.784217044711113,
"num_tokens": 11721357.0,
"step": 717
},
{
"entropy": 0.513471245765686,
"epoch": 2.6791044776119404,
"grad_norm": 0.18216556310653687,
"learning_rate": 0.0002,
"loss": 0.5178148746490479,
"mean_token_accuracy": 0.7881872206926346,
"num_tokens": 11737640.0,
"step": 718
},
{
"entropy": 0.5091867446899414,
"epoch": 2.6828358208955225,
"grad_norm": 0.18353325128555298,
"learning_rate": 0.0002,
"loss": 0.509505033493042,
"mean_token_accuracy": 0.7933301627635956,
"num_tokens": 11753743.0,
"step": 719
},
{
"entropy": 0.4985937625169754,
"epoch": 2.6865671641791042,
"grad_norm": 0.17763254046440125,
"learning_rate": 0.0002,
"loss": 0.5041629076004028,
"mean_token_accuracy": 0.7961723208427429,
"num_tokens": 11769941.0,
"step": 720
},
{
"entropy": 0.5326617211103439,
"epoch": 2.6902985074626864,
"grad_norm": 0.17128810286521912,
"learning_rate": 0.0002,
"loss": 0.5273231863975525,
"mean_token_accuracy": 0.7882279455661774,
"num_tokens": 11786468.0,
"step": 721
},
{
"entropy": 0.5309469103813171,
"epoch": 2.6940298507462686,
"grad_norm": 0.16436029970645905,
"learning_rate": 0.0002,
"loss": 0.5328190326690674,
"mean_token_accuracy": 0.7852970659732819,
"num_tokens": 11802907.0,
"step": 722
},
{
"entropy": 0.5232216566801071,
"epoch": 2.6977611940298507,
"grad_norm": 0.16719315946102142,
"learning_rate": 0.0002,
"loss": 0.5230921506881714,
"mean_token_accuracy": 0.7876270413398743,
"num_tokens": 11819317.0,
"step": 723
},
{
"entropy": 0.5203052535653114,
"epoch": 2.701492537313433,
"grad_norm": 0.19284284114837646,
"learning_rate": 0.0002,
"loss": 0.5245278477668762,
"mean_token_accuracy": 0.7879077643156052,
"num_tokens": 11835688.0,
"step": 724
},
{
"entropy": 0.5309562981128693,
"epoch": 2.705223880597015,
"grad_norm": 0.237013041973114,
"learning_rate": 0.0002,
"loss": 0.5299087166786194,
"mean_token_accuracy": 0.7888383269309998,
"num_tokens": 11851919.0,
"step": 725
},
{
"entropy": 0.5239868611097336,
"epoch": 2.708955223880597,
"grad_norm": 0.1684781163930893,
"learning_rate": 0.0002,
"loss": 0.5212418437004089,
"mean_token_accuracy": 0.7896943688392639,
"num_tokens": 11868352.0,
"step": 726
},
{
"entropy": 0.5078758075833321,
"epoch": 2.7126865671641793,
"grad_norm": 0.18132759630680084,
"learning_rate": 0.0002,
"loss": 0.5123098492622375,
"mean_token_accuracy": 0.7928104400634766,
"num_tokens": 11884504.0,
"step": 727
},
{
"entropy": 0.5257874876260757,
"epoch": 2.716417910447761,
"grad_norm": 0.18958209455013275,
"learning_rate": 0.0002,
"loss": 0.5350735783576965,
"mean_token_accuracy": 0.7816809117794037,
"num_tokens": 11900762.0,
"step": 728
},
{
"entropy": 0.5237897783517838,
"epoch": 2.720149253731343,
"grad_norm": 0.17628394067287445,
"learning_rate": 0.0002,
"loss": 0.5271024107933044,
"mean_token_accuracy": 0.7875955998897552,
"num_tokens": 11917096.0,
"step": 729
},
{
"entropy": 0.5278095304965973,
"epoch": 2.7238805970149254,
"grad_norm": 0.1737760603427887,
"learning_rate": 0.0002,
"loss": 0.5236294865608215,
"mean_token_accuracy": 0.7871440947055817,
"num_tokens": 11933442.0,
"step": 730
},
{
"entropy": 0.5360710769891739,
"epoch": 2.7276119402985075,
"grad_norm": 0.17106162011623383,
"learning_rate": 0.0002,
"loss": 0.5306381583213806,
"mean_token_accuracy": 0.7830738425254822,
"num_tokens": 11949977.0,
"step": 731
},
{
"entropy": 0.5101736485958099,
"epoch": 2.7313432835820897,
"grad_norm": 0.17468304932117462,
"learning_rate": 0.0002,
"loss": 0.5146869421005249,
"mean_token_accuracy": 0.7935636639595032,
"num_tokens": 11966192.0,
"step": 732
},
{
"entropy": 0.5177389085292816,
"epoch": 2.7350746268656714,
"grad_norm": 0.18631240725517273,
"learning_rate": 0.0002,
"loss": 0.5224716663360596,
"mean_token_accuracy": 0.78856061398983,
"num_tokens": 11982767.0,
"step": 733
},
{
"entropy": 0.5130163431167603,
"epoch": 2.7388059701492535,
"grad_norm": 0.18318809568881989,
"learning_rate": 0.0002,
"loss": 0.5186882019042969,
"mean_token_accuracy": 0.7916167229413986,
"num_tokens": 11998980.0,
"step": 734
},
{
"entropy": 0.5177224427461624,
"epoch": 2.7425373134328357,
"grad_norm": 0.15900187194347382,
"learning_rate": 0.0002,
"loss": 0.5131608843803406,
"mean_token_accuracy": 0.7938690781593323,
"num_tokens": 12015535.0,
"step": 735
},
{
"entropy": 0.526519387960434,
"epoch": 2.746268656716418,
"grad_norm": 0.174263134598732,
"learning_rate": 0.0002,
"loss": 0.5261813402175903,
"mean_token_accuracy": 0.7892861515283585,
"num_tokens": 12031788.0,
"step": 736
},
{
"entropy": 0.5191493332386017,
"epoch": 2.75,
"grad_norm": 0.18909449875354767,
"learning_rate": 0.0002,
"loss": 0.5240525007247925,
"mean_token_accuracy": 0.7878368943929672,
"num_tokens": 12047980.0,
"step": 737
},
{
"entropy": 0.5201373547315598,
"epoch": 2.753731343283582,
"grad_norm": 0.18388764560222626,
"learning_rate": 0.0002,
"loss": 0.5292187929153442,
"mean_token_accuracy": 0.7905917465686798,
"num_tokens": 12064314.0,
"step": 738
},
{
"entropy": 0.5199328809976578,
"epoch": 2.7574626865671643,
"grad_norm": 0.19509336352348328,
"learning_rate": 0.0002,
"loss": 0.5188801884651184,
"mean_token_accuracy": 0.7895538657903671,
"num_tokens": 12080751.0,
"step": 739
},
{
"entropy": 0.5277723222970963,
"epoch": 2.7611940298507465,
"grad_norm": 0.16337504982948303,
"learning_rate": 0.0002,
"loss": 0.5206757187843323,
"mean_token_accuracy": 0.7895227074623108,
"num_tokens": 12097014.0,
"step": 740
},
{
"entropy": 0.5113491863012314,
"epoch": 2.7649253731343286,
"grad_norm": 0.17909789085388184,
"learning_rate": 0.0002,
"loss": 0.5122904777526855,
"mean_token_accuracy": 0.7908981740474701,
"num_tokens": 12113252.0,
"step": 741
},
{
"entropy": 0.5200309902429581,
"epoch": 2.7686567164179103,
"grad_norm": 0.17350299656391144,
"learning_rate": 0.0002,
"loss": 0.5194863677024841,
"mean_token_accuracy": 0.7900390475988388,
"num_tokens": 12129709.0,
"step": 742
},
{
"entropy": 0.5226462483406067,
"epoch": 2.7723880597014925,
"grad_norm": 0.21633893251419067,
"learning_rate": 0.0002,
"loss": 0.5241018533706665,
"mean_token_accuracy": 0.7901509553194046,
"num_tokens": 12146084.0,
"step": 743
},
{
"entropy": 0.5130392387509346,
"epoch": 2.7761194029850746,
"grad_norm": 0.19013682007789612,
"learning_rate": 0.0002,
"loss": 0.5189740061759949,
"mean_token_accuracy": 0.7909031510353088,
"num_tokens": 12162307.0,
"step": 744
},
{
"entropy": 0.5150926038622856,
"epoch": 2.779850746268657,
"grad_norm": 0.2071346938610077,
"learning_rate": 0.0002,
"loss": 0.5166252255439758,
"mean_token_accuracy": 0.7929645031690598,
"num_tokens": 12178654.0,
"step": 745
},
{
"entropy": 0.5175644010305405,
"epoch": 2.783582089552239,
"grad_norm": 0.1927538812160492,
"learning_rate": 0.0002,
"loss": 0.5234126448631287,
"mean_token_accuracy": 0.7895888537168503,
"num_tokens": 12194657.0,
"step": 746
},
{
"entropy": 0.5124155282974243,
"epoch": 2.7873134328358207,
"grad_norm": 0.20746196806430817,
"learning_rate": 0.0002,
"loss": 0.5111269950866699,
"mean_token_accuracy": 0.7925330102443695,
"num_tokens": 12211150.0,
"step": 747
},
{
"entropy": 0.5269140601158142,
"epoch": 2.791044776119403,
"grad_norm": 0.16280147433280945,
"learning_rate": 0.0002,
"loss": 0.5249094367027283,
"mean_token_accuracy": 0.7845876812934875,
"num_tokens": 12227551.0,
"step": 748
},
{
"entropy": 0.5178611427545547,
"epoch": 2.794776119402985,
"grad_norm": 0.23840144276618958,
"learning_rate": 0.0002,
"loss": 0.5257112383842468,
"mean_token_accuracy": 0.7894743531942368,
"num_tokens": 12243876.0,
"step": 749
},
{
"entropy": 0.5116888880729675,
"epoch": 2.798507462686567,
"grad_norm": 0.18411816656589508,
"learning_rate": 0.0002,
"loss": 0.5144840478897095,
"mean_token_accuracy": 0.7931785434484482,
"num_tokens": 12260217.0,
"step": 750
},
{
"entropy": 0.5289624482393265,
"epoch": 2.8022388059701493,
"grad_norm": 0.22270359098911285,
"learning_rate": 0.0002,
"loss": 0.5311276316642761,
"mean_token_accuracy": 0.7855756431818008,
"num_tokens": 12276532.0,
"step": 751
},
{
"entropy": 0.547882929444313,
"epoch": 2.8059701492537314,
"grad_norm": 0.15829682350158691,
"learning_rate": 0.0002,
"loss": 0.5395496487617493,
"mean_token_accuracy": 0.7822854816913605,
"num_tokens": 12292809.0,
"step": 752
},
{
"entropy": 0.5366968065500259,
"epoch": 2.8097014925373136,
"grad_norm": 0.17022006213665009,
"learning_rate": 0.0002,
"loss": 0.5253041982650757,
"mean_token_accuracy": 0.7889240682125092,
"num_tokens": 12309272.0,
"step": 753
},
{
"entropy": 0.5104647874832153,
"epoch": 2.8134328358208958,
"grad_norm": 0.20047977566719055,
"learning_rate": 0.0002,
"loss": 0.5114369988441467,
"mean_token_accuracy": 0.7932160943746567,
"num_tokens": 12325725.0,
"step": 754
},
{
"entropy": 0.530600056052208,
"epoch": 2.8171641791044775,
"grad_norm": 0.18938857316970825,
"learning_rate": 0.0002,
"loss": 0.5256994366645813,
"mean_token_accuracy": 0.787563219666481,
"num_tokens": 12341933.0,
"step": 755
},
{
"entropy": 0.5128819495439529,
"epoch": 2.8208955223880596,
"grad_norm": 0.19077159464359283,
"learning_rate": 0.0002,
"loss": 0.5233974456787109,
"mean_token_accuracy": 0.7869286239147186,
"num_tokens": 12358445.0,
"step": 756
},
{
"entropy": 0.5205030888319016,
"epoch": 2.824626865671642,
"grad_norm": 0.2066243290901184,
"learning_rate": 0.0002,
"loss": 0.527535617351532,
"mean_token_accuracy": 0.7873703986406326,
"num_tokens": 12374542.0,
"step": 757
},
{
"entropy": 0.5135227516293526,
"epoch": 2.828358208955224,
"grad_norm": 0.20685350894927979,
"learning_rate": 0.0002,
"loss": 0.5181005597114563,
"mean_token_accuracy": 0.7896196097135544,
"num_tokens": 12390788.0,
"step": 758
},
{
"entropy": 0.5336467772722244,
"epoch": 2.832089552238806,
"grad_norm": 0.1939532607793808,
"learning_rate": 0.0002,
"loss": 0.5294384956359863,
"mean_token_accuracy": 0.7889339476823807,
"num_tokens": 12407229.0,
"step": 759
},
{
"entropy": 0.5257266908884048,
"epoch": 2.835820895522388,
"grad_norm": 0.1771981567144394,
"learning_rate": 0.0002,
"loss": 0.5216140151023865,
"mean_token_accuracy": 0.7899226099252701,
"num_tokens": 12423846.0,
"step": 760
},
{
"entropy": 0.5299984812736511,
"epoch": 2.83955223880597,
"grad_norm": 0.20455680787563324,
"learning_rate": 0.0002,
"loss": 0.5296297073364258,
"mean_token_accuracy": 0.7862879633903503,
"num_tokens": 12440158.0,
"step": 761
},
{
"entropy": 0.5143841132521629,
"epoch": 2.843283582089552,
"grad_norm": 0.2076958268880844,
"learning_rate": 0.0002,
"loss": 0.5176342725753784,
"mean_token_accuracy": 0.7894581258296967,
"num_tokens": 12456654.0,
"step": 762
},
{
"entropy": 0.4974513649940491,
"epoch": 2.8470149253731343,
"grad_norm": 0.193134143948555,
"learning_rate": 0.0002,
"loss": 0.5035260319709778,
"mean_token_accuracy": 0.7979147285223007,
"num_tokens": 12472987.0,
"step": 763
},
{
"entropy": 0.516231395304203,
"epoch": 2.8507462686567164,
"grad_norm": 0.19579733908176422,
"learning_rate": 0.0002,
"loss": 0.523535430431366,
"mean_token_accuracy": 0.7885937541723251,
"num_tokens": 12489201.0,
"step": 764
},
{
"entropy": 0.5090928375720978,
"epoch": 2.8544776119402986,
"grad_norm": 0.1745532602071762,
"learning_rate": 0.0002,
"loss": 0.5120922327041626,
"mean_token_accuracy": 0.7926068156957626,
"num_tokens": 12505297.0,
"step": 765
},
{
"entropy": 0.5212984532117844,
"epoch": 2.8582089552238807,
"grad_norm": 0.1687193065881729,
"learning_rate": 0.0002,
"loss": 0.5186242461204529,
"mean_token_accuracy": 0.7898098975419998,
"num_tokens": 12521805.0,
"step": 766
},
{
"entropy": 0.5455201715230942,
"epoch": 2.861940298507463,
"grad_norm": 0.14300285279750824,
"learning_rate": 0.0002,
"loss": 0.5431771278381348,
"mean_token_accuracy": 0.7779514342546463,
"num_tokens": 12538465.0,
"step": 767
},
{
"entropy": 0.5209106504917145,
"epoch": 2.8656716417910446,
"grad_norm": 0.16800960898399353,
"learning_rate": 0.0002,
"loss": 0.5184243321418762,
"mean_token_accuracy": 0.7890264838933945,
"num_tokens": 12554886.0,
"step": 768
},
{
"entropy": 0.5088474899530411,
"epoch": 2.8694029850746268,
"grad_norm": 0.1462314873933792,
"learning_rate": 0.0002,
"loss": 0.5083324909210205,
"mean_token_accuracy": 0.7934228926897049,
"num_tokens": 12571276.0,
"step": 769
},
{
"entropy": 0.5271053463220596,
"epoch": 2.873134328358209,
"grad_norm": 0.16391947865486145,
"learning_rate": 0.0002,
"loss": 0.5293073654174805,
"mean_token_accuracy": 0.7859203815460205,
"num_tokens": 12587621.0,
"step": 770
},
{
"entropy": 0.5014189630746841,
"epoch": 2.876865671641791,
"grad_norm": 0.16328679025173187,
"learning_rate": 0.0002,
"loss": 0.5073498487472534,
"mean_token_accuracy": 0.7924041301012039,
"num_tokens": 12604113.0,
"step": 771
},
{
"entropy": 0.5268891751766205,
"epoch": 2.8805970149253732,
"grad_norm": 0.21644122898578644,
"learning_rate": 0.0002,
"loss": 0.5315952301025391,
"mean_token_accuracy": 0.7878720760345459,
"num_tokens": 12620599.0,
"step": 772
},
{
"entropy": 0.5303193777799606,
"epoch": 2.8843283582089554,
"grad_norm": 0.16348110139369965,
"learning_rate": 0.0002,
"loss": 0.5203503966331482,
"mean_token_accuracy": 0.7895929515361786,
"num_tokens": 12636920.0,
"step": 773
},
{
"entropy": 0.5373167991638184,
"epoch": 2.888059701492537,
"grad_norm": 0.1674329936504364,
"learning_rate": 0.0002,
"loss": 0.5308367609977722,
"mean_token_accuracy": 0.7839034348726273,
"num_tokens": 12653507.0,
"step": 774
},
{
"entropy": 0.5245395004749298,
"epoch": 2.8917910447761193,
"grad_norm": 0.16798977553844452,
"learning_rate": 0.0002,
"loss": 0.525133490562439,
"mean_token_accuracy": 0.7879597991704941,
"num_tokens": 12669748.0,
"step": 775
},
{
"entropy": 0.4995606988668442,
"epoch": 2.8955223880597014,
"grad_norm": 0.16923899948596954,
"learning_rate": 0.0002,
"loss": 0.5072147250175476,
"mean_token_accuracy": 0.7954233735799789,
"num_tokens": 12686075.0,
"step": 776
},
{
"entropy": 0.5168571919202805,
"epoch": 2.8992537313432836,
"grad_norm": 0.19585320353507996,
"learning_rate": 0.0002,
"loss": 0.531486988067627,
"mean_token_accuracy": 0.786114364862442,
"num_tokens": 12702228.0,
"step": 777
},
{
"entropy": 0.5194735378026962,
"epoch": 2.9029850746268657,
"grad_norm": 0.17308996617794037,
"learning_rate": 0.0002,
"loss": 0.5222083926200867,
"mean_token_accuracy": 0.7887429147958755,
"num_tokens": 12718513.0,
"step": 778
},
{
"entropy": 0.5187652111053467,
"epoch": 2.906716417910448,
"grad_norm": 0.18012917041778564,
"learning_rate": 0.0002,
"loss": 0.5144599676132202,
"mean_token_accuracy": 0.7928689271211624,
"num_tokens": 12734912.0,
"step": 779
},
{
"entropy": 0.5175924748182297,
"epoch": 2.91044776119403,
"grad_norm": 0.15708911418914795,
"learning_rate": 0.0002,
"loss": 0.5127027034759521,
"mean_token_accuracy": 0.7910457104444504,
"num_tokens": 12751312.0,
"step": 780
},
{
"entropy": 0.5184929892420769,
"epoch": 2.914179104477612,
"grad_norm": 0.17460955679416656,
"learning_rate": 0.0002,
"loss": 0.5223311185836792,
"mean_token_accuracy": 0.7881267666816711,
"num_tokens": 12767906.0,
"step": 781
},
{
"entropy": 0.5162710845470428,
"epoch": 2.917910447761194,
"grad_norm": 0.1744503378868103,
"learning_rate": 0.0002,
"loss": 0.5184698104858398,
"mean_token_accuracy": 0.7896480411291122,
"num_tokens": 12784363.0,
"step": 782
},
{
"entropy": 0.5054134130477905,
"epoch": 2.921641791044776,
"grad_norm": 0.16419187188148499,
"learning_rate": 0.0002,
"loss": 0.5100088715553284,
"mean_token_accuracy": 0.7937912940979004,
"num_tokens": 12800729.0,
"step": 783
},
{
"entropy": 0.5267587229609489,
"epoch": 2.925373134328358,
"grad_norm": 0.15712794661521912,
"learning_rate": 0.0002,
"loss": 0.5234281420707703,
"mean_token_accuracy": 0.7873355746269226,
"num_tokens": 12817275.0,
"step": 784
},
{
"entropy": 0.5252643376588821,
"epoch": 2.9291044776119404,
"grad_norm": 0.17461742460727692,
"learning_rate": 0.0002,
"loss": 0.5149291753768921,
"mean_token_accuracy": 0.792007714509964,
"num_tokens": 12833722.0,
"step": 785
},
{
"entropy": 0.5310375690460205,
"epoch": 2.9328358208955225,
"grad_norm": 0.16197697818279266,
"learning_rate": 0.0002,
"loss": 0.5280002355575562,
"mean_token_accuracy": 0.7869867831468582,
"num_tokens": 12850311.0,
"step": 786
},
{
"entropy": 0.5165882706642151,
"epoch": 2.9365671641791042,
"grad_norm": 0.18169313669204712,
"learning_rate": 0.0002,
"loss": 0.5169544219970703,
"mean_token_accuracy": 0.7926650643348694,
"num_tokens": 12866551.0,
"step": 787
},
{
"entropy": 0.506410725414753,
"epoch": 2.9402985074626864,
"grad_norm": 0.16465988755226135,
"learning_rate": 0.0002,
"loss": 0.5119289755821228,
"mean_token_accuracy": 0.7941572368144989,
"num_tokens": 12882861.0,
"step": 788
},
{
"entropy": 0.5014762431383133,
"epoch": 2.9440298507462686,
"grad_norm": 0.18377594649791718,
"learning_rate": 0.0002,
"loss": 0.5110628008842468,
"mean_token_accuracy": 0.7946459800004959,
"num_tokens": 12899241.0,
"step": 789
},
{
"entropy": 0.5248052775859833,
"epoch": 2.9477611940298507,
"grad_norm": 0.20053857564926147,
"learning_rate": 0.0002,
"loss": 0.5319278240203857,
"mean_token_accuracy": 0.7844424396753311,
"num_tokens": 12915385.0,
"step": 790
},
{
"entropy": 0.53006511926651,
"epoch": 2.951492537313433,
"grad_norm": 0.17584678530693054,
"learning_rate": 0.0002,
"loss": 0.5255709886550903,
"mean_token_accuracy": 0.7863388210535049,
"num_tokens": 12931592.0,
"step": 791
},
{
"entropy": 0.5275840014219284,
"epoch": 2.955223880597015,
"grad_norm": 0.17536833882331848,
"learning_rate": 0.0002,
"loss": 0.5213799476623535,
"mean_token_accuracy": 0.7920176684856415,
"num_tokens": 12948004.0,
"step": 792
},
{
"entropy": 0.5442412495613098,
"epoch": 2.958955223880597,
"grad_norm": 0.17195221781730652,
"learning_rate": 0.0002,
"loss": 0.5382991433143616,
"mean_token_accuracy": 0.7807125151157379,
"num_tokens": 12964350.0,
"step": 793
},
{
"entropy": 0.514294296503067,
"epoch": 2.9626865671641793,
"grad_norm": 0.1958279013633728,
"learning_rate": 0.0002,
"loss": 0.5191056728363037,
"mean_token_accuracy": 0.7889736741781235,
"num_tokens": 12980870.0,
"step": 794
},
{
"entropy": 0.516971156001091,
"epoch": 2.966417910447761,
"grad_norm": 0.17031143605709076,
"learning_rate": 0.0002,
"loss": 0.5235239863395691,
"mean_token_accuracy": 0.7902554422616959,
"num_tokens": 12997265.0,
"step": 795
},
{
"entropy": 0.519709937274456,
"epoch": 2.970149253731343,
"grad_norm": 0.19241590797901154,
"learning_rate": 0.0002,
"loss": 0.5290430188179016,
"mean_token_accuracy": 0.786635085940361,
"num_tokens": 13013641.0,
"step": 796
},
{
"entropy": 0.5278842747211456,
"epoch": 2.9738805970149254,
"grad_norm": 0.1847175806760788,
"learning_rate": 0.0002,
"loss": 0.5301830768585205,
"mean_token_accuracy": 0.7861872166395187,
"num_tokens": 13030089.0,
"step": 797
},
{
"entropy": 0.543852686882019,
"epoch": 2.9776119402985075,
"grad_norm": 0.1565551459789276,
"learning_rate": 0.0002,
"loss": 0.5390616655349731,
"mean_token_accuracy": 0.7804800420999527,
"num_tokens": 13046782.0,
"step": 798
},
{
"entropy": 0.5507520437240601,
"epoch": 2.9813432835820897,
"grad_norm": 0.19360534846782684,
"learning_rate": 0.0002,
"loss": 0.5457417964935303,
"mean_token_accuracy": 0.7808282524347305,
"num_tokens": 13063260.0,
"step": 799
},
{
"entropy": 0.5130215361714363,
"epoch": 2.9850746268656714,
"grad_norm": 0.17565752565860748,
"learning_rate": 0.0002,
"loss": 0.5124551057815552,
"mean_token_accuracy": 0.7940163463354111,
"num_tokens": 13079496.0,
"step": 800
},
{
"entropy": 0.5296107679605484,
"epoch": 2.9888059701492535,
"grad_norm": 0.18528884649276733,
"learning_rate": 0.0002,
"loss": 0.5258690714836121,
"mean_token_accuracy": 0.7890074849128723,
"num_tokens": 13095995.0,
"step": 801
},
{
"entropy": 0.5083938241004944,
"epoch": 2.9925373134328357,
"grad_norm": 0.17645564675331116,
"learning_rate": 0.0002,
"loss": 0.5169539451599121,
"mean_token_accuracy": 0.7913031429052353,
"num_tokens": 13112668.0,
"step": 802
},
{
"entropy": 0.5120368450880051,
"epoch": 2.996268656716418,
"grad_norm": 0.1844874620437622,
"learning_rate": 0.0002,
"loss": 0.5195419192314148,
"mean_token_accuracy": 0.7927880436182022,
"num_tokens": 13128901.0,
"step": 803
},
{
"entropy": 0.5261139273643494,
"epoch": 3.0,
"grad_norm": 0.19706764817237854,
"learning_rate": 0.0002,
"loss": 0.5334464311599731,
"mean_token_accuracy": 0.7812356650829315,
"num_tokens": 13145317.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2252935644732457e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}