clean-subliminal-learning-wolves / trainer_state.json
eac123's picture
Upload final checkpoint (checkpoint-804)
114af3f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.126197099685669,
"epoch": 0.003738317757009346,
"grad_norm": 0.4137735962867737,
"learning_rate": 0.0002,
"loss": 2.431535243988037,
"mean_token_accuracy": 0.54428631067276,
"num_tokens": 16465.0,
"step": 1
},
{
"entropy": 1.2562520503997803,
"epoch": 0.007476635514018692,
"grad_norm": 0.3902691900730133,
"learning_rate": 0.0002,
"loss": 2.188866376876831,
"mean_token_accuracy": 0.5568228960037231,
"num_tokens": 32573.0,
"step": 2
},
{
"entropy": 1.4093195796012878,
"epoch": 0.011214953271028037,
"grad_norm": 0.29741090536117554,
"learning_rate": 0.0002,
"loss": 1.7309190034866333,
"mean_token_accuracy": 0.591301366686821,
"num_tokens": 48848.0,
"step": 3
},
{
"entropy": 1.3904370069503784,
"epoch": 0.014953271028037384,
"grad_norm": 0.24415643513202667,
"learning_rate": 0.0002,
"loss": 1.4167925119400024,
"mean_token_accuracy": 0.6262245625257492,
"num_tokens": 64779.0,
"step": 4
},
{
"entropy": 1.3590968251228333,
"epoch": 0.018691588785046728,
"grad_norm": 0.2501066327095032,
"learning_rate": 0.0002,
"loss": 1.3086440563201904,
"mean_token_accuracy": 0.6442629396915436,
"num_tokens": 81017.0,
"step": 5
},
{
"entropy": 1.2659040987491608,
"epoch": 0.022429906542056073,
"grad_norm": 0.13132381439208984,
"learning_rate": 0.0002,
"loss": 1.1781953573226929,
"mean_token_accuracy": 0.6602727770805359,
"num_tokens": 97143.0,
"step": 6
},
{
"entropy": 1.1754920184612274,
"epoch": 0.026168224299065422,
"grad_norm": 0.10863616317510605,
"learning_rate": 0.0002,
"loss": 1.0758289098739624,
"mean_token_accuracy": 0.6747478097677231,
"num_tokens": 113270.0,
"step": 7
},
{
"entropy": 1.1110295355319977,
"epoch": 0.029906542056074768,
"grad_norm": 0.11261261999607086,
"learning_rate": 0.0002,
"loss": 1.0382510423660278,
"mean_token_accuracy": 0.6741550117731094,
"num_tokens": 129740.0,
"step": 8
},
{
"entropy": 1.0438694655895233,
"epoch": 0.03364485981308411,
"grad_norm": 0.1300426870584488,
"learning_rate": 0.0002,
"loss": 0.9842232465744019,
"mean_token_accuracy": 0.6938712894916534,
"num_tokens": 146153.0,
"step": 9
},
{
"entropy": 0.980072870850563,
"epoch": 0.037383177570093455,
"grad_norm": 0.1279866099357605,
"learning_rate": 0.0002,
"loss": 0.907992422580719,
"mean_token_accuracy": 0.7037613391876221,
"num_tokens": 162400.0,
"step": 10
},
{
"entropy": 0.9512171745300293,
"epoch": 0.041121495327102804,
"grad_norm": 0.11444728821516037,
"learning_rate": 0.0002,
"loss": 0.8603078722953796,
"mean_token_accuracy": 0.7085670977830887,
"num_tokens": 178596.0,
"step": 11
},
{
"entropy": 0.9008210897445679,
"epoch": 0.044859813084112146,
"grad_norm": 0.1163485050201416,
"learning_rate": 0.0002,
"loss": 0.8202763199806213,
"mean_token_accuracy": 0.7147757261991501,
"num_tokens": 194960.0,
"step": 12
},
{
"entropy": 0.8144031316041946,
"epoch": 0.048598130841121495,
"grad_norm": 1.8727822303771973,
"learning_rate": 0.0002,
"loss": 0.7989485859870911,
"mean_token_accuracy": 0.714598998427391,
"num_tokens": 211519.0,
"step": 13
},
{
"entropy": 0.7731810510158539,
"epoch": 0.052336448598130844,
"grad_norm": 0.40646815299987793,
"learning_rate": 0.0002,
"loss": 0.7675734162330627,
"mean_token_accuracy": 0.7164532542228699,
"num_tokens": 227947.0,
"step": 14
},
{
"entropy": 0.7750754952430725,
"epoch": 0.056074766355140186,
"grad_norm": 0.0927761048078537,
"learning_rate": 0.0002,
"loss": 0.752495527267456,
"mean_token_accuracy": 0.7247887402772903,
"num_tokens": 244285.0,
"step": 15
},
{
"entropy": 0.7294797450304031,
"epoch": 0.059813084112149535,
"grad_norm": 0.09633366763591766,
"learning_rate": 0.0002,
"loss": 0.7139282822608948,
"mean_token_accuracy": 0.733425110578537,
"num_tokens": 260524.0,
"step": 16
},
{
"entropy": 0.7113516181707382,
"epoch": 0.06355140186915888,
"grad_norm": 0.08278490602970123,
"learning_rate": 0.0002,
"loss": 0.69715416431427,
"mean_token_accuracy": 0.7404225617647171,
"num_tokens": 276676.0,
"step": 17
},
{
"entropy": 0.6892006993293762,
"epoch": 0.06728971962616823,
"grad_norm": 0.09702161699533463,
"learning_rate": 0.0002,
"loss": 0.6832636594772339,
"mean_token_accuracy": 0.7384749203920364,
"num_tokens": 293327.0,
"step": 18
},
{
"entropy": 0.683604821562767,
"epoch": 0.07102803738317758,
"grad_norm": 0.09970250725746155,
"learning_rate": 0.0002,
"loss": 0.6719778776168823,
"mean_token_accuracy": 0.7447258532047272,
"num_tokens": 309768.0,
"step": 19
},
{
"entropy": 0.6530238687992096,
"epoch": 0.07476635514018691,
"grad_norm": 0.08765958249568939,
"learning_rate": 0.0002,
"loss": 0.6265610456466675,
"mean_token_accuracy": 0.7607048451900482,
"num_tokens": 325953.0,
"step": 20
},
{
"entropy": 0.6858675181865692,
"epoch": 0.07850467289719626,
"grad_norm": 0.1555248200893402,
"learning_rate": 0.0002,
"loss": 0.653350830078125,
"mean_token_accuracy": 0.7461759150028229,
"num_tokens": 342357.0,
"step": 21
},
{
"entropy": 0.6731577664613724,
"epoch": 0.08224299065420561,
"grad_norm": 0.07943135499954224,
"learning_rate": 0.0002,
"loss": 0.6468416452407837,
"mean_token_accuracy": 0.745930403470993,
"num_tokens": 358780.0,
"step": 22
},
{
"entropy": 0.6372379511594772,
"epoch": 0.08598130841121496,
"grad_norm": 0.07176131755113602,
"learning_rate": 0.0002,
"loss": 0.6231244802474976,
"mean_token_accuracy": 0.757389485836029,
"num_tokens": 375059.0,
"step": 23
},
{
"entropy": 0.6160608530044556,
"epoch": 0.08971962616822429,
"grad_norm": 0.09053056687116623,
"learning_rate": 0.0002,
"loss": 0.6240095496177673,
"mean_token_accuracy": 0.7537032961845398,
"num_tokens": 391372.0,
"step": 24
},
{
"entropy": 0.6163977682590485,
"epoch": 0.09345794392523364,
"grad_norm": 0.06957540661096573,
"learning_rate": 0.0002,
"loss": 0.6137739419937134,
"mean_token_accuracy": 0.7591944634914398,
"num_tokens": 407634.0,
"step": 25
},
{
"entropy": 0.6172843426465988,
"epoch": 0.09719626168224299,
"grad_norm": 0.06831946223974228,
"learning_rate": 0.0002,
"loss": 0.6151383519172668,
"mean_token_accuracy": 0.7588979452848434,
"num_tokens": 424139.0,
"step": 26
},
{
"entropy": 0.6146537363529205,
"epoch": 0.10093457943925234,
"grad_norm": 0.06785774976015091,
"learning_rate": 0.0002,
"loss": 0.6100280284881592,
"mean_token_accuracy": 0.7608075141906738,
"num_tokens": 440251.0,
"step": 27
},
{
"entropy": 0.5965892523527145,
"epoch": 0.10467289719626169,
"grad_norm": 0.06592898070812225,
"learning_rate": 0.0002,
"loss": 0.5876743793487549,
"mean_token_accuracy": 0.7687714993953705,
"num_tokens": 456512.0,
"step": 28
},
{
"entropy": 0.6143475025892258,
"epoch": 0.10841121495327102,
"grad_norm": 0.06412907689809799,
"learning_rate": 0.0002,
"loss": 0.6119903326034546,
"mean_token_accuracy": 0.7573402374982834,
"num_tokens": 472958.0,
"step": 29
},
{
"entropy": 0.5956396609544754,
"epoch": 0.11214953271028037,
"grad_norm": 0.06444356590509415,
"learning_rate": 0.0002,
"loss": 0.594578206539154,
"mean_token_accuracy": 0.7660299837589264,
"num_tokens": 489407.0,
"step": 30
},
{
"entropy": 0.5987770259380341,
"epoch": 0.11588785046728972,
"grad_norm": 0.05562213435769081,
"learning_rate": 0.0002,
"loss": 0.5932596921920776,
"mean_token_accuracy": 0.7620532661676407,
"num_tokens": 506104.0,
"step": 31
},
{
"entropy": 0.5812755525112152,
"epoch": 0.11962616822429907,
"grad_norm": 0.060992538928985596,
"learning_rate": 0.0002,
"loss": 0.5729696154594421,
"mean_token_accuracy": 0.7730918079614639,
"num_tokens": 522565.0,
"step": 32
},
{
"entropy": 0.5877644866704941,
"epoch": 0.1233644859813084,
"grad_norm": 0.05839328467845917,
"learning_rate": 0.0002,
"loss": 0.5913704633712769,
"mean_token_accuracy": 0.7656503766775131,
"num_tokens": 539081.0,
"step": 33
},
{
"entropy": 0.5780033618211746,
"epoch": 0.12710280373831775,
"grad_norm": 0.05193523317575455,
"learning_rate": 0.0002,
"loss": 0.5819685459136963,
"mean_token_accuracy": 0.7665455341339111,
"num_tokens": 555504.0,
"step": 34
},
{
"entropy": 0.5869153290987015,
"epoch": 0.1308411214953271,
"grad_norm": 0.06890807300806046,
"learning_rate": 0.0002,
"loss": 0.5857660174369812,
"mean_token_accuracy": 0.7676131427288055,
"num_tokens": 572153.0,
"step": 35
},
{
"entropy": 0.5672304034233093,
"epoch": 0.13457943925233645,
"grad_norm": 0.05624233931303024,
"learning_rate": 0.0002,
"loss": 0.5718747973442078,
"mean_token_accuracy": 0.7710311710834503,
"num_tokens": 588585.0,
"step": 36
},
{
"entropy": 0.5678977817296982,
"epoch": 0.1383177570093458,
"grad_norm": 0.06091594323515892,
"learning_rate": 0.0002,
"loss": 0.5765193104743958,
"mean_token_accuracy": 0.7686972767114639,
"num_tokens": 604864.0,
"step": 37
},
{
"entropy": 0.5863034427165985,
"epoch": 0.14205607476635515,
"grad_norm": 0.07292835414409637,
"learning_rate": 0.0002,
"loss": 0.597279965877533,
"mean_token_accuracy": 0.7606304287910461,
"num_tokens": 621080.0,
"step": 38
},
{
"entropy": 0.5759021639823914,
"epoch": 0.14579439252336449,
"grad_norm": 0.05464645475149155,
"learning_rate": 0.0002,
"loss": 0.570218563079834,
"mean_token_accuracy": 0.770964503288269,
"num_tokens": 637503.0,
"step": 39
},
{
"entropy": 0.5763402879238129,
"epoch": 0.14953271028037382,
"grad_norm": 0.056617990136146545,
"learning_rate": 0.0002,
"loss": 0.5686919093132019,
"mean_token_accuracy": 0.7723182737827301,
"num_tokens": 653609.0,
"step": 40
},
{
"entropy": 0.6039886325597763,
"epoch": 0.15327102803738318,
"grad_norm": 0.04869381710886955,
"learning_rate": 0.0002,
"loss": 0.5939038395881653,
"mean_token_accuracy": 0.7607405036687851,
"num_tokens": 669981.0,
"step": 41
},
{
"entropy": 0.5946750342845917,
"epoch": 0.15700934579439252,
"grad_norm": 0.046227701008319855,
"learning_rate": 0.0002,
"loss": 0.589706301689148,
"mean_token_accuracy": 0.7646626383066177,
"num_tokens": 686537.0,
"step": 42
},
{
"entropy": 0.5577073395252228,
"epoch": 0.16074766355140188,
"grad_norm": 0.04413911700248718,
"learning_rate": 0.0002,
"loss": 0.559436023235321,
"mean_token_accuracy": 0.7762598991394043,
"num_tokens": 702686.0,
"step": 43
},
{
"entropy": 0.5665079057216644,
"epoch": 0.16448598130841122,
"grad_norm": 0.047774720937013626,
"learning_rate": 0.0002,
"loss": 0.5647708773612976,
"mean_token_accuracy": 0.7764726728200912,
"num_tokens": 718966.0,
"step": 44
},
{
"entropy": 0.5726076513528824,
"epoch": 0.16822429906542055,
"grad_norm": 0.05053015798330307,
"learning_rate": 0.0002,
"loss": 0.5747931003570557,
"mean_token_accuracy": 0.7704672068357468,
"num_tokens": 735364.0,
"step": 45
},
{
"entropy": 0.5688610672950745,
"epoch": 0.17196261682242991,
"grad_norm": 0.037495676428079605,
"learning_rate": 0.0002,
"loss": 0.5652605295181274,
"mean_token_accuracy": 0.770918145775795,
"num_tokens": 751902.0,
"step": 46
},
{
"entropy": 0.581221267580986,
"epoch": 0.17570093457943925,
"grad_norm": 0.051694370806217194,
"learning_rate": 0.0002,
"loss": 0.5826902389526367,
"mean_token_accuracy": 0.7654351443052292,
"num_tokens": 768151.0,
"step": 47
},
{
"entropy": 0.5708408057689667,
"epoch": 0.17943925233644858,
"grad_norm": 0.04264647886157036,
"learning_rate": 0.0002,
"loss": 0.5651251673698425,
"mean_token_accuracy": 0.7749274671077728,
"num_tokens": 784511.0,
"step": 48
},
{
"entropy": 0.5757250636816025,
"epoch": 0.18317757009345795,
"grad_norm": 0.050725825130939484,
"learning_rate": 0.0002,
"loss": 0.5704944133758545,
"mean_token_accuracy": 0.7680549174547195,
"num_tokens": 800966.0,
"step": 49
},
{
"entropy": 0.5546318888664246,
"epoch": 0.18691588785046728,
"grad_norm": 0.03947490453720093,
"learning_rate": 0.0002,
"loss": 0.5488482713699341,
"mean_token_accuracy": 0.7769860327243805,
"num_tokens": 817293.0,
"step": 50
},
{
"entropy": 0.5634811520576477,
"epoch": 0.19065420560747665,
"grad_norm": 0.049806442111730576,
"learning_rate": 0.0002,
"loss": 0.5557321906089783,
"mean_token_accuracy": 0.7740621268749237,
"num_tokens": 833385.0,
"step": 51
},
{
"entropy": 0.582123801112175,
"epoch": 0.19439252336448598,
"grad_norm": 0.0458400622010231,
"learning_rate": 0.0002,
"loss": 0.5802882313728333,
"mean_token_accuracy": 0.7661796510219574,
"num_tokens": 849741.0,
"step": 52
},
{
"entropy": 0.5494910776615143,
"epoch": 0.19813084112149532,
"grad_norm": 0.04727543145418167,
"learning_rate": 0.0002,
"loss": 0.554188072681427,
"mean_token_accuracy": 0.7779219001531601,
"num_tokens": 865884.0,
"step": 53
},
{
"entropy": 0.568273514509201,
"epoch": 0.20186915887850468,
"grad_norm": 0.052229855209589005,
"learning_rate": 0.0002,
"loss": 0.5752811431884766,
"mean_token_accuracy": 0.7671186923980713,
"num_tokens": 882348.0,
"step": 54
},
{
"entropy": 0.5694270133972168,
"epoch": 0.205607476635514,
"grad_norm": 0.04475817084312439,
"learning_rate": 0.0002,
"loss": 0.5706926584243774,
"mean_token_accuracy": 0.7702507525682449,
"num_tokens": 898544.0,
"step": 55
},
{
"entropy": 0.5677521079778671,
"epoch": 0.20934579439252338,
"grad_norm": 0.03592672944068909,
"learning_rate": 0.0002,
"loss": 0.5723967552185059,
"mean_token_accuracy": 0.766302615404129,
"num_tokens": 914946.0,
"step": 56
},
{
"entropy": 0.5698029845952988,
"epoch": 0.2130841121495327,
"grad_norm": 0.04732033982872963,
"learning_rate": 0.0002,
"loss": 0.5640438795089722,
"mean_token_accuracy": 0.7732385843992233,
"num_tokens": 931100.0,
"step": 57
},
{
"entropy": 0.5775126665830612,
"epoch": 0.21682242990654205,
"grad_norm": 0.04193758964538574,
"learning_rate": 0.0002,
"loss": 0.5704541802406311,
"mean_token_accuracy": 0.7691217958927155,
"num_tokens": 947448.0,
"step": 58
},
{
"entropy": 0.5770154148340225,
"epoch": 0.2205607476635514,
"grad_norm": 0.035865288227796555,
"learning_rate": 0.0002,
"loss": 0.5679229497909546,
"mean_token_accuracy": 0.7680188864469528,
"num_tokens": 963902.0,
"step": 59
},
{
"entropy": 0.5588070899248123,
"epoch": 0.22429906542056074,
"grad_norm": 0.04689257591962814,
"learning_rate": 0.0002,
"loss": 0.5615048408508301,
"mean_token_accuracy": 0.7748474776744843,
"num_tokens": 980180.0,
"step": 60
},
{
"entropy": 0.57504902780056,
"epoch": 0.22803738317757008,
"grad_norm": 0.04198114946484566,
"learning_rate": 0.0002,
"loss": 0.577617883682251,
"mean_token_accuracy": 0.7648669481277466,
"num_tokens": 996613.0,
"step": 61
},
{
"entropy": 0.5450393110513687,
"epoch": 0.23177570093457944,
"grad_norm": 0.040139347314834595,
"learning_rate": 0.0002,
"loss": 0.552120566368103,
"mean_token_accuracy": 0.7774388641119003,
"num_tokens": 1012686.0,
"step": 62
},
{
"entropy": 0.5609021335840225,
"epoch": 0.23551401869158878,
"grad_norm": 0.03753409534692764,
"learning_rate": 0.0002,
"loss": 0.5530397295951843,
"mean_token_accuracy": 0.7765212655067444,
"num_tokens": 1028835.0,
"step": 63
},
{
"entropy": 0.5794262290000916,
"epoch": 0.23925233644859814,
"grad_norm": 0.035354360938072205,
"learning_rate": 0.0002,
"loss": 0.5788048505783081,
"mean_token_accuracy": 0.7663274556398392,
"num_tokens": 1045176.0,
"step": 64
},
{
"entropy": 0.5655659288167953,
"epoch": 0.24299065420560748,
"grad_norm": 0.03588757663965225,
"learning_rate": 0.0002,
"loss": 0.5581645369529724,
"mean_token_accuracy": 0.7732069790363312,
"num_tokens": 1061452.0,
"step": 65
},
{
"entropy": 0.5672483444213867,
"epoch": 0.2467289719626168,
"grad_norm": 0.036772388964891434,
"learning_rate": 0.0002,
"loss": 0.5631874203681946,
"mean_token_accuracy": 0.7695926129817963,
"num_tokens": 1077997.0,
"step": 66
},
{
"entropy": 0.578306719660759,
"epoch": 0.2504672897196262,
"grad_norm": 0.039442483335733414,
"learning_rate": 0.0002,
"loss": 0.5765112638473511,
"mean_token_accuracy": 0.7657738327980042,
"num_tokens": 1094247.0,
"step": 67
},
{
"entropy": 0.5700875818729401,
"epoch": 0.2542056074766355,
"grad_norm": 0.0448731891810894,
"learning_rate": 0.0002,
"loss": 0.574236273765564,
"mean_token_accuracy": 0.7669749855995178,
"num_tokens": 1110470.0,
"step": 68
},
{
"entropy": 0.5609024912118912,
"epoch": 0.25794392523364484,
"grad_norm": 0.033255062997341156,
"learning_rate": 0.0002,
"loss": 0.5576102137565613,
"mean_token_accuracy": 0.7776026874780655,
"num_tokens": 1127050.0,
"step": 69
},
{
"entropy": 0.5673299431800842,
"epoch": 0.2616822429906542,
"grad_norm": 0.03715064004063606,
"learning_rate": 0.0002,
"loss": 0.5695099234580994,
"mean_token_accuracy": 0.7701731324195862,
"num_tokens": 1143383.0,
"step": 70
},
{
"entropy": 0.560445249080658,
"epoch": 0.26542056074766357,
"grad_norm": 0.04453396797180176,
"learning_rate": 0.0002,
"loss": 0.5644095540046692,
"mean_token_accuracy": 0.7720398306846619,
"num_tokens": 1159597.0,
"step": 71
},
{
"entropy": 0.5526476353406906,
"epoch": 0.2691588785046729,
"grad_norm": 0.039633698761463165,
"learning_rate": 0.0002,
"loss": 0.5499011874198914,
"mean_token_accuracy": 0.7772456705570221,
"num_tokens": 1175764.0,
"step": 72
},
{
"entropy": 0.5623870193958282,
"epoch": 0.27289719626168224,
"grad_norm": 0.036508623510599136,
"learning_rate": 0.0002,
"loss": 0.5721215009689331,
"mean_token_accuracy": 0.7691169232130051,
"num_tokens": 1192041.0,
"step": 73
},
{
"entropy": 0.5718335658311844,
"epoch": 0.2766355140186916,
"grad_norm": 0.044028230011463165,
"learning_rate": 0.0002,
"loss": 0.5752332806587219,
"mean_token_accuracy": 0.7687042653560638,
"num_tokens": 1208468.0,
"step": 74
},
{
"entropy": 0.5587927252054214,
"epoch": 0.2803738317757009,
"grad_norm": 0.04269316419959068,
"learning_rate": 0.0002,
"loss": 0.5531549453735352,
"mean_token_accuracy": 0.7755036056041718,
"num_tokens": 1224757.0,
"step": 75
},
{
"entropy": 0.5787914991378784,
"epoch": 0.2841121495327103,
"grad_norm": 0.040728773921728134,
"learning_rate": 0.0002,
"loss": 0.5694252252578735,
"mean_token_accuracy": 0.7696126103401184,
"num_tokens": 1241162.0,
"step": 76
},
{
"entropy": 0.5616230517625809,
"epoch": 0.28785046728971964,
"grad_norm": 0.037814315408468246,
"learning_rate": 0.0002,
"loss": 0.5627362728118896,
"mean_token_accuracy": 0.7735611200332642,
"num_tokens": 1257583.0,
"step": 77
},
{
"entropy": 0.567746564745903,
"epoch": 0.29158878504672897,
"grad_norm": 0.03843110799789429,
"learning_rate": 0.0002,
"loss": 0.5634809732437134,
"mean_token_accuracy": 0.7711174041032791,
"num_tokens": 1274115.0,
"step": 78
},
{
"entropy": 0.5585684925317764,
"epoch": 0.2953271028037383,
"grad_norm": 0.03358754143118858,
"learning_rate": 0.0002,
"loss": 0.5604900121688843,
"mean_token_accuracy": 0.7713887989521027,
"num_tokens": 1290371.0,
"step": 79
},
{
"entropy": 0.5650099366903305,
"epoch": 0.29906542056074764,
"grad_norm": 0.038185376673936844,
"learning_rate": 0.0002,
"loss": 0.5694409608840942,
"mean_token_accuracy": 0.7706831693649292,
"num_tokens": 1306602.0,
"step": 80
},
{
"entropy": 0.5573018193244934,
"epoch": 0.30280373831775703,
"grad_norm": 0.04070131108164787,
"learning_rate": 0.0002,
"loss": 0.5703440308570862,
"mean_token_accuracy": 0.771970734000206,
"num_tokens": 1322957.0,
"step": 81
},
{
"entropy": 0.545403316617012,
"epoch": 0.30654205607476637,
"grad_norm": 0.04340139031410217,
"learning_rate": 0.0002,
"loss": 0.5498678088188171,
"mean_token_accuracy": 0.7774094045162201,
"num_tokens": 1339233.0,
"step": 82
},
{
"entropy": 0.5381540507078171,
"epoch": 0.3102803738317757,
"grad_norm": 0.039635106921195984,
"learning_rate": 0.0002,
"loss": 0.542028546333313,
"mean_token_accuracy": 0.7835624068975449,
"num_tokens": 1355463.0,
"step": 83
},
{
"entropy": 0.5599908977746964,
"epoch": 0.31401869158878504,
"grad_norm": 0.039568379521369934,
"learning_rate": 0.0002,
"loss": 0.5559767484664917,
"mean_token_accuracy": 0.7765284180641174,
"num_tokens": 1371815.0,
"step": 84
},
{
"entropy": 0.5593477934598923,
"epoch": 0.3177570093457944,
"grad_norm": 0.039335861802101135,
"learning_rate": 0.0002,
"loss": 0.5506576895713806,
"mean_token_accuracy": 0.7803503125905991,
"num_tokens": 1388181.0,
"step": 85
},
{
"entropy": 0.5572251528501511,
"epoch": 0.32149532710280376,
"grad_norm": 0.03665383532643318,
"learning_rate": 0.0002,
"loss": 0.5480077862739563,
"mean_token_accuracy": 0.7788248509168625,
"num_tokens": 1404584.0,
"step": 86
},
{
"entropy": 0.5664831250905991,
"epoch": 0.3252336448598131,
"grad_norm": 0.040541525930166245,
"learning_rate": 0.0002,
"loss": 0.5769516229629517,
"mean_token_accuracy": 0.7674112915992737,
"num_tokens": 1420963.0,
"step": 87
},
{
"entropy": 0.5584649592638016,
"epoch": 0.32897196261682243,
"grad_norm": 0.033256057649850845,
"learning_rate": 0.0002,
"loss": 0.5648812651634216,
"mean_token_accuracy": 0.7723092287778854,
"num_tokens": 1437122.0,
"step": 88
},
{
"entropy": 0.5519673079252243,
"epoch": 0.33271028037383177,
"grad_norm": 0.031988468021154404,
"learning_rate": 0.0002,
"loss": 0.551476776599884,
"mean_token_accuracy": 0.7795782834291458,
"num_tokens": 1453481.0,
"step": 89
},
{
"entropy": 0.5844476372003555,
"epoch": 0.3364485981308411,
"grad_norm": 0.037734005600214005,
"learning_rate": 0.0002,
"loss": 0.5850376486778259,
"mean_token_accuracy": 0.7618721723556519,
"num_tokens": 1469968.0,
"step": 90
},
{
"entropy": 0.5527342259883881,
"epoch": 0.3401869158878505,
"grad_norm": 0.03733964264392853,
"learning_rate": 0.0002,
"loss": 0.5517382621765137,
"mean_token_accuracy": 0.7791167348623276,
"num_tokens": 1486410.0,
"step": 91
},
{
"entropy": 0.5490231364965439,
"epoch": 0.34392523364485983,
"grad_norm": 0.03796572983264923,
"learning_rate": 0.0002,
"loss": 0.5472099781036377,
"mean_token_accuracy": 0.7787582278251648,
"num_tokens": 1502827.0,
"step": 92
},
{
"entropy": 0.5654839426279068,
"epoch": 0.34766355140186916,
"grad_norm": 0.03400302678346634,
"learning_rate": 0.0002,
"loss": 0.5675226449966431,
"mean_token_accuracy": 0.7715823501348495,
"num_tokens": 1519035.0,
"step": 93
},
{
"entropy": 0.5789331346750259,
"epoch": 0.3514018691588785,
"grad_norm": 0.03300806134939194,
"learning_rate": 0.0002,
"loss": 0.5738787055015564,
"mean_token_accuracy": 0.7701004296541214,
"num_tokens": 1535776.0,
"step": 94
},
{
"entropy": 0.5546596646308899,
"epoch": 0.35514018691588783,
"grad_norm": 0.03256770223379135,
"learning_rate": 0.0002,
"loss": 0.5567547082901001,
"mean_token_accuracy": 0.7791133224964142,
"num_tokens": 1552013.0,
"step": 95
},
{
"entropy": 0.5764150321483612,
"epoch": 0.35887850467289717,
"grad_norm": 0.03291841968894005,
"learning_rate": 0.0002,
"loss": 0.5735791921615601,
"mean_token_accuracy": 0.770502358675003,
"num_tokens": 1568424.0,
"step": 96
},
{
"entropy": 0.5675235092639923,
"epoch": 0.36261682242990656,
"grad_norm": 0.03169221803545952,
"learning_rate": 0.0002,
"loss": 0.567868709564209,
"mean_token_accuracy": 0.7711145430803299,
"num_tokens": 1584887.0,
"step": 97
},
{
"entropy": 0.5626550316810608,
"epoch": 0.3663551401869159,
"grad_norm": 0.03811025619506836,
"learning_rate": 0.0002,
"loss": 0.5668138265609741,
"mean_token_accuracy": 0.772192656993866,
"num_tokens": 1601260.0,
"step": 98
},
{
"entropy": 0.5581237971782684,
"epoch": 0.37009345794392523,
"grad_norm": 0.03798513859510422,
"learning_rate": 0.0002,
"loss": 0.5674142241477966,
"mean_token_accuracy": 0.7706556767225266,
"num_tokens": 1617528.0,
"step": 99
},
{
"entropy": 0.5649739503860474,
"epoch": 0.37383177570093457,
"grad_norm": 0.03556443750858307,
"learning_rate": 0.0002,
"loss": 0.5644899606704712,
"mean_token_accuracy": 0.7701123207807541,
"num_tokens": 1633885.0,
"step": 100
},
{
"entropy": 0.5828528255224228,
"epoch": 0.3775700934579439,
"grad_norm": 0.03924545273184776,
"learning_rate": 0.0002,
"loss": 0.5804182291030884,
"mean_token_accuracy": 0.7685290277004242,
"num_tokens": 1650680.0,
"step": 101
},
{
"entropy": 0.5504215061664581,
"epoch": 0.3813084112149533,
"grad_norm": 0.03934217616915703,
"learning_rate": 0.0002,
"loss": 0.5463358163833618,
"mean_token_accuracy": 0.7797124236822128,
"num_tokens": 1666866.0,
"step": 102
},
{
"entropy": 0.5697780549526215,
"epoch": 0.3850467289719626,
"grad_norm": 0.03712291270494461,
"learning_rate": 0.0002,
"loss": 0.5653584599494934,
"mean_token_accuracy": 0.7692228257656097,
"num_tokens": 1683118.0,
"step": 103
},
{
"entropy": 0.5601143538951874,
"epoch": 0.38878504672897196,
"grad_norm": 0.033694274723529816,
"learning_rate": 0.0002,
"loss": 0.5663195848464966,
"mean_token_accuracy": 0.7706973105669022,
"num_tokens": 1699475.0,
"step": 104
},
{
"entropy": 0.5591333955526352,
"epoch": 0.3925233644859813,
"grad_norm": 0.03714451938867569,
"learning_rate": 0.0002,
"loss": 0.566075325012207,
"mean_token_accuracy": 0.7697228640317917,
"num_tokens": 1715853.0,
"step": 105
},
{
"entropy": 0.5509396642446518,
"epoch": 0.39626168224299063,
"grad_norm": 0.03486821800470352,
"learning_rate": 0.0002,
"loss": 0.5632879734039307,
"mean_token_accuracy": 0.7730516046285629,
"num_tokens": 1732170.0,
"step": 106
},
{
"entropy": 0.5652123540639877,
"epoch": 0.4,
"grad_norm": 0.041288331151008606,
"learning_rate": 0.0002,
"loss": 0.5604725480079651,
"mean_token_accuracy": 0.7711915820837021,
"num_tokens": 1748328.0,
"step": 107
},
{
"entropy": 0.5530835092067719,
"epoch": 0.40373831775700936,
"grad_norm": 0.0322246178984642,
"learning_rate": 0.0002,
"loss": 0.5545868277549744,
"mean_token_accuracy": 0.7774576395750046,
"num_tokens": 1764582.0,
"step": 108
},
{
"entropy": 0.574239119887352,
"epoch": 0.4074766355140187,
"grad_norm": 0.031295642256736755,
"learning_rate": 0.0002,
"loss": 0.5755724906921387,
"mean_token_accuracy": 0.7669118195772171,
"num_tokens": 1780985.0,
"step": 109
},
{
"entropy": 0.5714472681283951,
"epoch": 0.411214953271028,
"grad_norm": 0.034113939851522446,
"learning_rate": 0.0002,
"loss": 0.565799355506897,
"mean_token_accuracy": 0.7719277888536453,
"num_tokens": 1797483.0,
"step": 110
},
{
"entropy": 0.5522187203168869,
"epoch": 0.41495327102803736,
"grad_norm": 0.03207452967762947,
"learning_rate": 0.0002,
"loss": 0.5486649870872498,
"mean_token_accuracy": 0.7786776423454285,
"num_tokens": 1813763.0,
"step": 111
},
{
"entropy": 0.5560779720544815,
"epoch": 0.41869158878504675,
"grad_norm": 0.0334036760032177,
"learning_rate": 0.0002,
"loss": 0.5554910898208618,
"mean_token_accuracy": 0.7745659798383713,
"num_tokens": 1829937.0,
"step": 112
},
{
"entropy": 0.5375554114580154,
"epoch": 0.4224299065420561,
"grad_norm": 0.03380579128861427,
"learning_rate": 0.0002,
"loss": 0.5416814684867859,
"mean_token_accuracy": 0.7802845388650894,
"num_tokens": 1846164.0,
"step": 113
},
{
"entropy": 0.5589973330497742,
"epoch": 0.4261682242990654,
"grad_norm": 0.03403402864933014,
"learning_rate": 0.0002,
"loss": 0.5650242567062378,
"mean_token_accuracy": 0.7712521702051163,
"num_tokens": 1862080.0,
"step": 114
},
{
"entropy": 0.5673896223306656,
"epoch": 0.42990654205607476,
"grad_norm": 0.03260383754968643,
"learning_rate": 0.0002,
"loss": 0.5664341449737549,
"mean_token_accuracy": 0.7702513486146927,
"num_tokens": 1878608.0,
"step": 115
},
{
"entropy": 0.572798103094101,
"epoch": 0.4336448598130841,
"grad_norm": 0.03137151151895523,
"learning_rate": 0.0002,
"loss": 0.5731777548789978,
"mean_token_accuracy": 0.7663247585296631,
"num_tokens": 1895166.0,
"step": 116
},
{
"entropy": 0.5312000960111618,
"epoch": 0.4373831775700935,
"grad_norm": 0.031823012977838516,
"learning_rate": 0.0002,
"loss": 0.5382552742958069,
"mean_token_accuracy": 0.7808444052934647,
"num_tokens": 1911130.0,
"step": 117
},
{
"entropy": 0.5409984290599823,
"epoch": 0.4411214953271028,
"grad_norm": 0.03332378715276718,
"learning_rate": 0.0002,
"loss": 0.5410414934158325,
"mean_token_accuracy": 0.7819060832262039,
"num_tokens": 1927264.0,
"step": 118
},
{
"entropy": 0.5695091038942337,
"epoch": 0.44485981308411215,
"grad_norm": 0.03380680829286575,
"learning_rate": 0.0002,
"loss": 0.5648797154426575,
"mean_token_accuracy": 0.7696678340435028,
"num_tokens": 1943766.0,
"step": 119
},
{
"entropy": 0.5565821528434753,
"epoch": 0.4485981308411215,
"grad_norm": 0.02917688526213169,
"learning_rate": 0.0002,
"loss": 0.5566266179084778,
"mean_token_accuracy": 0.7743457108736038,
"num_tokens": 1959998.0,
"step": 120
},
{
"entropy": 0.5624082386493683,
"epoch": 0.4523364485981308,
"grad_norm": 0.03372650966048241,
"learning_rate": 0.0002,
"loss": 0.5673832297325134,
"mean_token_accuracy": 0.7714631706476212,
"num_tokens": 1976438.0,
"step": 121
},
{
"entropy": 0.5652057379484177,
"epoch": 0.45607476635514016,
"grad_norm": 0.031156128272414207,
"learning_rate": 0.0002,
"loss": 0.5634032487869263,
"mean_token_accuracy": 0.7731290906667709,
"num_tokens": 1992993.0,
"step": 122
},
{
"entropy": 0.5621330291032791,
"epoch": 0.45981308411214955,
"grad_norm": 0.03159690275788307,
"learning_rate": 0.0002,
"loss": 0.5597059726715088,
"mean_token_accuracy": 0.7743693888187408,
"num_tokens": 2009294.0,
"step": 123
},
{
"entropy": 0.558076485991478,
"epoch": 0.4635514018691589,
"grad_norm": 0.032280728220939636,
"learning_rate": 0.0002,
"loss": 0.561931312084198,
"mean_token_accuracy": 0.7742635309696198,
"num_tokens": 2025544.0,
"step": 124
},
{
"entropy": 0.5441709458827972,
"epoch": 0.4672897196261682,
"grad_norm": 0.03219074383378029,
"learning_rate": 0.0002,
"loss": 0.5506591200828552,
"mean_token_accuracy": 0.7746744006872177,
"num_tokens": 2041666.0,
"step": 125
},
{
"entropy": 0.5633633583784103,
"epoch": 0.47102803738317756,
"grad_norm": 0.03131939098238945,
"learning_rate": 0.0002,
"loss": 0.5623766183853149,
"mean_token_accuracy": 0.7734539210796356,
"num_tokens": 2057983.0,
"step": 126
},
{
"entropy": 0.5601471066474915,
"epoch": 0.4747663551401869,
"grad_norm": 0.03067948669195175,
"learning_rate": 0.0002,
"loss": 0.5621774196624756,
"mean_token_accuracy": 0.7716772705316544,
"num_tokens": 2074261.0,
"step": 127
},
{
"entropy": 0.5540204495191574,
"epoch": 0.4785046728971963,
"grad_norm": 0.03339416906237602,
"learning_rate": 0.0002,
"loss": 0.548160970211029,
"mean_token_accuracy": 0.7764931470155716,
"num_tokens": 2090516.0,
"step": 128
},
{
"entropy": 0.552289143204689,
"epoch": 0.4822429906542056,
"grad_norm": 0.031481482088565826,
"learning_rate": 0.0002,
"loss": 0.5535706877708435,
"mean_token_accuracy": 0.7739048302173615,
"num_tokens": 2106672.0,
"step": 129
},
{
"entropy": 0.5568640977144241,
"epoch": 0.48598130841121495,
"grad_norm": 0.028559116646647453,
"learning_rate": 0.0002,
"loss": 0.5580005645751953,
"mean_token_accuracy": 0.7733460515737534,
"num_tokens": 2123117.0,
"step": 130
},
{
"entropy": 0.5648922473192215,
"epoch": 0.4897196261682243,
"grad_norm": 0.029422340914607048,
"learning_rate": 0.0002,
"loss": 0.5628851056098938,
"mean_token_accuracy": 0.7712086588144302,
"num_tokens": 2139369.0,
"step": 131
},
{
"entropy": 0.5373547673225403,
"epoch": 0.4934579439252336,
"grad_norm": 0.030260303989052773,
"learning_rate": 0.0002,
"loss": 0.541597843170166,
"mean_token_accuracy": 0.7806773632764816,
"num_tokens": 2155734.0,
"step": 132
},
{
"entropy": 0.5263249725103378,
"epoch": 0.497196261682243,
"grad_norm": 0.03478972613811493,
"learning_rate": 0.0002,
"loss": 0.5312929153442383,
"mean_token_accuracy": 0.7852403372526169,
"num_tokens": 2171760.0,
"step": 133
},
{
"entropy": 0.5605382472276688,
"epoch": 0.5009345794392523,
"grad_norm": 0.033430542796850204,
"learning_rate": 0.0002,
"loss": 0.5653795599937439,
"mean_token_accuracy": 0.7712585926055908,
"num_tokens": 2188007.0,
"step": 134
},
{
"entropy": 0.5739341080188751,
"epoch": 0.5046728971962616,
"grad_norm": 0.030662760138511658,
"learning_rate": 0.0002,
"loss": 0.5707223415374756,
"mean_token_accuracy": 0.7689347118139267,
"num_tokens": 2204304.0,
"step": 135
},
{
"entropy": 0.5562440007925034,
"epoch": 0.508411214953271,
"grad_norm": 0.029425745829939842,
"learning_rate": 0.0002,
"loss": 0.5517452955245972,
"mean_token_accuracy": 0.7767287492752075,
"num_tokens": 2220312.0,
"step": 136
},
{
"entropy": 0.5788603723049164,
"epoch": 0.5121495327102804,
"grad_norm": 0.033554431051015854,
"learning_rate": 0.0002,
"loss": 0.5720421075820923,
"mean_token_accuracy": 0.7664643228054047,
"num_tokens": 2236563.0,
"step": 137
},
{
"entropy": 0.558774933218956,
"epoch": 0.5158878504672897,
"grad_norm": 0.035832736641168594,
"learning_rate": 0.0002,
"loss": 0.559954822063446,
"mean_token_accuracy": 0.7725366801023483,
"num_tokens": 2252830.0,
"step": 138
},
{
"entropy": 0.554543524980545,
"epoch": 0.5196261682242991,
"grad_norm": 0.03428984060883522,
"learning_rate": 0.0002,
"loss": 0.5592023730278015,
"mean_token_accuracy": 0.772834375500679,
"num_tokens": 2269287.0,
"step": 139
},
{
"entropy": 0.5500677078962326,
"epoch": 0.5233644859813084,
"grad_norm": 0.035624898970127106,
"learning_rate": 0.0002,
"loss": 0.5614656209945679,
"mean_token_accuracy": 0.7710914462804794,
"num_tokens": 2285456.0,
"step": 140
},
{
"entropy": 0.5587853938341141,
"epoch": 0.5271028037383177,
"grad_norm": 0.03407886624336243,
"learning_rate": 0.0002,
"loss": 0.5605294704437256,
"mean_token_accuracy": 0.7720634043216705,
"num_tokens": 2301539.0,
"step": 141
},
{
"entropy": 0.5649153292179108,
"epoch": 0.5308411214953271,
"grad_norm": 0.028877010568976402,
"learning_rate": 0.0002,
"loss": 0.5598087310791016,
"mean_token_accuracy": 0.7749214172363281,
"num_tokens": 2317846.0,
"step": 142
},
{
"entropy": 0.5670332461595535,
"epoch": 0.5345794392523364,
"grad_norm": 0.03278481960296631,
"learning_rate": 0.0002,
"loss": 0.5650190114974976,
"mean_token_accuracy": 0.7726317644119263,
"num_tokens": 2334166.0,
"step": 143
},
{
"entropy": 0.5582242161035538,
"epoch": 0.5383177570093458,
"grad_norm": 0.033217303454875946,
"learning_rate": 0.0002,
"loss": 0.56020587682724,
"mean_token_accuracy": 0.7734358310699463,
"num_tokens": 2350590.0,
"step": 144
},
{
"entropy": 0.5491778552532196,
"epoch": 0.5420560747663551,
"grad_norm": 0.030532008036971092,
"learning_rate": 0.0002,
"loss": 0.5535258650779724,
"mean_token_accuracy": 0.7728464603424072,
"num_tokens": 2367000.0,
"step": 145
},
{
"entropy": 0.5495235919952393,
"epoch": 0.5457943925233645,
"grad_norm": 0.03000551462173462,
"learning_rate": 0.0002,
"loss": 0.549593448638916,
"mean_token_accuracy": 0.7776431441307068,
"num_tokens": 2383493.0,
"step": 146
},
{
"entropy": 0.5404796749353409,
"epoch": 0.5495327102803739,
"grad_norm": 0.03362047299742699,
"learning_rate": 0.0002,
"loss": 0.5460700392723083,
"mean_token_accuracy": 0.7808279991149902,
"num_tokens": 2399803.0,
"step": 147
},
{
"entropy": 0.5644742697477341,
"epoch": 0.5532710280373832,
"grad_norm": 0.031069470569491386,
"learning_rate": 0.0002,
"loss": 0.5680921077728271,
"mean_token_accuracy": 0.7682847529649734,
"num_tokens": 2416029.0,
"step": 148
},
{
"entropy": 0.548800989985466,
"epoch": 0.5570093457943925,
"grad_norm": 0.027548154816031456,
"learning_rate": 0.0002,
"loss": 0.5483176708221436,
"mean_token_accuracy": 0.7775937616825104,
"num_tokens": 2432412.0,
"step": 149
},
{
"entropy": 0.5704467445611954,
"epoch": 0.5607476635514018,
"grad_norm": 0.032674722373485565,
"learning_rate": 0.0002,
"loss": 0.5650383830070496,
"mean_token_accuracy": 0.7691423147916794,
"num_tokens": 2448801.0,
"step": 150
},
{
"entropy": 0.5737617313861847,
"epoch": 0.5644859813084112,
"grad_norm": 0.02663569711148739,
"learning_rate": 0.0002,
"loss": 0.5644318461418152,
"mean_token_accuracy": 0.7708285748958588,
"num_tokens": 2465024.0,
"step": 151
},
{
"entropy": 0.5562496334314346,
"epoch": 0.5682242990654206,
"grad_norm": 0.03284625709056854,
"learning_rate": 0.0002,
"loss": 0.5537476539611816,
"mean_token_accuracy": 0.7753592431545258,
"num_tokens": 2481162.0,
"step": 152
},
{
"entropy": 0.5587188154459,
"epoch": 0.5719626168224299,
"grad_norm": 0.035413194447755814,
"learning_rate": 0.0002,
"loss": 0.5652291178703308,
"mean_token_accuracy": 0.7711132764816284,
"num_tokens": 2497543.0,
"step": 153
},
{
"entropy": 0.5715966671705246,
"epoch": 0.5757009345794393,
"grad_norm": 0.030816730111837387,
"learning_rate": 0.0002,
"loss": 0.5740691423416138,
"mean_token_accuracy": 0.767062708735466,
"num_tokens": 2513719.0,
"step": 154
},
{
"entropy": 0.5732139945030212,
"epoch": 0.5794392523364486,
"grad_norm": 0.031442996114492416,
"learning_rate": 0.0002,
"loss": 0.575890302658081,
"mean_token_accuracy": 0.7688509374856949,
"num_tokens": 2529964.0,
"step": 155
},
{
"entropy": 0.5707177966833115,
"epoch": 0.5831775700934579,
"grad_norm": 0.029468102380633354,
"learning_rate": 0.0002,
"loss": 0.5684511661529541,
"mean_token_accuracy": 0.7719237357378006,
"num_tokens": 2546476.0,
"step": 156
},
{
"entropy": 0.5587103515863419,
"epoch": 0.5869158878504673,
"grad_norm": 0.031475260853767395,
"learning_rate": 0.0002,
"loss": 0.5583993792533875,
"mean_token_accuracy": 0.7759029716253281,
"num_tokens": 2562728.0,
"step": 157
},
{
"entropy": 0.574567124247551,
"epoch": 0.5906542056074766,
"grad_norm": 0.03264502063393593,
"learning_rate": 0.0002,
"loss": 0.5683896541595459,
"mean_token_accuracy": 0.7703035026788712,
"num_tokens": 2578973.0,
"step": 158
},
{
"entropy": 0.5552074015140533,
"epoch": 0.594392523364486,
"grad_norm": 0.032595545053482056,
"learning_rate": 0.0002,
"loss": 0.5574095249176025,
"mean_token_accuracy": 0.7743780612945557,
"num_tokens": 2595151.0,
"step": 159
},
{
"entropy": 0.5568316876888275,
"epoch": 0.5981308411214953,
"grad_norm": 0.033984988927841187,
"learning_rate": 0.0002,
"loss": 0.5642867088317871,
"mean_token_accuracy": 0.7713010758161545,
"num_tokens": 2611492.0,
"step": 160
},
{
"entropy": 0.5599596947431564,
"epoch": 0.6018691588785047,
"grad_norm": 0.031165285035967827,
"learning_rate": 0.0002,
"loss": 0.5589022636413574,
"mean_token_accuracy": 0.7745718359947205,
"num_tokens": 2628012.0,
"step": 161
},
{
"entropy": 0.5476372390985489,
"epoch": 0.6056074766355141,
"grad_norm": 0.0300962645560503,
"learning_rate": 0.0002,
"loss": 0.5493466258049011,
"mean_token_accuracy": 0.7759741544723511,
"num_tokens": 2644335.0,
"step": 162
},
{
"entropy": 0.5408246964216232,
"epoch": 0.6093457943925233,
"grad_norm": 0.03227512910962105,
"learning_rate": 0.0002,
"loss": 0.5468109846115112,
"mean_token_accuracy": 0.7807517051696777,
"num_tokens": 2660464.0,
"step": 163
},
{
"entropy": 0.5610683709383011,
"epoch": 0.6130841121495327,
"grad_norm": 0.033202771097421646,
"learning_rate": 0.0002,
"loss": 0.5660794377326965,
"mean_token_accuracy": 0.7704542577266693,
"num_tokens": 2676703.0,
"step": 164
},
{
"entropy": 0.556282252073288,
"epoch": 0.616822429906542,
"grad_norm": 0.030140740796923637,
"learning_rate": 0.0002,
"loss": 0.5595802664756775,
"mean_token_accuracy": 0.7701904326677322,
"num_tokens": 2692980.0,
"step": 165
},
{
"entropy": 0.5742812305688858,
"epoch": 0.6205607476635514,
"grad_norm": 0.031175116077065468,
"learning_rate": 0.0002,
"loss": 0.5679398775100708,
"mean_token_accuracy": 0.7715850621461868,
"num_tokens": 2709458.0,
"step": 166
},
{
"entropy": 0.5686928480863571,
"epoch": 0.6242990654205608,
"grad_norm": 0.03218809515237808,
"learning_rate": 0.0002,
"loss": 0.570385217666626,
"mean_token_accuracy": 0.7703390866518021,
"num_tokens": 2725878.0,
"step": 167
},
{
"entropy": 0.5649634599685669,
"epoch": 0.6280373831775701,
"grad_norm": 0.03405897319316864,
"learning_rate": 0.0002,
"loss": 0.5623840093612671,
"mean_token_accuracy": 0.7718164473772049,
"num_tokens": 2742230.0,
"step": 168
},
{
"entropy": 0.54586461186409,
"epoch": 0.6317757009345795,
"grad_norm": 0.030788332223892212,
"learning_rate": 0.0002,
"loss": 0.5481584072113037,
"mean_token_accuracy": 0.7789210081100464,
"num_tokens": 2758288.0,
"step": 169
},
{
"entropy": 0.5519826114177704,
"epoch": 0.6355140186915887,
"grad_norm": 0.0393390953540802,
"learning_rate": 0.0002,
"loss": 0.5614264607429504,
"mean_token_accuracy": 0.7715797126293182,
"num_tokens": 2774621.0,
"step": 170
},
{
"entropy": 0.5494296550750732,
"epoch": 0.6392523364485981,
"grad_norm": 0.03524143248796463,
"learning_rate": 0.0002,
"loss": 0.5467370748519897,
"mean_token_accuracy": 0.7793298810720444,
"num_tokens": 2790715.0,
"step": 171
},
{
"entropy": 0.5330041199922562,
"epoch": 0.6429906542056075,
"grad_norm": 0.03651867434382439,
"learning_rate": 0.0002,
"loss": 0.539812445640564,
"mean_token_accuracy": 0.7808443903923035,
"num_tokens": 2806717.0,
"step": 172
},
{
"entropy": 0.5453702062368393,
"epoch": 0.6467289719626168,
"grad_norm": 0.03462547808885574,
"learning_rate": 0.0002,
"loss": 0.5413773655891418,
"mean_token_accuracy": 0.7798687964677811,
"num_tokens": 2823284.0,
"step": 173
},
{
"entropy": 0.5685944706201553,
"epoch": 0.6504672897196262,
"grad_norm": 0.028748901560902596,
"learning_rate": 0.0002,
"loss": 0.5659922361373901,
"mean_token_accuracy": 0.7701825350522995,
"num_tokens": 2839827.0,
"step": 174
},
{
"entropy": 0.5635224878787994,
"epoch": 0.6542056074766355,
"grad_norm": 0.02829919010400772,
"learning_rate": 0.0002,
"loss": 0.5650316476821899,
"mean_token_accuracy": 0.7709458023309708,
"num_tokens": 2856136.0,
"step": 175
},
{
"entropy": 0.5540378838777542,
"epoch": 0.6579439252336449,
"grad_norm": 0.033104948699474335,
"learning_rate": 0.0002,
"loss": 0.5580451488494873,
"mean_token_accuracy": 0.7731391340494156,
"num_tokens": 2872416.0,
"step": 176
},
{
"entropy": 0.5654754340648651,
"epoch": 0.6616822429906543,
"grad_norm": 0.03393986448645592,
"learning_rate": 0.0002,
"loss": 0.566604733467102,
"mean_token_accuracy": 0.768456295132637,
"num_tokens": 2888732.0,
"step": 177
},
{
"entropy": 0.538336843252182,
"epoch": 0.6654205607476635,
"grad_norm": 0.031724728643894196,
"learning_rate": 0.0002,
"loss": 0.5347487926483154,
"mean_token_accuracy": 0.783712849020958,
"num_tokens": 2904747.0,
"step": 178
},
{
"entropy": 0.563370868563652,
"epoch": 0.6691588785046729,
"grad_norm": 0.028497006744146347,
"learning_rate": 0.0002,
"loss": 0.5567288398742676,
"mean_token_accuracy": 0.7762446701526642,
"num_tokens": 2921642.0,
"step": 179
},
{
"entropy": 0.5554675310850143,
"epoch": 0.6728971962616822,
"grad_norm": 0.027588432654738426,
"learning_rate": 0.0002,
"loss": 0.5539284348487854,
"mean_token_accuracy": 0.7720596343278885,
"num_tokens": 2938231.0,
"step": 180
},
{
"entropy": 0.5351214110851288,
"epoch": 0.6766355140186916,
"grad_norm": 0.02989763207733631,
"learning_rate": 0.0002,
"loss": 0.5380938053131104,
"mean_token_accuracy": 0.7797621339559555,
"num_tokens": 2954651.0,
"step": 181
},
{
"entropy": 0.5512963533401489,
"epoch": 0.680373831775701,
"grad_norm": 0.031486768275499344,
"learning_rate": 0.0002,
"loss": 0.559045672416687,
"mean_token_accuracy": 0.7730693370103836,
"num_tokens": 2970900.0,
"step": 182
},
{
"entropy": 0.5643429905176163,
"epoch": 0.6841121495327103,
"grad_norm": 0.030211007222533226,
"learning_rate": 0.0002,
"loss": 0.5652138590812683,
"mean_token_accuracy": 0.7722145467996597,
"num_tokens": 2987276.0,
"step": 183
},
{
"entropy": 0.5449773222208023,
"epoch": 0.6878504672897197,
"grad_norm": 0.03100084885954857,
"learning_rate": 0.0002,
"loss": 0.5516652464866638,
"mean_token_accuracy": 0.7781905680894852,
"num_tokens": 3003582.0,
"step": 184
},
{
"entropy": 0.5534535795450211,
"epoch": 0.6915887850467289,
"grad_norm": 0.029445704072713852,
"learning_rate": 0.0002,
"loss": 0.549251914024353,
"mean_token_accuracy": 0.7758228182792664,
"num_tokens": 3019792.0,
"step": 185
},
{
"entropy": 0.5563573390245438,
"epoch": 0.6953271028037383,
"grad_norm": 0.03839804232120514,
"learning_rate": 0.0002,
"loss": 0.5603447556495667,
"mean_token_accuracy": 0.7714342921972275,
"num_tokens": 3035807.0,
"step": 186
},
{
"entropy": 0.538311779499054,
"epoch": 0.6990654205607477,
"grad_norm": 0.03146633878350258,
"learning_rate": 0.0002,
"loss": 0.5352146625518799,
"mean_token_accuracy": 0.7827797681093216,
"num_tokens": 3051838.0,
"step": 187
},
{
"entropy": 0.5633413791656494,
"epoch": 0.702803738317757,
"grad_norm": 0.02970045432448387,
"learning_rate": 0.0002,
"loss": 0.558843195438385,
"mean_token_accuracy": 0.774773895740509,
"num_tokens": 3068298.0,
"step": 188
},
{
"entropy": 0.5590213239192963,
"epoch": 0.7065420560747664,
"grad_norm": 0.030248312279582024,
"learning_rate": 0.0002,
"loss": 0.5594462156295776,
"mean_token_accuracy": 0.7730938643217087,
"num_tokens": 3084742.0,
"step": 189
},
{
"entropy": 0.5729488730430603,
"epoch": 0.7102803738317757,
"grad_norm": 0.02910761535167694,
"learning_rate": 0.0002,
"loss": 0.5710701942443848,
"mean_token_accuracy": 0.7694995701313019,
"num_tokens": 3101166.0,
"step": 190
},
{
"entropy": 0.5414529591798782,
"epoch": 0.7140186915887851,
"grad_norm": 0.030337564647197723,
"learning_rate": 0.0002,
"loss": 0.5447859168052673,
"mean_token_accuracy": 0.7779805213212967,
"num_tokens": 3117310.0,
"step": 191
},
{
"entropy": 0.5537209510803223,
"epoch": 0.7177570093457943,
"grad_norm": 0.03048059716820717,
"learning_rate": 0.0002,
"loss": 0.5590298771858215,
"mean_token_accuracy": 0.7726654410362244,
"num_tokens": 3133530.0,
"step": 192
},
{
"entropy": 0.5551200062036514,
"epoch": 0.7214953271028037,
"grad_norm": 0.03023671731352806,
"learning_rate": 0.0002,
"loss": 0.5620648860931396,
"mean_token_accuracy": 0.7735067456960678,
"num_tokens": 3149663.0,
"step": 193
},
{
"entropy": 0.5674590468406677,
"epoch": 0.7252336448598131,
"grad_norm": 0.0296547319740057,
"learning_rate": 0.0002,
"loss": 0.5588228702545166,
"mean_token_accuracy": 0.7742174565792084,
"num_tokens": 3166066.0,
"step": 194
},
{
"entropy": 0.5779262185096741,
"epoch": 0.7289719626168224,
"grad_norm": 0.028214752674102783,
"learning_rate": 0.0002,
"loss": 0.572249710559845,
"mean_token_accuracy": 0.7688845992088318,
"num_tokens": 3182640.0,
"step": 195
},
{
"entropy": 0.540147215127945,
"epoch": 0.7327102803738318,
"grad_norm": 0.027666175737977028,
"learning_rate": 0.0002,
"loss": 0.5338530540466309,
"mean_token_accuracy": 0.7832302153110504,
"num_tokens": 3198796.0,
"step": 196
},
{
"entropy": 0.5551275163888931,
"epoch": 0.7364485981308411,
"grad_norm": 0.034123752266168594,
"learning_rate": 0.0002,
"loss": 0.5622342824935913,
"mean_token_accuracy": 0.7688822001218796,
"num_tokens": 3214771.0,
"step": 197
},
{
"entropy": 0.5611921101808548,
"epoch": 0.7401869158878505,
"grad_norm": 0.02890852838754654,
"learning_rate": 0.0002,
"loss": 0.5630607604980469,
"mean_token_accuracy": 0.7698909342288971,
"num_tokens": 3231278.0,
"step": 198
},
{
"entropy": 0.5426182597875595,
"epoch": 0.7439252336448599,
"grad_norm": 0.029497232288122177,
"learning_rate": 0.0002,
"loss": 0.5449106097221375,
"mean_token_accuracy": 0.7783599495887756,
"num_tokens": 3247627.0,
"step": 199
},
{
"entropy": 0.5460454076528549,
"epoch": 0.7476635514018691,
"grad_norm": 0.03151922672986984,
"learning_rate": 0.0002,
"loss": 0.5513307452201843,
"mean_token_accuracy": 0.7761969566345215,
"num_tokens": 3263818.0,
"step": 200
},
{
"entropy": 0.5589698106050491,
"epoch": 0.7514018691588785,
"grad_norm": 0.028974369168281555,
"learning_rate": 0.0002,
"loss": 0.5579357147216797,
"mean_token_accuracy": 0.7737255245447159,
"num_tokens": 3279922.0,
"step": 201
},
{
"entropy": 0.553888663649559,
"epoch": 0.7551401869158878,
"grad_norm": 0.026153914630413055,
"learning_rate": 0.0002,
"loss": 0.550652027130127,
"mean_token_accuracy": 0.7776264399290085,
"num_tokens": 3296366.0,
"step": 202
},
{
"entropy": 0.5686471164226532,
"epoch": 0.7588785046728972,
"grad_norm": 0.028719555586576462,
"learning_rate": 0.0002,
"loss": 0.566332221031189,
"mean_token_accuracy": 0.7694560885429382,
"num_tokens": 3312940.0,
"step": 203
},
{
"entropy": 0.5482725948095322,
"epoch": 0.7626168224299066,
"grad_norm": 0.031571801751852036,
"learning_rate": 0.0002,
"loss": 0.5515249967575073,
"mean_token_accuracy": 0.7790126353502274,
"num_tokens": 3329137.0,
"step": 204
},
{
"entropy": 0.5548627823591232,
"epoch": 0.7663551401869159,
"grad_norm": 0.03189053386449814,
"learning_rate": 0.0002,
"loss": 0.5633711218833923,
"mean_token_accuracy": 0.7717642784118652,
"num_tokens": 3345223.0,
"step": 205
},
{
"entropy": 0.5403945446014404,
"epoch": 0.7700934579439253,
"grad_norm": 0.03444300964474678,
"learning_rate": 0.0002,
"loss": 0.5441574454307556,
"mean_token_accuracy": 0.7791598290205002,
"num_tokens": 3361512.0,
"step": 206
},
{
"entropy": 0.5523678362369537,
"epoch": 0.7738317757009345,
"grad_norm": 0.027761496603488922,
"learning_rate": 0.0002,
"loss": 0.5582634210586548,
"mean_token_accuracy": 0.7723374962806702,
"num_tokens": 3377859.0,
"step": 207
},
{
"entropy": 0.5723598301410675,
"epoch": 0.7775700934579439,
"grad_norm": 0.028997788205742836,
"learning_rate": 0.0002,
"loss": 0.5705980658531189,
"mean_token_accuracy": 0.7668357789516449,
"num_tokens": 3394399.0,
"step": 208
},
{
"entropy": 0.5796838849782944,
"epoch": 0.7813084112149533,
"grad_norm": 0.03271174803376198,
"learning_rate": 0.0002,
"loss": 0.5698305368423462,
"mean_token_accuracy": 0.7698051035404205,
"num_tokens": 3410824.0,
"step": 209
},
{
"entropy": 0.5651015788316727,
"epoch": 0.7850467289719626,
"grad_norm": 0.031869035214185715,
"learning_rate": 0.0002,
"loss": 0.5655361413955688,
"mean_token_accuracy": 0.7697497308254242,
"num_tokens": 3426955.0,
"step": 210
},
{
"entropy": 0.5639242976903915,
"epoch": 0.788785046728972,
"grad_norm": 0.026541458442807198,
"learning_rate": 0.0002,
"loss": 0.5636979341506958,
"mean_token_accuracy": 0.7697752565145493,
"num_tokens": 3443406.0,
"step": 211
},
{
"entropy": 0.5432985424995422,
"epoch": 0.7925233644859813,
"grad_norm": 0.032391466200351715,
"learning_rate": 0.0002,
"loss": 0.5466354489326477,
"mean_token_accuracy": 0.7787620276212692,
"num_tokens": 3459857.0,
"step": 212
},
{
"entropy": 0.546247586607933,
"epoch": 0.7962616822429907,
"grad_norm": 0.03624865412712097,
"learning_rate": 0.0002,
"loss": 0.5477287769317627,
"mean_token_accuracy": 0.7784061878919601,
"num_tokens": 3476064.0,
"step": 213
},
{
"entropy": 0.5712321698665619,
"epoch": 0.8,
"grad_norm": 0.027368342503905296,
"learning_rate": 0.0002,
"loss": 0.5628222823143005,
"mean_token_accuracy": 0.7711902260780334,
"num_tokens": 3492569.0,
"step": 214
},
{
"entropy": 0.5511522740125656,
"epoch": 0.8037383177570093,
"grad_norm": 0.0314224548637867,
"learning_rate": 0.0002,
"loss": 0.546245813369751,
"mean_token_accuracy": 0.777819886803627,
"num_tokens": 3508946.0,
"step": 215
},
{
"entropy": 0.5641316920518875,
"epoch": 0.8074766355140187,
"grad_norm": 0.02934875525534153,
"learning_rate": 0.0002,
"loss": 0.5656546354293823,
"mean_token_accuracy": 0.7672451436519623,
"num_tokens": 3525415.0,
"step": 216
},
{
"entropy": 0.5616082847118378,
"epoch": 0.811214953271028,
"grad_norm": 0.027262428775429726,
"learning_rate": 0.0002,
"loss": 0.5606979131698608,
"mean_token_accuracy": 0.7726116627454758,
"num_tokens": 3541513.0,
"step": 217
},
{
"entropy": 0.5319297313690186,
"epoch": 0.8149532710280374,
"grad_norm": 0.02967401221394539,
"learning_rate": 0.0002,
"loss": 0.5409149527549744,
"mean_token_accuracy": 0.7806787341833115,
"num_tokens": 3557840.0,
"step": 218
},
{
"entropy": 0.5461787581443787,
"epoch": 0.8186915887850468,
"grad_norm": 0.03170184791088104,
"learning_rate": 0.0002,
"loss": 0.5544174313545227,
"mean_token_accuracy": 0.7753637731075287,
"num_tokens": 3574334.0,
"step": 219
},
{
"entropy": 0.5393616706132889,
"epoch": 0.822429906542056,
"grad_norm": 0.02985682338476181,
"learning_rate": 0.0002,
"loss": 0.5457973480224609,
"mean_token_accuracy": 0.7773662656545639,
"num_tokens": 3590741.0,
"step": 220
},
{
"entropy": 0.5554001927375793,
"epoch": 0.8261682242990654,
"grad_norm": 0.02711213380098343,
"learning_rate": 0.0002,
"loss": 0.555370569229126,
"mean_token_accuracy": 0.7716074883937836,
"num_tokens": 3607018.0,
"step": 221
},
{
"entropy": 0.5483701378107071,
"epoch": 0.8299065420560747,
"grad_norm": 0.029320966452360153,
"learning_rate": 0.0002,
"loss": 0.5421203970909119,
"mean_token_accuracy": 0.7806040942668915,
"num_tokens": 3623209.0,
"step": 222
},
{
"entropy": 0.5777206718921661,
"epoch": 0.8336448598130841,
"grad_norm": 0.030610879883170128,
"learning_rate": 0.0002,
"loss": 0.5738532543182373,
"mean_token_accuracy": 0.7664468586444855,
"num_tokens": 3639406.0,
"step": 223
},
{
"entropy": 0.5567807406187057,
"epoch": 0.8373831775700935,
"grad_norm": 0.028399785980582237,
"learning_rate": 0.0002,
"loss": 0.5526878237724304,
"mean_token_accuracy": 0.773535892367363,
"num_tokens": 3655602.0,
"step": 224
},
{
"entropy": 0.530220165848732,
"epoch": 0.8411214953271028,
"grad_norm": 0.03518186882138252,
"learning_rate": 0.0002,
"loss": 0.5408585667610168,
"mean_token_accuracy": 0.779409795999527,
"num_tokens": 3671905.0,
"step": 225
},
{
"entropy": 0.5535659790039062,
"epoch": 0.8448598130841122,
"grad_norm": 0.03929230943322182,
"learning_rate": 0.0002,
"loss": 0.5663979053497314,
"mean_token_accuracy": 0.7698138654232025,
"num_tokens": 3688191.0,
"step": 226
},
{
"entropy": 0.569505363702774,
"epoch": 0.8485981308411215,
"grad_norm": 0.0272939745336771,
"learning_rate": 0.0002,
"loss": 0.5618590712547302,
"mean_token_accuracy": 0.7725658267736435,
"num_tokens": 3704751.0,
"step": 227
},
{
"entropy": 0.5644249469041824,
"epoch": 0.8523364485981308,
"grad_norm": 0.03415616601705551,
"learning_rate": 0.0002,
"loss": 0.5562848448753357,
"mean_token_accuracy": 0.7748490273952484,
"num_tokens": 3720710.0,
"step": 228
},
{
"entropy": 0.5773901343345642,
"epoch": 0.8560747663551402,
"grad_norm": 0.031880877912044525,
"learning_rate": 0.0002,
"loss": 0.5614221096038818,
"mean_token_accuracy": 0.7720403522253036,
"num_tokens": 3737054.0,
"step": 229
},
{
"entropy": 0.5547749698162079,
"epoch": 0.8598130841121495,
"grad_norm": 0.0324094183743,
"learning_rate": 0.0002,
"loss": 0.5520619750022888,
"mean_token_accuracy": 0.7773038893938065,
"num_tokens": 3753537.0,
"step": 230
},
{
"entropy": 0.5418203920125961,
"epoch": 0.8635514018691589,
"grad_norm": 0.03512468561530113,
"learning_rate": 0.0002,
"loss": 0.5538347959518433,
"mean_token_accuracy": 0.7749911546707153,
"num_tokens": 3769863.0,
"step": 231
},
{
"entropy": 0.5521644353866577,
"epoch": 0.8672897196261682,
"grad_norm": 0.02896721474826336,
"learning_rate": 0.0002,
"loss": 0.5608810186386108,
"mean_token_accuracy": 0.7746408581733704,
"num_tokens": 3786316.0,
"step": 232
},
{
"entropy": 0.543023481965065,
"epoch": 0.8710280373831776,
"grad_norm": 0.03712921962141991,
"learning_rate": 0.0002,
"loss": 0.5551246404647827,
"mean_token_accuracy": 0.7738360315561295,
"num_tokens": 3802441.0,
"step": 233
},
{
"entropy": 0.5672542154788971,
"epoch": 0.874766355140187,
"grad_norm": 0.026832984760403633,
"learning_rate": 0.0002,
"loss": 0.5662351846694946,
"mean_token_accuracy": 0.7704236954450607,
"num_tokens": 3818851.0,
"step": 234
},
{
"entropy": 0.5710914433002472,
"epoch": 0.8785046728971962,
"grad_norm": 0.036441151052713394,
"learning_rate": 0.0002,
"loss": 0.5647166967391968,
"mean_token_accuracy": 0.7697651982307434,
"num_tokens": 3835229.0,
"step": 235
},
{
"entropy": 0.5721132010221481,
"epoch": 0.8822429906542056,
"grad_norm": 0.031891413033008575,
"learning_rate": 0.0002,
"loss": 0.561801552772522,
"mean_token_accuracy": 0.7740357220172882,
"num_tokens": 3851634.0,
"step": 236
},
{
"entropy": 0.5430081784725189,
"epoch": 0.8859813084112149,
"grad_norm": 0.028133288025856018,
"learning_rate": 0.0002,
"loss": 0.5482598543167114,
"mean_token_accuracy": 0.7780391424894333,
"num_tokens": 3867818.0,
"step": 237
},
{
"entropy": 0.5531598627567291,
"epoch": 0.8897196261682243,
"grad_norm": 0.031570907682180405,
"learning_rate": 0.0002,
"loss": 0.5597803592681885,
"mean_token_accuracy": 0.7725805789232254,
"num_tokens": 3884128.0,
"step": 238
},
{
"entropy": 0.552057608962059,
"epoch": 0.8934579439252337,
"grad_norm": 0.03431302309036255,
"learning_rate": 0.0002,
"loss": 0.5592586398124695,
"mean_token_accuracy": 0.7739444822072983,
"num_tokens": 3900459.0,
"step": 239
},
{
"entropy": 0.552062600851059,
"epoch": 0.897196261682243,
"grad_norm": 0.029298607259988785,
"learning_rate": 0.0002,
"loss": 0.5525797009468079,
"mean_token_accuracy": 0.7755719870328903,
"num_tokens": 3916582.0,
"step": 240
},
{
"entropy": 0.571002647280693,
"epoch": 0.9009345794392524,
"grad_norm": 0.028903625905513763,
"learning_rate": 0.0002,
"loss": 0.5647273659706116,
"mean_token_accuracy": 0.7697427272796631,
"num_tokens": 3932989.0,
"step": 241
},
{
"entropy": 0.5607190132141113,
"epoch": 0.9046728971962616,
"grad_norm": 0.02721545286476612,
"learning_rate": 0.0002,
"loss": 0.5572564601898193,
"mean_token_accuracy": 0.7735343724489212,
"num_tokens": 3949591.0,
"step": 242
},
{
"entropy": 0.554363563656807,
"epoch": 0.908411214953271,
"grad_norm": 0.028853297233581543,
"learning_rate": 0.0002,
"loss": 0.5598585605621338,
"mean_token_accuracy": 0.7746720314025879,
"num_tokens": 3965977.0,
"step": 243
},
{
"entropy": 0.562399297952652,
"epoch": 0.9121495327102803,
"grad_norm": 0.031765274703502655,
"learning_rate": 0.0002,
"loss": 0.5609657764434814,
"mean_token_accuracy": 0.7706955671310425,
"num_tokens": 3982241.0,
"step": 244
},
{
"entropy": 0.5663948059082031,
"epoch": 0.9158878504672897,
"grad_norm": 0.02977531962096691,
"learning_rate": 0.0002,
"loss": 0.5600242018699646,
"mean_token_accuracy": 0.7716616988182068,
"num_tokens": 3998850.0,
"step": 245
},
{
"entropy": 0.5626737624406815,
"epoch": 0.9196261682242991,
"grad_norm": 0.03073737397789955,
"learning_rate": 0.0002,
"loss": 0.5680803656578064,
"mean_token_accuracy": 0.7690348774194717,
"num_tokens": 4015357.0,
"step": 246
},
{
"entropy": 0.5617063343524933,
"epoch": 0.9233644859813084,
"grad_norm": 0.03239826485514641,
"learning_rate": 0.0002,
"loss": 0.5647311210632324,
"mean_token_accuracy": 0.7720029205083847,
"num_tokens": 4031434.0,
"step": 247
},
{
"entropy": 0.5446989983320236,
"epoch": 0.9271028037383178,
"grad_norm": 0.026935769245028496,
"learning_rate": 0.0002,
"loss": 0.5423059463500977,
"mean_token_accuracy": 0.7784274518489838,
"num_tokens": 4047542.0,
"step": 248
},
{
"entropy": 0.5633901953697205,
"epoch": 0.930841121495327,
"grad_norm": 0.03004775382578373,
"learning_rate": 0.0002,
"loss": 0.5547890663146973,
"mean_token_accuracy": 0.7750878036022186,
"num_tokens": 4063671.0,
"step": 249
},
{
"entropy": 0.5641201138496399,
"epoch": 0.9345794392523364,
"grad_norm": 0.035040173679590225,
"learning_rate": 0.0002,
"loss": 0.560414731502533,
"mean_token_accuracy": 0.7721855938434601,
"num_tokens": 4080062.0,
"step": 250
},
{
"entropy": 0.5267122685909271,
"epoch": 0.9383177570093458,
"grad_norm": 0.026784395799040794,
"learning_rate": 0.0002,
"loss": 0.528884768486023,
"mean_token_accuracy": 0.7842623591423035,
"num_tokens": 4096314.0,
"step": 251
},
{
"entropy": 0.5412785857915878,
"epoch": 0.9420560747663551,
"grad_norm": 0.029483763501048088,
"learning_rate": 0.0002,
"loss": 0.5475237369537354,
"mean_token_accuracy": 0.7779380232095718,
"num_tokens": 4112543.0,
"step": 252
},
{
"entropy": 0.5688454955816269,
"epoch": 0.9457943925233645,
"grad_norm": 0.02722441591322422,
"learning_rate": 0.0002,
"loss": 0.5703037977218628,
"mean_token_accuracy": 0.7700005024671555,
"num_tokens": 4128880.0,
"step": 253
},
{
"entropy": 0.5569160729646683,
"epoch": 0.9495327102803738,
"grad_norm": 0.028683314099907875,
"learning_rate": 0.0002,
"loss": 0.5574289560317993,
"mean_token_accuracy": 0.7722644209861755,
"num_tokens": 4145417.0,
"step": 254
},
{
"entropy": 0.5437170565128326,
"epoch": 0.9532710280373832,
"grad_norm": 0.03323707729578018,
"learning_rate": 0.0002,
"loss": 0.5411959886550903,
"mean_token_accuracy": 0.7814441025257111,
"num_tokens": 4161528.0,
"step": 255
},
{
"entropy": 0.5666731148958206,
"epoch": 0.9570093457943926,
"grad_norm": 0.028484966605901718,
"learning_rate": 0.0002,
"loss": 0.5648545622825623,
"mean_token_accuracy": 0.77223140001297,
"num_tokens": 4177883.0,
"step": 256
},
{
"entropy": 0.5472739338874817,
"epoch": 0.9607476635514018,
"grad_norm": 0.032945599406957626,
"learning_rate": 0.0002,
"loss": 0.5465376377105713,
"mean_token_accuracy": 0.7768394351005554,
"num_tokens": 4194047.0,
"step": 257
},
{
"entropy": 0.5488951653242111,
"epoch": 0.9644859813084112,
"grad_norm": 0.030117738991975784,
"learning_rate": 0.0002,
"loss": 0.5551251769065857,
"mean_token_accuracy": 0.7728994339704514,
"num_tokens": 4210415.0,
"step": 258
},
{
"entropy": 0.5574130117893219,
"epoch": 0.9682242990654205,
"grad_norm": 0.028586212545633316,
"learning_rate": 0.0002,
"loss": 0.5596088171005249,
"mean_token_accuracy": 0.7760643810033798,
"num_tokens": 4226881.0,
"step": 259
},
{
"entropy": 0.5550301373004913,
"epoch": 0.9719626168224299,
"grad_norm": 0.035784922540187836,
"learning_rate": 0.0002,
"loss": 0.5660927891731262,
"mean_token_accuracy": 0.7692493498325348,
"num_tokens": 4243149.0,
"step": 260
},
{
"entropy": 0.5651994347572327,
"epoch": 0.9757009345794393,
"grad_norm": 0.03252053260803223,
"learning_rate": 0.0002,
"loss": 0.5599735379219055,
"mean_token_accuracy": 0.7730003446340561,
"num_tokens": 4259611.0,
"step": 261
},
{
"entropy": 0.5637697577476501,
"epoch": 0.9794392523364486,
"grad_norm": 0.047552503645420074,
"learning_rate": 0.0002,
"loss": 0.5568199157714844,
"mean_token_accuracy": 0.7762705087661743,
"num_tokens": 4275796.0,
"step": 262
},
{
"entropy": 0.567447230219841,
"epoch": 0.983177570093458,
"grad_norm": 0.027801062911748886,
"learning_rate": 0.0002,
"loss": 0.5698356032371521,
"mean_token_accuracy": 0.7690239697694778,
"num_tokens": 4292132.0,
"step": 263
},
{
"entropy": 0.5712171792984009,
"epoch": 0.9869158878504672,
"grad_norm": 0.11246822774410248,
"learning_rate": 0.0002,
"loss": 0.5811023116111755,
"mean_token_accuracy": 0.7647420465946198,
"num_tokens": 4308584.0,
"step": 264
},
{
"entropy": 0.5711934268474579,
"epoch": 0.9906542056074766,
"grad_norm": 0.06911394000053406,
"learning_rate": 0.0002,
"loss": 0.5809019804000854,
"mean_token_accuracy": 0.7624327838420868,
"num_tokens": 4324962.0,
"step": 265
},
{
"entropy": 0.5627400726079941,
"epoch": 0.994392523364486,
"grad_norm": 0.030455252155661583,
"learning_rate": 0.0002,
"loss": 0.5616910457611084,
"mean_token_accuracy": 0.7730111479759216,
"num_tokens": 4341120.0,
"step": 266
},
{
"entropy": 0.5654444992542267,
"epoch": 0.9981308411214953,
"grad_norm": 0.02772046998143196,
"learning_rate": 0.0002,
"loss": 0.5567201972007751,
"mean_token_accuracy": 0.7720088213682175,
"num_tokens": 4357574.0,
"step": 267
},
{
"entropy": 0.5589146912097931,
"epoch": 1.0,
"grad_norm": 0.04032747447490692,
"learning_rate": 0.0002,
"loss": 0.5460503101348877,
"mean_token_accuracy": 0.779203861951828,
"num_tokens": 4365546.0,
"step": 268
},
{
"entropy": 0.5703114420175552,
"epoch": 1.0037383177570094,
"grad_norm": 0.033491045236587524,
"learning_rate": 0.0002,
"loss": 0.5557507276535034,
"mean_token_accuracy": 0.7745671570301056,
"num_tokens": 4381699.0,
"step": 269
},
{
"entropy": 0.5609012693166733,
"epoch": 1.0074766355140188,
"grad_norm": 0.03252531215548515,
"learning_rate": 0.0002,
"loss": 0.5590213537216187,
"mean_token_accuracy": 0.7752612829208374,
"num_tokens": 4398284.0,
"step": 270
},
{
"entropy": 0.5300652086734772,
"epoch": 1.011214953271028,
"grad_norm": 0.036933887749910355,
"learning_rate": 0.0002,
"loss": 0.5396179556846619,
"mean_token_accuracy": 0.7816686779260635,
"num_tokens": 4414795.0,
"step": 271
},
{
"entropy": 0.5411953181028366,
"epoch": 1.0149532710280373,
"grad_norm": 0.035878736525774,
"learning_rate": 0.0002,
"loss": 0.5491203665733337,
"mean_token_accuracy": 0.7742594629526138,
"num_tokens": 4431190.0,
"step": 272
},
{
"entropy": 0.5370450466871262,
"epoch": 1.0186915887850467,
"grad_norm": 0.029914801940321922,
"learning_rate": 0.0002,
"loss": 0.5417315363883972,
"mean_token_accuracy": 0.7806635499000549,
"num_tokens": 4447475.0,
"step": 273
},
{
"entropy": 0.5567668229341507,
"epoch": 1.0224299065420561,
"grad_norm": 0.03265395388007164,
"learning_rate": 0.0002,
"loss": 0.5509355068206787,
"mean_token_accuracy": 0.7730302512645721,
"num_tokens": 4463734.0,
"step": 274
},
{
"entropy": 0.5656838417053223,
"epoch": 1.0261682242990655,
"grad_norm": 0.03136991336941719,
"learning_rate": 0.0002,
"loss": 0.5576434135437012,
"mean_token_accuracy": 0.7703666239976883,
"num_tokens": 4479995.0,
"step": 275
},
{
"entropy": 0.548493430018425,
"epoch": 1.0299065420560747,
"grad_norm": 0.033384647220373154,
"learning_rate": 0.0002,
"loss": 0.5452391505241394,
"mean_token_accuracy": 0.7803221642971039,
"num_tokens": 4496385.0,
"step": 276
},
{
"entropy": 0.547315925359726,
"epoch": 1.033644859813084,
"grad_norm": 0.02812100760638714,
"learning_rate": 0.0002,
"loss": 0.5515413284301758,
"mean_token_accuracy": 0.7755024433135986,
"num_tokens": 4512779.0,
"step": 277
},
{
"entropy": 0.5315467417240143,
"epoch": 1.0373831775700935,
"grad_norm": 0.041606683284044266,
"learning_rate": 0.0002,
"loss": 0.5446295738220215,
"mean_token_accuracy": 0.7787878066301346,
"num_tokens": 4529088.0,
"step": 278
},
{
"entropy": 0.5279169529676437,
"epoch": 1.0411214953271029,
"grad_norm": 0.031057002022862434,
"learning_rate": 0.0002,
"loss": 0.536575973033905,
"mean_token_accuracy": 0.7812807857990265,
"num_tokens": 4545377.0,
"step": 279
},
{
"entropy": 0.5590710490942001,
"epoch": 1.0448598130841122,
"grad_norm": 0.02644682675600052,
"learning_rate": 0.0002,
"loss": 0.554656982421875,
"mean_token_accuracy": 0.7751928865909576,
"num_tokens": 4561701.0,
"step": 280
},
{
"entropy": 0.5662561357021332,
"epoch": 1.0485981308411214,
"grad_norm": 0.029125280678272247,
"learning_rate": 0.0002,
"loss": 0.5619407892227173,
"mean_token_accuracy": 0.7679703086614609,
"num_tokens": 4578007.0,
"step": 281
},
{
"entropy": 0.5509714484214783,
"epoch": 1.0523364485981308,
"grad_norm": 0.03366995230317116,
"learning_rate": 0.0002,
"loss": 0.544794499874115,
"mean_token_accuracy": 0.7797580361366272,
"num_tokens": 4594260.0,
"step": 282
},
{
"entropy": 0.5634302496910095,
"epoch": 1.0560747663551402,
"grad_norm": 0.027832867577672005,
"learning_rate": 0.0002,
"loss": 0.5580713748931885,
"mean_token_accuracy": 0.7739240676164627,
"num_tokens": 4610748.0,
"step": 283
},
{
"entropy": 0.5439006388187408,
"epoch": 1.0598130841121496,
"grad_norm": 0.03045068122446537,
"learning_rate": 0.0002,
"loss": 0.5474724173545837,
"mean_token_accuracy": 0.7765053659677505,
"num_tokens": 4627116.0,
"step": 284
},
{
"entropy": 0.5238615572452545,
"epoch": 1.063551401869159,
"grad_norm": 0.03397069126367569,
"learning_rate": 0.0002,
"loss": 0.532546877861023,
"mean_token_accuracy": 0.7858656197786331,
"num_tokens": 4643480.0,
"step": 285
},
{
"entropy": 0.5387604683637619,
"epoch": 1.0672897196261681,
"grad_norm": 0.036734551191329956,
"learning_rate": 0.0002,
"loss": 0.5468651056289673,
"mean_token_accuracy": 0.7797952890396118,
"num_tokens": 4660303.0,
"step": 286
},
{
"entropy": 0.5558950453996658,
"epoch": 1.0710280373831775,
"grad_norm": 0.030276885256171227,
"learning_rate": 0.0002,
"loss": 0.5584522485733032,
"mean_token_accuracy": 0.7732091248035431,
"num_tokens": 4676839.0,
"step": 287
},
{
"entropy": 0.5617282688617706,
"epoch": 1.074766355140187,
"grad_norm": 0.033773574978113174,
"learning_rate": 0.0002,
"loss": 0.5567758679389954,
"mean_token_accuracy": 0.7739396244287491,
"num_tokens": 4692959.0,
"step": 288
},
{
"entropy": 0.5491297841072083,
"epoch": 1.0785046728971963,
"grad_norm": 0.0321025624871254,
"learning_rate": 0.0002,
"loss": 0.5414766073226929,
"mean_token_accuracy": 0.7804555594921112,
"num_tokens": 4709310.0,
"step": 289
},
{
"entropy": 0.5456965118646622,
"epoch": 1.0822429906542057,
"grad_norm": 0.029098015278577805,
"learning_rate": 0.0002,
"loss": 0.5451281070709229,
"mean_token_accuracy": 0.7778134942054749,
"num_tokens": 4725506.0,
"step": 290
},
{
"entropy": 0.5477775633335114,
"epoch": 1.0859813084112149,
"grad_norm": 0.02958570048213005,
"learning_rate": 0.0002,
"loss": 0.5455498695373535,
"mean_token_accuracy": 0.7799811661243439,
"num_tokens": 4741775.0,
"step": 291
},
{
"entropy": 0.5301359370350838,
"epoch": 1.0897196261682243,
"grad_norm": 0.03702852129936218,
"learning_rate": 0.0002,
"loss": 0.5398594737052917,
"mean_token_accuracy": 0.7832937985658646,
"num_tokens": 4758016.0,
"step": 292
},
{
"entropy": 0.5263582319021225,
"epoch": 1.0934579439252337,
"grad_norm": 0.0337018184363842,
"learning_rate": 0.0002,
"loss": 0.528889000415802,
"mean_token_accuracy": 0.7862381190061569,
"num_tokens": 4774331.0,
"step": 293
},
{
"entropy": 0.5430160015821457,
"epoch": 1.097196261682243,
"grad_norm": 0.036417651921510696,
"learning_rate": 0.0002,
"loss": 0.5521553158760071,
"mean_token_accuracy": 0.7737599611282349,
"num_tokens": 4790501.0,
"step": 294
},
{
"entropy": 0.5552934557199478,
"epoch": 1.1009345794392524,
"grad_norm": 0.03106369823217392,
"learning_rate": 0.0002,
"loss": 0.5559324622154236,
"mean_token_accuracy": 0.7761313170194626,
"num_tokens": 4806597.0,
"step": 295
},
{
"entropy": 0.5548459142446518,
"epoch": 1.1046728971962616,
"grad_norm": 0.031152816489338875,
"learning_rate": 0.0002,
"loss": 0.5504705905914307,
"mean_token_accuracy": 0.7746731489896774,
"num_tokens": 4822650.0,
"step": 296
},
{
"entropy": 0.5644493997097015,
"epoch": 1.108411214953271,
"grad_norm": 0.030590267851948738,
"learning_rate": 0.0002,
"loss": 0.5608450770378113,
"mean_token_accuracy": 0.7722194045782089,
"num_tokens": 4839117.0,
"step": 297
},
{
"entropy": 0.5444105267524719,
"epoch": 1.1121495327102804,
"grad_norm": 0.027887985110282898,
"learning_rate": 0.0002,
"loss": 0.5356480479240417,
"mean_token_accuracy": 0.7835922837257385,
"num_tokens": 4855616.0,
"step": 298
},
{
"entropy": 0.5529257953166962,
"epoch": 1.1158878504672898,
"grad_norm": 0.029403148218989372,
"learning_rate": 0.0002,
"loss": 0.5520183444023132,
"mean_token_accuracy": 0.7763603180646896,
"num_tokens": 4871877.0,
"step": 299
},
{
"entropy": 0.5645637214183807,
"epoch": 1.1196261682242992,
"grad_norm": 0.028178894892334938,
"learning_rate": 0.0002,
"loss": 0.5597948431968689,
"mean_token_accuracy": 0.7721023112535477,
"num_tokens": 4888211.0,
"step": 300
},
{
"entropy": 0.5288026034832001,
"epoch": 1.1233644859813083,
"grad_norm": 0.04107068479061127,
"learning_rate": 0.0002,
"loss": 0.5410320162773132,
"mean_token_accuracy": 0.7809516042470932,
"num_tokens": 4904621.0,
"step": 301
},
{
"entropy": 0.539900153875351,
"epoch": 1.1271028037383177,
"grad_norm": 0.029827676713466644,
"learning_rate": 0.0002,
"loss": 0.5402933955192566,
"mean_token_accuracy": 0.7816860228776932,
"num_tokens": 4921127.0,
"step": 302
},
{
"entropy": 0.5498250722885132,
"epoch": 1.1308411214953271,
"grad_norm": 0.026688000187277794,
"learning_rate": 0.0002,
"loss": 0.5489476323127747,
"mean_token_accuracy": 0.7740818113088608,
"num_tokens": 4937487.0,
"step": 303
},
{
"entropy": 0.5250164270401001,
"epoch": 1.1345794392523365,
"grad_norm": 0.02805374562740326,
"learning_rate": 0.0002,
"loss": 0.5292810797691345,
"mean_token_accuracy": 0.7862300872802734,
"num_tokens": 4953715.0,
"step": 304
},
{
"entropy": 0.5558099746704102,
"epoch": 1.1383177570093457,
"grad_norm": 0.028311913833022118,
"learning_rate": 0.0002,
"loss": 0.553642213344574,
"mean_token_accuracy": 0.772954136133194,
"num_tokens": 4970083.0,
"step": 305
},
{
"entropy": 0.552794486284256,
"epoch": 1.142056074766355,
"grad_norm": 0.02732912451028824,
"learning_rate": 0.0002,
"loss": 0.5542539358139038,
"mean_token_accuracy": 0.7786157876253128,
"num_tokens": 4986475.0,
"step": 306
},
{
"entropy": 0.541429802775383,
"epoch": 1.1457943925233645,
"grad_norm": 0.026043161749839783,
"learning_rate": 0.0002,
"loss": 0.54054194688797,
"mean_token_accuracy": 0.779283881187439,
"num_tokens": 5002946.0,
"step": 307
},
{
"entropy": 0.5385288000106812,
"epoch": 1.1495327102803738,
"grad_norm": 0.029000889509916306,
"learning_rate": 0.0002,
"loss": 0.5392960906028748,
"mean_token_accuracy": 0.7790030539035797,
"num_tokens": 5019257.0,
"step": 308
},
{
"entropy": 0.5650081187486649,
"epoch": 1.1532710280373832,
"grad_norm": 0.030966322869062424,
"learning_rate": 0.0002,
"loss": 0.5671533942222595,
"mean_token_accuracy": 0.7687903195619583,
"num_tokens": 5035694.0,
"step": 309
},
{
"entropy": 0.5269978791475296,
"epoch": 1.1570093457943926,
"grad_norm": 0.029498660936951637,
"learning_rate": 0.0002,
"loss": 0.5207559466362,
"mean_token_accuracy": 0.789651021361351,
"num_tokens": 5051896.0,
"step": 310
},
{
"entropy": 0.536905974149704,
"epoch": 1.1607476635514018,
"grad_norm": 0.030239341780543327,
"learning_rate": 0.0002,
"loss": 0.5469245910644531,
"mean_token_accuracy": 0.7770659476518631,
"num_tokens": 5068088.0,
"step": 311
},
{
"entropy": 0.5390781760215759,
"epoch": 1.1644859813084112,
"grad_norm": 0.03393058478832245,
"learning_rate": 0.0002,
"loss": 0.542595386505127,
"mean_token_accuracy": 0.7818379998207092,
"num_tokens": 5084518.0,
"step": 312
},
{
"entropy": 0.5539942681789398,
"epoch": 1.1682242990654206,
"grad_norm": 0.02896442450582981,
"learning_rate": 0.0002,
"loss": 0.5544940233230591,
"mean_token_accuracy": 0.773167759180069,
"num_tokens": 5101049.0,
"step": 313
},
{
"entropy": 0.5508127510547638,
"epoch": 1.17196261682243,
"grad_norm": 0.0290669035166502,
"learning_rate": 0.0002,
"loss": 0.5456743240356445,
"mean_token_accuracy": 0.7797731012105942,
"num_tokens": 5117401.0,
"step": 314
},
{
"entropy": 0.5471421480178833,
"epoch": 1.1757009345794391,
"grad_norm": 0.03175804764032364,
"learning_rate": 0.0002,
"loss": 0.547149658203125,
"mean_token_accuracy": 0.7758717685937881,
"num_tokens": 5133730.0,
"step": 315
},
{
"entropy": 0.5345856845378876,
"epoch": 1.1794392523364485,
"grad_norm": 0.030823305249214172,
"learning_rate": 0.0002,
"loss": 0.5330408215522766,
"mean_token_accuracy": 0.784162163734436,
"num_tokens": 5149933.0,
"step": 316
},
{
"entropy": 0.5622152835130692,
"epoch": 1.183177570093458,
"grad_norm": 0.035467732697725296,
"learning_rate": 0.0002,
"loss": 0.5626823902130127,
"mean_token_accuracy": 0.7694768160581589,
"num_tokens": 5166513.0,
"step": 317
},
{
"entropy": 0.5603054612874985,
"epoch": 1.1869158878504673,
"grad_norm": 0.03127942234277725,
"learning_rate": 0.0002,
"loss": 0.562260091304779,
"mean_token_accuracy": 0.7705819606781006,
"num_tokens": 5182789.0,
"step": 318
},
{
"entropy": 0.5313067883253098,
"epoch": 1.1906542056074767,
"grad_norm": 0.031915076076984406,
"learning_rate": 0.0002,
"loss": 0.535006046295166,
"mean_token_accuracy": 0.7801574766635895,
"num_tokens": 5198808.0,
"step": 319
},
{
"entropy": 0.5626082420349121,
"epoch": 1.194392523364486,
"grad_norm": 0.0270744226872921,
"learning_rate": 0.0002,
"loss": 0.5664738416671753,
"mean_token_accuracy": 0.7685981392860413,
"num_tokens": 5215173.0,
"step": 320
},
{
"entropy": 0.5448359251022339,
"epoch": 1.1981308411214953,
"grad_norm": 0.034068379551172256,
"learning_rate": 0.0002,
"loss": 0.5446659922599792,
"mean_token_accuracy": 0.7786541432142258,
"num_tokens": 5231488.0,
"step": 321
},
{
"entropy": 0.5552321374416351,
"epoch": 1.2018691588785047,
"grad_norm": 0.027504440397024155,
"learning_rate": 0.0002,
"loss": 0.5556068420410156,
"mean_token_accuracy": 0.7737858295440674,
"num_tokens": 5248043.0,
"step": 322
},
{
"entropy": 0.5611619353294373,
"epoch": 1.205607476635514,
"grad_norm": 0.0314825214445591,
"learning_rate": 0.0002,
"loss": 0.5585416555404663,
"mean_token_accuracy": 0.7727329283952713,
"num_tokens": 5264537.0,
"step": 323
},
{
"entropy": 0.539411261677742,
"epoch": 1.2093457943925234,
"grad_norm": 0.02891836315393448,
"learning_rate": 0.0002,
"loss": 0.542159378528595,
"mean_token_accuracy": 0.7766279429197311,
"num_tokens": 5280701.0,
"step": 324
},
{
"entropy": 0.5438771396875381,
"epoch": 1.2130841121495326,
"grad_norm": 0.030331527814269066,
"learning_rate": 0.0002,
"loss": 0.5439496040344238,
"mean_token_accuracy": 0.7776656746864319,
"num_tokens": 5297144.0,
"step": 325
},
{
"entropy": 0.5600438266992569,
"epoch": 1.216822429906542,
"grad_norm": 0.031427256762981415,
"learning_rate": 0.0002,
"loss": 0.5602800846099854,
"mean_token_accuracy": 0.7731630206108093,
"num_tokens": 5313519.0,
"step": 326
},
{
"entropy": 0.5613888651132584,
"epoch": 1.2205607476635514,
"grad_norm": 0.02703862637281418,
"learning_rate": 0.0002,
"loss": 0.5599865317344666,
"mean_token_accuracy": 0.7733557522296906,
"num_tokens": 5329856.0,
"step": 327
},
{
"entropy": 0.5237439274787903,
"epoch": 1.2242990654205608,
"grad_norm": 0.02758556418120861,
"learning_rate": 0.0002,
"loss": 0.5267841815948486,
"mean_token_accuracy": 0.7867935001850128,
"num_tokens": 5346177.0,
"step": 328
},
{
"entropy": 0.5669067651033401,
"epoch": 1.2280373831775702,
"grad_norm": 0.028242675587534904,
"learning_rate": 0.0002,
"loss": 0.5650265216827393,
"mean_token_accuracy": 0.7703205198049545,
"num_tokens": 5362512.0,
"step": 329
},
{
"entropy": 0.5509548783302307,
"epoch": 1.2317757009345796,
"grad_norm": 0.028802327811717987,
"learning_rate": 0.0002,
"loss": 0.5518352389335632,
"mean_token_accuracy": 0.7750025242567062,
"num_tokens": 5379024.0,
"step": 330
},
{
"entropy": 0.5300867408514023,
"epoch": 1.2355140186915887,
"grad_norm": 0.028508059680461884,
"learning_rate": 0.0002,
"loss": 0.5312294363975525,
"mean_token_accuracy": 0.7825600951910019,
"num_tokens": 5395474.0,
"step": 331
},
{
"entropy": 0.5559873282909393,
"epoch": 1.2392523364485981,
"grad_norm": 0.029974235221743584,
"learning_rate": 0.0002,
"loss": 0.5561782717704773,
"mean_token_accuracy": 0.7731552422046661,
"num_tokens": 5411674.0,
"step": 332
},
{
"entropy": 0.557199090719223,
"epoch": 1.2429906542056075,
"grad_norm": 0.03494254872202873,
"learning_rate": 0.0002,
"loss": 0.5579161643981934,
"mean_token_accuracy": 0.7746251970529556,
"num_tokens": 5428042.0,
"step": 333
},
{
"entropy": 0.5486237108707428,
"epoch": 1.246728971962617,
"grad_norm": 0.03307056799530983,
"learning_rate": 0.0002,
"loss": 0.547027587890625,
"mean_token_accuracy": 0.7762673646211624,
"num_tokens": 5444468.0,
"step": 334
},
{
"entropy": 0.5655098557472229,
"epoch": 1.250467289719626,
"grad_norm": 0.030658213421702385,
"learning_rate": 0.0002,
"loss": 0.5607244372367859,
"mean_token_accuracy": 0.7719737142324448,
"num_tokens": 5460943.0,
"step": 335
},
{
"entropy": 0.5550193935632706,
"epoch": 1.2542056074766355,
"grad_norm": 0.03245887532830238,
"learning_rate": 0.0002,
"loss": 0.558559775352478,
"mean_token_accuracy": 0.7714462429285049,
"num_tokens": 5477095.0,
"step": 336
},
{
"entropy": 0.5516159981489182,
"epoch": 1.2579439252336448,
"grad_norm": 0.029303548857569695,
"learning_rate": 0.0002,
"loss": 0.5509077310562134,
"mean_token_accuracy": 0.7748865634202957,
"num_tokens": 5493314.0,
"step": 337
},
{
"entropy": 0.5517037510871887,
"epoch": 1.2616822429906542,
"grad_norm": 0.030339522287249565,
"learning_rate": 0.0002,
"loss": 0.5531480312347412,
"mean_token_accuracy": 0.7767991721630096,
"num_tokens": 5509491.0,
"step": 338
},
{
"entropy": 0.5280565023422241,
"epoch": 1.2654205607476636,
"grad_norm": 0.031923625618219376,
"learning_rate": 0.0002,
"loss": 0.528035581111908,
"mean_token_accuracy": 0.7852191030979156,
"num_tokens": 5525691.0,
"step": 339
},
{
"entropy": 0.5340898633003235,
"epoch": 1.269158878504673,
"grad_norm": 0.029536927118897438,
"learning_rate": 0.0002,
"loss": 0.5422028303146362,
"mean_token_accuracy": 0.7782081514596939,
"num_tokens": 5541867.0,
"step": 340
},
{
"entropy": 0.5269799679517746,
"epoch": 1.2728971962616822,
"grad_norm": 0.028842000290751457,
"learning_rate": 0.0002,
"loss": 0.5262301564216614,
"mean_token_accuracy": 0.7851875424385071,
"num_tokens": 5558001.0,
"step": 341
},
{
"entropy": 0.5422883927822113,
"epoch": 1.2766355140186916,
"grad_norm": 0.03446980193257332,
"learning_rate": 0.0002,
"loss": 0.5427042245864868,
"mean_token_accuracy": 0.7805773615837097,
"num_tokens": 5574327.0,
"step": 342
},
{
"entropy": 0.5518148094415665,
"epoch": 1.280373831775701,
"grad_norm": 0.027705170214176178,
"learning_rate": 0.0002,
"loss": 0.5506993532180786,
"mean_token_accuracy": 0.7755730003118515,
"num_tokens": 5590749.0,
"step": 343
},
{
"entropy": 0.5408089458942413,
"epoch": 1.2841121495327104,
"grad_norm": 0.029695594683289528,
"learning_rate": 0.0002,
"loss": 0.5394558906555176,
"mean_token_accuracy": 0.7792032957077026,
"num_tokens": 5606965.0,
"step": 344
},
{
"entropy": 0.555278405547142,
"epoch": 1.2878504672897195,
"grad_norm": 0.03306727111339569,
"learning_rate": 0.0002,
"loss": 0.5528630018234253,
"mean_token_accuracy": 0.7753221690654755,
"num_tokens": 5623293.0,
"step": 345
},
{
"entropy": 0.5409073531627655,
"epoch": 1.291588785046729,
"grad_norm": 0.029820574447512627,
"learning_rate": 0.0002,
"loss": 0.5416831970214844,
"mean_token_accuracy": 0.7789396792650223,
"num_tokens": 5639449.0,
"step": 346
},
{
"entropy": 0.5428119450807571,
"epoch": 1.2953271028037383,
"grad_norm": 0.02653786540031433,
"learning_rate": 0.0002,
"loss": 0.5379306077957153,
"mean_token_accuracy": 0.7808004468679428,
"num_tokens": 5655647.0,
"step": 347
},
{
"entropy": 0.5534338802099228,
"epoch": 1.2990654205607477,
"grad_norm": 0.036522869020700455,
"learning_rate": 0.0002,
"loss": 0.5622379779815674,
"mean_token_accuracy": 0.7683994024991989,
"num_tokens": 5672013.0,
"step": 348
},
{
"entropy": 0.5302807092666626,
"epoch": 1.302803738317757,
"grad_norm": 0.029457183554768562,
"learning_rate": 0.0002,
"loss": 0.5294267535209656,
"mean_token_accuracy": 0.7827122360467911,
"num_tokens": 5688450.0,
"step": 349
},
{
"entropy": 0.5444758385419846,
"epoch": 1.3065420560747665,
"grad_norm": 0.029874974861741066,
"learning_rate": 0.0002,
"loss": 0.5353363752365112,
"mean_token_accuracy": 0.7824759036302567,
"num_tokens": 5705038.0,
"step": 350
},
{
"entropy": 0.5528301745653152,
"epoch": 1.3102803738317756,
"grad_norm": 0.029413780197501183,
"learning_rate": 0.0002,
"loss": 0.5467464923858643,
"mean_token_accuracy": 0.7778250128030777,
"num_tokens": 5721143.0,
"step": 351
},
{
"entropy": 0.5555091798305511,
"epoch": 1.314018691588785,
"grad_norm": 0.03153051435947418,
"learning_rate": 0.0002,
"loss": 0.5567013025283813,
"mean_token_accuracy": 0.7745524048805237,
"num_tokens": 5737899.0,
"step": 352
},
{
"entropy": 0.5499187856912613,
"epoch": 1.3177570093457944,
"grad_norm": 0.03486097231507301,
"learning_rate": 0.0002,
"loss": 0.5597171783447266,
"mean_token_accuracy": 0.7737800478935242,
"num_tokens": 5754281.0,
"step": 353
},
{
"entropy": 0.5655581057071686,
"epoch": 1.3214953271028038,
"grad_norm": 0.034320469945669174,
"learning_rate": 0.0002,
"loss": 0.5727288126945496,
"mean_token_accuracy": 0.7656765133142471,
"num_tokens": 5770770.0,
"step": 354
},
{
"entropy": 0.5538551807403564,
"epoch": 1.325233644859813,
"grad_norm": 0.03038712590932846,
"learning_rate": 0.0002,
"loss": 0.5568647384643555,
"mean_token_accuracy": 0.7737635225057602,
"num_tokens": 5787055.0,
"step": 355
},
{
"entropy": 0.5601113438606262,
"epoch": 1.3289719626168224,
"grad_norm": 0.02863963134586811,
"learning_rate": 0.0002,
"loss": 0.5530621409416199,
"mean_token_accuracy": 0.7755090743303299,
"num_tokens": 5803445.0,
"step": 356
},
{
"entropy": 0.5483526140451431,
"epoch": 1.3327102803738318,
"grad_norm": 0.03086850978434086,
"learning_rate": 0.0002,
"loss": 0.5400408506393433,
"mean_token_accuracy": 0.7810002267360687,
"num_tokens": 5819715.0,
"step": 357
},
{
"entropy": 0.5624817609786987,
"epoch": 1.3364485981308412,
"grad_norm": 0.027300981804728508,
"learning_rate": 0.0002,
"loss": 0.5635508894920349,
"mean_token_accuracy": 0.768461674451828,
"num_tokens": 5835943.0,
"step": 358
},
{
"entropy": 0.5395894348621368,
"epoch": 1.3401869158878505,
"grad_norm": 0.030900444835424423,
"learning_rate": 0.0002,
"loss": 0.544026255607605,
"mean_token_accuracy": 0.7806333154439926,
"num_tokens": 5852434.0,
"step": 359
},
{
"entropy": 0.5406174808740616,
"epoch": 1.34392523364486,
"grad_norm": 0.030813222751021385,
"learning_rate": 0.0002,
"loss": 0.545943021774292,
"mean_token_accuracy": 0.7791963070631027,
"num_tokens": 5868855.0,
"step": 360
},
{
"entropy": 0.5282687693834305,
"epoch": 1.347663551401869,
"grad_norm": 0.03219500184059143,
"learning_rate": 0.0002,
"loss": 0.5280976891517639,
"mean_token_accuracy": 0.7882633060216904,
"num_tokens": 5885162.0,
"step": 361
},
{
"entropy": 0.5588660687208176,
"epoch": 1.3514018691588785,
"grad_norm": 0.030664408579468727,
"learning_rate": 0.0002,
"loss": 0.5600679516792297,
"mean_token_accuracy": 0.7683242410421371,
"num_tokens": 5901397.0,
"step": 362
},
{
"entropy": 0.5558361262083054,
"epoch": 1.355140186915888,
"grad_norm": 0.029887903481721878,
"learning_rate": 0.0002,
"loss": 0.5512230396270752,
"mean_token_accuracy": 0.7751856446266174,
"num_tokens": 5917688.0,
"step": 363
},
{
"entropy": 0.5585273951292038,
"epoch": 1.358878504672897,
"grad_norm": 0.030291857197880745,
"learning_rate": 0.0002,
"loss": 0.5574408173561096,
"mean_token_accuracy": 0.7735242694616318,
"num_tokens": 5934252.0,
"step": 364
},
{
"entropy": 0.5426641255617142,
"epoch": 1.3626168224299064,
"grad_norm": 0.03163778409361839,
"learning_rate": 0.0002,
"loss": 0.5456237196922302,
"mean_token_accuracy": 0.77604641020298,
"num_tokens": 5950736.0,
"step": 365
},
{
"entropy": 0.5607275068759918,
"epoch": 1.3663551401869158,
"grad_norm": 0.02867417223751545,
"learning_rate": 0.0002,
"loss": 0.5595529079437256,
"mean_token_accuracy": 0.773354560136795,
"num_tokens": 5967130.0,
"step": 366
},
{
"entropy": 0.554174154996872,
"epoch": 1.3700934579439252,
"grad_norm": 0.03474622219800949,
"learning_rate": 0.0002,
"loss": 0.5513558387756348,
"mean_token_accuracy": 0.7774477899074554,
"num_tokens": 5983303.0,
"step": 367
},
{
"entropy": 0.5479168146848679,
"epoch": 1.3738317757009346,
"grad_norm": 0.03147226572036743,
"learning_rate": 0.0002,
"loss": 0.5468041300773621,
"mean_token_accuracy": 0.7777006030082703,
"num_tokens": 5999776.0,
"step": 368
},
{
"entropy": 0.5567852258682251,
"epoch": 1.377570093457944,
"grad_norm": 0.03519264608621597,
"learning_rate": 0.0002,
"loss": 0.5599963068962097,
"mean_token_accuracy": 0.7709233462810516,
"num_tokens": 6015938.0,
"step": 369
},
{
"entropy": 0.5587522089481354,
"epoch": 1.3813084112149534,
"grad_norm": 0.03433060646057129,
"learning_rate": 0.0002,
"loss": 0.5571247339248657,
"mean_token_accuracy": 0.7718200087547302,
"num_tokens": 6032196.0,
"step": 370
},
{
"entropy": 0.5337067395448685,
"epoch": 1.3850467289719626,
"grad_norm": 0.030834900215268135,
"learning_rate": 0.0002,
"loss": 0.5330364108085632,
"mean_token_accuracy": 0.7854774743318558,
"num_tokens": 6048415.0,
"step": 371
},
{
"entropy": 0.5485008955001831,
"epoch": 1.388785046728972,
"grad_norm": 0.038097940385341644,
"learning_rate": 0.0002,
"loss": 0.5500508546829224,
"mean_token_accuracy": 0.775309219956398,
"num_tokens": 6064562.0,
"step": 372
},
{
"entropy": 0.5520146042108536,
"epoch": 1.3925233644859814,
"grad_norm": 0.02676542103290558,
"learning_rate": 0.0002,
"loss": 0.546633243560791,
"mean_token_accuracy": 0.7763903141021729,
"num_tokens": 6080869.0,
"step": 373
},
{
"entropy": 0.5430674999952316,
"epoch": 1.3962616822429905,
"grad_norm": 0.0291767455637455,
"learning_rate": 0.0002,
"loss": 0.5384376049041748,
"mean_token_accuracy": 0.7846493870019913,
"num_tokens": 6096995.0,
"step": 374
},
{
"entropy": 0.543053463101387,
"epoch": 1.4,
"grad_norm": 0.031880684196949005,
"learning_rate": 0.0002,
"loss": 0.5416824817657471,
"mean_token_accuracy": 0.7807471454143524,
"num_tokens": 6113154.0,
"step": 375
},
{
"entropy": 0.555852085351944,
"epoch": 1.4037383177570093,
"grad_norm": 0.03215760365128517,
"learning_rate": 0.0002,
"loss": 0.5583543181419373,
"mean_token_accuracy": 0.7724814862012863,
"num_tokens": 6129602.0,
"step": 376
},
{
"entropy": 0.5323648005723953,
"epoch": 1.4074766355140187,
"grad_norm": 0.03375270590186119,
"learning_rate": 0.0002,
"loss": 0.5405369400978088,
"mean_token_accuracy": 0.7804393470287323,
"num_tokens": 6145766.0,
"step": 377
},
{
"entropy": 0.5550488829612732,
"epoch": 1.411214953271028,
"grad_norm": 0.029217012226581573,
"learning_rate": 0.0002,
"loss": 0.554684579372406,
"mean_token_accuracy": 0.7745330631732941,
"num_tokens": 6162201.0,
"step": 378
},
{
"entropy": 0.5482346266508102,
"epoch": 1.4149532710280375,
"grad_norm": 0.03129247948527336,
"learning_rate": 0.0002,
"loss": 0.5419821739196777,
"mean_token_accuracy": 0.7780721634626389,
"num_tokens": 6178420.0,
"step": 379
},
{
"entropy": 0.5605264604091644,
"epoch": 1.4186915887850469,
"grad_norm": 0.028088558465242386,
"learning_rate": 0.0002,
"loss": 0.5536739230155945,
"mean_token_accuracy": 0.7760752588510513,
"num_tokens": 6195017.0,
"step": 380
},
{
"entropy": 0.5308103561401367,
"epoch": 1.422429906542056,
"grad_norm": 0.03174047917127609,
"learning_rate": 0.0002,
"loss": 0.5348400473594666,
"mean_token_accuracy": 0.7830243110656738,
"num_tokens": 6211269.0,
"step": 381
},
{
"entropy": 0.5362233817577362,
"epoch": 1.4261682242990654,
"grad_norm": 0.03284025564789772,
"learning_rate": 0.0002,
"loss": 0.5401143431663513,
"mean_token_accuracy": 0.7799562960863113,
"num_tokens": 6227503.0,
"step": 382
},
{
"entropy": 0.5288970768451691,
"epoch": 1.4299065420560748,
"grad_norm": 0.03117184154689312,
"learning_rate": 0.0002,
"loss": 0.5347498655319214,
"mean_token_accuracy": 0.7850797027349472,
"num_tokens": 6243667.0,
"step": 383
},
{
"entropy": 0.5478838980197906,
"epoch": 1.433644859813084,
"grad_norm": 0.0355689711868763,
"learning_rate": 0.0002,
"loss": 0.5515888333320618,
"mean_token_accuracy": 0.7750401347875595,
"num_tokens": 6259958.0,
"step": 384
},
{
"entropy": 0.5556496828794479,
"epoch": 1.4373831775700934,
"grad_norm": 0.03252286836504936,
"learning_rate": 0.0002,
"loss": 0.5527741312980652,
"mean_token_accuracy": 0.7747504711151123,
"num_tokens": 6276256.0,
"step": 385
},
{
"entropy": 0.536173865199089,
"epoch": 1.4411214953271028,
"grad_norm": 0.03125045448541641,
"learning_rate": 0.0002,
"loss": 0.5389170050621033,
"mean_token_accuracy": 0.7826138287782669,
"num_tokens": 6292477.0,
"step": 386
},
{
"entropy": 0.5414228439331055,
"epoch": 1.4448598130841122,
"grad_norm": 0.029693089425563812,
"learning_rate": 0.0002,
"loss": 0.5456768870353699,
"mean_token_accuracy": 0.7780184000730515,
"num_tokens": 6308848.0,
"step": 387
},
{
"entropy": 0.5460960417985916,
"epoch": 1.4485981308411215,
"grad_norm": 0.028725288808345795,
"learning_rate": 0.0002,
"loss": 0.5453904867172241,
"mean_token_accuracy": 0.7754503637552261,
"num_tokens": 6325175.0,
"step": 388
},
{
"entropy": 0.5478474348783493,
"epoch": 1.452336448598131,
"grad_norm": 0.03158194199204445,
"learning_rate": 0.0002,
"loss": 0.5430905818939209,
"mean_token_accuracy": 0.7789453864097595,
"num_tokens": 6341307.0,
"step": 389
},
{
"entropy": 0.5458368062973022,
"epoch": 1.45607476635514,
"grad_norm": 0.02816491760313511,
"learning_rate": 0.0002,
"loss": 0.543704092502594,
"mean_token_accuracy": 0.7792259007692337,
"num_tokens": 6357858.0,
"step": 390
},
{
"entropy": 0.5392302572727203,
"epoch": 1.4598130841121495,
"grad_norm": 0.04157215729355812,
"learning_rate": 0.0002,
"loss": 0.544989287853241,
"mean_token_accuracy": 0.7776051461696625,
"num_tokens": 6373868.0,
"step": 391
},
{
"entropy": 0.5487792640924454,
"epoch": 1.4635514018691589,
"grad_norm": 0.03120332583785057,
"learning_rate": 0.0002,
"loss": 0.5500867962837219,
"mean_token_accuracy": 0.7786511480808258,
"num_tokens": 6390370.0,
"step": 392
},
{
"entropy": 0.5473900437355042,
"epoch": 1.4672897196261683,
"grad_norm": 0.03685331344604492,
"learning_rate": 0.0002,
"loss": 0.5516798496246338,
"mean_token_accuracy": 0.7734636813402176,
"num_tokens": 6406810.0,
"step": 393
},
{
"entropy": 0.5339369177818298,
"epoch": 1.4710280373831774,
"grad_norm": 0.031062059104442596,
"learning_rate": 0.0002,
"loss": 0.5277940034866333,
"mean_token_accuracy": 0.7844891250133514,
"num_tokens": 6423321.0,
"step": 394
},
{
"entropy": 0.5646286159753799,
"epoch": 1.4747663551401868,
"grad_norm": 0.03419705480337143,
"learning_rate": 0.0002,
"loss": 0.560526967048645,
"mean_token_accuracy": 0.7742912471294403,
"num_tokens": 6439751.0,
"step": 395
},
{
"entropy": 0.5566267520189285,
"epoch": 1.4785046728971962,
"grad_norm": 0.030112918466329575,
"learning_rate": 0.0002,
"loss": 0.551886796951294,
"mean_token_accuracy": 0.7758849114179611,
"num_tokens": 6456064.0,
"step": 396
},
{
"entropy": 0.5496308952569962,
"epoch": 1.4822429906542056,
"grad_norm": 0.029358550906181335,
"learning_rate": 0.0002,
"loss": 0.5503244400024414,
"mean_token_accuracy": 0.779025211930275,
"num_tokens": 6472168.0,
"step": 397
},
{
"entropy": 0.5490056574344635,
"epoch": 1.485981308411215,
"grad_norm": 0.03679414093494415,
"learning_rate": 0.0002,
"loss": 0.5532426834106445,
"mean_token_accuracy": 0.77412910759449,
"num_tokens": 6488701.0,
"step": 398
},
{
"entropy": 0.5552525818347931,
"epoch": 1.4897196261682244,
"grad_norm": 0.03460443392395973,
"learning_rate": 0.0002,
"loss": 0.5580930709838867,
"mean_token_accuracy": 0.7725805938243866,
"num_tokens": 6504913.0,
"step": 399
},
{
"entropy": 0.5486905574798584,
"epoch": 1.4934579439252336,
"grad_norm": 0.03757799416780472,
"learning_rate": 0.0002,
"loss": 0.5467075705528259,
"mean_token_accuracy": 0.7737327963113785,
"num_tokens": 6521159.0,
"step": 400
},
{
"entropy": 0.5667891502380371,
"epoch": 1.497196261682243,
"grad_norm": 0.0321633443236351,
"learning_rate": 0.0002,
"loss": 0.5584529042243958,
"mean_token_accuracy": 0.7716430127620697,
"num_tokens": 6537343.0,
"step": 401
},
{
"entropy": 0.560171589255333,
"epoch": 1.5009345794392523,
"grad_norm": 0.027958108112215996,
"learning_rate": 0.0002,
"loss": 0.5571039319038391,
"mean_token_accuracy": 0.7695076316595078,
"num_tokens": 6553654.0,
"step": 402
},
{
"entropy": 0.5325733348727226,
"epoch": 1.5046728971962615,
"grad_norm": 0.03109286166727543,
"learning_rate": 0.0002,
"loss": 0.5371490716934204,
"mean_token_accuracy": 0.7818229347467422,
"num_tokens": 6569830.0,
"step": 403
},
{
"entropy": 0.5464021414518356,
"epoch": 1.508411214953271,
"grad_norm": 0.033921979367733,
"learning_rate": 0.0002,
"loss": 0.5520694255828857,
"mean_token_accuracy": 0.7737181484699249,
"num_tokens": 6586181.0,
"step": 404
},
{
"entropy": 0.5360658913850784,
"epoch": 1.5121495327102803,
"grad_norm": 0.03216444328427315,
"learning_rate": 0.0002,
"loss": 0.539574921131134,
"mean_token_accuracy": 0.7791631668806076,
"num_tokens": 6602220.0,
"step": 405
},
{
"entropy": 0.5452992171049118,
"epoch": 1.5158878504672897,
"grad_norm": 0.02836962789297104,
"learning_rate": 0.0002,
"loss": 0.5482081174850464,
"mean_token_accuracy": 0.7770387381315231,
"num_tokens": 6618603.0,
"step": 406
},
{
"entropy": 0.5549522340297699,
"epoch": 1.519626168224299,
"grad_norm": 0.029138341546058655,
"learning_rate": 0.0002,
"loss": 0.5456300973892212,
"mean_token_accuracy": 0.7779618352651596,
"num_tokens": 6634957.0,
"step": 407
},
{
"entropy": 0.5506550967693329,
"epoch": 1.5233644859813085,
"grad_norm": 0.02889757789671421,
"learning_rate": 0.0002,
"loss": 0.5417683720588684,
"mean_token_accuracy": 0.7772906571626663,
"num_tokens": 6651192.0,
"step": 408
},
{
"entropy": 0.5641747862100601,
"epoch": 1.5271028037383179,
"grad_norm": 0.029291054233908653,
"learning_rate": 0.0002,
"loss": 0.5575106143951416,
"mean_token_accuracy": 0.7736930400133133,
"num_tokens": 6667351.0,
"step": 409
},
{
"entropy": 0.5569720417261124,
"epoch": 1.5308411214953273,
"grad_norm": 0.031217265874147415,
"learning_rate": 0.0002,
"loss": 0.5568684339523315,
"mean_token_accuracy": 0.7742536216974258,
"num_tokens": 6683766.0,
"step": 410
},
{
"entropy": 0.5555198639631271,
"epoch": 1.5345794392523364,
"grad_norm": 0.041470784693956375,
"learning_rate": 0.0002,
"loss": 0.5674223303794861,
"mean_token_accuracy": 0.7700306624174118,
"num_tokens": 6700296.0,
"step": 411
},
{
"entropy": 0.5609412640333176,
"epoch": 1.5383177570093458,
"grad_norm": 0.03198862448334694,
"learning_rate": 0.0002,
"loss": 0.5651755332946777,
"mean_token_accuracy": 0.7717378437519073,
"num_tokens": 6716475.0,
"step": 412
},
{
"entropy": 0.5559493005275726,
"epoch": 1.542056074766355,
"grad_norm": 0.029610617086291313,
"learning_rate": 0.0002,
"loss": 0.5465991497039795,
"mean_token_accuracy": 0.7768793702125549,
"num_tokens": 6732579.0,
"step": 413
},
{
"entropy": 0.5383591949939728,
"epoch": 1.5457943925233644,
"grad_norm": 0.03238457813858986,
"learning_rate": 0.0002,
"loss": 0.5351200699806213,
"mean_token_accuracy": 0.7838361263275146,
"num_tokens": 6748613.0,
"step": 414
},
{
"entropy": 0.5723170787096024,
"epoch": 1.5495327102803738,
"grad_norm": 0.03184224292635918,
"learning_rate": 0.0002,
"loss": 0.5706000328063965,
"mean_token_accuracy": 0.7656203061342239,
"num_tokens": 6764799.0,
"step": 415
},
{
"entropy": 0.5449900329113007,
"epoch": 1.5532710280373832,
"grad_norm": 0.03413036838173866,
"learning_rate": 0.0002,
"loss": 0.5444662570953369,
"mean_token_accuracy": 0.7746504992246628,
"num_tokens": 6781040.0,
"step": 416
},
{
"entropy": 0.5653754621744156,
"epoch": 1.5570093457943925,
"grad_norm": 0.03557061403989792,
"learning_rate": 0.0002,
"loss": 0.5661092400550842,
"mean_token_accuracy": 0.7700045108795166,
"num_tokens": 6797618.0,
"step": 417
},
{
"entropy": 0.5285668075084686,
"epoch": 1.560747663551402,
"grad_norm": 0.02898026816546917,
"learning_rate": 0.0002,
"loss": 0.5310862064361572,
"mean_token_accuracy": 0.7867710143327713,
"num_tokens": 6813889.0,
"step": 418
},
{
"entropy": 0.5591782182455063,
"epoch": 1.5644859813084113,
"grad_norm": 0.03489390015602112,
"learning_rate": 0.0002,
"loss": 0.559260368347168,
"mean_token_accuracy": 0.7742950618267059,
"num_tokens": 6830511.0,
"step": 419
},
{
"entropy": 0.5233039408922195,
"epoch": 1.5682242990654207,
"grad_norm": 0.031120121479034424,
"learning_rate": 0.0002,
"loss": 0.5304787158966064,
"mean_token_accuracy": 0.7851588577032089,
"num_tokens": 6846831.0,
"step": 420
},
{
"entropy": 0.5615075826644897,
"epoch": 1.5719626168224299,
"grad_norm": 0.032532718032598495,
"learning_rate": 0.0002,
"loss": 0.557915985584259,
"mean_token_accuracy": 0.7756024897098541,
"num_tokens": 6863482.0,
"step": 421
},
{
"entropy": 0.5608477592468262,
"epoch": 1.5757009345794393,
"grad_norm": 0.03193405270576477,
"learning_rate": 0.0002,
"loss": 0.5570778250694275,
"mean_token_accuracy": 0.7736349552869797,
"num_tokens": 6879744.0,
"step": 422
},
{
"entropy": 0.5420049726963043,
"epoch": 1.5794392523364484,
"grad_norm": 0.03341756388545036,
"learning_rate": 0.0002,
"loss": 0.5422099828720093,
"mean_token_accuracy": 0.7786398679018021,
"num_tokens": 6895998.0,
"step": 423
},
{
"entropy": 0.5501766800880432,
"epoch": 1.5831775700934578,
"grad_norm": 0.03080238774418831,
"learning_rate": 0.0002,
"loss": 0.543519139289856,
"mean_token_accuracy": 0.779445543885231,
"num_tokens": 6912350.0,
"step": 424
},
{
"entropy": 0.5548175424337387,
"epoch": 1.5869158878504672,
"grad_norm": 0.029699817299842834,
"learning_rate": 0.0002,
"loss": 0.554355263710022,
"mean_token_accuracy": 0.7715099602937698,
"num_tokens": 6928868.0,
"step": 425
},
{
"entropy": 0.5445838496088982,
"epoch": 1.5906542056074766,
"grad_norm": 0.03310444578528404,
"learning_rate": 0.0002,
"loss": 0.5509841442108154,
"mean_token_accuracy": 0.7749770432710648,
"num_tokens": 6945115.0,
"step": 426
},
{
"entropy": 0.5508389323949814,
"epoch": 1.594392523364486,
"grad_norm": 0.03343511372804642,
"learning_rate": 0.0002,
"loss": 0.5527422428131104,
"mean_token_accuracy": 0.7760582268238068,
"num_tokens": 6961606.0,
"step": 427
},
{
"entropy": 0.5455803871154785,
"epoch": 1.5981308411214954,
"grad_norm": 0.030003823339939117,
"learning_rate": 0.0002,
"loss": 0.5433002710342407,
"mean_token_accuracy": 0.7772544771432877,
"num_tokens": 6977721.0,
"step": 428
},
{
"entropy": 0.542354941368103,
"epoch": 1.6018691588785048,
"grad_norm": 0.02921188622713089,
"learning_rate": 0.0002,
"loss": 0.5396295785903931,
"mean_token_accuracy": 0.7784738689661026,
"num_tokens": 6994015.0,
"step": 429
},
{
"entropy": 0.5403562635183334,
"epoch": 1.6056074766355142,
"grad_norm": 0.03267091140151024,
"learning_rate": 0.0002,
"loss": 0.5412419438362122,
"mean_token_accuracy": 0.7828981131315231,
"num_tokens": 7010256.0,
"step": 430
},
{
"entropy": 0.5418384820222855,
"epoch": 1.6093457943925233,
"grad_norm": 0.03328794986009598,
"learning_rate": 0.0002,
"loss": 0.5415868163108826,
"mean_token_accuracy": 0.7787100970745087,
"num_tokens": 7026538.0,
"step": 431
},
{
"entropy": 0.5569044798612595,
"epoch": 1.6130841121495327,
"grad_norm": 0.03399523347616196,
"learning_rate": 0.0002,
"loss": 0.5610039830207825,
"mean_token_accuracy": 0.7681904435157776,
"num_tokens": 7042821.0,
"step": 432
},
{
"entropy": 0.5516158491373062,
"epoch": 1.616822429906542,
"grad_norm": 0.041675642132759094,
"learning_rate": 0.0002,
"loss": 0.5512884855270386,
"mean_token_accuracy": 0.7792385816574097,
"num_tokens": 7059278.0,
"step": 433
},
{
"entropy": 0.5493542701005936,
"epoch": 1.6205607476635513,
"grad_norm": 0.029840141534805298,
"learning_rate": 0.0002,
"loss": 0.5508259534835815,
"mean_token_accuracy": 0.7764638513326645,
"num_tokens": 7075675.0,
"step": 434
},
{
"entropy": 0.5415777564048767,
"epoch": 1.6242990654205607,
"grad_norm": 0.04138097167015076,
"learning_rate": 0.0002,
"loss": 0.540780246257782,
"mean_token_accuracy": 0.7806251496076584,
"num_tokens": 7091803.0,
"step": 435
},
{
"entropy": 0.5550828725099564,
"epoch": 1.62803738317757,
"grad_norm": 0.03500202298164368,
"learning_rate": 0.0002,
"loss": 0.5536463856697083,
"mean_token_accuracy": 0.7767235636711121,
"num_tokens": 7108257.0,
"step": 436
},
{
"entropy": 0.5612530559301376,
"epoch": 1.6317757009345795,
"grad_norm": 0.029145153239369392,
"learning_rate": 0.0002,
"loss": 0.5608190894126892,
"mean_token_accuracy": 0.7731182426214218,
"num_tokens": 7124785.0,
"step": 437
},
{
"entropy": 0.5527195036411285,
"epoch": 1.6355140186915889,
"grad_norm": 0.035749297589063644,
"learning_rate": 0.0002,
"loss": 0.5629845857620239,
"mean_token_accuracy": 0.7721443176269531,
"num_tokens": 7141265.0,
"step": 438
},
{
"entropy": 0.5614519417285919,
"epoch": 1.6392523364485982,
"grad_norm": 0.033001191914081573,
"learning_rate": 0.0002,
"loss": 0.5560024976730347,
"mean_token_accuracy": 0.7749044448137283,
"num_tokens": 7157859.0,
"step": 439
},
{
"entropy": 0.5537575930356979,
"epoch": 1.6429906542056076,
"grad_norm": 0.026474064216017723,
"learning_rate": 0.0002,
"loss": 0.5511392951011658,
"mean_token_accuracy": 0.7752827405929565,
"num_tokens": 7174159.0,
"step": 440
},
{
"entropy": 0.5490387231111526,
"epoch": 1.6467289719626168,
"grad_norm": 0.03137727826833725,
"learning_rate": 0.0002,
"loss": 0.5470349192619324,
"mean_token_accuracy": 0.7756170034408569,
"num_tokens": 7190518.0,
"step": 441
},
{
"entropy": 0.5602337867021561,
"epoch": 1.6504672897196262,
"grad_norm": 0.0327768549323082,
"learning_rate": 0.0002,
"loss": 0.5596269369125366,
"mean_token_accuracy": 0.7712970525026321,
"num_tokens": 7206832.0,
"step": 442
},
{
"entropy": 0.5407531261444092,
"epoch": 1.6542056074766354,
"grad_norm": 0.0337577648460865,
"learning_rate": 0.0002,
"loss": 0.5448312759399414,
"mean_token_accuracy": 0.7795456647872925,
"num_tokens": 7222967.0,
"step": 443
},
{
"entropy": 0.5409540086984634,
"epoch": 1.6579439252336448,
"grad_norm": 0.03192588686943054,
"learning_rate": 0.0002,
"loss": 0.5484352111816406,
"mean_token_accuracy": 0.7764406651258469,
"num_tokens": 7239342.0,
"step": 444
},
{
"entropy": 0.5369711667299271,
"epoch": 1.6616822429906541,
"grad_norm": 0.029282715171575546,
"learning_rate": 0.0002,
"loss": 0.5391625165939331,
"mean_token_accuracy": 0.7777595669031143,
"num_tokens": 7255685.0,
"step": 445
},
{
"entropy": 0.5320119112730026,
"epoch": 1.6654205607476635,
"grad_norm": 0.03132037818431854,
"learning_rate": 0.0002,
"loss": 0.5324081182479858,
"mean_token_accuracy": 0.7831796556711197,
"num_tokens": 7271873.0,
"step": 446
},
{
"entropy": 0.5473773181438446,
"epoch": 1.669158878504673,
"grad_norm": 0.029359478503465652,
"learning_rate": 0.0002,
"loss": 0.5430581569671631,
"mean_token_accuracy": 0.780887171626091,
"num_tokens": 7288229.0,
"step": 447
},
{
"entropy": 0.5577313005924225,
"epoch": 1.6728971962616823,
"grad_norm": 0.0312592051923275,
"learning_rate": 0.0002,
"loss": 0.5549578070640564,
"mean_token_accuracy": 0.7755182534456253,
"num_tokens": 7304562.0,
"step": 448
},
{
"entropy": 0.5430529564619064,
"epoch": 1.6766355140186917,
"grad_norm": 0.036848753690719604,
"learning_rate": 0.0002,
"loss": 0.5486578941345215,
"mean_token_accuracy": 0.7793130427598953,
"num_tokens": 7320789.0,
"step": 449
},
{
"entropy": 0.5367421358823776,
"epoch": 1.680373831775701,
"grad_norm": 0.03133554011583328,
"learning_rate": 0.0002,
"loss": 0.5428006649017334,
"mean_token_accuracy": 0.7791069746017456,
"num_tokens": 7336720.0,
"step": 450
},
{
"entropy": 0.5608862638473511,
"epoch": 1.6841121495327103,
"grad_norm": 0.033135656267404556,
"learning_rate": 0.0002,
"loss": 0.5513461828231812,
"mean_token_accuracy": 0.7747347801923752,
"num_tokens": 7353115.0,
"step": 451
},
{
"entropy": 0.5476694256067276,
"epoch": 1.6878504672897197,
"grad_norm": 0.02974470518529415,
"learning_rate": 0.0002,
"loss": 0.5473049879074097,
"mean_token_accuracy": 0.7776686698198318,
"num_tokens": 7369302.0,
"step": 452
},
{
"entropy": 0.5416230708360672,
"epoch": 1.6915887850467288,
"grad_norm": 0.0338185578584671,
"learning_rate": 0.0002,
"loss": 0.5420779585838318,
"mean_token_accuracy": 0.7770841121673584,
"num_tokens": 7385486.0,
"step": 453
},
{
"entropy": 0.5354430079460144,
"epoch": 1.6953271028037382,
"grad_norm": 0.04928300157189369,
"learning_rate": 0.0002,
"loss": 0.5383298397064209,
"mean_token_accuracy": 0.7825010567903519,
"num_tokens": 7401834.0,
"step": 454
},
{
"entropy": 0.5533457249403,
"epoch": 1.6990654205607476,
"grad_norm": 0.03868211433291435,
"learning_rate": 0.0002,
"loss": 0.5589519739151001,
"mean_token_accuracy": 0.7741620242595673,
"num_tokens": 7418328.0,
"step": 455
},
{
"entropy": 0.5337075442075729,
"epoch": 1.702803738317757,
"grad_norm": 0.03012922592461109,
"learning_rate": 0.0002,
"loss": 0.5302947163581848,
"mean_token_accuracy": 0.7835781127214432,
"num_tokens": 7434426.0,
"step": 456
},
{
"entropy": 0.5648263692855835,
"epoch": 1.7065420560747664,
"grad_norm": 0.028873439878225327,
"learning_rate": 0.0002,
"loss": 0.5585320591926575,
"mean_token_accuracy": 0.7732219845056534,
"num_tokens": 7451036.0,
"step": 457
},
{
"entropy": 0.5839773565530777,
"epoch": 1.7102803738317758,
"grad_norm": 0.033153235912323,
"learning_rate": 0.0002,
"loss": 0.5761073231697083,
"mean_token_accuracy": 0.7669852823019028,
"num_tokens": 7467359.0,
"step": 458
},
{
"entropy": 0.5488205403089523,
"epoch": 1.7140186915887852,
"grad_norm": 0.032065052539110184,
"learning_rate": 0.0002,
"loss": 0.5483813285827637,
"mean_token_accuracy": 0.7763916105031967,
"num_tokens": 7483649.0,
"step": 459
},
{
"entropy": 0.5411174297332764,
"epoch": 1.7177570093457943,
"grad_norm": 0.0323743000626564,
"learning_rate": 0.0002,
"loss": 0.5461615920066833,
"mean_token_accuracy": 0.7778149843215942,
"num_tokens": 7500070.0,
"step": 460
},
{
"entropy": 0.533783033490181,
"epoch": 1.7214953271028037,
"grad_norm": 0.03367235139012337,
"learning_rate": 0.0002,
"loss": 0.5427653193473816,
"mean_token_accuracy": 0.7805494964122772,
"num_tokens": 7516529.0,
"step": 461
},
{
"entropy": 0.5454732924699783,
"epoch": 1.7252336448598131,
"grad_norm": 0.034071460366249084,
"learning_rate": 0.0002,
"loss": 0.5546566247940063,
"mean_token_accuracy": 0.7736624777317047,
"num_tokens": 7533025.0,
"step": 462
},
{
"entropy": 0.5454118698835373,
"epoch": 1.7289719626168223,
"grad_norm": 0.03127819299697876,
"learning_rate": 0.0002,
"loss": 0.5452259182929993,
"mean_token_accuracy": 0.7759493589401245,
"num_tokens": 7549482.0,
"step": 463
},
{
"entropy": 0.5667081475257874,
"epoch": 1.7327102803738317,
"grad_norm": 0.0311261173337698,
"learning_rate": 0.0002,
"loss": 0.5610095858573914,
"mean_token_accuracy": 0.772314265370369,
"num_tokens": 7565748.0,
"step": 464
},
{
"entropy": 0.5310934036970139,
"epoch": 1.736448598130841,
"grad_norm": 0.03265678882598877,
"learning_rate": 0.0002,
"loss": 0.5214373469352722,
"mean_token_accuracy": 0.7887950539588928,
"num_tokens": 7582052.0,
"step": 465
},
{
"entropy": 0.5556392967700958,
"epoch": 1.7401869158878505,
"grad_norm": 0.03034058026969433,
"learning_rate": 0.0002,
"loss": 0.5505704283714294,
"mean_token_accuracy": 0.7774366736412048,
"num_tokens": 7598174.0,
"step": 466
},
{
"entropy": 0.5393192917108536,
"epoch": 1.7439252336448599,
"grad_norm": 0.0359746590256691,
"learning_rate": 0.0002,
"loss": 0.5477877259254456,
"mean_token_accuracy": 0.7797855734825134,
"num_tokens": 7614503.0,
"step": 467
},
{
"entropy": 0.551783487200737,
"epoch": 1.7476635514018692,
"grad_norm": 0.03548724204301834,
"learning_rate": 0.0002,
"loss": 0.5540840029716492,
"mean_token_accuracy": 0.7747608870267868,
"num_tokens": 7630814.0,
"step": 468
},
{
"entropy": 0.5413367450237274,
"epoch": 1.7514018691588786,
"grad_norm": 0.034123897552490234,
"learning_rate": 0.0002,
"loss": 0.5470243692398071,
"mean_token_accuracy": 0.779376894235611,
"num_tokens": 7647376.0,
"step": 469
},
{
"entropy": 0.5412023663520813,
"epoch": 1.7551401869158878,
"grad_norm": 0.03561440855264664,
"learning_rate": 0.0002,
"loss": 0.5472733378410339,
"mean_token_accuracy": 0.7762201726436615,
"num_tokens": 7663345.0,
"step": 470
},
{
"entropy": 0.549220860004425,
"epoch": 1.7588785046728972,
"grad_norm": 0.02905275858938694,
"learning_rate": 0.0002,
"loss": 0.541520893573761,
"mean_token_accuracy": 0.7792876809835434,
"num_tokens": 7679585.0,
"step": 471
},
{
"entropy": 0.5333058834075928,
"epoch": 1.7626168224299066,
"grad_norm": 0.03320024162530899,
"learning_rate": 0.0002,
"loss": 0.5264161229133606,
"mean_token_accuracy": 0.7870939522981644,
"num_tokens": 7695719.0,
"step": 472
},
{
"entropy": 0.5468353033065796,
"epoch": 1.7663551401869158,
"grad_norm": 0.03256339579820633,
"learning_rate": 0.0002,
"loss": 0.5458404421806335,
"mean_token_accuracy": 0.778706505894661,
"num_tokens": 7711803.0,
"step": 473
},
{
"entropy": 0.536187469959259,
"epoch": 1.7700934579439251,
"grad_norm": 0.03339603543281555,
"learning_rate": 0.0002,
"loss": 0.5392374992370605,
"mean_token_accuracy": 0.7822528183460236,
"num_tokens": 7728002.0,
"step": 474
},
{
"entropy": 0.5286234319210052,
"epoch": 1.7738317757009345,
"grad_norm": 0.033285900950431824,
"learning_rate": 0.0002,
"loss": 0.5358365774154663,
"mean_token_accuracy": 0.7836114317178726,
"num_tokens": 7744366.0,
"step": 475
},
{
"entropy": 0.5403973311185837,
"epoch": 1.777570093457944,
"grad_norm": 0.028936821967363358,
"learning_rate": 0.0002,
"loss": 0.5398406386375427,
"mean_token_accuracy": 0.7814478874206543,
"num_tokens": 7760549.0,
"step": 476
},
{
"entropy": 0.5419041812419891,
"epoch": 1.7813084112149533,
"grad_norm": 0.03836261108517647,
"learning_rate": 0.0002,
"loss": 0.5494267344474792,
"mean_token_accuracy": 0.775143027305603,
"num_tokens": 7776621.0,
"step": 477
},
{
"entropy": 0.5589816868305206,
"epoch": 1.7850467289719627,
"grad_norm": 0.03261716663837433,
"learning_rate": 0.0002,
"loss": 0.5496556758880615,
"mean_token_accuracy": 0.775287851691246,
"num_tokens": 7792949.0,
"step": 478
},
{
"entropy": 0.5772902369499207,
"epoch": 1.788785046728972,
"grad_norm": 0.03729069605469704,
"learning_rate": 0.0002,
"loss": 0.5730117559432983,
"mean_token_accuracy": 0.7676824629306793,
"num_tokens": 7809233.0,
"step": 479
},
{
"entropy": 0.5505616068840027,
"epoch": 1.7925233644859813,
"grad_norm": 0.0271653700619936,
"learning_rate": 0.0002,
"loss": 0.5481145977973938,
"mean_token_accuracy": 0.7766467928886414,
"num_tokens": 7825604.0,
"step": 480
},
{
"entropy": 0.5539548844099045,
"epoch": 1.7962616822429907,
"grad_norm": 0.035687919706106186,
"learning_rate": 0.0002,
"loss": 0.5536059737205505,
"mean_token_accuracy": 0.7723885625600815,
"num_tokens": 7841764.0,
"step": 481
},
{
"entropy": 0.548996701836586,
"epoch": 1.8,
"grad_norm": 0.03167950361967087,
"learning_rate": 0.0002,
"loss": 0.5525107383728027,
"mean_token_accuracy": 0.7743307799100876,
"num_tokens": 7857918.0,
"step": 482
},
{
"entropy": 0.5371337532997131,
"epoch": 1.8037383177570092,
"grad_norm": 0.03125729039311409,
"learning_rate": 0.0002,
"loss": 0.5431434512138367,
"mean_token_accuracy": 0.7770611643791199,
"num_tokens": 7874375.0,
"step": 483
},
{
"entropy": 0.5534856170415878,
"epoch": 1.8074766355140186,
"grad_norm": 0.03495310619473457,
"learning_rate": 0.0002,
"loss": 0.5606104731559753,
"mean_token_accuracy": 0.7701490819454193,
"num_tokens": 7890503.0,
"step": 484
},
{
"entropy": 0.5570873767137527,
"epoch": 1.811214953271028,
"grad_norm": 0.031059635803103447,
"learning_rate": 0.0002,
"loss": 0.5577523112297058,
"mean_token_accuracy": 0.7766271531581879,
"num_tokens": 7906740.0,
"step": 485
},
{
"entropy": 0.549734815955162,
"epoch": 1.8149532710280374,
"grad_norm": 0.029658785089850426,
"learning_rate": 0.0002,
"loss": 0.5459674000740051,
"mean_token_accuracy": 0.778388187289238,
"num_tokens": 7923366.0,
"step": 486
},
{
"entropy": 0.556487500667572,
"epoch": 1.8186915887850468,
"grad_norm": 0.03030308522284031,
"learning_rate": 0.0002,
"loss": 0.5487005710601807,
"mean_token_accuracy": 0.7778837084770203,
"num_tokens": 7939678.0,
"step": 487
},
{
"entropy": 0.5620574653148651,
"epoch": 1.8224299065420562,
"grad_norm": 0.03321143984794617,
"learning_rate": 0.0002,
"loss": 0.5632344484329224,
"mean_token_accuracy": 0.771716520190239,
"num_tokens": 7955824.0,
"step": 488
},
{
"entropy": 0.5325201749801636,
"epoch": 1.8261682242990656,
"grad_norm": 0.0296145249158144,
"learning_rate": 0.0002,
"loss": 0.5337831377983093,
"mean_token_accuracy": 0.7806598991155624,
"num_tokens": 7971945.0,
"step": 489
},
{
"entropy": 0.5530183613300323,
"epoch": 1.8299065420560747,
"grad_norm": 0.04490596428513527,
"learning_rate": 0.0002,
"loss": 0.5658998489379883,
"mean_token_accuracy": 0.7682041078805923,
"num_tokens": 7988395.0,
"step": 490
},
{
"entropy": 0.540508821606636,
"epoch": 1.8336448598130841,
"grad_norm": 0.03253109008073807,
"learning_rate": 0.0002,
"loss": 0.5402263402938843,
"mean_token_accuracy": 0.7800282388925552,
"num_tokens": 8004443.0,
"step": 491
},
{
"entropy": 0.5511161684989929,
"epoch": 1.8373831775700935,
"grad_norm": 0.030638035386800766,
"learning_rate": 0.0002,
"loss": 0.5421851277351379,
"mean_token_accuracy": 0.7774636000394821,
"num_tokens": 8020850.0,
"step": 492
},
{
"entropy": 0.5710225850343704,
"epoch": 1.8411214953271027,
"grad_norm": 0.029152031987905502,
"learning_rate": 0.0002,
"loss": 0.5603572130203247,
"mean_token_accuracy": 0.7699873447418213,
"num_tokens": 8037043.0,
"step": 493
},
{
"entropy": 0.5580283105373383,
"epoch": 1.844859813084112,
"grad_norm": 0.030489208176732063,
"learning_rate": 0.0002,
"loss": 0.5527392625808716,
"mean_token_accuracy": 0.7742099016904831,
"num_tokens": 8053631.0,
"step": 494
},
{
"entropy": 0.5568618625402451,
"epoch": 1.8485981308411215,
"grad_norm": 0.03116370178759098,
"learning_rate": 0.0002,
"loss": 0.557203471660614,
"mean_token_accuracy": 0.7757259756326675,
"num_tokens": 8069679.0,
"step": 495
},
{
"entropy": 0.5572323054075241,
"epoch": 1.8523364485981308,
"grad_norm": 0.03199765831232071,
"learning_rate": 0.0002,
"loss": 0.5623334646224976,
"mean_token_accuracy": 0.7726736217737198,
"num_tokens": 8086185.0,
"step": 496
},
{
"entropy": 0.5608405023813248,
"epoch": 1.8560747663551402,
"grad_norm": 0.03123069368302822,
"learning_rate": 0.0002,
"loss": 0.5668354630470276,
"mean_token_accuracy": 0.7697951197624207,
"num_tokens": 8102680.0,
"step": 497
},
{
"entropy": 0.5482483208179474,
"epoch": 1.8598130841121496,
"grad_norm": 0.03388088196516037,
"learning_rate": 0.0002,
"loss": 0.5544660091400146,
"mean_token_accuracy": 0.7736243009567261,
"num_tokens": 8119206.0,
"step": 498
},
{
"entropy": 0.5743024945259094,
"epoch": 1.863551401869159,
"grad_norm": 0.027546290308237076,
"learning_rate": 0.0002,
"loss": 0.5691558718681335,
"mean_token_accuracy": 0.7669505923986435,
"num_tokens": 8135686.0,
"step": 499
},
{
"entropy": 0.5571306794881821,
"epoch": 1.8672897196261682,
"grad_norm": 0.03095332719385624,
"learning_rate": 0.0002,
"loss": 0.5527883172035217,
"mean_token_accuracy": 0.7751508802175522,
"num_tokens": 8151938.0,
"step": 500
},
{
"entropy": 0.5444643199443817,
"epoch": 1.8710280373831776,
"grad_norm": 0.03176809847354889,
"learning_rate": 0.0002,
"loss": 0.5450653433799744,
"mean_token_accuracy": 0.7778386175632477,
"num_tokens": 8168369.0,
"step": 501
},
{
"entropy": 0.5318097025156021,
"epoch": 1.874766355140187,
"grad_norm": 0.03216860815882683,
"learning_rate": 0.0002,
"loss": 0.5350679159164429,
"mean_token_accuracy": 0.7839819490909576,
"num_tokens": 8184441.0,
"step": 502
},
{
"entropy": 0.5431730151176453,
"epoch": 1.8785046728971961,
"grad_norm": 0.031609971076250076,
"learning_rate": 0.0002,
"loss": 0.5454133152961731,
"mean_token_accuracy": 0.7757967710494995,
"num_tokens": 8200701.0,
"step": 503
},
{
"entropy": 0.5446748435497284,
"epoch": 1.8822429906542055,
"grad_norm": 0.03689466044306755,
"learning_rate": 0.0002,
"loss": 0.5491172075271606,
"mean_token_accuracy": 0.7771103084087372,
"num_tokens": 8216896.0,
"step": 504
},
{
"entropy": 0.5379506647586823,
"epoch": 1.885981308411215,
"grad_norm": 0.03774857521057129,
"learning_rate": 0.0002,
"loss": 0.5465993881225586,
"mean_token_accuracy": 0.7745991945266724,
"num_tokens": 8233119.0,
"step": 505
},
{
"entropy": 0.5524174273014069,
"epoch": 1.8897196261682243,
"grad_norm": 0.03127999231219292,
"learning_rate": 0.0002,
"loss": 0.552331268787384,
"mean_token_accuracy": 0.7734175026416779,
"num_tokens": 8249424.0,
"step": 506
},
{
"entropy": 0.5634707659482956,
"epoch": 1.8934579439252337,
"grad_norm": 0.03172188624739647,
"learning_rate": 0.0002,
"loss": 0.5552417039871216,
"mean_token_accuracy": 0.7762156277894974,
"num_tokens": 8265823.0,
"step": 507
},
{
"entropy": 0.5733916610479355,
"epoch": 1.897196261682243,
"grad_norm": 0.041391924023628235,
"learning_rate": 0.0002,
"loss": 0.5685185790061951,
"mean_token_accuracy": 0.7656967639923096,
"num_tokens": 8282150.0,
"step": 508
},
{
"entropy": 0.5633519440889359,
"epoch": 1.9009345794392525,
"grad_norm": 0.03210509195923805,
"learning_rate": 0.0002,
"loss": 0.5575313568115234,
"mean_token_accuracy": 0.7736276984214783,
"num_tokens": 8298545.0,
"step": 509
},
{
"entropy": 0.5282728672027588,
"epoch": 1.9046728971962616,
"grad_norm": 0.031000696122646332,
"learning_rate": 0.0002,
"loss": 0.5271653532981873,
"mean_token_accuracy": 0.7857028245925903,
"num_tokens": 8314750.0,
"step": 510
},
{
"entropy": 0.5598197877407074,
"epoch": 1.908411214953271,
"grad_norm": 0.03814297169446945,
"learning_rate": 0.0002,
"loss": 0.5556469559669495,
"mean_token_accuracy": 0.7734071165323257,
"num_tokens": 8331160.0,
"step": 511
},
{
"entropy": 0.5301484763622284,
"epoch": 1.9121495327102802,
"grad_norm": 0.03675490617752075,
"learning_rate": 0.0002,
"loss": 0.5384268760681152,
"mean_token_accuracy": 0.7815950363874435,
"num_tokens": 8347524.0,
"step": 512
},
{
"entropy": 0.556285485625267,
"epoch": 1.9158878504672896,
"grad_norm": 0.03204094246029854,
"learning_rate": 0.0002,
"loss": 0.5582637190818787,
"mean_token_accuracy": 0.7725251466035843,
"num_tokens": 8363738.0,
"step": 513
},
{
"entropy": 0.5535630583763123,
"epoch": 1.919626168224299,
"grad_norm": 0.030629510059952736,
"learning_rate": 0.0002,
"loss": 0.5578333735466003,
"mean_token_accuracy": 0.7727056741714478,
"num_tokens": 8380122.0,
"step": 514
},
{
"entropy": 0.5471296161413193,
"epoch": 1.9233644859813084,
"grad_norm": 0.03401264175772667,
"learning_rate": 0.0002,
"loss": 0.5535186529159546,
"mean_token_accuracy": 0.7754651010036469,
"num_tokens": 8396440.0,
"step": 515
},
{
"entropy": 0.5500332862138748,
"epoch": 1.9271028037383178,
"grad_norm": 0.03108939900994301,
"learning_rate": 0.0002,
"loss": 0.5485121607780457,
"mean_token_accuracy": 0.7769151926040649,
"num_tokens": 8412740.0,
"step": 516
},
{
"entropy": 0.5605651885271072,
"epoch": 1.9308411214953272,
"grad_norm": 0.028515921905636787,
"learning_rate": 0.0002,
"loss": 0.5516760349273682,
"mean_token_accuracy": 0.7752381414175034,
"num_tokens": 8429081.0,
"step": 517
},
{
"entropy": 0.5527090132236481,
"epoch": 1.9345794392523366,
"grad_norm": 0.032440509647130966,
"learning_rate": 0.0002,
"loss": 0.5482094883918762,
"mean_token_accuracy": 0.776523694396019,
"num_tokens": 8445459.0,
"step": 518
},
{
"entropy": 0.5639519840478897,
"epoch": 1.938317757009346,
"grad_norm": 0.03387531265616417,
"learning_rate": 0.0002,
"loss": 0.565314769744873,
"mean_token_accuracy": 0.7686825692653656,
"num_tokens": 8461834.0,
"step": 519
},
{
"entropy": 0.5390266180038452,
"epoch": 1.9420560747663551,
"grad_norm": 0.02882574312388897,
"learning_rate": 0.0002,
"loss": 0.5430452823638916,
"mean_token_accuracy": 0.7774745523929596,
"num_tokens": 8478272.0,
"step": 520
},
{
"entropy": 0.5343397557735443,
"epoch": 1.9457943925233645,
"grad_norm": 0.030860040336847305,
"learning_rate": 0.0002,
"loss": 0.5347194075584412,
"mean_token_accuracy": 0.7817697376012802,
"num_tokens": 8494437.0,
"step": 521
},
{
"entropy": 0.5492627769708633,
"epoch": 1.9495327102803737,
"grad_norm": 0.03405896574258804,
"learning_rate": 0.0002,
"loss": 0.5500932335853577,
"mean_token_accuracy": 0.7765759974718094,
"num_tokens": 8510975.0,
"step": 522
},
{
"entropy": 0.5563263446092606,
"epoch": 1.953271028037383,
"grad_norm": 0.03141237422823906,
"learning_rate": 0.0002,
"loss": 0.557966947555542,
"mean_token_accuracy": 0.7717025876045227,
"num_tokens": 8527347.0,
"step": 523
},
{
"entropy": 0.5636772364377975,
"epoch": 1.9570093457943925,
"grad_norm": 0.03168516606092453,
"learning_rate": 0.0002,
"loss": 0.5611008405685425,
"mean_token_accuracy": 0.7714557945728302,
"num_tokens": 8543551.0,
"step": 524
},
{
"entropy": 0.5489466190338135,
"epoch": 1.9607476635514018,
"grad_norm": 0.03355073928833008,
"learning_rate": 0.0002,
"loss": 0.5395604372024536,
"mean_token_accuracy": 0.7807340919971466,
"num_tokens": 8559955.0,
"step": 525
},
{
"entropy": 0.5399315655231476,
"epoch": 1.9644859813084112,
"grad_norm": 0.03453009948134422,
"learning_rate": 0.0002,
"loss": 0.5348931550979614,
"mean_token_accuracy": 0.7806299477815628,
"num_tokens": 8576469.0,
"step": 526
},
{
"entropy": 0.5491375476121902,
"epoch": 1.9682242990654206,
"grad_norm": 0.0316200815141201,
"learning_rate": 0.0002,
"loss": 0.5556234121322632,
"mean_token_accuracy": 0.773221030831337,
"num_tokens": 8592906.0,
"step": 527
},
{
"entropy": 0.5373014956712723,
"epoch": 1.97196261682243,
"grad_norm": 0.032452452927827835,
"learning_rate": 0.0002,
"loss": 0.5457467436790466,
"mean_token_accuracy": 0.7758653908967972,
"num_tokens": 8609100.0,
"step": 528
},
{
"entropy": 0.5414352118968964,
"epoch": 1.9757009345794394,
"grad_norm": 0.03351645544171333,
"learning_rate": 0.0002,
"loss": 0.5482410788536072,
"mean_token_accuracy": 0.7752601951360703,
"num_tokens": 8625316.0,
"step": 529
},
{
"entropy": 0.5407055169343948,
"epoch": 1.9794392523364486,
"grad_norm": 0.03003384917974472,
"learning_rate": 0.0002,
"loss": 0.5356785655021667,
"mean_token_accuracy": 0.7822994440793991,
"num_tokens": 8641716.0,
"step": 530
},
{
"entropy": 0.5463829636573792,
"epoch": 1.983177570093458,
"grad_norm": 0.028586186468601227,
"learning_rate": 0.0002,
"loss": 0.5386159420013428,
"mean_token_accuracy": 0.7832934260368347,
"num_tokens": 8658117.0,
"step": 531
},
{
"entropy": 0.52997986972332,
"epoch": 1.9869158878504671,
"grad_norm": 0.03231372311711311,
"learning_rate": 0.0002,
"loss": 0.5258426666259766,
"mean_token_accuracy": 0.786494106054306,
"num_tokens": 8674098.0,
"step": 532
},
{
"entropy": 0.5263413488864899,
"epoch": 1.9906542056074765,
"grad_norm": 0.029255473986268044,
"learning_rate": 0.0002,
"loss": 0.5267069935798645,
"mean_token_accuracy": 0.784383550286293,
"num_tokens": 8690474.0,
"step": 533
},
{
"entropy": 0.5337765663862228,
"epoch": 1.994392523364486,
"grad_norm": 0.03723280131816864,
"learning_rate": 0.0002,
"loss": 0.5434689521789551,
"mean_token_accuracy": 0.7792166471481323,
"num_tokens": 8706774.0,
"step": 534
},
{
"entropy": 0.5302833914756775,
"epoch": 1.9981308411214953,
"grad_norm": 0.03789842873811722,
"learning_rate": 0.0002,
"loss": 0.5390503406524658,
"mean_token_accuracy": 0.7825159579515457,
"num_tokens": 8722988.0,
"step": 535
},
{
"entropy": 0.5365387499332428,
"epoch": 2.0,
"grad_norm": 0.03994116187095642,
"learning_rate": 0.0002,
"loss": 0.5442785024642944,
"mean_token_accuracy": 0.779285341501236,
"num_tokens": 8731086.0,
"step": 536
},
{
"entropy": 0.5551358312368393,
"epoch": 2.0037383177570094,
"grad_norm": 0.03304925188422203,
"learning_rate": 0.0002,
"loss": 0.5366768836975098,
"mean_token_accuracy": 0.7850453853607178,
"num_tokens": 8747251.0,
"step": 537
},
{
"entropy": 0.5637228041887283,
"epoch": 2.007476635514019,
"grad_norm": 0.03504426032304764,
"learning_rate": 0.0002,
"loss": 0.5443665981292725,
"mean_token_accuracy": 0.7774000763893127,
"num_tokens": 8763427.0,
"step": 538
},
{
"entropy": 0.5427139699459076,
"epoch": 2.011214953271028,
"grad_norm": 0.03504855930805206,
"learning_rate": 0.0002,
"loss": 0.5313124656677246,
"mean_token_accuracy": 0.7818376272916794,
"num_tokens": 8779836.0,
"step": 539
},
{
"entropy": 0.5330108106136322,
"epoch": 2.0149532710280376,
"grad_norm": 0.03754406422376633,
"learning_rate": 0.0002,
"loss": 0.5421642661094666,
"mean_token_accuracy": 0.7790561318397522,
"num_tokens": 8796325.0,
"step": 540
},
{
"entropy": 0.512071430683136,
"epoch": 2.0186915887850465,
"grad_norm": 0.043662529438734055,
"learning_rate": 0.0002,
"loss": 0.5302350521087646,
"mean_token_accuracy": 0.7863733917474747,
"num_tokens": 8812606.0,
"step": 541
},
{
"entropy": 0.5129958391189575,
"epoch": 2.022429906542056,
"grad_norm": 0.04149031639099121,
"learning_rate": 0.0002,
"loss": 0.5309258699417114,
"mean_token_accuracy": 0.7860198318958282,
"num_tokens": 8828882.0,
"step": 542
},
{
"entropy": 0.5420234501361847,
"epoch": 2.0261682242990653,
"grad_norm": 0.03192834183573723,
"learning_rate": 0.0002,
"loss": 0.5397300124168396,
"mean_token_accuracy": 0.7826980352401733,
"num_tokens": 8845360.0,
"step": 543
},
{
"entropy": 0.5496412217617035,
"epoch": 2.0299065420560747,
"grad_norm": 0.03798922896385193,
"learning_rate": 0.0002,
"loss": 0.5328091979026794,
"mean_token_accuracy": 0.7848182171583176,
"num_tokens": 8861741.0,
"step": 544
},
{
"entropy": 0.5499916076660156,
"epoch": 2.033644859813084,
"grad_norm": 0.03497615084052086,
"learning_rate": 0.0002,
"loss": 0.5330801010131836,
"mean_token_accuracy": 0.7823185920715332,
"num_tokens": 8878099.0,
"step": 545
},
{
"entropy": 0.5397230982780457,
"epoch": 2.0373831775700935,
"grad_norm": 0.03805805742740631,
"learning_rate": 0.0002,
"loss": 0.5325009822845459,
"mean_token_accuracy": 0.7835113406181335,
"num_tokens": 8894613.0,
"step": 546
},
{
"entropy": 0.5198622792959213,
"epoch": 2.041121495327103,
"grad_norm": 0.03364388644695282,
"learning_rate": 0.0002,
"loss": 0.5222806334495544,
"mean_token_accuracy": 0.7844293862581253,
"num_tokens": 8910849.0,
"step": 547
},
{
"entropy": 0.5255338400602341,
"epoch": 2.0448598130841122,
"grad_norm": 0.047903481870889664,
"learning_rate": 0.0002,
"loss": 0.5388204455375671,
"mean_token_accuracy": 0.7818868011236191,
"num_tokens": 8927305.0,
"step": 548
},
{
"entropy": 0.5240660309791565,
"epoch": 2.0485981308411216,
"grad_norm": 0.04678136110305786,
"learning_rate": 0.0002,
"loss": 0.544981062412262,
"mean_token_accuracy": 0.7767013013362885,
"num_tokens": 8943628.0,
"step": 549
},
{
"entropy": 0.5418435484170914,
"epoch": 2.052336448598131,
"grad_norm": 0.04154983535408974,
"learning_rate": 0.0002,
"loss": 0.5431923866271973,
"mean_token_accuracy": 0.7803478538990021,
"num_tokens": 8959739.0,
"step": 550
},
{
"entropy": 0.5464048683643341,
"epoch": 2.05607476635514,
"grad_norm": 0.03621891885995865,
"learning_rate": 0.0002,
"loss": 0.5369123220443726,
"mean_token_accuracy": 0.7831740379333496,
"num_tokens": 8975834.0,
"step": 551
},
{
"entropy": 0.5625316351652145,
"epoch": 2.0598130841121494,
"grad_norm": 0.04116278514266014,
"learning_rate": 0.0002,
"loss": 0.5496330261230469,
"mean_token_accuracy": 0.7770462930202484,
"num_tokens": 8992265.0,
"step": 552
},
{
"entropy": 0.5488497316837311,
"epoch": 2.0635514018691588,
"grad_norm": 0.03322463855147362,
"learning_rate": 0.0002,
"loss": 0.5367662310600281,
"mean_token_accuracy": 0.7818718105554581,
"num_tokens": 9008719.0,
"step": 553
},
{
"entropy": 0.5378982275724411,
"epoch": 2.067289719626168,
"grad_norm": 0.034129269421100616,
"learning_rate": 0.0002,
"loss": 0.5418792963027954,
"mean_token_accuracy": 0.7807257324457169,
"num_tokens": 9025151.0,
"step": 554
},
{
"entropy": 0.5220974087715149,
"epoch": 2.0710280373831775,
"grad_norm": 0.045197054743766785,
"learning_rate": 0.0002,
"loss": 0.5300080180168152,
"mean_token_accuracy": 0.7885446846485138,
"num_tokens": 9041486.0,
"step": 555
},
{
"entropy": 0.515913613140583,
"epoch": 2.074766355140187,
"grad_norm": 0.04399452358484268,
"learning_rate": 0.0002,
"loss": 0.5253356099128723,
"mean_token_accuracy": 0.787113681435585,
"num_tokens": 9057792.0,
"step": 556
},
{
"entropy": 0.529649943113327,
"epoch": 2.0785046728971963,
"grad_norm": 0.0405830517411232,
"learning_rate": 0.0002,
"loss": 0.5332399010658264,
"mean_token_accuracy": 0.7825795114040375,
"num_tokens": 9073971.0,
"step": 557
},
{
"entropy": 0.5306390672922134,
"epoch": 2.0822429906542057,
"grad_norm": 0.04040224850177765,
"learning_rate": 0.0002,
"loss": 0.5270552039146423,
"mean_token_accuracy": 0.7854219824075699,
"num_tokens": 9090396.0,
"step": 558
},
{
"entropy": 0.540916696190834,
"epoch": 2.085981308411215,
"grad_norm": 0.039850566536188126,
"learning_rate": 0.0002,
"loss": 0.5330172181129456,
"mean_token_accuracy": 0.7840156704187393,
"num_tokens": 9106865.0,
"step": 559
},
{
"entropy": 0.5573539286851883,
"epoch": 2.0897196261682245,
"grad_norm": 0.039134591817855835,
"learning_rate": 0.0002,
"loss": 0.5492205023765564,
"mean_token_accuracy": 0.7779581248760223,
"num_tokens": 9123213.0,
"step": 560
},
{
"entropy": 0.5308785140514374,
"epoch": 2.0934579439252334,
"grad_norm": 0.033643938601017,
"learning_rate": 0.0002,
"loss": 0.5260533690452576,
"mean_token_accuracy": 0.7881509810686111,
"num_tokens": 9139334.0,
"step": 561
},
{
"entropy": 0.5462942272424698,
"epoch": 2.097196261682243,
"grad_norm": 0.0343049094080925,
"learning_rate": 0.0002,
"loss": 0.5453207492828369,
"mean_token_accuracy": 0.7791396528482437,
"num_tokens": 9155964.0,
"step": 562
},
{
"entropy": 0.5272018313407898,
"epoch": 2.100934579439252,
"grad_norm": 0.040583785623311996,
"learning_rate": 0.0002,
"loss": 0.5357244610786438,
"mean_token_accuracy": 0.7829957753419876,
"num_tokens": 9172409.0,
"step": 563
},
{
"entropy": 0.5276166945695877,
"epoch": 2.1046728971962616,
"grad_norm": 0.03636649623513222,
"learning_rate": 0.0002,
"loss": 0.5361207127571106,
"mean_token_accuracy": 0.7831525951623917,
"num_tokens": 9188524.0,
"step": 564
},
{
"entropy": 0.5464211106300354,
"epoch": 2.108411214953271,
"grad_norm": 0.0365222692489624,
"learning_rate": 0.0002,
"loss": 0.5448060035705566,
"mean_token_accuracy": 0.7774559408426285,
"num_tokens": 9204803.0,
"step": 565
},
{
"entropy": 0.5368735194206238,
"epoch": 2.1121495327102804,
"grad_norm": 0.04034702479839325,
"learning_rate": 0.0002,
"loss": 0.5308568477630615,
"mean_token_accuracy": 0.784459188580513,
"num_tokens": 9220931.0,
"step": 566
},
{
"entropy": 0.5340090990066528,
"epoch": 2.1158878504672898,
"grad_norm": 0.03558754175901413,
"learning_rate": 0.0002,
"loss": 0.5307760238647461,
"mean_token_accuracy": 0.7841941863298416,
"num_tokens": 9237402.0,
"step": 567
},
{
"entropy": 0.554409846663475,
"epoch": 2.119626168224299,
"grad_norm": 0.038797035813331604,
"learning_rate": 0.0002,
"loss": 0.5491658449172974,
"mean_token_accuracy": 0.7782745659351349,
"num_tokens": 9254002.0,
"step": 568
},
{
"entropy": 0.546349972486496,
"epoch": 2.1233644859813086,
"grad_norm": 0.04194206744432449,
"learning_rate": 0.0002,
"loss": 0.5519090294837952,
"mean_token_accuracy": 0.7750387489795685,
"num_tokens": 9270313.0,
"step": 569
},
{
"entropy": 0.5365971177816391,
"epoch": 2.127102803738318,
"grad_norm": 0.045358605682849884,
"learning_rate": 0.0002,
"loss": 0.5437461733818054,
"mean_token_accuracy": 0.7794076204299927,
"num_tokens": 9286712.0,
"step": 570
},
{
"entropy": 0.5360657125711441,
"epoch": 2.130841121495327,
"grad_norm": 0.04332416132092476,
"learning_rate": 0.0002,
"loss": 0.5378158688545227,
"mean_token_accuracy": 0.7812185734510422,
"num_tokens": 9302929.0,
"step": 571
},
{
"entropy": 0.5161439999938011,
"epoch": 2.1345794392523363,
"grad_norm": 0.03498893231153488,
"learning_rate": 0.0002,
"loss": 0.5166691541671753,
"mean_token_accuracy": 0.7898645251989365,
"num_tokens": 9318970.0,
"step": 572
},
{
"entropy": 0.5420155078172684,
"epoch": 2.1383177570093457,
"grad_norm": 0.059223148971796036,
"learning_rate": 0.0002,
"loss": 0.5398759841918945,
"mean_token_accuracy": 0.7814654260873795,
"num_tokens": 9335490.0,
"step": 573
},
{
"entropy": 0.5263395309448242,
"epoch": 2.142056074766355,
"grad_norm": 0.03245805576443672,
"learning_rate": 0.0002,
"loss": 0.5229323506355286,
"mean_token_accuracy": 0.7877913564443588,
"num_tokens": 9351959.0,
"step": 574
},
{
"entropy": 0.5362307131290436,
"epoch": 2.1457943925233645,
"grad_norm": 0.037454549223184586,
"learning_rate": 0.0002,
"loss": 0.5291175246238708,
"mean_token_accuracy": 0.783667266368866,
"num_tokens": 9368360.0,
"step": 575
},
{
"entropy": 0.527548685669899,
"epoch": 2.149532710280374,
"grad_norm": 0.043125126510858536,
"learning_rate": 0.0002,
"loss": 0.5279426574707031,
"mean_token_accuracy": 0.7838954478502274,
"num_tokens": 9384665.0,
"step": 576
},
{
"entropy": 0.543443351984024,
"epoch": 2.1532710280373832,
"grad_norm": 0.03840547800064087,
"learning_rate": 0.0002,
"loss": 0.5481908321380615,
"mean_token_accuracy": 0.7762167900800705,
"num_tokens": 9400994.0,
"step": 577
},
{
"entropy": 0.5402033478021622,
"epoch": 2.1570093457943926,
"grad_norm": 0.04524662345647812,
"learning_rate": 0.0002,
"loss": 0.5483248829841614,
"mean_token_accuracy": 0.7753354609012604,
"num_tokens": 9417287.0,
"step": 578
},
{
"entropy": 0.5183399319648743,
"epoch": 2.160747663551402,
"grad_norm": 0.033803943544626236,
"learning_rate": 0.0002,
"loss": 0.5152841210365295,
"mean_token_accuracy": 0.7872842252254486,
"num_tokens": 9433683.0,
"step": 579
},
{
"entropy": 0.5163632705807686,
"epoch": 2.1644859813084114,
"grad_norm": 0.036510877311229706,
"learning_rate": 0.0002,
"loss": 0.5149884223937988,
"mean_token_accuracy": 0.7905207723379135,
"num_tokens": 9450137.0,
"step": 580
},
{
"entropy": 0.5321061164140701,
"epoch": 2.1682242990654204,
"grad_norm": 0.0464416965842247,
"learning_rate": 0.0002,
"loss": 0.5351567268371582,
"mean_token_accuracy": 0.7838670462369919,
"num_tokens": 9466550.0,
"step": 581
},
{
"entropy": 0.5199630409479141,
"epoch": 2.1719626168224297,
"grad_norm": 0.04309747740626335,
"learning_rate": 0.0002,
"loss": 0.5278782844543457,
"mean_token_accuracy": 0.7839005291461945,
"num_tokens": 9482588.0,
"step": 582
},
{
"entropy": 0.5339600071310997,
"epoch": 2.175700934579439,
"grad_norm": 0.04095384106040001,
"learning_rate": 0.0002,
"loss": 0.5310637354850769,
"mean_token_accuracy": 0.783690795302391,
"num_tokens": 9498951.0,
"step": 583
},
{
"entropy": 0.5384320765733719,
"epoch": 2.1794392523364485,
"grad_norm": 0.03863927349448204,
"learning_rate": 0.0002,
"loss": 0.540824294090271,
"mean_token_accuracy": 0.7791530042886734,
"num_tokens": 9515132.0,
"step": 584
},
{
"entropy": 0.5549707859754562,
"epoch": 2.183177570093458,
"grad_norm": 0.03921306133270264,
"learning_rate": 0.0002,
"loss": 0.5536147356033325,
"mean_token_accuracy": 0.7751126140356064,
"num_tokens": 9531512.0,
"step": 585
},
{
"entropy": 0.5347359776496887,
"epoch": 2.1869158878504673,
"grad_norm": 0.037864800542593,
"learning_rate": 0.0002,
"loss": 0.5341432094573975,
"mean_token_accuracy": 0.7835363298654556,
"num_tokens": 9547534.0,
"step": 586
},
{
"entropy": 0.5516605377197266,
"epoch": 2.1906542056074767,
"grad_norm": 0.036846909672021866,
"learning_rate": 0.0002,
"loss": 0.5443211197853088,
"mean_token_accuracy": 0.7788311243057251,
"num_tokens": 9564040.0,
"step": 587
},
{
"entropy": 0.5391202419996262,
"epoch": 2.194392523364486,
"grad_norm": 0.03954128175973892,
"learning_rate": 0.0002,
"loss": 0.5309199094772339,
"mean_token_accuracy": 0.783383384346962,
"num_tokens": 9580289.0,
"step": 588
},
{
"entropy": 0.5318265110254288,
"epoch": 2.1981308411214955,
"grad_norm": 0.03327268362045288,
"learning_rate": 0.0002,
"loss": 0.5330622792243958,
"mean_token_accuracy": 0.7819591611623764,
"num_tokens": 9596500.0,
"step": 589
},
{
"entropy": 0.5139677748084068,
"epoch": 2.201869158878505,
"grad_norm": 0.039606738835573196,
"learning_rate": 0.0002,
"loss": 0.520559549331665,
"mean_token_accuracy": 0.7877521514892578,
"num_tokens": 9612675.0,
"step": 590
},
{
"entropy": 0.5283454358577728,
"epoch": 2.205607476635514,
"grad_norm": 0.03826924040913582,
"learning_rate": 0.0002,
"loss": 0.5321468710899353,
"mean_token_accuracy": 0.7843296527862549,
"num_tokens": 9629044.0,
"step": 591
},
{
"entropy": 0.5257805287837982,
"epoch": 2.209345794392523,
"grad_norm": 0.04099821671843529,
"learning_rate": 0.0002,
"loss": 0.5277660489082336,
"mean_token_accuracy": 0.7833193689584732,
"num_tokens": 9645271.0,
"step": 592
},
{
"entropy": 0.5350408107042313,
"epoch": 2.2130841121495326,
"grad_norm": 0.038267582654953,
"learning_rate": 0.0002,
"loss": 0.5255724787712097,
"mean_token_accuracy": 0.7867475599050522,
"num_tokens": 9661448.0,
"step": 593
},
{
"entropy": 0.5472716838121414,
"epoch": 2.216822429906542,
"grad_norm": 0.03405248373746872,
"learning_rate": 0.0002,
"loss": 0.5390135645866394,
"mean_token_accuracy": 0.779327467083931,
"num_tokens": 9677824.0,
"step": 594
},
{
"entropy": 0.5421159714460373,
"epoch": 2.2205607476635514,
"grad_norm": 0.041895944625139236,
"learning_rate": 0.0002,
"loss": 0.5395660400390625,
"mean_token_accuracy": 0.7796223610639572,
"num_tokens": 9694305.0,
"step": 595
},
{
"entropy": 0.5459330081939697,
"epoch": 2.2242990654205608,
"grad_norm": 0.036602918058633804,
"learning_rate": 0.0002,
"loss": 0.5457043647766113,
"mean_token_accuracy": 0.7810876667499542,
"num_tokens": 9710852.0,
"step": 596
},
{
"entropy": 0.5278807803988457,
"epoch": 2.22803738317757,
"grad_norm": 0.04418497160077095,
"learning_rate": 0.0002,
"loss": 0.5371560454368591,
"mean_token_accuracy": 0.7824568003416061,
"num_tokens": 9727075.0,
"step": 597
},
{
"entropy": 0.5311697870492935,
"epoch": 2.2317757009345796,
"grad_norm": 0.043200667947530746,
"learning_rate": 0.0002,
"loss": 0.5364136695861816,
"mean_token_accuracy": 0.783041849732399,
"num_tokens": 9743306.0,
"step": 598
},
{
"entropy": 0.5302419811487198,
"epoch": 2.235514018691589,
"grad_norm": 0.037720005959272385,
"learning_rate": 0.0002,
"loss": 0.5262041091918945,
"mean_token_accuracy": 0.7870023250579834,
"num_tokens": 9759403.0,
"step": 599
},
{
"entropy": 0.5483334362506866,
"epoch": 2.2392523364485983,
"grad_norm": 0.03560694679617882,
"learning_rate": 0.0002,
"loss": 0.5467509627342224,
"mean_token_accuracy": 0.779225081205368,
"num_tokens": 9775738.0,
"step": 600
},
{
"entropy": 0.5375639796257019,
"epoch": 2.2429906542056073,
"grad_norm": 0.03993435204029083,
"learning_rate": 0.0002,
"loss": 0.5336683988571167,
"mean_token_accuracy": 0.7839321345090866,
"num_tokens": 9792043.0,
"step": 601
},
{
"entropy": 0.544166311621666,
"epoch": 2.2467289719626167,
"grad_norm": 0.03602972254157066,
"learning_rate": 0.0002,
"loss": 0.5403839945793152,
"mean_token_accuracy": 0.7812667638063431,
"num_tokens": 9808431.0,
"step": 602
},
{
"entropy": 0.5295002460479736,
"epoch": 2.250467289719626,
"grad_norm": 0.041549984365701675,
"learning_rate": 0.0002,
"loss": 0.5339419841766357,
"mean_token_accuracy": 0.7843643128871918,
"num_tokens": 9824744.0,
"step": 603
},
{
"entropy": 0.5211731493473053,
"epoch": 2.2542056074766355,
"grad_norm": 0.04408840090036392,
"learning_rate": 0.0002,
"loss": 0.5288305878639221,
"mean_token_accuracy": 0.7842673063278198,
"num_tokens": 9841081.0,
"step": 604
},
{
"entropy": 0.5425246208906174,
"epoch": 2.257943925233645,
"grad_norm": 0.04026458412408829,
"learning_rate": 0.0002,
"loss": 0.5444083213806152,
"mean_token_accuracy": 0.7781710475683212,
"num_tokens": 9857545.0,
"step": 605
},
{
"entropy": 0.5519444048404694,
"epoch": 2.2616822429906542,
"grad_norm": 0.03973834961652756,
"learning_rate": 0.0002,
"loss": 0.547622799873352,
"mean_token_accuracy": 0.7769842147827148,
"num_tokens": 9873925.0,
"step": 606
},
{
"entropy": 0.5228262096643448,
"epoch": 2.2654205607476636,
"grad_norm": 0.041971541941165924,
"learning_rate": 0.0002,
"loss": 0.5222245454788208,
"mean_token_accuracy": 0.7858153134584427,
"num_tokens": 9890052.0,
"step": 607
},
{
"entropy": 0.5335221141576767,
"epoch": 2.269158878504673,
"grad_norm": 0.039673078805208206,
"learning_rate": 0.0002,
"loss": 0.5314098000526428,
"mean_token_accuracy": 0.7840564250946045,
"num_tokens": 9906259.0,
"step": 608
},
{
"entropy": 0.5426364839076996,
"epoch": 2.2728971962616824,
"grad_norm": 0.04128013923764229,
"learning_rate": 0.0002,
"loss": 0.5407010316848755,
"mean_token_accuracy": 0.7802868187427521,
"num_tokens": 9922434.0,
"step": 609
},
{
"entropy": 0.5306970030069351,
"epoch": 2.2766355140186914,
"grad_norm": 0.03684001415967941,
"learning_rate": 0.0002,
"loss": 0.5325096845626831,
"mean_token_accuracy": 0.7816676050424576,
"num_tokens": 9938715.0,
"step": 610
},
{
"entropy": 0.5312017947435379,
"epoch": 2.2803738317757007,
"grad_norm": 0.0396246500313282,
"learning_rate": 0.0002,
"loss": 0.5326136350631714,
"mean_token_accuracy": 0.7833829969167709,
"num_tokens": 9954795.0,
"step": 611
},
{
"entropy": 0.5242188572883606,
"epoch": 2.28411214953271,
"grad_norm": 0.03666768968105316,
"learning_rate": 0.0002,
"loss": 0.5254257321357727,
"mean_token_accuracy": 0.785698264837265,
"num_tokens": 9970976.0,
"step": 612
},
{
"entropy": 0.5251396894454956,
"epoch": 2.2878504672897195,
"grad_norm": 0.041744161397218704,
"learning_rate": 0.0002,
"loss": 0.5361155867576599,
"mean_token_accuracy": 0.781558558344841,
"num_tokens": 9987242.0,
"step": 613
},
{
"entropy": 0.5212117433547974,
"epoch": 2.291588785046729,
"grad_norm": 0.044306471943855286,
"learning_rate": 0.0002,
"loss": 0.5255172252655029,
"mean_token_accuracy": 0.7819651514291763,
"num_tokens": 10003383.0,
"step": 614
},
{
"entropy": 0.5342397391796112,
"epoch": 2.2953271028037383,
"grad_norm": 0.04804427549242973,
"learning_rate": 0.0002,
"loss": 0.5286440849304199,
"mean_token_accuracy": 0.7870652973651886,
"num_tokens": 10019705.0,
"step": 615
},
{
"entropy": 0.5513401627540588,
"epoch": 2.2990654205607477,
"grad_norm": 0.04101845622062683,
"learning_rate": 0.0002,
"loss": 0.5483744144439697,
"mean_token_accuracy": 0.7755522131919861,
"num_tokens": 10035997.0,
"step": 616
},
{
"entropy": 0.5434563606977463,
"epoch": 2.302803738317757,
"grad_norm": 0.036619942635297775,
"learning_rate": 0.0002,
"loss": 0.5326208472251892,
"mean_token_accuracy": 0.782253697514534,
"num_tokens": 10052253.0,
"step": 617
},
{
"entropy": 0.5315294414758682,
"epoch": 2.3065420560747665,
"grad_norm": 0.037794552743434906,
"learning_rate": 0.0002,
"loss": 0.5253270864486694,
"mean_token_accuracy": 0.7854621708393097,
"num_tokens": 10068502.0,
"step": 618
},
{
"entropy": 0.5264740660786629,
"epoch": 2.310280373831776,
"grad_norm": 0.05285142362117767,
"learning_rate": 0.0002,
"loss": 0.5347273349761963,
"mean_token_accuracy": 0.7845266908407211,
"num_tokens": 10084722.0,
"step": 619
},
{
"entropy": 0.5410954803228378,
"epoch": 2.3140186915887853,
"grad_norm": 0.036392901092767715,
"learning_rate": 0.0002,
"loss": 0.5492109060287476,
"mean_token_accuracy": 0.775203213095665,
"num_tokens": 10101110.0,
"step": 620
},
{
"entropy": 0.5478453040122986,
"epoch": 2.317757009345794,
"grad_norm": 0.0461491234600544,
"learning_rate": 0.0002,
"loss": 0.5482407808303833,
"mean_token_accuracy": 0.7783631533384323,
"num_tokens": 10117543.0,
"step": 621
},
{
"entropy": 0.515753298997879,
"epoch": 2.3214953271028036,
"grad_norm": 0.04075627774000168,
"learning_rate": 0.0002,
"loss": 0.5150102972984314,
"mean_token_accuracy": 0.789474606513977,
"num_tokens": 10133572.0,
"step": 622
},
{
"entropy": 0.5349336713552475,
"epoch": 2.325233644859813,
"grad_norm": 0.042154040187597275,
"learning_rate": 0.0002,
"loss": 0.526114821434021,
"mean_token_accuracy": 0.7856980115175247,
"num_tokens": 10150048.0,
"step": 623
},
{
"entropy": 0.5674707591533661,
"epoch": 2.3289719626168224,
"grad_norm": 0.04182770103216171,
"learning_rate": 0.0002,
"loss": 0.5611693859100342,
"mean_token_accuracy": 0.7749929875135422,
"num_tokens": 10166642.0,
"step": 624
},
{
"entropy": 0.5181543081998825,
"epoch": 2.3327102803738318,
"grad_norm": 0.038145892322063446,
"learning_rate": 0.0002,
"loss": 0.5206056833267212,
"mean_token_accuracy": 0.788123145699501,
"num_tokens": 10182897.0,
"step": 625
},
{
"entropy": 0.5357862561941147,
"epoch": 2.336448598130841,
"grad_norm": 0.04366487264633179,
"learning_rate": 0.0002,
"loss": 0.5423003435134888,
"mean_token_accuracy": 0.7787369638681412,
"num_tokens": 10199311.0,
"step": 626
},
{
"entropy": 0.5277369916439056,
"epoch": 2.3401869158878505,
"grad_norm": 0.05174623429775238,
"learning_rate": 0.0002,
"loss": 0.539736270904541,
"mean_token_accuracy": 0.7798131704330444,
"num_tokens": 10215707.0,
"step": 627
},
{
"entropy": 0.5540482401847839,
"epoch": 2.34392523364486,
"grad_norm": 0.03900719806551933,
"learning_rate": 0.0002,
"loss": 0.5546514391899109,
"mean_token_accuracy": 0.7751745879650116,
"num_tokens": 10232233.0,
"step": 628
},
{
"entropy": 0.5211993083357811,
"epoch": 2.3476635514018693,
"grad_norm": 0.044696055352687836,
"learning_rate": 0.0002,
"loss": 0.5210398435592651,
"mean_token_accuracy": 0.7867566049098969,
"num_tokens": 10248397.0,
"step": 629
},
{
"entropy": 0.5406811684370041,
"epoch": 2.3514018691588783,
"grad_norm": 0.04107234627008438,
"learning_rate": 0.0002,
"loss": 0.5430042147636414,
"mean_token_accuracy": 0.7786548435688019,
"num_tokens": 10264653.0,
"step": 630
},
{
"entropy": 0.538291797041893,
"epoch": 2.3551401869158877,
"grad_norm": 0.03656275197863579,
"learning_rate": 0.0002,
"loss": 0.534942090511322,
"mean_token_accuracy": 0.7826343178749084,
"num_tokens": 10280941.0,
"step": 631
},
{
"entropy": 0.5547115802764893,
"epoch": 2.358878504672897,
"grad_norm": 0.04424076899886131,
"learning_rate": 0.0002,
"loss": 0.5602344870567322,
"mean_token_accuracy": 0.7771879583597183,
"num_tokens": 10297564.0,
"step": 632
},
{
"entropy": 0.5327815413475037,
"epoch": 2.3626168224299064,
"grad_norm": 0.04512718692421913,
"learning_rate": 0.0002,
"loss": 0.529172420501709,
"mean_token_accuracy": 0.7825805693864822,
"num_tokens": 10313759.0,
"step": 633
},
{
"entropy": 0.5432299822568893,
"epoch": 2.366355140186916,
"grad_norm": 0.040462445467710495,
"learning_rate": 0.0002,
"loss": 0.5389863848686218,
"mean_token_accuracy": 0.779638260602951,
"num_tokens": 10330290.0,
"step": 634
},
{
"entropy": 0.5529568791389465,
"epoch": 2.3700934579439252,
"grad_norm": 0.04414237663149834,
"learning_rate": 0.0002,
"loss": 0.5526305437088013,
"mean_token_accuracy": 0.7754997760057449,
"num_tokens": 10346636.0,
"step": 635
},
{
"entropy": 0.5441652536392212,
"epoch": 2.3738317757009346,
"grad_norm": 0.037299707531929016,
"learning_rate": 0.0002,
"loss": 0.5382997393608093,
"mean_token_accuracy": 0.7791097015142441,
"num_tokens": 10362922.0,
"step": 636
},
{
"entropy": 0.5348048955202103,
"epoch": 2.377570093457944,
"grad_norm": 0.0446464829146862,
"learning_rate": 0.0002,
"loss": 0.5380210876464844,
"mean_token_accuracy": 0.7818952798843384,
"num_tokens": 10379134.0,
"step": 637
},
{
"entropy": 0.5187151804566383,
"epoch": 2.3813084112149534,
"grad_norm": 0.0778694897890091,
"learning_rate": 0.0002,
"loss": 0.5220566391944885,
"mean_token_accuracy": 0.7889348715543747,
"num_tokens": 10395255.0,
"step": 638
},
{
"entropy": 0.5462511032819748,
"epoch": 2.385046728971963,
"grad_norm": 0.04299847036600113,
"learning_rate": 0.0002,
"loss": 0.5423526167869568,
"mean_token_accuracy": 0.7763472348451614,
"num_tokens": 10411644.0,
"step": 639
},
{
"entropy": 0.5463699400424957,
"epoch": 2.388785046728972,
"grad_norm": 0.10935911536216736,
"learning_rate": 0.0002,
"loss": 0.554538369178772,
"mean_token_accuracy": 0.7772965431213379,
"num_tokens": 10427999.0,
"step": 640
},
{
"entropy": 0.5152165368199348,
"epoch": 2.392523364485981,
"grad_norm": 0.03762959688901901,
"learning_rate": 0.0002,
"loss": 0.508588969707489,
"mean_token_accuracy": 0.7926003634929657,
"num_tokens": 10444169.0,
"step": 641
},
{
"entropy": 0.529686912894249,
"epoch": 2.3962616822429905,
"grad_norm": 0.040958285331726074,
"learning_rate": 0.0002,
"loss": 0.5307521820068359,
"mean_token_accuracy": 0.7849727272987366,
"num_tokens": 10460506.0,
"step": 642
},
{
"entropy": 0.5430792719125748,
"epoch": 2.4,
"grad_norm": 0.059025488793849945,
"learning_rate": 0.0002,
"loss": 0.5434512495994568,
"mean_token_accuracy": 0.7796961963176727,
"num_tokens": 10476852.0,
"step": 643
},
{
"entropy": 0.5448063015937805,
"epoch": 2.4037383177570093,
"grad_norm": 0.040974777191877365,
"learning_rate": 0.0002,
"loss": 0.5473527312278748,
"mean_token_accuracy": 0.7792296558618546,
"num_tokens": 10493362.0,
"step": 644
},
{
"entropy": 0.5385838449001312,
"epoch": 2.4074766355140187,
"grad_norm": 0.03980987146496773,
"learning_rate": 0.0002,
"loss": 0.5398511290550232,
"mean_token_accuracy": 0.7808338552713394,
"num_tokens": 10509993.0,
"step": 645
},
{
"entropy": 0.5397947132587433,
"epoch": 2.411214953271028,
"grad_norm": 0.04422999173402786,
"learning_rate": 0.0002,
"loss": 0.5439976453781128,
"mean_token_accuracy": 0.7772432416677475,
"num_tokens": 10525999.0,
"step": 646
},
{
"entropy": 0.5487875193357468,
"epoch": 2.4149532710280375,
"grad_norm": 0.035030197352170944,
"learning_rate": 0.0002,
"loss": 0.5411213636398315,
"mean_token_accuracy": 0.7808128446340561,
"num_tokens": 10542385.0,
"step": 647
},
{
"entropy": 0.5536469519138336,
"epoch": 2.418691588785047,
"grad_norm": 0.03504094481468201,
"learning_rate": 0.0002,
"loss": 0.5501288771629333,
"mean_token_accuracy": 0.7798037678003311,
"num_tokens": 10558968.0,
"step": 648
},
{
"entropy": 0.542830765247345,
"epoch": 2.4224299065420563,
"grad_norm": 0.04252900928258896,
"learning_rate": 0.0002,
"loss": 0.5463917255401611,
"mean_token_accuracy": 0.7780060321092606,
"num_tokens": 10575204.0,
"step": 649
},
{
"entropy": 0.5445516556501389,
"epoch": 2.426168224299065,
"grad_norm": 0.03962906450033188,
"learning_rate": 0.0002,
"loss": 0.5398474335670471,
"mean_token_accuracy": 0.7808130532503128,
"num_tokens": 10591758.0,
"step": 650
},
{
"entropy": 0.5405502319335938,
"epoch": 2.4299065420560746,
"grad_norm": 0.0443168580532074,
"learning_rate": 0.0002,
"loss": 0.5365331172943115,
"mean_token_accuracy": 0.7831508964300156,
"num_tokens": 10608086.0,
"step": 651
},
{
"entropy": 0.5417730808258057,
"epoch": 2.433644859813084,
"grad_norm": 0.03887809067964554,
"learning_rate": 0.0002,
"loss": 0.5410832166671753,
"mean_token_accuracy": 0.7785631865262985,
"num_tokens": 10624498.0,
"step": 652
},
{
"entropy": 0.539076067507267,
"epoch": 2.4373831775700934,
"grad_norm": 0.03908571973443031,
"learning_rate": 0.0002,
"loss": 0.5387341976165771,
"mean_token_accuracy": 0.781864196062088,
"num_tokens": 10640880.0,
"step": 653
},
{
"entropy": 0.5390027314424515,
"epoch": 2.4411214953271028,
"grad_norm": 0.03712445870041847,
"learning_rate": 0.0002,
"loss": 0.5360729694366455,
"mean_token_accuracy": 0.783073827624321,
"num_tokens": 10657400.0,
"step": 654
},
{
"entropy": 0.5502242594957352,
"epoch": 2.444859813084112,
"grad_norm": 0.03870626538991928,
"learning_rate": 0.0002,
"loss": 0.5568853616714478,
"mean_token_accuracy": 0.7743858247995377,
"num_tokens": 10673826.0,
"step": 655
},
{
"entropy": 0.525546170771122,
"epoch": 2.4485981308411215,
"grad_norm": 0.05200404301285744,
"learning_rate": 0.0002,
"loss": 0.5247287154197693,
"mean_token_accuracy": 0.787117063999176,
"num_tokens": 10690101.0,
"step": 656
},
{
"entropy": 0.5489766597747803,
"epoch": 2.452336448598131,
"grad_norm": 0.03731005638837814,
"learning_rate": 0.0002,
"loss": 0.5479599833488464,
"mean_token_accuracy": 0.7739788293838501,
"num_tokens": 10706469.0,
"step": 657
},
{
"entropy": 0.5457844734191895,
"epoch": 2.4560747663551403,
"grad_norm": 0.03958994895219803,
"learning_rate": 0.0002,
"loss": 0.5466060638427734,
"mean_token_accuracy": 0.776677593588829,
"num_tokens": 10722827.0,
"step": 658
},
{
"entropy": 0.5301162749528885,
"epoch": 2.4598130841121497,
"grad_norm": 0.04651971161365509,
"learning_rate": 0.0002,
"loss": 0.5345625281333923,
"mean_token_accuracy": 0.7808788865804672,
"num_tokens": 10739136.0,
"step": 659
},
{
"entropy": 0.5545621961355209,
"epoch": 2.463551401869159,
"grad_norm": 0.04008018597960472,
"learning_rate": 0.0002,
"loss": 0.5584450960159302,
"mean_token_accuracy": 0.7706544101238251,
"num_tokens": 10755369.0,
"step": 660
},
{
"entropy": 0.5189358592033386,
"epoch": 2.467289719626168,
"grad_norm": 0.040387995541095734,
"learning_rate": 0.0002,
"loss": 0.5199939608573914,
"mean_token_accuracy": 0.7878802865743637,
"num_tokens": 10771408.0,
"step": 661
},
{
"entropy": 0.5370910465717316,
"epoch": 2.4710280373831774,
"grad_norm": 0.04395879805088043,
"learning_rate": 0.0002,
"loss": 0.534496545791626,
"mean_token_accuracy": 0.7834903597831726,
"num_tokens": 10787604.0,
"step": 662
},
{
"entropy": 0.5326719284057617,
"epoch": 2.474766355140187,
"grad_norm": 0.04668545350432396,
"learning_rate": 0.0002,
"loss": 0.5241788029670715,
"mean_token_accuracy": 0.7905293852090836,
"num_tokens": 10803945.0,
"step": 663
},
{
"entropy": 0.5368177741765976,
"epoch": 2.4785046728971962,
"grad_norm": 0.04925902187824249,
"learning_rate": 0.0002,
"loss": 0.5367681384086609,
"mean_token_accuracy": 0.7809154391288757,
"num_tokens": 10820178.0,
"step": 664
},
{
"entropy": 0.5293789505958557,
"epoch": 2.4822429906542056,
"grad_norm": 0.041696734726428986,
"learning_rate": 0.0002,
"loss": 0.5327548980712891,
"mean_token_accuracy": 0.7873236238956451,
"num_tokens": 10836561.0,
"step": 665
},
{
"entropy": 0.529408723115921,
"epoch": 2.485981308411215,
"grad_norm": 0.041212067008018494,
"learning_rate": 0.0002,
"loss": 0.5328470468521118,
"mean_token_accuracy": 0.7832391858100891,
"num_tokens": 10852980.0,
"step": 666
},
{
"entropy": 0.5545576214790344,
"epoch": 2.4897196261682244,
"grad_norm": 0.04478580132126808,
"learning_rate": 0.0002,
"loss": 0.5554249286651611,
"mean_token_accuracy": 0.7741198241710663,
"num_tokens": 10869321.0,
"step": 667
},
{
"entropy": 0.5539140552282333,
"epoch": 2.493457943925234,
"grad_norm": 0.04277152568101883,
"learning_rate": 0.0002,
"loss": 0.5493362545967102,
"mean_token_accuracy": 0.7759024202823639,
"num_tokens": 10885666.0,
"step": 668
},
{
"entropy": 0.5433756709098816,
"epoch": 2.497196261682243,
"grad_norm": 0.04360437020659447,
"learning_rate": 0.0002,
"loss": 0.5412634611129761,
"mean_token_accuracy": 0.7808667570352554,
"num_tokens": 10901903.0,
"step": 669
},
{
"entropy": 0.5487286895513535,
"epoch": 2.500934579439252,
"grad_norm": 0.03885580971837044,
"learning_rate": 0.0002,
"loss": 0.5431787371635437,
"mean_token_accuracy": 0.7802725732326508,
"num_tokens": 10918340.0,
"step": 670
},
{
"entropy": 0.5228707492351532,
"epoch": 2.5046728971962615,
"grad_norm": 0.053798187524080276,
"learning_rate": 0.0002,
"loss": 0.5311392545700073,
"mean_token_accuracy": 0.7843292206525803,
"num_tokens": 10934469.0,
"step": 671
},
{
"entropy": 0.5447903871536255,
"epoch": 2.508411214953271,
"grad_norm": 0.05324989929795265,
"learning_rate": 0.0002,
"loss": 0.5491751432418823,
"mean_token_accuracy": 0.7752528339624405,
"num_tokens": 10950837.0,
"step": 672
},
{
"entropy": 0.5308417528867722,
"epoch": 2.5121495327102803,
"grad_norm": 0.06228797510266304,
"learning_rate": 0.0002,
"loss": 0.5361084938049316,
"mean_token_accuracy": 0.7828515321016312,
"num_tokens": 10967098.0,
"step": 673
},
{
"entropy": 0.5403530299663544,
"epoch": 2.5158878504672897,
"grad_norm": 0.051257163286209106,
"learning_rate": 0.0002,
"loss": 0.542191207408905,
"mean_token_accuracy": 0.7825300693511963,
"num_tokens": 10983262.0,
"step": 674
},
{
"entropy": 0.5413467437028885,
"epoch": 2.519626168224299,
"grad_norm": 0.04910978302359581,
"learning_rate": 0.0002,
"loss": 0.5313704013824463,
"mean_token_accuracy": 0.7851869165897369,
"num_tokens": 10999552.0,
"step": 675
},
{
"entropy": 0.55167156457901,
"epoch": 2.5233644859813085,
"grad_norm": 0.033519063144922256,
"learning_rate": 0.0002,
"loss": 0.5438812971115112,
"mean_token_accuracy": 0.7780154794454575,
"num_tokens": 11016044.0,
"step": 676
},
{
"entropy": 0.5392196476459503,
"epoch": 2.527102803738318,
"grad_norm": 0.04278670251369476,
"learning_rate": 0.0002,
"loss": 0.5411216020584106,
"mean_token_accuracy": 0.780839130282402,
"num_tokens": 11032377.0,
"step": 677
},
{
"entropy": 0.5352826565504074,
"epoch": 2.5308411214953273,
"grad_norm": 0.04736237972974777,
"learning_rate": 0.0002,
"loss": 0.5446096658706665,
"mean_token_accuracy": 0.7806870341300964,
"num_tokens": 11048727.0,
"step": 678
},
{
"entropy": 0.5168470665812492,
"epoch": 2.5345794392523366,
"grad_norm": 0.03513955697417259,
"learning_rate": 0.0002,
"loss": 0.5200102925300598,
"mean_token_accuracy": 0.7874528765678406,
"num_tokens": 11064947.0,
"step": 679
},
{
"entropy": 0.5375211834907532,
"epoch": 2.538317757009346,
"grad_norm": 0.04709267243742943,
"learning_rate": 0.0002,
"loss": 0.5393041968345642,
"mean_token_accuracy": 0.7837181091308594,
"num_tokens": 11081532.0,
"step": 680
},
{
"entropy": 0.5512478798627853,
"epoch": 2.542056074766355,
"grad_norm": 0.04090959206223488,
"learning_rate": 0.0002,
"loss": 0.546190619468689,
"mean_token_accuracy": 0.7762559801340103,
"num_tokens": 11098073.0,
"step": 681
},
{
"entropy": 0.5283504128456116,
"epoch": 2.5457943925233644,
"grad_norm": 0.036959145218133926,
"learning_rate": 0.0002,
"loss": 0.5237979292869568,
"mean_token_accuracy": 0.7874845713376999,
"num_tokens": 11114315.0,
"step": 682
},
{
"entropy": 0.5489681363105774,
"epoch": 2.5495327102803738,
"grad_norm": 0.04488472267985344,
"learning_rate": 0.0002,
"loss": 0.5456336736679077,
"mean_token_accuracy": 0.7797751575708389,
"num_tokens": 11130665.0,
"step": 683
},
{
"entropy": 0.5317860543727875,
"epoch": 2.553271028037383,
"grad_norm": 0.04248347505927086,
"learning_rate": 0.0002,
"loss": 0.5382874011993408,
"mean_token_accuracy": 0.77965147793293,
"num_tokens": 11146874.0,
"step": 684
},
{
"entropy": 0.5419623553752899,
"epoch": 2.5570093457943925,
"grad_norm": 0.04522377625107765,
"learning_rate": 0.0002,
"loss": 0.5449318289756775,
"mean_token_accuracy": 0.7786058634519577,
"num_tokens": 11163427.0,
"step": 685
},
{
"entropy": 0.5241860747337341,
"epoch": 2.560747663551402,
"grad_norm": 0.04621601849794388,
"learning_rate": 0.0002,
"loss": 0.5267641544342041,
"mean_token_accuracy": 0.7829258441925049,
"num_tokens": 11179801.0,
"step": 686
},
{
"entropy": 0.5173597782850266,
"epoch": 2.5644859813084113,
"grad_norm": 0.043366726487874985,
"learning_rate": 0.0002,
"loss": 0.5181450843811035,
"mean_token_accuracy": 0.7898700088262558,
"num_tokens": 11196083.0,
"step": 687
},
{
"entropy": 0.538482740521431,
"epoch": 2.5682242990654207,
"grad_norm": 0.04418179765343666,
"learning_rate": 0.0002,
"loss": 0.5392533540725708,
"mean_token_accuracy": 0.778387576341629,
"num_tokens": 11212295.0,
"step": 688
},
{
"entropy": 0.540611207485199,
"epoch": 2.5719626168224297,
"grad_norm": 0.05271269753575325,
"learning_rate": 0.0002,
"loss": 0.5393270254135132,
"mean_token_accuracy": 0.7812009155750275,
"num_tokens": 11228565.0,
"step": 689
},
{
"entropy": 0.5282483994960785,
"epoch": 2.575700934579439,
"grad_norm": 0.04314183071255684,
"learning_rate": 0.0002,
"loss": 0.5224794149398804,
"mean_token_accuracy": 0.7856594175100327,
"num_tokens": 11244953.0,
"step": 690
},
{
"entropy": 0.5318177044391632,
"epoch": 2.5794392523364484,
"grad_norm": 0.05587287247180939,
"learning_rate": 0.0002,
"loss": 0.5358354449272156,
"mean_token_accuracy": 0.7822671979665756,
"num_tokens": 11261194.0,
"step": 691
},
{
"entropy": 0.5375986397266388,
"epoch": 2.583177570093458,
"grad_norm": 0.043386682868003845,
"learning_rate": 0.0002,
"loss": 0.5412317514419556,
"mean_token_accuracy": 0.781296119093895,
"num_tokens": 11277286.0,
"step": 692
},
{
"entropy": 0.5498186945915222,
"epoch": 2.586915887850467,
"grad_norm": 0.04709560051560402,
"learning_rate": 0.0002,
"loss": 0.5513982176780701,
"mean_token_accuracy": 0.7768333256244659,
"num_tokens": 11293799.0,
"step": 693
},
{
"entropy": 0.5409555584192276,
"epoch": 2.5906542056074766,
"grad_norm": 0.04518339782953262,
"learning_rate": 0.0002,
"loss": 0.5396868586540222,
"mean_token_accuracy": 0.7791042476892471,
"num_tokens": 11310089.0,
"step": 694
},
{
"entropy": 0.5236431509256363,
"epoch": 2.594392523364486,
"grad_norm": 0.03244040906429291,
"learning_rate": 0.0002,
"loss": 0.5155695676803589,
"mean_token_accuracy": 0.7898247241973877,
"num_tokens": 11326515.0,
"step": 695
},
{
"entropy": 0.5529845803976059,
"epoch": 2.5981308411214954,
"grad_norm": 0.04760007932782173,
"learning_rate": 0.0002,
"loss": 0.5487071871757507,
"mean_token_accuracy": 0.7782804220914841,
"num_tokens": 11342994.0,
"step": 696
},
{
"entropy": 0.5314944535493851,
"epoch": 2.601869158878505,
"grad_norm": 0.0422595851123333,
"learning_rate": 0.0002,
"loss": 0.5344254970550537,
"mean_token_accuracy": 0.7827649861574173,
"num_tokens": 11359320.0,
"step": 697
},
{
"entropy": 0.5296527296304703,
"epoch": 2.605607476635514,
"grad_norm": 0.04541509971022606,
"learning_rate": 0.0002,
"loss": 0.5399951338768005,
"mean_token_accuracy": 0.7812868803739548,
"num_tokens": 11375866.0,
"step": 698
},
{
"entropy": 0.5503706336021423,
"epoch": 2.6093457943925236,
"grad_norm": 0.04639806970953941,
"learning_rate": 0.0002,
"loss": 0.560705304145813,
"mean_token_accuracy": 0.7734115719795227,
"num_tokens": 11392189.0,
"step": 699
},
{
"entropy": 0.5334575325250626,
"epoch": 2.613084112149533,
"grad_norm": 0.03491205349564552,
"learning_rate": 0.0002,
"loss": 0.5285266637802124,
"mean_token_accuracy": 0.786865234375,
"num_tokens": 11408320.0,
"step": 700
},
{
"entropy": 0.5375584214925766,
"epoch": 2.616822429906542,
"grad_norm": 0.03665752336382866,
"learning_rate": 0.0002,
"loss": 0.5285854935646057,
"mean_token_accuracy": 0.7843970507383347,
"num_tokens": 11424696.0,
"step": 701
},
{
"entropy": 0.5432839095592499,
"epoch": 2.6205607476635513,
"grad_norm": 0.040845148265361786,
"learning_rate": 0.0002,
"loss": 0.5354432463645935,
"mean_token_accuracy": 0.7819717228412628,
"num_tokens": 11440921.0,
"step": 702
},
{
"entropy": 0.5447598993778229,
"epoch": 2.6242990654205607,
"grad_norm": 0.03317207470536232,
"learning_rate": 0.0002,
"loss": 0.5364579558372498,
"mean_token_accuracy": 0.7815430164337158,
"num_tokens": 11457136.0,
"step": 703
},
{
"entropy": 0.5318229794502258,
"epoch": 2.62803738317757,
"grad_norm": 0.04842844605445862,
"learning_rate": 0.0002,
"loss": 0.5381250381469727,
"mean_token_accuracy": 0.7842467576265335,
"num_tokens": 11473451.0,
"step": 704
},
{
"entropy": 0.53319051861763,
"epoch": 2.6317757009345795,
"grad_norm": 0.04995809122920036,
"learning_rate": 0.0002,
"loss": 0.5435810089111328,
"mean_token_accuracy": 0.7806897163391113,
"num_tokens": 11489778.0,
"step": 705
},
{
"entropy": 0.5205372422933578,
"epoch": 2.635514018691589,
"grad_norm": 0.043053507804870605,
"learning_rate": 0.0002,
"loss": 0.5225018858909607,
"mean_token_accuracy": 0.7891059070825577,
"num_tokens": 11506150.0,
"step": 706
},
{
"entropy": 0.5405721217393875,
"epoch": 2.6392523364485982,
"grad_norm": 0.047551702708005905,
"learning_rate": 0.0002,
"loss": 0.5341666340827942,
"mean_token_accuracy": 0.7827833145856857,
"num_tokens": 11522269.0,
"step": 707
},
{
"entropy": 0.555420309305191,
"epoch": 2.6429906542056076,
"grad_norm": 0.04240434989333153,
"learning_rate": 0.0002,
"loss": 0.5463941097259521,
"mean_token_accuracy": 0.776122510433197,
"num_tokens": 11538672.0,
"step": 708
},
{
"entropy": 0.5373465269804001,
"epoch": 2.6467289719626166,
"grad_norm": 0.04053036868572235,
"learning_rate": 0.0002,
"loss": 0.5378127694129944,
"mean_token_accuracy": 0.7802188992500305,
"num_tokens": 11554872.0,
"step": 709
},
{
"entropy": 0.554849311709404,
"epoch": 2.650467289719626,
"grad_norm": 0.03659540414810181,
"learning_rate": 0.0002,
"loss": 0.5495964288711548,
"mean_token_accuracy": 0.7751747816801071,
"num_tokens": 11571048.0,
"step": 710
},
{
"entropy": 0.5463902503252029,
"epoch": 2.6542056074766354,
"grad_norm": 0.04418041929602623,
"learning_rate": 0.0002,
"loss": 0.5471721887588501,
"mean_token_accuracy": 0.7752395421266556,
"num_tokens": 11587320.0,
"step": 711
},
{
"entropy": 0.5346667915582657,
"epoch": 2.6579439252336448,
"grad_norm": 0.03727971389889717,
"learning_rate": 0.0002,
"loss": 0.5335649847984314,
"mean_token_accuracy": 0.7821184396743774,
"num_tokens": 11603606.0,
"step": 712
},
{
"entropy": 0.5425343364477158,
"epoch": 2.661682242990654,
"grad_norm": 0.03725122660398483,
"learning_rate": 0.0002,
"loss": 0.5478883385658264,
"mean_token_accuracy": 0.7786499708890915,
"num_tokens": 11619898.0,
"step": 713
},
{
"entropy": 0.5213692635297775,
"epoch": 2.6654205607476635,
"grad_norm": 0.042857397347688675,
"learning_rate": 0.0002,
"loss": 0.5380342602729797,
"mean_token_accuracy": 0.7818091064691544,
"num_tokens": 11636325.0,
"step": 714
},
{
"entropy": 0.514741487801075,
"epoch": 2.669158878504673,
"grad_norm": 0.035097621381282806,
"learning_rate": 0.0002,
"loss": 0.5151344537734985,
"mean_token_accuracy": 0.7884217798709869,
"num_tokens": 11652621.0,
"step": 715
},
{
"entropy": 0.5442497134208679,
"epoch": 2.6728971962616823,
"grad_norm": 0.04381122440099716,
"learning_rate": 0.0002,
"loss": 0.5412749648094177,
"mean_token_accuracy": 0.7799884676933289,
"num_tokens": 11669129.0,
"step": 716
},
{
"entropy": 0.5303985998034477,
"epoch": 2.6766355140186917,
"grad_norm": 0.03387914225459099,
"learning_rate": 0.0002,
"loss": 0.5209308862686157,
"mean_token_accuracy": 0.7879882901906967,
"num_tokens": 11685246.0,
"step": 717
},
{
"entropy": 0.551127091050148,
"epoch": 2.680373831775701,
"grad_norm": 0.03922301158308983,
"learning_rate": 0.0002,
"loss": 0.5454061031341553,
"mean_token_accuracy": 0.7784066051244736,
"num_tokens": 11701476.0,
"step": 718
},
{
"entropy": 0.537367194890976,
"epoch": 2.6841121495327105,
"grad_norm": 0.038754355162382126,
"learning_rate": 0.0002,
"loss": 0.5407044887542725,
"mean_token_accuracy": 0.7816831916570663,
"num_tokens": 11717876.0,
"step": 719
},
{
"entropy": 0.5448082834482193,
"epoch": 2.68785046728972,
"grad_norm": 0.039220135658979416,
"learning_rate": 0.0002,
"loss": 0.5474362373352051,
"mean_token_accuracy": 0.7776313573122025,
"num_tokens": 11734335.0,
"step": 720
},
{
"entropy": 0.5400021821260452,
"epoch": 2.691588785046729,
"grad_norm": 0.04735405370593071,
"learning_rate": 0.0002,
"loss": 0.5481384992599487,
"mean_token_accuracy": 0.7767128497362137,
"num_tokens": 11750551.0,
"step": 721
},
{
"entropy": 0.5442029386758804,
"epoch": 2.695327102803738,
"grad_norm": 0.04216023534536362,
"learning_rate": 0.0002,
"loss": 0.5538774728775024,
"mean_token_accuracy": 0.7767860740423203,
"num_tokens": 11766874.0,
"step": 722
},
{
"entropy": 0.5446023046970367,
"epoch": 2.6990654205607476,
"grad_norm": 0.036887411028146744,
"learning_rate": 0.0002,
"loss": 0.5384114384651184,
"mean_token_accuracy": 0.7818654030561447,
"num_tokens": 11783153.0,
"step": 723
},
{
"entropy": 0.5451595932245255,
"epoch": 2.702803738317757,
"grad_norm": 0.03859608620405197,
"learning_rate": 0.0002,
"loss": 0.5347609519958496,
"mean_token_accuracy": 0.781577005982399,
"num_tokens": 11799221.0,
"step": 724
},
{
"entropy": 0.5464123338460922,
"epoch": 2.7065420560747664,
"grad_norm": 0.04104648903012276,
"learning_rate": 0.0002,
"loss": 0.531836986541748,
"mean_token_accuracy": 0.7847746908664703,
"num_tokens": 11815592.0,
"step": 725
},
{
"entropy": 0.5458803474903107,
"epoch": 2.710280373831776,
"grad_norm": 0.041141774505376816,
"learning_rate": 0.0002,
"loss": 0.5450369119644165,
"mean_token_accuracy": 0.7772473990917206,
"num_tokens": 11831810.0,
"step": 726
},
{
"entropy": 0.5207616165280342,
"epoch": 2.714018691588785,
"grad_norm": 0.039117299020290375,
"learning_rate": 0.0002,
"loss": 0.5268270969390869,
"mean_token_accuracy": 0.7860666513442993,
"num_tokens": 11848039.0,
"step": 727
},
{
"entropy": 0.5192839056253433,
"epoch": 2.717757009345794,
"grad_norm": 0.03917457163333893,
"learning_rate": 0.0002,
"loss": 0.5228926539421082,
"mean_token_accuracy": 0.7870692610740662,
"num_tokens": 11864185.0,
"step": 728
},
{
"entropy": 0.5525725483894348,
"epoch": 2.7214953271028035,
"grad_norm": 0.04475993663072586,
"learning_rate": 0.0002,
"loss": 0.5607837438583374,
"mean_token_accuracy": 0.7710844576358795,
"num_tokens": 11880885.0,
"step": 729
},
{
"entropy": 0.5314790159463882,
"epoch": 2.725233644859813,
"grad_norm": 0.03775126487016678,
"learning_rate": 0.0002,
"loss": 0.5314686298370361,
"mean_token_accuracy": 0.7859503030776978,
"num_tokens": 11897351.0,
"step": 730
},
{
"entropy": 0.5637041479349136,
"epoch": 2.7289719626168223,
"grad_norm": 0.045830611139535904,
"learning_rate": 0.0002,
"loss": 0.5615176558494568,
"mean_token_accuracy": 0.7733500599861145,
"num_tokens": 11913886.0,
"step": 731
},
{
"entropy": 0.5528976023197174,
"epoch": 2.7327102803738317,
"grad_norm": 0.0355507992208004,
"learning_rate": 0.0002,
"loss": 0.5482446551322937,
"mean_token_accuracy": 0.7790254056453705,
"num_tokens": 11930270.0,
"step": 732
},
{
"entropy": 0.521368145942688,
"epoch": 2.736448598130841,
"grad_norm": 0.040386781096458435,
"learning_rate": 0.0002,
"loss": 0.5189903974533081,
"mean_token_accuracy": 0.7861309498548508,
"num_tokens": 11946624.0,
"step": 733
},
{
"entropy": 0.5495569705963135,
"epoch": 2.7401869158878505,
"grad_norm": 0.04659309610724449,
"learning_rate": 0.0002,
"loss": 0.5496231913566589,
"mean_token_accuracy": 0.7766851484775543,
"num_tokens": 11963057.0,
"step": 734
},
{
"entropy": 0.5380824655294418,
"epoch": 2.74392523364486,
"grad_norm": 0.04431717097759247,
"learning_rate": 0.0002,
"loss": 0.5472241640090942,
"mean_token_accuracy": 0.7799153625965118,
"num_tokens": 11979414.0,
"step": 735
},
{
"entropy": 0.5362866371870041,
"epoch": 2.7476635514018692,
"grad_norm": 0.04207630082964897,
"learning_rate": 0.0002,
"loss": 0.5480789542198181,
"mean_token_accuracy": 0.7744766473770142,
"num_tokens": 11995788.0,
"step": 736
},
{
"entropy": 0.5203833281993866,
"epoch": 2.7514018691588786,
"grad_norm": 0.040439583361148834,
"learning_rate": 0.0002,
"loss": 0.5229013562202454,
"mean_token_accuracy": 0.7877133041620255,
"num_tokens": 12011768.0,
"step": 737
},
{
"entropy": 0.5442389398813248,
"epoch": 2.755140186915888,
"grad_norm": 0.036312710493803024,
"learning_rate": 0.0002,
"loss": 0.5421340465545654,
"mean_token_accuracy": 0.7801235765218735,
"num_tokens": 12027990.0,
"step": 738
},
{
"entropy": 0.540812149643898,
"epoch": 2.7588785046728974,
"grad_norm": 0.035805970430374146,
"learning_rate": 0.0002,
"loss": 0.5289261937141418,
"mean_token_accuracy": 0.7858118265867233,
"num_tokens": 12044016.0,
"step": 739
},
{
"entropy": 0.5561389774084091,
"epoch": 2.762616822429907,
"grad_norm": 0.03753306344151497,
"learning_rate": 0.0002,
"loss": 0.5497045516967773,
"mean_token_accuracy": 0.7774728685617447,
"num_tokens": 12060449.0,
"step": 740
},
{
"entropy": 0.5353166311979294,
"epoch": 2.7663551401869158,
"grad_norm": 0.04419036954641342,
"learning_rate": 0.0002,
"loss": 0.5267462134361267,
"mean_token_accuracy": 0.7831297665834427,
"num_tokens": 12076756.0,
"step": 741
},
{
"entropy": 0.5390448272228241,
"epoch": 2.770093457943925,
"grad_norm": 0.039156846702098846,
"learning_rate": 0.0002,
"loss": 0.5363330841064453,
"mean_token_accuracy": 0.7822138518095016,
"num_tokens": 12093231.0,
"step": 742
},
{
"entropy": 0.5334637314081192,
"epoch": 2.7738317757009345,
"grad_norm": 0.03978954628109932,
"learning_rate": 0.0002,
"loss": 0.5416637659072876,
"mean_token_accuracy": 0.782222107052803,
"num_tokens": 12109520.0,
"step": 743
},
{
"entropy": 0.5362211316823959,
"epoch": 2.777570093457944,
"grad_norm": 0.04728684201836586,
"learning_rate": 0.0002,
"loss": 0.5461055040359497,
"mean_token_accuracy": 0.7771897614002228,
"num_tokens": 12125527.0,
"step": 744
},
{
"entropy": 0.5383228212594986,
"epoch": 2.7813084112149533,
"grad_norm": 0.03740681707859039,
"learning_rate": 0.0002,
"loss": 0.5361698269844055,
"mean_token_accuracy": 0.7826491445302963,
"num_tokens": 12141826.0,
"step": 745
},
{
"entropy": 0.5330131649971008,
"epoch": 2.7850467289719627,
"grad_norm": 0.03758367896080017,
"learning_rate": 0.0002,
"loss": 0.5265568494796753,
"mean_token_accuracy": 0.7877195477485657,
"num_tokens": 12157984.0,
"step": 746
},
{
"entropy": 0.5397753864526749,
"epoch": 2.788785046728972,
"grad_norm": 0.042070865631103516,
"learning_rate": 0.0002,
"loss": 0.5313206911087036,
"mean_token_accuracy": 0.7845780104398727,
"num_tokens": 12174529.0,
"step": 747
},
{
"entropy": 0.5600686222314835,
"epoch": 2.792523364485981,
"grad_norm": 0.0377703532576561,
"learning_rate": 0.0002,
"loss": 0.5598015189170837,
"mean_token_accuracy": 0.7710230052471161,
"num_tokens": 12190857.0,
"step": 748
},
{
"entropy": 0.5242457091808319,
"epoch": 2.7962616822429904,
"grad_norm": 0.036673370748758316,
"learning_rate": 0.0002,
"loss": 0.5266134738922119,
"mean_token_accuracy": 0.7835761904716492,
"num_tokens": 12207046.0,
"step": 749
},
{
"entropy": 0.5196694731712341,
"epoch": 2.8,
"grad_norm": 0.04529178887605667,
"learning_rate": 0.0002,
"loss": 0.5295214653015137,
"mean_token_accuracy": 0.7850393652915955,
"num_tokens": 12223323.0,
"step": 750
},
{
"entropy": 0.5278067588806152,
"epoch": 2.803738317757009,
"grad_norm": 0.04078579694032669,
"learning_rate": 0.0002,
"loss": 0.5326597094535828,
"mean_token_accuracy": 0.7830272614955902,
"num_tokens": 12239416.0,
"step": 751
},
{
"entropy": 0.5326859503984451,
"epoch": 2.8074766355140186,
"grad_norm": 0.04164998233318329,
"learning_rate": 0.0002,
"loss": 0.5332698225975037,
"mean_token_accuracy": 0.7816595435142517,
"num_tokens": 12255780.0,
"step": 752
},
{
"entropy": 0.5238984450697899,
"epoch": 2.811214953271028,
"grad_norm": 0.03843814134597778,
"learning_rate": 0.0002,
"loss": 0.5195130109786987,
"mean_token_accuracy": 0.7881060838699341,
"num_tokens": 12272157.0,
"step": 753
},
{
"entropy": 0.5336880385875702,
"epoch": 2.8149532710280374,
"grad_norm": 0.039413440972566605,
"learning_rate": 0.0002,
"loss": 0.531658411026001,
"mean_token_accuracy": 0.7836297303438187,
"num_tokens": 12288500.0,
"step": 754
},
{
"entropy": 0.5406560152769089,
"epoch": 2.8186915887850468,
"grad_norm": 0.044693466275930405,
"learning_rate": 0.0002,
"loss": 0.541545033454895,
"mean_token_accuracy": 0.7807977646589279,
"num_tokens": 12304864.0,
"step": 755
},
{
"entropy": 0.538055032491684,
"epoch": 2.822429906542056,
"grad_norm": 0.03888081759214401,
"learning_rate": 0.0002,
"loss": 0.5337695479393005,
"mean_token_accuracy": 0.7844773530960083,
"num_tokens": 12321170.0,
"step": 756
},
{
"entropy": 0.527722030878067,
"epoch": 2.8261682242990656,
"grad_norm": 0.04188257455825806,
"learning_rate": 0.0002,
"loss": 0.5265190005302429,
"mean_token_accuracy": 0.7878826707601547,
"num_tokens": 12337523.0,
"step": 757
},
{
"entropy": 0.5507965534925461,
"epoch": 2.829906542056075,
"grad_norm": 0.03817446902394295,
"learning_rate": 0.0002,
"loss": 0.5500692129135132,
"mean_token_accuracy": 0.7806660830974579,
"num_tokens": 12354118.0,
"step": 758
},
{
"entropy": 0.5407035946846008,
"epoch": 2.8336448598130843,
"grad_norm": 0.042875856161117554,
"learning_rate": 0.0002,
"loss": 0.5405147671699524,
"mean_token_accuracy": 0.7810708433389664,
"num_tokens": 12370434.0,
"step": 759
},
{
"entropy": 0.5315204411745071,
"epoch": 2.8373831775700937,
"grad_norm": 0.042397141456604004,
"learning_rate": 0.0002,
"loss": 0.538346529006958,
"mean_token_accuracy": 0.7821339964866638,
"num_tokens": 12386428.0,
"step": 760
},
{
"entropy": 0.5520299524068832,
"epoch": 2.8411214953271027,
"grad_norm": 0.04137783497571945,
"learning_rate": 0.0002,
"loss": 0.5512533187866211,
"mean_token_accuracy": 0.7781175673007965,
"num_tokens": 12402867.0,
"step": 761
},
{
"entropy": 0.5510706156492233,
"epoch": 2.844859813084112,
"grad_norm": 0.04001981019973755,
"learning_rate": 0.0002,
"loss": 0.5554083585739136,
"mean_token_accuracy": 0.7719452530145645,
"num_tokens": 12419054.0,
"step": 762
},
{
"entropy": 0.5559884458780289,
"epoch": 2.8485981308411215,
"grad_norm": 0.035403911024332047,
"learning_rate": 0.0002,
"loss": 0.5523775815963745,
"mean_token_accuracy": 0.7766276150941849,
"num_tokens": 12435351.0,
"step": 763
},
{
"entropy": 0.5434874594211578,
"epoch": 2.852336448598131,
"grad_norm": 0.03929636627435684,
"learning_rate": 0.0002,
"loss": 0.537907063961029,
"mean_token_accuracy": 0.7796172052621841,
"num_tokens": 12451647.0,
"step": 764
},
{
"entropy": 0.5497813075780869,
"epoch": 2.8560747663551402,
"grad_norm": 0.03768793120980263,
"learning_rate": 0.0002,
"loss": 0.5450780391693115,
"mean_token_accuracy": 0.7810264527797699,
"num_tokens": 12468063.0,
"step": 765
},
{
"entropy": 0.5202910378575325,
"epoch": 2.8598130841121496,
"grad_norm": 0.03793422132730484,
"learning_rate": 0.0002,
"loss": 0.5197356343269348,
"mean_token_accuracy": 0.7887470573186874,
"num_tokens": 12484329.0,
"step": 766
},
{
"entropy": 0.5339359492063522,
"epoch": 2.863551401869159,
"grad_norm": 0.04222627729177475,
"learning_rate": 0.0002,
"loss": 0.5416290760040283,
"mean_token_accuracy": 0.7798094302415848,
"num_tokens": 12500522.0,
"step": 767
},
{
"entropy": 0.5492495894432068,
"epoch": 2.867289719626168,
"grad_norm": 0.043936122208833694,
"learning_rate": 0.0002,
"loss": 0.556658148765564,
"mean_token_accuracy": 0.7760462909936905,
"num_tokens": 12516877.0,
"step": 768
},
{
"entropy": 0.534624308347702,
"epoch": 2.8710280373831774,
"grad_norm": 0.042372506111860275,
"learning_rate": 0.0002,
"loss": 0.5317083597183228,
"mean_token_accuracy": 0.7851851731538773,
"num_tokens": 12533180.0,
"step": 769
},
{
"entropy": 0.5446592271327972,
"epoch": 2.8747663551401867,
"grad_norm": 0.037292055785655975,
"learning_rate": 0.0002,
"loss": 0.5379966497421265,
"mean_token_accuracy": 0.7800319492816925,
"num_tokens": 12549532.0,
"step": 770
},
{
"entropy": 0.5482804775238037,
"epoch": 2.878504672897196,
"grad_norm": 0.038804132491350174,
"learning_rate": 0.0002,
"loss": 0.5504724383354187,
"mean_token_accuracy": 0.7738227695226669,
"num_tokens": 12565943.0,
"step": 771
},
{
"entropy": 0.5368440747261047,
"epoch": 2.8822429906542055,
"grad_norm": 0.04019741341471672,
"learning_rate": 0.0002,
"loss": 0.5410951375961304,
"mean_token_accuracy": 0.7783905565738678,
"num_tokens": 12582258.0,
"step": 772
},
{
"entropy": 0.5336288064718246,
"epoch": 2.885981308411215,
"grad_norm": 0.034321509301662445,
"learning_rate": 0.0002,
"loss": 0.5328375101089478,
"mean_token_accuracy": 0.784157395362854,
"num_tokens": 12598555.0,
"step": 773
},
{
"entropy": 0.5653717815876007,
"epoch": 2.8897196261682243,
"grad_norm": 0.03593064844608307,
"learning_rate": 0.0002,
"loss": 0.5628952383995056,
"mean_token_accuracy": 0.7731250822544098,
"num_tokens": 12614684.0,
"step": 774
},
{
"entropy": 0.5388960689306259,
"epoch": 2.8934579439252337,
"grad_norm": 0.03794105350971222,
"learning_rate": 0.0002,
"loss": 0.5317496061325073,
"mean_token_accuracy": 0.7814508825540543,
"num_tokens": 12631301.0,
"step": 775
},
{
"entropy": 0.5498441606760025,
"epoch": 2.897196261682243,
"grad_norm": 0.03615562617778778,
"learning_rate": 0.0002,
"loss": 0.5489410161972046,
"mean_token_accuracy": 0.7768700569868088,
"num_tokens": 12647948.0,
"step": 776
},
{
"entropy": 0.5340896248817444,
"epoch": 2.9009345794392525,
"grad_norm": 0.038868315517902374,
"learning_rate": 0.0002,
"loss": 0.5335500836372375,
"mean_token_accuracy": 0.7818741798400879,
"num_tokens": 12664189.0,
"step": 777
},
{
"entropy": 0.5473947077989578,
"epoch": 2.904672897196262,
"grad_norm": 0.04030415788292885,
"learning_rate": 0.0002,
"loss": 0.547685980796814,
"mean_token_accuracy": 0.7762889117002487,
"num_tokens": 12680521.0,
"step": 778
},
{
"entropy": 0.5354717969894409,
"epoch": 2.9084112149532713,
"grad_norm": 0.03963444381952286,
"learning_rate": 0.0002,
"loss": 0.5363295078277588,
"mean_token_accuracy": 0.7828177064657211,
"num_tokens": 12696847.0,
"step": 779
},
{
"entropy": 0.5292405933141708,
"epoch": 2.91214953271028,
"grad_norm": 0.044744838029146194,
"learning_rate": 0.0002,
"loss": 0.5327066779136658,
"mean_token_accuracy": 0.7849072515964508,
"num_tokens": 12713036.0,
"step": 780
},
{
"entropy": 0.52642522752285,
"epoch": 2.9158878504672896,
"grad_norm": 0.04283163696527481,
"learning_rate": 0.0002,
"loss": 0.5329762697219849,
"mean_token_accuracy": 0.7837288975715637,
"num_tokens": 12729209.0,
"step": 781
},
{
"entropy": 0.527685210108757,
"epoch": 2.919626168224299,
"grad_norm": 0.041390661150217056,
"learning_rate": 0.0002,
"loss": 0.5320221185684204,
"mean_token_accuracy": 0.783889576792717,
"num_tokens": 12745655.0,
"step": 782
},
{
"entropy": 0.5404015928506851,
"epoch": 2.9233644859813084,
"grad_norm": 0.040262214839458466,
"learning_rate": 0.0002,
"loss": 0.5304533243179321,
"mean_token_accuracy": 0.7833625972270966,
"num_tokens": 12762029.0,
"step": 783
},
{
"entropy": 0.5551902800798416,
"epoch": 2.9271028037383178,
"grad_norm": 0.0381385013461113,
"learning_rate": 0.0002,
"loss": 0.5540827512741089,
"mean_token_accuracy": 0.774557501077652,
"num_tokens": 12778129.0,
"step": 784
},
{
"entropy": 0.5423577576875687,
"epoch": 2.930841121495327,
"grad_norm": 0.04024689272046089,
"learning_rate": 0.0002,
"loss": 0.5434139370918274,
"mean_token_accuracy": 0.7793742418289185,
"num_tokens": 12794167.0,
"step": 785
},
{
"entropy": 0.5381026417016983,
"epoch": 2.9345794392523366,
"grad_norm": 0.03909367695450783,
"learning_rate": 0.0002,
"loss": 0.540184736251831,
"mean_token_accuracy": 0.7813534885644913,
"num_tokens": 12810454.0,
"step": 786
},
{
"entropy": 0.5301714539527893,
"epoch": 2.938317757009346,
"grad_norm": 0.039717331528663635,
"learning_rate": 0.0002,
"loss": 0.528195858001709,
"mean_token_accuracy": 0.7839880138635635,
"num_tokens": 12826792.0,
"step": 787
},
{
"entropy": 0.5483011454343796,
"epoch": 2.942056074766355,
"grad_norm": 0.04299187660217285,
"learning_rate": 0.0002,
"loss": 0.5469069480895996,
"mean_token_accuracy": 0.7784111201763153,
"num_tokens": 12843156.0,
"step": 788
},
{
"entropy": 0.5493280291557312,
"epoch": 2.9457943925233643,
"grad_norm": 0.03909771516919136,
"learning_rate": 0.0002,
"loss": 0.5475714206695557,
"mean_token_accuracy": 0.7802032381296158,
"num_tokens": 12859513.0,
"step": 789
},
{
"entropy": 0.545919269323349,
"epoch": 2.9495327102803737,
"grad_norm": 0.03977775201201439,
"learning_rate": 0.0002,
"loss": 0.5396496057510376,
"mean_token_accuracy": 0.7824081033468246,
"num_tokens": 12875944.0,
"step": 790
},
{
"entropy": 0.5471485257148743,
"epoch": 2.953271028037383,
"grad_norm": 0.04360375925898552,
"learning_rate": 0.0002,
"loss": 0.546139657497406,
"mean_token_accuracy": 0.7795716971158981,
"num_tokens": 12892408.0,
"step": 791
},
{
"entropy": 0.5483593940734863,
"epoch": 2.9570093457943925,
"grad_norm": 0.03873739019036293,
"learning_rate": 0.0002,
"loss": 0.5458930134773254,
"mean_token_accuracy": 0.7784797698259354,
"num_tokens": 12908878.0,
"step": 792
},
{
"entropy": 0.5327412039041519,
"epoch": 2.960747663551402,
"grad_norm": 0.04030138626694679,
"learning_rate": 0.0002,
"loss": 0.531423032283783,
"mean_token_accuracy": 0.7864594012498856,
"num_tokens": 12925328.0,
"step": 793
},
{
"entropy": 0.5355861634016037,
"epoch": 2.9644859813084112,
"grad_norm": 0.03622936084866524,
"learning_rate": 0.0002,
"loss": 0.5347930192947388,
"mean_token_accuracy": 0.7837072014808655,
"num_tokens": 12941525.0,
"step": 794
},
{
"entropy": 0.5421173870563507,
"epoch": 2.9682242990654206,
"grad_norm": 0.04139631241559982,
"learning_rate": 0.0002,
"loss": 0.5441262125968933,
"mean_token_accuracy": 0.7780770361423492,
"num_tokens": 12957883.0,
"step": 795
},
{
"entropy": 0.5358422696590424,
"epoch": 2.97196261682243,
"grad_norm": 0.04235566407442093,
"learning_rate": 0.0002,
"loss": 0.5453042984008789,
"mean_token_accuracy": 0.780327558517456,
"num_tokens": 12974226.0,
"step": 796
},
{
"entropy": 0.5261758118867874,
"epoch": 2.9757009345794394,
"grad_norm": 0.038478292524814606,
"learning_rate": 0.0002,
"loss": 0.5281113386154175,
"mean_token_accuracy": 0.7872153073549271,
"num_tokens": 12990610.0,
"step": 797
},
{
"entropy": 0.555643692612648,
"epoch": 2.979439252336449,
"grad_norm": 0.03554081916809082,
"learning_rate": 0.0002,
"loss": 0.5489306449890137,
"mean_token_accuracy": 0.7791497707366943,
"num_tokens": 13007012.0,
"step": 798
},
{
"entropy": 0.5474710315465927,
"epoch": 2.983177570093458,
"grad_norm": 0.04082915186882019,
"learning_rate": 0.0002,
"loss": 0.5414685606956482,
"mean_token_accuracy": 0.7802593261003494,
"num_tokens": 13023273.0,
"step": 799
},
{
"entropy": 0.551795169711113,
"epoch": 2.986915887850467,
"grad_norm": 0.03786645457148552,
"learning_rate": 0.0002,
"loss": 0.5478507280349731,
"mean_token_accuracy": 0.7769146114587784,
"num_tokens": 13039409.0,
"step": 800
},
{
"entropy": 0.5366168767213821,
"epoch": 2.9906542056074765,
"grad_norm": 0.04365032911300659,
"learning_rate": 0.0002,
"loss": 0.5442554354667664,
"mean_token_accuracy": 0.7847046703100204,
"num_tokens": 13055837.0,
"step": 801
},
{
"entropy": 0.528346061706543,
"epoch": 2.994392523364486,
"grad_norm": 0.05227791890501976,
"learning_rate": 0.0002,
"loss": 0.5428685545921326,
"mean_token_accuracy": 0.7789010256528854,
"num_tokens": 13072216.0,
"step": 802
},
{
"entropy": 0.5396917909383774,
"epoch": 2.9981308411214953,
"grad_norm": 0.03931191936135292,
"learning_rate": 0.0002,
"loss": 0.5454744696617126,
"mean_token_accuracy": 0.7764900475740433,
"num_tokens": 13088462.0,
"step": 803
},
{
"entropy": 0.5376738607883453,
"epoch": 3.0,
"grad_norm": 0.04954347386956215,
"learning_rate": 0.0002,
"loss": 0.5307910442352295,
"mean_token_accuracy": 0.7855222225189209,
"num_tokens": 13096612.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2194419027224822e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}