DiscoverLM-70M / trainer_state.json
reaperdoesntknow's picture
Rename trainer_state (2).json to trainer_state.json
f93c331 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 500,
"global_step": 512,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 3.246363878250122,
"epoch": 0.015625,
"grad_norm": 1.8967702388763428,
"learning_rate": 0.0003,
"loss": 3.182457685470581,
"mean_token_accuracy": 0.5472440719604492,
"num_tokens": 512.0,
"step": 1
},
{
"entropy": 5.435122966766357,
"epoch": 0.03125,
"grad_norm": 12.227993965148926,
"learning_rate": 0.0002999971762923901,
"loss": 3.6484947204589844,
"mean_token_accuracy": 0.5498008131980896,
"num_tokens": 1024.0,
"step": 2
},
{
"entropy": 3.1172115802764893,
"epoch": 0.046875,
"grad_norm": 5.236744403839111,
"learning_rate": 0.00029998870527587167,
"loss": 3.2157859802246094,
"mean_token_accuracy": 0.5445343852043152,
"num_tokens": 1536.0,
"step": 3
},
{
"entropy": 2.616081714630127,
"epoch": 0.0625,
"grad_norm": 5.9123759269714355,
"learning_rate": 0.0002999745872693735,
"loss": 3.129127264022827,
"mean_token_accuracy": 0.5597609281539917,
"num_tokens": 2048.0,
"step": 4
},
{
"entropy": 2.709519386291504,
"epoch": 0.078125,
"grad_norm": 6.461076736450195,
"learning_rate": 0.0002999548228044306,
"loss": 3.373767852783203,
"mean_token_accuracy": 0.5413385629653931,
"num_tokens": 2560.0,
"step": 5
},
{
"entropy": 3.1329281330108643,
"epoch": 0.09375,
"grad_norm": 3.9384171962738037,
"learning_rate": 0.00029992941262516396,
"loss": 2.912034034729004,
"mean_token_accuracy": 0.5628865957260132,
"num_tokens": 3072.0,
"step": 6
},
{
"entropy": 3.3683834075927734,
"epoch": 0.109375,
"grad_norm": 18.561269760131836,
"learning_rate": 0.0002998983576882524,
"loss": 3.3030648231506348,
"mean_token_accuracy": 0.40442654490470886,
"num_tokens": 3584.0,
"step": 7
},
{
"entropy": 3.167931079864502,
"epoch": 0.125,
"grad_norm": 16.356603622436523,
"learning_rate": 0.0002998616591628968,
"loss": 3.1349599361419678,
"mean_token_accuracy": 0.48065173625946045,
"num_tokens": 4096.0,
"step": 8
},
{
"entropy": 2.8782107830047607,
"epoch": 0.140625,
"grad_norm": 2.5236713886260986,
"learning_rate": 0.00029981931843077583,
"loss": 2.7147700786590576,
"mean_token_accuracy": 0.6140725016593933,
"num_tokens": 4608.0,
"step": 9
},
{
"entropy": 2.622675657272339,
"epoch": 0.15625,
"grad_norm": 11.60867691040039,
"learning_rate": 0.0002997713370859942,
"loss": 3.0942561626434326,
"mean_token_accuracy": 0.57485032081604,
"num_tokens": 5120.0,
"step": 10
},
{
"entropy": 2.784639835357666,
"epoch": 0.171875,
"grad_norm": 8.239684104919434,
"learning_rate": 0.0002997177169350223,
"loss": 3.309100866317749,
"mean_token_accuracy": 0.5496957302093506,
"num_tokens": 5632.0,
"step": 11
},
{
"entropy": 3.0426368713378906,
"epoch": 0.1875,
"grad_norm": 6.443057537078857,
"learning_rate": 0.00029965845999662874,
"loss": 3.346451759338379,
"mean_token_accuracy": 0.5492125749588013,
"num_tokens": 6144.0,
"step": 12
},
{
"entropy": 3.314704656600952,
"epoch": 0.203125,
"grad_norm": 3.021179676055908,
"learning_rate": 0.0002995935685018035,
"loss": 3.1350901126861572,
"mean_token_accuracy": 0.5705645084381104,
"num_tokens": 6656.0,
"step": 13
},
{
"entropy": 3.367347002029419,
"epoch": 0.21875,
"grad_norm": 6.598303318023682,
"learning_rate": 0.0002995230448936748,
"loss": 3.036130428314209,
"mean_token_accuracy": 0.5705645084381104,
"num_tokens": 7168.0,
"step": 14
},
{
"entropy": 3.309898614883423,
"epoch": 0.234375,
"grad_norm": 9.488039016723633,
"learning_rate": 0.00029944689182741664,
"loss": 3.060746431350708,
"mean_token_accuracy": 0.5546558499336243,
"num_tokens": 7680.0,
"step": 15
},
{
"entropy": 3.3201732635498047,
"epoch": 0.25,
"grad_norm": 6.782240390777588,
"learning_rate": 0.00029936511217014893,
"loss": 3.138178586959839,
"mean_token_accuracy": 0.5295275449752808,
"num_tokens": 8192.0,
"step": 16
},
{
"entropy": 3.141629934310913,
"epoch": 0.265625,
"grad_norm": 2.4261715412139893,
"learning_rate": 0.00029927770900082954,
"loss": 2.9288454055786133,
"mean_token_accuracy": 0.5950413346290588,
"num_tokens": 8704.0,
"step": 17
},
{
"entropy": 3.184690475463867,
"epoch": 0.28125,
"grad_norm": 2.769068479537964,
"learning_rate": 0.0002991846856101383,
"loss": 2.7941722869873047,
"mean_token_accuracy": 0.590436577796936,
"num_tokens": 9216.0,
"step": 18
},
{
"entropy": 3.0208375453948975,
"epoch": 0.296875,
"grad_norm": 2.609088182449341,
"learning_rate": 0.0002990860455003534,
"loss": 2.9833905696868896,
"mean_token_accuracy": 0.5841785073280334,
"num_tokens": 9728.0,
"step": 19
},
{
"entropy": 2.88010573387146,
"epoch": 0.3125,
"grad_norm": 2.748533248901367,
"learning_rate": 0.00029898179238521916,
"loss": 2.6276729106903076,
"mean_token_accuracy": 0.6270833611488342,
"num_tokens": 10240.0,
"step": 20
},
{
"entropy": 2.6639840602874756,
"epoch": 0.328125,
"grad_norm": 2.043412923812866,
"learning_rate": 0.0002988719301898065,
"loss": 2.456700325012207,
"mean_token_accuracy": 0.6457023024559021,
"num_tokens": 10752.0,
"step": 21
},
{
"entropy": 3.010587453842163,
"epoch": 0.34375,
"grad_norm": 4.867815017700195,
"learning_rate": 0.0002987564630503649,
"loss": 2.8242549896240234,
"mean_token_accuracy": 0.5879917144775391,
"num_tokens": 11264.0,
"step": 22
},
{
"entropy": 2.9374547004699707,
"epoch": 0.359375,
"grad_norm": 4.809256076812744,
"learning_rate": 0.000298635395314167,
"loss": 2.4628262519836426,
"mean_token_accuracy": 0.6552462577819824,
"num_tokens": 11776.0,
"step": 23
},
{
"entropy": 3.3613696098327637,
"epoch": 0.375,
"grad_norm": 3.2862000465393066,
"learning_rate": 0.00029850873153934457,
"loss": 3.3322877883911133,
"mean_token_accuracy": 0.5728346705436707,
"num_tokens": 12288.0,
"step": 24
},
{
"entropy": 3.3719234466552734,
"epoch": 0.390625,
"grad_norm": 2.0418403148651123,
"learning_rate": 0.00029837647649471715,
"loss": 2.932267427444458,
"mean_token_accuracy": 0.5895372033119202,
"num_tokens": 12800.0,
"step": 25
},
{
"entropy": 2.842104911804199,
"epoch": 0.40625,
"grad_norm": 2.131535291671753,
"learning_rate": 0.0002982386351596124,
"loss": 2.8979134559631348,
"mean_token_accuracy": 0.6000000238418579,
"num_tokens": 13312.0,
"step": 26
},
{
"entropy": 2.60341215133667,
"epoch": 0.421875,
"grad_norm": 1.886840581893921,
"learning_rate": 0.00029809521272367874,
"loss": 2.448031425476074,
"mean_token_accuracy": 0.6652360558509827,
"num_tokens": 13824.0,
"step": 27
},
{
"entropy": 2.783334732055664,
"epoch": 0.4375,
"grad_norm": 2.298892021179199,
"learning_rate": 0.0002979462145866898,
"loss": 2.8729963302612305,
"mean_token_accuracy": 0.6239495873451233,
"num_tokens": 14336.0,
"step": 28
},
{
"entropy": 3.2722766399383545,
"epoch": 0.453125,
"grad_norm": 1.7541515827178955,
"learning_rate": 0.00029779164635834114,
"loss": 2.9888410568237305,
"mean_token_accuracy": 0.5971943736076355,
"num_tokens": 14848.0,
"step": 29
},
{
"entropy": 3.3545026779174805,
"epoch": 0.46875,
"grad_norm": 1.9528993368148804,
"learning_rate": 0.0002976315138580393,
"loss": 2.4215219020843506,
"mean_token_accuracy": 0.6466809511184692,
"num_tokens": 15360.0,
"step": 30
},
{
"entropy": 3.3544554710388184,
"epoch": 0.484375,
"grad_norm": 2.7539281845092773,
"learning_rate": 0.00029746582311468244,
"loss": 2.8554189205169678,
"mean_token_accuracy": 0.6028806567192078,
"num_tokens": 15872.0,
"step": 31
},
{
"entropy": 2.996553421020508,
"epoch": 0.5,
"grad_norm": 2.825383424758911,
"learning_rate": 0.0002972945803664333,
"loss": 2.8956212997436523,
"mean_token_accuracy": 0.5841785073280334,
"num_tokens": 16384.0,
"step": 32
},
{
"entropy": 2.7225828170776367,
"epoch": 0.515625,
"grad_norm": 1.8512521982192993,
"learning_rate": 0.00029711779206048454,
"loss": 2.828862190246582,
"mean_token_accuracy": 0.586614191532135,
"num_tokens": 16896.0,
"step": 33
},
{
"entropy": 2.5095906257629395,
"epoch": 0.53125,
"grad_norm": 2.991373062133789,
"learning_rate": 0.00029693546485281603,
"loss": 2.879727363586426,
"mean_token_accuracy": 0.6134969592094421,
"num_tokens": 17408.0,
"step": 34
},
{
"entropy": 2.993544340133667,
"epoch": 0.546875,
"grad_norm": 2.760350465774536,
"learning_rate": 0.0002967476056079441,
"loss": 3.0702333450317383,
"mean_token_accuracy": 0.5669291615486145,
"num_tokens": 17920.0,
"step": 35
},
{
"entropy": 3.340437650680542,
"epoch": 0.5625,
"grad_norm": 3.0262949466705322,
"learning_rate": 0.0002965542213986631,
"loss": 3.17567777633667,
"mean_token_accuracy": 0.5472440719604492,
"num_tokens": 18432.0,
"step": 36
},
{
"entropy": 3.2044739723205566,
"epoch": 0.578125,
"grad_norm": 6.876502990722656,
"learning_rate": 0.00029635531950577925,
"loss": 2.78464674949646,
"mean_token_accuracy": 0.6036961078643799,
"num_tokens": 18944.0,
"step": 37
},
{
"entropy": 3.24843692779541,
"epoch": 0.59375,
"grad_norm": 5.545853137969971,
"learning_rate": 0.00029615090741783636,
"loss": 2.7331504821777344,
"mean_token_accuracy": 0.608433723449707,
"num_tokens": 19456.0,
"step": 38
},
{
"entropy": 3.1350715160369873,
"epoch": 0.609375,
"grad_norm": 3.3395559787750244,
"learning_rate": 0.000295940992830834,
"loss": 3.300499677658081,
"mean_token_accuracy": 0.5590550899505615,
"num_tokens": 19968.0,
"step": 39
},
{
"entropy": 3.0839738845825195,
"epoch": 0.625,
"grad_norm": 2.1904847621917725,
"learning_rate": 0.00029572558364793775,
"loss": 3.1389362812042236,
"mean_token_accuracy": 0.5748031735420227,
"num_tokens": 20480.0,
"step": 40
},
{
"entropy": 2.964693546295166,
"epoch": 0.640625,
"grad_norm": 1.7939658164978027,
"learning_rate": 0.0002955046879791816,
"loss": 3.1034960746765137,
"mean_token_accuracy": 0.5610235929489136,
"num_tokens": 20992.0,
"step": 41
},
{
"entropy": 2.911698818206787,
"epoch": 0.65625,
"grad_norm": 1.7506498098373413,
"learning_rate": 0.0002952783141411626,
"loss": 2.3598453998565674,
"mean_token_accuracy": 0.6395833492279053,
"num_tokens": 21504.0,
"step": 42
},
{
"entropy": 2.961909770965576,
"epoch": 0.671875,
"grad_norm": 3.6559484004974365,
"learning_rate": 0.00029504647065672776,
"loss": 3.004157304763794,
"mean_token_accuracy": 0.5748031735420227,
"num_tokens": 22016.0,
"step": 43
},
{
"entropy": 2.7874066829681396,
"epoch": 0.6875,
"grad_norm": 3.728947162628174,
"learning_rate": 0.00029480916625465337,
"loss": 2.615809440612793,
"mean_token_accuracy": 0.6064257025718689,
"num_tokens": 22528.0,
"step": 44
},
{
"entropy": 2.7361326217651367,
"epoch": 0.703125,
"grad_norm": 6.112745761871338,
"learning_rate": 0.00029456640986931596,
"loss": 2.6459853649139404,
"mean_token_accuracy": 0.6361746191978455,
"num_tokens": 23040.0,
"step": 45
},
{
"entropy": 2.6819934844970703,
"epoch": 0.71875,
"grad_norm": 5.611839294433594,
"learning_rate": 0.0002943182106403562,
"loss": 2.431462287902832,
"mean_token_accuracy": 0.6595744490623474,
"num_tokens": 23552.0,
"step": 46
},
{
"entropy": 2.5808193683624268,
"epoch": 0.734375,
"grad_norm": 3.8999392986297607,
"learning_rate": 0.0002940645779123348,
"loss": 2.77005934715271,
"mean_token_accuracy": 0.6206896305084229,
"num_tokens": 24064.0,
"step": 47
},
{
"entropy": 2.98895001411438,
"epoch": 0.75,
"grad_norm": 2.57936954498291,
"learning_rate": 0.0002938055212343807,
"loss": 2.8034496307373047,
"mean_token_accuracy": 0.6020202040672302,
"num_tokens": 24576.0,
"step": 48
},
{
"entropy": 2.952457904815674,
"epoch": 0.765625,
"grad_norm": 5.904950141906738,
"learning_rate": 0.0002935410503598313,
"loss": 3.1076672077178955,
"mean_token_accuracy": 0.5944882035255432,
"num_tokens": 25088.0,
"step": 49
},
{
"entropy": 2.9851343631744385,
"epoch": 0.78125,
"grad_norm": 8.890862464904785,
"learning_rate": 0.0002932711752458656,
"loss": 2.2850301265716553,
"mean_token_accuracy": 0.6666666865348816,
"num_tokens": 25600.0,
"step": 50
},
{
"entropy": 3.0929861068725586,
"epoch": 0.796875,
"grad_norm": 6.197035789489746,
"learning_rate": 0.00029299590605312906,
"loss": 2.604055166244507,
"mean_token_accuracy": 0.6183673739433289,
"num_tokens": 26112.0,
"step": 51
},
{
"entropy": 2.9722883701324463,
"epoch": 0.8125,
"grad_norm": 7.0517473220825195,
"learning_rate": 0.00029271525314535123,
"loss": 2.4527275562286377,
"mean_token_accuracy": 0.6301652789115906,
"num_tokens": 26624.0,
"step": 52
},
{
"entropy": 2.6360151767730713,
"epoch": 0.828125,
"grad_norm": 9.385331153869629,
"learning_rate": 0.00029242922708895547,
"loss": 2.614161729812622,
"mean_token_accuracy": 0.6169354915618896,
"num_tokens": 27136.0,
"step": 53
},
{
"entropy": 2.6233577728271484,
"epoch": 0.84375,
"grad_norm": 9.205692291259766,
"learning_rate": 0.00029213783865266114,
"loss": 2.693942070007324,
"mean_token_accuracy": 0.6159999966621399,
"num_tokens": 27648.0,
"step": 54
},
{
"entropy": 2.5832924842834473,
"epoch": 0.859375,
"grad_norm": 4.841373920440674,
"learning_rate": 0.0002918410988070782,
"loss": 2.87924861907959,
"mean_token_accuracy": 0.5991984009742737,
"num_tokens": 28160.0,
"step": 55
},
{
"entropy": 2.815551280975342,
"epoch": 0.875,
"grad_norm": 5.897908687591553,
"learning_rate": 0.00029153901872429404,
"loss": 2.737504720687866,
"mean_token_accuracy": 0.5767716765403748,
"num_tokens": 28672.0,
"step": 56
},
{
"entropy": 2.7117974758148193,
"epoch": 0.890625,
"grad_norm": 1.9384305477142334,
"learning_rate": 0.00029123160977745306,
"loss": 2.689833164215088,
"mean_token_accuracy": 0.6227180361747742,
"num_tokens": 29184.0,
"step": 57
},
{
"entropy": 3.006451368331909,
"epoch": 0.90625,
"grad_norm": 11.659981727600098,
"learning_rate": 0.00029091888354032845,
"loss": 2.6457479000091553,
"mean_token_accuracy": 0.6306122541427612,
"num_tokens": 29696.0,
"step": 58
},
{
"entropy": 3.347648859024048,
"epoch": 0.921875,
"grad_norm": 13.3857421875,
"learning_rate": 0.0002906008517868862,
"loss": 3.0881667137145996,
"mean_token_accuracy": 0.5826771855354309,
"num_tokens": 30208.0,
"step": 59
},
{
"entropy": 3.1437995433807373,
"epoch": 0.9375,
"grad_norm": 11.784278869628906,
"learning_rate": 0.0002902775264908421,
"loss": 2.768523931503296,
"mean_token_accuracy": 0.6226804256439209,
"num_tokens": 30720.0,
"step": 60
},
{
"entropy": 3.3461129665374756,
"epoch": 0.953125,
"grad_norm": 14.32170295715332,
"learning_rate": 0.0002899489198252108,
"loss": 2.813835859298706,
"mean_token_accuracy": 0.608961284160614,
"num_tokens": 31232.0,
"step": 61
},
{
"entropy": 3.2309908866882324,
"epoch": 0.96875,
"grad_norm": 10.945052146911621,
"learning_rate": 0.00028961504416184753,
"loss": 2.9945812225341797,
"mean_token_accuracy": 0.586614191532135,
"num_tokens": 31744.0,
"step": 62
},
{
"entropy": 3.0333917140960693,
"epoch": 0.984375,
"grad_norm": 10.011848449707031,
"learning_rate": 0.00028927591207098235,
"loss": 2.868558645248413,
"mean_token_accuracy": 0.5984252095222473,
"num_tokens": 32256.0,
"step": 63
},
{
"entropy": 2.871372938156128,
"epoch": 1.0,
"grad_norm": 3.1529769897460938,
"learning_rate": 0.0002889315363207467,
"loss": 2.403618335723877,
"mean_token_accuracy": 0.6575630307197571,
"num_tokens": 32768.0,
"step": 64
},
{
"entropy": 2.411283016204834,
"epoch": 1.015625,
"grad_norm": 5.358489513397217,
"learning_rate": 0.000288581929876693,
"loss": 2.1208901405334473,
"mean_token_accuracy": 0.6616702079772949,
"num_tokens": 33280.0,
"step": 65
},
{
"entropy": 2.3217580318450928,
"epoch": 1.03125,
"grad_norm": 2.0556492805480957,
"learning_rate": 0.0002882271059013063,
"loss": 2.562404155731201,
"mean_token_accuracy": 0.6129032373428345,
"num_tokens": 33792.0,
"step": 66
},
{
"entropy": 2.6636924743652344,
"epoch": 1.046875,
"grad_norm": 13.256611824035645,
"learning_rate": 0.0002878670777535087,
"loss": 2.492720127105713,
"mean_token_accuracy": 0.585170328617096,
"num_tokens": 34304.0,
"step": 67
},
{
"entropy": 2.7850069999694824,
"epoch": 1.0625,
"grad_norm": 18.333816528320312,
"learning_rate": 0.0002875018589881564,
"loss": 2.679954767227173,
"mean_token_accuracy": 0.573122501373291,
"num_tokens": 34816.0,
"step": 68
},
{
"entropy": 2.7594878673553467,
"epoch": 1.078125,
"grad_norm": 15.908726692199707,
"learning_rate": 0.0002871314633555296,
"loss": 2.299175262451172,
"mean_token_accuracy": 0.6004056930541992,
"num_tokens": 35328.0,
"step": 69
},
{
"entropy": 3.0297257900238037,
"epoch": 1.09375,
"grad_norm": 9.716331481933594,
"learning_rate": 0.0002867559048008145,
"loss": 2.834432363510132,
"mean_token_accuracy": 0.5748031735420227,
"num_tokens": 35840.0,
"step": 70
},
{
"entropy": 3.0106217861175537,
"epoch": 1.109375,
"grad_norm": 6.021564960479736,
"learning_rate": 0.0002863751974635783,
"loss": 2.832149028778076,
"mean_token_accuracy": 0.5708661675453186,
"num_tokens": 36352.0,
"step": 71
},
{
"entropy": 2.9219532012939453,
"epoch": 1.125,
"grad_norm": 9.39206314086914,
"learning_rate": 0.0002859893556772373,
"loss": 2.65635085105896,
"mean_token_accuracy": 0.5787401795387268,
"num_tokens": 36864.0,
"step": 72
},
{
"entropy": 2.6846885681152344,
"epoch": 1.140625,
"grad_norm": 10.221190452575684,
"learning_rate": 0.0002855983939685165,
"loss": 2.652174949645996,
"mean_token_accuracy": 0.6043307185173035,
"num_tokens": 37376.0,
"step": 73
},
{
"entropy": 2.483905792236328,
"epoch": 1.15625,
"grad_norm": 8.940083503723145,
"learning_rate": 0.0002852023270569033,
"loss": 2.270918369293213,
"mean_token_accuracy": 0.6493775844573975,
"num_tokens": 37888.0,
"step": 74
},
{
"entropy": 2.4425692558288574,
"epoch": 1.171875,
"grad_norm": 5.5992512702941895,
"learning_rate": 0.00028480116985409303,
"loss": 1.9062901735305786,
"mean_token_accuracy": 0.6832971572875977,
"num_tokens": 38400.0,
"step": 75
},
{
"entropy": 2.611058235168457,
"epoch": 1.1875,
"grad_norm": 4.461435317993164,
"learning_rate": 0.00028439493746342773,
"loss": 2.5850863456726074,
"mean_token_accuracy": 0.5925197005271912,
"num_tokens": 38912.0,
"step": 76
},
{
"entropy": 2.6180343627929688,
"epoch": 1.203125,
"grad_norm": 8.797931671142578,
"learning_rate": 0.00028398364517932725,
"loss": 2.809943437576294,
"mean_token_accuracy": 0.5925197005271912,
"num_tokens": 39424.0,
"step": 77
},
{
"entropy": 2.826270341873169,
"epoch": 1.21875,
"grad_norm": 6.125537872314453,
"learning_rate": 0.0002835673084867137,
"loss": 2.7826144695281982,
"mean_token_accuracy": 0.5708661675453186,
"num_tokens": 39936.0,
"step": 78
},
{
"entropy": 2.6673154830932617,
"epoch": 1.234375,
"grad_norm": 3.9816977977752686,
"learning_rate": 0.0002831459430604281,
"loss": 2.5085325241088867,
"mean_token_accuracy": 0.6043307185173035,
"num_tokens": 40448.0,
"step": 79
},
{
"entropy": 2.777114152908325,
"epoch": 1.25,
"grad_norm": 4.296886444091797,
"learning_rate": 0.00028271956476464067,
"loss": 2.4793484210968018,
"mean_token_accuracy": 0.6035503149032593,
"num_tokens": 40960.0,
"step": 80
},
{
"entropy": 2.671078681945801,
"epoch": 1.265625,
"grad_norm": 3.269895315170288,
"learning_rate": 0.0002822881896522532,
"loss": 2.2512905597686768,
"mean_token_accuracy": 0.6570248007774353,
"num_tokens": 41472.0,
"step": 81
},
{
"entropy": 2.6435415744781494,
"epoch": 1.28125,
"grad_norm": 2.195950746536255,
"learning_rate": 0.000281851833964295,
"loss": 2.6383659839630127,
"mean_token_accuracy": 0.5826771855354309,
"num_tokens": 41984.0,
"step": 82
},
{
"entropy": 2.531452178955078,
"epoch": 1.296875,
"grad_norm": 11.792201042175293,
"learning_rate": 0.00028141051412931096,
"loss": 2.146348237991333,
"mean_token_accuracy": 0.6505263447761536,
"num_tokens": 42496.0,
"step": 83
},
{
"entropy": 2.572371006011963,
"epoch": 1.3125,
"grad_norm": 8.870929718017578,
"learning_rate": 0.00028096424676274346,
"loss": 2.3176331520080566,
"mean_token_accuracy": 0.6183673739433289,
"num_tokens": 43008.0,
"step": 84
},
{
"entropy": 2.348723888397217,
"epoch": 1.328125,
"grad_norm": 14.476244926452637,
"learning_rate": 0.0002805130486663067,
"loss": 2.4164490699768066,
"mean_token_accuracy": 0.6023622155189514,
"num_tokens": 43520.0,
"step": 85
},
{
"entropy": 2.422470808029175,
"epoch": 1.34375,
"grad_norm": 14.661328315734863,
"learning_rate": 0.00028005693682735385,
"loss": 2.112380027770996,
"mean_token_accuracy": 0.6382536292076111,
"num_tokens": 44032.0,
"step": 86
},
{
"entropy": 2.3329508304595947,
"epoch": 1.359375,
"grad_norm": 2.964010000228882,
"learning_rate": 0.000279595928418238,
"loss": 2.1712806224823,
"mean_token_accuracy": 0.6480331420898438,
"num_tokens": 44544.0,
"step": 87
},
{
"entropy": 2.69111967086792,
"epoch": 1.375,
"grad_norm": 10.743988990783691,
"learning_rate": 0.00027913004079566507,
"loss": 2.5868759155273438,
"mean_token_accuracy": 0.586614191532135,
"num_tokens": 45056.0,
"step": 88
},
{
"entropy": 2.4709086418151855,
"epoch": 1.390625,
"grad_norm": 11.36965560913086,
"learning_rate": 0.0002786592915000408,
"loss": 2.68729829788208,
"mean_token_accuracy": 0.6062992215156555,
"num_tokens": 45568.0,
"step": 89
},
{
"entropy": 2.602285385131836,
"epoch": 1.40625,
"grad_norm": 8.707329750061035,
"learning_rate": 0.00027818369825481,
"loss": 2.1857831478118896,
"mean_token_accuracy": 0.6625514626502991,
"num_tokens": 46080.0,
"step": 90
},
{
"entropy": 2.6694841384887695,
"epoch": 1.421875,
"grad_norm": 3.59173846244812,
"learning_rate": 0.0002777032789657898,
"loss": 2.162902355194092,
"mean_token_accuracy": 0.6695095896720886,
"num_tokens": 46592.0,
"step": 91
},
{
"entropy": 2.4398598670959473,
"epoch": 1.4375,
"grad_norm": 2.533829689025879,
"learning_rate": 0.00027721805172049456,
"loss": 2.2611405849456787,
"mean_token_accuracy": 0.6639510989189148,
"num_tokens": 47104.0,
"step": 92
},
{
"entropy": 2.4955785274505615,
"epoch": 1.453125,
"grad_norm": 6.045874118804932,
"learning_rate": 0.000276728034787456,
"loss": 2.6435904502868652,
"mean_token_accuracy": 0.5964567065238953,
"num_tokens": 47616.0,
"step": 93
},
{
"entropy": 2.6504571437835693,
"epoch": 1.46875,
"grad_norm": 7.782134532928467,
"learning_rate": 0.00027623324661553477,
"loss": 2.2059261798858643,
"mean_token_accuracy": 0.6631799340248108,
"num_tokens": 48128.0,
"step": 94
},
{
"entropy": 2.8098762035369873,
"epoch": 1.484375,
"grad_norm": 4.1661763191223145,
"learning_rate": 0.00027573370583322565,
"loss": 2.0274105072021484,
"mean_token_accuracy": 0.6802574992179871,
"num_tokens": 48640.0,
"step": 95
},
{
"entropy": 2.6600871086120605,
"epoch": 1.5,
"grad_norm": 5.105381488800049,
"learning_rate": 0.000275229431247957,
"loss": 2.1700079441070557,
"mean_token_accuracy": 0.6549586653709412,
"num_tokens": 49152.0,
"step": 96
},
{
"entropy": 2.47259783744812,
"epoch": 1.515625,
"grad_norm": 3.5480713844299316,
"learning_rate": 0.0002747204418453818,
"loss": 2.4075517654418945,
"mean_token_accuracy": 0.6232464909553528,
"num_tokens": 49664.0,
"step": 97
},
{
"entropy": 2.4480819702148438,
"epoch": 1.53125,
"grad_norm": 4.602890968322754,
"learning_rate": 0.00027420675678866335,
"loss": 2.44067120552063,
"mean_token_accuracy": 0.6198019981384277,
"num_tokens": 50176.0,
"step": 98
},
{
"entropy": 2.316689968109131,
"epoch": 1.546875,
"grad_norm": 3.827681303024292,
"learning_rate": 0.0002736883954177538,
"loss": 2.0741045475006104,
"mean_token_accuracy": 0.676171064376831,
"num_tokens": 50688.0,
"step": 99
},
{
"entropy": 2.5585527420043945,
"epoch": 1.5625,
"grad_norm": 3.0183746814727783,
"learning_rate": 0.00027316537724866565,
"loss": 2.4706835746765137,
"mean_token_accuracy": 0.6270161271095276,
"num_tokens": 51200.0,
"step": 100
},
{
"entropy": 2.5568273067474365,
"epoch": 1.578125,
"grad_norm": 5.168515682220459,
"learning_rate": 0.0002726377219727375,
"loss": 2.2053163051605225,
"mean_token_accuracy": 0.6666666865348816,
"num_tokens": 51712.0,
"step": 101
},
{
"entropy": 2.6455063819885254,
"epoch": 1.59375,
"grad_norm": 2.874056100845337,
"learning_rate": 0.00027210544945589223,
"loss": 2.0263545513153076,
"mean_token_accuracy": 0.6826722621917725,
"num_tokens": 52224.0,
"step": 102
},
{
"entropy": 2.610264301300049,
"epoch": 1.609375,
"grad_norm": 3.7625222206115723,
"learning_rate": 0.0002715685797378892,
"loss": 2.221018075942993,
"mean_token_accuracy": 0.6285714507102966,
"num_tokens": 52736.0,
"step": 103
},
{
"entropy": 2.410818576812744,
"epoch": 1.625,
"grad_norm": 2.1665024757385254,
"learning_rate": 0.0002710271330315699,
"loss": 2.3151586055755615,
"mean_token_accuracy": 0.6328600645065308,
"num_tokens": 53248.0,
"step": 104
},
{
"entropy": 2.333315372467041,
"epoch": 1.640625,
"grad_norm": 4.645427227020264,
"learning_rate": 0.0002704811297220967,
"loss": 2.478018045425415,
"mean_token_accuracy": 0.5944882035255432,
"num_tokens": 53760.0,
"step": 105
},
{
"entropy": 2.4208340644836426,
"epoch": 1.65625,
"grad_norm": 7.0705952644348145,
"learning_rate": 0.0002699305903661857,
"loss": 1.7686328887939453,
"mean_token_accuracy": 0.7170626521110535,
"num_tokens": 54272.0,
"step": 106
},
{
"entropy": 2.5010931491851807,
"epoch": 1.671875,
"grad_norm": 2.786400556564331,
"learning_rate": 0.0002693755356913325,
"loss": 2.338402509689331,
"mean_token_accuracy": 0.613545835018158,
"num_tokens": 54784.0,
"step": 107
},
{
"entropy": 2.2968597412109375,
"epoch": 1.6875,
"grad_norm": 3.2380192279815674,
"learning_rate": 0.00026881598659503185,
"loss": 1.6511207818984985,
"mean_token_accuracy": 0.7251082062721252,
"num_tokens": 55296.0,
"step": 108
},
{
"entropy": 2.4909420013427734,
"epoch": 1.703125,
"grad_norm": 2.494285821914673,
"learning_rate": 0.00026825196414399094,
"loss": 2.376694440841675,
"mean_token_accuracy": 0.6358267664909363,
"num_tokens": 55808.0,
"step": 109
},
{
"entropy": 2.5020644664764404,
"epoch": 1.71875,
"grad_norm": 4.628073692321777,
"learning_rate": 0.00026768348957333625,
"loss": 1.9303858280181885,
"mean_token_accuracy": 0.6993603706359863,
"num_tokens": 56320.0,
"step": 110
},
{
"entropy": 2.327064275741577,
"epoch": 1.734375,
"grad_norm": 3.5953564643859863,
"learning_rate": 0.00026711058428581416,
"loss": 2.2964227199554443,
"mean_token_accuracy": 0.6666666865348816,
"num_tokens": 56832.0,
"step": 111
},
{
"entropy": 2.2144789695739746,
"epoch": 1.75,
"grad_norm": 17.284832000732422,
"learning_rate": 0.0002665332698509848,
"loss": 2.381066083908081,
"mean_token_accuracy": 0.6338582634925842,
"num_tokens": 57344.0,
"step": 112
},
{
"entropy": 2.454603672027588,
"epoch": 1.765625,
"grad_norm": 18.100194931030273,
"learning_rate": 0.0002659515680044105,
"loss": 2.4117445945739746,
"mean_token_accuracy": 0.6487026214599609,
"num_tokens": 57856.0,
"step": 113
},
{
"entropy": 2.4259469509124756,
"epoch": 1.78125,
"grad_norm": 5.293238639831543,
"learning_rate": 0.00026536550064683697,
"loss": 2.249678373336792,
"mean_token_accuracy": 0.6659836173057556,
"num_tokens": 58368.0,
"step": 114
},
{
"entropy": 2.419496536254883,
"epoch": 1.796875,
"grad_norm": 3.008138418197632,
"learning_rate": 0.0002647750898433688,
"loss": 1.8539735078811646,
"mean_token_accuracy": 0.7130434513092041,
"num_tokens": 58880.0,
"step": 115
},
{
"entropy": 2.444873809814453,
"epoch": 1.8125,
"grad_norm": 5.0132060050964355,
"learning_rate": 0.00026418035782263923,
"loss": 1.8831861019134521,
"mean_token_accuracy": 0.6965811848640442,
"num_tokens": 59392.0,
"step": 116
},
{
"entropy": 2.5142552852630615,
"epoch": 1.828125,
"grad_norm": 3.7436330318450928,
"learning_rate": 0.00026358132697597265,
"loss": 2.437363624572754,
"mean_token_accuracy": 0.6220472455024719,
"num_tokens": 59904.0,
"step": 117
},
{
"entropy": 2.6384663581848145,
"epoch": 1.84375,
"grad_norm": 3.390326976776123,
"learning_rate": 0.00026297801985654184,
"loss": 2.490990161895752,
"mean_token_accuracy": 0.6102362275123596,
"num_tokens": 60416.0,
"step": 118
},
{
"entropy": 2.204195261001587,
"epoch": 1.859375,
"grad_norm": 2.7175426483154297,
"learning_rate": 0.0002623704591785189,
"loss": 2.220968246459961,
"mean_token_accuracy": 0.6434262990951538,
"num_tokens": 60928.0,
"step": 119
},
{
"entropy": 2.1468725204467773,
"epoch": 1.875,
"grad_norm": 14.231541633605957,
"learning_rate": 0.0002617586678162199,
"loss": 2.3230385780334473,
"mean_token_accuracy": 0.6220472455024719,
"num_tokens": 61440.0,
"step": 120
},
{
"entropy": 2.3221914768218994,
"epoch": 1.890625,
"grad_norm": 19.32585906982422,
"learning_rate": 0.00026114266880324387,
"loss": 2.1758298873901367,
"mean_token_accuracy": 0.6481481194496155,
"num_tokens": 61952.0,
"step": 121
},
{
"entropy": 2.2526302337646484,
"epoch": 1.90625,
"grad_norm": 9.51953411102295,
"learning_rate": 0.0002605224853316054,
"loss": 2.22743558883667,
"mean_token_accuracy": 0.6179999709129333,
"num_tokens": 62464.0,
"step": 122
},
{
"entropy": 2.501652956008911,
"epoch": 1.921875,
"grad_norm": 11.819076538085938,
"learning_rate": 0.00025989814075086186,
"loss": 2.346043825149536,
"mean_token_accuracy": 0.6244897842407227,
"num_tokens": 62976.0,
"step": 123
},
{
"entropy": 2.394810676574707,
"epoch": 1.9375,
"grad_norm": 18.02789306640625,
"learning_rate": 0.00025926965856723375,
"loss": 2.325348138809204,
"mean_token_accuracy": 0.6220472455024719,
"num_tokens": 63488.0,
"step": 124
},
{
"entropy": 2.577543020248413,
"epoch": 1.953125,
"grad_norm": 27.88809585571289,
"learning_rate": 0.00025863706244272003,
"loss": 2.1486191749572754,
"mean_token_accuracy": 0.6508264541625977,
"num_tokens": 64000.0,
"step": 125
},
{
"entropy": 2.4816625118255615,
"epoch": 1.96875,
"grad_norm": 9.078181266784668,
"learning_rate": 0.0002580003761942072,
"loss": 2.2877392768859863,
"mean_token_accuracy": 0.6141732335090637,
"num_tokens": 64512.0,
"step": 126
},
{
"entropy": 2.5372767448425293,
"epoch": 1.984375,
"grad_norm": 5.26969575881958,
"learning_rate": 0.00025735962379257274,
"loss": 2.0592477321624756,
"mean_token_accuracy": 0.6659750938415527,
"num_tokens": 65024.0,
"step": 127
},
{
"entropy": 2.397751808166504,
"epoch": 2.0,
"grad_norm": 15.707524299621582,
"learning_rate": 0.00025671482936178244,
"loss": 2.44035267829895,
"mean_token_accuracy": 0.625984251499176,
"num_tokens": 65536.0,
"step": 128
},
{
"entropy": 2.3219223022460938,
"epoch": 2.015625,
"grad_norm": 14.231306076049805,
"learning_rate": 0.00025606601717798207,
"loss": 2.2578790187835693,
"mean_token_accuracy": 0.6288032531738281,
"num_tokens": 66048.0,
"step": 129
},
{
"entropy": 2.1380038261413574,
"epoch": 2.03125,
"grad_norm": 11.717175483703613,
"learning_rate": 0.00025541321166858377,
"loss": 2.0398218631744385,
"mean_token_accuracy": 0.6378269791603088,
"num_tokens": 66560.0,
"step": 130
},
{
"entropy": 2.236997127532959,
"epoch": 2.046875,
"grad_norm": 4.696479797363281,
"learning_rate": 0.00025475643741134594,
"loss": 1.7749477624893188,
"mean_token_accuracy": 0.704016923904419,
"num_tokens": 67072.0,
"step": 131
},
{
"entropy": 2.2647347450256348,
"epoch": 2.0625,
"grad_norm": 6.612071990966797,
"learning_rate": 0.0002540957191334481,
"loss": 2.0588979721069336,
"mean_token_accuracy": 0.650306761264801,
"num_tokens": 67584.0,
"step": 132
},
{
"entropy": 2.4617621898651123,
"epoch": 2.078125,
"grad_norm": 3.019199848175049,
"learning_rate": 0.00025343108171056,
"loss": 1.6648919582366943,
"mean_token_accuracy": 0.7114967703819275,
"num_tokens": 68096.0,
"step": 133
},
{
"entropy": 2.4026403427124023,
"epoch": 2.09375,
"grad_norm": 2.1030118465423584,
"learning_rate": 0.00025276255016590504,
"loss": 2.198902130126953,
"mean_token_accuracy": 0.6279527544975281,
"num_tokens": 68608.0,
"step": 134
},
{
"entropy": 2.264892101287842,
"epoch": 2.109375,
"grad_norm": 7.839612007141113,
"learning_rate": 0.0002520901496693179,
"loss": 2.0725574493408203,
"mean_token_accuracy": 0.6606060862541199,
"num_tokens": 69120.0,
"step": 135
},
{
"entropy": 2.173431873321533,
"epoch": 2.125,
"grad_norm": 2.4342360496520996,
"learning_rate": 0.00025141390553629734,
"loss": 2.201805353164673,
"mean_token_accuracy": 0.624015748500824,
"num_tokens": 69632.0,
"step": 136
},
{
"entropy": 2.3075344562530518,
"epoch": 2.140625,
"grad_norm": 3.552382469177246,
"learning_rate": 0.00025073384322705274,
"loss": 2.0459301471710205,
"mean_token_accuracy": 0.6570841670036316,
"num_tokens": 70144.0,
"step": 137
},
{
"entropy": 2.276052951812744,
"epoch": 2.15625,
"grad_norm": 2.7191050052642822,
"learning_rate": 0.0002500499883455456,
"loss": 1.8903124332427979,
"mean_token_accuracy": 0.6797520518302917,
"num_tokens": 70656.0,
"step": 138
},
{
"entropy": 2.4734106063842773,
"epoch": 2.171875,
"grad_norm": 3.9799861907958984,
"learning_rate": 0.00024936236663852573,
"loss": 2.1707847118377686,
"mean_token_accuracy": 0.6437007784843445,
"num_tokens": 71168.0,
"step": 139
},
{
"entropy": 2.1881818771362305,
"epoch": 2.1875,
"grad_norm": 3.292069435119629,
"learning_rate": 0.0002486710039945618,
"loss": 1.7409114837646484,
"mean_token_accuracy": 0.6932772994041443,
"num_tokens": 71680.0,
"step": 140
},
{
"entropy": 2.099501371383667,
"epoch": 2.203125,
"grad_norm": 8.15732479095459,
"learning_rate": 0.00024797592644306646,
"loss": 2.028435707092285,
"mean_token_accuracy": 0.6646586060523987,
"num_tokens": 72192.0,
"step": 141
},
{
"entropy": 1.8974528312683105,
"epoch": 2.21875,
"grad_norm": 4.572197914123535,
"learning_rate": 0.00024727716015331683,
"loss": 2.1486008167266846,
"mean_token_accuracy": 0.6338582634925842,
"num_tokens": 72704.0,
"step": 142
},
{
"entropy": 2.18802547454834,
"epoch": 2.234375,
"grad_norm": 11.048628807067871,
"learning_rate": 0.0002465747314334687,
"loss": 2.185668468475342,
"mean_token_accuracy": 0.6448979377746582,
"num_tokens": 73216.0,
"step": 143
},
{
"entropy": 2.1774299144744873,
"epoch": 2.25,
"grad_norm": 19.580671310424805,
"learning_rate": 0.00024586866672956636,
"loss": 2.223238229751587,
"mean_token_accuracy": 0.6318897604942322,
"num_tokens": 73728.0,
"step": 144
},
{
"entropy": 2.1214840412139893,
"epoch": 2.265625,
"grad_norm": 3.102067232131958,
"learning_rate": 0.0002451589926245468,
"loss": 1.4063600301742554,
"mean_token_accuracy": 0.7455357313156128,
"num_tokens": 74240.0,
"step": 145
},
{
"entropy": 2.288113832473755,
"epoch": 2.28125,
"grad_norm": 2.9369845390319824,
"learning_rate": 0.00024444573583723905,
"loss": 1.8721083402633667,
"mean_token_accuracy": 0.6897274851799011,
"num_tokens": 74752.0,
"step": 146
},
{
"entropy": 2.1388370990753174,
"epoch": 2.296875,
"grad_norm": 4.0269880294799805,
"learning_rate": 0.00024372892322135792,
"loss": 2.0353121757507324,
"mean_token_accuracy": 0.658777117729187,
"num_tokens": 75264.0,
"step": 147
},
{
"entropy": 2.038548231124878,
"epoch": 2.3125,
"grad_norm": 5.226291179656982,
"learning_rate": 0.00024300858176449337,
"loss": 1.9185303449630737,
"mean_token_accuracy": 0.6618556976318359,
"num_tokens": 75776.0,
"step": 148
},
{
"entropy": 1.952240228652954,
"epoch": 2.328125,
"grad_norm": 4.657230377197266,
"learning_rate": 0.000242284738587094,
"loss": 1.8080577850341797,
"mean_token_accuracy": 0.6946721076965332,
"num_tokens": 76288.0,
"step": 149
},
{
"entropy": 2.0790510177612305,
"epoch": 2.34375,
"grad_norm": 3.602454900741577,
"learning_rate": 0.0002415574209414464,
"loss": 1.8954757452011108,
"mean_token_accuracy": 0.6814516186714172,
"num_tokens": 76800.0,
"step": 150
},
{
"entropy": 1.993764877319336,
"epoch": 2.359375,
"grad_norm": 3.6864471435546875,
"learning_rate": 0.00024082665621064884,
"loss": 1.8048733472824097,
"mean_token_accuracy": 0.676706850528717,
"num_tokens": 77312.0,
"step": 151
},
{
"entropy": 2.1891403198242188,
"epoch": 2.375,
"grad_norm": 11.02034854888916,
"learning_rate": 0.00024009247190758033,
"loss": 2.1008198261260986,
"mean_token_accuracy": 0.6417322754859924,
"num_tokens": 77824.0,
"step": 152
},
{
"entropy": 2.02280855178833,
"epoch": 2.390625,
"grad_norm": 10.282227516174316,
"learning_rate": 0.000239354895673865,
"loss": 1.854500412940979,
"mean_token_accuracy": 0.6666666865348816,
"num_tokens": 78336.0,
"step": 153
},
{
"entropy": 1.9524571895599365,
"epoch": 2.40625,
"grad_norm": 4.118484973907471,
"learning_rate": 0.00023861395527883115,
"loss": 1.7170909643173218,
"mean_token_accuracy": 0.695652186870575,
"num_tokens": 78848.0,
"step": 154
},
{
"entropy": 1.9783227443695068,
"epoch": 2.421875,
"grad_norm": 2.67722487449646,
"learning_rate": 0.00023786967861846582,
"loss": 1.60783851146698,
"mean_token_accuracy": 0.7184873819351196,
"num_tokens": 79360.0,
"step": 155
},
{
"entropy": 2.2170073986053467,
"epoch": 2.4375,
"grad_norm": 8.424147605895996,
"learning_rate": 0.00023712209371436465,
"loss": 1.2314633131027222,
"mean_token_accuracy": 0.7594654560089111,
"num_tokens": 79872.0,
"step": 156
},
{
"entropy": 2.0562796592712402,
"epoch": 2.453125,
"grad_norm": 9.240602493286133,
"learning_rate": 0.00023637122871267679,
"loss": 1.9202128648757935,
"mean_token_accuracy": 0.6593625545501709,
"num_tokens": 80384.0,
"step": 157
},
{
"entropy": 2.150996685028076,
"epoch": 2.46875,
"grad_norm": 4.457634925842285,
"learning_rate": 0.0002356171118830451,
"loss": 2.1217310428619385,
"mean_token_accuracy": 0.6535432934761047,
"num_tokens": 80896.0,
"step": 158
},
{
"entropy": 2.337796211242676,
"epoch": 2.484375,
"grad_norm": 9.230558395385742,
"learning_rate": 0.00023485977161754194,
"loss": 1.9099302291870117,
"mean_token_accuracy": 0.6680412292480469,
"num_tokens": 81408.0,
"step": 159
},
{
"entropy": 2.247264862060547,
"epoch": 2.5,
"grad_norm": 7.798185348510742,
"learning_rate": 0.0002340992364296004,
"loss": 1.8917332887649536,
"mean_token_accuracy": 0.6985743641853333,
"num_tokens": 81920.0,
"step": 160
},
{
"entropy": 2.262213706970215,
"epoch": 2.515625,
"grad_norm": 2.7314772605895996,
"learning_rate": 0.0002333355349529403,
"loss": 1.941816806793213,
"mean_token_accuracy": 0.6734279990196228,
"num_tokens": 82432.0,
"step": 161
},
{
"entropy": 2.2101516723632812,
"epoch": 2.53125,
"grad_norm": 6.667520999908447,
"learning_rate": 0.0002325686959404907,
"loss": 1.9418377876281738,
"mean_token_accuracy": 0.6827309131622314,
"num_tokens": 82944.0,
"step": 162
},
{
"entropy": 2.1902859210968018,
"epoch": 2.546875,
"grad_norm": 3.1942148208618164,
"learning_rate": 0.00023179874826330694,
"loss": 2.1716907024383545,
"mean_token_accuracy": 0.663385808467865,
"num_tokens": 83456.0,
"step": 163
},
{
"entropy": 2.114820957183838,
"epoch": 2.5625,
"grad_norm": 9.429683685302734,
"learning_rate": 0.00023102572090948393,
"loss": 2.1209301948547363,
"mean_token_accuracy": 0.663385808467865,
"num_tokens": 83968.0,
"step": 164
},
{
"entropy": 2.3089890480041504,
"epoch": 2.578125,
"grad_norm": 3.248178005218506,
"learning_rate": 0.00023024964298306458,
"loss": 2.042099714279175,
"mean_token_accuracy": 0.6760563254356384,
"num_tokens": 84480.0,
"step": 165
},
{
"entropy": 2.2706456184387207,
"epoch": 2.59375,
"grad_norm": 7.227741718292236,
"learning_rate": 0.00022947054370294422,
"loss": 1.613346815109253,
"mean_token_accuracy": 0.7441860437393188,
"num_tokens": 84992.0,
"step": 166
},
{
"entropy": 2.1461312770843506,
"epoch": 2.609375,
"grad_norm": 5.42982816696167,
"learning_rate": 0.00022868845240177032,
"loss": 2.038721799850464,
"mean_token_accuracy": 0.6811023354530334,
"num_tokens": 85504.0,
"step": 167
},
{
"entropy": 2.085728645324707,
"epoch": 2.625,
"grad_norm": 4.73753023147583,
"learning_rate": 0.0002279033985248384,
"loss": 1.7772815227508545,
"mean_token_accuracy": 0.7190082669258118,
"num_tokens": 86016.0,
"step": 168
},
{
"entropy": 2.0724904537200928,
"epoch": 2.640625,
"grad_norm": 3.573122262954712,
"learning_rate": 0.00022711541162898321,
"loss": 1.9584404230117798,
"mean_token_accuracy": 0.6918489336967468,
"num_tokens": 86528.0,
"step": 169
},
{
"entropy": 2.082350492477417,
"epoch": 2.65625,
"grad_norm": 5.604883670806885,
"learning_rate": 0.00022632452138146602,
"loss": 2.0279061794281006,
"mean_token_accuracy": 0.6867470145225525,
"num_tokens": 87040.0,
"step": 170
},
{
"entropy": 2.2631471157073975,
"epoch": 2.671875,
"grad_norm": 8.89859390258789,
"learning_rate": 0.00022553075755885762,
"loss": 2.2429392337799072,
"mean_token_accuracy": 0.6515747904777527,
"num_tokens": 87552.0,
"step": 171
},
{
"entropy": 2.1096630096435547,
"epoch": 2.6875,
"grad_norm": 15.497108459472656,
"learning_rate": 0.00022473415004591727,
"loss": 1.7870018482208252,
"mean_token_accuracy": 0.7008196711540222,
"num_tokens": 88064.0,
"step": 172
},
{
"entropy": 2.14209246635437,
"epoch": 2.703125,
"grad_norm": 18.13780403137207,
"learning_rate": 0.0002239347288344676,
"loss": 2.0227110385894775,
"mean_token_accuracy": 0.6794354915618896,
"num_tokens": 88576.0,
"step": 173
},
{
"entropy": 2.0351309776306152,
"epoch": 2.71875,
"grad_norm": 5.5439605712890625,
"learning_rate": 0.00022313252402226538,
"loss": 2.0029079914093018,
"mean_token_accuracy": 0.6673228144645691,
"num_tokens": 89088.0,
"step": 174
},
{
"entropy": 2.1739964485168457,
"epoch": 2.734375,
"grad_norm": 19.21829605102539,
"learning_rate": 0.00022232756581186841,
"loss": 2.211519241333008,
"mean_token_accuracy": 0.6594488024711609,
"num_tokens": 89600.0,
"step": 175
},
{
"entropy": 2.021829843521118,
"epoch": 2.75,
"grad_norm": 17.91119956970215,
"learning_rate": 0.00022151988450949832,
"loss": 1.7456486225128174,
"mean_token_accuracy": 0.6915322542190552,
"num_tokens": 90112.0,
"step": 176
},
{
"entropy": 2.2625017166137695,
"epoch": 2.765625,
"grad_norm": 19.28619956970215,
"learning_rate": 0.00022070951052389966,
"loss": 1.5992084741592407,
"mean_token_accuracy": 0.7397849559783936,
"num_tokens": 90624.0,
"step": 177
},
{
"entropy": 2.0457301139831543,
"epoch": 2.78125,
"grad_norm": 6.377933979034424,
"learning_rate": 0.0002198964743651949,
"loss": 2.0016820430755615,
"mean_token_accuracy": 0.6898608207702637,
"num_tokens": 91136.0,
"step": 178
},
{
"entropy": 2.1969172954559326,
"epoch": 2.796875,
"grad_norm": 4.351161956787109,
"learning_rate": 0.00021908080664373596,
"loss": 2.069615602493286,
"mean_token_accuracy": 0.699999988079071,
"num_tokens": 91648.0,
"step": 179
},
{
"entropy": 2.0312910079956055,
"epoch": 2.8125,
"grad_norm": 2.8072102069854736,
"learning_rate": 0.00021826253806895156,
"loss": 1.5687063932418823,
"mean_token_accuracy": 0.7635983228683472,
"num_tokens": 92160.0,
"step": 180
},
{
"entropy": 2.0322093963623047,
"epoch": 2.828125,
"grad_norm": 12.090331077575684,
"learning_rate": 0.00021744169944819098,
"loss": 1.9994778633117676,
"mean_token_accuracy": 0.6771653294563293,
"num_tokens": 92672.0,
"step": 181
},
{
"entropy": 2.1715853214263916,
"epoch": 2.84375,
"grad_norm": 15.88219165802002,
"learning_rate": 0.00021661832168556438,
"loss": 1.8473044633865356,
"mean_token_accuracy": 0.6991701126098633,
"num_tokens": 93184.0,
"step": 182
},
{
"entropy": 2.033459424972534,
"epoch": 2.859375,
"grad_norm": 4.496399402618408,
"learning_rate": 0.00021579243578077913,
"loss": 1.9900826215744019,
"mean_token_accuracy": 0.6948819160461426,
"num_tokens": 93696.0,
"step": 183
},
{
"entropy": 2.100759983062744,
"epoch": 2.875,
"grad_norm": 5.62208366394043,
"learning_rate": 0.00021496407282797276,
"loss": 1.7417033910751343,
"mean_token_accuracy": 0.7119675278663635,
"num_tokens": 94208.0,
"step": 184
},
{
"entropy": 2.1455624103546143,
"epoch": 2.890625,
"grad_norm": 4.165937423706055,
"learning_rate": 0.0002141332640145423,
"loss": 1.9299179315567017,
"mean_token_accuracy": 0.7094188332557678,
"num_tokens": 94720.0,
"step": 185
},
{
"entropy": 2.002096176147461,
"epoch": 2.90625,
"grad_norm": 1.7983911037445068,
"learning_rate": 0.00021330004061996996,
"loss": 1.6952036619186401,
"mean_token_accuracy": 0.7520492076873779,
"num_tokens": 95232.0,
"step": 186
},
{
"entropy": 2.0048775672912598,
"epoch": 2.921875,
"grad_norm": 1.7580811977386475,
"learning_rate": 0.00021246443401464558,
"loss": 1.7680833339691162,
"mean_token_accuracy": 0.7364184856414795,
"num_tokens": 95744.0,
"step": 187
},
{
"entropy": 2.169144868850708,
"epoch": 2.9375,
"grad_norm": 5.268362522125244,
"learning_rate": 0.00021162647565868556,
"loss": 1.8059192895889282,
"mean_token_accuracy": 0.7242798209190369,
"num_tokens": 96256.0,
"step": 188
},
{
"entropy": 2.1363723278045654,
"epoch": 2.953125,
"grad_norm": 3.498081684112549,
"learning_rate": 0.00021078619710074845,
"loss": 2.1745285987854004,
"mean_token_accuracy": 0.6751968264579773,
"num_tokens": 96768.0,
"step": 189
},
{
"entropy": 2.148711919784546,
"epoch": 2.96875,
"grad_norm": 6.432483196258545,
"learning_rate": 0.000209943629976847,
"loss": 2.1445229053497314,
"mean_token_accuracy": 0.6830708384513855,
"num_tokens": 97280.0,
"step": 190
},
{
"entropy": 2.019306182861328,
"epoch": 2.984375,
"grad_norm": 10.80850601196289,
"learning_rate": 0.0002090988060091572,
"loss": 1.7723709344863892,
"mean_token_accuracy": 0.7313131093978882,
"num_tokens": 97792.0,
"step": 191
},
{
"entropy": 1.978491187095642,
"epoch": 3.0,
"grad_norm": 4.951074123382568,
"learning_rate": 0.00020825175700482393,
"loss": 2.0183067321777344,
"mean_token_accuracy": 0.7105788588523865,
"num_tokens": 98304.0,
"step": 192
},
{
"entropy": 1.889210820198059,
"epoch": 3.015625,
"grad_norm": 7.480757713317871,
"learning_rate": 0.00020740251485476345,
"loss": 1.4690678119659424,
"mean_token_accuracy": 0.7515657544136047,
"num_tokens": 98816.0,
"step": 193
},
{
"entropy": 1.83897066116333,
"epoch": 3.03125,
"grad_norm": 3.0285542011260986,
"learning_rate": 0.00020655111153246273,
"loss": 1.531701922416687,
"mean_token_accuracy": 0.7408906817436218,
"num_tokens": 99328.0,
"step": 194
},
{
"entropy": 1.9947177171707153,
"epoch": 3.046875,
"grad_norm": 11.762626647949219,
"learning_rate": 0.00020569757909277562,
"loss": 1.6982847452163696,
"mean_token_accuracy": 0.7285714149475098,
"num_tokens": 99840.0,
"step": 195
},
{
"entropy": 2.1106882095336914,
"epoch": 3.0625,
"grad_norm": 13.554593086242676,
"learning_rate": 0.00020484194967071608,
"loss": 1.6521421670913696,
"mean_token_accuracy": 0.7452631592750549,
"num_tokens": 100352.0,
"step": 196
},
{
"entropy": 2.124321937561035,
"epoch": 3.078125,
"grad_norm": 7.983447074890137,
"learning_rate": 0.00020398425548024822,
"loss": 2.0003597736358643,
"mean_token_accuracy": 0.6988189220428467,
"num_tokens": 100864.0,
"step": 197
},
{
"entropy": 1.918179988861084,
"epoch": 3.09375,
"grad_norm": 9.18137264251709,
"learning_rate": 0.00020312452881307355,
"loss": 1.682350993156433,
"mean_token_accuracy": 0.7344064116477966,
"num_tokens": 101376.0,
"step": 198
},
{
"entropy": 1.787161946296692,
"epoch": 3.109375,
"grad_norm": 21.148954391479492,
"learning_rate": 0.00020226280203741514,
"loss": 1.5121514797210693,
"mean_token_accuracy": 0.7413442134857178,
"num_tokens": 101888.0,
"step": 199
},
{
"entropy": 1.9293241500854492,
"epoch": 3.125,
"grad_norm": 20.98975944519043,
"learning_rate": 0.00020139910759679915,
"loss": 1.4278969764709473,
"mean_token_accuracy": 0.7689075469970703,
"num_tokens": 102400.0,
"step": 200
},
{
"entropy": 1.7874903678894043,
"epoch": 3.140625,
"grad_norm": 16.30684471130371,
"learning_rate": 0.00020053347800883298,
"loss": 1.8084406852722168,
"mean_token_accuracy": 0.7157257795333862,
"num_tokens": 102912.0,
"step": 201
},
{
"entropy": 1.6876550912857056,
"epoch": 3.15625,
"grad_norm": 13.622233390808105,
"learning_rate": 0.00019966594586398145,
"loss": 1.6798195838928223,
"mean_token_accuracy": 0.7037773132324219,
"num_tokens": 103424.0,
"step": 202
},
{
"entropy": 1.776758074760437,
"epoch": 3.171875,
"grad_norm": 6.037537574768066,
"learning_rate": 0.00019879654382433943,
"loss": 1.6979624032974243,
"mean_token_accuracy": 0.7298387289047241,
"num_tokens": 103936.0,
"step": 203
},
{
"entropy": 1.850691318511963,
"epoch": 3.1875,
"grad_norm": 2.61728835105896,
"learning_rate": 0.00019792530462240234,
"loss": 1.6017121076583862,
"mean_token_accuracy": 0.7342799305915833,
"num_tokens": 104448.0,
"step": 204
},
{
"entropy": 1.8923485279083252,
"epoch": 3.203125,
"grad_norm": 7.442923069000244,
"learning_rate": 0.00019705226105983374,
"loss": 1.687976360321045,
"mean_token_accuracy": 0.7244094610214233,
"num_tokens": 104960.0,
"step": 205
},
{
"entropy": 2.107788324356079,
"epoch": 3.21875,
"grad_norm": 19.86151695251465,
"learning_rate": 0.00019617744600623023,
"loss": 2.016284942626953,
"mean_token_accuracy": 0.6968504190444946,
"num_tokens": 105472.0,
"step": 206
},
{
"entropy": 2.0825321674346924,
"epoch": 3.234375,
"grad_norm": 20.006290435791016,
"learning_rate": 0.00019530089239788422,
"loss": 1.8263378143310547,
"mean_token_accuracy": 0.7134020328521729,
"num_tokens": 105984.0,
"step": 207
},
{
"entropy": 2.016390800476074,
"epoch": 3.25,
"grad_norm": 15.552451133728027,
"learning_rate": 0.00019442263323654358,
"loss": 1.716286063194275,
"mean_token_accuracy": 0.7065868377685547,
"num_tokens": 106496.0,
"step": 208
},
{
"entropy": 1.9402761459350586,
"epoch": 3.265625,
"grad_norm": 10.624086380004883,
"learning_rate": 0.0001935427015881693,
"loss": 1.7164943218231201,
"mean_token_accuracy": 0.7269076108932495,
"num_tokens": 107008.0,
"step": 209
},
{
"entropy": 1.8228812217712402,
"epoch": 3.28125,
"grad_norm": 21.39549446105957,
"learning_rate": 0.00019266113058169076,
"loss": 1.7704980373382568,
"mean_token_accuracy": 0.7145669460296631,
"num_tokens": 107520.0,
"step": 210
},
{
"entropy": 1.7182958126068115,
"epoch": 3.296875,
"grad_norm": 23.144405364990234,
"learning_rate": 0.00019177795340775792,
"loss": 1.7252445220947266,
"mean_token_accuracy": 0.7263779640197754,
"num_tokens": 108032.0,
"step": 211
},
{
"entropy": 1.808842658996582,
"epoch": 3.3125,
"grad_norm": 22.433195114135742,
"learning_rate": 0.00019089320331749235,
"loss": 1.713385820388794,
"mean_token_accuracy": 0.7269076108932495,
"num_tokens": 108544.0,
"step": 212
},
{
"entropy": 1.8282392024993896,
"epoch": 3.328125,
"grad_norm": 18.469093322753906,
"learning_rate": 0.00019000691362123473,
"loss": 1.8379396200180054,
"mean_token_accuracy": 0.7134387493133545,
"num_tokens": 109056.0,
"step": 213
},
{
"entropy": 1.8023868799209595,
"epoch": 3.34375,
"grad_norm": 19.18539047241211,
"learning_rate": 0.0001891191176872913,
"loss": 1.8020644187927246,
"mean_token_accuracy": 0.7007874250411987,
"num_tokens": 109568.0,
"step": 214
},
{
"entropy": 1.677249789237976,
"epoch": 3.359375,
"grad_norm": 18.677223205566406,
"learning_rate": 0.00018822984894067719,
"loss": 1.4826351404190063,
"mean_token_accuracy": 0.7551020383834839,
"num_tokens": 110080.0,
"step": 215
},
{
"entropy": 1.930768609046936,
"epoch": 3.375,
"grad_norm": 11.903674125671387,
"learning_rate": 0.00018733914086185803,
"loss": 1.5919502973556519,
"mean_token_accuracy": 0.7355371713638306,
"num_tokens": 110592.0,
"step": 216
},
{
"entropy": 1.727242350578308,
"epoch": 3.390625,
"grad_norm": 3.6214687824249268,
"learning_rate": 0.0001864470269854896,
"loss": 1.5873475074768066,
"mean_token_accuracy": 0.7410358786582947,
"num_tokens": 111104.0,
"step": 217
},
{
"entropy": 1.7539688348770142,
"epoch": 3.40625,
"grad_norm": 9.966615676879883,
"learning_rate": 0.0001855535408991551,
"loss": 1.5512422323226929,
"mean_token_accuracy": 0.7586911916732788,
"num_tokens": 111616.0,
"step": 218
},
{
"entropy": 1.534348726272583,
"epoch": 3.421875,
"grad_norm": 15.220844268798828,
"learning_rate": 0.00018465871624210068,
"loss": 1.485011339187622,
"mean_token_accuracy": 0.7715430855751038,
"num_tokens": 112128.0,
"step": 219
},
{
"entropy": 1.7263754606246948,
"epoch": 3.4375,
"grad_norm": 9.868559837341309,
"learning_rate": 0.00018376258670396888,
"loss": 1.5979400873184204,
"mean_token_accuracy": 0.7459016442298889,
"num_tokens": 112640.0,
"step": 220
},
{
"entropy": 1.7610721588134766,
"epoch": 3.453125,
"grad_norm": 5.717709541320801,
"learning_rate": 0.00018286518602353045,
"loss": 1.5840563774108887,
"mean_token_accuracy": 0.751028835773468,
"num_tokens": 113152.0,
"step": 221
},
{
"entropy": 1.7383480072021484,
"epoch": 3.46875,
"grad_norm": 4.265955448150635,
"learning_rate": 0.00018196654798741368,
"loss": 1.6178569793701172,
"mean_token_accuracy": 0.751968502998352,
"num_tokens": 113664.0,
"step": 222
},
{
"entropy": 1.9051017761230469,
"epoch": 3.484375,
"grad_norm": 4.079834461212158,
"learning_rate": 0.00018106670642883277,
"loss": 1.125648856163025,
"mean_token_accuracy": 0.8163716793060303,
"num_tokens": 114176.0,
"step": 223
},
{
"entropy": 1.7379391193389893,
"epoch": 3.5,
"grad_norm": 3.7545363903045654,
"learning_rate": 0.00018016569522631378,
"loss": 1.2588374614715576,
"mean_token_accuracy": 0.7928870320320129,
"num_tokens": 114688.0,
"step": 224
},
{
"entropy": 1.7147713899612427,
"epoch": 3.515625,
"grad_norm": 3.103086471557617,
"learning_rate": 0.00017926354830241924,
"loss": 1.4433473348617554,
"mean_token_accuracy": 0.7766393423080444,
"num_tokens": 115200.0,
"step": 225
},
{
"entropy": 1.804306149482727,
"epoch": 3.53125,
"grad_norm": 2.8875999450683594,
"learning_rate": 0.00017836029962247092,
"loss": 1.563567042350769,
"mean_token_accuracy": 0.7510204315185547,
"num_tokens": 115712.0,
"step": 226
},
{
"entropy": 1.7466846704483032,
"epoch": 3.546875,
"grad_norm": 4.622049331665039,
"learning_rate": 0.00017745598319327116,
"loss": 1.6097654104232788,
"mean_token_accuracy": 0.7484909296035767,
"num_tokens": 116224.0,
"step": 227
},
{
"entropy": 1.8188687562942505,
"epoch": 3.5625,
"grad_norm": 5.317627906799316,
"learning_rate": 0.00017655063306182232,
"loss": 1.7428910732269287,
"mean_token_accuracy": 0.7283464670181274,
"num_tokens": 116736.0,
"step": 228
},
{
"entropy": 1.9028565883636475,
"epoch": 3.578125,
"grad_norm": 2.2038638591766357,
"learning_rate": 0.00017564428331404519,
"loss": 1.5572713613510132,
"mean_token_accuracy": 0.7560975551605225,
"num_tokens": 117248.0,
"step": 229
},
{
"entropy": 1.766879677772522,
"epoch": 3.59375,
"grad_norm": 1.8373303413391113,
"learning_rate": 0.0001747369680734955,
"loss": 1.3965166807174683,
"mean_token_accuracy": 0.7731958627700806,
"num_tokens": 117760.0,
"step": 230
},
{
"entropy": 2.0388832092285156,
"epoch": 3.609375,
"grad_norm": 2.9878053665161133,
"learning_rate": 0.0001738287215000792,
"loss": 1.5748356580734253,
"mean_token_accuracy": 0.7380457520484924,
"num_tokens": 118272.0,
"step": 231
},
{
"entropy": 1.748144268989563,
"epoch": 3.625,
"grad_norm": 2.4010939598083496,
"learning_rate": 0.0001729195777887665,
"loss": 1.605446219444275,
"mean_token_accuracy": 0.753564178943634,
"num_tokens": 118784.0,
"step": 232
},
{
"entropy": 1.8476933240890503,
"epoch": 3.640625,
"grad_norm": 2.0416951179504395,
"learning_rate": 0.00017200957116830423,
"loss": 1.83555006980896,
"mean_token_accuracy": 0.7313131093978882,
"num_tokens": 119296.0,
"step": 233
},
{
"entropy": 1.730068564414978,
"epoch": 3.65625,
"grad_norm": 2.2305748462677,
"learning_rate": 0.00017109873589992737,
"loss": 1.4430031776428223,
"mean_token_accuracy": 0.7628865838050842,
"num_tokens": 119808.0,
"step": 234
},
{
"entropy": 1.5812182426452637,
"epoch": 3.671875,
"grad_norm": 2.0247128009796143,
"learning_rate": 0.00017018710627606892,
"loss": 1.2767280340194702,
"mean_token_accuracy": 0.78925621509552,
"num_tokens": 120320.0,
"step": 235
},
{
"entropy": 1.6156758069992065,
"epoch": 3.6875,
"grad_norm": 2.141160488128662,
"learning_rate": 0.00016927471661906898,
"loss": 1.5877039432525635,
"mean_token_accuracy": 0.7560483813285828,
"num_tokens": 120832.0,
"step": 236
},
{
"entropy": 1.81034517288208,
"epoch": 3.703125,
"grad_norm": 6.961944580078125,
"learning_rate": 0.00016836160127988242,
"loss": 1.7641907930374146,
"mean_token_accuracy": 0.7263779640197754,
"num_tokens": 121344.0,
"step": 237
},
{
"entropy": 1.7935938835144043,
"epoch": 3.71875,
"grad_norm": 8.582378387451172,
"learning_rate": 0.00016744779463678572,
"loss": 1.6680549383163452,
"mean_token_accuracy": 0.7401574850082397,
"num_tokens": 121856.0,
"step": 238
},
{
"entropy": 1.9910942316055298,
"epoch": 3.734375,
"grad_norm": 2.0824148654937744,
"learning_rate": 0.00016653333109408248,
"loss": 1.7776023149490356,
"mean_token_accuracy": 0.7283464670181274,
"num_tokens": 122368.0,
"step": 239
},
{
"entropy": 1.8618779182434082,
"epoch": 3.75,
"grad_norm": 2.343003988265991,
"learning_rate": 0.00016561824508080819,
"loss": 1.5893044471740723,
"mean_token_accuracy": 0.7647058963775635,
"num_tokens": 122880.0,
"step": 240
},
{
"entropy": 1.6664297580718994,
"epoch": 3.765625,
"grad_norm": 2.6146957874298096,
"learning_rate": 0.0001647025710494341,
"loss": 1.7686117887496948,
"mean_token_accuracy": 0.7185039520263672,
"num_tokens": 123392.0,
"step": 241
},
{
"entropy": 1.7106194496154785,
"epoch": 3.78125,
"grad_norm": 1.93027925491333,
"learning_rate": 0.00016378634347456988,
"loss": 1.0115760564804077,
"mean_token_accuracy": 0.8384955525398254,
"num_tokens": 123904.0,
"step": 242
},
{
"entropy": 1.7875338792800903,
"epoch": 3.796875,
"grad_norm": 1.9203755855560303,
"learning_rate": 0.000162869596851666,
"loss": 1.6601542234420776,
"mean_token_accuracy": 0.7581967115402222,
"num_tokens": 124416.0,
"step": 243
},
{
"entropy": 1.8426944017410278,
"epoch": 3.8125,
"grad_norm": 2.230609178543091,
"learning_rate": 0.0001619523656957145,
"loss": 1.7351903915405273,
"mean_token_accuracy": 0.7386138439178467,
"num_tokens": 124928.0,
"step": 244
},
{
"entropy": 1.7839840650558472,
"epoch": 3.828125,
"grad_norm": 2.761748790740967,
"learning_rate": 0.00016103468453995012,
"loss": 1.7518467903137207,
"mean_token_accuracy": 0.7263779640197754,
"num_tokens": 125440.0,
"step": 245
},
{
"entropy": 1.8428518772125244,
"epoch": 3.84375,
"grad_norm": 2.753977060317993,
"learning_rate": 0.0001601165879345496,
"loss": 1.6037272214889526,
"mean_token_accuracy": 0.7444218993186951,
"num_tokens": 125952.0,
"step": 246
},
{
"entropy": 1.7670025825500488,
"epoch": 3.859375,
"grad_norm": 3.583522319793701,
"learning_rate": 0.00015919811044533128,
"loss": 1.8228002786636353,
"mean_token_accuracy": 0.7185039520263672,
"num_tokens": 126464.0,
"step": 247
},
{
"entropy": 1.7220356464385986,
"epoch": 3.875,
"grad_norm": 2.739765167236328,
"learning_rate": 0.0001582792866524535,
"loss": 1.6976571083068848,
"mean_token_accuracy": 0.7404426336288452,
"num_tokens": 126976.0,
"step": 248
},
{
"entropy": 1.8917089700698853,
"epoch": 3.890625,
"grad_norm": 2.6218745708465576,
"learning_rate": 0.0001573601511491127,
"loss": 1.3674724102020264,
"mean_token_accuracy": 0.7713097929954529,
"num_tokens": 127488.0,
"step": 249
},
{
"entropy": 1.785091757774353,
"epoch": 3.90625,
"grad_norm": 2.275317907333374,
"learning_rate": 0.00015644073854024113,
"loss": 1.5308012962341309,
"mean_token_accuracy": 0.765999972820282,
"num_tokens": 128000.0,
"step": 250
},
{
"entropy": 2.054109573364258,
"epoch": 3.921875,
"grad_norm": 2.2449769973754883,
"learning_rate": 0.00015552108344120383,
"loss": 1.0721290111541748,
"mean_token_accuracy": 0.8066666722297668,
"num_tokens": 128512.0,
"step": 251
},
{
"entropy": 2.0395283699035645,
"epoch": 3.9375,
"grad_norm": 1.5280213356018066,
"learning_rate": 0.0001546012204764955,
"loss": 1.3407353162765503,
"mean_token_accuracy": 0.7982832789421082,
"num_tokens": 129024.0,
"step": 252
},
{
"entropy": 1.815192461013794,
"epoch": 3.953125,
"grad_norm": 2.583451271057129,
"learning_rate": 0.00015368118427843682,
"loss": 1.573038101196289,
"mean_token_accuracy": 0.7582644820213318,
"num_tokens": 129536.0,
"step": 253
},
{
"entropy": 1.9515974521636963,
"epoch": 3.96875,
"grad_norm": 3.4174747467041016,
"learning_rate": 0.0001527610094858707,
"loss": 2.025150775909424,
"mean_token_accuracy": 0.6889764070510864,
"num_tokens": 130048.0,
"step": 254
},
{
"entropy": 1.8663040399551392,
"epoch": 3.984375,
"grad_norm": 1.9246138334274292,
"learning_rate": 0.00015184073074285797,
"loss": 1.731005311012268,
"mean_token_accuracy": 0.7358871102333069,
"num_tokens": 130560.0,
"step": 255
},
{
"entropy": 1.9476964473724365,
"epoch": 4.0,
"grad_norm": 1.8395166397094727,
"learning_rate": 0.00015092038269737317,
"loss": 1.8282963037490845,
"mean_token_accuracy": 0.7285429239273071,
"num_tokens": 131072.0,
"step": 256
},
{
"entropy": 1.9428930282592773,
"epoch": 4.015625,
"grad_norm": 2.5020766258239746,
"learning_rate": 0.00015,
"loss": 1.7320835590362549,
"mean_token_accuracy": 0.7224409580230713,
"num_tokens": 131584.0,
"step": 257
},
{
"entropy": 1.7875992059707642,
"epoch": 4.03125,
"grad_norm": 2.313523769378662,
"learning_rate": 0.00014907961730262684,
"loss": 1.6047075986862183,
"mean_token_accuracy": 0.7401574850082397,
"num_tokens": 132096.0,
"step": 258
},
{
"entropy": 1.5366038084030151,
"epoch": 4.046875,
"grad_norm": 2.181321382522583,
"learning_rate": 0.000148159269257142,
"loss": 1.5011101961135864,
"mean_token_accuracy": 0.7644710540771484,
"num_tokens": 132608.0,
"step": 259
},
{
"entropy": 1.6452386379241943,
"epoch": 4.0625,
"grad_norm": 4.90228796005249,
"learning_rate": 0.00014723899051412927,
"loss": 1.4501490592956543,
"mean_token_accuracy": 0.7625754475593567,
"num_tokens": 133120.0,
"step": 260
},
{
"entropy": 1.683490514755249,
"epoch": 4.078125,
"grad_norm": 3.2073004245758057,
"learning_rate": 0.00014631881572156315,
"loss": 1.290379285812378,
"mean_token_accuracy": 0.7727272510528564,
"num_tokens": 133632.0,
"step": 261
},
{
"entropy": 1.7427382469177246,
"epoch": 4.09375,
"grad_norm": 2.1592955589294434,
"learning_rate": 0.0001453987795235045,
"loss": 1.4221155643463135,
"mean_token_accuracy": 0.782608687877655,
"num_tokens": 134144.0,
"step": 262
},
{
"entropy": 1.7729072570800781,
"epoch": 4.109375,
"grad_norm": 1.8065860271453857,
"learning_rate": 0.00014447891655879617,
"loss": 1.433501958847046,
"mean_token_accuracy": 0.7873684167861938,
"num_tokens": 134656.0,
"step": 263
},
{
"entropy": 1.8184741735458374,
"epoch": 4.125,
"grad_norm": 2.002441167831421,
"learning_rate": 0.00014355926145975887,
"loss": 1.6424577236175537,
"mean_token_accuracy": 0.7247524857521057,
"num_tokens": 135168.0,
"step": 264
},
{
"entropy": 1.6545014381408691,
"epoch": 4.140625,
"grad_norm": 1.860556960105896,
"learning_rate": 0.0001426398488508873,
"loss": 1.4745804071426392,
"mean_token_accuracy": 0.757515013217926,
"num_tokens": 135680.0,
"step": 265
},
{
"entropy": 1.6565005779266357,
"epoch": 4.15625,
"grad_norm": 1.6392408609390259,
"learning_rate": 0.0001417207133475465,
"loss": 1.3510708808898926,
"mean_token_accuracy": 0.7739307284355164,
"num_tokens": 136192.0,
"step": 266
},
{
"entropy": 1.5082423686981201,
"epoch": 4.171875,
"grad_norm": 8.039023399353027,
"learning_rate": 0.0001408018895546687,
"loss": 1.5158226490020752,
"mean_token_accuracy": 0.7657480239868164,
"num_tokens": 136704.0,
"step": 267
},
{
"entropy": 1.58040452003479,
"epoch": 4.1875,
"grad_norm": 2.5063955783843994,
"learning_rate": 0.00013988341206545038,
"loss": 1.3905433416366577,
"mean_token_accuracy": 0.7622950673103333,
"num_tokens": 137216.0,
"step": 268
},
{
"entropy": 1.4347314834594727,
"epoch": 4.203125,
"grad_norm": 2.2003631591796875,
"learning_rate": 0.00013896531546004988,
"loss": 1.3757840394973755,
"mean_token_accuracy": 0.7868525981903076,
"num_tokens": 137728.0,
"step": 269
},
{
"entropy": 1.5444324016571045,
"epoch": 4.21875,
"grad_norm": 2.6288857460021973,
"learning_rate": 0.00013804763430428548,
"loss": 1.5756033658981323,
"mean_token_accuracy": 0.7539370059967041,
"num_tokens": 138240.0,
"step": 270
},
{
"entropy": 1.7162096500396729,
"epoch": 4.234375,
"grad_norm": 1.8113332986831665,
"learning_rate": 0.00013713040314833404,
"loss": 1.460745096206665,
"mean_token_accuracy": 0.7789255976676941,
"num_tokens": 138752.0,
"step": 271
},
{
"entropy": 1.6978294849395752,
"epoch": 4.25,
"grad_norm": 2.186129570007324,
"learning_rate": 0.0001362136565254301,
"loss": 1.3529666662216187,
"mean_token_accuracy": 0.7899159789085388,
"num_tokens": 139264.0,
"step": 272
},
{
"entropy": 1.716745138168335,
"epoch": 4.265625,
"grad_norm": 1.89174222946167,
"learning_rate": 0.0001352974289505659,
"loss": 1.4731371402740479,
"mean_token_accuracy": 0.7571428418159485,
"num_tokens": 139776.0,
"step": 273
},
{
"entropy": 1.6406333446502686,
"epoch": 4.28125,
"grad_norm": 1.6710184812545776,
"learning_rate": 0.0001343817549191918,
"loss": 1.4243406057357788,
"mean_token_accuracy": 0.7645875215530396,
"num_tokens": 140288.0,
"step": 274
},
{
"entropy": 1.6350668668746948,
"epoch": 4.296875,
"grad_norm": 2.4510951042175293,
"learning_rate": 0.00013346666890591753,
"loss": 1.5063180923461914,
"mean_token_accuracy": 0.7625754475593567,
"num_tokens": 140800.0,
"step": 275
},
{
"entropy": 1.6373059749603271,
"epoch": 4.3125,
"grad_norm": 3.550650119781494,
"learning_rate": 0.00013255220536321428,
"loss": 1.7208104133605957,
"mean_token_accuracy": 0.7244094610214233,
"num_tokens": 141312.0,
"step": 276
},
{
"entropy": 1.7569215297698975,
"epoch": 4.328125,
"grad_norm": 2.7330479621887207,
"learning_rate": 0.00013163839872011758,
"loss": 1.7864205837249756,
"mean_token_accuracy": 0.7244094610214233,
"num_tokens": 141824.0,
"step": 277
},
{
"entropy": 1.7542294263839722,
"epoch": 4.34375,
"grad_norm": 3.1029021739959717,
"learning_rate": 0.00013072528338093102,
"loss": 1.4170482158660889,
"mean_token_accuracy": 0.7632653117179871,
"num_tokens": 142336.0,
"step": 278
},
{
"entropy": 1.7410451173782349,
"epoch": 4.359375,
"grad_norm": 2.644277811050415,
"learning_rate": 0.00012981289372393108,
"loss": 1.7685855627059937,
"mean_token_accuracy": 0.7145669460296631,
"num_tokens": 142848.0,
"step": 279
},
{
"entropy": 1.586928129196167,
"epoch": 4.375,
"grad_norm": 2.635669469833374,
"learning_rate": 0.00012890126410007263,
"loss": 1.4982590675354004,
"mean_token_accuracy": 0.7569169998168945,
"num_tokens": 143360.0,
"step": 280
},
{
"entropy": 1.649911642074585,
"epoch": 4.390625,
"grad_norm": 1.8594759702682495,
"learning_rate": 0.00012799042883169574,
"loss": 1.5442874431610107,
"mean_token_accuracy": 0.7460629940032959,
"num_tokens": 143872.0,
"step": 281
},
{
"entropy": 1.5503339767456055,
"epoch": 4.40625,
"grad_norm": 1.9106673002243042,
"learning_rate": 0.0001270804222112335,
"loss": 1.3214415311813354,
"mean_token_accuracy": 0.7707910537719727,
"num_tokens": 144384.0,
"step": 282
},
{
"entropy": 1.6034382581710815,
"epoch": 4.421875,
"grad_norm": 1.9895292520523071,
"learning_rate": 0.0001261712784999208,
"loss": 1.1624211072921753,
"mean_token_accuracy": 0.8087317943572998,
"num_tokens": 144896.0,
"step": 283
},
{
"entropy": 1.6620171070098877,
"epoch": 4.4375,
"grad_norm": 1.7769454717636108,
"learning_rate": 0.0001252630319265045,
"loss": 1.5078996419906616,
"mean_token_accuracy": 0.7757575511932373,
"num_tokens": 145408.0,
"step": 284
},
{
"entropy": 1.6838772296905518,
"epoch": 4.453125,
"grad_norm": 2.0985946655273438,
"learning_rate": 0.0001243557166859548,
"loss": 1.7390193939208984,
"mean_token_accuracy": 0.7263779640197754,
"num_tokens": 145920.0,
"step": 285
},
{
"entropy": 1.6525346040725708,
"epoch": 4.46875,
"grad_norm": 2.0259389877319336,
"learning_rate": 0.00012344936693817768,
"loss": 1.3714038133621216,
"mean_token_accuracy": 0.7653061151504517,
"num_tokens": 146432.0,
"step": 286
},
{
"entropy": 1.6501621007919312,
"epoch": 4.484375,
"grad_norm": 1.975203275680542,
"learning_rate": 0.00012254401680672884,
"loss": 1.5183178186416626,
"mean_token_accuracy": 0.75,
"num_tokens": 146944.0,
"step": 287
},
{
"entropy": 1.548411250114441,
"epoch": 4.5,
"grad_norm": 2.04488205909729,
"learning_rate": 0.00012163970037752906,
"loss": 1.3318936824798584,
"mean_token_accuracy": 0.7874494194984436,
"num_tokens": 147456.0,
"step": 288
},
{
"entropy": 1.626534104347229,
"epoch": 4.515625,
"grad_norm": 1.795937418937683,
"learning_rate": 0.00012073645169758076,
"loss": 1.386396050453186,
"mean_token_accuracy": 0.7818930149078369,
"num_tokens": 147968.0,
"step": 289
},
{
"entropy": 1.6640959978103638,
"epoch": 4.53125,
"grad_norm": 1.907228946685791,
"learning_rate": 0.00011983430477368622,
"loss": 1.4364591836929321,
"mean_token_accuracy": 0.7804877758026123,
"num_tokens": 148480.0,
"step": 290
},
{
"entropy": 1.5656487941741943,
"epoch": 4.546875,
"grad_norm": 1.5509178638458252,
"learning_rate": 0.00011893329357116722,
"loss": 1.4595284461975098,
"mean_token_accuracy": 0.7677165269851685,
"num_tokens": 148992.0,
"step": 291
},
{
"entropy": 1.5234248638153076,
"epoch": 4.5625,
"grad_norm": 2.0590877532958984,
"learning_rate": 0.0001180334520125863,
"loss": 1.4068399667739868,
"mean_token_accuracy": 0.772819459438324,
"num_tokens": 149504.0,
"step": 292
},
{
"entropy": 1.8282629251480103,
"epoch": 4.578125,
"grad_norm": 1.5572700500488281,
"learning_rate": 0.00011713481397646953,
"loss": 1.334957242012024,
"mean_token_accuracy": 0.7923728823661804,
"num_tokens": 150016.0,
"step": 293
},
{
"entropy": 1.6177082061767578,
"epoch": 4.59375,
"grad_norm": 1.9526519775390625,
"learning_rate": 0.00011623741329603108,
"loss": 1.2470734119415283,
"mean_token_accuracy": 0.7995867729187012,
"num_tokens": 150528.0,
"step": 294
},
{
"entropy": 1.8059481382369995,
"epoch": 4.609375,
"grad_norm": 1.610900640487671,
"learning_rate": 0.00011534128375789933,
"loss": 0.9543240070343018,
"mean_token_accuracy": 0.8344519138336182,
"num_tokens": 151040.0,
"step": 295
},
{
"entropy": 1.8307008743286133,
"epoch": 4.625,
"grad_norm": 2.0146024227142334,
"learning_rate": 0.0001144464591008449,
"loss": 1.7310861349105835,
"mean_token_accuracy": 0.7263779640197754,
"num_tokens": 151552.0,
"step": 296
},
{
"entropy": 1.754088282585144,
"epoch": 4.640625,
"grad_norm": 1.8568058013916016,
"learning_rate": 0.00011355297301451042,
"loss": 1.3614675998687744,
"mean_token_accuracy": 0.7957446575164795,
"num_tokens": 152064.0,
"step": 297
},
{
"entropy": 1.800140380859375,
"epoch": 4.65625,
"grad_norm": 1.858344554901123,
"learning_rate": 0.00011266085913814197,
"loss": 1.2782111167907715,
"mean_token_accuracy": 0.7932489514350891,
"num_tokens": 152576.0,
"step": 298
},
{
"entropy": 1.6610287427902222,
"epoch": 4.671875,
"grad_norm": 2.0229034423828125,
"learning_rate": 0.00011177015105932281,
"loss": 1.6102267503738403,
"mean_token_accuracy": 0.7490118741989136,
"num_tokens": 153088.0,
"step": 299
},
{
"entropy": 1.509531021118164,
"epoch": 4.6875,
"grad_norm": 1.8015098571777344,
"learning_rate": 0.00011088088231270866,
"loss": 1.0522032976150513,
"mean_token_accuracy": 0.8229166865348816,
"num_tokens": 153600.0,
"step": 300
},
{
"entropy": 1.5897778272628784,
"epoch": 4.703125,
"grad_norm": 2.122609853744507,
"learning_rate": 0.00010999308637876524,
"loss": 1.3583630323410034,
"mean_token_accuracy": 0.7762096524238586,
"num_tokens": 154112.0,
"step": 301
},
{
"entropy": 1.5236281156539917,
"epoch": 4.71875,
"grad_norm": 2.0490593910217285,
"learning_rate": 0.00010910679668250767,
"loss": 1.568931221961975,
"mean_token_accuracy": 0.7618110179901123,
"num_tokens": 154624.0,
"step": 302
},
{
"entropy": 1.6798558235168457,
"epoch": 4.734375,
"grad_norm": 1.9797496795654297,
"learning_rate": 0.00010822204659224204,
"loss": 1.6891627311706543,
"mean_token_accuracy": 0.7342519760131836,
"num_tokens": 155136.0,
"step": 303
},
{
"entropy": 1.6777228116989136,
"epoch": 4.75,
"grad_norm": 1.6742204427719116,
"learning_rate": 0.00010733886941830923,
"loss": 1.0915815830230713,
"mean_token_accuracy": 0.8258064389228821,
"num_tokens": 155648.0,
"step": 304
},
{
"entropy": 1.6578925848007202,
"epoch": 4.765625,
"grad_norm": 1.6061065196990967,
"learning_rate": 0.00010645729841183066,
"loss": 1.469952940940857,
"mean_token_accuracy": 0.7644710540771484,
"num_tokens": 156160.0,
"step": 305
},
{
"entropy": 1.7077447175979614,
"epoch": 4.78125,
"grad_norm": 2.1335628032684326,
"learning_rate": 0.0001055773667634564,
"loss": 1.4437124729156494,
"mean_token_accuracy": 0.7602459192276001,
"num_tokens": 156672.0,
"step": 306
},
{
"entropy": 1.5449203252792358,
"epoch": 4.796875,
"grad_norm": 2.039146900177002,
"learning_rate": 0.00010469910760211578,
"loss": 1.4151314496994019,
"mean_token_accuracy": 0.772819459438324,
"num_tokens": 157184.0,
"step": 307
},
{
"entropy": 1.5264627933502197,
"epoch": 4.8125,
"grad_norm": 1.8946317434310913,
"learning_rate": 0.00010382255399376975,
"loss": 1.4154433012008667,
"mean_token_accuracy": 0.7569721341133118,
"num_tokens": 157696.0,
"step": 308
},
{
"entropy": 1.564460277557373,
"epoch": 4.828125,
"grad_norm": 2.3768820762634277,
"learning_rate": 0.00010294773894016627,
"loss": 1.3997899293899536,
"mean_token_accuracy": 0.7730923891067505,
"num_tokens": 158208.0,
"step": 309
},
{
"entropy": 1.548466682434082,
"epoch": 4.84375,
"grad_norm": 2.616581439971924,
"learning_rate": 0.00010207469537759764,
"loss": 1.2758865356445312,
"mean_token_accuracy": 0.7893660664558411,
"num_tokens": 158720.0,
"step": 310
},
{
"entropy": 1.6836023330688477,
"epoch": 4.859375,
"grad_norm": 1.9081612825393677,
"learning_rate": 0.00010120345617566057,
"loss": 1.1824684143066406,
"mean_token_accuracy": 0.8062499761581421,
"num_tokens": 159232.0,
"step": 311
},
{
"entropy": 1.6853163242340088,
"epoch": 4.875,
"grad_norm": 1.8191639184951782,
"learning_rate": 0.00010033405413601855,
"loss": 1.3885215520858765,
"mean_token_accuracy": 0.7745901346206665,
"num_tokens": 159744.0,
"step": 312
},
{
"entropy": 1.5399816036224365,
"epoch": 4.890625,
"grad_norm": 1.456498146057129,
"learning_rate": 9.946652199116699e-05,
"loss": 1.1631718873977661,
"mean_token_accuracy": 0.8189300298690796,
"num_tokens": 160256.0,
"step": 313
},
{
"entropy": 1.7912606000900269,
"epoch": 4.90625,
"grad_norm": 1.6905421018600464,
"learning_rate": 9.860089240320085e-05,
"loss": 1.2534205913543701,
"mean_token_accuracy": 0.7970085740089417,
"num_tokens": 160768.0,
"step": 314
},
{
"entropy": 1.63590407371521,
"epoch": 4.921875,
"grad_norm": 1.4305380582809448,
"learning_rate": 9.773719796258482e-05,
"loss": 1.370961308479309,
"mean_token_accuracy": 0.7987551689147949,
"num_tokens": 161280.0,
"step": 315
},
{
"entropy": 1.5808641910552979,
"epoch": 4.9375,
"grad_norm": 1.9045383930206299,
"learning_rate": 9.687547118692643e-05,
"loss": 1.3712960481643677,
"mean_token_accuracy": 0.7773279547691345,
"num_tokens": 161792.0,
"step": 316
},
{
"entropy": 1.5849716663360596,
"epoch": 4.953125,
"grad_norm": 2.0240650177001953,
"learning_rate": 9.601574451975175e-05,
"loss": 1.5580910444259644,
"mean_token_accuracy": 0.751968502998352,
"num_tokens": 162304.0,
"step": 317
},
{
"entropy": 1.7162737846374512,
"epoch": 4.96875,
"grad_norm": 1.6734753847122192,
"learning_rate": 9.515805032928391e-05,
"loss": 1.4473096132278442,
"mean_token_accuracy": 0.7798354029655457,
"num_tokens": 162816.0,
"step": 318
},
{
"entropy": 1.6163382530212402,
"epoch": 4.984375,
"grad_norm": 1.8411775827407837,
"learning_rate": 9.430242090722436e-05,
"loss": 1.35471773147583,
"mean_token_accuracy": 0.7836734652519226,
"num_tokens": 163328.0,
"step": 319
},
{
"entropy": 1.4940069913864136,
"epoch": 5.0,
"grad_norm": 1.8255853652954102,
"learning_rate": 9.344888846753726e-05,
"loss": 1.4874554872512817,
"mean_token_accuracy": 0.7618110179901123,
"num_tokens": 163840.0,
"step": 320
},
{
"entropy": 1.549060344696045,
"epoch": 5.015625,
"grad_norm": 1.745566964149475,
"learning_rate": 9.259748514523653e-05,
"loss": 1.4019237756729126,
"mean_token_accuracy": 0.7618110179901123,
"num_tokens": 164352.0,
"step": 321
},
{
"entropy": 1.5626025199890137,
"epoch": 5.03125,
"grad_norm": 1.8400838375091553,
"learning_rate": 9.174824299517607e-05,
"loss": 1.07392156124115,
"mean_token_accuracy": 0.8119834661483765,
"num_tokens": 164864.0,
"step": 322
},
{
"entropy": 1.6569701433181763,
"epoch": 5.046875,
"grad_norm": 1.8855388164520264,
"learning_rate": 9.09011939908428e-05,
"loss": 1.4062210321426392,
"mean_token_accuracy": 0.751968502998352,
"num_tokens": 165376.0,
"step": 323
},
{
"entropy": 1.5729684829711914,
"epoch": 5.0625,
"grad_norm": 1.8148961067199707,
"learning_rate": 9.0056370023153e-05,
"loss": 1.2550370693206787,
"mean_token_accuracy": 0.7766599655151367,
"num_tokens": 165888.0,
"step": 324
},
{
"entropy": 1.698303461074829,
"epoch": 5.078125,
"grad_norm": 1.404887080192566,
"learning_rate": 8.921380289925153e-05,
"loss": 0.7562521696090698,
"mean_token_accuracy": 0.8741573095321655,
"num_tokens": 166400.0,
"step": 325
},
{
"entropy": 1.4567222595214844,
"epoch": 5.09375,
"grad_norm": 2.1354284286499023,
"learning_rate": 8.837352434131443e-05,
"loss": 1.0930452346801758,
"mean_token_accuracy": 0.7897959351539612,
"num_tokens": 166912.0,
"step": 326
},
{
"entropy": 1.5501275062561035,
"epoch": 5.109375,
"grad_norm": 1.6018931865692139,
"learning_rate": 8.753556598535444e-05,
"loss": 1.1254503726959229,
"mean_token_accuracy": 0.8037189841270447,
"num_tokens": 167424.0,
"step": 327
},
{
"entropy": 1.4922233819961548,
"epoch": 5.125,
"grad_norm": 2.1968183517456055,
"learning_rate": 8.669995938003005e-05,
"loss": 1.4305412769317627,
"mean_token_accuracy": 0.7421259880065918,
"num_tokens": 167936.0,
"step": 328
},
{
"entropy": 1.6226767301559448,
"epoch": 5.140625,
"grad_norm": 1.895836591720581,
"learning_rate": 8.586673598545771e-05,
"loss": 1.1839910745620728,
"mean_token_accuracy": 0.7855669856071472,
"num_tokens": 168448.0,
"step": 329
},
{
"entropy": 1.453590750694275,
"epoch": 5.15625,
"grad_norm": 1.9998018741607666,
"learning_rate": 8.503592717202721e-05,
"loss": 1.4171252250671387,
"mean_token_accuracy": 0.7578740119934082,
"num_tokens": 168960.0,
"step": 330
},
{
"entropy": 1.523696780204773,
"epoch": 5.171875,
"grad_norm": 2.0707786083221436,
"learning_rate": 8.420756421922088e-05,
"loss": 1.198662519454956,
"mean_token_accuracy": 0.790123462677002,
"num_tokens": 169472.0,
"step": 331
},
{
"entropy": 1.4649165868759155,
"epoch": 5.1875,
"grad_norm": 1.9728301763534546,
"learning_rate": 8.338167831443563e-05,
"loss": 1.0776567459106445,
"mean_token_accuracy": 0.8117154836654663,
"num_tokens": 169984.0,
"step": 332
},
{
"entropy": 1.3908473253250122,
"epoch": 5.203125,
"grad_norm": 1.7057424783706665,
"learning_rate": 8.255830055180899e-05,
"loss": 1.3423693180084229,
"mean_token_accuracy": 0.7771202921867371,
"num_tokens": 170496.0,
"step": 333
},
{
"entropy": 1.602295160293579,
"epoch": 5.21875,
"grad_norm": 1.9868454933166504,
"learning_rate": 8.173746193104845e-05,
"loss": 1.1564866304397583,
"mean_token_accuracy": 0.7991631627082825,
"num_tokens": 171008.0,
"step": 334
},
{
"entropy": 1.4484628438949585,
"epoch": 5.234375,
"grad_norm": 1.8873497247695923,
"learning_rate": 8.091919335626399e-05,
"loss": 1.2578895092010498,
"mean_token_accuracy": 0.7795275449752808,
"num_tokens": 171520.0,
"step": 335
},
{
"entropy": 1.283211350440979,
"epoch": 5.25,
"grad_norm": 2.281681537628174,
"learning_rate": 8.010352563480509e-05,
"loss": 1.1836169958114624,
"mean_token_accuracy": 0.783730149269104,
"num_tokens": 172032.0,
"step": 336
},
{
"entropy": 1.3733656406402588,
"epoch": 5.265625,
"grad_norm": 1.8627188205718994,
"learning_rate": 7.929048947610034e-05,
"loss": 1.0594978332519531,
"mean_token_accuracy": 0.8168724179267883,
"num_tokens": 172544.0,
"step": 337
},
{
"entropy": 1.3879873752593994,
"epoch": 5.28125,
"grad_norm": 2.2347571849823,
"learning_rate": 7.84801154905017e-05,
"loss": 1.1629652976989746,
"mean_token_accuracy": 0.7951318621635437,
"num_tokens": 173056.0,
"step": 338
},
{
"entropy": 1.3931185007095337,
"epoch": 5.296875,
"grad_norm": 2.036378860473633,
"learning_rate": 7.76724341881316e-05,
"loss": 1.1349397897720337,
"mean_token_accuracy": 0.803680956363678,
"num_tokens": 173568.0,
"step": 339
},
{
"entropy": 1.6057257652282715,
"epoch": 5.3125,
"grad_norm": 2.337108612060547,
"learning_rate": 7.686747597773462e-05,
"loss": 1.4726322889328003,
"mean_token_accuracy": 0.7322834730148315,
"num_tokens": 174080.0,
"step": 340
},
{
"entropy": 1.5440049171447754,
"epoch": 5.328125,
"grad_norm": 2.250331401824951,
"learning_rate": 7.606527116553241e-05,
"loss": 1.4109631776809692,
"mean_token_accuracy": 0.7570281028747559,
"num_tokens": 174592.0,
"step": 341
},
{
"entropy": 1.5478792190551758,
"epoch": 5.34375,
"grad_norm": 1.9495255947113037,
"learning_rate": 7.526584995408275e-05,
"loss": 1.334647297859192,
"mean_token_accuracy": 0.7843942642211914,
"num_tokens": 175104.0,
"step": 342
},
{
"entropy": 1.4533789157867432,
"epoch": 5.359375,
"grad_norm": 2.0633509159088135,
"learning_rate": 7.446924244114238e-05,
"loss": 0.9381183385848999,
"mean_token_accuracy": 0.8195329308509827,
"num_tokens": 175616.0,
"step": 343
},
{
"entropy": 1.5073310136795044,
"epoch": 5.375,
"grad_norm": 2.1370253562927246,
"learning_rate": 7.367547861853393e-05,
"loss": 1.2126432657241821,
"mean_token_accuracy": 0.7962577939033508,
"num_tokens": 176128.0,
"step": 344
},
{
"entropy": 1.5421262979507446,
"epoch": 5.390625,
"grad_norm": 2.130643844604492,
"learning_rate": 7.288458837101675e-05,
"loss": 1.481192946434021,
"mean_token_accuracy": 0.75,
"num_tokens": 176640.0,
"step": 345
},
{
"entropy": 1.4015882015228271,
"epoch": 5.40625,
"grad_norm": 1.9656389951705933,
"learning_rate": 7.209660147516154e-05,
"loss": 1.4018691778182983,
"mean_token_accuracy": 0.7677165269851685,
"num_tokens": 177152.0,
"step": 346
},
{
"entropy": 1.2663908004760742,
"epoch": 5.421875,
"grad_norm": 1.9276113510131836,
"learning_rate": 7.131154759822968e-05,
"loss": 1.1962919235229492,
"mean_token_accuracy": 0.804780900478363,
"num_tokens": 177664.0,
"step": 347
},
{
"entropy": 1.369564175605774,
"epoch": 5.4375,
"grad_norm": 2.2459683418273926,
"learning_rate": 7.052945629705579e-05,
"loss": 1.0512068271636963,
"mean_token_accuracy": 0.8061224222183228,
"num_tokens": 178176.0,
"step": 348
},
{
"entropy": 1.4165040254592896,
"epoch": 5.453125,
"grad_norm": 1.9474868774414062,
"learning_rate": 6.975035701693544e-05,
"loss": 1.1490492820739746,
"mean_token_accuracy": 0.7983871102333069,
"num_tokens": 178688.0,
"step": 349
},
{
"entropy": 1.396820068359375,
"epoch": 5.46875,
"grad_norm": 2.4981231689453125,
"learning_rate": 6.897427909051607e-05,
"loss": 1.2930082082748413,
"mean_token_accuracy": 0.7736220359802246,
"num_tokens": 179200.0,
"step": 350
},
{
"entropy": 1.5704162120819092,
"epoch": 5.484375,
"grad_norm": 1.901033639907837,
"learning_rate": 6.820125173669306e-05,
"loss": 1.0180715322494507,
"mean_token_accuracy": 0.8322580456733704,
"num_tokens": 179712.0,
"step": 351
},
{
"entropy": 1.5132912397384644,
"epoch": 5.5,
"grad_norm": 2.2790002822875977,
"learning_rate": 6.743130405950929e-05,
"loss": 1.506807565689087,
"mean_token_accuracy": 0.747035562992096,
"num_tokens": 180224.0,
"step": 352
},
{
"entropy": 1.4252924919128418,
"epoch": 5.515625,
"grad_norm": 1.6666802167892456,
"learning_rate": 6.66644650470597e-05,
"loss": 0.6767361760139465,
"mean_token_accuracy": 0.8766520023345947,
"num_tokens": 180736.0,
"step": 353
},
{
"entropy": 1.3335093259811401,
"epoch": 5.53125,
"grad_norm": 2.164742946624756,
"learning_rate": 6.59007635703996e-05,
"loss": 1.316645622253418,
"mean_token_accuracy": 0.7795275449752808,
"num_tokens": 181248.0,
"step": 354
},
{
"entropy": 1.360636830329895,
"epoch": 5.546875,
"grad_norm": 2.145939350128174,
"learning_rate": 6.514022838245801e-05,
"loss": 1.1844747066497803,
"mean_token_accuracy": 0.782868504524231,
"num_tokens": 181760.0,
"step": 355
},
{
"entropy": 1.400923490524292,
"epoch": 5.5625,
"grad_norm": 2.355154037475586,
"learning_rate": 6.438288811695492e-05,
"loss": 1.380852222442627,
"mean_token_accuracy": 0.7618110179901123,
"num_tokens": 182272.0,
"step": 356
},
{
"entropy": 1.387778878211975,
"epoch": 5.578125,
"grad_norm": 2.233530282974243,
"learning_rate": 6.362877128732319e-05,
"loss": 1.174194097518921,
"mean_token_accuracy": 0.781124472618103,
"num_tokens": 182784.0,
"step": 357
},
{
"entropy": 1.5693912506103516,
"epoch": 5.59375,
"grad_norm": 1.7340208292007446,
"learning_rate": 6.287790628563534e-05,
"loss": 0.8804768323898315,
"mean_token_accuracy": 0.8436123132705688,
"num_tokens": 183296.0,
"step": 358
},
{
"entropy": 1.3868722915649414,
"epoch": 5.609375,
"grad_norm": 1.980425477027893,
"learning_rate": 6.213032138153417e-05,
"loss": 0.9284123182296753,
"mean_token_accuracy": 0.8329854011535645,
"num_tokens": 183808.0,
"step": 359
},
{
"entropy": 1.5730829238891602,
"epoch": 5.625,
"grad_norm": 2.1033363342285156,
"learning_rate": 6.138604472116889e-05,
"loss": 1.3292860984802246,
"mean_token_accuracy": 0.7835671305656433,
"num_tokens": 184320.0,
"step": 360
},
{
"entropy": 1.3760509490966797,
"epoch": 5.640625,
"grad_norm": 2.2947301864624023,
"learning_rate": 6.064510432613499e-05,
"loss": 1.3286343812942505,
"mean_token_accuracy": 0.7677165269851685,
"num_tokens": 184832.0,
"step": 361
},
{
"entropy": 1.4730992317199707,
"epoch": 5.65625,
"grad_norm": 1.7933112382888794,
"learning_rate": 5.990752809241968e-05,
"loss": 1.1549919843673706,
"mean_token_accuracy": 0.7962962985038757,
"num_tokens": 185344.0,
"step": 362
},
{
"entropy": 1.396909236907959,
"epoch": 5.671875,
"grad_norm": 2.2379417419433594,
"learning_rate": 5.917334378935118e-05,
"loss": 1.1229009628295898,
"mean_token_accuracy": 0.7857142686843872,
"num_tokens": 185856.0,
"step": 363
},
{
"entropy": 1.545285701751709,
"epoch": 5.6875,
"grad_norm": 2.0039150714874268,
"learning_rate": 5.8442579058553556e-05,
"loss": 1.2438194751739502,
"mean_token_accuracy": 0.7868852615356445,
"num_tokens": 186368.0,
"step": 364
},
{
"entropy": 1.264091968536377,
"epoch": 5.703125,
"grad_norm": 2.11838436126709,
"learning_rate": 5.771526141290599e-05,
"loss": 1.0326876640319824,
"mean_token_accuracy": 0.8125,
"num_tokens": 186880.0,
"step": 365
},
{
"entropy": 1.390173077583313,
"epoch": 5.71875,
"grad_norm": 2.515826463699341,
"learning_rate": 5.6991418235506615e-05,
"loss": 1.2848618030548096,
"mean_token_accuracy": 0.7696850299835205,
"num_tokens": 187392.0,
"step": 366
},
{
"entropy": 1.4828932285308838,
"epoch": 5.734375,
"grad_norm": 3.0285286903381348,
"learning_rate": 5.627107677864206e-05,
"loss": 1.4369527101516724,
"mean_token_accuracy": 0.7539370059967041,
"num_tokens": 187904.0,
"step": 367
},
{
"entropy": 1.3886916637420654,
"epoch": 5.75,
"grad_norm": 2.0452868938446045,
"learning_rate": 5.555426416276093e-05,
"loss": 1.0845024585723877,
"mean_token_accuracy": 0.8155737519264221,
"num_tokens": 188416.0,
"step": 368
},
{
"entropy": 1.3375890254974365,
"epoch": 5.765625,
"grad_norm": 2.1924948692321777,
"learning_rate": 5.4841007375453186e-05,
"loss": 1.253337025642395,
"mean_token_accuracy": 0.7795275449752808,
"num_tokens": 188928.0,
"step": 369
},
{
"entropy": 1.457409381866455,
"epoch": 5.78125,
"grad_norm": 2.147710084915161,
"learning_rate": 5.413133327043364e-05,
"loss": 1.1731246709823608,
"mean_token_accuracy": 0.800407350063324,
"num_tokens": 189440.0,
"step": 370
},
{
"entropy": 1.477699637413025,
"epoch": 5.796875,
"grad_norm": 2.23069429397583,
"learning_rate": 5.34252685665313e-05,
"loss": 0.9165684580802917,
"mean_token_accuracy": 0.8441558480262756,
"num_tokens": 189952.0,
"step": 371
},
{
"entropy": 1.5255780220031738,
"epoch": 5.8125,
"grad_norm": 2.031313180923462,
"learning_rate": 5.272283984668313e-05,
"loss": 1.2056560516357422,
"mean_token_accuracy": 0.8024948239326477,
"num_tokens": 190464.0,
"step": 372
},
{
"entropy": 1.4738668203353882,
"epoch": 5.828125,
"grad_norm": 2.3102340698242188,
"learning_rate": 5.2024073556933516e-05,
"loss": 1.2545979022979736,
"mean_token_accuracy": 0.7870182394981384,
"num_tokens": 190976.0,
"step": 373
},
{
"entropy": 1.4832838773727417,
"epoch": 5.84375,
"grad_norm": 1.9114770889282227,
"learning_rate": 5.13289960054382e-05,
"loss": 1.063859224319458,
"mean_token_accuracy": 0.8204593062400818,
"num_tokens": 191488.0,
"step": 374
},
{
"entropy": 1.4326761960983276,
"epoch": 5.859375,
"grad_norm": 2.0074758529663086,
"learning_rate": 5.063763336147421e-05,
"loss": 1.0995495319366455,
"mean_token_accuracy": 0.7991718649864197,
"num_tokens": 192000.0,
"step": 375
},
{
"entropy": 1.218077301979065,
"epoch": 5.875,
"grad_norm": 2.2023136615753174,
"learning_rate": 4.9950011654454394e-05,
"loss": 1.2278096675872803,
"mean_token_accuracy": 0.7854330539703369,
"num_tokens": 192512.0,
"step": 376
},
{
"entropy": 1.4478676319122314,
"epoch": 5.890625,
"grad_norm": 2.902523994445801,
"learning_rate": 4.926615677294723e-05,
"loss": 1.3362658023834229,
"mean_token_accuracy": 0.7757201790809631,
"num_tokens": 193024.0,
"step": 377
},
{
"entropy": 1.5202453136444092,
"epoch": 5.90625,
"grad_norm": 2.273052453994751,
"learning_rate": 4.8586094463702626e-05,
"loss": 1.5842170715332031,
"mean_token_accuracy": 0.7618110179901123,
"num_tokens": 193536.0,
"step": 378
},
{
"entropy": 1.467264175415039,
"epoch": 5.921875,
"grad_norm": 2.2763044834136963,
"learning_rate": 4.7909850330682046e-05,
"loss": 1.1156023740768433,
"mean_token_accuracy": 0.7871900796890259,
"num_tokens": 194048.0,
"step": 379
},
{
"entropy": 1.2735207080841064,
"epoch": 5.9375,
"grad_norm": 2.1325440406799316,
"learning_rate": 4.7237449834094956e-05,
"loss": 1.0693836212158203,
"mean_token_accuracy": 0.8052738308906555,
"num_tokens": 194560.0,
"step": 380
},
{
"entropy": 1.4293042421340942,
"epoch": 5.953125,
"grad_norm": 2.122368097305298,
"learning_rate": 4.656891828943996e-05,
"loss": 1.4175359010696411,
"mean_token_accuracy": 0.7657480239868164,
"num_tokens": 195072.0,
"step": 381
},
{
"entropy": 1.3723492622375488,
"epoch": 5.96875,
"grad_norm": 2.236246347427368,
"learning_rate": 4.5904280866551926e-05,
"loss": 1.3109025955200195,
"mean_token_accuracy": 0.7814960479736328,
"num_tokens": 195584.0,
"step": 382
},
{
"entropy": 1.4019876718521118,
"epoch": 5.984375,
"grad_norm": 2.12497878074646,
"learning_rate": 4.5243562588654076e-05,
"loss": 1.4718176126480103,
"mean_token_accuracy": 0.7657480239868164,
"num_tokens": 196096.0,
"step": 383
},
{
"entropy": 1.4074183702468872,
"epoch": 6.0,
"grad_norm": 2.1889452934265137,
"learning_rate": 4.4586788331416235e-05,
"loss": 1.2516783475875854,
"mean_token_accuracy": 0.7814960479736328,
"num_tokens": 196608.0,
"step": 384
},
{
"entropy": 1.4472509622573853,
"epoch": 6.015625,
"grad_norm": 2.2552759647369385,
"learning_rate": 4.3933982822017876e-05,
"loss": 1.1190861463546753,
"mean_token_accuracy": 0.7952286005020142,
"num_tokens": 197120.0,
"step": 385
},
{
"entropy": 1.3649916648864746,
"epoch": 6.03125,
"grad_norm": 2.2668163776397705,
"learning_rate": 4.3285170638217514e-05,
"loss": 1.1158784627914429,
"mean_token_accuracy": 0.789370059967041,
"num_tokens": 197632.0,
"step": 386
},
{
"entropy": 1.3257055282592773,
"epoch": 6.046875,
"grad_norm": 2.023829698562622,
"learning_rate": 4.264037620742721e-05,
"loss": 0.9378941655158997,
"mean_token_accuracy": 0.8217213153839111,
"num_tokens": 198144.0,
"step": 387
},
{
"entropy": 1.5115382671356201,
"epoch": 6.0625,
"grad_norm": 1.9996124505996704,
"learning_rate": 4.199962380579275e-05,
"loss": 0.8486489057540894,
"mean_token_accuracy": 0.8340517282485962,
"num_tokens": 198656.0,
"step": 388
},
{
"entropy": 1.3277173042297363,
"epoch": 6.078125,
"grad_norm": 2.146892547607422,
"learning_rate": 4.136293755727998e-05,
"loss": 1.0290329456329346,
"mean_token_accuracy": 0.8016194105148315,
"num_tokens": 199168.0,
"step": 389
},
{
"entropy": 1.558488130569458,
"epoch": 6.09375,
"grad_norm": 2.4036035537719727,
"learning_rate": 4.073034143276622e-05,
"loss": 1.5012381076812744,
"mean_token_accuracy": 0.7381889820098877,
"num_tokens": 199680.0,
"step": 390
},
{
"entropy": 1.4996081590652466,
"epoch": 6.109375,
"grad_norm": 1.952165126800537,
"learning_rate": 4.010185924913809e-05,
"loss": 0.9398556351661682,
"mean_token_accuracy": 0.8113207817077637,
"num_tokens": 200192.0,
"step": 391
},
{
"entropy": 1.3826960325241089,
"epoch": 6.125,
"grad_norm": 2.272625207901001,
"learning_rate": 3.947751466839451e-05,
"loss": 1.102223515510559,
"mean_token_accuracy": 0.811475396156311,
"num_tokens": 200704.0,
"step": 392
},
{
"entropy": 1.4677839279174805,
"epoch": 6.140625,
"grad_norm": 1.7582485675811768,
"learning_rate": 3.885733119675616e-05,
"loss": 0.5989710092544556,
"mean_token_accuracy": 0.8839285969734192,
"num_tokens": 201216.0,
"step": 393
},
{
"entropy": 1.6351354122161865,
"epoch": 6.15625,
"grad_norm": 1.6400773525238037,
"learning_rate": 3.8241332183780105e-05,
"loss": 0.6478846073150635,
"mean_token_accuracy": 0.8755760192871094,
"num_tokens": 201728.0,
"step": 394
},
{
"entropy": 1.2197786569595337,
"epoch": 6.171875,
"grad_norm": 2.219877243041992,
"learning_rate": 3.762954082148113e-05,
"loss": 0.9954053163528442,
"mean_token_accuracy": 0.803960382938385,
"num_tokens": 202240.0,
"step": 395
},
{
"entropy": 1.3044180870056152,
"epoch": 6.1875,
"grad_norm": 2.2616729736328125,
"learning_rate": 3.702198014345813e-05,
"loss": 1.1857067346572876,
"mean_token_accuracy": 0.7786561250686646,
"num_tokens": 202752.0,
"step": 396
},
{
"entropy": 1.4440141916275024,
"epoch": 6.203125,
"grad_norm": 2.3922085762023926,
"learning_rate": 3.641867302402731e-05,
"loss": 1.0443466901779175,
"mean_token_accuracy": 0.8074533939361572,
"num_tokens": 203264.0,
"step": 397
},
{
"entropy": 1.4667960405349731,
"epoch": 6.21875,
"grad_norm": 1.9791189432144165,
"learning_rate": 3.5819642177360744e-05,
"loss": 0.9006252884864807,
"mean_token_accuracy": 0.8259023427963257,
"num_tokens": 203776.0,
"step": 398
},
{
"entropy": 1.2889478206634521,
"epoch": 6.234375,
"grad_norm": 2.4023540019989014,
"learning_rate": 3.5224910156631154e-05,
"loss": 1.1616065502166748,
"mean_token_accuracy": 0.7854330539703369,
"num_tokens": 204288.0,
"step": 399
},
{
"entropy": 1.3888750076293945,
"epoch": 6.25,
"grad_norm": 3.2403616905212402,
"learning_rate": 3.4634499353163075e-05,
"loss": 1.2539849281311035,
"mean_token_accuracy": 0.7775590419769287,
"num_tokens": 204800.0,
"step": 400
},
{
"entropy": 1.4797190427780151,
"epoch": 6.265625,
"grad_norm": 2.4358670711517334,
"learning_rate": 3.404843199558945e-05,
"loss": 1.1758122444152832,
"mean_token_accuracy": 0.800000011920929,
"num_tokens": 205312.0,
"step": 401
},
{
"entropy": 1.337235689163208,
"epoch": 6.28125,
"grad_norm": 2.2013049125671387,
"learning_rate": 3.346673014901515e-05,
"loss": 1.0903222560882568,
"mean_token_accuracy": 0.8020201921463013,
"num_tokens": 205824.0,
"step": 402
},
{
"entropy": 1.3701938390731812,
"epoch": 6.296875,
"grad_norm": 2.5289316177368164,
"learning_rate": 3.288941571418582e-05,
"loss": 1.1845871210098267,
"mean_token_accuracy": 0.7736220359802246,
"num_tokens": 206336.0,
"step": 403
},
{
"entropy": 1.2536909580230713,
"epoch": 6.3125,
"grad_norm": 2.306061029434204,
"learning_rate": 3.2316510426663745e-05,
"loss": 1.063169240951538,
"mean_token_accuracy": 0.8110235929489136,
"num_tokens": 206848.0,
"step": 404
},
{
"entropy": 1.3208762407302856,
"epoch": 6.328125,
"grad_norm": 2.4128077030181885,
"learning_rate": 3.174803585600906e-05,
"loss": 0.8401311039924622,
"mean_token_accuracy": 0.8350951671600342,
"num_tokens": 207360.0,
"step": 405
},
{
"entropy": 1.3629928827285767,
"epoch": 6.34375,
"grad_norm": 2.3814520835876465,
"learning_rate": 3.1184013404968174e-05,
"loss": 1.1989407539367676,
"mean_token_accuracy": 0.7854330539703369,
"num_tokens": 207872.0,
"step": 406
},
{
"entropy": 1.4353210926055908,
"epoch": 6.359375,
"grad_norm": 2.414461612701416,
"learning_rate": 3.062446430866748e-05,
"loss": 1.0983890295028687,
"mean_token_accuracy": 0.8086419701576233,
"num_tokens": 208384.0,
"step": 407
},
{
"entropy": 1.324364185333252,
"epoch": 6.375,
"grad_norm": 2.3120386600494385,
"learning_rate": 3.0069409633814228e-05,
"loss": 1.0871342420578003,
"mean_token_accuracy": 0.7933070659637451,
"num_tokens": 208896.0,
"step": 408
},
{
"entropy": 1.4769569635391235,
"epoch": 6.390625,
"grad_norm": 2.1611950397491455,
"learning_rate": 2.9518870277903274e-05,
"loss": 1.0565687417984009,
"mean_token_accuracy": 0.8062499761581421,
"num_tokens": 209408.0,
"step": 409
},
{
"entropy": 1.2712613344192505,
"epoch": 6.40625,
"grad_norm": 2.1195592880249023,
"learning_rate": 2.8972866968430098e-05,
"loss": 0.8914839029312134,
"mean_token_accuracy": 0.8381742835044861,
"num_tokens": 209920.0,
"step": 410
},
{
"entropy": 1.3642261028289795,
"epoch": 6.421875,
"grad_norm": 2.089002847671509,
"learning_rate": 2.84314202621108e-05,
"loss": 1.042889952659607,
"mean_token_accuracy": 0.8139059543609619,
"num_tokens": 210432.0,
"step": 411
},
{
"entropy": 1.2996537685394287,
"epoch": 6.4375,
"grad_norm": 2.5762758255004883,
"learning_rate": 2.7894550544107737e-05,
"loss": 1.1719818115234375,
"mean_token_accuracy": 0.7814960479736328,
"num_tokens": 210944.0,
"step": 412
},
{
"entropy": 1.325704574584961,
"epoch": 6.453125,
"grad_norm": 2.469484329223633,
"learning_rate": 2.7362278027262457e-05,
"loss": 0.9921610355377197,
"mean_token_accuracy": 0.7991803288459778,
"num_tokens": 211456.0,
"step": 413
},
{
"entropy": 1.2528401613235474,
"epoch": 6.46875,
"grad_norm": 2.5817010402679443,
"learning_rate": 2.68346227513343e-05,
"loss": 1.012644648551941,
"mean_token_accuracy": 0.8104838728904724,
"num_tokens": 211968.0,
"step": 414
},
{
"entropy": 1.3440062999725342,
"epoch": 6.484375,
"grad_norm": 2.138012170791626,
"learning_rate": 2.6311604582246238e-05,
"loss": 0.9965952634811401,
"mean_token_accuracy": 0.8103092908859253,
"num_tokens": 212480.0,
"step": 415
},
{
"entropy": 1.2636010646820068,
"epoch": 6.5,
"grad_norm": 3.449289083480835,
"learning_rate": 2.5793243211336645e-05,
"loss": 1.2079899311065674,
"mean_token_accuracy": 0.7696850299835205,
"num_tokens": 212992.0,
"step": 416
},
{
"entropy": 1.3567794561386108,
"epoch": 6.515625,
"grad_norm": 2.7362849712371826,
"learning_rate": 2.5279558154618197e-05,
"loss": 1.003839373588562,
"mean_token_accuracy": 0.8060606122016907,
"num_tokens": 213504.0,
"step": 417
},
{
"entropy": 1.477582335472107,
"epoch": 6.53125,
"grad_norm": 2.272670030593872,
"learning_rate": 2.4770568752042995e-05,
"loss": 0.9059895873069763,
"mean_token_accuracy": 0.8144989609718323,
"num_tokens": 214016.0,
"step": 418
},
{
"entropy": 1.4449951648712158,
"epoch": 6.546875,
"grad_norm": 2.270644187927246,
"learning_rate": 2.4266294166774288e-05,
"loss": 1.1380894184112549,
"mean_token_accuracy": 0.7979592084884644,
"num_tokens": 214528.0,
"step": 419
},
{
"entropy": 1.210769772529602,
"epoch": 6.5625,
"grad_norm": 2.6737060546875,
"learning_rate": 2.376675338446525e-05,
"loss": 1.0513958930969238,
"mean_token_accuracy": 0.8007968068122864,
"num_tokens": 215040.0,
"step": 420
},
{
"entropy": 1.351199984550476,
"epoch": 6.578125,
"grad_norm": 2.685697078704834,
"learning_rate": 2.3271965212543932e-05,
"loss": 1.2178758382797241,
"mean_token_accuracy": 0.7777777910232544,
"num_tokens": 215552.0,
"step": 421
},
{
"entropy": 1.4135276079177856,
"epoch": 6.59375,
"grad_norm": 2.340247869491577,
"learning_rate": 2.278194827950543e-05,
"loss": 1.250877857208252,
"mean_token_accuracy": 0.7775590419769287,
"num_tokens": 216064.0,
"step": 422
},
{
"entropy": 1.230191946029663,
"epoch": 6.609375,
"grad_norm": 2.4362449645996094,
"learning_rate": 2.2296721034210218e-05,
"loss": 1.1102575063705444,
"mean_token_accuracy": 0.7924901247024536,
"num_tokens": 216576.0,
"step": 423
},
{
"entropy": 1.311639428138733,
"epoch": 6.625,
"grad_norm": 2.4978349208831787,
"learning_rate": 2.1816301745189933e-05,
"loss": 1.133759617805481,
"mean_token_accuracy": 0.7854330539703369,
"num_tokens": 217088.0,
"step": 424
},
{
"entropy": 1.344711422920227,
"epoch": 6.640625,
"grad_norm": 2.260749578475952,
"learning_rate": 2.1340708499959197e-05,
"loss": 1.0410059690475464,
"mean_token_accuracy": 0.8065173029899597,
"num_tokens": 217600.0,
"step": 425
},
{
"entropy": 1.2411428689956665,
"epoch": 6.65625,
"grad_norm": 2.3936338424682617,
"learning_rate": 2.0869959204334935e-05,
"loss": 1.097914457321167,
"mean_token_accuracy": 0.796407163143158,
"num_tokens": 218112.0,
"step": 426
},
{
"entropy": 1.447340965270996,
"epoch": 6.671875,
"grad_norm": 2.2700304985046387,
"learning_rate": 2.0404071581761995e-05,
"loss": 1.098491907119751,
"mean_token_accuracy": 0.7946611642837524,
"num_tokens": 218624.0,
"step": 427
},
{
"entropy": 1.4071056842803955,
"epoch": 6.6875,
"grad_norm": 2.3570244312286377,
"learning_rate": 1.9943063172646085e-05,
"loss": 1.2364177703857422,
"mean_token_accuracy": 0.7695390582084656,
"num_tokens": 219136.0,
"step": 428
},
{
"entropy": 1.3318933248519897,
"epoch": 6.703125,
"grad_norm": 2.6349446773529053,
"learning_rate": 1.9486951333693296e-05,
"loss": 1.1402510404586792,
"mean_token_accuracy": 0.7907444834709167,
"num_tokens": 219648.0,
"step": 429
},
{
"entropy": 1.3377373218536377,
"epoch": 6.71875,
"grad_norm": 2.332634925842285,
"learning_rate": 1.9035753237256486e-05,
"loss": 1.0878653526306152,
"mean_token_accuracy": 0.7923387289047241,
"num_tokens": 220160.0,
"step": 430
},
{
"entropy": 1.3920668363571167,
"epoch": 6.734375,
"grad_norm": 2.1408042907714844,
"learning_rate": 1.8589485870689023e-05,
"loss": 1.233846664428711,
"mean_token_accuracy": 0.7933070659637451,
"num_tokens": 220672.0,
"step": 431
},
{
"entropy": 1.3373357057571411,
"epoch": 6.75,
"grad_norm": 3.003164768218994,
"learning_rate": 1.814816603570497e-05,
"loss": 1.2591382265090942,
"mean_token_accuracy": 0.7657480239868164,
"num_tokens": 221184.0,
"step": 432
},
{
"entropy": 1.3029452562332153,
"epoch": 6.765625,
"grad_norm": 2.2512388229370117,
"learning_rate": 1.7711810347746757e-05,
"loss": 1.1033636331558228,
"mean_token_accuracy": 0.805220901966095,
"num_tokens": 221696.0,
"step": 433
},
{
"entropy": 1.3201580047607422,
"epoch": 6.78125,
"grad_norm": 2.216168165206909,
"learning_rate": 1.728043523535933e-05,
"loss": 0.8506653308868408,
"mean_token_accuracy": 0.8407643437385559,
"num_tokens": 222208.0,
"step": 434
},
{
"entropy": 1.3668451309204102,
"epoch": 6.796875,
"grad_norm": 2.320620059967041,
"learning_rate": 1.6854056939571925e-05,
"loss": 1.1892409324645996,
"mean_token_accuracy": 0.7755905389785767,
"num_tokens": 222720.0,
"step": 435
},
{
"entropy": 1.3634605407714844,
"epoch": 6.8125,
"grad_norm": 2.3351492881774902,
"learning_rate": 1.6432691513286318e-05,
"loss": 1.1596734523773193,
"mean_token_accuracy": 0.787401556968689,
"num_tokens": 223232.0,
"step": 436
},
{
"entropy": 1.494618535041809,
"epoch": 6.828125,
"grad_norm": 2.343942165374756,
"learning_rate": 1.6016354820672715e-05,
"loss": 1.0683393478393555,
"mean_token_accuracy": 0.8004158139228821,
"num_tokens": 223744.0,
"step": 437
},
{
"entropy": 1.3987553119659424,
"epoch": 6.84375,
"grad_norm": 2.640872001647949,
"learning_rate": 1.560506253657223e-05,
"loss": 0.9901496767997742,
"mean_token_accuracy": 0.8179916143417358,
"num_tokens": 224256.0,
"step": 438
},
{
"entropy": 1.3903093338012695,
"epoch": 6.859375,
"grad_norm": 2.8874828815460205,
"learning_rate": 1.519883014590691e-05,
"loss": 1.076893925666809,
"mean_token_accuracy": 0.7962962985038757,
"num_tokens": 224768.0,
"step": 439
},
{
"entropy": 1.4098860025405884,
"epoch": 6.875,
"grad_norm": 2.306347370147705,
"learning_rate": 1.4797672943096711e-05,
"loss": 1.2542139291763306,
"mean_token_accuracy": 0.7786720395088196,
"num_tokens": 225280.0,
"step": 440
},
{
"entropy": 1.4133609533309937,
"epoch": 6.890625,
"grad_norm": 2.25966739654541,
"learning_rate": 1.4401606031483497e-05,
"loss": 1.2815967798233032,
"mean_token_accuracy": 0.7742574214935303,
"num_tokens": 225792.0,
"step": 441
},
{
"entropy": 1.248429775238037,
"epoch": 6.90625,
"grad_norm": 2.32254695892334,
"learning_rate": 1.4010644322762699e-05,
"loss": 0.8687695860862732,
"mean_token_accuracy": 0.8353909254074097,
"num_tokens": 226304.0,
"step": 442
},
{
"entropy": 1.3251779079437256,
"epoch": 6.921875,
"grad_norm": 2.5684423446655273,
"learning_rate": 1.3624802536421641e-05,
"loss": 1.2095526456832886,
"mean_token_accuracy": 0.7775590419769287,
"num_tokens": 226816.0,
"step": 443
},
{
"entropy": 1.2735978364944458,
"epoch": 6.9375,
"grad_norm": 2.184539794921875,
"learning_rate": 1.3244095199185534e-05,
"loss": 1.0298391580581665,
"mean_token_accuracy": 0.8132529854774475,
"num_tokens": 227328.0,
"step": 444
},
{
"entropy": 1.426642656326294,
"epoch": 6.953125,
"grad_norm": 2.2295777797698975,
"learning_rate": 1.2868536644470396e-05,
"loss": 1.2079875469207764,
"mean_token_accuracy": 0.7871485948562622,
"num_tokens": 227840.0,
"step": 445
},
{
"entropy": 1.306974172592163,
"epoch": 6.96875,
"grad_norm": 2.530898094177246,
"learning_rate": 1.249814101184361e-05,
"loss": 1.1371548175811768,
"mean_token_accuracy": 0.7834645509719849,
"num_tokens": 228352.0,
"step": 446
},
{
"entropy": 1.3690567016601562,
"epoch": 6.984375,
"grad_norm": 2.2345566749572754,
"learning_rate": 1.2132922246491333e-05,
"loss": 1.045675277709961,
"mean_token_accuracy": 0.802874743938446,
"num_tokens": 228864.0,
"step": 447
},
{
"entropy": 1.3706583976745605,
"epoch": 7.0,
"grad_norm": 2.3838391304016113,
"learning_rate": 1.177289409869373e-05,
"loss": 0.8126500844955444,
"mean_token_accuracy": 0.8404255509376526,
"num_tokens": 229376.0,
"step": 448
},
{
"entropy": 1.4000086784362793,
"epoch": 7.015625,
"grad_norm": 2.8213186264038086,
"learning_rate": 1.1418070123306989e-05,
"loss": 1.2279022932052612,
"mean_token_accuracy": 0.7637795209884644,
"num_tokens": 229888.0,
"step": 449
},
{
"entropy": 1.4237322807312012,
"epoch": 7.03125,
"grad_norm": 1.9904886484146118,
"learning_rate": 1.1068463679253293e-05,
"loss": 0.717651903629303,
"mean_token_accuracy": 0.8627451062202454,
"num_tokens": 230400.0,
"step": 450
},
{
"entropy": 1.3660708665847778,
"epoch": 7.046875,
"grad_norm": 2.419309139251709,
"learning_rate": 1.0724087929017677e-05,
"loss": 1.1893762350082397,
"mean_token_accuracy": 0.7775590419769287,
"num_tokens": 230912.0,
"step": 451
},
{
"entropy": 1.4297083616256714,
"epoch": 7.0625,
"grad_norm": 2.564194679260254,
"learning_rate": 1.0384955838152442e-05,
"loss": 0.9669811725616455,
"mean_token_accuracy": 0.8092243075370789,
"num_tokens": 231424.0,
"step": 452
},
{
"entropy": 1.2599018812179565,
"epoch": 7.078125,
"grad_norm": 2.5647976398468018,
"learning_rate": 1.0051080174789172e-05,
"loss": 0.9141256809234619,
"mean_token_accuracy": 0.8220859169960022,
"num_tokens": 231936.0,
"step": 453
},
{
"entropy": 1.3683661222457886,
"epoch": 7.09375,
"grad_norm": 2.4807143211364746,
"learning_rate": 9.722473509157857e-06,
"loss": 1.0112833976745605,
"mean_token_accuracy": 0.806584358215332,
"num_tokens": 232448.0,
"step": 454
},
{
"entropy": 1.4733227491378784,
"epoch": 7.109375,
"grad_norm": 2.253063440322876,
"learning_rate": 9.399148213113772e-06,
"loss": 0.8170402646064758,
"mean_token_accuracy": 0.850649356842041,
"num_tokens": 232960.0,
"step": 455
},
{
"entropy": 1.2353498935699463,
"epoch": 7.125,
"grad_norm": 2.056302547454834,
"learning_rate": 9.081116459671511e-06,
"loss": 0.8609686493873596,
"mean_token_accuracy": 0.8292682766914368,
"num_tokens": 233472.0,
"step": 456
},
{
"entropy": 1.3128821849822998,
"epoch": 7.140625,
"grad_norm": 2.351895570755005,
"learning_rate": 8.768390222546895e-06,
"loss": 0.9468564391136169,
"mean_token_accuracy": 0.8200408816337585,
"num_tokens": 233984.0,
"step": 457
},
{
"entropy": 1.3826062679290771,
"epoch": 7.15625,
"grad_norm": 2.380359172821045,
"learning_rate": 8.460981275705942e-06,
"loss": 0.9691749811172485,
"mean_token_accuracy": 0.8140496015548706,
"num_tokens": 234496.0,
"step": 458
},
{
"entropy": 1.3204487562179565,
"epoch": 7.171875,
"grad_norm": 2.3769736289978027,
"learning_rate": 8.158901192921823e-06,
"loss": 1.018041968345642,
"mean_token_accuracy": 0.805220901966095,
"num_tokens": 235008.0,
"step": 459
},
{
"entropy": 1.301025390625,
"epoch": 7.1875,
"grad_norm": 2.4315240383148193,
"learning_rate": 7.862161347338836e-06,
"loss": 0.9299899339675903,
"mean_token_accuracy": 0.8333333134651184,
"num_tokens": 235520.0,
"step": 460
},
{
"entropy": 1.1855802536010742,
"epoch": 7.203125,
"grad_norm": 2.7347376346588135,
"learning_rate": 7.570772911044498e-06,
"loss": 1.0176899433135986,
"mean_token_accuracy": 0.8063241243362427,
"num_tokens": 236032.0,
"step": 461
},
{
"entropy": 1.3519090414047241,
"epoch": 7.21875,
"grad_norm": 2.1036648750305176,
"learning_rate": 7.284746854648748e-06,
"loss": 0.780006468296051,
"mean_token_accuracy": 0.8479657173156738,
"num_tokens": 236544.0,
"step": 462
},
{
"entropy": 1.3214199542999268,
"epoch": 7.234375,
"grad_norm": 2.382298707962036,
"learning_rate": 7.00409394687092e-06,
"loss": 1.1440752744674683,
"mean_token_accuracy": 0.7913385629653931,
"num_tokens": 237056.0,
"step": 463
},
{
"entropy": 1.482852816581726,
"epoch": 7.25,
"grad_norm": 2.64764142036438,
"learning_rate": 6.728824754134398e-06,
"loss": 1.1399593353271484,
"mean_token_accuracy": 0.7877551317214966,
"num_tokens": 237568.0,
"step": 464
},
{
"entropy": 1.3064343929290771,
"epoch": 7.265625,
"grad_norm": 2.473623752593994,
"learning_rate": 6.458949640168675e-06,
"loss": 0.8528488874435425,
"mean_token_accuracy": 0.828157365322113,
"num_tokens": 238080.0,
"step": 465
},
{
"entropy": 1.1553479433059692,
"epoch": 7.28125,
"grad_norm": 2.495208501815796,
"learning_rate": 6.1944787656192765e-06,
"loss": 0.8627750873565674,
"mean_token_accuracy": 0.8340080976486206,
"num_tokens": 238592.0,
"step": 466
},
{
"entropy": 1.411194920539856,
"epoch": 7.296875,
"grad_norm": 2.01597261428833,
"learning_rate": 5.935422087665132e-06,
"loss": 0.8617269992828369,
"mean_token_accuracy": 0.8397436141967773,
"num_tokens": 239104.0,
"step": 467
},
{
"entropy": 1.3339228630065918,
"epoch": 7.3125,
"grad_norm": 3.0477869510650635,
"learning_rate": 5.681789359643779e-06,
"loss": 0.9688454270362854,
"mean_token_accuracy": 0.7971311211585999,
"num_tokens": 239616.0,
"step": 468
},
{
"entropy": 1.497868299484253,
"epoch": 7.328125,
"grad_norm": 2.4108757972717285,
"learning_rate": 5.4335901306840235e-06,
"loss": 0.8658555746078491,
"mean_token_accuracy": 0.8293736577033997,
"num_tokens": 240128.0,
"step": 469
},
{
"entropy": 1.3256721496582031,
"epoch": 7.34375,
"grad_norm": 2.7251064777374268,
"learning_rate": 5.190833745346606e-06,
"loss": 0.959291934967041,
"mean_token_accuracy": 0.8202019929885864,
"num_tokens": 240640.0,
"step": 470
},
{
"entropy": 1.3926868438720703,
"epoch": 7.359375,
"grad_norm": 2.09291410446167,
"learning_rate": 4.953529343272189e-06,
"loss": 0.841661274433136,
"mean_token_accuracy": 0.8400852680206299,
"num_tokens": 241152.0,
"step": 471
},
{
"entropy": 1.2108948230743408,
"epoch": 7.375,
"grad_norm": 2.283916711807251,
"learning_rate": 4.721685858837393e-06,
"loss": 1.1214765310287476,
"mean_token_accuracy": 0.807539701461792,
"num_tokens": 241664.0,
"step": 472
},
{
"entropy": 1.4406850337982178,
"epoch": 7.390625,
"grad_norm": 2.1983139514923096,
"learning_rate": 4.495312020818403e-06,
"loss": 1.072562336921692,
"mean_token_accuracy": 0.8132780194282532,
"num_tokens": 242176.0,
"step": 473
},
{
"entropy": 1.4410004615783691,
"epoch": 7.40625,
"grad_norm": 2.522141933441162,
"learning_rate": 4.2744163520622325e-06,
"loss": 1.186017394065857,
"mean_token_accuracy": 0.7716535329818726,
"num_tokens": 242688.0,
"step": 474
},
{
"entropy": 1.443795084953308,
"epoch": 7.421875,
"grad_norm": 2.468191385269165,
"learning_rate": 4.05900716916599e-06,
"loss": 1.041429042816162,
"mean_token_accuracy": 0.8024691343307495,
"num_tokens": 243200.0,
"step": 475
},
{
"entropy": 1.4273154735565186,
"epoch": 7.4375,
"grad_norm": 2.0933821201324463,
"learning_rate": 3.849092582163621e-06,
"loss": 0.7831029295921326,
"mean_token_accuracy": 0.8481561541557312,
"num_tokens": 243712.0,
"step": 476
},
{
"entropy": 1.1794345378875732,
"epoch": 7.453125,
"grad_norm": 2.505176544189453,
"learning_rate": 3.6446804942207306e-06,
"loss": 1.0385328531265259,
"mean_token_accuracy": 0.7952755689620972,
"num_tokens": 244224.0,
"step": 477
},
{
"entropy": 1.3042283058166504,
"epoch": 7.46875,
"grad_norm": 2.2837576866149902,
"learning_rate": 3.4457786013368403e-06,
"loss": 1.044230580329895,
"mean_token_accuracy": 0.8084677457809448,
"num_tokens": 244736.0,
"step": 478
},
{
"entropy": 1.3156306743621826,
"epoch": 7.484375,
"grad_norm": 3.331399440765381,
"learning_rate": 3.252394392055868e-06,
"loss": 1.091844916343689,
"mean_token_accuracy": 0.789370059967041,
"num_tokens": 245248.0,
"step": 479
},
{
"entropy": 1.2917033433914185,
"epoch": 7.5,
"grad_norm": 2.2519593238830566,
"learning_rate": 3.064535147183922e-06,
"loss": 1.050087332725525,
"mean_token_accuracy": 0.8056111931800842,
"num_tokens": 245760.0,
"step": 480
},
{
"entropy": 1.3923559188842773,
"epoch": 7.515625,
"grad_norm": 2.422091007232666,
"learning_rate": 2.882207939515435e-06,
"loss": 0.909593403339386,
"mean_token_accuracy": 0.8132780194282532,
"num_tokens": 246272.0,
"step": 481
},
{
"entropy": 1.279539942741394,
"epoch": 7.53125,
"grad_norm": 2.5200605392456055,
"learning_rate": 2.7054196335667133e-06,
"loss": 1.2554877996444702,
"mean_token_accuracy": 0.7920792102813721,
"num_tokens": 246784.0,
"step": 482
},
{
"entropy": 1.3303519487380981,
"epoch": 7.546875,
"grad_norm": 2.704820394515991,
"learning_rate": 2.534176885317557e-06,
"loss": 1.147330641746521,
"mean_token_accuracy": 0.7834645509719849,
"num_tokens": 247296.0,
"step": 483
},
{
"entropy": 1.2642784118652344,
"epoch": 7.5625,
"grad_norm": 2.542236089706421,
"learning_rate": 2.368486141960646e-06,
"loss": 0.8348578810691833,
"mean_token_accuracy": 0.8431771993637085,
"num_tokens": 247808.0,
"step": 484
},
{
"entropy": 1.379125952720642,
"epoch": 7.578125,
"grad_norm": 2.5580883026123047,
"learning_rate": 2.2083536416588165e-06,
"loss": 1.251000165939331,
"mean_token_accuracy": 0.7775590419769287,
"num_tokens": 248320.0,
"step": 485
},
{
"entropy": 1.2168140411376953,
"epoch": 7.59375,
"grad_norm": 2.39772367477417,
"learning_rate": 2.053785413310216e-06,
"loss": 1.0238333940505981,
"mean_token_accuracy": 0.7952755689620972,
"num_tokens": 248832.0,
"step": 486
},
{
"entropy": 1.4885849952697754,
"epoch": 7.609375,
"grad_norm": 2.4192397594451904,
"learning_rate": 1.9047872763212347e-06,
"loss": 1.1900980472564697,
"mean_token_accuracy": 0.7917525768280029,
"num_tokens": 249344.0,
"step": 487
},
{
"entropy": 1.2745150327682495,
"epoch": 7.625,
"grad_norm": 2.538703203201294,
"learning_rate": 1.7613648403875802e-06,
"loss": 0.8750802874565125,
"mean_token_accuracy": 0.8223140239715576,
"num_tokens": 249856.0,
"step": 488
},
{
"entropy": 1.3218390941619873,
"epoch": 7.640625,
"grad_norm": 2.5100438594818115,
"learning_rate": 1.6235235052828476e-06,
"loss": 1.1404964923858643,
"mean_token_accuracy": 0.787401556968689,
"num_tokens": 250368.0,
"step": 489
},
{
"entropy": 1.3542776107788086,
"epoch": 7.65625,
"grad_norm": 2.773221969604492,
"learning_rate": 1.4912684606554482e-06,
"loss": 1.2540241479873657,
"mean_token_accuracy": 0.7559055089950562,
"num_tokens": 250880.0,
"step": 490
},
{
"entropy": 1.227868914604187,
"epoch": 7.671875,
"grad_norm": 2.7581679821014404,
"learning_rate": 1.3646046858329984e-06,
"loss": 0.8930553197860718,
"mean_token_accuracy": 0.8255578279495239,
"num_tokens": 251392.0,
"step": 491
},
{
"entropy": 1.2694940567016602,
"epoch": 7.6875,
"grad_norm": 2.4182348251342773,
"learning_rate": 1.2435369496350711e-06,
"loss": 1.0677306652069092,
"mean_token_accuracy": 0.7913385629653931,
"num_tokens": 251904.0,
"step": 492
},
{
"entropy": 1.341015338897705,
"epoch": 7.703125,
"grad_norm": 2.8310325145721436,
"learning_rate": 1.128069810193505e-06,
"loss": 1.1821932792663574,
"mean_token_accuracy": 0.7696850299835205,
"num_tokens": 252416.0,
"step": 493
},
{
"entropy": 1.3577978610992432,
"epoch": 7.71875,
"grad_norm": 2.641374111175537,
"learning_rate": 1.018207614780825e-06,
"loss": 1.157777190208435,
"mean_token_accuracy": 0.7795275449752808,
"num_tokens": 252928.0,
"step": 494
},
{
"entropy": 1.3288973569869995,
"epoch": 7.734375,
"grad_norm": 2.7389330863952637,
"learning_rate": 9.139544996465908e-07,
"loss": 1.018500804901123,
"mean_token_accuracy": 0.8004032373428345,
"num_tokens": 253440.0,
"step": 495
},
{
"entropy": 1.0912328958511353,
"epoch": 7.75,
"grad_norm": 2.2947134971618652,
"learning_rate": 8.153143898616876e-07,
"loss": 0.9197831749916077,
"mean_token_accuracy": 0.8167330622673035,
"num_tokens": 253952.0,
"step": 496
},
{
"entropy": 1.4121466875076294,
"epoch": 7.765625,
"grad_norm": 2.588513135910034,
"learning_rate": 7.222909991704773e-07,
"loss": 1.215504765510559,
"mean_token_accuracy": 0.772455096244812,
"num_tokens": 254464.0,
"step": 497
},
{
"entropy": 1.4346033334732056,
"epoch": 7.78125,
"grad_norm": 2.178701400756836,
"learning_rate": 6.348878298510274e-07,
"loss": 0.8305696845054626,
"mean_token_accuracy": 0.8376068472862244,
"num_tokens": 254976.0,
"step": 498
},
{
"entropy": 1.3011205196380615,
"epoch": 7.796875,
"grad_norm": 2.3126487731933594,
"learning_rate": 5.531081725832998e-07,
"loss": 1.1374728679656982,
"mean_token_accuracy": 0.7913385629653931,
"num_tokens": 255488.0,
"step": 499
},
{
"entropy": 1.401319146156311,
"epoch": 7.8125,
"grad_norm": 2.48214054107666,
"learning_rate": 4.769551063251497e-07,
"loss": 1.0062270164489746,
"mean_token_accuracy": 0.8073770403862,
"num_tokens": 256000.0,
"step": 500
},
{
"entropy": 1.2516509294509888,
"epoch": 7.828125,
"grad_norm": 2.50232195854187,
"learning_rate": 4.064314981964689e-07,
"loss": 0.7502802014350891,
"mean_token_accuracy": 0.850210964679718,
"num_tokens": 256512.0,
"step": 501
},
{
"entropy": 1.3641161918640137,
"epoch": 7.84375,
"grad_norm": 2.276825428009033,
"learning_rate": 3.415400033712545e-07,
"loss": 1.0526609420776367,
"mean_token_accuracy": 0.7983871102333069,
"num_tokens": 257024.0,
"step": 502
},
{
"entropy": 1.1331143379211426,
"epoch": 7.859375,
"grad_norm": 2.2967472076416016,
"learning_rate": 2.822830649776231e-07,
"loss": 0.7948004603385925,
"mean_token_accuracy": 0.8393574357032776,
"num_tokens": 257536.0,
"step": 503
},
{
"entropy": 1.3357852697372437,
"epoch": 7.875,
"grad_norm": 2.6608662605285645,
"learning_rate": 2.2866291400578385e-07,
"loss": 1.15070641040802,
"mean_token_accuracy": 0.7736220359802246,
"num_tokens": 258048.0,
"step": 504
},
{
"entropy": 1.2608537673950195,
"epoch": 7.890625,
"grad_norm": 2.4359138011932373,
"learning_rate": 1.8068156922413924e-07,
"loss": 1.077385663986206,
"mean_token_accuracy": 0.8011810779571533,
"num_tokens": 258560.0,
"step": 505
},
{
"entropy": 1.3851075172424316,
"epoch": 7.90625,
"grad_norm": 2.354517936706543,
"learning_rate": 1.3834083710319577e-07,
"loss": 0.9085444808006287,
"mean_token_accuracy": 0.837837815284729,
"num_tokens": 259072.0,
"step": 506
},
{
"entropy": 1.3305258750915527,
"epoch": 7.921875,
"grad_norm": 2.5277912616729736,
"learning_rate": 1.0164231174756843e-07,
"loss": 1.1649365425109863,
"mean_token_accuracy": 0.7972440719604492,
"num_tokens": 259584.0,
"step": 507
},
{
"entropy": 1.3707011938095093,
"epoch": 7.9375,
"grad_norm": 2.5459818840026855,
"learning_rate": 7.058737483602861e-08,
"loss": 1.1839475631713867,
"mean_token_accuracy": 0.7755905389785767,
"num_tokens": 260096.0,
"step": 508
},
{
"entropy": 1.3563947677612305,
"epoch": 7.953125,
"grad_norm": 2.303511142730713,
"learning_rate": 4.51771955693625e-08,
"loss": 1.1173865795135498,
"mean_token_accuracy": 0.7955911755561829,
"num_tokens": 260608.0,
"step": 509
},
{
"entropy": 1.3299047946929932,
"epoch": 7.96875,
"grad_norm": 2.4176976680755615,
"learning_rate": 2.541273062648952e-08,
"loss": 0.9735764265060425,
"mean_token_accuracy": 0.8189300298690796,
"num_tokens": 261120.0,
"step": 510
},
{
"entropy": 1.252206563949585,
"epoch": 7.984375,
"grad_norm": 2.453003168106079,
"learning_rate": 1.1294724128324551e-08,
"loss": 1.102551817893982,
"mean_token_accuracy": 0.782608687877655,
"num_tokens": 261632.0,
"step": 511
},
{
"entropy": 1.3690651655197144,
"epoch": 8.0,
"grad_norm": 2.252596616744995,
"learning_rate": 2.8237076098336365e-09,
"loss": 1.034975528717041,
"mean_token_accuracy": 0.8102040886878967,
"num_tokens": 262144.0,
"step": 512
}
],
"logging_steps": 1,
"max_steps": 512,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 128,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 70078365696000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}