umer07's picture
Fathom: upload expert-e8-analyst/training_log.json
d1b5c77 verified
[
{
"loss": 1.9181,
"grad_norm": 0.8695156574249268,
"learning_rate": 1.8e-05,
"entropy": 1.457271361351013,
"num_tokens": 223301.0,
"mean_token_accuracy": 0.5781058162450791,
"epoch": 0.016406890894175553,
"step": 10
},
{
"loss": 1.7798,
"grad_norm": 0.6769833564758301,
"learning_rate": 3.8e-05,
"entropy": 1.6721824526786804,
"num_tokens": 449106.0,
"mean_token_accuracy": 0.585400715470314,
"epoch": 0.03281378178835111,
"step": 20
},
{
"loss": 1.5073,
"grad_norm": 0.903017520904541,
"learning_rate": 5.8e-05,
"entropy": 1.568133682012558,
"num_tokens": 681766.0,
"mean_token_accuracy": 0.6311331987380981,
"epoch": 0.04922067268252666,
"step": 30
},
{
"loss": 1.1072,
"grad_norm": 0.4333101511001587,
"learning_rate": 7.800000000000001e-05,
"entropy": 1.0811064839363098,
"num_tokens": 919022.0,
"mean_token_accuracy": 0.7193854421377182,
"epoch": 0.06562756357670221,
"step": 40
},
{
"loss": 0.9973,
"grad_norm": 0.33554309606552124,
"learning_rate": 9.8e-05,
"entropy": 0.9973530560731888,
"num_tokens": 1148282.0,
"mean_token_accuracy": 0.7452630162239074,
"epoch": 0.08203445447087777,
"step": 50
},
{
"loss": 0.8928,
"grad_norm": 0.338527649641037,
"learning_rate": 9.993628283308581e-05,
"entropy": 0.8918034493923187,
"num_tokens": 1372211.0,
"mean_token_accuracy": 0.7673784762620925,
"epoch": 0.09844134536505332,
"step": 60
},
{
"loss": 0.8782,
"grad_norm": 0.32297801971435547,
"learning_rate": 9.971623444249021e-05,
"entropy": 0.8743007063865662,
"num_tokens": 1604226.0,
"mean_token_accuracy": 0.7686779618263244,
"epoch": 0.11484823625922888,
"step": 70
},
{
"loss": 0.8258,
"grad_norm": 0.33724743127822876,
"learning_rate": 9.933976038510333e-05,
"entropy": 0.8295134097337723,
"num_tokens": 1838081.0,
"mean_token_accuracy": 0.7803715646266938,
"epoch": 0.13125512715340443,
"step": 80
},
{
"loss": 0.7891,
"grad_norm": 0.3447152078151703,
"learning_rate": 9.88080451875917e-05,
"entropy": 0.788675120472908,
"num_tokens": 2071855.0,
"mean_token_accuracy": 0.7867679089307785,
"epoch": 0.14766201804757997,
"step": 90
},
{
"loss": 0.8272,
"grad_norm": 0.3010920286178589,
"learning_rate": 9.812276182268236e-05,
"entropy": 0.8329822063446045,
"num_tokens": 2299745.0,
"mean_token_accuracy": 0.7769433975219726,
"epoch": 0.16406890894175555,
"step": 100
},
{
"loss": 0.797,
"grad_norm": 0.3699570298194885,
"learning_rate": 9.728606644537178e-05,
"entropy": 0.8039814531803131,
"num_tokens": 2528567.0,
"mean_token_accuracy": 0.784355628490448,
"epoch": 0.1804757998359311,
"step": 110
},
{
"loss": 0.7743,
"grad_norm": 0.3595486581325531,
"learning_rate": 9.63005916088644e-05,
"entropy": 0.7768057614564896,
"num_tokens": 2751275.0,
"mean_token_accuracy": 0.7889596104621888,
"epoch": 0.19688269073010664,
"step": 120
},
{
"loss": 0.7735,
"grad_norm": 0.3556647002696991,
"learning_rate": 9.516943798158649e-05,
"entropy": 0.776089146733284,
"num_tokens": 2978023.0,
"mean_token_accuracy": 0.7897650897502899,
"epoch": 0.2132895816242822,
"step": 130
},
{
"loss": 0.7622,
"grad_norm": 0.3334764242172241,
"learning_rate": 9.389616459133597e-05,
"entropy": 0.7803491950035095,
"num_tokens": 3204517.0,
"mean_token_accuracy": 0.7920954555273056,
"epoch": 0.22969647251845776,
"step": 140
},
{
"loss": 0.7576,
"grad_norm": 0.4009985029697418,
"learning_rate": 9.248477762726437e-05,
"entropy": 0.7722930639982224,
"num_tokens": 3434323.0,
"mean_token_accuracy": 0.792155721783638,
"epoch": 0.2461033634126333,
"step": 150
},
{
"loss": 0.7741,
"grad_norm": 0.3408016264438629,
"learning_rate": 9.093971783492355e-05,
"entropy": 0.7827848076820374,
"num_tokens": 3661534.0,
"mean_token_accuracy": 0.7909984678030014,
"epoch": 0.26251025430680885,
"step": 160
},
{
"loss": 0.7409,
"grad_norm": 0.3399813175201416,
"learning_rate": 8.926584654403724e-05,
"entropy": 0.7509049952030182,
"num_tokens": 3890921.0,
"mean_token_accuracy": 0.7963089287281037,
"epoch": 0.27891714520098443,
"step": 170
},
{
"loss": 0.7478,
"grad_norm": 0.30348479747772217,
"learning_rate": 8.746843037295937e-05,
"entropy": 0.7562560260295867,
"num_tokens": 4119537.0,
"mean_token_accuracy": 0.794661870598793,
"epoch": 0.29532403609515995,
"step": 180
},
{
"loss": 0.737,
"grad_norm": 0.3403272330760956,
"learning_rate": 8.555312465794403e-05,
"entropy": 0.7475169450044632,
"num_tokens": 4357977.0,
"mean_token_accuracy": 0.796341797709465,
"epoch": 0.3117309269893355,
"step": 190
},
{
"loss": 0.7512,
"grad_norm": 0.3219321072101593,
"learning_rate": 8.352595565936554e-05,
"entropy": 0.7560538798570633,
"num_tokens": 4581046.0,
"mean_token_accuracy": 0.7952380329370499,
"epoch": 0.3281378178835111,
"step": 200
},
{
"loss": 0.7317,
"grad_norm": 0.3038958013057709,
"learning_rate": 8.139330160087374e-05,
"entropy": 0.7322431743144989,
"num_tokens": 4814227.0,
"mean_token_accuracy": 0.7974914610385895,
"epoch": 0.3445447087776866,
"step": 210
},
{
"loss": 0.7205,
"grad_norm": 0.32955309748649597,
"learning_rate": 7.916187260114263e-05,
"entropy": 0.7275226473808288,
"num_tokens": 5048157.0,
"mean_token_accuracy": 0.8014717370271682,
"epoch": 0.3609515996718622,
"step": 220
},
{
"loss": 0.7282,
"grad_norm": 0.4224933683872223,
"learning_rate": 7.68386895613546e-05,
"entropy": 0.7310106873512268,
"num_tokens": 5274071.0,
"mean_token_accuracy": 0.7992997527122497,
"epoch": 0.37735849056603776,
"step": 230
},
{
"loss": 0.6971,
"grad_norm": 0.33381229639053345,
"learning_rate": 7.443106207484776e-05,
"entropy": 0.6995288044214248,
"num_tokens": 5509360.0,
"mean_token_accuracy": 0.8060027480125427,
"epoch": 0.3937653814602133,
"step": 240
},
{
"loss": 0.711,
"grad_norm": 0.31851011514663696,
"learning_rate": 7.194656542843102e-05,
"entropy": 0.7142476379871369,
"num_tokens": 5738285.0,
"mean_token_accuracy": 0.8031993210315704,
"epoch": 0.41017227235438886,
"step": 250
},
{
"loss": 0.7237,
"grad_norm": 0.33157357573509216,
"learning_rate": 6.939301676772927e-05,
"entropy": 0.7256091266870499,
"num_tokens": 5962580.0,
"mean_token_accuracy": 0.8013624370098114,
"epoch": 0.4265791632485644,
"step": 260
},
{
"loss": 0.721,
"grad_norm": 0.3370811641216278,
"learning_rate": 6.677845050155107e-05,
"entropy": 0.7265744864940643,
"num_tokens": 6196094.0,
"mean_token_accuracy": 0.7991349190473557,
"epoch": 0.44298605414273995,
"step": 270
},
{
"loss": 0.6843,
"grad_norm": 0.38120874762535095,
"learning_rate": 6.411109302266616e-05,
"entropy": 0.6908316820859909,
"num_tokens": 6420601.0,
"mean_token_accuracy": 0.8087756901979446,
"epoch": 0.4593929450369155,
"step": 280
},
{
"loss": 0.7072,
"grad_norm": 0.38430851697921753,
"learning_rate": 6.139933682453036e-05,
"entropy": 0.7136244118213654,
"num_tokens": 6655119.0,
"mean_token_accuracy": 0.8047497570514679,
"epoch": 0.47579983593109104,
"step": 290
},
{
"loss": 0.7252,
"grad_norm": 0.3509667217731476,
"learning_rate": 5.8651714095396135e-05,
"entropy": 0.7334865719079972,
"num_tokens": 6876910.0,
"mean_token_accuracy": 0.799770200252533,
"epoch": 0.4922067268252666,
"step": 300
},
{
"loss": 0.6821,
"grad_norm": 0.3153151571750641,
"learning_rate": 5.587686987289189e-05,
"entropy": 0.6873683601617813,
"num_tokens": 7112299.0,
"mean_token_accuracy": 0.8088241666555405,
"epoch": 0.5086136177194421,
"step": 310
},
{
"loss": 0.7196,
"grad_norm": 0.34774187207221985,
"learning_rate": 5.3083534843535074e-05,
"entropy": 0.7214434593915939,
"num_tokens": 7346455.0,
"mean_token_accuracy": 0.8039845436811447,
"epoch": 0.5250205086136177,
"step": 320
},
{
"loss": 0.6638,
"grad_norm": 0.387768030166626,
"learning_rate": 5.028049787276249e-05,
"entropy": 0.6638175457715988,
"num_tokens": 7571791.0,
"mean_token_accuracy": 0.812444058060646,
"epoch": 0.5414273995077933,
"step": 330
},
{
"loss": 0.6766,
"grad_norm": 0.3517005741596222,
"learning_rate": 4.7476578351907954e-05,
"entropy": 0.6799941658973694,
"num_tokens": 7801695.0,
"mean_token_accuracy": 0.811230742931366,
"epoch": 0.5578342904019689,
"step": 340
},
{
"loss": 0.6779,
"grad_norm": 0.32577675580978394,
"learning_rate": 4.468059844913444e-05,
"entropy": 0.6814499109983444,
"num_tokens": 8039821.0,
"mean_token_accuracy": 0.8104382246732712,
"epoch": 0.5742411812961444,
"step": 350
},
{
"loss": 0.6539,
"grad_norm": 0.35933127999305725,
"learning_rate": 4.1901355351628945e-05,
"entropy": 0.6585495263338089,
"num_tokens": 8273149.0,
"mean_token_accuracy": 0.8166852772235871,
"epoch": 0.5906480721903199,
"step": 360
},
{
"loss": 0.6843,
"grad_norm": 0.31598055362701416,
"learning_rate": 3.914759358639719e-05,
"entropy": 0.6861207246780395,
"num_tokens": 8503164.0,
"mean_token_accuracy": 0.8086160510778427,
"epoch": 0.6070549630844955,
"step": 370
},
{
"loss": 0.7094,
"grad_norm": 0.3427006006240845,
"learning_rate": 3.642797750674629e-05,
"entropy": 0.7133786290884018,
"num_tokens": 8726435.0,
"mean_token_accuracy": 0.8027824640274048,
"epoch": 0.623461853978671,
"step": 380
},
{
"loss": 0.6877,
"grad_norm": 0.34877264499664307,
"learning_rate": 3.375106403102389e-05,
"entropy": 0.6881168276071549,
"num_tokens": 8954291.0,
"mean_token_accuracy": 0.8073496133089065,
"epoch": 0.6398687448728466,
"step": 390
},
{
"loss": 0.6835,
"grad_norm": 0.3225726783275604,
"learning_rate": 3.112527571938717e-05,
"entropy": 0.6862167656421662,
"num_tokens": 9177163.0,
"mean_token_accuracy": 0.8089945495128632,
"epoch": 0.6562756357670222,
"step": 400
},
{
"loss": 0.7008,
"grad_norm": 0.329756498336792,
"learning_rate": 2.8558874273312674e-05,
"entropy": 0.7071986079216004,
"num_tokens": 9404151.0,
"mean_token_accuracy": 0.8044474363327027,
"epoch": 0.6726825266611977,
"step": 410
},
{
"loss": 0.6947,
"grad_norm": 0.3715651035308838,
"learning_rate": 2.605993454122687e-05,
"entropy": 0.69432153403759,
"num_tokens": 9639400.0,
"mean_token_accuracy": 0.8064981371164321,
"epoch": 0.6890894175553732,
"step": 420
},
{
"loss": 0.7066,
"grad_norm": 0.3599180281162262,
"learning_rate": 2.3636319112045496e-05,
"entropy": 0.7111173301935196,
"num_tokens": 9867668.0,
"mean_token_accuracy": 0.8044642627239227,
"epoch": 0.7054963084495488,
"step": 430
},
{
"loss": 0.7259,
"grad_norm": 0.2912443280220032,
"learning_rate": 2.1295653576560163e-05,
"entropy": 0.7254415988922119,
"num_tokens": 10100826.0,
"mean_token_accuracy": 0.8003069430589675,
"epoch": 0.7219031993437244,
"step": 440
},
{
"loss": 0.6761,
"grad_norm": 0.30693626403808594,
"learning_rate": 1.9045302534508297e-05,
"entropy": 0.6833124309778214,
"num_tokens": 10332359.0,
"mean_token_accuracy": 0.8109049916267395,
"epoch": 0.7383100902379,
"step": 450
},
{
"loss": 0.736,
"grad_norm": 0.3155220150947571,
"learning_rate": 1.6892346422817946e-05,
"entropy": 0.736938726902008,
"num_tokens": 10563841.0,
"mean_token_accuracy": 0.7979681819677353,
"epoch": 0.7547169811320755,
"step": 460
},
{
"loss": 0.6945,
"grad_norm": 0.3748078942298889,
"learning_rate": 1.4843559237933473e-05,
"entropy": 0.7031238079071045,
"num_tokens": 10788876.0,
"mean_token_accuracy": 0.8057133972644805,
"epoch": 0.771123872026251,
"step": 470
},
{
"loss": 0.6776,
"grad_norm": 0.3635546565055847,
"learning_rate": 1.2905387222316822e-05,
"entropy": 0.6805126667022705,
"num_tokens": 11015156.0,
"mean_token_accuracy": 0.8101104766130447,
"epoch": 0.7875307629204266,
"step": 480
},
{
"loss": 0.676,
"grad_norm": 0.3111382722854614,
"learning_rate": 1.1083928582183711e-05,
"entropy": 0.6774959295988083,
"num_tokens": 11245860.0,
"mean_token_accuracy": 0.8107922226190567,
"epoch": 0.8039376538146021,
"step": 490
},
{
"loss": 0.6742,
"grad_norm": 0.32188844680786133,
"learning_rate": 9.384914300290748e-06,
"entropy": 0.6842435419559478,
"num_tokens": 11476241.0,
"mean_token_accuracy": 0.8111602008342743,
"epoch": 0.8203445447087777,
"step": 500
},
{
"loss": 0.6544,
"grad_norm": 0.36185422539711,
"learning_rate": 7.813690104143557e-06,
"entropy": 0.6514311820268631,
"num_tokens": 11708112.0,
"mean_token_accuracy": 0.8149820327758789,
"epoch": 0.8367514356029533,
"step": 510
},
{
"loss": 0.6765,
"grad_norm": 0.3183876574039459,
"learning_rate": 6.375199646360142e-06,
"entropy": 0.6856429934501648,
"num_tokens": 11939337.0,
"mean_token_accuracy": 0.8090052843093872,
"epoch": 0.8531583264971287,
"step": 520
},
{
"loss": 0.6761,
"grad_norm": 0.3287002742290497,
"learning_rate": 5.073968950110941e-06,
"entropy": 0.6834310472011567,
"num_tokens": 12174723.0,
"mean_token_accuracy": 0.8104397505521774,
"epoch": 0.8695652173913043,
"step": 530
},
{
"loss": 0.6751,
"grad_norm": 0.35229238867759705,
"learning_rate": 3.914092168575306e-06,
"entropy": 0.6824660181999207,
"num_tokens": 12398555.0,
"mean_token_accuracy": 0.8104325562715531,
"epoch": 0.8859721082854799,
"step": 540
},
{
"loss": 0.6834,
"grad_norm": 0.38912639021873474,
"learning_rate": 2.8992187032210518e-06,
"entropy": 0.682240468263626,
"num_tokens": 12624846.0,
"mean_token_accuracy": 0.8091065347194671,
"epoch": 0.9023789991796555,
"step": 550
},
{
"loss": 0.696,
"grad_norm": 0.306355744600296,
"learning_rate": 2.032541721437209e-06,
"entropy": 0.7058492481708527,
"num_tokens": 12859015.0,
"mean_token_accuracy": 0.8039765357971191,
"epoch": 0.918785890073831,
"step": 560
},
{
"loss": 0.6727,
"grad_norm": 0.38508960604667664,
"learning_rate": 1.3167881096480372e-06,
"entropy": 0.681548210978508,
"num_tokens": 13083551.0,
"mean_token_accuracy": 0.8100948423147202,
"epoch": 0.9351927809680065,
"step": 570
},
{
"loss": 0.7208,
"grad_norm": 0.33893731236457825,
"learning_rate": 7.542098935195918e-07,
"entropy": 0.7220237284898758,
"num_tokens": 13308857.0,
"mean_token_accuracy": 0.8005945891141891,
"epoch": 0.9515996718621821,
"step": 580
},
{
"loss": 0.6759,
"grad_norm": 0.3534739911556244,
"learning_rate": 3.465771522536854e-07,
"entropy": 0.6739370882511139,
"num_tokens": 13543857.0,
"mean_token_accuracy": 0.8097480118274689,
"epoch": 0.9680065627563577,
"step": 590
},
{
"loss": 0.6865,
"grad_norm": 0.3553875982761383,
"learning_rate": 9.517244926393609e-08,
"entropy": 0.6908959478139878,
"num_tokens": 13769574.0,
"mean_token_accuracy": 0.806584045290947,
"epoch": 0.9844134536505332,
"step": 600
},
{
"loss": 0.6525,
"grad_norm": 0.5078703761100769,
"learning_rate": 7.867967567354306e-10,
"entropy": 0.6598060852602908,
"num_tokens": 13978118.0,
"mean_token_accuracy": 0.8165042933664823,
"epoch": 1.0,
"step": 610
},
{
"train_runtime": 6449.4338,
"train_samples_per_second": 3.024,
"train_steps_per_second": 0.095,
"total_flos": 7.156995496917074e+18,
"train_loss": 0.7796625786140317,
"epoch": 1.0,
"step": 610
}
]