SmolLM3-Chat-v1-Adapter / trainer_state.json
igidn's picture
Upload folder using huggingface_hub
570820a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1198,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.376227322220802,
"epoch": 0.01670843776106934,
"grad_norm": 23.182296752929688,
"learning_rate": 1.5e-06,
"loss": 2.4729,
"mean_token_accuracy": 0.5330174341797829,
"num_tokens": 23968.0,
"step": 10
},
{
"entropy": 2.4090343087911608,
"epoch": 0.03341687552213868,
"grad_norm": 16.90180778503418,
"learning_rate": 4e-06,
"loss": 2.2456,
"mean_token_accuracy": 0.546089893579483,
"num_tokens": 50061.0,
"step": 20
},
{
"entropy": 2.3292118519544602,
"epoch": 0.05012531328320802,
"grad_norm": 7.097179889678955,
"learning_rate": 6.5000000000000004e-06,
"loss": 2.0194,
"mean_token_accuracy": 0.5704705387353897,
"num_tokens": 77221.0,
"step": 30
},
{
"entropy": 2.38493994474411,
"epoch": 0.06683375104427736,
"grad_norm": 3.9334213733673096,
"learning_rate": 9e-06,
"loss": 2.091,
"mean_token_accuracy": 0.5746636532247067,
"num_tokens": 102592.0,
"step": 40
},
{
"entropy": 2.4236884146928785,
"epoch": 0.0835421888053467,
"grad_norm": 2.742833375930786,
"learning_rate": 1.1500000000000002e-05,
"loss": 1.9327,
"mean_token_accuracy": 0.5654355354607106,
"num_tokens": 130876.0,
"step": 50
},
{
"entropy": 2.4756928414106367,
"epoch": 0.10025062656641603,
"grad_norm": 4.295400142669678,
"learning_rate": 1.4e-05,
"loss": 2.0381,
"mean_token_accuracy": 0.5688402287662029,
"num_tokens": 155526.0,
"step": 60
},
{
"entropy": 2.422525143623352,
"epoch": 0.11695906432748537,
"grad_norm": 37.36378479003906,
"learning_rate": 1.65e-05,
"loss": 1.9249,
"mean_token_accuracy": 0.5765560433268547,
"num_tokens": 181754.0,
"step": 70
},
{
"entropy": 2.3750332206487657,
"epoch": 0.1336675020885547,
"grad_norm": 53.244632720947266,
"learning_rate": 1.9e-05,
"loss": 1.7852,
"mean_token_accuracy": 0.5806652165949344,
"num_tokens": 208156.0,
"step": 80
},
{
"entropy": 2.1606528967618943,
"epoch": 0.15037593984962405,
"grad_norm": 1.9780453443527222,
"learning_rate": 2.15e-05,
"loss": 1.74,
"mean_token_accuracy": 0.595666554570198,
"num_tokens": 236321.0,
"step": 90
},
{
"entropy": 2.1004574805498124,
"epoch": 0.1670843776106934,
"grad_norm": 2.5225274562835693,
"learning_rate": 2.4e-05,
"loss": 1.7544,
"mean_token_accuracy": 0.5997186966240406,
"num_tokens": 263316.0,
"step": 100
},
{
"entropy": 2.0930860340595245,
"epoch": 0.18379281537176273,
"grad_norm": 2.0412206649780273,
"learning_rate": 2.65e-05,
"loss": 1.6994,
"mean_token_accuracy": 0.6029496222734452,
"num_tokens": 289555.0,
"step": 110
},
{
"entropy": 2.0005944788455965,
"epoch": 0.20050125313283207,
"grad_norm": 5.798661231994629,
"learning_rate": 2.9e-05,
"loss": 1.5928,
"mean_token_accuracy": 0.6268587596714497,
"num_tokens": 322035.0,
"step": 120
},
{
"entropy": 2.1824300229549407,
"epoch": 0.2172096908939014,
"grad_norm": 2.135401964187622,
"learning_rate": 2.999770694261986e-05,
"loss": 1.7705,
"mean_token_accuracy": 0.5905720897018909,
"num_tokens": 343574.0,
"step": 130
},
{
"entropy": 2.317758610844612,
"epoch": 0.23391812865497075,
"grad_norm": 4.436051368713379,
"learning_rate": 2.9983696353001538e-05,
"loss": 1.8244,
"mean_token_accuracy": 0.578718575835228,
"num_tokens": 367065.0,
"step": 140
},
{
"entropy": 2.0938976377248766,
"epoch": 0.2506265664160401,
"grad_norm": 1.6219489574432373,
"learning_rate": 2.995696097848732e-05,
"loss": 1.6849,
"mean_token_accuracy": 0.6061625145375729,
"num_tokens": 395337.0,
"step": 150
},
{
"entropy": 2.05442219376564,
"epoch": 0.2673350041771094,
"grad_norm": 1.6669152975082397,
"learning_rate": 2.9917523523881268e-05,
"loss": 1.7082,
"mean_token_accuracy": 0.6042715333402157,
"num_tokens": 420868.0,
"step": 160
},
{
"entropy": 2.032812547683716,
"epoch": 0.28404344193817876,
"grad_norm": 1.9019651412963867,
"learning_rate": 2.9865417481127877e-05,
"loss": 1.546,
"mean_token_accuracy": 0.6138139866292477,
"num_tokens": 451944.0,
"step": 170
},
{
"entropy": 2.0105547934770582,
"epoch": 0.3007518796992481,
"grad_norm": 1.7808424234390259,
"learning_rate": 2.9800687100869334e-05,
"loss": 1.5909,
"mean_token_accuracy": 0.6109806254506112,
"num_tokens": 482345.0,
"step": 180
},
{
"entropy": 2.0666785687208176,
"epoch": 0.31746031746031744,
"grad_norm": 1.7441576719284058,
"learning_rate": 2.972338735486598e-05,
"loss": 1.6263,
"mean_token_accuracy": 0.6088879898190498,
"num_tokens": 508164.0,
"step": 190
},
{
"entropy": 2.0136130273342134,
"epoch": 0.3341687552213868,
"grad_norm": 1.8193825483322144,
"learning_rate": 2.9633583889312e-05,
"loss": 1.6149,
"mean_token_accuracy": 0.619242499768734,
"num_tokens": 536345.0,
"step": 200
},
{
"entropy": 2.0101848542690277,
"epoch": 0.3508771929824561,
"grad_norm": 1.11763596534729,
"learning_rate": 2.9531352969085914e-05,
"loss": 1.5869,
"mean_token_accuracy": 0.6176401123404502,
"num_tokens": 563157.0,
"step": 210
},
{
"entropy": 1.9853627145290376,
"epoch": 0.36758563074352546,
"grad_norm": 2.025529384613037,
"learning_rate": 2.9416781412983223e-05,
"loss": 1.6008,
"mean_token_accuracy": 0.6206148102879524,
"num_tokens": 590929.0,
"step": 220
},
{
"entropy": 2.108823761343956,
"epoch": 0.3842940685045948,
"grad_norm": 1.4173170328140259,
"learning_rate": 2.9289966519986258e-05,
"loss": 1.6553,
"mean_token_accuracy": 0.598318774998188,
"num_tokens": 615814.0,
"step": 230
},
{
"entropy": 2.0000301033258436,
"epoch": 0.40100250626566414,
"grad_norm": 2.162895679473877,
"learning_rate": 2.9151015986633762e-05,
"loss": 1.6305,
"mean_token_accuracy": 0.6174192741513252,
"num_tokens": 643303.0,
"step": 240
},
{
"entropy": 2.0013814836740496,
"epoch": 0.4177109440267335,
"grad_norm": 3.5182316303253174,
"learning_rate": 2.90000478155605e-05,
"loss": 1.5884,
"mean_token_accuracy": 0.615642835944891,
"num_tokens": 670279.0,
"step": 250
},
{
"entropy": 2.061664643883705,
"epoch": 0.4344193817878028,
"grad_norm": 2.530327558517456,
"learning_rate": 2.883719021528444e-05,
"loss": 1.7341,
"mean_token_accuracy": 0.6077517293393612,
"num_tokens": 695006.0,
"step": 260
},
{
"entropy": 2.0266662567853926,
"epoch": 0.45112781954887216,
"grad_norm": 2.2248501777648926,
"learning_rate": 2.8662581491326686e-05,
"loss": 1.5991,
"mean_token_accuracy": 0.6176492936909199,
"num_tokens": 720926.0,
"step": 270
},
{
"entropy": 1.998877900838852,
"epoch": 0.4678362573099415,
"grad_norm": 1.6995679140090942,
"learning_rate": 2.847636992875661e-05,
"loss": 1.5343,
"mean_token_accuracy": 0.6100999519228936,
"num_tokens": 753552.0,
"step": 280
},
{
"entropy": 2.0063836723566055,
"epoch": 0.48454469507101083,
"grad_norm": 1.5225791931152344,
"learning_rate": 2.827871366626197e-05,
"loss": 1.6512,
"mean_token_accuracy": 0.6144520066678524,
"num_tokens": 776650.0,
"step": 290
},
{
"entropy": 1.9781533569097518,
"epoch": 0.5012531328320802,
"grad_norm": 1.590501070022583,
"learning_rate": 2.806978056185084e-05,
"loss": 1.6408,
"mean_token_accuracy": 0.6150619685649872,
"num_tokens": 801816.0,
"step": 300
},
{
"entropy": 1.9714119344949723,
"epoch": 0.5179615705931495,
"grad_norm": 2.535109519958496,
"learning_rate": 2.7849748050299578e-05,
"loss": 1.603,
"mean_token_accuracy": 0.6225824594497681,
"num_tokens": 826564.0,
"step": 310
},
{
"entropy": 1.94197179377079,
"epoch": 0.5346700083542189,
"grad_norm": 1.5610677003860474,
"learning_rate": 2.7618802992467718e-05,
"loss": 1.5703,
"mean_token_accuracy": 0.6192868888378144,
"num_tokens": 855007.0,
"step": 320
},
{
"entropy": 1.9983945518732071,
"epoch": 0.5513784461152882,
"grad_norm": 1.7279447317123413,
"learning_rate": 2.7377141516607896e-05,
"loss": 1.5837,
"mean_token_accuracy": 0.6141151934862137,
"num_tokens": 881616.0,
"step": 330
},
{
"entropy": 1.8347389072179794,
"epoch": 0.5680868838763575,
"grad_norm": 1.3221409320831299,
"learning_rate": 2.712496885180547e-05,
"loss": 1.4675,
"mean_token_accuracy": 0.6429531842470169,
"num_tokens": 913148.0,
"step": 340
},
{
"entropy": 1.8934204697608947,
"epoch": 0.5847953216374269,
"grad_norm": 1.4465378522872925,
"learning_rate": 2.686249915368938e-05,
"loss": 1.4669,
"mean_token_accuracy": 0.6286376528441906,
"num_tokens": 944937.0,
"step": 350
},
{
"entropy": 1.9060048371553422,
"epoch": 0.6015037593984962,
"grad_norm": 1.6246923208236694,
"learning_rate": 2.658995532256217e-05,
"loss": 1.5736,
"mean_token_accuracy": 0.622390191257,
"num_tokens": 974558.0,
"step": 360
},
{
"entropy": 2.025865262746811,
"epoch": 0.6182121971595655,
"grad_norm": 1.2998696565628052,
"learning_rate": 2.6307568814103658e-05,
"loss": 1.6277,
"mean_token_accuracy": 0.6089055955410003,
"num_tokens": 1001232.0,
"step": 370
},
{
"entropy": 1.9043485343456268,
"epoch": 0.6349206349206349,
"grad_norm": 1.7911604642868042,
"learning_rate": 2.6015579442809094e-05,
"loss": 1.5542,
"mean_token_accuracy": 0.6255838416516781,
"num_tokens": 1029679.0,
"step": 380
},
{
"entropy": 2.0594927102327345,
"epoch": 0.6516290726817042,
"grad_norm": 1.6075160503387451,
"learning_rate": 2.5714235178328554e-05,
"loss": 1.664,
"mean_token_accuracy": 0.608974926173687,
"num_tokens": 1053040.0,
"step": 390
},
{
"entropy": 2.034518975019455,
"epoch": 0.6683375104427736,
"grad_norm": 3.4272360801696777,
"learning_rate": 2.540379193488072e-05,
"loss": 1.7209,
"mean_token_accuracy": 0.6031469151377677,
"num_tokens": 1075141.0,
"step": 400
},
{
"entropy": 2.0515862941741942,
"epoch": 0.6850459482038429,
"grad_norm": 1.2661142349243164,
"learning_rate": 2.5084513353919753e-05,
"loss": 1.6599,
"mean_token_accuracy": 0.6064547099173069,
"num_tokens": 1097768.0,
"step": 410
},
{
"entropy": 1.9067467480897904,
"epoch": 0.7017543859649122,
"grad_norm": 2.2614035606384277,
"learning_rate": 2.4756670580239894e-05,
"loss": 1.5424,
"mean_token_accuracy": 0.6350538037717343,
"num_tokens": 1125706.0,
"step": 420
},
{
"entropy": 2.103572541475296,
"epoch": 0.7184628237259816,
"grad_norm": 2.4501664638519287,
"learning_rate": 2.4420542031707905e-05,
"loss": 1.703,
"mean_token_accuracy": 0.6036252595484257,
"num_tokens": 1147357.0,
"step": 430
},
{
"entropy": 2.072985142469406,
"epoch": 0.7351712614870509,
"grad_norm": 1.8719037771224976,
"learning_rate": 2.4076413162818904e-05,
"loss": 1.6609,
"mean_token_accuracy": 0.606941219419241,
"num_tokens": 1170179.0,
"step": 440
},
{
"entropy": 2.022436353564262,
"epoch": 0.7518796992481203,
"grad_norm": 2.076544761657715,
"learning_rate": 2.3724576222276405e-05,
"loss": 1.7007,
"mean_token_accuracy": 0.6098736897110939,
"num_tokens": 1196284.0,
"step": 450
},
{
"entropy": 2.1311296701431273,
"epoch": 0.7685881370091896,
"grad_norm": 2.9471235275268555,
"learning_rate": 2.3365330004802443e-05,
"loss": 1.7893,
"mean_token_accuracy": 0.5961334474384785,
"num_tokens": 1217358.0,
"step": 460
},
{
"entropy": 1.9371352732181548,
"epoch": 0.7852965747702589,
"grad_norm": 1.4203218221664429,
"learning_rate": 2.2998979597388526e-05,
"loss": 1.608,
"mean_token_accuracy": 0.6184874981641769,
"num_tokens": 1243374.0,
"step": 470
},
{
"entropy": 2.0747445046901705,
"epoch": 0.8020050125313283,
"grad_norm": 2.2124152183532715,
"learning_rate": 2.262583612020294e-05,
"loss": 1.6711,
"mean_token_accuracy": 0.6062519766390324,
"num_tokens": 1267314.0,
"step": 480
},
{
"entropy": 1.7878674179315568,
"epoch": 0.8187134502923976,
"grad_norm": 1.3049612045288086,
"learning_rate": 2.224621646237442e-05,
"loss": 1.4722,
"mean_token_accuracy": 0.6389728732407093,
"num_tokens": 1296917.0,
"step": 490
},
{
"entropy": 1.925000047683716,
"epoch": 0.835421888053467,
"grad_norm": 1.4271504878997803,
"learning_rate": 2.1860443012876574e-05,
"loss": 1.6114,
"mean_token_accuracy": 0.616053618490696,
"num_tokens": 1322826.0,
"step": 500
},
{
"entropy": 2.0017556965351107,
"epoch": 0.8521303258145363,
"grad_norm": 1.6036300659179688,
"learning_rate": 2.146884338674161e-05,
"loss": 1.5247,
"mean_token_accuracy": 0.6218138873577118,
"num_tokens": 1349542.0,
"step": 510
},
{
"entropy": 1.995826429128647,
"epoch": 0.8688387635756056,
"grad_norm": 1.979646921157837,
"learning_rate": 2.1071750146835906e-05,
"loss": 1.5882,
"mean_token_accuracy": 0.6175018042325974,
"num_tokens": 1375550.0,
"step": 520
},
{
"entropy": 1.918816116452217,
"epoch": 0.885547201336675,
"grad_norm": 1.531023383140564,
"learning_rate": 2.06695005214336e-05,
"loss": 1.5158,
"mean_token_accuracy": 0.6258822798728942,
"num_tokens": 1401535.0,
"step": 530
},
{
"entropy": 1.918640747666359,
"epoch": 0.9022556390977443,
"grad_norm": 1.6574758291244507,
"learning_rate": 2.026243611782818e-05,
"loss": 1.4805,
"mean_token_accuracy": 0.6370069451630116,
"num_tokens": 1430094.0,
"step": 540
},
{
"entropy": 2.0166410475969316,
"epoch": 0.9189640768588136,
"grad_norm": 2.085106372833252,
"learning_rate": 1.9850902632225228e-05,
"loss": 1.6067,
"mean_token_accuracy": 0.613443473726511,
"num_tokens": 1453767.0,
"step": 550
},
{
"entropy": 1.9751090347766875,
"epoch": 0.935672514619883,
"grad_norm": 1.5619549751281738,
"learning_rate": 1.9435249556162633e-05,
"loss": 1.5186,
"mean_token_accuracy": 0.6245452538132668,
"num_tokens": 1479122.0,
"step": 560
},
{
"entropy": 2.0289832055568695,
"epoch": 0.9523809523809523,
"grad_norm": 1.9765042066574097,
"learning_rate": 1.9015829879707705e-05,
"loss": 1.635,
"mean_token_accuracy": 0.6078214071691036,
"num_tokens": 1502393.0,
"step": 570
},
{
"entropy": 2.0707653611898422,
"epoch": 0.9690893901420217,
"grad_norm": 2.33292293548584,
"learning_rate": 1.8592999791683163e-05,
"loss": 1.7044,
"mean_token_accuracy": 0.5983508452773094,
"num_tokens": 1528688.0,
"step": 580
},
{
"entropy": 1.830039870738983,
"epoch": 0.985797827903091,
"grad_norm": 1.7089521884918213,
"learning_rate": 1.8167118377176626e-05,
"loss": 1.4488,
"mean_token_accuracy": 0.6382713422179223,
"num_tokens": 1559267.0,
"step": 590
},
{
"entropy": 2.093587511464169,
"epoch": 1.001670843776107,
"grad_norm": 1.8502051830291748,
"learning_rate": 1.7738547312590426e-05,
"loss": 1.6569,
"mean_token_accuracy": 0.6035293050502476,
"num_tokens": 1579777.0,
"step": 600
},
{
"entropy": 1.986822435259819,
"epoch": 1.0183792815371762,
"grad_norm": 1.8095533847808838,
"learning_rate": 1.7307650558490842e-05,
"loss": 1.5196,
"mean_token_accuracy": 0.624878978729248,
"num_tokens": 1603371.0,
"step": 610
},
{
"entropy": 1.8720842987298965,
"epoch": 1.0350877192982457,
"grad_norm": 1.418859601020813,
"learning_rate": 1.687479405051745e-05,
"loss": 1.4617,
"mean_token_accuracy": 0.6376385740935803,
"num_tokens": 1629671.0,
"step": 620
},
{
"entropy": 1.9420676857233048,
"epoch": 1.0517961570593148,
"grad_norm": 1.9264510869979858,
"learning_rate": 1.6440345388615225e-05,
"loss": 1.5184,
"mean_token_accuracy": 0.6275502189993858,
"num_tokens": 1654665.0,
"step": 630
},
{
"entropy": 1.877267986536026,
"epoch": 1.0685045948203844,
"grad_norm": 1.5752804279327393,
"learning_rate": 1.6004673524853184e-05,
"loss": 1.4846,
"mean_token_accuracy": 0.6406380534172058,
"num_tokens": 1681208.0,
"step": 640
},
{
"entropy": 1.7192733228206634,
"epoch": 1.0852130325814535,
"grad_norm": 1.8777337074279785,
"learning_rate": 1.5568148450094788e-05,
"loss": 1.3596,
"mean_token_accuracy": 0.6518337815999985,
"num_tokens": 1711943.0,
"step": 650
},
{
"entropy": 1.8170194894075393,
"epoch": 1.101921470342523,
"grad_norm": 1.324455738067627,
"learning_rate": 1.5131140879786162e-05,
"loss": 1.3627,
"mean_token_accuracy": 0.6482401996850967,
"num_tokens": 1744406.0,
"step": 660
},
{
"entropy": 1.9752755105495452,
"epoch": 1.1186299081035922,
"grad_norm": 1.697969913482666,
"learning_rate": 1.4694021939128925e-05,
"loss": 1.6435,
"mean_token_accuracy": 0.6173132970929146,
"num_tokens": 1763684.0,
"step": 670
},
{
"entropy": 1.810714191198349,
"epoch": 1.1353383458646618,
"grad_norm": 1.6863676309585571,
"learning_rate": 1.4257162847905073e-05,
"loss": 1.4572,
"mean_token_accuracy": 0.6450446091592312,
"num_tokens": 1790387.0,
"step": 680
},
{
"entropy": 1.7295511305332183,
"epoch": 1.1520467836257309,
"grad_norm": 1.8147404193878174,
"learning_rate": 1.3820934605221554e-05,
"loss": 1.4375,
"mean_token_accuracy": 0.656483718752861,
"num_tokens": 1818936.0,
"step": 690
},
{
"entropy": 1.876406043767929,
"epoch": 1.1687552213868004,
"grad_norm": 1.4509400129318237,
"learning_rate": 1.3385707674442212e-05,
"loss": 1.5038,
"mean_token_accuracy": 0.6330045036971569,
"num_tokens": 1844285.0,
"step": 700
},
{
"entropy": 1.7754180505871773,
"epoch": 1.1854636591478696,
"grad_norm": 1.8553998470306396,
"learning_rate": 1.295185166857471e-05,
"loss": 1.3342,
"mean_token_accuracy": 0.6631456315517426,
"num_tokens": 1872386.0,
"step": 710
},
{
"entropy": 1.8854310810565948,
"epoch": 1.2021720969089391,
"grad_norm": 1.9020118713378906,
"learning_rate": 1.2519735036379654e-05,
"loss": 1.5075,
"mean_token_accuracy": 0.6296862445771694,
"num_tokens": 1898136.0,
"step": 720
},
{
"entropy": 1.8075116664171218,
"epoch": 1.2188805346700082,
"grad_norm": 1.4228155612945557,
"learning_rate": 1.2089724749468383e-05,
"loss": 1.475,
"mean_token_accuracy": 0.63388437256217,
"num_tokens": 1926755.0,
"step": 730
},
{
"entropy": 1.9176389634609223,
"epoch": 1.2355889724310778,
"grad_norm": 1.4030358791351318,
"learning_rate": 1.1662185990655285e-05,
"loss": 1.4261,
"mean_token_accuracy": 0.6366409629583358,
"num_tokens": 1954094.0,
"step": 740
},
{
"entropy": 1.9318130016326904,
"epoch": 1.252297410192147,
"grad_norm": 2.5528788566589355,
"learning_rate": 1.1237481843829162e-05,
"loss": 1.637,
"mean_token_accuracy": 0.6269572585821152,
"num_tokens": 1977028.0,
"step": 750
},
{
"entropy": 1.8894671678543091,
"epoch": 1.2690058479532165,
"grad_norm": 1.5508147478103638,
"learning_rate": 1.0815972985607149e-05,
"loss": 1.5153,
"mean_token_accuracy": 0.6365115389227867,
"num_tokens": 2000096.0,
"step": 760
},
{
"entropy": 1.9068289607763291,
"epoch": 1.2857142857142856,
"grad_norm": 1.2476104497909546,
"learning_rate": 1.0398017379032955e-05,
"loss": 1.5797,
"mean_token_accuracy": 0.6310160294175148,
"num_tokens": 2024669.0,
"step": 770
},
{
"entropy": 1.7813611298799514,
"epoch": 1.3024227234753551,
"grad_norm": 2.0824921131134033,
"learning_rate": 9.983969969579616e-06,
"loss": 1.4656,
"mean_token_accuracy": 0.6474256098270417,
"num_tokens": 2051877.0,
"step": 780
},
{
"entropy": 1.8404263019561768,
"epoch": 1.3191311612364243,
"grad_norm": 1.2777612209320068,
"learning_rate": 9.574182383714839e-06,
"loss": 1.4355,
"mean_token_accuracy": 0.6538358055055141,
"num_tokens": 2079905.0,
"step": 790
},
{
"entropy": 1.8420085251331328,
"epoch": 1.3358395989974938,
"grad_norm": 1.2849156856536865,
"learning_rate": 9.169002630285063e-06,
"loss": 1.4413,
"mean_token_accuracy": 0.6362872712314129,
"num_tokens": 2109705.0,
"step": 800
},
{
"entropy": 1.9093056112527846,
"epoch": 1.352548036758563,
"grad_norm": 1.3787521123886108,
"learning_rate": 8.768774804971705e-06,
"loss": 1.4332,
"mean_token_accuracy": 0.646092452853918,
"num_tokens": 2136002.0,
"step": 810
},
{
"entropy": 1.8188541814684869,
"epoch": 1.3692564745196325,
"grad_norm": 1.9946447610855103,
"learning_rate": 8.37383879807061e-06,
"loss": 1.4213,
"mean_token_accuracy": 0.6514127373695373,
"num_tokens": 2162939.0,
"step": 820
},
{
"entropy": 1.767152488231659,
"epoch": 1.3859649122807016,
"grad_norm": 2.6128296852111816,
"learning_rate": 7.98453000584296e-06,
"loss": 1.4077,
"mean_token_accuracy": 0.6433039925992489,
"num_tokens": 2192868.0,
"step": 830
},
{
"entropy": 1.8834378957748412,
"epoch": 1.4026733500417712,
"grad_norm": 2.5565028190612793,
"learning_rate": 7.601179045682659e-06,
"loss": 1.4508,
"mean_token_accuracy": 0.6357239708304405,
"num_tokens": 2220574.0,
"step": 840
},
{
"entropy": 1.8659739628434182,
"epoch": 1.4193817878028403,
"grad_norm": 1.6156526803970337,
"learning_rate": 7.2241114753421245e-06,
"loss": 1.4151,
"mean_token_accuracy": 0.6440570905804635,
"num_tokens": 2247980.0,
"step": 850
},
{
"entropy": 2.0131053000688555,
"epoch": 1.4360902255639099,
"grad_norm": 1.5411407947540283,
"learning_rate": 6.853647516454978e-06,
"loss": 1.7022,
"mean_token_accuracy": 0.6108858585357666,
"num_tokens": 2271288.0,
"step": 860
},
{
"entropy": 1.9804413557052611,
"epoch": 1.452798663324979,
"grad_norm": 1.2808128595352173,
"learning_rate": 6.490101782590357e-06,
"loss": 1.502,
"mean_token_accuracy": 0.6262324623763561,
"num_tokens": 2297295.0,
"step": 870
},
{
"entropy": 1.7805092990398408,
"epoch": 1.4695071010860485,
"grad_norm": 1.404811978340149,
"learning_rate": 6.133783012069853e-06,
"loss": 1.3104,
"mean_token_accuracy": 0.6503325693309308,
"num_tokens": 2325314.0,
"step": 880
},
{
"entropy": 1.8354784041643142,
"epoch": 1.4862155388471177,
"grad_norm": 1.467010736465454,
"learning_rate": 5.784993805773946e-06,
"loss": 1.4201,
"mean_token_accuracy": 0.6422530055046082,
"num_tokens": 2353565.0,
"step": 890
},
{
"entropy": 1.9021347552537917,
"epoch": 1.5029239766081872,
"grad_norm": 1.5276216268539429,
"learning_rate": 5.44403037016061e-06,
"loss": 1.5201,
"mean_token_accuracy": 0.6460477262735367,
"num_tokens": 2375717.0,
"step": 900
},
{
"entropy": 1.8378847867250443,
"epoch": 1.5196324143692563,
"grad_norm": 1.413011908531189,
"learning_rate": 5.11118226571434e-06,
"loss": 1.4236,
"mean_token_accuracy": 0.6440291911363601,
"num_tokens": 2404483.0,
"step": 910
},
{
"entropy": 1.8757817566394805,
"epoch": 1.536340852130326,
"grad_norm": 1.5251425504684448,
"learning_rate": 4.786732161039234e-06,
"loss": 1.5295,
"mean_token_accuracy": 0.6261114202439785,
"num_tokens": 2431919.0,
"step": 920
},
{
"entropy": 1.8587533831596375,
"epoch": 1.553049289891395,
"grad_norm": 1.647172451019287,
"learning_rate": 4.470955592804871e-06,
"loss": 1.3916,
"mean_token_accuracy": 0.6414183430373669,
"num_tokens": 2461737.0,
"step": 930
},
{
"entropy": 1.8832595944404602,
"epoch": 1.5697577276524646,
"grad_norm": 2.1904146671295166,
"learning_rate": 4.164120731749072e-06,
"loss": 1.5253,
"mean_token_accuracy": 0.63388307467103,
"num_tokens": 2484053.0,
"step": 940
},
{
"entropy": 1.9385288596153258,
"epoch": 1.5864661654135337,
"grad_norm": 1.4704978466033936,
"learning_rate": 3.866488154935951e-06,
"loss": 1.544,
"mean_token_accuracy": 0.6218141697347164,
"num_tokens": 2509336.0,
"step": 950
},
{
"entropy": 1.92947296500206,
"epoch": 1.6031746031746033,
"grad_norm": 2.599453926086426,
"learning_rate": 3.5783106244629837e-06,
"loss": 1.4998,
"mean_token_accuracy": 0.6333407089114189,
"num_tokens": 2534725.0,
"step": 960
},
{
"entropy": 1.7914826542139053,
"epoch": 1.6198830409356724,
"grad_norm": 1.3860251903533936,
"learning_rate": 3.299832872804765e-06,
"loss": 1.4052,
"mean_token_accuracy": 0.6594420954585075,
"num_tokens": 2561604.0,
"step": 970
},
{
"entropy": 1.9076363742351532,
"epoch": 1.636591478696742,
"grad_norm": 1.7287890911102295,
"learning_rate": 3.0312913949759196e-06,
"loss": 1.4772,
"mean_token_accuracy": 0.6380034938454628,
"num_tokens": 2586962.0,
"step": 980
},
{
"entropy": 1.897977489233017,
"epoch": 1.653299916457811,
"grad_norm": 1.6165651082992554,
"learning_rate": 2.77291424768959e-06,
"loss": 1.4259,
"mean_token_accuracy": 0.6349127419292927,
"num_tokens": 2614128.0,
"step": 990
},
{
"entropy": 1.9598186224699021,
"epoch": 1.6700083542188806,
"grad_norm": 2.7245564460754395,
"learning_rate": 2.5249208556820908e-06,
"loss": 1.5447,
"mean_token_accuracy": 0.6180414237082005,
"num_tokens": 2638378.0,
"step": 1000
},
{
"entropy": 1.8438974112272262,
"epoch": 1.6867167919799497,
"grad_norm": 1.519782543182373,
"learning_rate": 2.287521825368189e-06,
"loss": 1.4452,
"mean_token_accuracy": 0.6378182969987393,
"num_tokens": 2667006.0,
"step": 1010
},
{
"entropy": 1.8624888569116593,
"epoch": 1.7034252297410193,
"grad_norm": 1.3807040452957153,
"learning_rate": 2.060918765985288e-06,
"loss": 1.4039,
"mean_token_accuracy": 0.6466488510370254,
"num_tokens": 2694457.0,
"step": 1020
},
{
"entropy": 1.752406159043312,
"epoch": 1.7201336675020884,
"grad_norm": 1.9477757215499878,
"learning_rate": 1.8453041183783953e-06,
"loss": 1.3724,
"mean_token_accuracy": 0.6649182379245758,
"num_tokens": 2721514.0,
"step": 1030
},
{
"entropy": 1.867113146185875,
"epoch": 1.736842105263158,
"grad_norm": 1.5146008729934692,
"learning_rate": 1.6408609915712397e-06,
"loss": 1.4912,
"mean_token_accuracy": 0.6285726055502892,
"num_tokens": 2749941.0,
"step": 1040
},
{
"entropy": 1.881867691874504,
"epoch": 1.753550543024227,
"grad_norm": 1.1970162391662598,
"learning_rate": 1.447763007262431e-06,
"loss": 1.4274,
"mean_token_accuracy": 0.6479133352637291,
"num_tokens": 2774956.0,
"step": 1050
},
{
"entropy": 1.8124820232391357,
"epoch": 1.7702589807852966,
"grad_norm": 1.135063886642456,
"learning_rate": 1.2661741523785637e-06,
"loss": 1.3427,
"mean_token_accuracy": 0.6474431194365025,
"num_tokens": 2805317.0,
"step": 1060
},
{
"entropy": 1.9482445955276488,
"epoch": 1.7869674185463658,
"grad_norm": 1.608454942703247,
"learning_rate": 1.0962486398096755e-06,
"loss": 1.5699,
"mean_token_accuracy": 0.6215325608849526,
"num_tokens": 2832036.0,
"step": 1070
},
{
"entropy": 1.945644235610962,
"epoch": 1.8036758563074353,
"grad_norm": 2.766500949859619,
"learning_rate": 9.381307774451348e-07,
"loss": 1.5286,
"mean_token_accuracy": 0.6369006544351578,
"num_tokens": 2855743.0,
"step": 1080
},
{
"entropy": 1.8863470137119294,
"epoch": 1.8203842940685044,
"grad_norm": 1.3567376136779785,
"learning_rate": 7.919548456213516e-07,
"loss": 1.5037,
"mean_token_accuracy": 0.6256943866610527,
"num_tokens": 2881629.0,
"step": 1090
},
{
"entropy": 1.7123635321855546,
"epoch": 1.837092731829574,
"grad_norm": 1.452407956123352,
"learning_rate": 6.578449830852279e-07,
"loss": 1.4047,
"mean_token_accuracy": 0.65928635597229,
"num_tokens": 2908678.0,
"step": 1100
},
{
"entropy": 1.8440750926733016,
"epoch": 1.8538011695906431,
"grad_norm": 1.331641435623169,
"learning_rate": 5.359150815703629e-07,
"loss": 1.4408,
"mean_token_accuracy": 0.6417556688189506,
"num_tokens": 2937408.0,
"step": 1110
},
{
"entropy": 1.919699600338936,
"epoch": 1.8705096073517127,
"grad_norm": 1.5076831579208374,
"learning_rate": 4.2626868907536034e-07,
"loss": 1.4585,
"mean_token_accuracy": 0.6399278536438942,
"num_tokens": 2962447.0,
"step": 1120
},
{
"entropy": 1.8350076168775558,
"epoch": 1.8872180451127818,
"grad_norm": 1.707216739654541,
"learning_rate": 3.289989219265466e-07,
"loss": 1.4885,
"mean_token_accuracy": 0.6428370043635369,
"num_tokens": 2987461.0,
"step": 1130
},
{
"entropy": 1.8436865717172624,
"epoch": 1.9039264828738514,
"grad_norm": 2.3200364112854004,
"learning_rate": 2.441883856996513e-07,
"loss": 1.5211,
"mean_token_accuracy": 0.640003713965416,
"num_tokens": 3011483.0,
"step": 1140
},
{
"entropy": 1.8367240995168685,
"epoch": 1.9206349206349205,
"grad_norm": 1.7375178337097168,
"learning_rate": 1.719091050676902e-07,
"loss": 1.4348,
"mean_token_accuracy": 0.644163503497839,
"num_tokens": 3038564.0,
"step": 1150
},
{
"entropy": 1.9745660215616225,
"epoch": 1.93734335839599,
"grad_norm": 1.5831917524337769,
"learning_rate": 1.1222246263458469e-07,
"loss": 1.6556,
"mean_token_accuracy": 0.6158465214073658,
"num_tokens": 3059313.0,
"step": 1160
},
{
"entropy": 1.8914725601673126,
"epoch": 1.9540517961570592,
"grad_norm": 1.6992645263671875,
"learning_rate": 6.517914680646386e-08,
"loss": 1.39,
"mean_token_accuracy": 0.6418868929147721,
"num_tokens": 3084800.0,
"step": 1170
},
{
"entropy": 1.8038861930370331,
"epoch": 1.9707602339181287,
"grad_norm": 1.4963058233261108,
"learning_rate": 3.081910874495497e-08,
"loss": 1.3698,
"mean_token_accuracy": 0.6503725171089172,
"num_tokens": 3113883.0,
"step": 1180
},
{
"entropy": 1.8201948583126069,
"epoch": 1.9874686716791978,
"grad_norm": 2.040982723236084,
"learning_rate": 9.171528438959852e-09,
"loss": 1.4156,
"mean_token_accuracy": 0.6591305956244469,
"num_tokens": 3138912.0,
"step": 1190
}
],
"logging_steps": 10,
"max_steps": 1198,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4514045996918374e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}