| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 1198, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 2.376227322220802, | |
| "epoch": 0.01670843776106934, | |
| "grad_norm": 23.182296752929688, | |
| "learning_rate": 1.5e-06, | |
| "loss": 2.4729, | |
| "mean_token_accuracy": 0.5330174341797829, | |
| "num_tokens": 23968.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 2.4090343087911608, | |
| "epoch": 0.03341687552213868, | |
| "grad_norm": 16.90180778503418, | |
| "learning_rate": 4e-06, | |
| "loss": 2.2456, | |
| "mean_token_accuracy": 0.546089893579483, | |
| "num_tokens": 50061.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 2.3292118519544602, | |
| "epoch": 0.05012531328320802, | |
| "grad_norm": 7.097179889678955, | |
| "learning_rate": 6.5000000000000004e-06, | |
| "loss": 2.0194, | |
| "mean_token_accuracy": 0.5704705387353897, | |
| "num_tokens": 77221.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 2.38493994474411, | |
| "epoch": 0.06683375104427736, | |
| "grad_norm": 3.9334213733673096, | |
| "learning_rate": 9e-06, | |
| "loss": 2.091, | |
| "mean_token_accuracy": 0.5746636532247067, | |
| "num_tokens": 102592.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 2.4236884146928785, | |
| "epoch": 0.0835421888053467, | |
| "grad_norm": 2.742833375930786, | |
| "learning_rate": 1.1500000000000002e-05, | |
| "loss": 1.9327, | |
| "mean_token_accuracy": 0.5654355354607106, | |
| "num_tokens": 130876.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.4756928414106367, | |
| "epoch": 0.10025062656641603, | |
| "grad_norm": 4.295400142669678, | |
| "learning_rate": 1.4e-05, | |
| "loss": 2.0381, | |
| "mean_token_accuracy": 0.5688402287662029, | |
| "num_tokens": 155526.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 2.422525143623352, | |
| "epoch": 0.11695906432748537, | |
| "grad_norm": 37.36378479003906, | |
| "learning_rate": 1.65e-05, | |
| "loss": 1.9249, | |
| "mean_token_accuracy": 0.5765560433268547, | |
| "num_tokens": 181754.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 2.3750332206487657, | |
| "epoch": 0.1336675020885547, | |
| "grad_norm": 53.244632720947266, | |
| "learning_rate": 1.9e-05, | |
| "loss": 1.7852, | |
| "mean_token_accuracy": 0.5806652165949344, | |
| "num_tokens": 208156.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 2.1606528967618943, | |
| "epoch": 0.15037593984962405, | |
| "grad_norm": 1.9780453443527222, | |
| "learning_rate": 2.15e-05, | |
| "loss": 1.74, | |
| "mean_token_accuracy": 0.595666554570198, | |
| "num_tokens": 236321.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 2.1004574805498124, | |
| "epoch": 0.1670843776106934, | |
| "grad_norm": 2.5225274562835693, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.7544, | |
| "mean_token_accuracy": 0.5997186966240406, | |
| "num_tokens": 263316.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 2.0930860340595245, | |
| "epoch": 0.18379281537176273, | |
| "grad_norm": 2.0412206649780273, | |
| "learning_rate": 2.65e-05, | |
| "loss": 1.6994, | |
| "mean_token_accuracy": 0.6029496222734452, | |
| "num_tokens": 289555.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 2.0005944788455965, | |
| "epoch": 0.20050125313283207, | |
| "grad_norm": 5.798661231994629, | |
| "learning_rate": 2.9e-05, | |
| "loss": 1.5928, | |
| "mean_token_accuracy": 0.6268587596714497, | |
| "num_tokens": 322035.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 2.1824300229549407, | |
| "epoch": 0.2172096908939014, | |
| "grad_norm": 2.135401964187622, | |
| "learning_rate": 2.999770694261986e-05, | |
| "loss": 1.7705, | |
| "mean_token_accuracy": 0.5905720897018909, | |
| "num_tokens": 343574.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 2.317758610844612, | |
| "epoch": 0.23391812865497075, | |
| "grad_norm": 4.436051368713379, | |
| "learning_rate": 2.9983696353001538e-05, | |
| "loss": 1.8244, | |
| "mean_token_accuracy": 0.578718575835228, | |
| "num_tokens": 367065.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 2.0938976377248766, | |
| "epoch": 0.2506265664160401, | |
| "grad_norm": 1.6219489574432373, | |
| "learning_rate": 2.995696097848732e-05, | |
| "loss": 1.6849, | |
| "mean_token_accuracy": 0.6061625145375729, | |
| "num_tokens": 395337.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 2.05442219376564, | |
| "epoch": 0.2673350041771094, | |
| "grad_norm": 1.6669152975082397, | |
| "learning_rate": 2.9917523523881268e-05, | |
| "loss": 1.7082, | |
| "mean_token_accuracy": 0.6042715333402157, | |
| "num_tokens": 420868.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 2.032812547683716, | |
| "epoch": 0.28404344193817876, | |
| "grad_norm": 1.9019651412963867, | |
| "learning_rate": 2.9865417481127877e-05, | |
| "loss": 1.546, | |
| "mean_token_accuracy": 0.6138139866292477, | |
| "num_tokens": 451944.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 2.0105547934770582, | |
| "epoch": 0.3007518796992481, | |
| "grad_norm": 1.7808424234390259, | |
| "learning_rate": 2.9800687100869334e-05, | |
| "loss": 1.5909, | |
| "mean_token_accuracy": 0.6109806254506112, | |
| "num_tokens": 482345.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 2.0666785687208176, | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 1.7441576719284058, | |
| "learning_rate": 2.972338735486598e-05, | |
| "loss": 1.6263, | |
| "mean_token_accuracy": 0.6088879898190498, | |
| "num_tokens": 508164.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 2.0136130273342134, | |
| "epoch": 0.3341687552213868, | |
| "grad_norm": 1.8193825483322144, | |
| "learning_rate": 2.9633583889312e-05, | |
| "loss": 1.6149, | |
| "mean_token_accuracy": 0.619242499768734, | |
| "num_tokens": 536345.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 2.0101848542690277, | |
| "epoch": 0.3508771929824561, | |
| "grad_norm": 1.11763596534729, | |
| "learning_rate": 2.9531352969085914e-05, | |
| "loss": 1.5869, | |
| "mean_token_accuracy": 0.6176401123404502, | |
| "num_tokens": 563157.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.9853627145290376, | |
| "epoch": 0.36758563074352546, | |
| "grad_norm": 2.025529384613037, | |
| "learning_rate": 2.9416781412983223e-05, | |
| "loss": 1.6008, | |
| "mean_token_accuracy": 0.6206148102879524, | |
| "num_tokens": 590929.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 2.108823761343956, | |
| "epoch": 0.3842940685045948, | |
| "grad_norm": 1.4173170328140259, | |
| "learning_rate": 2.9289966519986258e-05, | |
| "loss": 1.6553, | |
| "mean_token_accuracy": 0.598318774998188, | |
| "num_tokens": 615814.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 2.0000301033258436, | |
| "epoch": 0.40100250626566414, | |
| "grad_norm": 2.162895679473877, | |
| "learning_rate": 2.9151015986633762e-05, | |
| "loss": 1.6305, | |
| "mean_token_accuracy": 0.6174192741513252, | |
| "num_tokens": 643303.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 2.0013814836740496, | |
| "epoch": 0.4177109440267335, | |
| "grad_norm": 3.5182316303253174, | |
| "learning_rate": 2.90000478155605e-05, | |
| "loss": 1.5884, | |
| "mean_token_accuracy": 0.615642835944891, | |
| "num_tokens": 670279.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 2.061664643883705, | |
| "epoch": 0.4344193817878028, | |
| "grad_norm": 2.530327558517456, | |
| "learning_rate": 2.883719021528444e-05, | |
| "loss": 1.7341, | |
| "mean_token_accuracy": 0.6077517293393612, | |
| "num_tokens": 695006.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 2.0266662567853926, | |
| "epoch": 0.45112781954887216, | |
| "grad_norm": 2.2248501777648926, | |
| "learning_rate": 2.8662581491326686e-05, | |
| "loss": 1.5991, | |
| "mean_token_accuracy": 0.6176492936909199, | |
| "num_tokens": 720926.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.998877900838852, | |
| "epoch": 0.4678362573099415, | |
| "grad_norm": 1.6995679140090942, | |
| "learning_rate": 2.847636992875661e-05, | |
| "loss": 1.5343, | |
| "mean_token_accuracy": 0.6100999519228936, | |
| "num_tokens": 753552.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 2.0063836723566055, | |
| "epoch": 0.48454469507101083, | |
| "grad_norm": 1.5225791931152344, | |
| "learning_rate": 2.827871366626197e-05, | |
| "loss": 1.6512, | |
| "mean_token_accuracy": 0.6144520066678524, | |
| "num_tokens": 776650.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.9781533569097518, | |
| "epoch": 0.5012531328320802, | |
| "grad_norm": 1.590501070022583, | |
| "learning_rate": 2.806978056185084e-05, | |
| "loss": 1.6408, | |
| "mean_token_accuracy": 0.6150619685649872, | |
| "num_tokens": 801816.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.9714119344949723, | |
| "epoch": 0.5179615705931495, | |
| "grad_norm": 2.535109519958496, | |
| "learning_rate": 2.7849748050299578e-05, | |
| "loss": 1.603, | |
| "mean_token_accuracy": 0.6225824594497681, | |
| "num_tokens": 826564.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.94197179377079, | |
| "epoch": 0.5346700083542189, | |
| "grad_norm": 1.5610677003860474, | |
| "learning_rate": 2.7618802992467718e-05, | |
| "loss": 1.5703, | |
| "mean_token_accuracy": 0.6192868888378144, | |
| "num_tokens": 855007.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.9983945518732071, | |
| "epoch": 0.5513784461152882, | |
| "grad_norm": 1.7279447317123413, | |
| "learning_rate": 2.7377141516607896e-05, | |
| "loss": 1.5837, | |
| "mean_token_accuracy": 0.6141151934862137, | |
| "num_tokens": 881616.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.8347389072179794, | |
| "epoch": 0.5680868838763575, | |
| "grad_norm": 1.3221409320831299, | |
| "learning_rate": 2.712496885180547e-05, | |
| "loss": 1.4675, | |
| "mean_token_accuracy": 0.6429531842470169, | |
| "num_tokens": 913148.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.8934204697608947, | |
| "epoch": 0.5847953216374269, | |
| "grad_norm": 1.4465378522872925, | |
| "learning_rate": 2.686249915368938e-05, | |
| "loss": 1.4669, | |
| "mean_token_accuracy": 0.6286376528441906, | |
| "num_tokens": 944937.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.9060048371553422, | |
| "epoch": 0.6015037593984962, | |
| "grad_norm": 1.6246923208236694, | |
| "learning_rate": 2.658995532256217e-05, | |
| "loss": 1.5736, | |
| "mean_token_accuracy": 0.622390191257, | |
| "num_tokens": 974558.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 2.025865262746811, | |
| "epoch": 0.6182121971595655, | |
| "grad_norm": 1.2998696565628052, | |
| "learning_rate": 2.6307568814103658e-05, | |
| "loss": 1.6277, | |
| "mean_token_accuracy": 0.6089055955410003, | |
| "num_tokens": 1001232.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.9043485343456268, | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 1.7911604642868042, | |
| "learning_rate": 2.6015579442809094e-05, | |
| "loss": 1.5542, | |
| "mean_token_accuracy": 0.6255838416516781, | |
| "num_tokens": 1029679.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 2.0594927102327345, | |
| "epoch": 0.6516290726817042, | |
| "grad_norm": 1.6075160503387451, | |
| "learning_rate": 2.5714235178328554e-05, | |
| "loss": 1.664, | |
| "mean_token_accuracy": 0.608974926173687, | |
| "num_tokens": 1053040.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 2.034518975019455, | |
| "epoch": 0.6683375104427736, | |
| "grad_norm": 3.4272360801696777, | |
| "learning_rate": 2.540379193488072e-05, | |
| "loss": 1.7209, | |
| "mean_token_accuracy": 0.6031469151377677, | |
| "num_tokens": 1075141.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 2.0515862941741942, | |
| "epoch": 0.6850459482038429, | |
| "grad_norm": 1.2661142349243164, | |
| "learning_rate": 2.5084513353919753e-05, | |
| "loss": 1.6599, | |
| "mean_token_accuracy": 0.6064547099173069, | |
| "num_tokens": 1097768.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.9067467480897904, | |
| "epoch": 0.7017543859649122, | |
| "grad_norm": 2.2614035606384277, | |
| "learning_rate": 2.4756670580239894e-05, | |
| "loss": 1.5424, | |
| "mean_token_accuracy": 0.6350538037717343, | |
| "num_tokens": 1125706.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 2.103572541475296, | |
| "epoch": 0.7184628237259816, | |
| "grad_norm": 2.4501664638519287, | |
| "learning_rate": 2.4420542031707905e-05, | |
| "loss": 1.703, | |
| "mean_token_accuracy": 0.6036252595484257, | |
| "num_tokens": 1147357.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 2.072985142469406, | |
| "epoch": 0.7351712614870509, | |
| "grad_norm": 1.8719037771224976, | |
| "learning_rate": 2.4076413162818904e-05, | |
| "loss": 1.6609, | |
| "mean_token_accuracy": 0.606941219419241, | |
| "num_tokens": 1170179.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 2.022436353564262, | |
| "epoch": 0.7518796992481203, | |
| "grad_norm": 2.076544761657715, | |
| "learning_rate": 2.3724576222276405e-05, | |
| "loss": 1.7007, | |
| "mean_token_accuracy": 0.6098736897110939, | |
| "num_tokens": 1196284.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 2.1311296701431273, | |
| "epoch": 0.7685881370091896, | |
| "grad_norm": 2.9471235275268555, | |
| "learning_rate": 2.3365330004802443e-05, | |
| "loss": 1.7893, | |
| "mean_token_accuracy": 0.5961334474384785, | |
| "num_tokens": 1217358.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.9371352732181548, | |
| "epoch": 0.7852965747702589, | |
| "grad_norm": 1.4203218221664429, | |
| "learning_rate": 2.2998979597388526e-05, | |
| "loss": 1.608, | |
| "mean_token_accuracy": 0.6184874981641769, | |
| "num_tokens": 1243374.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 2.0747445046901705, | |
| "epoch": 0.8020050125313283, | |
| "grad_norm": 2.2124152183532715, | |
| "learning_rate": 2.262583612020294e-05, | |
| "loss": 1.6711, | |
| "mean_token_accuracy": 0.6062519766390324, | |
| "num_tokens": 1267314.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.7878674179315568, | |
| "epoch": 0.8187134502923976, | |
| "grad_norm": 1.3049612045288086, | |
| "learning_rate": 2.224621646237442e-05, | |
| "loss": 1.4722, | |
| "mean_token_accuracy": 0.6389728732407093, | |
| "num_tokens": 1296917.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.925000047683716, | |
| "epoch": 0.835421888053467, | |
| "grad_norm": 1.4271504878997803, | |
| "learning_rate": 2.1860443012876574e-05, | |
| "loss": 1.6114, | |
| "mean_token_accuracy": 0.616053618490696, | |
| "num_tokens": 1322826.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 2.0017556965351107, | |
| "epoch": 0.8521303258145363, | |
| "grad_norm": 1.6036300659179688, | |
| "learning_rate": 2.146884338674161e-05, | |
| "loss": 1.5247, | |
| "mean_token_accuracy": 0.6218138873577118, | |
| "num_tokens": 1349542.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.995826429128647, | |
| "epoch": 0.8688387635756056, | |
| "grad_norm": 1.979646921157837, | |
| "learning_rate": 2.1071750146835906e-05, | |
| "loss": 1.5882, | |
| "mean_token_accuracy": 0.6175018042325974, | |
| "num_tokens": 1375550.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.918816116452217, | |
| "epoch": 0.885547201336675, | |
| "grad_norm": 1.531023383140564, | |
| "learning_rate": 2.06695005214336e-05, | |
| "loss": 1.5158, | |
| "mean_token_accuracy": 0.6258822798728942, | |
| "num_tokens": 1401535.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.918640747666359, | |
| "epoch": 0.9022556390977443, | |
| "grad_norm": 1.6574758291244507, | |
| "learning_rate": 2.026243611782818e-05, | |
| "loss": 1.4805, | |
| "mean_token_accuracy": 0.6370069451630116, | |
| "num_tokens": 1430094.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 2.0166410475969316, | |
| "epoch": 0.9189640768588136, | |
| "grad_norm": 2.085106372833252, | |
| "learning_rate": 1.9850902632225228e-05, | |
| "loss": 1.6067, | |
| "mean_token_accuracy": 0.613443473726511, | |
| "num_tokens": 1453767.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.9751090347766875, | |
| "epoch": 0.935672514619883, | |
| "grad_norm": 1.5619549751281738, | |
| "learning_rate": 1.9435249556162633e-05, | |
| "loss": 1.5186, | |
| "mean_token_accuracy": 0.6245452538132668, | |
| "num_tokens": 1479122.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 2.0289832055568695, | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 1.9765042066574097, | |
| "learning_rate": 1.9015829879707705e-05, | |
| "loss": 1.635, | |
| "mean_token_accuracy": 0.6078214071691036, | |
| "num_tokens": 1502393.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 2.0707653611898422, | |
| "epoch": 0.9690893901420217, | |
| "grad_norm": 2.33292293548584, | |
| "learning_rate": 1.8592999791683163e-05, | |
| "loss": 1.7044, | |
| "mean_token_accuracy": 0.5983508452773094, | |
| "num_tokens": 1528688.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.830039870738983, | |
| "epoch": 0.985797827903091, | |
| "grad_norm": 1.7089521884918213, | |
| "learning_rate": 1.8167118377176626e-05, | |
| "loss": 1.4488, | |
| "mean_token_accuracy": 0.6382713422179223, | |
| "num_tokens": 1559267.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 2.093587511464169, | |
| "epoch": 1.001670843776107, | |
| "grad_norm": 1.8502051830291748, | |
| "learning_rate": 1.7738547312590426e-05, | |
| "loss": 1.6569, | |
| "mean_token_accuracy": 0.6035293050502476, | |
| "num_tokens": 1579777.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.986822435259819, | |
| "epoch": 1.0183792815371762, | |
| "grad_norm": 1.8095533847808838, | |
| "learning_rate": 1.7307650558490842e-05, | |
| "loss": 1.5196, | |
| "mean_token_accuracy": 0.624878978729248, | |
| "num_tokens": 1603371.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.8720842987298965, | |
| "epoch": 1.0350877192982457, | |
| "grad_norm": 1.418859601020813, | |
| "learning_rate": 1.687479405051745e-05, | |
| "loss": 1.4617, | |
| "mean_token_accuracy": 0.6376385740935803, | |
| "num_tokens": 1629671.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.9420676857233048, | |
| "epoch": 1.0517961570593148, | |
| "grad_norm": 1.9264510869979858, | |
| "learning_rate": 1.6440345388615225e-05, | |
| "loss": 1.5184, | |
| "mean_token_accuracy": 0.6275502189993858, | |
| "num_tokens": 1654665.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.877267986536026, | |
| "epoch": 1.0685045948203844, | |
| "grad_norm": 1.5752804279327393, | |
| "learning_rate": 1.6004673524853184e-05, | |
| "loss": 1.4846, | |
| "mean_token_accuracy": 0.6406380534172058, | |
| "num_tokens": 1681208.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.7192733228206634, | |
| "epoch": 1.0852130325814535, | |
| "grad_norm": 1.8777337074279785, | |
| "learning_rate": 1.5568148450094788e-05, | |
| "loss": 1.3596, | |
| "mean_token_accuracy": 0.6518337815999985, | |
| "num_tokens": 1711943.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.8170194894075393, | |
| "epoch": 1.101921470342523, | |
| "grad_norm": 1.324455738067627, | |
| "learning_rate": 1.5131140879786162e-05, | |
| "loss": 1.3627, | |
| "mean_token_accuracy": 0.6482401996850967, | |
| "num_tokens": 1744406.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.9752755105495452, | |
| "epoch": 1.1186299081035922, | |
| "grad_norm": 1.697969913482666, | |
| "learning_rate": 1.4694021939128925e-05, | |
| "loss": 1.6435, | |
| "mean_token_accuracy": 0.6173132970929146, | |
| "num_tokens": 1763684.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.810714191198349, | |
| "epoch": 1.1353383458646618, | |
| "grad_norm": 1.6863676309585571, | |
| "learning_rate": 1.4257162847905073e-05, | |
| "loss": 1.4572, | |
| "mean_token_accuracy": 0.6450446091592312, | |
| "num_tokens": 1790387.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.7295511305332183, | |
| "epoch": 1.1520467836257309, | |
| "grad_norm": 1.8147404193878174, | |
| "learning_rate": 1.3820934605221554e-05, | |
| "loss": 1.4375, | |
| "mean_token_accuracy": 0.656483718752861, | |
| "num_tokens": 1818936.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.876406043767929, | |
| "epoch": 1.1687552213868004, | |
| "grad_norm": 1.4509400129318237, | |
| "learning_rate": 1.3385707674442212e-05, | |
| "loss": 1.5038, | |
| "mean_token_accuracy": 0.6330045036971569, | |
| "num_tokens": 1844285.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.7754180505871773, | |
| "epoch": 1.1854636591478696, | |
| "grad_norm": 1.8553998470306396, | |
| "learning_rate": 1.295185166857471e-05, | |
| "loss": 1.3342, | |
| "mean_token_accuracy": 0.6631456315517426, | |
| "num_tokens": 1872386.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.8854310810565948, | |
| "epoch": 1.2021720969089391, | |
| "grad_norm": 1.9020118713378906, | |
| "learning_rate": 1.2519735036379654e-05, | |
| "loss": 1.5075, | |
| "mean_token_accuracy": 0.6296862445771694, | |
| "num_tokens": 1898136.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.8075116664171218, | |
| "epoch": 1.2188805346700082, | |
| "grad_norm": 1.4228155612945557, | |
| "learning_rate": 1.2089724749468383e-05, | |
| "loss": 1.475, | |
| "mean_token_accuracy": 0.63388437256217, | |
| "num_tokens": 1926755.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.9176389634609223, | |
| "epoch": 1.2355889724310778, | |
| "grad_norm": 1.4030358791351318, | |
| "learning_rate": 1.1662185990655285e-05, | |
| "loss": 1.4261, | |
| "mean_token_accuracy": 0.6366409629583358, | |
| "num_tokens": 1954094.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.9318130016326904, | |
| "epoch": 1.252297410192147, | |
| "grad_norm": 2.5528788566589355, | |
| "learning_rate": 1.1237481843829162e-05, | |
| "loss": 1.637, | |
| "mean_token_accuracy": 0.6269572585821152, | |
| "num_tokens": 1977028.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.8894671678543091, | |
| "epoch": 1.2690058479532165, | |
| "grad_norm": 1.5508147478103638, | |
| "learning_rate": 1.0815972985607149e-05, | |
| "loss": 1.5153, | |
| "mean_token_accuracy": 0.6365115389227867, | |
| "num_tokens": 2000096.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.9068289607763291, | |
| "epoch": 1.2857142857142856, | |
| "grad_norm": 1.2476104497909546, | |
| "learning_rate": 1.0398017379032955e-05, | |
| "loss": 1.5797, | |
| "mean_token_accuracy": 0.6310160294175148, | |
| "num_tokens": 2024669.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.7813611298799514, | |
| "epoch": 1.3024227234753551, | |
| "grad_norm": 2.0824921131134033, | |
| "learning_rate": 9.983969969579616e-06, | |
| "loss": 1.4656, | |
| "mean_token_accuracy": 0.6474256098270417, | |
| "num_tokens": 2051877.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.8404263019561768, | |
| "epoch": 1.3191311612364243, | |
| "grad_norm": 1.2777612209320068, | |
| "learning_rate": 9.574182383714839e-06, | |
| "loss": 1.4355, | |
| "mean_token_accuracy": 0.6538358055055141, | |
| "num_tokens": 2079905.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.8420085251331328, | |
| "epoch": 1.3358395989974938, | |
| "grad_norm": 1.2849156856536865, | |
| "learning_rate": 9.169002630285063e-06, | |
| "loss": 1.4413, | |
| "mean_token_accuracy": 0.6362872712314129, | |
| "num_tokens": 2109705.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.9093056112527846, | |
| "epoch": 1.352548036758563, | |
| "grad_norm": 1.3787521123886108, | |
| "learning_rate": 8.768774804971705e-06, | |
| "loss": 1.4332, | |
| "mean_token_accuracy": 0.646092452853918, | |
| "num_tokens": 2136002.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.8188541814684869, | |
| "epoch": 1.3692564745196325, | |
| "grad_norm": 1.9946447610855103, | |
| "learning_rate": 8.37383879807061e-06, | |
| "loss": 1.4213, | |
| "mean_token_accuracy": 0.6514127373695373, | |
| "num_tokens": 2162939.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.767152488231659, | |
| "epoch": 1.3859649122807016, | |
| "grad_norm": 2.6128296852111816, | |
| "learning_rate": 7.98453000584296e-06, | |
| "loss": 1.4077, | |
| "mean_token_accuracy": 0.6433039925992489, | |
| "num_tokens": 2192868.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.8834378957748412, | |
| "epoch": 1.4026733500417712, | |
| "grad_norm": 2.5565028190612793, | |
| "learning_rate": 7.601179045682659e-06, | |
| "loss": 1.4508, | |
| "mean_token_accuracy": 0.6357239708304405, | |
| "num_tokens": 2220574.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.8659739628434182, | |
| "epoch": 1.4193817878028403, | |
| "grad_norm": 1.6156526803970337, | |
| "learning_rate": 7.2241114753421245e-06, | |
| "loss": 1.4151, | |
| "mean_token_accuracy": 0.6440570905804635, | |
| "num_tokens": 2247980.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 2.0131053000688555, | |
| "epoch": 1.4360902255639099, | |
| "grad_norm": 1.5411407947540283, | |
| "learning_rate": 6.853647516454978e-06, | |
| "loss": 1.7022, | |
| "mean_token_accuracy": 0.6108858585357666, | |
| "num_tokens": 2271288.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.9804413557052611, | |
| "epoch": 1.452798663324979, | |
| "grad_norm": 1.2808128595352173, | |
| "learning_rate": 6.490101782590357e-06, | |
| "loss": 1.502, | |
| "mean_token_accuracy": 0.6262324623763561, | |
| "num_tokens": 2297295.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.7805092990398408, | |
| "epoch": 1.4695071010860485, | |
| "grad_norm": 1.404811978340149, | |
| "learning_rate": 6.133783012069853e-06, | |
| "loss": 1.3104, | |
| "mean_token_accuracy": 0.6503325693309308, | |
| "num_tokens": 2325314.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.8354784041643142, | |
| "epoch": 1.4862155388471177, | |
| "grad_norm": 1.467010736465454, | |
| "learning_rate": 5.784993805773946e-06, | |
| "loss": 1.4201, | |
| "mean_token_accuracy": 0.6422530055046082, | |
| "num_tokens": 2353565.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.9021347552537917, | |
| "epoch": 1.5029239766081872, | |
| "grad_norm": 1.5276216268539429, | |
| "learning_rate": 5.44403037016061e-06, | |
| "loss": 1.5201, | |
| "mean_token_accuracy": 0.6460477262735367, | |
| "num_tokens": 2375717.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.8378847867250443, | |
| "epoch": 1.5196324143692563, | |
| "grad_norm": 1.413011908531189, | |
| "learning_rate": 5.11118226571434e-06, | |
| "loss": 1.4236, | |
| "mean_token_accuracy": 0.6440291911363601, | |
| "num_tokens": 2404483.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.8757817566394805, | |
| "epoch": 1.536340852130326, | |
| "grad_norm": 1.5251425504684448, | |
| "learning_rate": 4.786732161039234e-06, | |
| "loss": 1.5295, | |
| "mean_token_accuracy": 0.6261114202439785, | |
| "num_tokens": 2431919.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.8587533831596375, | |
| "epoch": 1.553049289891395, | |
| "grad_norm": 1.647172451019287, | |
| "learning_rate": 4.470955592804871e-06, | |
| "loss": 1.3916, | |
| "mean_token_accuracy": 0.6414183430373669, | |
| "num_tokens": 2461737.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.8832595944404602, | |
| "epoch": 1.5697577276524646, | |
| "grad_norm": 2.1904146671295166, | |
| "learning_rate": 4.164120731749072e-06, | |
| "loss": 1.5253, | |
| "mean_token_accuracy": 0.63388307467103, | |
| "num_tokens": 2484053.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.9385288596153258, | |
| "epoch": 1.5864661654135337, | |
| "grad_norm": 1.4704978466033936, | |
| "learning_rate": 3.866488154935951e-06, | |
| "loss": 1.544, | |
| "mean_token_accuracy": 0.6218141697347164, | |
| "num_tokens": 2509336.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.92947296500206, | |
| "epoch": 1.6031746031746033, | |
| "grad_norm": 2.599453926086426, | |
| "learning_rate": 3.5783106244629837e-06, | |
| "loss": 1.4998, | |
| "mean_token_accuracy": 0.6333407089114189, | |
| "num_tokens": 2534725.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.7914826542139053, | |
| "epoch": 1.6198830409356724, | |
| "grad_norm": 1.3860251903533936, | |
| "learning_rate": 3.299832872804765e-06, | |
| "loss": 1.4052, | |
| "mean_token_accuracy": 0.6594420954585075, | |
| "num_tokens": 2561604.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.9076363742351532, | |
| "epoch": 1.636591478696742, | |
| "grad_norm": 1.7287890911102295, | |
| "learning_rate": 3.0312913949759196e-06, | |
| "loss": 1.4772, | |
| "mean_token_accuracy": 0.6380034938454628, | |
| "num_tokens": 2586962.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.897977489233017, | |
| "epoch": 1.653299916457811, | |
| "grad_norm": 1.6165651082992554, | |
| "learning_rate": 2.77291424768959e-06, | |
| "loss": 1.4259, | |
| "mean_token_accuracy": 0.6349127419292927, | |
| "num_tokens": 2614128.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.9598186224699021, | |
| "epoch": 1.6700083542188806, | |
| "grad_norm": 2.7245564460754395, | |
| "learning_rate": 2.5249208556820908e-06, | |
| "loss": 1.5447, | |
| "mean_token_accuracy": 0.6180414237082005, | |
| "num_tokens": 2638378.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.8438974112272262, | |
| "epoch": 1.6867167919799497, | |
| "grad_norm": 1.519782543182373, | |
| "learning_rate": 2.287521825368189e-06, | |
| "loss": 1.4452, | |
| "mean_token_accuracy": 0.6378182969987393, | |
| "num_tokens": 2667006.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.8624888569116593, | |
| "epoch": 1.7034252297410193, | |
| "grad_norm": 1.3807040452957153, | |
| "learning_rate": 2.060918765985288e-06, | |
| "loss": 1.4039, | |
| "mean_token_accuracy": 0.6466488510370254, | |
| "num_tokens": 2694457.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.752406159043312, | |
| "epoch": 1.7201336675020884, | |
| "grad_norm": 1.9477757215499878, | |
| "learning_rate": 1.8453041183783953e-06, | |
| "loss": 1.3724, | |
| "mean_token_accuracy": 0.6649182379245758, | |
| "num_tokens": 2721514.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.867113146185875, | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 1.5146008729934692, | |
| "learning_rate": 1.6408609915712397e-06, | |
| "loss": 1.4912, | |
| "mean_token_accuracy": 0.6285726055502892, | |
| "num_tokens": 2749941.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.881867691874504, | |
| "epoch": 1.753550543024227, | |
| "grad_norm": 1.1970162391662598, | |
| "learning_rate": 1.447763007262431e-06, | |
| "loss": 1.4274, | |
| "mean_token_accuracy": 0.6479133352637291, | |
| "num_tokens": 2774956.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.8124820232391357, | |
| "epoch": 1.7702589807852966, | |
| "grad_norm": 1.135063886642456, | |
| "learning_rate": 1.2661741523785637e-06, | |
| "loss": 1.3427, | |
| "mean_token_accuracy": 0.6474431194365025, | |
| "num_tokens": 2805317.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.9482445955276488, | |
| "epoch": 1.7869674185463658, | |
| "grad_norm": 1.608454942703247, | |
| "learning_rate": 1.0962486398096755e-06, | |
| "loss": 1.5699, | |
| "mean_token_accuracy": 0.6215325608849526, | |
| "num_tokens": 2832036.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.945644235610962, | |
| "epoch": 1.8036758563074353, | |
| "grad_norm": 2.766500949859619, | |
| "learning_rate": 9.381307774451348e-07, | |
| "loss": 1.5286, | |
| "mean_token_accuracy": 0.6369006544351578, | |
| "num_tokens": 2855743.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.8863470137119294, | |
| "epoch": 1.8203842940685044, | |
| "grad_norm": 1.3567376136779785, | |
| "learning_rate": 7.919548456213516e-07, | |
| "loss": 1.5037, | |
| "mean_token_accuracy": 0.6256943866610527, | |
| "num_tokens": 2881629.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.7123635321855546, | |
| "epoch": 1.837092731829574, | |
| "grad_norm": 1.452407956123352, | |
| "learning_rate": 6.578449830852279e-07, | |
| "loss": 1.4047, | |
| "mean_token_accuracy": 0.65928635597229, | |
| "num_tokens": 2908678.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.8440750926733016, | |
| "epoch": 1.8538011695906431, | |
| "grad_norm": 1.331641435623169, | |
| "learning_rate": 5.359150815703629e-07, | |
| "loss": 1.4408, | |
| "mean_token_accuracy": 0.6417556688189506, | |
| "num_tokens": 2937408.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.919699600338936, | |
| "epoch": 1.8705096073517127, | |
| "grad_norm": 1.5076831579208374, | |
| "learning_rate": 4.2626868907536034e-07, | |
| "loss": 1.4585, | |
| "mean_token_accuracy": 0.6399278536438942, | |
| "num_tokens": 2962447.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.8350076168775558, | |
| "epoch": 1.8872180451127818, | |
| "grad_norm": 1.707216739654541, | |
| "learning_rate": 3.289989219265466e-07, | |
| "loss": 1.4885, | |
| "mean_token_accuracy": 0.6428370043635369, | |
| "num_tokens": 2987461.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.8436865717172624, | |
| "epoch": 1.9039264828738514, | |
| "grad_norm": 2.3200364112854004, | |
| "learning_rate": 2.441883856996513e-07, | |
| "loss": 1.5211, | |
| "mean_token_accuracy": 0.640003713965416, | |
| "num_tokens": 3011483.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.8367240995168685, | |
| "epoch": 1.9206349206349205, | |
| "grad_norm": 1.7375178337097168, | |
| "learning_rate": 1.719091050676902e-07, | |
| "loss": 1.4348, | |
| "mean_token_accuracy": 0.644163503497839, | |
| "num_tokens": 3038564.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.9745660215616225, | |
| "epoch": 1.93734335839599, | |
| "grad_norm": 1.5831917524337769, | |
| "learning_rate": 1.1222246263458469e-07, | |
| "loss": 1.6556, | |
| "mean_token_accuracy": 0.6158465214073658, | |
| "num_tokens": 3059313.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.8914725601673126, | |
| "epoch": 1.9540517961570592, | |
| "grad_norm": 1.6992645263671875, | |
| "learning_rate": 6.517914680646386e-08, | |
| "loss": 1.39, | |
| "mean_token_accuracy": 0.6418868929147721, | |
| "num_tokens": 3084800.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.8038861930370331, | |
| "epoch": 1.9707602339181287, | |
| "grad_norm": 1.4963058233261108, | |
| "learning_rate": 3.081910874495497e-08, | |
| "loss": 1.3698, | |
| "mean_token_accuracy": 0.6503725171089172, | |
| "num_tokens": 3113883.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.8201948583126069, | |
| "epoch": 1.9874686716791978, | |
| "grad_norm": 2.040982723236084, | |
| "learning_rate": 9.171528438959852e-09, | |
| "loss": 1.4156, | |
| "mean_token_accuracy": 0.6591305956244469, | |
| "num_tokens": 3138912.0, | |
| "step": 1190 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1198, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4514045996918374e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |